ARM64: Support 128-bit registers for SIMD.

Test: test-art-host, test-art-target

Change-Id: Ifb931a99d34ea77602a0e0781040ed092de9faaa
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 794e05c..b39a0e4 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -68,6 +68,7 @@
 using helpers::OutputCPURegister;
 using helpers::OutputFPRegister;
 using helpers::OutputRegister;
+using helpers::QRegisterFrom;
 using helpers::RegisterFrom;
 using helpers::StackOperandFrom;
 using helpers::VIXLRegCodeFromART;
@@ -1459,9 +1460,12 @@
 }
 
 Location ParallelMoveResolverARM64::AllocateScratchLocationFor(Location::Kind kind) {
-  DCHECK(kind == Location::kRegister || kind == Location::kFpuRegister ||
-         kind == Location::kStackSlot || kind == Location::kDoubleStackSlot);
-  kind = (kind == Location::kFpuRegister) ? Location::kFpuRegister : Location::kRegister;
+  DCHECK(kind == Location::kRegister || kind == Location::kFpuRegister
+         || kind == Location::kStackSlot || kind == Location::kDoubleStackSlot
+         || kind == Location::kSIMDStackSlot);
+  kind = (kind == Location::kFpuRegister || kind == Location::kSIMDStackSlot)
+      ? Location::kFpuRegister
+      : Location::kRegister;
   Location scratch = GetScratchLocation(kind);
   if (!scratch.Equals(Location::NoLocation())) {
     return scratch;
@@ -1471,7 +1475,9 @@
     scratch = LocationFrom(vixl_temps_.AcquireX());
   } else {
     DCHECK(kind == Location::kFpuRegister);
-    scratch = LocationFrom(vixl_temps_.AcquireD());
+    scratch = LocationFrom(codegen_->GetGraph()->HasSIMD()
+        ? vixl_temps_.AcquireVRegisterOfSize(kQRegSize)
+        : vixl_temps_.AcquireD());
   }
   AddScratchLocation(scratch);
   return scratch;
@@ -1482,7 +1488,7 @@
     vixl_temps_.Release(XRegisterFrom(loc));
   } else {
     DCHECK(loc.IsFpuRegister());
-    vixl_temps_.Release(DRegisterFrom(loc));
+    vixl_temps_.Release(codegen_->GetGraph()->HasSIMD() ? QRegisterFrom(loc) : DRegisterFrom(loc));
   }
   RemoveScratchLocation(loc);
 }
@@ -1745,6 +1751,8 @@
     if (source.IsStackSlot() || source.IsDoubleStackSlot()) {
       DCHECK(dst.Is64Bits() == source.IsDoubleStackSlot());
       __ Ldr(dst, StackOperandFrom(source));
+    } else if (source.IsSIMDStackSlot()) {
+      __ Ldr(QRegisterFrom(destination), StackOperandFrom(source));
     } else if (source.IsConstant()) {
       DCHECK(CoherentConstantAndType(source, dst_type));
       MoveConstant(dst, source.GetConstant());
@@ -1767,7 +1775,29 @@
         __ Fmov(RegisterFrom(destination, dst_type), FPRegisterFrom(source, source_type));
       } else {
         DCHECK(destination.IsFpuRegister());
-        __ Fmov(FPRegister(dst), FPRegisterFrom(source, dst_type));
+        if (GetGraph()->HasSIMD()) {
+          __ Mov(QRegisterFrom(destination), QRegisterFrom(source));
+        } else {
+          __ Fmov(FPRegister(dst), FPRegisterFrom(source, dst_type));
+        }
+      }
+    }
+  } else if (destination.IsSIMDStackSlot()) {
+    if (source.IsFpuRegister()) {
+      __ Str(QRegisterFrom(source), StackOperandFrom(destination));
+    } else {
+      DCHECK(source.IsSIMDStackSlot());
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+      if (GetVIXLAssembler()->GetScratchFPRegisterList()->IsEmpty()) {
+        Register temp = temps.AcquireX();
+        __ Ldr(temp, MemOperand(sp, source.GetStackIndex()));
+        __ Str(temp, MemOperand(sp, destination.GetStackIndex()));
+        __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize));
+        __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize));
+      } else {
+        FPRegister temp = temps.AcquireVRegisterOfSize(kQRegSize);
+        __ Ldr(temp, StackOperandFrom(source));
+        __ Str(temp, StackOperandFrom(destination));
       }
     }
   } else {  // The destination is not a register. It must be a stack slot.
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 10d8b84..869aad2 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -412,8 +412,9 @@
   }
 
   size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
-    // Allocated in D registers, which are word sized.
-    return kArm64WordSize;
+    return GetGraph()->HasSIMD()
+        ? 2 * kArm64WordSize   // 16 bytes == 2 arm64 words for each spill
+        : 1 * kArm64WordSize;  //  8 bytes == 1 arm64 word for each spill
   }
 
   uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE {
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index f4874fe..11c5e38 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -57,21 +57,21 @@
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Dup(dst.V8B(), InputRegisterAt(instruction, 0));
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Dup(dst.V16B(), InputRegisterAt(instruction, 0));
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Dup(dst.V4H(), InputRegisterAt(instruction, 0));
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Dup(dst.V8H(), InputRegisterAt(instruction, 0));
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Dup(dst.V2S(), InputRegisterAt(instruction, 0));
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Dup(dst.V4S(), InputRegisterAt(instruction, 0));
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Dup(dst.V2S(), DRegisterFrom(locations->InAt(0)).V2S(), 0);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Dup(dst.V4S(), DRegisterFrom(locations->InAt(0)).V4S(), 0);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -130,8 +130,8 @@
   Primitive::Type from = instruction->GetInputType();
   Primitive::Type to = instruction->GetResultType();
   if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) {
-    DCHECK_EQ(2u, instruction->GetVectorLength());
-    __ Scvtf(dst.V2S(), src.V2S());
+    DCHECK_EQ(4u, instruction->GetVectorLength());
+    __ Scvtf(dst.V4S(), src.V4S());
   } else {
     LOG(FATAL) << "Unsupported SIMD type";
   }
@@ -147,21 +147,21 @@
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Neg(dst.V8B(), src.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Neg(dst.V16B(), src.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Neg(dst.V4H(), src.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Neg(dst.V8H(), src.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Neg(dst.V2S(), src.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Neg(dst.V4S(), src.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fneg(dst.V2S(), src.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fneg(dst.V4S(), src.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -179,21 +179,21 @@
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Abs(dst.V8B(), src.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Abs(dst.V16B(), src.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Abs(dst.V4H(), src.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Abs(dst.V8H(), src.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Abs(dst.V2S(), src.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Abs(dst.V4S(), src.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fabs(dst.V2S(), src.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fabs(dst.V4S(), src.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -210,15 +210,15 @@
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:  // special case boolean-not
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Movi(dst.V8B(), 1);
-      __ Eor(dst.V8B(), dst.V8B(), src.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Movi(dst.V16B(), 1);
+      __ Eor(dst.V16B(), dst.V16B(), src.V16B());
       break;
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-      __ Not(dst.V8B(), src.V8B());  // lanes do not matter
+      __ Not(dst.V16B(), src.V16B());  // lanes do not matter
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -257,21 +257,21 @@
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Add(dst.V8B(), lhs.V8B(), rhs.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Add(dst.V16B(), lhs.V16B(), rhs.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Add(dst.V4H(), lhs.V4H(), rhs.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Add(dst.V8H(), lhs.V8H(), rhs.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Add(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Add(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fadd(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fadd(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -290,21 +290,21 @@
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Sub(dst.V8B(), lhs.V8B(), rhs.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Sub(dst.V16B(), lhs.V16B(), rhs.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Sub(dst.V4H(), lhs.V4H(), rhs.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Sub(dst.V8H(), lhs.V8H(), rhs.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Sub(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Sub(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fsub(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fsub(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -323,21 +323,21 @@
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Mul(dst.V8B(), lhs.V8B(), rhs.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Mul(dst.V16B(), lhs.V16B(), rhs.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Mul(dst.V4H(), lhs.V4H(), rhs.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Mul(dst.V8H(), lhs.V8H(), rhs.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Mul(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Mul(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fmul(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fmul(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -356,8 +356,8 @@
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fdiv(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fdiv(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -381,7 +381,7 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      __ And(dst.V8B(), lhs.V8B(), rhs.V8B());  // lanes do not matter
+      __ And(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -413,7 +413,7 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      __ Orr(dst.V8B(), lhs.V8B(), rhs.V8B());  // lanes do not matter
+      __ Orr(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -437,7 +437,7 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      __ Eor(dst.V8B(), lhs.V8B(), rhs.V8B());  // lanes do not matter
+      __ Eor(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -474,17 +474,17 @@
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Shl(dst.V8B(), lhs.V8B(), value);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Shl(dst.V16B(), lhs.V16B(), value);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Shl(dst.V4H(), lhs.V4H(), value);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Shl(dst.V8H(), lhs.V8H(), value);
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Shl(dst.V2S(), lhs.V2S(), value);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Shl(dst.V4S(), lhs.V4S(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -503,17 +503,17 @@
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Sshr(dst.V8B(), lhs.V8B(), value);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Sshr(dst.V16B(), lhs.V16B(), value);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Sshr(dst.V4H(), lhs.V4H(), value);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Sshr(dst.V8H(), lhs.V8H(), value);
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Sshr(dst.V2S(), lhs.V2S(), value);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Sshr(dst.V4S(), lhs.V4S(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -532,17 +532,17 @@
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Ushr(dst.V8B(), lhs.V8B(), value);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Ushr(dst.V16B(), lhs.V16B(), value);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Ushr(dst.V4H(), lhs.V4H(), value);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Ushr(dst.V8H(), lhs.V8H(), value);
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Ushr(dst.V2S(), lhs.V2S(), value);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Ushr(dst.V4S(), lhs.V4S(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -617,18 +617,18 @@
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Ld1(reg.V8B(), mem);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Ld1(reg.V16B(), mem);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Ld1(reg.V4H(), mem);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Ld1(reg.V8H(), mem);
       break;
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Ld1(reg.V2S(), mem);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Ld1(reg.V4S(), mem);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -647,18 +647,18 @@
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ St1(reg.V8B(), mem);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ St1(reg.V16B(), mem);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ St1(reg.V4H(), mem);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ St1(reg.V8H(), mem);
       break;
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ St1(reg.V2S(), mem);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ St1(reg.V4S(), mem);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index f8bbf68..4ba5c55 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -769,6 +769,45 @@
   InternalCodeAllocator code_allocator;
   codegen.Finalize(&code_allocator);
 }
+
+// Check that ParallelMoveResolver works for ARM64 both when SIMD is on and when it is off.
+TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) {
+  std::unique_ptr<const Arm64InstructionSetFeatures> features(
+      Arm64InstructionSetFeatures::FromCppDefines());
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HGraph* graph = CreateGraph(&allocator);
+  arm64::CodeGeneratorARM64 codegen(graph, *features.get(), CompilerOptions());
+
+  codegen.Initialize();
+
+  graph->SetHasSIMD(true);
+  for (int i = 0; i < 2; i++) {
+    HParallelMove* move = new (graph->GetArena()) HParallelMove(graph->GetArena());
+    move->AddMove(Location::SIMDStackSlot(0),
+                  Location::SIMDStackSlot(257),
+                  Primitive::kPrimDouble,
+                  nullptr);
+    move->AddMove(Location::SIMDStackSlot(257),
+                  Location::SIMDStackSlot(0),
+                  Primitive::kPrimDouble,
+                  nullptr);
+    move->AddMove(Location::FpuRegisterLocation(0),
+                  Location::FpuRegisterLocation(1),
+                  Primitive::kPrimDouble,
+                  nullptr);
+    move->AddMove(Location::FpuRegisterLocation(1),
+                  Location::FpuRegisterLocation(0),
+                  Primitive::kPrimDouble,
+                  nullptr);
+    codegen.GetMoveResolver()->EmitNativeCode(move);
+    graph->SetHasSIMD(false);
+  }
+
+  InternalCodeAllocator code_allocator;
+  codegen.Finalize(&code_allocator);
+}
+
 #endif
 
 #ifdef ART_ENABLE_CODEGEN_mips
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index d3f431e..5372b97 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -92,6 +92,11 @@
   return vixl::aarch64::FPRegister::GetDRegFromCode(location.reg());
 }
 
+inline vixl::aarch64::FPRegister QRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister()) << location;
+  return vixl::aarch64::FPRegister::GetQRegFromCode(location.reg());
+}
+
 inline vixl::aarch64::FPRegister SRegisterFrom(Location location) {
   DCHECK(location.IsFpuRegister()) << location;
   return vixl::aarch64::FPRegister::GetSRegFromCode(location.reg());
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index bf18cc9..ec02127 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -770,22 +770,21 @@
       return false;
     case kArm64:
       // Allow vectorization for all ARM devices, because Android assumes that
-      // ARMv8 AArch64 always supports advanced SIMD. For now, only D registers
-      // (64-bit vectors) not Q registers (128-bit vectors).
+      // ARMv8 AArch64 always supports advanced SIMD.
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
           *restrictions |= kNoDiv | kNoAbs;
-          return TrySetVectorLength(8);
+          return TrySetVectorLength(16);
         case Primitive::kPrimChar:
         case Primitive::kPrimShort:
           *restrictions |= kNoDiv | kNoAbs;
-          return TrySetVectorLength(4);
+          return TrySetVectorLength(8);
         case Primitive::kPrimInt:
           *restrictions |= kNoDiv;
-          return TrySetVectorLength(2);
+          return TrySetVectorLength(4);
         case Primitive::kPrimFloat:
-          return TrySetVectorLength(2);
+          return TrySetVectorLength(4);
         default:
           return false;
       }