ARM64: Support vectorization for double and long.

Test: test-art-host, test-art-target
Change-Id: I1d4db1763b64737766f9756e5d0f85c5736e3522
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 11c5e38..0923920 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -22,7 +22,7 @@
 namespace art {
 namespace arm64 {
 
-using helpers::DRegisterFrom;
+using helpers::VRegisterFrom;
 using helpers::HeapOperand;
 using helpers::InputRegisterAt;
 using helpers::Int64ConstantFrom;
@@ -38,10 +38,12 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetOut(Location::RequiresFpuRegister());
       break;
     case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
       locations->SetInAt(0, Location::RequiresFpuRegister());
       locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
       break;
@@ -53,7 +55,7 @@
 
 void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -69,9 +71,17 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Dup(dst.V4S(), InputRegisterAt(instruction, 0));
       break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Dup(dst.V2D(), XRegisterFrom(locations->InAt(0)));
+      break;
     case Primitive::kPrimFloat:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Dup(dst.V4S(), DRegisterFrom(locations->InAt(0)).V4S(), 0);
+      __ Dup(dst.V4S(), VRegisterFrom(locations->InAt(0)).V4S(), 0);
+      break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Dup(dst.V2D(), VRegisterFrom(locations->InAt(0)).V2D(), 0);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -109,7 +119,9 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
     case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
       locations->SetInAt(0, Location::RequiresFpuRegister());
       locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
       break;
@@ -125,8 +137,8 @@
 
 void InstructionCodeGeneratorARM64::VisitVecCnv(HVecCnv* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister src = DRegisterFrom(locations->InAt(0));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister src = VRegisterFrom(locations->InAt(0));
+  VRegister dst = VRegisterFrom(locations->Out());
   Primitive::Type from = instruction->GetInputType();
   Primitive::Type to = instruction->GetResultType();
   if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) {
@@ -143,8 +155,8 @@
 
 void InstructionCodeGeneratorARM64::VisitVecNeg(HVecNeg* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister src = DRegisterFrom(locations->InAt(0));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister src = VRegisterFrom(locations->InAt(0));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
       DCHECK_EQ(16u, instruction->GetVectorLength());
@@ -159,10 +171,18 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Neg(dst.V4S(), src.V4S());
       break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Neg(dst.V2D(), src.V2D());
+      break;
     case Primitive::kPrimFloat:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fneg(dst.V4S(), src.V4S());
       break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Fneg(dst.V2D(), src.V2D());
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -175,8 +195,8 @@
 
 void InstructionCodeGeneratorARM64::VisitVecAbs(HVecAbs* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister src = DRegisterFrom(locations->InAt(0));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister src = VRegisterFrom(locations->InAt(0));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
       DCHECK_EQ(16u, instruction->GetVectorLength());
@@ -191,10 +211,18 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Abs(dst.V4S(), src.V4S());
       break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Abs(dst.V2D(), src.V2D());
+      break;
     case Primitive::kPrimFloat:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fabs(dst.V4S(), src.V4S());
       break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Fabs(dst.V2D(), src.V2D());
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
   }
@@ -206,8 +234,8 @@
 
 void InstructionCodeGeneratorARM64::VisitVecNot(HVecNot* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister src = DRegisterFrom(locations->InAt(0));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister src = VRegisterFrom(locations->InAt(0));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:  // special case boolean-not
       DCHECK_EQ(16u, instruction->GetVectorLength());
@@ -218,6 +246,7 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
       __ Not(dst.V16B(), src.V16B());  // lanes do not matter
       break;
     default:
@@ -235,7 +264,9 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
     case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
       locations->SetInAt(0, Location::RequiresFpuRegister());
       locations->SetInAt(1, Location::RequiresFpuRegister());
       locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
@@ -252,9 +283,9 @@
 
 void InstructionCodeGeneratorARM64::VisitVecAdd(HVecAdd* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister rhs = DRegisterFrom(locations->InAt(1));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
       DCHECK_EQ(16u, instruction->GetVectorLength());
@@ -269,10 +300,18 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Add(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Add(dst.V2D(), lhs.V2D(), rhs.V2D());
+      break;
     case Primitive::kPrimFloat:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fadd(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Fadd(dst.V2D(), lhs.V2D(), rhs.V2D());
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -285,9 +324,9 @@
 
 void InstructionCodeGeneratorARM64::VisitVecSub(HVecSub* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister rhs = DRegisterFrom(locations->InAt(1));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
       DCHECK_EQ(16u, instruction->GetVectorLength());
@@ -302,10 +341,18 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Sub(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Sub(dst.V2D(), lhs.V2D(), rhs.V2D());
+      break;
     case Primitive::kPrimFloat:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fsub(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Fsub(dst.V2D(), lhs.V2D(), rhs.V2D());
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -318,9 +365,9 @@
 
 void InstructionCodeGeneratorARM64::VisitVecMul(HVecMul* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister rhs = DRegisterFrom(locations->InAt(1));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
       DCHECK_EQ(16u, instruction->GetVectorLength());
@@ -339,6 +386,10 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fmul(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Fmul(dst.V2D(), lhs.V2D(), rhs.V2D());
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -351,14 +402,18 @@
 
 void InstructionCodeGeneratorARM64::VisitVecDiv(HVecDiv* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister rhs = DRegisterFrom(locations->InAt(1));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimFloat:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fdiv(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Fdiv(dst.V2D(), lhs.V2D(), rhs.V2D());
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -371,16 +426,18 @@
 
 void InstructionCodeGeneratorARM64::VisitVecAnd(HVecAnd* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister rhs = DRegisterFrom(locations->InAt(1));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
     case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
       __ And(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
       break;
     default:
@@ -403,16 +460,18 @@
 
 void InstructionCodeGeneratorARM64::VisitVecOr(HVecOr* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister rhs = DRegisterFrom(locations->InAt(1));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
     case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
       __ Orr(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
       break;
     default:
@@ -427,16 +486,18 @@
 
 void InstructionCodeGeneratorARM64::VisitVecXor(HVecXor* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister rhs = DRegisterFrom(locations->InAt(1));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister rhs = VRegisterFrom(locations->InAt(1));
+  VRegister dst = VRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
     case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
       __ Eor(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
       break;
     default:
@@ -453,6 +514,7 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
       locations->SetInAt(0, Location::RequiresFpuRegister());
       locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
       locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
@@ -469,8 +531,8 @@
 
 void InstructionCodeGeneratorARM64::VisitVecShl(HVecShl* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister dst = VRegisterFrom(locations->Out());
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
@@ -486,6 +548,10 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Shl(dst.V4S(), lhs.V4S(), value);
       break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Shl(dst.V2D(), lhs.V2D(), value);
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -498,8 +564,8 @@
 
 void InstructionCodeGeneratorARM64::VisitVecShr(HVecShr* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister dst = VRegisterFrom(locations->Out());
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
@@ -515,6 +581,10 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Sshr(dst.V4S(), lhs.V4S(), value);
       break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Sshr(dst.V2D(), lhs.V2D(), value);
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -527,8 +597,8 @@
 
 void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) {
   LocationSummary* locations = instruction->GetLocations();
-  FPRegister lhs = DRegisterFrom(locations->InAt(0));
-  FPRegister dst = DRegisterFrom(locations->Out());
+  VRegister lhs = VRegisterFrom(locations->InAt(0));
+  VRegister dst = VRegisterFrom(locations->Out());
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
@@ -544,6 +614,10 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Ushr(dst.V4S(), lhs.V4S(), value);
       break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Ushr(dst.V2D(), lhs.V2D(), value);
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -561,7 +635,9 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
     case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
       locations->SetInAt(0, Location::RequiresRegister());
       locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
       if (is_load) {
@@ -613,7 +689,7 @@
 void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) {
   Location reg_loc = Location::NoLocation();
   MemOperand mem = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ true);
-  FPRegister reg = DRegisterFrom(reg_loc);
+  VRegister reg = VRegisterFrom(reg_loc);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -630,6 +706,11 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Ld1(reg.V4S(), mem);
       break;
+    case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Ld1(reg.V2D(), mem);
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
@@ -643,7 +724,7 @@
 void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) {
   Location reg_loc = Location::NoLocation();
   MemOperand mem = CreateVecMemRegisters(instruction, &reg_loc, /*is_load*/ false);
-  FPRegister reg = DRegisterFrom(reg_loc);
+  VRegister reg = VRegisterFrom(reg_loc);
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
@@ -660,6 +741,11 @@
       DCHECK_EQ(4u, instruction->GetVectorLength());
       __ St1(reg.V4S(), mem);
       break;
+    case Primitive::kPrimLong:
+    case Primitive::kPrimDouble:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ St1(reg.V2D(), mem);
+      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
       UNREACHABLE();
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index 5372b97..721f74e 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -97,6 +97,11 @@
   return vixl::aarch64::FPRegister::GetQRegFromCode(location.reg());
 }
 
+inline vixl::aarch64::FPRegister VRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister()) << location;
+  return vixl::aarch64::FPRegister::GetVRegFromCode(location.reg());
+}
+
 inline vixl::aarch64::FPRegister SRegisterFrom(Location location) {
   DCHECK(location.IsFpuRegister()) << location;
   return vixl::aarch64::FPRegister::GetSRegFromCode(location.reg());
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index ec02127..6337361 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -783,8 +783,13 @@
         case Primitive::kPrimInt:
           *restrictions |= kNoDiv;
           return TrySetVectorLength(4);
+        case Primitive::kPrimLong:
+          *restrictions |= kNoDiv | kNoMul;
+          return TrySetVectorLength(2);
         case Primitive::kPrimFloat:
           return TrySetVectorLength(4);
+        case Primitive::kPrimDouble:
+          return TrySetVectorLength(2);
         default:
           return false;
       }
diff --git a/test/640-checker-double-simd/src/Main.java b/test/640-checker-double-simd/src/Main.java
index 43f65f1..0d4f87a 100644
--- a/test/640-checker-double-simd/src/Main.java
+++ b/test/640-checker-double-simd/src/Main.java
@@ -32,8 +32,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.add(double) loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(double x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -45,8 +47,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.sub(double) loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(double x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -58,8 +62,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.mul(double) loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecMul   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void mul(double x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -71,8 +77,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.div(double) loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecDiv   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void div(double x) {
     for (int i = 0; i < 128; i++)
       a[i] /= x;
@@ -84,8 +92,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = -a[i];
@@ -97,8 +107,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.abs() loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAbs   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void abs() {
     for (int i = 0; i < 128; i++)
       a[i] = Math.abs(a[i]);
diff --git a/test/640-checker-long-simd/src/Main.java b/test/640-checker-long-simd/src/Main.java
index 90a2e76..5641182 100644
--- a/test/640-checker-long-simd/src/Main.java
+++ b/test/640-checker-long-simd/src/Main.java
@@ -31,8 +31,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.add(long) loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecAdd   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void add(long x) {
     for (int i = 0; i < 128; i++)
       a[i] += x;
@@ -44,8 +46,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.sub(long) loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecSub   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void sub(long x) {
     for (int i = 0; i < 128; i++)
       a[i] -= x;
@@ -56,9 +60,9 @@
   /// CHECK-DAG: ArrayGet loop:<<Loop>>      outer_loop:none
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
+  //  Not supported for longs.
   /// CHECK-START-ARM64: void Main.mul(long) loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-NOT: VecMul
   static void mul(long x) {
     for (int i = 0; i < 128; i++)
       a[i] *= x;
@@ -84,8 +88,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.neg() loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNeg   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void neg() {
     for (int i = 0; i < 128; i++)
       a[i] = -a[i];
@@ -97,8 +103,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.not() loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecNot   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void not() {
     for (int i = 0; i < 128; i++)
       a[i] = ~a[i];
@@ -110,8 +118,10 @@
   /// CHECK-DAG: ArraySet loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START-ARM64: void Main.shl4() loop_optimization (after)
-  //
-  // TODO: fill in when supported
+  /// CHECK-DAG: Phi      loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: VecLoad  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecShl   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: VecStore loop:<<Loop>>      outer_loop:none
   static void shl4() {
     for (int i = 0; i < 128; i++)
       a[i] <<= 4;