Implement Sum-of-Abs-Differences idiom recognition.
Rationale:
Currently implemented just on ARM64 (x86 lacks proper support),
the SAD idiom yields a great speedup on loops
that compute the sum-of-absolute-differences operation.
Also includes some refinements around type conversions.
Speedup ExoPlayerAudio (golem run):
1.3x on ARM64
1.1x on x86
Test: test-art-host test-art-target
Bug: 64091002
Change-Id: Ia2b711d2bc23609a2ed50493dfe6719eedfe0130
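For reference, the recognized idiom is, in Java terms (a minimal sketch; the
array names are illustrative):

  // Sum of absolute differences over two int arrays. The loop optimizer now
  // maps the q += Math.abs(..) reduction onto a single HVecSADAccumulate node.
  static int sad(int[] a, int[] b, int n) {
    int q = 0;
    for (int i = 0; i < n; i++) {
      q += Math.abs(a[i] - b[i]);
    }
    return q;
  }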
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 18a55c8..3f576c8 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -949,20 +949,18 @@
}
}
-void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
- switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
case Primitive::kPrimChar:
case Primitive::kPrimShort:
case Primitive::kPrimInt:
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
- DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ case Primitive::kPrimLong:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
locations->SetOut(Location::SameAsFirstInput());
break;
default:
@@ -971,18 +969,25 @@
}
}
+void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
// Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
// 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
// However, the vector MultiplyAccumulate instruction is not affected.
-void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = instr->GetLocations();
- VRegister acc = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
- VRegister left = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
- VRegister right = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
- switch (instr->GetPackedType()) {
+void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VRegister acc = VRegisterFrom(locations->InAt(0));
+ VRegister left = VRegisterFrom(locations->InAt(1));
+ VRegister right = VRegisterFrom(locations->InAt(2));
+
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
- DCHECK_EQ(16u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ Mla(acc.V16B(), left.V16B(), right.V16B());
} else {
__ Mls(acc.V16B(), left.V16B(), right.V16B());
@@ -990,16 +995,16 @@
break;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- DCHECK_EQ(8u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ Mla(acc.V8H(), left.V8H(), right.V8H());
} else {
__ Mls(acc.V8H(), left.V8H(), right.V8H());
}
break;
case Primitive::kPrimInt:
- DCHECK_EQ(4u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ Mla(acc.V4S(), left.V4S(), right.V4S());
} else {
__ Mls(acc.V4S(), left.V4S(), right.V4S());
@@ -1007,6 +1012,186 @@
break;
default:
LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
+void LocationsBuilderARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+ // Some conversions require temporary registers.
+ LocationSummary* locations = instruction->GetLocations();
+ HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+ HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+ DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+ switch (a->GetPackedType()) {
+ case Primitive::kPrimByte:
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimLong:
+ locations->AddTemp(Location::RequiresFpuRegister());
+ locations->AddTemp(Location::RequiresFpuRegister());
+ FALLTHROUGH_INTENDED;
+ case Primitive::kPrimInt:
+ locations->AddTemp(Location::RequiresFpuRegister());
+ locations->AddTemp(Location::RequiresFpuRegister());
+ break;
+ default:
+ break;
+ }
+ break;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ if (instruction->GetPackedType() == Primitive::kPrimLong) {
+ locations->AddTemp(Location::RequiresFpuRegister());
+ locations->AddTemp(Location::RequiresFpuRegister());
+ }
+ break;
+ case Primitive::kPrimInt:
+ case Primitive::kPrimLong:
+ if (instruction->GetPackedType() == a->GetPackedType()) {
+ locations->AddTemp(Location::RequiresFpuRegister());
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+void InstructionCodeGeneratorARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VRegister acc = VRegisterFrom(locations->InAt(0));
+ VRegister left = VRegisterFrom(locations->InAt(1));
+ VRegister right = VRegisterFrom(locations->InAt(2));
+
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+ // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S).
+ HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+ HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+ DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+ switch (a->GetPackedType()) {
+ case Primitive::kPrimByte:
+ DCHECK_EQ(16u, a->GetVectorLength());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ Sabal(acc.V8H(), left.V8B(), right.V8B());
+ __ Sabal2(acc.V8H(), left.V16B(), right.V16B());
+ break;
+ case Primitive::kPrimInt: {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+ VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+ __ Sxtl(tmp1.V8H(), left.V8B());
+ __ Sxtl(tmp2.V8H(), right.V8B());
+ __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
+ __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
+ __ Sxtl2(tmp1.V8H(), left.V16B());
+ __ Sxtl2(tmp2.V8H(), right.V16B());
+ __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
+ __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
+ break;
+ }
+ case Primitive::kPrimLong: {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+ VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+ VRegister tmp3 = VRegisterFrom(locations->GetTemp(2));
+ VRegister tmp4 = VRegisterFrom(locations->GetTemp(3));
+ __ Sxtl(tmp1.V8H(), left.V8B());
+ __ Sxtl(tmp2.V8H(), right.V8B());
+ __ Sxtl(tmp3.V4S(), tmp1.V4H());
+ __ Sxtl(tmp4.V4S(), tmp2.V4H());
+ __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+ __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+ __ Sxtl2(tmp3.V4S(), tmp1.V8H());
+ __ Sxtl2(tmp4.V4S(), tmp2.V8H());
+ __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+ __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+ __ Sxtl2(tmp1.V8H(), left.V16B());
+ __ Sxtl2(tmp2.V8H(), right.V16B());
+ __ Sxtl(tmp3.V4S(), tmp1.V4H());
+ __ Sxtl(tmp4.V4S(), tmp2.V4H());
+ __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+ __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+ __ Sxtl2(tmp3.V4S(), tmp1.V8H());
+ __ Sxtl2(tmp4.V4S(), tmp2.V8H());
+ __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+ __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+ break;
+ }
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+ break;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ DCHECK_EQ(8u, a->GetVectorLength());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimInt:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ Sabal(acc.V4S(), left.V4H(), right.V4H());
+ __ Sabal2(acc.V4S(), left.V8H(), right.V8H());
+ break;
+ case Primitive::kPrimLong: {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+ VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+ __ Sxtl(tmp1.V4S(), left.V4H());
+ __ Sxtl(tmp2.V4S(), right.V4H());
+ __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
+ __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
+ __ Sxtl2(tmp1.V4S(), left.V8H());
+ __ Sxtl2(tmp2.V4S(), right.V8H());
+ __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
+ __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
+ break;
+ }
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+ break;
+ case Primitive::kPrimInt:
+ DCHECK_EQ(4u, a->GetVectorLength());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimInt: {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+ __ Sub(tmp.V4S(), left.V4S(), right.V4S());
+ __ Abs(tmp.V4S(), tmp.V4S());
+ __ Add(acc.V4S(), acc.V4S(), tmp.V4S());
+ break;
+ }
+ case Primitive::kPrimLong:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ Sabal(acc.V2D(), left.V2S(), right.V2S());
+ __ Sabal2(acc.V2D(), left.V4S(), right.V4S());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+ break;
+ case Primitive::kPrimLong:
+ DCHECK_EQ(2u, a->GetVectorLength());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimLong: {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+ __ Sub(tmp.V2D(), left.V2D(), right.V2D());
+ __ Abs(tmp.V2D(), tmp.V2D());
+ __ Add(acc.V2D(), acc.V2D(), tmp.V2D());
+ break;
+ }
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
}
}
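In the widening cases above, each SABAL/SABAL2 pair absorbs half of the
sign-extended lanes, so the byte-to-long combination needs two SXTL/SXTL2
widening rounds and four temporaries. A scalar model of what that sequence
computes (hypothetical helper, for illustration only):

  // Widen signed bytes to long before the absolute difference, then
  // accumulate; the vector code performs the same arithmetic lane-wise.
  static long sadByteToLong(byte[] a, byte[] b, long acc) {
    for (int i = 0; i < a.length; i++) {
      acc += Math.abs((long) a[i] - (long) b[i]);
    }
    return acc;
  }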
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 7a11dff..069054c 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -629,12 +629,40 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
-void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LOG(FATAL) << "No SIMD for " << instr->GetId();
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ case Primitive::kPrimInt:
+ case Primitive::kPrimLong:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
}
-void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LOG(FATAL) << "No SIMD for " << instr->GetId();
+void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
// Return whether the vector memory access operation is guaranteed to be word-aligned (ARM word
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index c2fbf7f..0bedafc 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -826,21 +826,18 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
-void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
- switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
case Primitive::kPrimChar:
case Primitive::kPrimShort:
case Primitive::kPrimInt:
case Primitive::kPrimLong:
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
- DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
locations->SetOut(Location::SameAsFirstInput());
break;
default:
@@ -849,18 +846,19 @@
}
}
-void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = instr->GetLocations();
- VectorRegister acc =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
- VectorRegister left =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
- VectorRegister right =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
- switch (instr->GetPackedType()) {
+void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VectorRegister acc = VectorRegisterFrom(locations->InAt(0));
+ VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+ VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
- DCHECK_EQ(16u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvB(acc, left, right);
} else {
__ MsubvB(acc, left, right);
@@ -868,24 +866,24 @@
break;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- DCHECK_EQ(8u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvH(acc, left, right);
} else {
__ MsubvH(acc, left, right);
}
break;
case Primitive::kPrimInt:
- DCHECK_EQ(4u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvW(acc, left, right);
} else {
__ MsubvW(acc, left, right);
}
break;
case Primitive::kPrimLong:
- DCHECK_EQ(2u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvD(acc, left, right);
} else {
__ MsubvD(acc, left, right);
@@ -897,6 +895,15 @@
}
}
+void LocationsBuilderMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ // TODO: implement this, location helper already filled out (shared with MulAcc).
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* arena,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index 9d3a777..db31bdc 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -830,21 +830,18 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
-void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
- switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
case Primitive::kPrimChar:
case Primitive::kPrimShort:
case Primitive::kPrimInt:
case Primitive::kPrimLong:
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
- DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
locations->SetOut(Location::SameAsFirstInput());
break;
default:
@@ -853,18 +850,19 @@
}
}
-void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = instr->GetLocations();
- VectorRegister acc =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
- VectorRegister left =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
- VectorRegister right =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
- switch (instr->GetPackedType()) {
+void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VectorRegister acc = VectorRegisterFrom(locations->InAt(0));
+ VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+ VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
- DCHECK_EQ(16u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvB(acc, left, right);
} else {
__ MsubvB(acc, left, right);
@@ -872,24 +870,24 @@
break;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- DCHECK_EQ(8u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvH(acc, left, right);
} else {
__ MsubvH(acc, left, right);
}
break;
case Primitive::kPrimInt:
- DCHECK_EQ(4u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvW(acc, left, right);
} else {
__ MsubvW(acc, left, right);
}
break;
case Primitive::kPrimLong:
- DCHECK_EQ(2u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvD(acc, left, right);
} else {
__ MsubvD(acc, left, right);
@@ -901,6 +899,15 @@
}
}
+void LocationsBuilderMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ // TODO: implement this, location helper already filled out (shared with MulAcc).
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* arena,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 37190f8..5a012e7 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -51,7 +51,6 @@
: Location::RequiresFpuRegister());
locations->SetOut(is_zero ? Location::RequiresFpuRegister()
: Location::SameAsFirstInput());
-
break;
default:
LOG(FATAL) << "Unsupported SIMD type";
@@ -1033,12 +1032,42 @@
}
}
-void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LOG(FATAL) << "No SIMD for " << instr->GetId();
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ case Primitive::kPrimInt:
+ case Primitive::kPrimLong:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
}
-void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LOG(FATAL) << "No SIMD for " << instr->GetId();
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ // TODO: pmaddwd?
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ // TODO: psadbw for unsigned?
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
// Helper to set up locations for vector memory operations.
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index edd0209..3698b7f 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1005,11 +1005,41 @@
}
}
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ case Primitive::kPrimInt:
+ case Primitive::kPrimLong:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
}
void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ // TODO: pmaddwd?
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ // TODO: psadbw for unsigned?
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
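The psadbw TODO refers to x86's packed SAD instruction, which operates on
unsigned bytes only, while the recognition below is signed. In Java, where
bytes are signed, the unsigned flavor would look like this (sketch; not
matched by the current idiom):

  // The & 0xff masks model the zero extension that psadbw assumes.
  static int sadUnsigned(byte[] a, byte[] b) {
    int q = 0;
    for (int i = 0; i < a.length; i++) {
      q += Math.abs((a[i] & 0xff) - (b[i] & 0xff));
    }
    return q;
  }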
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index baa0453..6f8743b 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -71,10 +71,13 @@
return false;
}
-// Detect a sign extension from the given type. Returns the promoted operand on success.
+// Detect a sign extension in instruction from the given type. The to64 parameter
+// denotes whether the result is long, in which case sign extension from int is also accepted.
+// Returns the promoted operand on success.
static bool IsSignExtensionAndGet(HInstruction* instruction,
Primitive::Type type,
- /*out*/ HInstruction** operand) {
+ /*out*/ HInstruction** operand,
+ bool to64 = false) {
// Accept any already wider constant that would be handled properly by sign
// extension when represented in the *width* of the given narrower data type
// (the fact that char normally zero extends does not matter here).
@@ -82,20 +85,24 @@
if (IsInt64AndGet(instruction, /*out*/ &value)) {
switch (type) {
case Primitive::kPrimByte:
- if (std::numeric_limits<int8_t>::min() <= value &&
- std::numeric_limits<int8_t>::max() >= value) {
+ if (IsInt<8>(value)) {
*operand = instruction;
return true;
}
return false;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- if (std::numeric_limits<int16_t>::min() <= value &&
- std::numeric_limits<int16_t>::max() >= value) {
+ if (IsInt<16>(value)) {
*operand = instruction;
return true;
}
return false;
+ case Primitive::kPrimInt:
+ if (IsInt<32>(value)) {
+ *operand = instruction;
+ return to64;
+ }
+ return false;
default:
return false;
}
@@ -110,40 +117,52 @@
case Primitive::kPrimShort:
*operand = instruction;
return true;
+ case Primitive::kPrimInt:
+ *operand = instruction;
+ return to64;
default:
return false;
}
}
- // TODO: perhaps explicit conversions later too?
- // (this may return something different from instruction)
+ // Explicit type conversion to long.
+ if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+ return IsSignExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+ }
return false;
}
-// Detect a zero extension from the given type. Returns the promoted operand on success.
+// Detect a zero extension in instruction from the given type. The to64 parameter
+// denotes whether the result is long, in which case zero extension from int is also accepted.
+// Returns the promoted operand on success.
static bool IsZeroExtensionAndGet(HInstruction* instruction,
Primitive::Type type,
- /*out*/ HInstruction** operand) {
+ /*out*/ HInstruction** operand,
+ bool to64 = false) {
// Accept any already wider constant that would be handled properly by zero
// extension when represented in the *width* of the given narrower data type
- // (the fact that byte/short normally sign extend does not matter here).
+ // (the fact that byte/short/int normally sign extend does not matter here).
int64_t value = 0;
if (IsInt64AndGet(instruction, /*out*/ &value)) {
switch (type) {
case Primitive::kPrimByte:
- if (std::numeric_limits<uint8_t>::min() <= value &&
- std::numeric_limits<uint8_t>::max() >= value) {
+ if (IsUint<8>(value)) {
*operand = instruction;
return true;
}
return false;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- if (std::numeric_limits<uint16_t>::min() <= value &&
- std::numeric_limits<uint16_t>::max() >= value) {
+ if (IsUint<16>(value)) {
*operand = instruction;
return true;
}
return false;
+ case Primitive::kPrimInt:
+ if (IsUint<32>(value)) {
+ *operand = instruction;
+ return to64;
+ }
+ return false;
default:
return false;
}
@@ -170,14 +189,21 @@
(IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
switch ((*operand)->GetType()) {
- case Primitive::kPrimByte: return mask == std::numeric_limits<uint8_t>::max();
+ case Primitive::kPrimByte:
+ return mask == std::numeric_limits<uint8_t>::max();
case Primitive::kPrimChar:
- case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
+ case Primitive::kPrimShort:
+ return mask == std::numeric_limits<uint16_t>::max();
+ case Primitive::kPrimInt:
+ return mask == std::numeric_limits<uint32_t>::max() && to64;
default: return false;
}
}
}
- // TODO: perhaps explicit conversions later too?
+ // Explicit type conversion to long.
+ if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+ return IsZeroExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+ }
return false;
}
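The new explicit-conversion clause lets both matchers look through a cast to
long, so the int operands below are still recognized as sign extensions
(illustrative Java; names are made up):

  // Each (long) cast is an HTypeConversion node; IsSignExtensionAndGet now
  // recurses into its input with to64 set.
  long q = 0;
  for (int i = 0; i < n; i++) {
    q += Math.abs((long) a[i] - (long) b[i]);  // a, b: int[]
  }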
@@ -214,6 +240,55 @@
return false;
}
+// Compute relative vector length based on type difference.
+static size_t GetOtherVL(Primitive::Type other_type, Primitive::Type vector_type, size_t vl) {
+ switch (other_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl;
+ default: break;
+ }
+ break;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 1;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl;
+ default: break;
+ }
+ break;
+ case Primitive::kPrimInt:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 2;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl >> 1;
+ case Primitive::kPrimInt: return vl;
+ default: break;
+ }
+ break;
+ case Primitive::kPrimLong:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 3;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl >> 2;
+ case Primitive::kPrimInt: return vl >> 1;
+ case Primitive::kPrimLong: return vl;
+ default: break;
+ }
+ break;
+ default:
+ break;
+ }
+ LOG(FATAL) << "Unsupported idiom conversion";
+ UNREACHABLE();
+}
+
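GetOtherVL simply scales the operand vector length down by the width ratio of
the two types. A compact restatement of its table (hypothetical helper,
assuming integral types with the accumulator at least as wide as the vector
type):

  // vl >> log2(accumulator width / operand width); e.g. 16 byte lanes fold
  // into 4 int lanes (16 >> 2) or 2 long lanes (16 >> 3).
  static int otherVL(int accBytes, int vecBytes, int vl) {
    return vl >> Integer.numberOfTrailingZeros(accBytes / vecBytes);
  }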
// Detect up to two instructions a and b, and an accumulated constant c.
static bool IsAddConstHelper(HInstruction* instruction,
/*out*/ HInstruction** a,
@@ -260,16 +335,16 @@
}
// Detect reductions of the following forms,
-// under assumption phi has only *one* use:
// x = x_phi + ..
// x = x_phi - ..
// x = max(x_phi, ..)
// x = min(x_phi, ..)
static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
if (reduction->IsAdd()) {
- return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+ (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
} else if (reduction->IsSub()) {
- return reduction->InputAt(0) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi);
} else if (reduction->IsInvokeStaticOrDirect()) {
switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
case Intrinsics::kMathMinIntInt:
@@ -280,7 +355,8 @@
case Intrinsics::kMathMaxLongLong:
case Intrinsics::kMathMaxFloatFloat:
case Intrinsics::kMathMaxDoubleDouble:
- return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+ (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
default:
return false;
}
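The tightened checks now require the phi on exactly one side, ruling out
degenerate forms. In Java terms (sketch):

  // Still recognized as a reduction: the phi x feeds exactly one operand.
  for (int i = 0; i < n; i++) x = x + a[i];
  // No longer matched: the phi appears as both operands of the add.
  for (int i = 0; i < n; i++) x = x + x;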
@@ -288,9 +364,9 @@
return false;
}
-// Translates operation to reduction kind.
-static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) {
- if (reduction->IsVecAdd() || reduction->IsVecSub()) {
+// Translates vector operation to reduction kind.
+static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
+ if (reduction->IsVecAdd() || reduction->IsVecSub() || reduction->IsVecSADAccumulate()) {
return HVecReduce::kSum;
} else if (reduction->IsVecMin()) {
return HVecReduce::kMin;
@@ -720,7 +796,6 @@
HBasicBlock* block,
HBasicBlock* exit,
int64_t trip_count) {
- Primitive::Type induc_type = Primitive::kPrimInt;
HBasicBlock* header = node->loop_info->GetHeader();
HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -739,6 +814,10 @@
vector_header_ = header;
vector_body_ = block;
+ // Loop induction type.
+ Primitive::Type induc_type = main_phi->GetType();
+ DCHECK(induc_type == Primitive::kPrimInt || induc_type == Primitive::kPrimLong) << induc_type;
+
// Generate dynamic loop peeling trip count, if needed, under the assumption
// that the Android runtime guarantees at least "component size" alignment:
// ptc = (ALIGN - (&a[initial] % ALIGN)) / type-size
@@ -767,10 +846,10 @@
HInstruction* rem = Insert(
preheader, new (global_allocator_) HAnd(induc_type,
diff,
- graph_->GetIntConstant(chunk - 1)));
+ graph_->GetConstant(induc_type, chunk - 1)));
vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
}
- vector_index_ = graph_->GetIntConstant(0);
+ vector_index_ = graph_->GetConstant(induc_type, 0);
// Generate runtime disambiguation test:
// vtc = a != b ? vtc : 0;
@@ -779,7 +858,8 @@
preheader,
new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_));
vtc = Insert(preheader,
- new (global_allocator_) HSelect(rt, vtc, graph_->GetIntConstant(0), kNoDexPc));
+ new (global_allocator_)
+ HSelect(rt, vtc, graph_->GetConstant(induc_type, 0), kNoDexPc));
needs_cleanup = true;
}
@@ -793,7 +873,7 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
ptc,
- graph_->GetIntConstant(1),
+ graph_->GetConstant(induc_type, 1),
kNoUnrollingFactor);
}
@@ -806,7 +886,7 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
vtc,
- graph_->GetIntConstant(vector_length_), // increment per unroll
+ graph_->GetConstant(induc_type, vector_length_), // increment per unroll
unroll);
HLoopInformation* vloop = vector_header_->GetLoopInformation();
@@ -820,14 +900,20 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
stc,
- graph_->GetIntConstant(1),
+ graph_->GetConstant(induc_type, 1),
kNoUnrollingFactor);
}
// Link reductions to their final uses.
for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
if (i->first->IsPhi()) {
- i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second));
+ HInstruction* phi = i->first;
+ HInstruction* repl = ReduceAndExtractIfNeeded(i->second);
+ // Deal with regular uses.
+ for (const HUseListNode<HInstruction*>& use : phi->GetUses()) {
+ induction_range_.Replace(use.GetUser(), phi, repl); // update induction use
+ }
+ phi->ReplaceWith(repl);
}
}
@@ -853,7 +939,7 @@
HInstruction* step,
uint32_t unroll) {
DCHECK(unroll == 1 || vector_mode_ == kVector);
- Primitive::Type induc_type = Primitive::kPrimInt;
+ Primitive::Type induc_type = lo->GetType();
// Prepare new loop.
vector_preheader_ = new_preheader,
vector_header_ = vector_preheader_->GetSingleSuccessor();
@@ -942,8 +1028,10 @@
auto redit = reductions_->find(instruction);
if (redit != reductions_->end()) {
Primitive::Type type = instruction->GetType();
- if (TrySetVectorType(type, &restrictions) &&
- VectorizeUse(node, instruction, generate_code, type, restrictions)) {
+ // Recognize SAD idiom or direct reduction.
+ if (VectorizeSADIdiom(node, instruction, generate_code, type, restrictions) ||
+ (TrySetVectorType(type, &restrictions) &&
+ VectorizeUse(node, instruction, generate_code, type, restrictions))) {
if (generate_code) {
HInstruction* new_red = vector_map_->Get(instruction);
vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second));
@@ -1029,14 +1117,20 @@
HInstruction* opa = conversion->InputAt(0);
Primitive::Type from = conversion->GetInputType();
Primitive::Type to = conversion->GetResultType();
- if ((to == Primitive::kPrimByte ||
- to == Primitive::kPrimChar ||
- to == Primitive::kPrimShort) && from == Primitive::kPrimInt) {
- // Accept a "narrowing" type conversion from a "wider" computation for
- // (1) conversion into final required type,
- // (2) vectorizable operand,
- // (3) "wider" operations cannot bring in higher order bits.
- if (to == type && VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) {
+ if (Primitive::IsIntegralType(from) && Primitive::IsIntegralType(to)) {
+ size_t size_vec = Primitive::ComponentSize(type);
+ size_t size_from = Primitive::ComponentSize(from);
+ size_t size_to = Primitive::ComponentSize(to);
+ // Accept an integral conversion
+ // (1a) narrowing into vector type, "wider" operations cannot bring in higher order bits, or
+ // (1b) widening from at least vector type, and
+ // (2) vectorizable operand.
+ if ((size_to < size_from &&
+ size_to == size_vec &&
+ VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) ||
+ (size_to >= size_from &&
+ size_from >= size_vec &&
+ VectorizeUse(node, opa, generate_code, type, restrictions))) {
if (generate_code) {
if (vector_mode_ == kVector) {
vector_map_->Put(instruction, vector_map_->Get(opa)); // operand pass-through
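Concretely, the two accepted conversion shapes look like this in Java
(illustrative only; type is the vector type being built):

  // (1a) narrowing back into the vector type, as before:
  b[i] = (byte) (a[i] + 1);                  // int -> byte with type == byte
  // (1b) new: widening from at least the vector type passes through, e.g.
  // inside a long reduction over int operands:
  q += Math.abs((long) a[i] - (long) b[i]);  // int -> long pass-through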
@@ -1088,7 +1182,7 @@
return true;
}
} else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) {
- // Recognize vectorization idioms.
+ // Recognize halving add idiom.
if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) {
return true;
}
@@ -1181,7 +1275,8 @@
return false; // reject, unless all operands are same-extension narrower
}
// Accept MIN/MAX(x, y) for vectorizable operands.
- DCHECK(r != nullptr && s != nullptr);
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
r = opa;
s = opb;
@@ -1232,11 +1327,11 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
*restrictions |= kNoDiv;
@@ -1261,17 +1356,17 @@
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
*restrictions |=
- kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+ kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+ *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
+ *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // minmax: -0.0 vs +0.0
@@ -1289,17 +1384,17 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN)
@@ -1317,17 +1412,17 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN)
@@ -1371,8 +1466,16 @@
if (it != vector_permanent_map_->end()) {
vector = it->second; // reuse during unrolling
} else {
- vector = new (global_allocator_) HVecReplicateScalar(
- global_allocator_, org, type, vector_length_);
+ // Generates ReplicateScalar( (optional_type_conv) org ).
+ HInstruction* input = org;
+ Primitive::Type input_type = input->GetType();
+ if (type != input_type && (type == Primitive::kPrimLong ||
+ input_type == Primitive::kPrimLong)) {
+ input = Insert(vector_preheader_,
+ new (global_allocator_) HTypeConversion(type, input, kNoDexPc));
+ }
+ vector = new (global_allocator_)
+ HVecReplicateScalar(global_allocator_, input, type, vector_length_);
vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
}
vector_map_->Put(org, vector);
@@ -1465,10 +1568,15 @@
// Prepare the new initialization.
if (vector_mode_ == kVector) {
// Generate a [initial, 0, .., 0] vector.
- new_init = Insert(
- vector_preheader_,
- new (global_allocator_) HVecSetScalars(
- global_allocator_, &new_init, phi->GetType(), vector_length_, 1));
+ HVecOperation* red_vector = new_red->AsVecOperation();
+ size_t vector_length = red_vector->GetVectorLength();
+ Primitive::Type type = red_vector->GetPackedType();
+ new_init = Insert(vector_preheader_,
+ new (global_allocator_) HVecSetScalars(global_allocator_,
+ &new_init,
+ type,
+ vector_length,
+ 1));
} else {
new_init = ReduceAndExtractIfNeeded(new_init);
}
@@ -1484,18 +1592,20 @@
if (instruction->IsPhi()) {
HInstruction* input = instruction->InputAt(1);
if (input->IsVecOperation()) {
- Primitive::Type type = input->AsVecOperation()->GetPackedType();
+ HVecOperation* input_vector = input->AsVecOperation();
+ size_t vector_length = input_vector->GetVectorLength();
+ Primitive::Type type = input_vector->GetPackedType();
+ HVecReduce::ReductionKind kind = GetReductionKind(input_vector);
HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0];
// Generate a vector reduction and scalar extract
// x = REDUCE( [x_1, .., x_n] )
// y = x_1
// along the exit of the defining loop.
- HVecReduce::ReductionKind kind = GetReductionKind(input);
HInstruction* reduce = new (global_allocator_) HVecReduce(
- global_allocator_, instruction, type, vector_length_, kind);
+ global_allocator_, instruction, type, vector_length, kind);
exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction());
instruction = new (global_allocator_) HVecExtractScalar(
- global_allocator_, reduce, type, vector_length_, 0);
+ global_allocator_, reduce, type, vector_length, 0);
exit->InsertInstructionAfter(instruction, reduce);
}
}
@@ -1516,27 +1626,19 @@
HInstruction* opb,
Primitive::Type type,
bool is_unsigned) {
- if (vector_mode_ == kSequential) {
- // Non-converting scalar code follows implicit integral promotion.
- if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean ||
- type == Primitive::kPrimByte ||
- type == Primitive::kPrimChar ||
- type == Primitive::kPrimShort)) {
- type = Primitive::kPrimInt;
- }
- }
HInstruction* vector = nullptr;
+ Primitive::Type org_type = org->GetType();
switch (org->GetKind()) {
case HInstruction::kNeg:
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecNeg(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HNeg(type, opa));
+ new (global_allocator_) HNeg(org_type, opa));
case HInstruction::kNot:
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HNot(type, opa));
+ new (global_allocator_) HNot(org_type, opa));
case HInstruction::kBooleanNot:
DCHECK(opb == nullptr);
GENERATE_VEC(
@@ -1546,47 +1648,47 @@
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HTypeConversion(type, opa, kNoDexPc));
+ new (global_allocator_) HTypeConversion(org_type, opa, kNoDexPc));
case HInstruction::kAdd:
GENERATE_VEC(
new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HAdd(type, opa, opb));
+ new (global_allocator_) HAdd(org_type, opa, opb));
case HInstruction::kSub:
GENERATE_VEC(
new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HSub(type, opa, opb));
+ new (global_allocator_) HSub(org_type, opa, opb));
case HInstruction::kMul:
GENERATE_VEC(
new (global_allocator_) HVecMul(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HMul(type, opa, opb));
+ new (global_allocator_) HMul(org_type, opa, opb));
case HInstruction::kDiv:
GENERATE_VEC(
new (global_allocator_) HVecDiv(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HDiv(type, opa, opb, kNoDexPc));
+ new (global_allocator_) HDiv(org_type, opa, opb, kNoDexPc));
case HInstruction::kAnd:
GENERATE_VEC(
new (global_allocator_) HVecAnd(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HAnd(type, opa, opb));
+ new (global_allocator_) HAnd(org_type, opa, opb));
case HInstruction::kOr:
GENERATE_VEC(
new (global_allocator_) HVecOr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HOr(type, opa, opb));
+ new (global_allocator_) HOr(org_type, opa, opb));
case HInstruction::kXor:
GENERATE_VEC(
new (global_allocator_) HVecXor(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HXor(type, opa, opb));
+ new (global_allocator_) HXor(org_type, opa, opb));
case HInstruction::kShl:
GENERATE_VEC(
new (global_allocator_) HVecShl(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HShl(type, opa, opb));
+ new (global_allocator_) HShl(org_type, opa, opb));
case HInstruction::kShr:
GENERATE_VEC(
new (global_allocator_) HVecShr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HShr(type, opa, opb));
+ new (global_allocator_) HShr(org_type, opa, opb));
case HInstruction::kUShr:
GENERATE_VEC(
new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HUShr(type, opa, opb));
+ new (global_allocator_) HUShr(org_type, opa, opb));
case HInstruction::kInvokeStaticOrDirect: {
HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
if (vector_mode_ == kVector) {
@@ -1667,8 +1769,8 @@
//
// Method recognizes the following idioms:
-// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
-// regular halving add (a + b) >> 1 for unsigned/signed operands a, b
+// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
+// truncated halving add (a + b) >> 1 for unsigned/signed operands a, b
// Provided that the operands are promoted to a wider form to do the arithmetic and
// then cast back to narrower form, the idioms can be mapped into efficient SIMD
// implementation that operates directly in narrower form (plus one extra bit).
@@ -1712,7 +1814,8 @@
}
// Accept recognized halving add for vectorizable operands. Vectorized code uses the
// shorthand idiomatic operation. Sequential code uses the original scalar expressions.
- DCHECK(r != nullptr && s != nullptr);
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
r = instruction->InputAt(0);
s = instruction->InputAt(1);
@@ -1741,6 +1844,88 @@
return false;
}
+// Method recognizes the following idiom:
+// q += ABS(a - b) for signed operands a, b
+// Provided that the operands have the same type or are promoted to a wider form.
+// Since this may involve a vector length change, the idiom is handled by going directly
+// to a sad-accumulate node (rather than relying on combining finer-grained nodes later).
+// TODO: unsigned SAD too?
+bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ Primitive::Type reduction_type,
+ uint64_t restrictions) {
+ // Filter integral "q += ABS(a - b);" reduction, where ABS and SUB
+ // are done in the same precision (either int or long).
+ if (!instruction->IsAdd() ||
+ (reduction_type != Primitive::kPrimInt && reduction_type != Primitive::kPrimLong)) {
+ return false;
+ }
+ HInstruction* q = instruction->InputAt(0);
+ HInstruction* v = instruction->InputAt(1);
+ HInstruction* a = nullptr;
+ HInstruction* b = nullptr;
+ if (v->IsInvokeStaticOrDirect() &&
+ (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
+ v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
+ HInstruction* x = v->InputAt(0);
+ if (x->IsSub() && x->GetType() == reduction_type) {
+ a = x->InputAt(0);
+ b = x->InputAt(1);
+ }
+ }
+ if (a == nullptr || b == nullptr) {
+ return false;
+ }
+ // Accept same-type operands, or a consistent sign extension from a narrower type, on operands a and b.
+ // The same-type or narrower operands are called r (a or lower) and s (b or lower).
+ HInstruction* r = a;
+ HInstruction* s = b;
+ bool is_unsigned = false;
+ Primitive::Type sub_type = a->GetType();
+ if (a->IsTypeConversion()) {
+ sub_type = a->InputAt(0)->GetType();
+ } else if (b->IsTypeConversion()) {
+ sub_type = b->InputAt(0)->GetType();
+ }
+ if (reduction_type != sub_type &&
+ (!IsNarrowerOperands(a, b, sub_type, &r, &s, &is_unsigned) || is_unsigned)) {
+ return false;
+ }
+ // Try same/narrower type and deal with vector restrictions.
+ if (!TrySetVectorType(sub_type, &restrictions) || HasVectorRestrictions(restrictions, kNoSAD)) {
+ return false;
+ }
+ // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
+ // idiomatic operation. Sequential code uses the original scalar expressions.
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ r = s = v->InputAt(0);
+ }
+ if (VectorizeUse(node, q, generate_code, sub_type, restrictions) &&
+ VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
+ VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
+ if (generate_code) {
+ if (vector_mode_ == kVector) {
+ vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
+ global_allocator_,
+ vector_map_->Get(q),
+ vector_map_->Get(r),
+ vector_map_->Get(s),
+ reduction_type,
+ GetOtherVL(reduction_type, sub_type, vector_length_)));
+ MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+ } else {
+ GenerateVecOp(v, vector_map_->Get(r), nullptr, reduction_type);
+ GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
//
// Vectorization heuristics.
//
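Putting the pieces together, a mixed-type example that VectorizeSADIdiom
accepts (illustrative Java):

  // a, b: byte[]. The subtract and abs happen in int after implicit sign
  // extension, so sub_type = byte while reduction_type = int; on ARM64 this
  // gives 16 byte lanes feeding a 4-lane int accumulator, i.e.
  // GetOtherVL(int, byte, 16) == 4.
  static int sad(byte[] a, byte[] b) {
    int q = 0;
    for (int i = 0; i < a.length; i++) {
      q += Math.abs(a[i] - b[i]);
    }
    return q;
  }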
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index f347518..ae2ea76 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -75,6 +75,7 @@
kNoMinMax = 1 << 8, // no min/max
kNoStringCharAt = 1 << 9, // no StringCharAt
kNoReduction = 1 << 10, // no reduction
+ kNoSAD = 1 << 11, // no sum of absolute differences (SAD)
};
/*
@@ -172,6 +173,11 @@
bool generate_code,
Primitive::Type type,
uint64_t restrictions);
+ bool VectorizeSADIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ Primitive::Type type,
+ uint64_t restrictions);
// Vectorization heuristics.
bool IsVectorizationProfitable(int64_t trip_count);
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index a6d0da1..6bc5111 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1396,6 +1396,7 @@
M(VecUShr, VecBinaryOperation) \
M(VecSetScalars, VecOperation) \
M(VecMultiplyAccumulate, VecOperation) \
+ M(VecSADAccumulate, VecOperation) \
M(VecLoad, VecMemoryOperation) \
M(VecStore, VecMemoryOperation) \
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c5e75a7..1488b70 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -461,8 +461,8 @@
};
// Performs halving add on every component in the two vectors, viz.
-// rounded [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
-// or [ x1, .. , xn ] hadd [ y1, .. , yn ] = [ (x1 + y1) >> 1, .. , (xn + yn ) >> 1 ]
+// rounded [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
+// truncated [ x1, .. , xn ] hadd [ y1, .. , yn ] = [ (x1 + y1) >> 1, .. , (xn + yn ) >> 1 ]
// for signed operands x, y (sign extension) or unsigned operands x, y (zero extension).
class HVecHalvingAdd FINAL : public HVecBinaryOperation {
public:
@@ -810,8 +810,8 @@
//
// Assigns the given scalar elements to a vector,
-// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ] if n == m,
-// set( array(x1, .., xm) ) = [ x1, .. , xm, 0, .., 0 ] if m < n.
+// viz. set( array(x1, .. , xn) ) = [ x1, .. , xn ] if n == m,
+// set( array(x1, .. , xm) ) = [ x1, .. , xm, 0, .. , 0 ] if m < n.
class HVecSetScalars FINAL : public HVecOperation {
public:
HVecSetScalars(ArenaAllocator* arena,
@@ -842,9 +842,8 @@
DISALLOW_COPY_AND_ASSIGN(HVecSetScalars);
};
-// Multiplies every component in the two vectors, adds the result vector to the accumulator vector.
-// viz. [ acc1, .., accn ] + [ x1, .. , xn ] * [ y1, .. , yn ] =
-// [ acc1 + x1 * y1, .. , accn + xn * yn ].
+// Multiplies every component in the two vectors, adds the result vector to the accumulator vector,
+// viz. [ a1, .. , an ] + [ x1, .. , xn ] * [ y1, .. , yn ] = [ a1 + x1 * y1, .. , an + xn * yn ].
class HVecMultiplyAccumulate FINAL : public HVecOperation {
public:
HVecMultiplyAccumulate(ArenaAllocator* arena,
@@ -866,15 +865,11 @@
DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
DCHECK(HasConsistentPackedTypes(mul_left, packed_type));
DCHECK(HasConsistentPackedTypes(mul_right, packed_type));
- SetRawInputAt(kInputAccumulatorIndex, accumulator);
- SetRawInputAt(kInputMulLeftIndex, mul_left);
- SetRawInputAt(kInputMulRightIndex, mul_right);
+ SetRawInputAt(0, accumulator);
+ SetRawInputAt(1, mul_left);
+ SetRawInputAt(2, mul_right);
}
- static constexpr int kInputAccumulatorIndex = 0;
- static constexpr int kInputMulLeftIndex = 1;
- static constexpr int kInputMulRightIndex = 2;
-
bool CanBeMoved() const OVERRIDE { return true; }
bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
@@ -894,6 +889,42 @@
DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate);
};
+// Takes the absolute difference of two vectors, and adds the results to
+// same-precision or wider-precision components in the accumulator,
+// viz. SAD([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ]) =
+// [ a1 + sum abs(xi-yi), .. , am + sum abs(xj-yj) ],
+// for m <= n and non-overlapping sums.
+class HVecSADAccumulate FINAL : public HVecOperation {
+ public:
+ HVecSADAccumulate(ArenaAllocator* arena,
+ HInstruction* accumulator,
+ HInstruction* sad_left,
+ HInstruction* sad_right,
+ Primitive::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc = kNoDexPc)
+ : HVecOperation(arena,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs */ 3,
+ vector_length,
+ dex_pc) {
+ DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+ DCHECK(sad_left->IsVecOperation());
+ DCHECK(sad_right->IsVecOperation());
+ DCHECK_EQ(sad_left->AsVecOperation()->GetPackedType(),
+ sad_right->AsVecOperation()->GetPackedType());
+ SetRawInputAt(0, accumulator);
+ SetRawInputAt(1, sad_left);
+ SetRawInputAt(2, sad_right);
+ }
+
+ DECLARE_INSTRUCTION(VecSADAccumulate);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(HVecSADAccumulate);
+};
+
// Loads a vector from memory, viz. load(mem, 1)
// yields the vector [ mem(1), .. , mem(n) ].
class HVecLoad FINAL : public HVecMemoryOperation {
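A scalar model of the HVecSADAccumulate semantics defined above (hypothetical
helper; the definition only fixes the non-overlapping partition, not which
source lanes fold into which accumulator lane):

  // acc has m lanes, x and y have n >= m lanes; every |x[i] - y[i]| lands in
  // exactly one accumulator lane. The i % m folding mirrors the SABAL/SABAL2
  // interleaving for a single widening step.
  static void sadAccumulate(long[] acc, int[] x, int[] y) {
    for (int i = 0; i < x.length; i++) {
      acc[i % acc.length] += Math.abs((long) x[i] - y[i]);
    }
  }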