ART: ARM64: Support DotProd SIMD idiom.

Implement support for the vectorization idiom which performs a dot
product of two vectors and adds the result to wider precision
components in the accumulator.

viz. DOT_PRODUCT([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ]) =
                 [ a1 + sum(xi * yi), .. , am + sum(xj * yj) ],
     for m <= n, non-overlapping sums,
     for either both signed or both unsigned operands x, y.

The patch shows up to 7x performance improvement on a micro
benchmark on Cortex-A57.

Test: 684-checker-simd-dotprod.
Test: test-art-host, test-art-target.

Change-Id: Ibab0d51f537fdecd1d84033197be3ebf5ec4e455
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 43169ba..e79a96b 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -1277,6 +1277,74 @@
   }
 }
 
+void LocationsBuilderARM64::VisitVecDotProd(HVecDotProd* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+  DCHECK(instruction->GetPackedType() == DataType::Type::kInt32);
+  locations->SetInAt(0, Location::RequiresFpuRegister());  // Accumulator.
+  locations->SetInAt(1, Location::RequiresFpuRegister());  // Left operand.
+  locations->SetInAt(2, Location::RequiresFpuRegister());  // Right operand.
+  locations->SetOut(Location::SameAsFirstInput());  // Result overwrites the accumulator.
+
+  // For Int8 and Uint8 operands a temp register is needed to hold the widened 16-bit products.
+  if (DataType::Size(instruction->InputAt(1)->AsVecOperation()->GetPackedType()) == 1) {
+    locations->AddTemp(Location::RequiresFpuRegister());
+  }
+}
+
+void InstructionCodeGeneratorARM64::VisitVecDotProd(HVecDotProd* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));  // Accumulator is updated in place.
+  VRegister acc = VRegisterFrom(locations->InAt(0));
+  VRegister left = VRegisterFrom(locations->InAt(1));
+  VRegister right = VRegisterFrom(locations->InAt(2));
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
+            HVecOperation::ToSignedType(b->GetPackedType()));
+  DCHECK_EQ(instruction->GetPackedType(), DataType::Type::kInt32);
+  DCHECK_EQ(4u, instruction->GetVectorLength());
+
+  size_t inputs_data_size = DataType::Size(a->GetPackedType());  // Operand lane size in bytes.
+  switch (inputs_data_size) {
+    case 1u: {
+      DCHECK_EQ(16u, a->GetVectorLength());
+      VRegister tmp = VRegisterFrom(locations->GetTemp(0));  // Holds the 16-bit products.
+      if (instruction->IsZeroExtending()) {
+        // TODO: Use Armv8.4-A UDOT instruction when it is available.
+        __ Umull(tmp.V8H(), left.V8B(), right.V8B());  // Products of the low 8 lanes.
+        __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());  // acc += low 4 products, widened.
+        __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());  // acc += high 4 products, widened.
+
+        __ Umull2(tmp.V8H(), left.V16B(), right.V16B());  // Products of the high 8 lanes.
+        __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());  // acc += low 4 products, widened.
+        __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());  // acc += high 4 products, widened.
+      } else {
+        // TODO: Use Armv8.4-A SDOT instruction when it is available.
+        __ Smull(tmp.V8H(), left.V8B(), right.V8B());  // Products of the low 8 lanes.
+        __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());  // acc += low 4 products, widened.
+        __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());  // acc += high 4 products, widened.
+
+        __ Smull2(tmp.V8H(), left.V16B(), right.V16B());  // Products of the high 8 lanes.
+        __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());  // acc += low 4 products, widened.
+        __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());  // acc += high 4 products, widened.
+      }
+      break;
+    }
+    case 2u:
+      DCHECK_EQ(8u, a->GetVectorLength());
+      if (instruction->IsZeroExtending()) {
+        __ Umlal(acc.V4S(), left.V4H(), right.V4H());  // acc += products of the low 4 lanes.
+        __ Umlal2(acc.V4S(), left.V8H(), right.V8H());  // acc += products of the high 4 lanes.
+      } else {
+        __ Smlal(acc.V4S(), left.V4H(), right.V4H());  // acc += products of the low 4 lanes.
+        __ Smlal2(acc.V4S(), left.V8H(), right.V8H());  // acc += products of the high 4 lanes.
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type size: " << inputs_data_size;
+  }
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* allocator,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 7b66b17..62b6c4e 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -854,6 +854,14 @@
   }
 }
 
+void LocationsBuilderARMVIXL::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
 // Return whether the vector memory access operation is guaranteed to be word-aligned (ARM word
 // size equals to 4).
 static bool IsWordAligned(HVecMemoryOperation* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index df0e148..24f4fb2 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -1274,6 +1274,14 @@
   }
 }
 
+void LocationsBuilderMIPS::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* allocator,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index de354b6..972c49e 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -1272,6 +1272,14 @@
   }
 }
 
+void LocationsBuilderMIPS64::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* allocator,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 2502275..c52ecc7 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1143,6 +1143,14 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderX86::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
+void InstructionCodeGeneratorX86::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* allocator,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 4a67daf..87d0106 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1116,6 +1116,14 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderX86_64::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecDotProd(HVecDotProd* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // Dot product idiom unsupported here.
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* allocator,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/data_type.h b/compiler/optimizing/data_type.h
index 5ac6e46..3cbcc9e 100644
--- a/compiler/optimizing/data_type.h
+++ b/compiler/optimizing/data_type.h
@@ -231,6 +231,21 @@
     }
   }
 
+  static Type ToUnsigned(Type type) {  // Signed integral type -> same-size unsigned type.
+    switch (type) {
+      case Type::kInt8:
+        return Type::kUint8;
+      case Type::kInt16:
+        return Type::kUint16;
+      case Type::kInt32:
+        return Type::kUint32;
+      case Type::kInt64:
+        return Type::kUint64;
+      default:
+        return type;  // Any other type is returned unchanged.
+    }
+  }
+
   static const char* PrettyDescriptor(Type type);
 
  private:
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 31db8c2..21f22af 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -564,6 +564,14 @@
     StartAttributeStream("kind") << instruction->GetOpKind();
   }
 
+  void VisitVecDotProd(HVecDotProd* instruction) override {  // Also dumps the operand type.
+    VisitVecOperation(instruction);
+    DataType::Type arg_type = instruction->InputAt(1)->AsVecOperation()->GetPackedType();
+    StartAttributeStream("type") << (instruction->IsZeroExtending() ?
+                                    DataType::ToUnsigned(arg_type) :
+                                    DataType::ToSigned(arg_type));
+  }
+
 #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64)
   void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) override {
     StartAttributeStream("kind") << instruction->GetOpKind();
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 7d66155..12b180d 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -351,7 +351,10 @@
 
 // Translates vector operation to reduction kind.
 static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
-  if (reduction->IsVecAdd() || reduction->IsVecSub() || reduction->IsVecSADAccumulate()) {
+  if (reduction->IsVecAdd() ||
+      reduction->IsVecSub() ||
+      reduction->IsVecSADAccumulate() ||
+      reduction->IsVecDotProd()) {
     return HVecReduce::kSum;
   }
   LOG(FATAL) << "Unsupported SIMD reduction " << reduction->GetId();
@@ -431,6 +434,23 @@
   }
 }
 
+// Returns the narrowest type among a, b and, for type conversions, their respective inputs.
+static DataType::Type GetNarrowerType(HInstruction* a, HInstruction* b) {
+  DataType::Type type = a->GetType();
+  if (DataType::Size(b->GetType()) < DataType::Size(type)) {
+    type = b->GetType();
+  }
+  if (a->IsTypeConversion() &&
+      DataType::Size(a->InputAt(0)->GetType()) < DataType::Size(type)) {
+    type = a->InputAt(0)->GetType();
+  }
+  if (b->IsTypeConversion() &&
+      DataType::Size(b->InputAt(0)->GetType()) < DataType::Size(type)) {
+    type = b->InputAt(0)->GetType();
+  }
+  return type;
+}
+
 //
 // Public methods.
 //
@@ -1289,6 +1309,7 @@
     DataType::Type type = instruction->GetType();
     // Recognize SAD idiom or direct reduction.
     if (VectorizeSADIdiom(node, instruction, generate_code, type, restrictions) ||
+        VectorizeDotProdIdiom(node, instruction, generate_code, type, restrictions) ||
         (TrySetVectorType(type, &restrictions) &&
          VectorizeUse(node, instruction, generate_code, type, restrictions))) {
       if (generate_code) {
@@ -1531,11 +1552,11 @@
         case DataType::Type::kBool:
         case DataType::Type::kUint8:
         case DataType::Type::kInt8:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv | kNoReduction | kNoDotProd;
           return TrySetVectorLength(8);
         case DataType::Type::kUint16:
         case DataType::Type::kInt16:
-          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoDotProd;
           return TrySetVectorLength(4);
         case DataType::Type::kInt32:
           *restrictions |= kNoDiv | kNoWideSAD;
@@ -1580,12 +1601,23 @@
           case DataType::Type::kBool:
           case DataType::Type::kUint8:
           case DataType::Type::kInt8:
-            *restrictions |=
-                kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
+            *restrictions |= kNoMul |
+                             kNoDiv |
+                             kNoShift |
+                             kNoAbs |
+                             kNoSignedHAdd |
+                             kNoUnroundedHAdd |
+                             kNoSAD |
+                             kNoDotProd;
             return TrySetVectorLength(16);
           case DataType::Type::kUint16:
           case DataType::Type::kInt16:
-            *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
+            *restrictions |= kNoDiv |
+                             kNoAbs |
+                             kNoSignedHAdd |
+                             kNoUnroundedHAdd |
+                             kNoSAD |
+                             kNoDotProd;
             return TrySetVectorLength(8);
           case DataType::Type::kInt32:
             *restrictions |= kNoDiv | kNoSAD;
@@ -1610,11 +1642,11 @@
           case DataType::Type::kBool:
           case DataType::Type::kUint8:
           case DataType::Type::kInt8:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoDotProd;
             return TrySetVectorLength(16);
           case DataType::Type::kUint16:
           case DataType::Type::kInt16:
-            *restrictions |= kNoDiv | kNoStringCharAt;
+            *restrictions |= kNoDiv | kNoStringCharAt | kNoDotProd;
             return TrySetVectorLength(8);
           case DataType::Type::kInt32:
             *restrictions |= kNoDiv;
@@ -1639,11 +1671,11 @@
           case DataType::Type::kBool:
           case DataType::Type::kUint8:
           case DataType::Type::kInt8:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoDotProd;
             return TrySetVectorLength(16);
           case DataType::Type::kUint16:
           case DataType::Type::kInt16:
-            *restrictions |= kNoDiv | kNoStringCharAt;
+            *restrictions |= kNoDiv | kNoStringCharAt | kNoDotProd;
             return TrySetVectorLength(8);
           case DataType::Type::kInt32:
             *restrictions |= kNoDiv;
@@ -2071,18 +2103,7 @@
   HInstruction* r = a;
   HInstruction* s = b;
   bool is_unsigned = false;
-  DataType::Type sub_type = a->GetType();
-  if (DataType::Size(b->GetType()) < DataType::Size(sub_type)) {
-    sub_type = b->GetType();
-  }
-  if (a->IsTypeConversion() &&
-      DataType::Size(a->InputAt(0)->GetType()) < DataType::Size(sub_type)) {
-    sub_type = a->InputAt(0)->GetType();
-  }
-  if (b->IsTypeConversion() &&
-      DataType::Size(b->InputAt(0)->GetType()) < DataType::Size(sub_type)) {
-    sub_type = b->InputAt(0)->GetType();
-  }
+  DataType::Type sub_type = GetNarrowerType(a, b);
   if (reduction_type != sub_type &&
       (!IsNarrowerOperands(a, b, sub_type, &r, &s, &is_unsigned) || is_unsigned)) {
     return false;
@@ -2123,6 +2144,75 @@
   return false;
 }
 
+// Recognizes the following dot product idiom:
+//   q += a * b for operands a, b whose type is narrower than the reduction one,
+// provided that the operands have the same type or are promoted to a wider form.
+// Since this may involve a vector length change, the idiom is handled by going directly
+// to a dot product node (rather than relying on combining finer grained nodes later).
+bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
+                                              HInstruction* instruction,
+                                              bool generate_code,
+                                              DataType::Type reduction_type,
+                                              uint64_t restrictions) {
+  if (!instruction->IsAdd() || (reduction_type != DataType::Type::kInt32)) {
+    return false;
+  }
+
+  HInstruction* q = instruction->InputAt(0);  // q in the idiom above.
+  HInstruction* v = instruction->InputAt(1);  // Must be the product a * b.
+  if (!v->IsMul() || v->GetType() != reduction_type) {
+    return false;
+  }
+
+  HInstruction* a = v->InputAt(0);
+  HInstruction* b = v->InputAt(1);
+  HInstruction* r = a;
+  HInstruction* s = b;
+  DataType::Type op_type = GetNarrowerType(a, b);  // Candidate operand type.
+  bool is_unsigned = false;
+
+  if (!IsNarrowerOperands(a, b, op_type, &r, &s, &is_unsigned)) {
+    return false;
+  }
+  op_type = HVecOperation::ToProperType(op_type, is_unsigned);
+
+  if (!TrySetVectorType(op_type, &restrictions) ||
+      HasVectorRestrictions(restrictions, kNoDotProd)) {
+    return false;
+  }
+
+  DCHECK(r != nullptr && s != nullptr);
+  // Accept dot product idiom for vectorizable operands. Vectorized code uses the shorthand
+  // idiomatic operation. Sequential code uses the original scalar expressions.
+  if (generate_code && vector_mode_ != kVector) {  // de-idiom
+    r = a;
+    s = b;
+  }
+  if (VectorizeUse(node, q, generate_code, op_type, restrictions) &&
+      VectorizeUse(node, r, generate_code, op_type, restrictions) &&
+      VectorizeUse(node, s, generate_code, op_type, restrictions)) {
+    if (generate_code) {
+      if (vector_mode_ == kVector) {
+        vector_map_->Put(instruction, new (global_allocator_) HVecDotProd(
+            global_allocator_,
+            vector_map_->Get(q),
+            vector_map_->Get(r),
+            vector_map_->Get(s),
+            reduction_type,
+            is_unsigned,
+            GetOtherVL(reduction_type, op_type, vector_length_),
+            kNoDexPc));
+        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+      } else {
+        GenerateVecOp(v, vector_map_->Get(r), vector_map_->Get(s), reduction_type);
+        GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
 //
 // Vectorization heuristics.
 //
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 2b202fd..1a842c4 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -82,6 +82,7 @@
     kNoReduction     = 1 << 9,   // no reduction
     kNoSAD           = 1 << 10,  // no sum of absolute differences (SAD)
     kNoWideSAD       = 1 << 11,  // no sum of absolute differences (SAD) with operand widening
+    kNoDotProd       = 1 << 12,  // no dot product
   };
 
   /*
@@ -217,6 +218,11 @@
                          bool generate_code,
                          DataType::Type type,
                          uint64_t restrictions);
+  bool VectorizeDotProdIdiom(LoopNode* node,
+                             HInstruction* instruction,
+                             bool generate_code,
+                             DataType::Type type,
+                             uint64_t restrictions);
 
   // Vectorization heuristics.
   Alignment ComputeAlignment(HInstruction* offset,
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 68f1a24..76887f9 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1453,6 +1453,7 @@
   M(VecSetScalars, VecOperation)                                        \
   M(VecMultiplyAccumulate, VecOperation)                                \
   M(VecSADAccumulate, VecOperation)                                     \
+  M(VecDotProd, VecOperation)                                           \
   M(VecLoad, VecMemoryOperation)                                        \
   M(VecStore, VecMemoryOperation)                                       \
 
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c7539f2..597e399 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -1021,6 +1021,66 @@
   DEFAULT_COPY_CONSTRUCTOR(VecSADAccumulate);
 };
 
+// Performs a dot product of two vectors and adds the result to wider precision components in
+// the accumulator.
+//
+// viz. DOT_PRODUCT([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ]) =
+//                  [ a1 + sum(xi * yi), .. , am + sum(xj * yj) ],
+//      for m <= n, non-overlapping sums,
+//      for either both signed or both unsigned operands x, y.
+//
+// Notes:
+//   - packed type reflects the type of sum reduction, not the type of the operands.
+//   - IsZeroExtending() is used to determine the kind of signed/zero extension to be
+//     performed for the operands.
+//
+// TODO: Support types other than kInt32 for packed type.
+class HVecDotProd final : public HVecOperation {
+ public:
+  HVecDotProd(ArenaAllocator* allocator,
+              HInstruction* accumulator,
+              HInstruction* left,
+              HInstruction* right,
+              DataType::Type packed_type,  // Type of the sum reduction (accumulator lanes).
+              bool is_zero_extending,  // True to zero-extend, false to sign-extend operands.
+              size_t vector_length,
+              uint32_t dex_pc)
+    : HVecOperation(kVecDotProd,
+                    allocator,
+                    packed_type,
+                    SideEffects::None(),
+                    /* number_of_inputs */ 3,
+                    vector_length,
+                    dex_pc) {
+    DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+    DCHECK(DataType::IsIntegralType(packed_type));
+    DCHECK(left->IsVecOperation());
+    DCHECK(right->IsVecOperation());
+    DCHECK_EQ(ToSignedType(left->AsVecOperation()->GetPackedType()),
+              ToSignedType(right->AsVecOperation()->GetPackedType()));
+    SetRawInputAt(0, accumulator);
+    SetRawInputAt(1, left);
+    SetRawInputAt(2, right);
+    SetPackedFlag<kFieldHDotProdIsZeroExtending>(is_zero_extending);
+  }
+
+  bool IsZeroExtending() const { return GetPackedFlag<kFieldHDotProdIsZeroExtending>(); }
+
+  bool CanBeMoved() const override { return true; }
+
+  DECLARE_INSTRUCTION(VecDotProd);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(VecDotProd);
+
+ private:
+  // Additional packed bits: a single flag stored right after the HVecOperation bits.
+  static constexpr size_t kFieldHDotProdIsZeroExtending =
+      HVecOperation::kNumberOfVectorOpPackedBits;
+  static constexpr size_t kNumberOfHDotProdPackedBits = kFieldHDotProdIsZeroExtending + 1;
+  static_assert(kNumberOfHDotProdPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
+};
+
 // Loads a vector from memory, viz. load(mem, 1)
 // yield the vector [ mem(1), .. , mem(n) ].
 class HVecLoad final : public HVecMemoryOperation {
diff --git a/test/684-checker-simd-dotprod/expected.txt b/test/684-checker-simd-dotprod/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/684-checker-simd-dotprod/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/684-checker-simd-dotprod/info.txt b/test/684-checker-simd-dotprod/info.txt
new file mode 100644
index 0000000..6c1efb6
--- /dev/null
+++ b/test/684-checker-simd-dotprod/info.txt
@@ -0,0 +1 @@
+Functional tests on dot product idiom SIMD vectorization.
diff --git a/test/684-checker-simd-dotprod/src/Main.java b/test/684-checker-simd-dotprod/src/Main.java
new file mode 100644
index 0000000..e0c8716
--- /dev/null
+++ b/test/684-checker-simd-dotprod/src/Main.java
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import other.TestByte;
+import other.TestCharShort;
+import other.TestVarious;
+
+/**
+ * Tests for dot product idiom vectorization; runs every test class in package other.
+ */
+public class Main {
+  public static void main(String[] args) {
+     TestByte.run();
+     TestCharShort.run();
+     TestVarious.run();
+     System.out.println("passed");  // Must match the contents of expected.txt.
+  }
+}
diff --git a/test/684-checker-simd-dotprod/src/other/TestByte.java b/test/684-checker-simd-dotprod/src/other/TestByte.java
new file mode 100644
index 0000000..9acfc59
--- /dev/null
+++ b/test/684-checker-simd-dotprod/src/other/TestByte.java
@@ -0,0 +1,484 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package other;
+
+/**
+ * Tests for dot product idiom vectorization: byte case.
+ */
+public class TestByte {
+
+  public static final int ARRAY_SIZE = 1024;  // NOTE(review): not referenced in this class.
+
+  /// CHECK-START: int other.TestByte.testDotProdSimple(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<Get1>>,<<Get2>>]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdSimple(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Load2>>] type:Int8   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdSimple(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {  // Bounded by b only; a must be at least as long.
+      int temp = a[i] * b[i];  // Signed bytes, widened to int before the multiply.
+      s += temp;  // ARM64 CHECK lines above expect VecDotProd type:Int8 here.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdComplex(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC1:i\d+>>   Add [<<Get1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:b\d+>>  TypeConversion [<<AddC1>>]                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC2:i\d+>>   Add [<<Get2>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:b\d+>>  TypeConversion [<<AddC2>>]                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<TypeC1>>,<<TypeC2>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdComplex(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const1>>]                       loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd1:d\d+>>   VecAdd [<<Load1>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd2:d\d+>>   VecAdd [<<Load2>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<VAdd1>>,<<VAdd2>>] type:Int8   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdComplex(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = ((byte)(a[i] + 1)) * ((byte)(b[i] + 1));  // Each sum wraps to signed byte.
+      s += temp;  // ARM64 CHECK lines above expect VecDotProd type:Int8 here.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleUnsigned(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:a\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:a\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<Get1>>,<<Get2>>]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdSimpleUnsigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Load2>>] type:Uint8  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdSimpleUnsigned(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = (a[i] & 0xff) * (b[i] & 0xff);  // Masks make both operands unsigned bytes.
+      s += temp;  // ARM64 CHECK lines above expect VecDotProd type:Uint8 here.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdComplexUnsigned(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:a\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC:i\d+>>    Add [<<Get1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:a\d+>>  TypeConversion [<<AddC>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:a\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddGets:i\d+>> Add [<<Get2>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:a\d+>>  TypeConversion [<<AddGets>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<TypeC1>>,<<TypeC2>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdComplexUnsigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const1>>]                       loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd1:d\d+>>   VecAdd [<<Load1>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd2:d\d+>>   VecAdd [<<Load2>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<VAdd1>>,<<VAdd2>>] type:Uint8  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdComplexUnsigned(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {  // Operands: unsigned byte plus one, re-masked.
+      int temp = (((a[i] & 0xff) + 1) & 0xff) * (((b[i] & 0xff) + 1) & 0xff);  // Wraps mod 256.
+      s += temp;  // ARM64 CHECK lines above expect VecDotProd type:Uint8 here.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdComplexUnsignedCastedToSigned(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:a\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC:i\d+>>    Add [<<Get1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:b\d+>>  TypeConversion [<<AddC>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:a\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddGets:i\d+>> Add [<<Get2>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:b\d+>>  TypeConversion [<<AddGets>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<TypeC1>>,<<TypeC2>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdComplexUnsignedCastedToSigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const1>>]                       loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd1:d\d+>>   VecAdd [<<Load1>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd2:d\d+>>   VecAdd [<<Load2>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<VAdd1>>,<<VAdd2>>] type:Int8   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdComplexUnsignedCastedToSigned(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {  // Operands: unsigned byte plus one, cast to byte.
+      int temp = ((byte)((a[i] & 0xff) + 1)) * ((byte)((b[i] & 0xff) + 1));  // Ends up signed.
+      s += temp;  // ARM64 CHECK lines above expect VecDotProd type:Int8 here.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdComplexSignedCastedToUnsigned(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC:i\d+>>    Add [<<Get1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:a\d+>>  TypeConversion [<<AddC>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddGets:i\d+>> Add [<<Get2>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:a\d+>>  TypeConversion [<<AddGets>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<TypeC1>>,<<TypeC2>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdComplexSignedCastedToUnsigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const1>>]                       loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd1:d\d+>>   VecAdd [<<Load1>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd2:d\d+>>   VecAdd [<<Load2>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<VAdd1>>,<<VAdd2>>] type:Uint8  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdComplexSignedCastedToUnsigned(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = ((a[i] + 1) & 0xff) * ((b[i] + 1) & 0xff);  // Signed sums masked to unsigned.
+      s += temp;  // ARM64 CHECK lines above expect VecDotProd type:Uint8 here.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdSignedWidening(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG:                  VecDotProd type:Int8
+  public static final int testDotProdSignedWidening(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = ((short)(a[i])) * ((short)(b[i]));  // Widening casts keep the byte values.
+      s += temp;  // ARM64 CHECK line above expects VecDotProd type:Int8 here.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdParamSigned(int, byte[]) loop_optimization (after)
+  /// CHECK-DAG:                  VecDotProd type:Int8
+  public static final int testDotProdParamSigned(int x, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = (byte)(x) * b[i];  // Loop-invariant x narrowed to signed byte.
+      s += temp;  // ARM64 CHECK line above expects VecDotProd type:Int8 here.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START-{ARM64}: int other.TestByte.testDotProdParamUnsigned(int, byte[]) loop_optimization (after)
+  /// CHECK-DAG:                  VecDotProd type:Uint8
+  public static final int testDotProdParamUnsigned(int x, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = (x & 0xff) * (b[i] & 0xff);  // Loop-invariant x masked to unsigned byte.
+      s += temp;  // ARM64 CHECK line above expects VecDotProd type:Uint8 here.
+    }
+    return s - 1;
+  }
+
+  // Cases that must not be vectorized as a dot product (no VecDotProd expected).
+
+  /// CHECK-START: int other.TestByte.testDotProdIntParam(int, byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdIntParam(int x, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = b[i] * (x);  // x stays a full int rather than a byte-sized operand.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSignedToChar(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSignedToChar(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = ((char)(a[i])) * ((char)(b[i]));  // Bytes converted to 16-bit unsigned char.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  // Cases where the result of Mul is type-converted are not supported.
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleCastedToSignedByte(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleCastedToSignedByte(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      byte temp = (byte)(a[i] * b[i]);  // The product itself is narrowed to byte.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleCastedToUnsignedByte(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleCastedToUnsignedByte(byte[] a, byte[] b) {
+    int s = 1;  // Undone by "s - 1" below. No VecDotProd expected (CHECK-NOT above).
+    for (int i = 0; i < b.length; i++) {
+      s += (a[i] * b[i]) & 0xff;  // The product itself is masked to 8 bits.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleUnsignedCastedToSignedByte(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedCastedToSignedByte(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      byte temp = (byte)((a[i] & 0xff) * (b[i] & 0xff));  // Unsigned product narrowed to byte.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleUnsignedCastedToUnsignedByte(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedCastedToUnsignedByte(byte[] a, byte[] b) {
+    int s = 1;  // Undone by "s - 1" below. No VecDotProd expected (CHECK-NOT above).
+    for (int i = 0; i < b.length; i++) {
+      s += ((a[i] & 0xff) * (b[i] & 0xff)) & 0xff;  // Unsigned product masked to 8 bits.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleCastedToShort(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleCastedToShort(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      short temp = (short)(a[i] * b[i]);  // The product itself is narrowed to short.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleCastedToChar(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleCastedToChar(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      char temp = (char)(a[i] * b[i]);  // The product itself is narrowed to char.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleUnsignedCastedToShort(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedCastedToShort(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      short temp = (short)((a[i] & 0xff) * (b[i] & 0xff));  // Product narrowed to short.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleUnsignedCastedToChar(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedCastedToChar(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      char temp = (char)((a[i] & 0xff) * (b[i] & 0xff));  // Product narrowed to char.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdSimpleUnsignedCastedToLong(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedCastedToLong(byte[] a, byte[] b) {
+    int s = 1;  // Undone by "s - 1" below. No VecDotProd expected (CHECK-NOT above).
+    for (int i = 0; i < b.length; i++) {
+      long temp = (long)((a[i] & 0xff) * (b[i] & 0xff));  // int product widened to long.
+      s += temp;  // Compound assignment implicitly narrows the long sum back to int.
+    }
+    return s - 1;
+  }
+
+  /// CHECK-START: int other.TestByte.testDotProdUnsignedSigned(byte[], byte[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdUnsignedSigned(byte[] a, byte[] b) {
+    int s = 1;  // Non-zero initial accumulator; undone by "s - 1" below.
+    for (int i = 0; i < b.length; i++) {
+      int temp = (a[i] & 0xff) * b[i];  // Mixed signedness: unsigned byte times signed byte.
+      s += temp;  // CHECK-NOT above expects no VecDotProd for this loop.
+    }
+    return s - 1;
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {  // Mismatch: abort with an Error that names both values.
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void testDotProd(byte[] b1, byte[] b2, int[] results) {
+    expectEquals(results[0], testDotProdSimple(b1, b2));  // results[] follows this call order.
+    expectEquals(results[1], testDotProdComplex(b1, b2));
+    expectEquals(results[2], testDotProdSimpleUnsigned(b1, b2));
+    expectEquals(results[3], testDotProdComplexUnsigned(b1, b2));
+    expectEquals(results[4], testDotProdComplexUnsignedCastedToSigned(b1, b2));
+    expectEquals(results[5], testDotProdComplexSignedCastedToUnsigned(b1, b2));
+    expectEquals(results[6], testDotProdSignedWidening(b1, b2));
+    expectEquals(results[7], testDotProdParamSigned(-128, b2));  // b1 is not used here.
+    expectEquals(results[8], testDotProdParamUnsigned(-128, b2));  // -128 & 0xff == 128.
+    expectEquals(results[9], testDotProdIntParam(-128, b2));  // -128 used as a full int.
+    expectEquals(results[10], testDotProdSignedToChar(b1, b2));
+    expectEquals(results[11], testDotProdSimpleCastedToSignedByte(b1, b2));
+    expectEquals(results[12], testDotProdSimpleCastedToUnsignedByte(b1, b2));
+    expectEquals(results[13], testDotProdSimpleUnsignedCastedToSignedByte(b1, b2));
+    expectEquals(results[14], testDotProdSimpleUnsignedCastedToUnsignedByte(b1, b2));
+    expectEquals(results[15], testDotProdSimpleCastedToShort(b1, b2));
+    expectEquals(results[16], testDotProdSimpleCastedToChar(b1, b2));
+    expectEquals(results[17], testDotProdSimpleUnsignedCastedToShort(b1, b2));
+    expectEquals(results[18], testDotProdSimpleUnsignedCastedToChar(b1, b2));
+    expectEquals(results[19], testDotProdSimpleUnsignedCastedToLong(b1, b2));
+    expectEquals(results[20], testDotProdUnsignedSigned(b1, b2));
+  }
+
+  public static void run() {  // Checks every method on four pairs of 16-element arrays.
+    byte[] b1_1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 127, 127, 127 };
+    byte[] b2_1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 127, 127, 127 };
+    int[] results_1 = { 64516, 65548, 64516, 65548, 65548, 65548, 64516, -65024, 65024, -65024,
+                        64516, 4, 4, 4, 4, 64516, 64516, 64516, 64516, 64516, 64516 };
+    testDotProd(b1_1, b2_1, results_1);  // Twelve 0 * 0 pairs, then four 127 * 127 pairs.
+
+    byte[] b1_2 = { 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 127, 127, 127 };
+    byte[] b2_2 = { 127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, 127, 127, 127 };
+    int[] results_2 = { 80645, 81931, 80645, 81931, 81931, 81931, 80645, -81280, 81280, -81280,
+                        80645, 5, 5, 5, 5, 80645, 80645, 80645, 80645, 80645, 80645 };
+    testDotProd(b1_2, b2_2, results_2);  // One leading and four trailing 127 * 127 pairs.
+
+    byte[] b1_3 = { -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128, -128, -128, -128 };
+    byte[] b2_3 = {  127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  127,  127,  127,  127 };
+    int[] results_3 = { -81280, 81291, 81280, 82571, 81291, 82571, -81280, -81280, 81280, -81280,
+                        41534080, -640, 640, -640, 640, -81280, 246400, 81280, 81280, 81280, 81280 };
+    testDotProd(b1_3, b2_3, results_3);  // Five -128 * 127 pairs; the rest are 0 * 0.
+
+    byte[] b1_4 = { -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128, -128, -128, -128 };
+    byte[] b2_4 = { -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128, -128, -128, -128 };
+    int[] results_4 = { 81920, 80656, 81920, 83216, 80656, 83216, 81920, 81920, 81920, 81920,
+                       -83804160, 0, 0, 0, 0, 81920, 81920, 81920, 81920, 81920, -81920 };
+    testDotProd(b1_4, b2_4, results_4);  // Five -128 * -128 pairs; the rest are 0 * 0.
+  }
+
+  public static void main(String[] args) {  // Lets this class be run on its own.
+    run();
+  }
+}
diff --git a/test/684-checker-simd-dotprod/src/other/TestCharShort.java b/test/684-checker-simd-dotprod/src/other/TestCharShort.java
new file mode 100644
index 0000000..9cb9db5
--- /dev/null
+++ b/test/684-checker-simd-dotprod/src/other/TestCharShort.java
@@ -0,0 +1,552 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package other;
+
+/**
+ * Tests for dot product idiom vectorization: char and short case.
+ */
+public class TestCharShort {
+
+  public static final int ARRAY_SIZE = 1024;
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimple(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<Get1>>,<<Get2>>]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdSimple(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const8:i\d+>>  IntConstant 8                                         loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Load2>>] type:Int16  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const8>>]                             loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdSimple(short[] a, short[] b) {
+    int s = 1;  // Accumulates a[i] * b[i]; the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = a[i] * b[i];  // short * short, evaluated as int.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdComplex(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC1:i\d+>>   Add [<<Get1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:s\d+>>  TypeConversion [<<AddC1>>]                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC2:i\d+>>   Add [<<Get2>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:s\d+>>  TypeConversion [<<AddC2>>]                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<TypeC1>>,<<TypeC2>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdComplex(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const8:i\d+>>  IntConstant 8                                         loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const1>>]                       loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd1:d\d+>>   VecAdd [<<Load1>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd2:d\d+>>   VecAdd [<<Load2>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<VAdd1>>,<<VAdd2>>] type:Int16  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const8>>]                             loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdComplex(short[] a, short[] b) {
+    int s = 1;  // Accumulates (a[i] + 1) * (b[i] + 1); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = ((short)(a[i] + 1)) * ((short)(b[i] + 1));  // Each sum is narrowed to short.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleUnsigned(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<Get1>>,<<Get2>>]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdSimpleUnsigned(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const8:i\d+>>  IntConstant 8                                         loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Load2>>] type:Uint16 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const8>>]                             loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdSimpleUnsigned(char[] a, char[] b) {
+    int s = 1;  // Accumulates a[i] * b[i] over char inputs; the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = a[i] * b[i];  // char * char, evaluated as int.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdComplexUnsigned(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC:i\d+>>    Add [<<Get1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:c\d+>>  TypeConversion [<<AddC>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddGets:i\d+>> Add [<<Get2>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:c\d+>>  TypeConversion [<<AddGets>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<TypeC1>>,<<TypeC2>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdComplexUnsigned(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const8:i\d+>>  IntConstant 8                                         loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const1>>]                       loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd1:d\d+>>   VecAdd [<<Load1>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd2:d\d+>>   VecAdd [<<Load2>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<VAdd1>>,<<VAdd2>>] type:Uint16 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const8>>]                             loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdComplexUnsigned(char[] a, char[] b) {
+    int s = 1;  // Accumulates (a[i] + 1) * (b[i] + 1); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = ((char)(a[i] + 1)) * ((char)(b[i] + 1));  // Each sum is narrowed to char.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdComplexUnsignedCastedToSigned(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC:i\d+>>    Add [<<Get1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:s\d+>>  TypeConversion [<<AddC>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddGets:i\d+>> Add [<<Get2>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:s\d+>>  TypeConversion [<<AddGets>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<TypeC1>>,<<TypeC2>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdComplexUnsignedCastedToSigned(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const8:i\d+>>  IntConstant 8                                         loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const1>>]                       loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd1:d\d+>>   VecAdd [<<Load1>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd2:d\d+>>   VecAdd [<<Load2>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<VAdd1>>,<<VAdd2>>] type:Int16  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const8>>]                             loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdComplexUnsignedCastedToSigned(char[] a, char[] b) {
+    int s = 1;  // Accumulates products of short-cast operands; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = ((short)(a[i] + 1)) * ((short)(b[i] + 1));  // char input, short operand.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdComplexSignedCastedToUnsigned(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddC:i\d+>>    Add [<<Get1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:c\d+>>  TypeConversion [<<AddC>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddGets:i\d+>> Add [<<Get2>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:c\d+>>  TypeConversion [<<AddGets>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<TypeC1>>,<<TypeC2>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdComplexSignedCastedToUnsigned(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const8:i\d+>>  IntConstant 8                                         loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const1>>]                       loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd1:d\d+>>   VecAdd [<<Load1>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<VAdd2:d\d+>>   VecAdd [<<Load2>>,<<Repl>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<VAdd1>>,<<VAdd2>>] type:Uint16 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const8>>]                             loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdComplexSignedCastedToUnsigned(short[] a, short[] b) {
+    int s = 1;  // Accumulates products of char-cast operands; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = ((char)(a[i] + 1)) * ((char)(b[i] + 1));  // short input, char operand.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdSignedToInt(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG:                  VecDotProd type:Int16
+  public static final int testDotProdSignedToInt(short[] a, short[] b) {
+    int s = 1;  // Accumulates a[i] * b[i]; the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = ((int)(a[i])) * ((int)(b[i]));  // Explicit (int) casts on both shorts.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdParamSigned(int, short[]) loop_optimization (after)
+  /// CHECK-DAG:                  VecDotProd type:Int16
+  public static final int testDotProdParamSigned(int x, short[] b) {
+    int s = 1;  // Accumulates (short) x * b[i]; the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {
+      int temp = (short)(x) * b[i];  // x narrowed to short; same value every iteration.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdParamUnsigned(int, char[]) loop_optimization (after)
+  /// CHECK-DAG:                  VecDotProd type:Uint16
+  public static final int testDotProdParamUnsigned(int x, char[] b) {
+    int s = 1;  // Accumulates (char) x * b[i]; the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {
+      int temp = (char)(x) * b[i];  // x narrowed to char; same value every iteration.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdIntParam(int, short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdIntParam(int x, short[] b) {
+    int s = 1;  // Accumulates b[i] * x; the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {
+      int temp = b[i] * (x);  // x enters the product at full int width, with no narrowing cast.
+      s += temp;  // Per the CHECK-NOT above, no VecDotProd is expected for this shape.
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START-{ARM64}: int other.TestCharShort.testDotProdSignedToChar(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG:                  VecDotProd type:Uint16
+  public static final int testDotProdSignedToChar(short[] a, short[] b) {
+    int s = 1;  // Accumulates products of char-cast elements; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = ((char)(a[i])) * ((char)(b[i]));  // Both shorts are converted to char first.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  // Cases when result of Mul is type-converted are not supported.
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleMulCastedToSigned(short[], short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleMulCastedToSigned(short[] a, short[] b) {
+    int s = 1;  // Accumulates (short)(a[i] * b[i]); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      short temp = (short)(a[i] * b[i]);  // The int product itself is narrowed to short.
+      s += temp;  // temp widens back to int for the add.
+    }
+    return s - 1;  // Sum of the narrowed products, in wrapping int arithmetic.
+  }
+
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleMulCastedToUnsigned(short[], short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleMulCastedToUnsigned(short[] a, short[] b) {
+    int s = 1;  // Accumulates (char)(a[i] * b[i]); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      char temp = (char)(a[i] * b[i]);  // The int product itself is narrowed to char.
+      s += temp;  // temp widens back to int for the add.
+    }
+    return s - 1;  // Sum of the narrowed products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleUnsignedMulCastedToSigned(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedMulCastedToSigned(char[] a, char[] b) {
+    int s = 1;  // Accumulates (short)(a[i] * b[i]); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      short temp = (short)(a[i] * b[i]);  // char * char as int, then narrowed to short.
+      s += temp;  // temp widens back to int for the add.
+    }
+    return s - 1;  // Sum of the narrowed products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleUnsignedMulCastedToUnsigned(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedMulCastedToUnsigned(char[] a, char[] b) {
+    int s = 1;  // Accumulates (char)(a[i] * b[i]); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      char temp = (char)(a[i] * b[i]);  // char * char as int, then narrowed to char.
+      s += temp;  // temp widens back to int for the add.
+    }
+    return s - 1;  // Sum of the narrowed products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleCastedToShort(short[], short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleCastedToShort(short[] a, short[] b) {
+    int s = 1;  // Accumulates (short)(a[i] * b[i]); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      short temp = (short)(a[i] * b[i]);  // The int product itself is narrowed to short.
+      s += temp;  // temp widens back to int for the add.
+    }
+    return s - 1;  // Sum of the narrowed products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleCastedToChar(short[], short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleCastedToChar(short[] a, short[] b) {
+    int s = 1;  // Accumulates (char)(a[i] * b[i]); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      char temp = (char)(a[i] * b[i]);  // The int product itself is narrowed to char.
+      s += temp;  // temp widens back to int for the add.
+    }
+    return s - 1;  // Sum of the narrowed products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleUnsignedCastedToShort(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedCastedToShort(char[] a, char[] b) {
+    int s = 1;  // Accumulates (short)(a[i] * b[i]); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      short temp = (short)(a[i] * b[i]);  // char * char as int, then narrowed to short.
+      s += temp;  // temp widens back to int for the add.
+    }
+    return s - 1;  // Sum of the narrowed products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleUnsignedCastedToChar(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedCastedToChar(char[] a, char[] b) {
+    int s = 1;  // Accumulates (char)(a[i] * b[i]); the seed of 1 is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      char temp = (char)(a[i] * b[i]);  // char * char as int, then narrowed to char.
+      s += temp;  // temp widens back to int for the add.
+    }
+    return s - 1;  // Sum of the narrowed products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSimpleUnsignedCastedToLong(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSimpleUnsignedCastedToLong(char[] a, char[] b) {
+    int s = 1;  // Accumulates a[i] * b[i] through a long temporary; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      long temp = (long)(a[i] * b[i]);  // The multiply is done in int, then widened to long.
+      s += temp;  // Compound assignment narrows the long sum back to int.
+    }
+    return s - 1;  // Sum of the products, truncated to int.
+  }
+
+  // Narrowing conversions.
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSignedNarrowerSigned(short[], short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSignedNarrowerSigned(short[] a, short[] b) {
+    int s = 1;  // Accumulates products of byte-cast elements; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = ((byte)(a[i])) * ((byte)(b[i]));  // Each short is narrowed to byte first.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdSignedNarrowerUnsigned(short[], short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdSignedNarrowerUnsigned(short[] a, short[] b) {
+    int s = 1;  // Accumulates products of masked elements; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = (a[i] & 0xff) * (b[i] & 0xff);  // Only the low 8 bits of each short are used.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdUnsignedNarrowerSigned(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdUnsignedNarrowerSigned(char[] a, char[] b) {
+    int s = 1;  // Accumulates products of byte-cast elements; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = ((byte)(a[i])) * ((byte)(b[i]));  // Each char is narrowed to byte first.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdUnsignedNarrowerUnsigned(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdUnsignedNarrowerUnsigned(char[] a, char[] b) {
+    int s = 1;  // Accumulates products of masked elements; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = (a[i] & 0xff) * (b[i] & 0xff);  // Only the low 8 bits of each char are used.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  /// CHECK-START: int other.TestCharShort.testDotProdUnsignedSigned(char[], short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdUnsignedSigned(char[] a, short[] b) {
+    int s = 1;  // Accumulates a[i] * b[i] for mixed operand types; the seed is removed on return.
+    for (int i = 0; i < b.length; i++) {  // Trip count comes from b alone.
+      int temp = a[i] * b[i];  // char (zero-extended) times short (sign-extended), as int.
+      s += temp;
+    }
+    return s - 1;  // Sum of the products, in wrapping int arithmetic.
+  }
+
+  private static void expectEquals(int expected, int result) {
+    // Success path first: equal values need no report.
+    if (expected == result) { return; }
+    throw new Error("Expected: " + expected + ", found: " + result);
+  }
+
+  private static void testDotProd(short[] s1, short[] s2, char[] c1, char[] c2, int[] results) {
+    expectEquals(results[0], testDotProdSimple(s1, s2));  // results[k] is the k-th expectation.
+    expectEquals(results[1], testDotProdComplex(s1, s2));
+    expectEquals(results[2], testDotProdSimpleUnsigned(c1, c2));
+    expectEquals(results[3], testDotProdComplexUnsigned(c1, c2));
+    expectEquals(results[4], testDotProdComplexUnsignedCastedToSigned(c1, c2));
+    expectEquals(results[5], testDotProdComplexSignedCastedToUnsigned(s1, s2));
+    expectEquals(results[6], testDotProdSignedToInt(s1, s2));
+    expectEquals(results[7], testDotProdParamSigned(-32768, s2));  // Fixed scalar operand.
+    expectEquals(results[8], testDotProdParamUnsigned(-32768, c2));  // Fixed scalar operand.
+    expectEquals(results[9], testDotProdIntParam(-32768, s2));  // Fixed scalar operand.
+    expectEquals(results[10], testDotProdSignedToChar(s1, s2));
+    expectEquals(results[11], testDotProdSimpleMulCastedToSigned(s1, s2));
+    expectEquals(results[12], testDotProdSimpleMulCastedToUnsigned(s1, s2));
+    expectEquals(results[13], testDotProdSimpleUnsignedMulCastedToSigned(c1, c2));
+    expectEquals(results[14], testDotProdSimpleUnsignedMulCastedToUnsigned(c1, c2));
+    expectEquals(results[15], testDotProdSimpleCastedToShort(s1, s2));
+    expectEquals(results[16], testDotProdSimpleCastedToChar(s1, s2));
+    expectEquals(results[17], testDotProdSimpleUnsignedCastedToShort(c1, c2));
+    expectEquals(results[18], testDotProdSimpleUnsignedCastedToChar(c1, c2));
+    expectEquals(results[19], testDotProdSimpleUnsignedCastedToLong(c1, c2));
+    expectEquals(results[20], testDotProdSignedNarrowerSigned(s1, s2));
+    expectEquals(results[21], testDotProdSignedNarrowerUnsigned(s1, s2));
+    expectEquals(results[22], testDotProdUnsignedNarrowerSigned(c1, c2));
+    expectEquals(results[23], testDotProdUnsignedNarrowerUnsigned(c1, c2));
+    expectEquals(results[24], testDotProdUnsignedSigned(c1, s2));  // Mixed: char c1 with short s2.
+  }
+
+  public static void run() {
+    final short MAX_S = Short.MAX_VALUE;  // Extreme 16-bit inputs, so some int sums wrap.
+    final short MIN_S = Short.MIN_VALUE;
+    final char MIN_C = (char) MIN_S;  // 0x8000: the bits of MIN_S read as an unsigned char.
+
+    short[] s1_1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+    short[] s2_1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+    char[]  c1_1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+    char[]  c2_1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+    int[] results_1 = { 2147352578, -2147483634, 2147352578, -2147483634, -2147483634, -2147483634,
+                        2147352578, -2147418112, 2147418112, -2147418112, 2147352578,
+                        2, 2, 2, 2, 2, 2, 2, 2, 2147352578, 2, 130050, 2, 130050, 2147352578 };
+    testDotProd(s1_1, s2_1, c1_1, c2_1, results_1);  // Two MAX * MAX element pairs.
+
+    short[] s1_2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S, MAX_S, MAX_S };
+    short[] s2_2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S, MAX_S, MAX_S };
+    char[]  c1_2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S, MAX_S, MAX_S };
+    char[]  c2_2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S, MAX_S, MAX_S };
+    int[] results_2 = { -262140, 12, -262140, 12, 12, 12, -262140, 131072, -131072, 131072,
+                        -262140, 4, 4, 4, 4, 4, 4, 4, 4, -262140, 4, 260100, 4, 260100, -262140 };
+    testDotProd(s1_2, s2_2, c1_2, c2_2, results_2);  // Four MAX * MAX element pairs.
+
+    short[] s1_3 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MIN_S, MIN_S };
+    short[] s2_3 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+    char[]  c1_3 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MIN_C, MIN_C };
+    char[]  c2_3 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+    int[] results_3 = { -2147418112, 2147418126, 2147418112, -2147418098, 2147418126,
+                        -2147418098, -2147418112, -2147418112, 2147418112, -2147418112,
+                        2147418112, -65536, 65536, -65536, 65536, -65536, 65536, -65536, 65536,
+                        2147418112, 0, 0, 0, 0, 2147418112 };
+    testDotProd(s1_3, s2_3, c1_3, c2_3, results_3);  // Two MIN * MAX element pairs.
+
+    short[] s1_4 = { MIN_S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MIN_S, MIN_S };
+    short[] s2_4 = { MIN_S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MIN_S, MIN_S };
+    char[]  c1_4 = { MIN_C, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MIN_C, MIN_C };
+    char[]  c2_4 = { MIN_C, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MIN_C, MIN_C };
+    int[] results_4 = { -1073741824, -1073938416, -1073741824, -1073545200, -1073938416,
+                        -1073545200, -1073741824, -1073741824, -1073741824, -1073741824,
+                        -1073741824, 0, 0, 0, 0, 0, 0, 0, 0, -1073741824, 0, 0, 0, 0,
+                        1073741824 };
+    testDotProd(s1_4, s2_4, c1_4, c2_4, results_4);  // Three MIN * MIN element pairs.
+  }
+
+  public static void main(String[] args) {
+    run();  // Entry point only delegates to run(); args is not read.
+  }
+}
diff --git a/test/684-checker-simd-dotprod/src/other/TestVarious.java b/test/684-checker-simd-dotprod/src/other/TestVarious.java
new file mode 100644
index 0000000..3f46098
--- /dev/null
+++ b/test/684-checker-simd-dotprod/src/other/TestVarious.java
@@ -0,0 +1,422 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package other;
+
+/**
+ * Tests for dot product idiom vectorization.
+ */
+public class TestVarious {
+
+  /// CHECK-START: int other.TestVarious.testDotProdConstRight(byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const89:i\d+>> IntConstant 89                                        loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<Get1>>,<<Const89>>]                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestVarious.testDotProdConstRight(byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Const89:i\d+>> IntConstant 89                                        loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const89>>]                      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Repl>>] type:Int8    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdConstRight(byte[] b) {
+    int sum = 1;
+    for (int idx = 0; idx < b.length; idx++) {
+      // Sign-extended byte element multiplied by a constant on the right.
+      sum += b[idx] * 89;
+    }
+    return sum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdConstLeft(byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const89:i\d+>> IntConstant 89                                        loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:a\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<Get1>>,<<Const89>>]                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestVarious.testDotProdConstLeft(byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Const89:i\d+>> IntConstant 89                                        loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const89>>]                      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Repl>>] type:Uint8   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdConstLeft(byte[] b) {
+    int sum = 1;
+    for (int idx = 0; idx < b.length; idx++) {
+      // Constant on the left; the 0xff mask zero-extends the byte element.
+      sum += 89 * (b[idx] & 0xff);
+    }
+    return sum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdLoopInvariantConvRight(byte[], int) loop_optimization (before)
+  /// CHECK-DAG: <<Param:i\d+>>   ParameterValue                                        loop:none
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<ConstL:i\d+>>  IntConstant 129                                       loop:none
+  /// CHECK-DAG: <<AddP:i\d+>>    Add [<<Param>>,<<ConstL>>]                            loop:none
+  /// CHECK-DAG: <<TypeCnv:b\d+>> TypeConversion [<<AddP>>]                             loop:none
+  //
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]                          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<Get1>>,<<TypeCnv>>]                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                             loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestVarious.testDotProdLoopInvariantConvRight(byte[], int) loop_optimization (after)
+  /// CHECK-DAG: <<Param:i\d+>>   ParameterValue                                        loop:none
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<ConstL:i\d+>>  IntConstant 129                                       loop:none
+  /// CHECK-DAG: <<AddP:i\d+>>    Add [<<Param>>,<<ConstL>>]                            loop:none
+  /// CHECK-DAG: <<TypeCnv:b\d+>> TypeConversion [<<AddP>>]                             loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<TypeCnv>>]                      loop:none
+  //
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Repl>>] type:Int8    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                                  loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]                         loop:none
+  public static final int testDotProdLoopInvariantConvRight(byte[] b, int param) {
+    int sum = 1;
+    for (int idx = 0; idx < b.length; idx++) {
+      // The right operand is narrowed to byte and does not depend on idx.
+      sum += b[idx] * ((byte)(param + 129));
+    }
+    return sum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdByteToChar(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdByteToChar(char[] a, char[] b) {
+    int sum = 1;
+    for (int idx = 0; idx < b.length; idx++) {
+      // Left operand is narrowed to byte, then converted back to char, before the multiply.
+      sum += ((char)((byte)(a[idx] + 129))) * b[idx];
+    }
+    return sum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdMixedSize(byte[], short[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdMixedSize(byte[] a, short[] b) {
+    int sum = 1;
+    for (int idx = 0; idx < b.length; idx++) {
+      // Operand widths differ: byte element times short element.
+      sum += a[idx] * b[idx];
+    }
+    return sum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdMixedSizeAndSign(byte[], char[]) loop_optimization (after)
+  /// CHECK-NOT:                  VecDotProd
+  public static final int testDotProdMixedSizeAndSign(byte[] a, char[] b) {
+    int sum = 1;
+    for (int idx = 0; idx < b.length; idx++) {
+      // Signed byte element times unsigned char element.
+      sum += a[idx] * b[idx];
+    }
+    return sum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdInt32(int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                             loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                             loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:i\d+>>     Mul [<<Get1>>,<<Get2>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul>>]                    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                 loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestVarious.testDotProdInt32(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                             loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                             loop:none
+  /// CHECK-DAG: <<Set:d\d+>>     VecSetScalars [<<Const1>>]                loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set>>,{{d\d+}}]                    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul:d\d+>>     VecMul [<<Load1>>,<<Load2>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecAdd [<<Phi2>>,<<Mul>>]                 loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-DAG: <<Reduce:d\d+>>  VecReduce [<<Phi2>>]                      loop:none
+  /// CHECK-DAG:                  VecExtractScalar [<<Reduce>>]             loop:none
+  public static final int testDotProdInt32(int[] a, int[] b) {
+    int sum = 1;
+    for (int idx = 0; idx < b.length; idx++) {
+      // Both operands are already 32-bit ints.
+      sum += a[idx] * b[idx];
+    }
+    return sum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdBothSignedUnsigned1(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                             loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                             loop:none
+  /// CHECK-DAG: <<Const2:i\d+>>  IntConstant 2                             loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>    Phi [<<Const2>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul1:i\d+>>    Mul [<<Get1>>,<<Get2>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul1>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:a\d+>>  TypeConversion [<<Get1>>]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC2:a\d+>>  TypeConversion [<<Get2>>]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul2:i\d+>>    Mul [<<TypeC1>>,<<TypeC2>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi3>>,<<Mul2>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                 loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestVarious.testDotProdBothSignedUnsigned1(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const2:i\d+>>  IntConstant 2                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Set1:d\d+>>    VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Set2:d\d+>>    VecSetScalars [<<Const2>>]                            loop:none
+  //
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set1>>,{{d\d+}}]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:d\d+>>    Phi [<<Set2>>,{{d\d+}}]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Load2>>] type:Int8   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi3>>,<<Load1>>,<<Load2>>] type:Uint8  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  public static final int testDotProdBothSignedUnsigned1(byte[] a, byte[] b) {
+    int signedSum = 1;
+    int unsignedSum = 2;
+    for (int idx = 0; idx < b.length; idx++) {
+      final byte lhs = a[idx];  // Each element is read once and feeds both sums.
+      final byte rhs = b[idx];
+      signedSum += lhs * rhs;
+      unsignedSum += (lhs & 0xff) * (rhs & 0xff);
+    }
+    return signedSum + unsignedSum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdBothSignedUnsigned2(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                             loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                             loop:none
+  /// CHECK-DAG: <<Const2:i\d+>>  IntConstant 2                             loop:none
+  /// CHECK-DAG: <<Const42:i\d+>> IntConstant 42                            loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>    Phi [<<Const2>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:a\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeC1:a\d+>>  TypeConversion [<<Get1>>]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul1:i\d+>>    Mul [<<Get2>>,<<TypeC1>>]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi3>>,<<Mul1>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul2:i\d+>>    Mul [<<Get1>>,<<Const42>>]                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul2>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                 loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestVarious.testDotProdBothSignedUnsigned2(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const2:i\d+>>  IntConstant 2                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Const42:i\d+>> IntConstant 42                                        loop:none
+  /// CHECK-DAG: <<Repl:d\d+>>    VecReplicateScalar [<<Const42>>]                      loop:none
+  /// CHECK-DAG: <<Set1:d\d+>>    VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Set2:d\d+>>    VecSetScalars [<<Const2>>]                            loop:none
+  //
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set1>>,{{d\d+}}]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:d\d+>>    Phi [<<Set2>>,{{d\d+}}]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi3>>,<<Load2>>,<<Load1>>] type:Uint8  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Repl>>] type:Int8    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  public static final int testDotProdBothSignedUnsigned2(byte[] a, byte[] b) {
+    int signedSum = 1;
+    int unsignedSum = 2;
+    for (int idx = 0; idx < b.length; idx++) {
+      final byte lhs = a[idx];
+      final byte rhs = b[idx];
+      unsignedSum += (lhs & 0xff) * (rhs & 0xff);
+      signedSum += lhs * 42;  // Signed element times a constant; b is not used here.
+    }
+    return signedSum + unsignedSum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdBothSignedUnsignedDoubleLoad(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                             loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                             loop:none
+  /// CHECK-DAG: <<Const2:i\d+>>  IntConstant 2                             loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>    Phi [<<Const2>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<GetB1:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<GetB2:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul1:i\d+>>    Mul [<<GetB1>>,<<GetB2>>]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul1>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<GetA1:a\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<GetA2:a\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul2:i\d+>>    Mul [<<GetA1>>,<<GetA2>>]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi3>>,<<Mul2>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                 loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestVarious.testDotProdBothSignedUnsignedDoubleLoad(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const2:i\d+>>  IntConstant 2                                         loop:none
+  /// CHECK-DAG: <<Const16:i\d+>> IntConstant 16                                        loop:none
+  /// CHECK-DAG: <<Set1:d\d+>>    VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Set2:d\d+>>    VecSetScalars [<<Const2>>]                            loop:none
+  //
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set1>>,{{d\d+}}]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:d\d+>>    Phi [<<Set2>>,{{d\d+}}]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Load2>>] type:Int8   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load3:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load4:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi3>>,<<Load3>>,<<Load4>>] type:Uint8  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const16>>]                            loop:<<Loop>>      outer_loop:none
+  public static final int testDotProdBothSignedUnsignedDoubleLoad(byte[] a, byte[] b) {
+    int signedSum = 1;
+    int unsignedSum = 2;
+    for (int idx = 0; idx < b.length; idx++) {
+      signedSum += a[idx] * b[idx];  // Each sum indexes a[] and b[] itself (no cached locals).
+      unsignedSum += (a[idx] & 0xff) * (b[idx] & 0xff);
+    }
+    return signedSum + unsignedSum;
+  }
+
+  /// CHECK-START: int other.TestVarious.testDotProdBothSignedUnsignedChar(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                             loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                             loop:none
+  /// CHECK-DAG: <<Const2:i\d+>>  IntConstant 2                             loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>    Phi [<<Const1>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>    Phi [<<Const2>>,{{i\d+}}]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]              loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeS1:s\d+>>  TypeConversion [<<Get1>>]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<TypeS2:s\d+>>  TypeConversion [<<Get2>>]                 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul1:i\d+>>    Mul [<<TypeS1>>,<<TypeS2>>]               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi3>>,<<Mul1>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Mul2:i\d+>>    Mul [<<Get1>>,<<Get2>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi2>>,<<Mul2>>]                   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const1>>]                 loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-{ARM64}: int other.TestVarious.testDotProdBothSignedUnsignedChar(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<Const0:i\d+>>  IntConstant 0                                         loop:none
+  /// CHECK-DAG: <<Const1:i\d+>>  IntConstant 1                                         loop:none
+  /// CHECK-DAG: <<Const2:i\d+>>  IntConstant 2                                         loop:none
+  /// CHECK-DAG: <<Const8:i\d+>>  IntConstant 8                                         loop:none
+  /// CHECK-DAG: <<Set1:d\d+>>    VecSetScalars [<<Const1>>]                            loop:none
+  /// CHECK-DAG: <<Set2:d\d+>>    VecSetScalars [<<Const2>>]                            loop:none
+  //
+  /// CHECK-DAG: <<Phi1:i\d+>>    Phi [<<Const0>>,{{i\d+}}]                             loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>    Phi [<<Set1>>,{{d\d+}}]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:d\d+>>    Phi [<<Set2>>,{{d\d+}}]                               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>   VecLoad [{{l\d+}},<<Phi1>>]                           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi3>>,<<Load1>>,<<Load2>>] type:Int16  loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  VecDotProd [<<Phi2>>,<<Load1>>,<<Load2>>] type:Uint16 loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                  Add [<<Phi1>>,<<Const8>>]                             loop:<<Loop>>      outer_loop:none
+  public static final int testDotProdBothSignedUnsignedChar(char[] a, char[] b) {
+    int unsignedSum = 1;
+    int signedSum = 2;
+    for (int idx = 0; idx < b.length; idx++) {
+      final char lhs = a[idx];
+      final char rhs = b[idx];
+      signedSum += ((short)lhs) * ((short)rhs);  // Chars reinterpreted as signed 16-bit.
+      unsignedSum += lhs * rhs;
+    }
+    return unsignedSum + signedSum;
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  public static void run() {
+    // 32767: the value every expected result below was computed with; it also fits in a char.
+    final short MAX_S = Short.MAX_VALUE;
+
+    byte[] b1 = { -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128, -128, -128, -128 };
+    byte[] b2 = {  127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  127,  127,  127,  127 };
+
+    char[] c1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+    char[] c2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+
+    int[] i1 = { -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -128, -128, -128, -128 };
+    int[] i2 = {  127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  127,  127,  127,  127 };
+
+    short[] s1 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, MAX_S, MAX_S };
+
+    expectEquals(56516, testDotProdConstRight(b2));
+    expectEquals(56516, testDotProdConstLeft(b2));
+    expectEquals(1271, testDotProdLoopInvariantConvRight(b2, 129));
+    expectEquals(-8519423, testDotProdByteToChar(c1, c2));
+    expectEquals(-8388351, testDotProdMixedSize(b1, s1));
+    expectEquals(-8388351, testDotProdMixedSizeAndSign(b1, c2));
+    expectEquals(-81279, testDotProdInt32(i1, i2));
+    expectEquals(3, testDotProdBothSignedUnsigned1(b1, b2));
+    expectEquals(54403, testDotProdBothSignedUnsigned2(b1, b2));
+    expectEquals(3, testDotProdBothSignedUnsignedDoubleLoad(b1, b2));
+    expectEquals(-262137, testDotProdBothSignedUnsignedChar(c1, c2));
+  }
+
+  public static void main(String[] args) {
+    TestVarious.run();  // Command-line arguments are ignored.
+  }
+}