Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_vector_arm64.cc    |  68
-rw-r--r--  compiler/optimizing/code_generator_vector_arm_vixl.cc |   8
-rw-r--r--  compiler/optimizing/code_generator_vector_mips.cc     |   8
-rw-r--r--  compiler/optimizing/code_generator_vector_mips64.cc   |   8
-rw-r--r--  compiler/optimizing/code_generator_vector_x86.cc      |   8
-rw-r--r--  compiler/optimizing/code_generator_vector_x86_64.cc   |   8
-rw-r--r--  compiler/optimizing/code_generator_x86.cc             |   2
-rw-r--r--  compiler/optimizing/code_generator_x86_64.cc          |   2
-rw-r--r--  compiler/optimizing/data_type.h                       |  15
-rw-r--r--  compiler/optimizing/graph_visualizer.cc               |  11
-rw-r--r--  compiler/optimizing/loop_optimization.cc              | 134
-rw-r--r--  compiler/optimizing/loop_optimization.h               |   6
-rw-r--r--  compiler/optimizing/nodes.h                           |   1
-rw-r--r--  compiler/optimizing/nodes_vector.h                    |  60
-rw-r--r--  compiler/optimizing/optimizing_compiler.cc            |   6
-rw-r--r--  compiler/optimizing/parallel_move_test.cc             |   4
-rw-r--r--  compiler/optimizing/ssa_builder.cc                    |  30
-rw-r--r--  compiler/optimizing/ssa_builder.h                     |   2
18 files changed, 343 insertions(+), 38 deletions(-)
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 43169ba7eb..e79a96bc2a 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -1277,6 +1277,74 @@ void InstructionCodeGeneratorARM64::VisitVecSADAccumulate(HVecSADAccumulate* ins
}
}
+void LocationsBuilderARM64::VisitVecDotProd(HVecDotProd* instruction) {
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ DCHECK(instruction->GetPackedType() == DataType::Type::kInt32);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+
+ // For Int8 and Uint8 we need a temp register.
+ if (DataType::Size(instruction->InputAt(1)->AsVecOperation()->GetPackedType()) == 1) {
+ locations->AddTemp(Location::RequiresFpuRegister());
+ }
+}
+
+void InstructionCodeGeneratorARM64::VisitVecDotProd(HVecDotProd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+ VRegister acc = VRegisterFrom(locations->InAt(0));
+ VRegister left = VRegisterFrom(locations->InAt(1));
+ VRegister right = VRegisterFrom(locations->InAt(2));
+ HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+ HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+ DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
+ HVecOperation::ToSignedType(b->GetPackedType()));
+ DCHECK_EQ(instruction->GetPackedType(), DataType::Type::kInt32);
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+
+ size_t inputs_data_size = DataType::Size(a->GetPackedType());
+ switch (inputs_data_size) {
+ case 1u: {
+ DCHECK_EQ(16u, a->GetVectorLength());
+ VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+ if (instruction->IsZeroExtending()) {
+ // TODO: Use Armv8.4-A UDOT instruction when it is available.
+ __ Umull(tmp.V8H(), left.V8B(), right.V8B());
+ __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());
+ __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());
+
+ __ Umull2(tmp.V8H(), left.V16B(), right.V16B());
+ __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());
+ __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());
+ } else {
+ // TODO: Use Armv8.4-A SDOT instruction when it is available.
+ __ Smull(tmp.V8H(), left.V8B(), right.V8B());
+ __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());
+ __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());
+
+ __ Smull2(tmp.V8H(), left.V16B(), right.V16B());
+ __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());
+ __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());
+ }
+ break;
+ }
+ case 2u:
+ DCHECK_EQ(8u, a->GetVectorLength());
+ if (instruction->IsZeroExtending()) {
+ __ Umlal(acc.V4S(), left.V4H(), right.V4H());
+ __ Umlal2(acc.V4S(), left.V8H(), right.V8H());
+ } else {
+ __ Smlal(acc.V4S(), left.V4H(), right.V4H());
+ __ Smlal2(acc.V4S(), left.V8H(), right.V8H());
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type size: " << inputs_data_size;
+ }
+}
+
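[Until the Armv8.4-A UDOT/SDOT instructions noted in the TODOs are available, the byte case above is emulated with widening multiplies (Smull/Smull2) and widening adds (Saddw/Saddw2). The following is a scalar sketch, not ART code, of what the signed path computes per 16-lane step; product i lands in accumulator lane i % 4, which is acceptable because a sum reduction only cares about the total across lanes.]

    #include <array>
    #include <cstdint>

    // Model of one Smull/Saddw/Saddw2 + Smull2/Saddw/Saddw2 round: sixteen
    // int8 products are widened and folded into four int32 accumulator lanes.
    std::array<int32_t, 4> SignedByteDotProdStep(std::array<int32_t, 4> acc,
                                                 const int8_t (&left)[16],
                                                 const int8_t (&right)[16]) {
      for (int i = 0; i < 16; ++i) {
        acc[i % 4] += int32_t{left[i]} * int32_t{right[i]};
      }
      return acc;
    }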
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 7b66b17983..62b6c4ea01 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -854,6 +854,14 @@ void InstructionCodeGeneratorARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* i
}
}
+void LocationsBuilderARMVIXL::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Return whether the vector memory access operation is guaranteed to be word-aligned (ARM word
// size equals to 4).
static bool IsWordAligned(HVecMemoryOperation* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index df0e1485d6..24f4fb2d7b 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -1274,6 +1274,14 @@ void InstructionCodeGeneratorMIPS::VisitVecSADAccumulate(HVecSADAccumulate* inst
}
}
+void LocationsBuilderMIPS::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index de354b63a1..972c49ebb1 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -1272,6 +1272,14 @@ void InstructionCodeGeneratorMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* in
}
}
+void LocationsBuilderMIPS64::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 2502275b3a..c52ecc77c5 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1143,6 +1143,14 @@ void InstructionCodeGeneratorX86::VisitVecSADAccumulate(HVecSADAccumulate* instr
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderX86::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorX86::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 4a67dafd8a..87d0106c3e 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1116,6 +1116,14 @@ void InstructionCodeGeneratorX86_64::VisitVecSADAccumulate(HVecSADAccumulate* in
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderX86_64::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 6c77232361..39cbe5e850 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -8301,7 +8301,7 @@ void CodeGeneratorX86::PatchJitRootUse(uint8_t* code,
uint32_t code_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
uintptr_t address =
reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
- typedef __attribute__((__aligned__(1))) uint32_t unaligned_uint32_t;
+ using unaligned_uint32_t __attribute__((__aligned__(1))) = uint32_t;
reinterpret_cast<unaligned_uint32_t*>(code + code_offset)[0] =
dchecked_integral_cast<uint32_t>(address);
}
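[The typedef-to-using modernization above (repeated for x86_64 below) preserves the purpose of the alias: an alignment-1 type makes the 32-bit store through an arbitrary byte offset well defined, so the compiler emits an unaligned-safe access. A minimal standalone sketch of the pattern, with illustrative names that are not ART's:]

    #include <cstddef>
    #include <cstdint>

    // The aligned(1) alias tells the compiler the pointee may sit at any
    // byte offset, so it must emit a safe unaligned 32-bit store.
    using unaligned_uint32_t __attribute__((__aligned__(1))) = uint32_t;

    void PatchLiteral(uint8_t* code, size_t offset, uint32_t value) {
      // Plain uint32_t* here could misbehave on targets that require
      // naturally aligned 32-bit accesses.
      reinterpret_cast<unaligned_uint32_t*>(code + offset)[0] = value;
    }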
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 39d97899ae..e458dfffb4 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -7542,7 +7542,7 @@ void CodeGeneratorX86_64::PatchJitRootUse(uint8_t* code,
uint32_t code_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
uintptr_t address =
reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
- typedef __attribute__((__aligned__(1))) uint32_t unaligned_uint32_t;
+ using unaligned_uint32_t __attribute__((__aligned__(1))) = uint32_t;
reinterpret_cast<unaligned_uint32_t*>(code + code_offset)[0] =
dchecked_integral_cast<uint32_t>(address);
}
diff --git a/compiler/optimizing/data_type.h b/compiler/optimizing/data_type.h
index 5ac6e46003..3cbcc9e0c3 100644
--- a/compiler/optimizing/data_type.h
+++ b/compiler/optimizing/data_type.h
@@ -231,6 +231,21 @@ class DataType {
}
}
+ static Type ToUnsigned(Type type) {
+ switch (type) {
+ case Type::kInt8:
+ return Type::kUint8;
+ case Type::kInt16:
+ return Type::kUint16;
+ case Type::kInt32:
+ return Type::kUint32;
+ case Type::kInt64:
+ return Type::kUint64;
+ default:
+ return type;
+ }
+ }
+
static const char* PrettyDescriptor(Type type);
private:
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 31db8c205f..a1af2be9de 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -106,8 +106,7 @@ std::ostream& operator<<(std::ostream& os, const StringList& list) {
}
}
-typedef Disassembler* create_disasm_prototype(InstructionSet instruction_set,
- DisassemblerOptions* options);
+using create_disasm_prototype = Disassembler*(InstructionSet, DisassemblerOptions*);
class HGraphVisualizerDisassembler {
public:
HGraphVisualizerDisassembler(InstructionSet instruction_set,
@@ -564,6 +563,14 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
StartAttributeStream("kind") << instruction->GetOpKind();
}
+ void VisitVecDotProd(HVecDotProd* instruction) override {
+ VisitVecOperation(instruction);
+ DataType::Type arg_type = instruction->InputAt(1)->AsVecOperation()->GetPackedType();
+ StartAttributeStream("type") << (instruction->IsZeroExtending() ?
+ DataType::ToUnsigned(arg_type) :
+ DataType::ToSigned(arg_type));
+ }
+
#if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64)
void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) override {
StartAttributeStream("kind") << instruction->GetOpKind();
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 7d66155b39..12b180d5ff 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -351,7 +351,10 @@ static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
// Translates vector operation to reduction kind.
static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
- if (reduction->IsVecAdd() || reduction->IsVecSub() || reduction->IsVecSADAccumulate()) {
+ if (reduction->IsVecAdd() ||
+ reduction->IsVecSub() ||
+ reduction->IsVecSADAccumulate() ||
+ reduction->IsVecDotProd()) {
return HVecReduce::kSum;
}
LOG(FATAL) << "Unsupported SIMD reduction " << reduction->GetId();
@@ -431,6 +434,23 @@ static void PeelByCount(HLoopInformation* loop_info, int count) {
}
}
+// Returns the narrower of the two instructions' types, looking through type conversions.
+static DataType::Type GetNarrowerType(HInstruction* a, HInstruction* b) {
+ DataType::Type type = a->GetType();
+ if (DataType::Size(b->GetType()) < DataType::Size(type)) {
+ type = b->GetType();
+ }
+ if (a->IsTypeConversion() &&
+ DataType::Size(a->InputAt(0)->GetType()) < DataType::Size(type)) {
+ type = a->InputAt(0)->GetType();
+ }
+ if (b->IsTypeConversion() &&
+ DataType::Size(b->InputAt(0)->GetType()) < DataType::Size(type)) {
+ type = b->InputAt(0)->GetType();
+ }
+ return type;
+}
+
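[The look-through matters because a narrow memory access, e.g. a byte load, reaches the arithmetic as a type conversion to int, so the true operand width sits on the conversion's input. A simplified model of the selection rule, using hypothetical stand-in types rather than ART's IR, and returning a size instead of a type:]

    #include <algorithm>
    #include <cstddef>

    struct Value {
      size_t type_size;            // size in bytes of the value's type
      const Value* conversion_in;  // set if the value is a type conversion
    };

    // Smallest of: a's type, b's type, and each conversion's source type.
    size_t NarrowerTypeSize(const Value& a, const Value& b) {
      size_t size = std::min(a.type_size, b.type_size);
      if (a.conversion_in != nullptr) {
        size = std::min(size, a.conversion_in->type_size);
      }
      if (b.conversion_in != nullptr) {
        size = std::min(size, b.conversion_in->type_size);
      }
      return size;
    }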
//
// Public methods.
//
@@ -1289,6 +1309,7 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
DataType::Type type = instruction->GetType();
// Recognize SAD idiom or direct reduction.
if (VectorizeSADIdiom(node, instruction, generate_code, type, restrictions) ||
+ VectorizeDotProdIdiom(node, instruction, generate_code, type, restrictions) ||
(TrySetVectorType(type, &restrictions) &&
VectorizeUse(node, instruction, generate_code, type, restrictions))) {
if (generate_code) {
@@ -1531,11 +1552,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoDotProd;
return TrySetVectorLength(8);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoDotProd;
return TrySetVectorLength(4);
case DataType::Type::kInt32:
*restrictions |= kNoDiv | kNoWideSAD;
@@ -1580,12 +1601,23 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |=
- kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
+ *restrictions |= kNoMul |
+ kNoDiv |
+ kNoShift |
+ kNoAbs |
+ kNoSignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD |
+ kNoDotProd;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
+ *restrictions |= kNoDiv |
+ kNoAbs |
+ kNoSignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD |
+ kNoDotProd;
return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv | kNoSAD;
@@ -1610,11 +1642,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoDotProd;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoStringCharAt;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoDotProd;
return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv;
@@ -1639,11 +1671,11 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoDotProd;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoStringCharAt;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoDotProd;
return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv;
@@ -2071,18 +2103,7 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
HInstruction* r = a;
HInstruction* s = b;
bool is_unsigned = false;
- DataType::Type sub_type = a->GetType();
- if (DataType::Size(b->GetType()) < DataType::Size(sub_type)) {
- sub_type = b->GetType();
- }
- if (a->IsTypeConversion() &&
- DataType::Size(a->InputAt(0)->GetType()) < DataType::Size(sub_type)) {
- sub_type = a->InputAt(0)->GetType();
- }
- if (b->IsTypeConversion() &&
- DataType::Size(b->InputAt(0)->GetType()) < DataType::Size(sub_type)) {
- sub_type = b->InputAt(0)->GetType();
- }
+ DataType::Type sub_type = GetNarrowerType(a, b);
if (reduction_type != sub_type &&
(!IsNarrowerOperands(a, b, sub_type, &r, &s, &is_unsigned) || is_unsigned)) {
return false;
@@ -2123,6 +2144,75 @@ bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
return false;
}
+// Method recognizes the following dot product idiom:
+//   q += a * b for operands a, b whose type is narrower than the reduction type,
+// provided that the operands either have the same type or are promoted to a wider form.
+// Since this may involve a vector length change, the idiom is handled by going directly
+// to a dot product node (rather than relying on combining finer-grained nodes later).
+bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ DataType::Type reduction_type,
+ uint64_t restrictions) {
+ if (!instruction->IsAdd() || (reduction_type != DataType::Type::kInt32)) {
+ return false;
+ }
+
+ HInstruction* q = instruction->InputAt(0);
+ HInstruction* v = instruction->InputAt(1);
+ if (!v->IsMul() || v->GetType() != reduction_type) {
+ return false;
+ }
+
+ HInstruction* a = v->InputAt(0);
+ HInstruction* b = v->InputAt(1);
+ HInstruction* r = a;
+ HInstruction* s = b;
+ DataType::Type op_type = GetNarrowerType(a, b);
+ bool is_unsigned = false;
+
+ if (!IsNarrowerOperands(a, b, op_type, &r, &s, &is_unsigned)) {
+ return false;
+ }
+ op_type = HVecOperation::ToProperType(op_type, is_unsigned);
+
+ if (!TrySetVectorType(op_type, &restrictions) ||
+ HasVectorRestrictions(restrictions, kNoDotProd)) {
+ return false;
+ }
+
+ DCHECK(r != nullptr && s != nullptr);
+ // Accept dot product idiom for vectorizable operands. Vectorized code uses the shorthand
+ // idiomatic operation. Sequential code uses the original scalar expressions.
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ r = a;
+ s = b;
+ }
+ if (VectorizeUse(node, q, generate_code, op_type, restrictions) &&
+ VectorizeUse(node, r, generate_code, op_type, restrictions) &&
+ VectorizeUse(node, s, generate_code, op_type, restrictions)) {
+ if (generate_code) {
+ if (vector_mode_ == kVector) {
+ vector_map_->Put(instruction, new (global_allocator_) HVecDotProd(
+ global_allocator_,
+ vector_map_->Get(q),
+ vector_map_->Get(r),
+ vector_map_->Get(s),
+ reduction_type,
+ is_unsigned,
+ GetOtherVL(reduction_type, op_type, vector_length_),
+ kNoDexPc));
+ MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+ } else {
+ GenerateVecOp(v, vector_map_->Get(r), vector_map_->Get(s), reduction_type);
+ GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
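[For reference, this is the loop shape the matcher fires on, written here as C++ purely for illustration; the compiler actually sees the equivalent DEX, typically produced from Java of the same shape: a kInt32 add reduction fed by a multiply of operands narrower than int.]

    #include <cstdint>

    int32_t DotProd(const int8_t* a, const int8_t* b, int n) {
      int32_t q = 0;
      for (int i = 0; i < n; ++i) {
        // a[i] and b[i] are promoted to int before the multiply; the idiom
        // recognizer looks through those conversions to find the byte operands.
        q += a[i] * b[i];
      }
      return q;
    }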
//
// Vectorization heuristics.
//
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 2b202fda75..1a842c4bf3 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -82,6 +82,7 @@ class HLoopOptimization : public HOptimization {
kNoReduction = 1 << 9, // no reduction
kNoSAD = 1 << 10, // no sum of absolute differences (SAD)
kNoWideSAD = 1 << 11, // no sum of absolute differences (SAD) with operand widening
+ kNoDotProd = 1 << 12, // no dot product
};
/*
@@ -217,6 +218,11 @@ class HLoopOptimization : public HOptimization {
bool generate_code,
DataType::Type type,
uint64_t restrictions);
+ bool VectorizeDotProdIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ DataType::Type type,
+ uint64_t restrictions);
// Vectorization heuristics.
Alignment ComputeAlignment(HInstruction* offset,
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 68f1a2406a..76887f9a5b 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1453,6 +1453,7 @@ class HLoopInformationOutwardIterator : public ValueObject {
M(VecSetScalars, VecOperation) \
M(VecMultiplyAccumulate, VecOperation) \
M(VecSADAccumulate, VecOperation) \
+ M(VecDotProd, VecOperation) \
M(VecLoad, VecMemoryOperation) \
M(VecStore, VecMemoryOperation) \
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c7539f2846..597e399dd1 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -1021,6 +1021,66 @@ class HVecSADAccumulate final : public HVecOperation {
DEFAULT_COPY_CONSTRUCTOR(VecSADAccumulate);
};
+// Performs the dot product of two vectors and adds the result to the wider-precision
+// components of the accumulator.
+//
+// viz. DOT_PRODUCT([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ]) =
+// [ a1 + sum(xi * yi), .. , am + sum(xj * yj) ],
+// for m <= n, non-overlapping sums,
+// for either both signed or both unsigned operands x, y.
+//
+// Notes:
+// - packed type reflects the type of sum reduction, not the type of the operands.
+// - IsZeroExtending() is used to determine the kind of signed/zero extension to be
+// performed for the operands.
+//
+// TODO: Support types other than kInt32 for packed type.
+class HVecDotProd final : public HVecOperation {
+ public:
+ HVecDotProd(ArenaAllocator* allocator,
+ HInstruction* accumulator,
+ HInstruction* left,
+ HInstruction* right,
+ DataType::Type packed_type,
+ bool is_zero_extending,
+ size_t vector_length,
+ uint32_t dex_pc)
+ : HVecOperation(kVecDotProd,
+ allocator,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs */ 3,
+ vector_length,
+ dex_pc) {
+ DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+ DCHECK(DataType::IsIntegralType(packed_type));
+ DCHECK(left->IsVecOperation());
+ DCHECK(right->IsVecOperation());
+ DCHECK_EQ(ToSignedType(left->AsVecOperation()->GetPackedType()),
+ ToSignedType(right->AsVecOperation()->GetPackedType()));
+ SetRawInputAt(0, accumulator);
+ SetRawInputAt(1, left);
+ SetRawInputAt(2, right);
+ SetPackedFlag<kFieldHDotProdIsZeroExtending>(is_zero_extending);
+ }
+
+ bool IsZeroExtending() const { return GetPackedFlag<kFieldHDotProdIsZeroExtending>(); }
+
+ bool CanBeMoved() const override { return true; }
+
+ DECLARE_INSTRUCTION(VecDotProd);
+
+ protected:
+ DEFAULT_COPY_CONSTRUCTOR(VecDotProd);
+
+ private:
+ // Additional packed bits.
+ static constexpr size_t kFieldHDotProdIsZeroExtending =
+ HVecOperation::kNumberOfVectorOpPackedBits;
+ static constexpr size_t kNumberOfHDotProdPackedBits = kFieldHDotProdIsZeroExtending + 1;
+ static_assert(kNumberOfHDotProdPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
+};
+
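[A concrete instance of the definition in the class comment, with m = 4 and n = 16, i.e. the kInt32-accumulator, byte-operand case handled by the ARM64 code generator above. This shows one valid lane assignment; the definition only requires the sums to be non-overlapping, so implementations may distribute products differently (the ARM64 emulation interleaves them).]

    DOT_PRODUCT([ a1, .., a4 ], [ x1, .., x16 ], [ y1, .., y16 ]) =
      [ a1 + x1*y1  + x2*y2   + x3*y3   + x4*y4,
        a2 + x5*y5  + x6*y6   + x7*y7   + x8*y8,
        a3 + x9*y9  + x10*y10 + x11*y11 + x12*y12,
        a4 + x13*y13 + x14*y14 + x15*y15 + x16*y16 ]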
// Loads a vector from memory, viz. load(mem, 1)
// yield the vector [ mem(1), .. , mem(n) ].
class HVecLoad final : public HVecMemoryOperation {
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 9ae025b3fe..3a550efeb8 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -399,7 +399,8 @@ class OptimizingCompiler final : public Compiler {
PassObserver* pass_observer,
VariableSizedHandleScope* handles) const;
- void GenerateJitDebugInfo(ArtMethod* method, debug::MethodDebugInfo method_debug_info)
+ void GenerateJitDebugInfo(ArtMethod* method,
+ const debug::MethodDebugInfo& method_debug_info)
REQUIRES_SHARED(Locks::mutator_lock_);
std::unique_ptr<OptimizingCompilerStats> compilation_stats_;
@@ -1406,7 +1407,8 @@ bool OptimizingCompiler::JitCompile(Thread* self,
return true;
}
-void OptimizingCompiler::GenerateJitDebugInfo(ArtMethod* method, debug::MethodDebugInfo info) {
+void OptimizingCompiler::GenerateJitDebugInfo(
+ ArtMethod* method, const debug::MethodDebugInfo& info) {
const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions();
DCHECK(compiler_options.GenerateAnyDebugInfo());
diff --git a/compiler/optimizing/parallel_move_test.cc b/compiler/optimizing/parallel_move_test.cc
index 399a6d8cbd..a8ab6cdd0c 100644
--- a/compiler/optimizing/parallel_move_test.cc
+++ b/compiler/optimizing/parallel_move_test.cc
@@ -174,8 +174,8 @@ class ParallelMoveTest : public ::testing::Test {
template<> const bool ParallelMoveTest<TestParallelMoveResolverWithSwap>::has_swap = true;
template<> const bool ParallelMoveTest<TestParallelMoveResolverNoSwap>::has_swap = false;
-typedef ::testing::Types<TestParallelMoveResolverWithSwap, TestParallelMoveResolverNoSwap>
- ParallelMoveResolverTestTypes;
+using ParallelMoveResolverTestTypes =
+ ::testing::Types<TestParallelMoveResolverWithSwap, TestParallelMoveResolverNoSwap>;
TYPED_TEST_CASE(ParallelMoveTest, ParallelMoveResolverTestTypes);
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index dda29a1b4b..db96e41064 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -440,7 +440,10 @@ static bool HasAliasInEnvironments(HInstruction* instruction) {
return false;
}
-void SsaBuilder::ReplaceUninitializedStringPhis() {
+// Returns whether the analysis succeeded. If it did not, we bail out to the
+// interpreter.
+// TODO(ngeoffray): Remove this workaround.
+bool SsaBuilder::ReplaceUninitializedStringPhis() {
ScopedArenaHashSet<HInstruction*> seen_instructions(
local_allocator_->Adapter(kArenaAllocGraphBuilder));
ScopedArenaVector<HInstruction*> worklist(local_allocator_->Adapter(kArenaAllocGraphBuilder));
@@ -467,17 +470,23 @@ void SsaBuilder::ReplaceUninitializedStringPhis() {
if (found_instance == nullptr) {
found_instance = current->AsNewInstance();
} else {
- DCHECK(found_instance == current);
+ if (found_instance != current) {
+ return false;
+ }
}
} else if (current->IsPhi()) {
// Push all inputs to the worklist. Those should be Phis or NewInstance.
for (HInstruction* input : current->GetInputs()) {
- DCHECK(input->IsPhi() || input->IsNewInstance()) << input->DebugName();
+ if (!input->IsPhi() && !input->IsNewInstance()) {
+ return false;
+ }
worklist.push_back(input);
}
} else {
// The verifier prevents any other DEX uses of the uninitialized string.
- DCHECK(current->IsEqual() || current->IsNotEqual());
+ if (!current->IsEqual() && !current->IsNotEqual()) {
+ return false;
+ }
continue;
}
current->ReplaceUsesDominatedBy(invoke, invoke);
@@ -487,13 +496,18 @@ void SsaBuilder::ReplaceUninitializedStringPhis() {
// be Phi, or Equal/NotEqual.
for (const HUseListNode<HInstruction*>& use : current->GetUses()) {
HInstruction* user = use.GetUser();
- DCHECK(user->IsPhi() || user->IsEqual() || user->IsNotEqual()) << user->DebugName();
+ if (!user->IsPhi() && !user->IsEqual() && !user->IsNotEqual()) {
+ return false;
+ }
worklist.push_back(user);
}
} while (!worklist.empty());
seen_instructions.clear();
- DCHECK(found_instance != nullptr);
+ if (found_instance == nullptr) {
+ return false;
+ }
}
+ return true;
}
void SsaBuilder::RemoveRedundantUninitializedStrings() {
@@ -547,7 +561,9 @@ GraphAnalysisResult SsaBuilder::BuildSsa() {
// Replace Phis that feed in a String.<init>, as well as their aliases, with
// the actual String allocation invocation. We do this first, as the phis stored in
// the data structure might get removed from the graph in later stages during `BuildSsa`.
- ReplaceUninitializedStringPhis();
+ if (!ReplaceUninitializedStringPhis()) {
+ return kAnalysisSkipped;
+ }
// Propagate types of phis. At this point, phis are typed void in the general
// case, or float/double/reference if we created an equivalent phi. So we need
diff --git a/compiler/optimizing/ssa_builder.h b/compiler/optimizing/ssa_builder.h
index 765544508e..bae15acf98 100644
--- a/compiler/optimizing/ssa_builder.h
+++ b/compiler/optimizing/ssa_builder.h
@@ -123,7 +123,7 @@ class SsaBuilder : public ValueObject {
HArrayGet* GetFloatOrDoubleEquivalentOfArrayGet(HArrayGet* aget);
void RemoveRedundantUninitializedStrings();
- void ReplaceUninitializedStringPhis();
+ bool ReplaceUninitializedStringPhis();
HGraph* const graph_;
Handle<mirror::ClassLoader> class_loader_;