ARM64: Support MultiplyAccumulate for SIMD.

Test: test-art-host, test-art-target.

Change-Id: I06af8415e15352d09d176cae828163cbe99ae7a7
diff --git a/compiler/optimizing/code_generator_vector_arm.cc b/compiler/optimizing/code_generator_vector_arm.cc
index 6e82123..f8552dc 100644
--- a/compiler/optimizing/code_generator_vector_arm.cc
+++ b/compiler/optimizing/code_generator_vector_arm.cc
@@ -245,6 +245,14 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderARM::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorARM::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 void LocationsBuilderARM::VisitVecLoad(HVecLoad* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 2dfccff..b3eb639 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -681,6 +681,67 @@
   }
 }
 
+void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
+  switch (instr->GetPackedType()) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
+      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+// Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
+// 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
+// However vector MultiplyAccumulate instruction is not affected.
+void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = instr->GetLocations();
+  VRegister acc = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
+  VRegister left = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
+  VRegister right = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
+  switch (instr->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::kAdd) {
+        __ Mla(acc.V16B(), left.V16B(), right.V16B());
+      } else {
+        __ Mls(acc.V16B(), left.V16B(), right.V16B());
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::kAdd) {
+        __ Mla(acc.V8H(), left.V8H(), right.V8H());
+      } else {
+        __ Mls(acc.V8H(), left.V8H(), right.V8H());
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::kAdd) {
+        __ Mla(acc.V4S(), left.V4S(), right.V4S());
+      } else {
+        __ Mls(acc.V4S(), left.V4S(), right.V4S());
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+  }
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 990178b..53f314e 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -245,6 +245,14 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 void LocationsBuilderARMVIXL::VisitVecLoad(HVecLoad* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index 8ea1ca7..c4a3225 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -245,6 +245,14 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 void LocationsBuilderMIPS::VisitVecLoad(HVecLoad* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index a484bb4..50b95c1 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -245,6 +245,14 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 void LocationsBuilderMIPS64::VisitVecLoad(HVecLoad* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index a86d060..013b092 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -730,6 +730,14 @@
   }
 }
 
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 6967353..66f19a4 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -719,6 +719,14 @@
   }
 }
 
+void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 1b2b9f8..e5d94c3 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -514,6 +514,10 @@
     StartAttributeStream("rounded") << std::boolalpha << hadd->IsRounded() << std::noboolalpha;
   }
 
+  void VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) OVERRIDE {
+    StartAttributeStream("kind") << instruction->GetOpKind();
+  }
+
 #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64)
   void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE {
     StartAttributeStream("kind") << instruction->GetOpKind();
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index 73b7b2b..f16e372 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -210,5 +210,11 @@
   }
 }
 
+void InstructionSimplifierArm64Visitor::VisitVecMul(HVecMul* instruction) {
+  if (TryCombineVecMultiplyAccumulate(instruction, kArm64)) {
+    RecordSimplification();
+  }
+}
+
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index 65654f5..eec4e49 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -74,6 +74,7 @@
   void VisitTypeConversion(HTypeConversion* instruction) OVERRIDE;
   void VisitUShr(HUShr* instruction) OVERRIDE;
   void VisitXor(HXor* instruction) OVERRIDE;
+  void VisitVecMul(HVecMul* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc
index c2b1374..7d1f146 100644
--- a/compiler/optimizing/instruction_simplifier_shared.cc
+++ b/compiler/optimizing/instruction_simplifier_shared.cc
@@ -278,5 +278,71 @@
   return true;
 }
 
+bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa) {
+  Primitive::Type type = mul->GetPackedType();
+  switch (isa) {
+    case kArm64:
+      if (!(type == Primitive::kPrimByte ||
+            type == Primitive::kPrimChar ||
+            type == Primitive::kPrimShort ||
+            type == Primitive::kPrimInt)) {
+        return false;
+      }
+      break;
+    default:
+      return false;
+  }
+
+  ArenaAllocator* arena = mul->GetBlock()->GetGraph()->GetArena();
+
+  if (mul->HasOnlyOneNonEnvironmentUse()) {
+    HInstruction* use = mul->GetUses().front().GetUser();
+    if (use->IsVecAdd() || use->IsVecSub()) {
+      // Replace code looking like
+      //    VECMUL tmp, x, y
+      //    VECADD/SUB dst, acc, tmp
+      // with
+      //    VECMULACC dst, acc, x, y
+      // Note that we do not want to (unconditionally) perform the merge when the
+      // multiplication has multiple uses and it can be merged in all of them.
+      // Multiple uses could happen on the same control-flow path, and we would
+      // then increase the amount of work. In the future we could try to evaluate
+      // whether all uses are on different control-flow paths (using dominance and
+      // reverse-dominance information) and only perform the merge when they are.
+      HInstruction* accumulator = nullptr;
+      HVecBinaryOperation* binop = use->AsVecBinaryOperation();
+      HInstruction* binop_left = binop->GetLeft();
+      HInstruction* binop_right = binop->GetRight();
+      // This is always true since the `HVecMul` has only one use (which is checked above).
+      DCHECK_NE(binop_left, binop_right);
+      if (binop_right == mul) {
+        accumulator = binop_left;
+      } else if (use->IsVecAdd()) {
+        DCHECK_EQ(binop_left, mul);
+        accumulator = binop_right;
+      }
+
+      HInstruction::InstructionKind kind =
+          use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
+      if (accumulator != nullptr) {
+        HVecMultiplyAccumulate* mulacc =
+            new (arena) HVecMultiplyAccumulate(arena,
+                                               kind,
+                                               accumulator,
+                                               mul->GetLeft(),
+                                               mul->GetRight(),
+                                               binop->GetPackedType(),
+                                               binop->GetVectorLength());
+
+        binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
+        DCHECK(!mul->HasUses());
+        mul->GetBlock()->RemoveInstruction(mul);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
 
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h
index 83e3ffc..2ea103a 100644
--- a/compiler/optimizing/instruction_simplifier_shared.h
+++ b/compiler/optimizing/instruction_simplifier_shared.h
@@ -58,6 +58,8 @@
                                   HInstruction* index,
                                   size_t data_offset);
 
+bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa);
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 6be237e..af953c8 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1382,6 +1382,7 @@
   M(VecShl, VecBinaryOperation)                                         \
   M(VecShr, VecBinaryOperation)                                         \
   M(VecUShr, VecBinaryOperation)                                        \
+  M(VecMultiplyAccumulate, VecOperation)                                \
   M(VecLoad, VecMemoryOperation)                                        \
   M(VecStore, VecMemoryOperation)                                       \
 
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index bff58d0..450691c 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -143,6 +143,10 @@
                       /*number_of_inputs*/ 2,
                       vector_length,
                       dex_pc) { }
+
+  HInstruction* GetLeft() const { return InputAt(0); }
+  HInstruction* GetRight() const { return InputAt(1); }
+
   DECLARE_ABSTRACT_INSTRUCTION(VecBinaryOperation);
  private:
   DISALLOW_COPY_AND_ASSIGN(HVecBinaryOperation);
@@ -627,6 +631,59 @@
   DISALLOW_COPY_AND_ASSIGN(HVecUShr);
 };
 
+// Multiplies every component in the two vectors, adds the result vector to the accumulator vector.
+// viz. [ acc1, .., accn ] + [ x1, .. , xn ] * [ y1, .. , yn ] =
+//     [ acc1 + x1 * y1, .. , accn + xn * yn ].
+class HVecMultiplyAccumulate FINAL : public HVecOperation {
+ public:
+  HVecMultiplyAccumulate(ArenaAllocator* arena,
+                         InstructionKind op,
+                         HInstruction* accumulator,
+                         HInstruction* mul_left,
+                         HInstruction* mul_right,
+                         Primitive::Type packed_type,
+                         size_t vector_length,
+                         uint32_t dex_pc = kNoDexPc)
+      : HVecOperation(arena,
+                      packed_type,
+                      SideEffects::None(),
+                      /*number_of_inputs*/ 3,
+                      vector_length,
+                      dex_pc),
+        op_kind_(op) {
+    DCHECK(op == InstructionKind::kAdd || op == InstructionKind::kSub);
+    DCHECK(accumulator->IsVecOperation());
+    DCHECK(mul_left->IsVecOperation() && mul_right->IsVecOperation());
+    DCHECK_EQ(accumulator->AsVecOperation()->GetPackedType(), packed_type);
+    DCHECK_EQ(mul_left->AsVecOperation()->GetPackedType(), packed_type);
+    DCHECK_EQ(mul_right->AsVecOperation()->GetPackedType(), packed_type);
+
+    SetRawInputAt(kInputAccumulatorIndex, accumulator);
+    SetRawInputAt(kInputMulLeftIndex, mul_left);
+    SetRawInputAt(kInputMulRightIndex, mul_right);
+  }
+
+  static constexpr int kInputAccumulatorIndex = 0;
+  static constexpr int kInputMulLeftIndex = 1;
+  static constexpr int kInputMulRightIndex = 2;
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+    return op_kind_ == other->AsVecMultiplyAccumulate()->op_kind_;
+  }
+
+  InstructionKind GetOpKind() const { return op_kind_; }
+
+  DECLARE_INSTRUCTION(VecMultiplyAccumulate);
+
+ private:
+  // Indicates if this is a MADD or MSUB.
+  const InstructionKind op_kind_;
+
+  DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate);
+};
+
 // Loads a vector from memory, viz. load(mem, 1)
 // yield the vector [ mem(1), .. , mem(n) ].
 class HVecLoad FINAL : public HVecMemoryOperation {