ARM64: Support MultiplyAccumulate for SIMD.
Test: test-art-host, test-art-target.
Change-Id: I06af8415e15352d09d176cae828163cbe99ae7a7
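For context, the new HVecMultiplyAccumulate node computes a lane-wise multiply-accumulate: each pair of lanes from the two multiplier vectors is multiplied and the product is added to (or subtracted from) the matching accumulator lane. Below is a minimal scalar C++ sketch of the intended semantics; the four-lane int32 shape models only the V4S case and is illustrative, not code from this patch:

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Lane-wise model of VecMultiplyAccumulate: acc[i] +/-= x[i] * y[i].
// Four int32 lanes mirror the V4S case; the other packed types behave alike.
std::array<int32_t, 4> VecMulAcc(std::array<int32_t, 4> acc,
                                 const std::array<int32_t, 4>& x,
                                 const std::array<int32_t, 4>& y,
                                 bool is_add) {
  for (size_t i = 0; i < acc.size(); ++i) {
    acc[i] += (is_add ? 1 : -1) * (x[i] * y[i]);
  }
  return acc;
}

int main() {
  std::array<int32_t, 4> acc = {1, 1, 1, 1};
  std::array<int32_t, 4> x = {1, 2, 3, 4};
  std::array<int32_t, 4> y = {10, 10, 10, 10};
  std::array<int32_t, 4> r = VecMulAcc(acc, x, y, /*is_add=*/ true);
  for (int32_t v : r) {
    printf("%d ", v);  // Prints: 11 21 31 41
  }
  printf("\n");
  return 0;
}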
diff --git a/compiler/optimizing/code_generator_vector_arm.cc b/compiler/optimizing/code_generator_vector_arm.cc
index 6e82123..f8552dc 100644
--- a/compiler/optimizing/code_generator_vector_arm.cc
+++ b/compiler/optimizing/code_generator_vector_arm.cc
@@ -245,6 +245,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderARM::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorARM::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
void LocationsBuilderARM::VisitVecLoad(HVecLoad* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 2dfccff..b3eb639 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -681,6 +681,67 @@
}
}
+void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
+ switch (instr->GetPackedType()) {
+ case Primitive::kPrimByte:
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ case Primitive::kPrimInt:
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
+ DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ // Mla/Mls accumulate in place, so the output must alias the accumulator input.
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
+// Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
+// 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
+// However, the vector multiply-accumulate instructions (MLA/MLS) are not affected, so no
+// workaround is needed here.
+void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LocationSummary* locations = instr->GetLocations();
+ VRegister acc = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
+ VRegister left = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
+ VRegister right = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
+ switch (instr->GetPackedType()) {
+ case Primitive::kPrimByte:
+ DCHECK_EQ(16u, instr->GetVectorLength());
+ if (instr->GetOpKind() == HInstruction::kAdd) {
+ __ Mla(acc.V16B(), left.V16B(), right.V16B());
+ } else {
+ __ Mls(acc.V16B(), left.V16B(), right.V16B());
+ }
+ break;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ DCHECK_EQ(8u, instr->GetVectorLength());
+ if (instr->GetOpKind() == HInstruction::kAdd) {
+ __ Mla(acc.V8H(), left.V8H(), right.V8H());
+ } else {
+ __ Mls(acc.V8H(), left.V8H(), right.V8H());
+ }
+ break;
+ case Primitive::kPrimInt:
+ DCHECK_EQ(4u, instr->GetVectorLength());
+ if (instr->GetOpKind() == HInstruction::kAdd) {
+ __ Mla(acc.V4S(), left.V4S(), right.V4S());
+ } else {
+ __ Mls(acc.V4S(), left.V4S(), right.V4S());
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ }
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* arena,
HVecMemoryOperation* instruction,
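The Mla and Mls emissions above correspond to the AArch64 vector multiply-accumulate instructions, which erratum 835769 does not cover (it affects only the scalar 64-bit MADD/MSUB forms). As a rough illustration of the same lane-wise operation outside the compiler, here is a sketch using the standard arm_neon.h intrinsics; it mirrors only the V4S (4 x int32) case and is not part of this patch:

#include <arm_neon.h>

// acc[i] + x[i] * y[i] for each of the four int32 lanes: the same shape the
// backend emits as MLA V4S for Primitive::kPrimInt.
int32x4_t MulAdd(int32x4_t acc, int32x4_t x, int32x4_t y) {
  return vmlaq_s32(acc, x, y);
}

// acc[i] - x[i] * y[i] per lane: the MLS counterpart used for the kSub kind.
int32x4_t MulSub(int32x4_t acc, int32x4_t x, int32x4_t y) {
  return vmlsq_s32(acc, x, y);
}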
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 990178b..53f314e 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -245,6 +245,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
void LocationsBuilderARMVIXL::VisitVecLoad(HVecLoad* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index 8ea1ca7..c4a3225 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -245,6 +245,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
void LocationsBuilderMIPS::VisitVecLoad(HVecLoad* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index a484bb4..50b95c1 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -245,6 +245,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
void LocationsBuilderMIPS64::VisitVecLoad(HVecLoad* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index a86d060..013b092 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -730,6 +730,14 @@
}
}
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* arena,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 6967353..66f19a4 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -719,6 +719,14 @@
}
}
+void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* arena,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 1b2b9f8..e5d94c3 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -514,6 +514,10 @@
StartAttributeStream("rounded") << std::boolalpha << hadd->IsRounded() << std::noboolalpha;
}
+ void VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) OVERRIDE {
+ StartAttributeStream("kind") << instruction->GetOpKind();
+ }
+
#if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64)
void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE {
StartAttributeStream("kind") << instruction->GetOpKind();
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index 73b7b2b..f16e372 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -210,5 +210,11 @@
}
}
+void InstructionSimplifierArm64Visitor::VisitVecMul(HVecMul* instruction) {
+ if (TryCombineVecMultiplyAccumulate(instruction, kArm64)) {
+ RecordSimplification();
+ }
+}
+
} // namespace arm64
} // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index 65654f5..eec4e49 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -74,6 +74,7 @@
void VisitTypeConversion(HTypeConversion* instruction) OVERRIDE;
void VisitUShr(HUShr* instruction) OVERRIDE;
void VisitXor(HXor* instruction) OVERRIDE;
+ void VisitVecMul(HVecMul* instruction) OVERRIDE;
OptimizingCompilerStats* stats_;
};
diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc
index c2b1374..7d1f146 100644
--- a/compiler/optimizing/instruction_simplifier_shared.cc
+++ b/compiler/optimizing/instruction_simplifier_shared.cc
@@ -278,5 +278,71 @@
return true;
}
+bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa) {
+ Primitive::Type type = mul->GetPackedType();
+ switch (isa) {
+ case kArm64:
+ if (!(type == Primitive::kPrimByte ||
+ type == Primitive::kPrimChar ||
+ type == Primitive::kPrimShort ||
+ type == Primitive::kPrimInt)) {
+ return false;
+ }
+ break;
+ default:
+ return false;
+ }
+
+ ArenaAllocator* arena = mul->GetBlock()->GetGraph()->GetArena();
+
+ if (mul->HasOnlyOneNonEnvironmentUse()) {
+ HInstruction* use = mul->GetUses().front().GetUser();
+ if (use->IsVecAdd() || use->IsVecSub()) {
+ // Replace code looking like
+ // VECMUL tmp, x, y
+ // VECADD/SUB dst, acc, tmp
+ // with
+ // VECMULACC dst, acc, x, y
+ // Note that we do not want to (unconditionally) perform the merge when the
+ // multiplication has multiple uses, even if it could be merged into all of them.
+ // Multiple uses could happen on the same control-flow path, and we would
+ // then increase the amount of work. In the future we could try to evaluate
+ // whether all uses are on different control-flow paths (using dominance and
+ // reverse-dominance information) and only perform the merge when they are.
+ HInstruction* accumulator = nullptr;
+ HVecBinaryOperation* binop = use->AsVecBinaryOperation();
+ HInstruction* binop_left = binop->GetLeft();
+ HInstruction* binop_right = binop->GetRight();
+ // This is always true since the `HVecMul` has only one use (which is checked above).
+ DCHECK_NE(binop_left, binop_right);
+ if (binop_right == mul) {
+ accumulator = binop_left;
+ } else if (use->IsVecAdd()) {
+ DCHECK_EQ(binop_left, mul);
+ accumulator = binop_right;
+ }
+
+ HInstruction::InstructionKind kind =
+ use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
+ if (accumulator != nullptr) {
+ HVecMultiplyAccumulate* mulacc =
+ new (arena) HVecMultiplyAccumulate(arena,
+ kind,
+ accumulator,
+ mul->GetLeft(),
+ mul->GetRight(),
+ binop->GetPackedType(),
+ binop->GetVectorLength());
+
+ binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
+ DCHECK(!mul->HasUses());
+ mul->GetBlock()->RemoveInstruction(mul);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
} // namespace art
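The pattern match above walks the use chain of the HVecMul and rewrites its single add/sub user. The following is a condensed model of that matching logic, using a toy node type rather than ART's HInstruction API (all names below are illustrative):

#include <cassert>
#include <cstddef>

// Toy IR node: just enough to model the mul + add/sub -> mulacc rewrite.
enum class Kind { kMul, kAdd, kSub, kMulAcc };

struct Node {
  Kind kind;
  Node* lhs = nullptr;      // After fusion: the accumulator.
  Node* rhs = nullptr;
  Node* mul_lhs = nullptr;  // After fusion: the multiplier operands.
  Node* mul_rhs = nullptr;
  size_t use_count = 0;
};

// Mirrors the shape of TryCombineVecMultiplyAccumulate: fuse only when the
// multiply has a single use, and for subtraction only when the multiply is the
// right operand, because acc - x * y is an MLS but x * y - acc is not.
bool TryFuse(Node* mul, Node* binop) {
  if (mul->kind != Kind::kMul || mul->use_count != 1) return false;
  if (binop->kind != Kind::kAdd && binop->kind != Kind::kSub) return false;
  Node* acc = nullptr;
  if (binop->rhs == mul) {
    acc = binop->lhs;
  } else if (binop->kind == Kind::kAdd) {
    assert(binop->lhs == mul);
    acc = binop->rhs;
  }
  if (acc == nullptr) return false;  // kSub with mul on the left: no match.
  // Rewrite the user in place (the real pass allocates a new HIR node instead).
  binop->kind = Kind::kMulAcc;
  binop->lhs = acc;
  binop->rhs = nullptr;
  binop->mul_lhs = mul->lhs;
  binop->mul_rhs = mul->rhs;
  mul->use_count = 0;  // The multiply is now dead and can be removed.
  return true;
}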
diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h
index 83e3ffc..2ea103a 100644
--- a/compiler/optimizing/instruction_simplifier_shared.h
+++ b/compiler/optimizing/instruction_simplifier_shared.h
@@ -58,6 +58,8 @@
HInstruction* index,
size_t data_offset);
+bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa);
+
} // namespace art
#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 6be237e..af953c8 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1382,6 +1382,7 @@
M(VecShl, VecBinaryOperation) \
M(VecShr, VecBinaryOperation) \
M(VecUShr, VecBinaryOperation) \
+ M(VecMultiplyAccumulate, VecOperation) \
M(VecLoad, VecMemoryOperation) \
M(VecStore, VecMemoryOperation) \
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index bff58d0..450691c 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -143,6 +143,10 @@
/*number_of_inputs*/ 2,
vector_length,
dex_pc) { }
+
+ HInstruction* GetLeft() const { return InputAt(0); }
+ HInstruction* GetRight() const { return InputAt(1); }
+
DECLARE_ABSTRACT_INSTRUCTION(VecBinaryOperation);
private:
DISALLOW_COPY_AND_ASSIGN(HVecBinaryOperation);
@@ -627,6 +631,59 @@
DISALLOW_COPY_AND_ASSIGN(HVecUShr);
};
+// Multiplies every component in the two vectors, then adds the result vector to, or subtracts it
+// from, the accumulator vector,
+// viz. [ acc1, .., accn ] +/- [ x1, .. , xn ] * [ y1, .. , yn ] =
+//      [ acc1 +/- x1 * y1, .. , accn +/- xn * yn ].
+class HVecMultiplyAccumulate FINAL : public HVecOperation {
+ public:
+ HVecMultiplyAccumulate(ArenaAllocator* arena,
+ InstructionKind op,
+ HInstruction* accumulator,
+ HInstruction* mul_left,
+ HInstruction* mul_right,
+ Primitive::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc = kNoDexPc)
+ : HVecOperation(arena,
+ packed_type,
+ SideEffects::None(),
+ /*number_of_inputs*/ 3,
+ vector_length,
+ dex_pc),
+ op_kind_(op) {
+ DCHECK(op == InstructionKind::kAdd || op == InstructionKind::kSub);
+ DCHECK(accumulator->IsVecOperation());
+ DCHECK(mul_left->IsVecOperation() && mul_right->IsVecOperation());
+ DCHECK_EQ(accumulator->AsVecOperation()->GetPackedType(), packed_type);
+ DCHECK_EQ(mul_left->AsVecOperation()->GetPackedType(), packed_type);
+ DCHECK_EQ(mul_right->AsVecOperation()->GetPackedType(), packed_type);
+
+ SetRawInputAt(kInputAccumulatorIndex, accumulator);
+ SetRawInputAt(kInputMulLeftIndex, mul_left);
+ SetRawInputAt(kInputMulRightIndex, mul_right);
+ }
+
+ static constexpr int kInputAccumulatorIndex = 0;
+ static constexpr int kInputMulLeftIndex = 1;
+ static constexpr int kInputMulRightIndex = 2;
+
+ bool CanBeMoved() const OVERRIDE { return true; }
+
+ bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+ return op_kind_ == other->AsVecMultiplyAccumulate()->op_kind_;
+ }
+
+ InstructionKind GetOpKind() const { return op_kind_; }
+
+ DECLARE_INSTRUCTION(VecMultiplyAccumulate);
+
+ private:
+ // Indicates if this is a multiply-add or a multiply-subtract (vector MLA or MLS).
+ const InstructionKind op_kind_;
+
+ DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate);
+};
+
// Loads a vector from memory, viz. load(mem, 1)
// yield the vector [ mem(1), .. , mem(n) ].
class HVecLoad FINAL : public HVecMemoryOperation {