ART: ARM64: Support DotProd SIMD idiom.
Implement support for vectorization idiom which performs dot
product of two vectors and adds the result to wider precision
components in the accumulator.
viz. DOT_PRODUCT([ a1, .. , am], [ x1, .. , xn ], [ y1, .. , yn ]) =
[ a1 + sum(xi * yi), .. , am + sum(xj * yj) ],
for m <= n, non-overlapping sums,
for either both signed or both unsigned operands x, y.
The patch shows up to 7x performance improvement on a micro
benchmark on Cortex-A57.
Test: 684-checker-simd-dotprod.
Test: test-art-host, test-art-target.
Change-Id: Ibab0d51f537fdecd1d84033197be3ebf5ec4e455
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 43169ba..e79a96b 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -1277,6 +1277,74 @@
}
}
+void LocationsBuilderARM64::VisitVecDotProd(HVecDotProd* instruction) {
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ DCHECK(instruction->GetPackedType() == DataType::Type::kInt32);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+
+ // For Int8 and Uint8 we need a temp register.
+ if (DataType::Size(instruction->InputAt(1)->AsVecOperation()->GetPackedType()) == 1) {
+ locations->AddTemp(Location::RequiresFpuRegister());
+ }
+}
+
+void InstructionCodeGeneratorARM64::VisitVecDotProd(HVecDotProd* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+ VRegister acc = VRegisterFrom(locations->InAt(0));
+ VRegister left = VRegisterFrom(locations->InAt(1));
+ VRegister right = VRegisterFrom(locations->InAt(2));
+ HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+ HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+ DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
+ HVecOperation::ToSignedType(b->GetPackedType()));
+ DCHECK_EQ(instruction->GetPackedType(), DataType::Type::kInt32);
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+
+ size_t inputs_data_size = DataType::Size(a->GetPackedType());
+ switch (inputs_data_size) {
+ case 1u: {
+ DCHECK_EQ(16u, a->GetVectorLength());
+ VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+ if (instruction->IsZeroExtending()) {
+ // TODO: Use Armv8.4-A UDOT instruction when it is available.
+ __ Umull(tmp.V8H(), left.V8B(), right.V8B());
+ __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());
+ __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());
+
+ __ Umull2(tmp.V8H(), left.V16B(), right.V16B());
+ __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());
+ __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());
+ } else {
+ // TODO: Use Armv8.4-A SDOT instruction when it is available.
+ __ Smull(tmp.V8H(), left.V8B(), right.V8B());
+ __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());
+ __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());
+
+ __ Smull2(tmp.V8H(), left.V16B(), right.V16B());
+ __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());
+ __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());
+ }
+ break;
+ }
+ case 2u:
+ DCHECK_EQ(8u, a->GetVectorLength());
+ if (instruction->IsZeroExtending()) {
+ __ Umlal(acc.V4S(), left.V4H(), right.V4H());
+ __ Umlal2(acc.V4S(), left.V8H(), right.V8H());
+ } else {
+ __ Smlal(acc.V4S(), left.V4H(), right.V4H());
+ __ Smlal2(acc.V4S(), left.V8H(), right.V8H());
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type size: " << inputs_data_size;
+ }
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 7b66b17..62b6c4e 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -854,6 +854,14 @@
}
}
+void LocationsBuilderARMVIXL::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Return whether the vector memory access operation is guaranteed to be word-aligned (ARM word
// size equals to 4).
static bool IsWordAligned(HVecMemoryOperation* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index df0e148..24f4fb2 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -1274,6 +1274,14 @@
}
}
+void LocationsBuilderMIPS::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index de354b6..972c49e 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -1272,6 +1272,14 @@
}
}
+void LocationsBuilderMIPS64::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 2502275..c52ecc7 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1143,6 +1143,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderX86::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorX86::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 4a67daf..87d0106 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1116,6 +1116,14 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
+void LocationsBuilderX86_64::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecDotProd(HVecDotProd* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* allocator,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/data_type.h b/compiler/optimizing/data_type.h
index 5ac6e46..3cbcc9e 100644
--- a/compiler/optimizing/data_type.h
+++ b/compiler/optimizing/data_type.h
@@ -231,6 +231,21 @@
}
}
+ static Type ToUnsigned(Type type) {
+ switch (type) {
+ case Type::kInt8:
+ return Type::kUint8;
+ case Type::kInt16:
+ return Type::kUint16;
+ case Type::kInt32:
+ return Type::kUint32;
+ case Type::kInt64:
+ return Type::kUint64;
+ default:
+ return type;
+ }
+ }
+
static const char* PrettyDescriptor(Type type);
private:
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 31db8c2..21f22af 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -564,6 +564,14 @@
StartAttributeStream("kind") << instruction->GetOpKind();
}
+ void VisitVecDotProd(HVecDotProd* instruction) override {
+ VisitVecOperation(instruction);
+ DataType::Type arg_type = instruction->InputAt(1)->AsVecOperation()->GetPackedType();
+ StartAttributeStream("type") << (instruction->IsZeroExtending() ?
+ DataType::ToUnsigned(arg_type) :
+ DataType::ToSigned(arg_type));
+ }
+
#if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64)
void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) override {
StartAttributeStream("kind") << instruction->GetOpKind();
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 7d66155..12b180d 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -351,7 +351,10 @@
// Translates vector operation to reduction kind.
static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
- if (reduction->IsVecAdd() || reduction->IsVecSub() || reduction->IsVecSADAccumulate()) {
+ if (reduction->IsVecAdd() ||
+ reduction->IsVecSub() ||
+ reduction->IsVecSADAccumulate() ||
+ reduction->IsVecDotProd()) {
return HVecReduce::kSum;
}
LOG(FATAL) << "Unsupported SIMD reduction " << reduction->GetId();
@@ -431,6 +434,23 @@
}
}
+// Returns the narrower type out of instructions a and b types.
+static DataType::Type GetNarrowerType(HInstruction* a, HInstruction* b) {
+ DataType::Type type = a->GetType();
+ if (DataType::Size(b->GetType()) < DataType::Size(type)) {
+ type = b->GetType();
+ }
+ if (a->IsTypeConversion() &&
+ DataType::Size(a->InputAt(0)->GetType()) < DataType::Size(type)) {
+ type = a->InputAt(0)->GetType();
+ }
+ if (b->IsTypeConversion() &&
+ DataType::Size(b->InputAt(0)->GetType()) < DataType::Size(type)) {
+ type = b->InputAt(0)->GetType();
+ }
+ return type;
+}
+
//
// Public methods.
//
@@ -1289,6 +1309,7 @@
DataType::Type type = instruction->GetType();
// Recognize SAD idiom or direct reduction.
if (VectorizeSADIdiom(node, instruction, generate_code, type, restrictions) ||
+ VectorizeDotProdIdiom(node, instruction, generate_code, type, restrictions) ||
(TrySetVectorType(type, &restrictions) &&
VectorizeUse(node, instruction, generate_code, type, restrictions))) {
if (generate_code) {
@@ -1531,11 +1552,11 @@
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoDotProd;
return TrySetVectorLength(8);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoDotProd;
return TrySetVectorLength(4);
case DataType::Type::kInt32:
*restrictions |= kNoDiv | kNoWideSAD;
@@ -1580,12 +1601,23 @@
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |=
- kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
+ *restrictions |= kNoMul |
+ kNoDiv |
+ kNoShift |
+ kNoAbs |
+ kNoSignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD |
+ kNoDotProd;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
+ *restrictions |= kNoDiv |
+ kNoAbs |
+ kNoSignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD|
+ kNoDotProd;
return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv | kNoSAD;
@@ -1610,11 +1642,11 @@
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoDotProd;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoStringCharAt;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoDotProd;
return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv;
@@ -1639,11 +1671,11 @@
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoDotProd;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- *restrictions |= kNoDiv | kNoStringCharAt;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoDotProd;
return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv;
@@ -2071,18 +2103,7 @@
HInstruction* r = a;
HInstruction* s = b;
bool is_unsigned = false;
- DataType::Type sub_type = a->GetType();
- if (DataType::Size(b->GetType()) < DataType::Size(sub_type)) {
- sub_type = b->GetType();
- }
- if (a->IsTypeConversion() &&
- DataType::Size(a->InputAt(0)->GetType()) < DataType::Size(sub_type)) {
- sub_type = a->InputAt(0)->GetType();
- }
- if (b->IsTypeConversion() &&
- DataType::Size(b->InputAt(0)->GetType()) < DataType::Size(sub_type)) {
- sub_type = b->InputAt(0)->GetType();
- }
+ DataType::Type sub_type = GetNarrowerType(a, b);
if (reduction_type != sub_type &&
(!IsNarrowerOperands(a, b, sub_type, &r, &s, &is_unsigned) || is_unsigned)) {
return false;
@@ -2123,6 +2144,75 @@
return false;
}
+// Method recognises the following dot product idiom:
+// q += a * b for operands a, b whose type is narrower than the reduction one.
+// Provided that the operands have the same type or are promoted to a wider form.
+// Since this may involve a vector length change, the idiom is handled by going directly
+// to a dot product node (rather than relying combining finer grained nodes later).
+bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ DataType::Type reduction_type,
+ uint64_t restrictions) {
+ if (!instruction->IsAdd() || (reduction_type != DataType::Type::kInt32)) {
+ return false;
+ }
+
+ HInstruction* q = instruction->InputAt(0);
+ HInstruction* v = instruction->InputAt(1);
+ if (!v->IsMul() || v->GetType() != reduction_type) {
+ return false;
+ }
+
+ HInstruction* a = v->InputAt(0);
+ HInstruction* b = v->InputAt(1);
+ HInstruction* r = a;
+ HInstruction* s = b;
+ DataType::Type op_type = GetNarrowerType(a, b);
+ bool is_unsigned = false;
+
+ if (!IsNarrowerOperands(a, b, op_type, &r, &s, &is_unsigned)) {
+ return false;
+ }
+ op_type = HVecOperation::ToProperType(op_type, is_unsigned);
+
+ if (!TrySetVectorType(op_type, &restrictions) ||
+ HasVectorRestrictions(restrictions, kNoDotProd)) {
+ return false;
+ }
+
+ DCHECK(r != nullptr && s != nullptr);
+ // Accept dot product idiom for vectorizable operands. Vectorized code uses the shorthand
+ // idiomatic operation. Sequential code uses the original scalar expressions.
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ r = a;
+ s = b;
+ }
+ if (VectorizeUse(node, q, generate_code, op_type, restrictions) &&
+ VectorizeUse(node, r, generate_code, op_type, restrictions) &&
+ VectorizeUse(node, s, generate_code, op_type, restrictions)) {
+ if (generate_code) {
+ if (vector_mode_ == kVector) {
+ vector_map_->Put(instruction, new (global_allocator_) HVecDotProd(
+ global_allocator_,
+ vector_map_->Get(q),
+ vector_map_->Get(r),
+ vector_map_->Get(s),
+ reduction_type,
+ is_unsigned,
+ GetOtherVL(reduction_type, op_type, vector_length_),
+ kNoDexPc));
+ MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+ } else {
+ GenerateVecOp(v, vector_map_->Get(r), vector_map_->Get(s), reduction_type);
+ GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
//
// Vectorization heuristics.
//
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 2b202fd..1a842c4 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -82,6 +82,7 @@
kNoReduction = 1 << 9, // no reduction
kNoSAD = 1 << 10, // no sum of absolute differences (SAD)
kNoWideSAD = 1 << 11, // no sum of absolute differences (SAD) with operand widening
+ kNoDotProd = 1 << 12, // no dot product
};
/*
@@ -217,6 +218,11 @@
bool generate_code,
DataType::Type type,
uint64_t restrictions);
+ bool VectorizeDotProdIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ DataType::Type type,
+ uint64_t restrictions);
// Vectorization heuristics.
Alignment ComputeAlignment(HInstruction* offset,
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 68f1a24..76887f9 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1453,6 +1453,7 @@
M(VecSetScalars, VecOperation) \
M(VecMultiplyAccumulate, VecOperation) \
M(VecSADAccumulate, VecOperation) \
+ M(VecDotProd, VecOperation) \
M(VecLoad, VecMemoryOperation) \
M(VecStore, VecMemoryOperation) \
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c7539f2..597e399 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -1021,6 +1021,66 @@
DEFAULT_COPY_CONSTRUCTOR(VecSADAccumulate);
};
+// Performs dot product of two vectors and adds the result to wider precision components in
+// the accumulator.
+//
+// viz. DOT_PRODUCT([ a1, .. , am], [ x1, .. , xn ], [ y1, .. , yn ]) =
+// [ a1 + sum(xi * yi), .. , am + sum(xj * yj) ],
+// for m <= n, non-overlapping sums,
+// for either both signed or both unsigned operands x, y.
+//
+// Notes:
+// - packed type reflects the type of sum reduction, not the type of the operands.
+// - IsZeroExtending() is used to determine the kind of signed/zero extension to be
+// performed for the operands.
+//
+// TODO: Support types other than kInt32 for packed type.
+class HVecDotProd final : public HVecOperation {
+ public:
+ HVecDotProd(ArenaAllocator* allocator,
+ HInstruction* accumulator,
+ HInstruction* left,
+ HInstruction* right,
+ DataType::Type packed_type,
+ bool is_zero_extending,
+ size_t vector_length,
+ uint32_t dex_pc)
+ : HVecOperation(kVecDotProd,
+ allocator,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs */ 3,
+ vector_length,
+ dex_pc) {
+ DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+ DCHECK(DataType::IsIntegralType(packed_type));
+ DCHECK(left->IsVecOperation());
+ DCHECK(right->IsVecOperation());
+ DCHECK_EQ(ToSignedType(left->AsVecOperation()->GetPackedType()),
+ ToSignedType(right->AsVecOperation()->GetPackedType()));
+ SetRawInputAt(0, accumulator);
+ SetRawInputAt(1, left);
+ SetRawInputAt(2, right);
+ SetPackedFlag<kFieldHDotProdIsZeroExtending>(is_zero_extending);
+ }
+
+ bool IsZeroExtending() const { return GetPackedFlag<kFieldHDotProdIsZeroExtending>(); }
+
+ bool CanBeMoved() const override { return true; }
+
+ DECLARE_INSTRUCTION(VecDotProd);
+
+ protected:
+ DEFAULT_COPY_CONSTRUCTOR(VecDotProd);
+
+ private:
+ // Additional packed bits.
+ static constexpr size_t kFieldHDotProdIsZeroExtending =
+ HVecOperation::kNumberOfVectorOpPackedBits;
+ static constexpr size_t kNumberOfHDotProdPackedBits = kFieldHDotProdIsZeroExtending + 1;
+ static_assert(kNumberOfHDotProdPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields.");
+};
+
// Loads a vector from memory, viz. load(mem, 1)
// yield the vector [ mem(1), .. , mem(n) ].
class HVecLoad final : public HVecMemoryOperation {