ART: Implement predicated SIMD vectorization.
This CL brings support for predicated execution to the
auto-vectorizer and implements an arm64 SVE vector backend.
This version passes all the VIXL simulator-runnable tests in
SVE mode with checker off (as all VecOp CHECKs need to be
adjusted for an extra input) and all tests in NEON mode.
Test: art SIMD tests on VIXL simulator.
Test: art tests on FVP (steps in test/README.arm_fvp.md)
Change-Id: Ib78bde31a15e6713d875d6668ad4458f5519605f
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 4c9b01c..1210dbe 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -473,6 +473,7 @@
iset_(nullptr),
reductions_(nullptr),
simplified_(false),
+ predicated_vectorization_mode_(codegen.SupportsPredicatedSIMD()),
vector_length_(0),
vector_refs_(nullptr),
vector_static_peeling_factor_(0),
@@ -486,10 +487,7 @@
vector_header_(nullptr),
vector_body_(nullptr),
vector_index_(nullptr),
- arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_options_ != nullptr
- ? compiler_options_->GetInstructionSet()
- : InstructionSet::kNone,
- global_allocator_)) {
+ arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) {
}
bool HLoopOptimization::Run() {
@@ -1024,8 +1022,10 @@
}
} // for i
- // Find a suitable alignment strategy.
- SetAlignmentStrategy(peeling_votes, peeling_candidate);
+ if (!IsInPredicatedVectorizationMode()) {
+ // Find a suitable alignment strategy.
+ SetAlignmentStrategy(peeling_votes, peeling_candidate);
+ }
// Does vectorization seem profitable?
if (!IsVectorizationProfitable(trip_count)) {
@@ -1052,8 +1052,8 @@
// A cleanup loop is needed, at least, for any unknown trip count or
// for a known trip count with remainder iterations after vectorization.
- bool needs_cleanup = trip_count == 0 ||
- ((trip_count - vector_static_peeling_factor_) % chunk) != 0;
+ bool needs_cleanup = !IsInPredicatedVectorizationMode() &&
+ (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0);
// Adjust vector bookkeeping.
HPhi* main_phi = nullptr;
@@ -1071,11 +1071,13 @@
// ptc = <peeling factor>;
HInstruction* ptc = nullptr;
if (vector_static_peeling_factor_ != 0) {
+ DCHECK(!IsInPredicatedVectorizationMode());
// Static loop peeling for SIMD alignment (using the most suitable
// fixed peeling factor found during prior alignment analysis).
DCHECK(vector_dynamic_peeling_candidate_ == nullptr);
ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_);
} else if (vector_dynamic_peeling_candidate_ != nullptr) {
+ DCHECK(!IsInPredicatedVectorizationMode());
// Dynamic loop peeling for SIMD alignment (using the most suitable
// candidate found during prior alignment analysis):
// rem = offset % ALIGN; // adjusted as #elements
@@ -1106,6 +1108,7 @@
HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
HInstruction* vtc = stc;
if (needs_cleanup) {
+ DCHECK(!IsInPredicatedVectorizationMode());
DCHECK(IsPowerOfTwo(chunk));
HInstruction* diff = stc;
if (ptc != nullptr) {
@@ -1143,6 +1146,7 @@
// moved around during suspend checks, since all analysis was based on
// nothing more than the Android runtime alignment conventions.
if (ptc != nullptr) {
+ DCHECK(!IsInPredicatedVectorizationMode());
vector_mode_ = kSequential;
GenerateNewLoop(node,
block,
@@ -1170,6 +1174,7 @@
// for ( ; i < stc; i += 1)
// <loop-body>
if (needs_cleanup) {
+ DCHECK(!IsInPredicatedVectorizationMode() || vector_runtime_test_a_ != nullptr);
vector_mode_ = kSequential;
GenerateNewLoop(node,
block,
@@ -1227,9 +1232,35 @@
// Generate header and prepare body.
// for (i = lo; i < hi; i += step)
// <loop-body>
- HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
- vector_header_->AddPhi(phi);
- vector_header_->AddInstruction(cond);
+ HInstruction* cond = nullptr;
+ HInstruction* set_pred = nullptr;
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredWhile* pred_while =
+ new (global_allocator_) HVecPredWhile(global_allocator_,
+ phi,
+ hi,
+ HVecPredWhile::CondKind::kLO,
+ DataType::Type::kInt32,
+ vector_length_,
+ 0u);
+
+ cond = new (global_allocator_) HVecPredCondition(global_allocator_,
+ pred_while,
+ HVecPredCondition::PCondKind::kNFirst,
+ DataType::Type::kInt32,
+ vector_length_,
+ 0u);
+
+ vector_header_->AddPhi(phi);
+ vector_header_->AddInstruction(pred_while);
+ vector_header_->AddInstruction(cond);
+ set_pred = pred_while;
+ } else {
+ cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+ vector_header_->AddPhi(phi);
+ vector_header_->AddInstruction(cond);
+ }
+
vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
vector_index_ = phi;
vector_permanent_map_->clear(); // preserved over unrolling
@@ -1246,6 +1277,10 @@
auto i = vector_map_->find(it.Current());
if (i != vector_map_->end() && !i->second->IsInBlock()) {
Insert(vector_body_, i->second);
+ if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) {
+ HVecOperation* op = i->second->AsVecOperation();
+ op->SetMergingGoverningPredicate(set_pred);
+ }
// Deal with instructions that need an environment, such as the scalar intrinsics.
if (i->second->NeedsEnvironment()) {
i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
@@ -1360,7 +1395,10 @@
} else if (instruction->IsArrayGet()) {
// Deal with vector restrictions.
bool is_string_char_at = instruction->AsArrayGet()->IsStringCharAt();
- if (is_string_char_at && HasVectorRestrictions(restrictions, kNoStringCharAt)) {
+
+ if (is_string_char_at && (HasVectorRestrictions(restrictions, kNoStringCharAt) ||
+ IsInPredicatedVectorizationMode())) {
+ // TODO: Support CharAt for predicated mode.
return false;
}
// Accept a right-hand-side array base[index] for
@@ -1575,32 +1613,73 @@
}
return false;
case InstructionSet::kArm64:
- // Allow vectorization for all ARM devices, because Android assumes that
- // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
- switch (type) {
- case DataType::Type::kBool:
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(type, 16);
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(type, 8);
- case DataType::Type::kInt32:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(type, 4);
- case DataType::Type::kInt64:
- *restrictions |= kNoDiv | kNoMul;
- return TrySetVectorLength(type, 2);
- case DataType::Type::kFloat32:
- *restrictions |= kNoReduction;
- return TrySetVectorLength(type, 4);
- case DataType::Type::kFloat64:
- *restrictions |= kNoReduction;
- return TrySetVectorLength(type, 2);
- default:
- return false;
+ if (IsInPredicatedVectorizationMode()) {
+ // SVE vectorization.
+ CHECK(features->AsArm64InstructionSetFeatures()->HasSVE());
+ switch (type) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ *restrictions |= kNoDiv |
+ kNoSignedHAdd |
+ kNoUnsignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD;
+ return TrySetVectorLength(type, 16);
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ *restrictions |= kNoDiv |
+ kNoSignedHAdd |
+ kNoUnsignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD |
+ kNoDotProd;
+ return TrySetVectorLength(type, 8);
+ case DataType::Type::kInt32:
+ *restrictions |= kNoDiv | kNoSAD;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kInt64:
+ *restrictions |= kNoDiv | kNoSAD;
+ return TrySetVectorLength(type, 2);
+ case DataType::Type::kFloat32:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kFloat64:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 2);
+ default:
+ break;
+ }
+ return false;
+ } else {
+ // Allow vectorization for all ARM devices, because Android assumes that
+ // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
+ switch (type) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 16);
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 8);
+ case DataType::Type::kInt32:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kInt64:
+ *restrictions |= kNoDiv | kNoMul;
+ return TrySetVectorLength(type, 2);
+ case DataType::Type::kFloat32:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kFloat64:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 2);
+ default:
+ break;
+ }
+ return false;
}
case InstructionSet::kX86:
case InstructionSet::kX86_64:
@@ -1693,6 +1772,15 @@
vector = new (global_allocator_)
HVecReplicateScalar(global_allocator_, input, type, vector_length_, kNoDexPc);
vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length_,
+ 0u);
+ vector_preheader_->InsertInstructionBefore(set_pred, vector);
+ vector->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
}
vector_map_->Put(org, vector);
}
@@ -1821,6 +1909,15 @@
vector_length,
kNoDexPc));
}
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length,
+ 0u);
+ vector_preheader_->InsertInstructionBefore(set_pred, new_init);
+ new_init->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
} else {
new_init = ReduceAndExtractIfNeeded(new_init);
}
@@ -1852,6 +1949,17 @@
instruction = new (global_allocator_) HVecExtractScalar(
global_allocator_, reduce, type, vector_length, 0, kNoDexPc);
exit->InsertInstructionAfter(instruction, reduce);
+
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length,
+ 0u);
+ exit->InsertInstructionBefore(set_pred, reduce);
+ reduce->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ instruction->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
}
}
return instruction;
@@ -1991,7 +2099,8 @@
return false;
}
// Deal with vector restrictions.
- if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
+ if ((is_unsigned && HasVectorRestrictions(restrictions, kNoUnsignedHAdd)) ||
+ (!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
(!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
return false;
}