diff options
Diffstat (limited to 'compiler/optimizing/loop_optimization.cc')
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 414 |
1 files changed, 280 insertions, 134 deletions
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 716ea19657..aef6f1f5bd 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -366,8 +366,8 @@ static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) { return (restrictions & tested) != 0; } -// Insert an instruction. -static HInstruction* Insert(HBasicBlock* block, HInstruction* instruction) { +// Insert an instruction at the end of the block, with safe checks. +inline HInstruction* Insert(HBasicBlock* block, HInstruction* instruction) { DCHECK(block != nullptr); DCHECK(instruction != nullptr); block->InsertInstructionBefore(instruction, block->GetLastInstruction()); @@ -488,6 +488,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_header_(nullptr), vector_body_(nullptr), vector_index_(nullptr), + loop_main_pred_(nullptr), arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) { } @@ -849,22 +850,33 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { return true; } } + + bool enable_alignment_strategies = !IsInPredicatedVectorizationMode(); // Vectorize loop, if possible and valid. - if (kEnableVectorization && + if (!kEnableVectorization || // Disable vectorization for debuggable graphs: this is a workaround for the bug // in 'GenerateNewLoop' which caused the SuspendCheck environment to be invalid. // TODO: b/138601207, investigate other possible cases with wrong environment values and // possibly switch back vectorization on for debuggable graphs. - !graph_->IsDebuggable() && - TrySetSimpleLoopHeader(header, &main_phi) && - ShouldVectorize(node, body, trip_count) && - TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) { - Vectorize(node, body, exit, trip_count); - graph_->SetHasSIMD(true); // flag SIMD usage - MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized); - return true; + graph_->IsDebuggable() || + !TrySetSimpleLoopHeader(header, &main_phi) || + !CanVectorizeDataFlow(node, body, enable_alignment_strategies)) { + return false; } - return false; + + if (!IsVectorizationProfitable(trip_count) || + !TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) { + return false; + } + + if (IsInPredicatedVectorizationMode()) { + VectorizePredicated(node, body, exit); + } else { + VectorizeTraditional(node, body, exit, trip_count); + } + graph_->SetHasSIMD(true); // flag SIMD usage + MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized); + return true; } bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) { @@ -1011,7 +1023,9 @@ bool HLoopOptimization::TryLoopScalarOpts(LoopNode* node) { // Intel Press, June, 2004 (http://www.aartbik.com/). // -bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) { +bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node, + HBasicBlock* block, + bool collect_alignment_info) { // Reset vector bookkeeping. vector_length_ = 0; vector_refs_->clear(); @@ -1116,24 +1130,108 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6 } } // for i - if (!IsInPredicatedVectorizationMode()) { - // Find a suitable alignment strategy. + if (collect_alignment_info) { + // Update the info on alignment strategy. SetAlignmentStrategy(peeling_votes, peeling_candidate); } - // Does vectorization seem profitable? - if (!IsVectorizationProfitable(trip_count)) { - return false; - } - // Success! return true; } -void HLoopOptimization::Vectorize(LoopNode* node, - HBasicBlock* block, - HBasicBlock* exit, - int64_t trip_count) { +void HLoopOptimization::VectorizePredicated(LoopNode* node, + HBasicBlock* block, + HBasicBlock* exit) { + DCHECK(IsInPredicatedVectorizationMode()); + + HBasicBlock* header = node->loop_info->GetHeader(); + HBasicBlock* preheader = node->loop_info->GetPreHeader(); + + // Adjust vector bookkeeping. + HPhi* main_phi = nullptr; + bool is_simple_loop_header = TrySetSimpleLoopHeader(header, &main_phi); // refills sets + DCHECK(is_simple_loop_header); + vector_header_ = header; + vector_body_ = block; + + // Loop induction type. + DataType::Type induc_type = main_phi->GetType(); + DCHECK(induc_type == DataType::Type::kInt32 || induc_type == DataType::Type::kInt64) + << induc_type; + + // Generate loop control: + // stc = <trip-count>; + // vtc = <vector trip-count> + HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader); + HInstruction* vtc = stc; + vector_index_ = graph_->GetConstant(induc_type, 0); + bool needs_disambiguation_test = false; + // Generate runtime disambiguation test: + // vtc = a != b ? vtc : 0; + if (NeedsArrayRefsDisambiguationTest()) { + HInstruction* rt = Insert( + preheader, + new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_)); + vtc = Insert(preheader, + new (global_allocator_) + HSelect(rt, vtc, graph_->GetConstant(induc_type, 0), kNoDexPc)); + needs_disambiguation_test = true; + } + + // Generate vector loop: + // for ( ; i < vtc; i += vector_length) + // <vectorized-loop-body> + HBasicBlock* preheader_for_vector_loop = + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); + vector_mode_ = kVector; + GenerateNewLoopPredicated(node, + block, + preheader_for_vector_loop, + vector_index_, + vtc, + graph_->GetConstant(induc_type, vector_length_)); + + // Generate scalar loop, if needed: + // for ( ; i < stc; i += 1) + // <loop-body> + if (needs_disambiguation_test) { + vector_mode_ = kSequential; + HBasicBlock* preheader_for_cleanup_loop = + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); + // Use "Traditional" version for the sequential loop. + GenerateNewLoopScalarOrTraditional(node, + block, + preheader_for_cleanup_loop, + vector_index_, + stc, + graph_->GetConstant(induc_type, 1), + LoopAnalysisInfo::kNoUnrollingFactor); + } + + FinalizeVectorization(node, block); + + // Assign governing predicates for the predicated instructions inserted during vectorization + // outside the loop. + for (auto it : *vector_external_set_) { + DCHECK(it->IsVecOperation()); + HVecOperation* vec_op = it->AsVecOperation(); + + HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_, + graph_->GetIntConstant(1), + vec_op->GetPackedType(), + vec_op->GetVectorLength(), + 0u); + vec_op->GetBlock()->InsertInstructionBefore(set_pred, vec_op); + vec_op->SetMergingGoverningPredicate(set_pred); + } +} + +void HLoopOptimization::VectorizeTraditional(LoopNode* node, + HBasicBlock* block, + HBasicBlock* exit, + int64_t trip_count) { + DCHECK(!IsInPredicatedVectorizationMode()); + HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); @@ -1146,7 +1244,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, // A cleanup loop is needed, at least, for any unknown trip count or // for a known trip count with remainder iterations after vectorization. - bool needs_cleanup = !IsInPredicatedVectorizationMode() && + bool needs_cleanup = (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0); // Adjust vector bookkeeping. @@ -1165,13 +1263,11 @@ void HLoopOptimization::Vectorize(LoopNode* node, // ptc = <peeling factor>; HInstruction* ptc = nullptr; if (vector_static_peeling_factor_ != 0) { - DCHECK(!IsInPredicatedVectorizationMode()); // Static loop peeling for SIMD alignment (using the most suitable // fixed peeling factor found during prior alignment analysis). DCHECK(vector_dynamic_peeling_candidate_ == nullptr); ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_); } else if (vector_dynamic_peeling_candidate_ != nullptr) { - DCHECK(!IsInPredicatedVectorizationMode()); // Dynamic loop peeling for SIMD alignment (using the most suitable // candidate found during prior alignment analysis): // rem = offset % ALIGN; // adjusted as #elements @@ -1202,7 +1298,6 @@ void HLoopOptimization::Vectorize(LoopNode* node, HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader); HInstruction* vtc = stc; if (needs_cleanup) { - DCHECK(!IsInPredicatedVectorizationMode()); DCHECK(IsPowerOfTwo(chunk)); HInstruction* diff = stc; if (ptc != nullptr) { @@ -1222,7 +1317,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, // Generate runtime disambiguation test: // vtc = a != b ? vtc : 0; - if (vector_runtime_test_a_ != nullptr) { + if (NeedsArrayRefsDisambiguationTest()) { HInstruction* rt = Insert( preheader, new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_)); @@ -1240,45 +1335,55 @@ void HLoopOptimization::Vectorize(LoopNode* node, // moved around during suspend checks, since all analysis was based on // nothing more than the Android runtime alignment conventions. if (ptc != nullptr) { - DCHECK(!IsInPredicatedVectorizationMode()); vector_mode_ = kSequential; - GenerateNewLoop(node, - block, - graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), - vector_index_, - ptc, - graph_->GetConstant(induc_type, 1), - LoopAnalysisInfo::kNoUnrollingFactor); + HBasicBlock* preheader_for_peeling_loop = + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); + GenerateNewLoopScalarOrTraditional(node, + block, + preheader_for_peeling_loop, + vector_index_, + ptc, + graph_->GetConstant(induc_type, 1), + LoopAnalysisInfo::kNoUnrollingFactor); } // Generate vector loop, possibly further unrolled: // for ( ; i < vtc; i += chunk) // <vectorized-loop-body> vector_mode_ = kVector; - GenerateNewLoop(node, - block, - graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), - vector_index_, - vtc, - graph_->GetConstant(induc_type, vector_length_), // increment per unroll - unroll); - HLoopInformation* vloop = vector_header_->GetLoopInformation(); + HBasicBlock* preheader_for_vector_loop = + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); + GenerateNewLoopScalarOrTraditional(node, + block, + preheader_for_vector_loop, + vector_index_, + vtc, + graph_->GetConstant(induc_type, vector_length_), // per unroll + unroll); // Generate cleanup loop, if needed: // for ( ; i < stc; i += 1) // <loop-body> if (needs_cleanup) { - DCHECK_IMPLIES(IsInPredicatedVectorizationMode(), vector_runtime_test_a_ != nullptr); vector_mode_ = kSequential; - GenerateNewLoop(node, - block, - graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit), - vector_index_, - stc, - graph_->GetConstant(induc_type, 1), - LoopAnalysisInfo::kNoUnrollingFactor); + HBasicBlock* preheader_for_cleanup_loop = + graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); + GenerateNewLoopScalarOrTraditional(node, + block, + preheader_for_cleanup_loop, + vector_index_, + stc, + graph_->GetConstant(induc_type, 1), + LoopAnalysisInfo::kNoUnrollingFactor); } + FinalizeVectorization(node, block); +} + +void HLoopOptimization::FinalizeVectorization(LoopNode* node, HBasicBlock* block) { + HBasicBlock* header = node->loop_info->GetHeader(); + HBasicBlock* preheader = node->loop_info->GetPreHeader(); + HLoopInformation* vloop = vector_header_->GetLoopInformation(); // Link reductions to their final uses. for (auto i = reductions_->begin(); i != reductions_->end(); ++i) { if (i->first->IsPhi()) { @@ -1296,22 +1401,6 @@ void HLoopOptimization::Vectorize(LoopNode* node, // and removing all instructions from the header. block->DisconnectAndDelete(); - if (IsInPredicatedVectorizationMode()) { - // Assigns governing predicates (all true) to the vector operations inserted outside the loop. - // - // TODO: Adjust GVN to support VecPredSetAll sharing. - for (auto it : *vector_external_set_) { - HVecOperation* vec_op = it->AsVecOperation(); - HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_, - graph_->GetIntConstant(1), - vec_op->GetPackedType(), - vec_op->GetVectorLength(), - 0u); - vec_op->GetBlock()->InsertInstructionBefore(set_pred, vec_op); - vec_op->SetMergingGoverningPredicate(set_pred); - } - } - while (!header->GetFirstInstruction()->IsGoto()) { header->RemoveInstruction(header->GetFirstInstruction()); } @@ -1323,14 +1412,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, node->loop_info = vloop; } -void HLoopOptimization::GenerateNewLoop(LoopNode* node, - HBasicBlock* block, - HBasicBlock* new_preheader, - HInstruction* lo, - HInstruction* hi, - HInstruction* step, - uint32_t unroll) { - DCHECK(unroll == 1 || vector_mode_ == kVector); +HPhi* HLoopOptimization::InitializeForNewLoop(HBasicBlock* new_preheader, HInstruction* lo) { DataType::Type induc_type = lo->GetType(); // Prepare new loop. vector_preheader_ = new_preheader, @@ -1340,70 +1422,132 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, kNoRegNumber, 0, HPhi::ToPhiType(induc_type)); - // Generate header and prepare body. - // for (i = lo; i < hi; i += step) - // <loop-body> - HInstruction* cond = nullptr; - HInstruction* set_pred = nullptr; - if (IsInPredicatedVectorizationMode()) { - HVecPredWhile* pred_while = - new (global_allocator_) HVecPredWhile(global_allocator_, - phi, - hi, - HVecPredWhile::CondKind::kLO, - DataType::Type::kInt32, - vector_length_, - 0u); - - cond = new (global_allocator_) HVecPredCondition(global_allocator_, - pred_while, - HVecPredCondition::PCondKind::kNFirst, - DataType::Type::kInt32, - vector_length_, - 0u); - - vector_header_->AddPhi(phi); - vector_header_->AddInstruction(pred_while); - vector_header_->AddInstruction(cond); - set_pred = pred_while; - } else { - cond = new (global_allocator_) HAboveOrEqual(phi, hi); - vector_header_->AddPhi(phi); - vector_header_->AddInstruction(cond); - } - - vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); + vector_header_->AddPhi(phi); vector_index_ = phi; - vector_permanent_map_->clear(); // preserved over unrolling + vector_permanent_map_->clear(); vector_external_set_->clear(); + return phi; +} + +void HLoopOptimization::GenerateNewLoopScalarOrTraditional(LoopNode* node, + HBasicBlock* body, + HBasicBlock* new_preheader, + HInstruction* lo, + HInstruction* hi, + HInstruction* step, + uint32_t unroll) { + DCHECK(unroll == 1 || vector_mode_ == kVector); + DataType::Type induc_type = lo->GetType(); + HPhi* phi = InitializeForNewLoop(new_preheader, lo); + + // Generate loop exit check. + HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi); + vector_header_->AddInstruction(cond); + vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); + for (uint32_t u = 0; u < unroll; u++) { - // Generate instruction map. - vector_map_->clear(); - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); - DCHECK(vectorized_def); - } - // Generate body from the instruction map, but in original program order. - HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); - for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - auto i = vector_map_->find(it.Current()); - if (i != vector_map_->end() && !i->second->IsInBlock()) { - Insert(vector_body_, i->second); - if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) { - HVecOperation* op = i->second->AsVecOperation(); - op->SetMergingGoverningPredicate(set_pred); - } - // Deal with instructions that need an environment, such as the scalar intrinsics. - if (i->second->NeedsEnvironment()) { - i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); - } + GenerateNewLoopBodyOnce(node, body, induc_type, step); + } + + FinalizePhisForNewLoop(phi, lo); +} + +void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node, + HBasicBlock* body, + HBasicBlock* new_preheader, + HInstruction* lo, + HInstruction* hi, + HInstruction* step) { + DCHECK(IsInPredicatedVectorizationMode()); + DCHECK_EQ(vector_mode_, kVector); + DataType::Type induc_type = lo->GetType(); + HPhi* phi = InitializeForNewLoop(new_preheader, lo); + + // Generate loop exit check. + HVecPredWhile* pred_while = + new (global_allocator_) HVecPredWhile(global_allocator_, + phi, + hi, + HVecPredWhile::CondKind::kLO, + DataType::Type::kInt32, + vector_length_, + 0u); + + HInstruction* cond = + new (global_allocator_) HVecPredCondition(global_allocator_, + pred_while, + HVecPredCondition::PCondKind::kNFirst, + DataType::Type::kInt32, + vector_length_, + 0u); + + vector_header_->AddInstruction(pred_while); + vector_header_->AddInstruction(cond); + loop_main_pred_ = pred_while; + + + vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); + + GenerateNewLoopBodyOnce(node, body, induc_type, step); + + // Assign governing predicates for instructions in the loop. + for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end()) { + HInstruction* instr = i->second; + + if (!instr->IsVecOperation()) { + continue; + } + // There are cases when a vector instruction, which corresponds to some instruction in the + // original scalar loop, is located not in the newly created vector loop but + // in the vector loop preheader (and hence recorded in vector_external_set_). + // + // Governing predicates will be set for such instructions separately. + bool in_vector_loop = vector_header_->GetLoopInformation()->Contains(*instr->GetBlock()); + DCHECK_IMPLIES(!in_vector_loop, + vector_external_set_->find(instr) != vector_external_set_->end()); + + if (in_vector_loop && + !instr->AsVecOperation()->IsPredicated()) { + HVecOperation* op = instr->AsVecOperation(); + op->SetMergingGoverningPredicate(loop_main_pred_); + } + } + } + + FinalizePhisForNewLoop(phi, lo); +} + +void HLoopOptimization::GenerateNewLoopBodyOnce(LoopNode* node, + HBasicBlock* body, + DataType::Type induc_type, + HInstruction* step) { + // Generate instruction map. + vector_map_->clear(); + for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { + bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); + DCHECK(vectorized_def); + } + // Generate body from the instruction map, but in original program order. + HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); + for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end() && !i->second->IsInBlock()) { + Insert(vector_body_, i->second); + // Deal with instructions that need an environment, such as the scalar intrinsics. + if (i->second->NeedsEnvironment()) { + i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); } } - // Generate the induction. - vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step); - Insert(vector_body_, vector_index_); } + // Generate the induction. + vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step); + Insert(vector_body_, vector_index_); +} + +void HLoopOptimization::FinalizePhisForNewLoop(HPhi* phi, HInstruction* lo) { // Finalize phi inputs for the reductions (if any). for (auto i = reductions_->begin(); i != reductions_->end(); ++i) { if (!i->first->IsPhi()) { @@ -2425,6 +2569,8 @@ bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) { // TODO: trip count is really unsigned entity, provided the guarding test // is satisfied; deal with this more carefully later uint32_t max_peel = MaxNumberPeeled(); + // Peeling is not supported in predicated mode. + DCHECK_IMPLIES(IsInPredicatedVectorizationMode(), max_peel == 0u); if (vector_length_ == 0) { return false; // nothing found } else if (trip_count < 0) { |