ART: Implement scalar loop peeling.
Implement scalar loop peeling for the elimination of loop-invariant
exits (on arm64). If the loop exit condition is loop invariant, then
loop peeling + GVN + DCE can eliminate this exit from the loop body.
Note: GVN and DCE aren't applied during loop optimizations.
Note: this functionality is turned off by default for now.
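
For illustration, a minimal Java-level sketch (hypothetical, not part of
this change) of the kind of loop this targets, with a loop-invariant
exit:

  static int sum(int[] a, boolean flag) {
    int s = 0;
    for (int i = 0; i < a.length; i++) {
      // 'flag' is loop invariant, so this exit is re-checked
      // uselessly on every iteration.
      if (flag) {
        return -1;
      }
      s += a[i];
    }
    return s;
  }

After one iteration is peeled, 'flag' has already been evaluated on every
path that enters the remaining loop, so the in-loop check can be replaced
with a constant and the exit removed from the loop body.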
Test: test-art-host, test-art-target, boot-to-gui.
Change-Id: I98d20054a431838b452dc06bd25c075eb445960c
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index cd3bdaf..a0760ef 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -16,6 +16,8 @@
#include "loop_analysis.h"
+#include "base/bit_vector-inl.h"
+
namespace art {
void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info,
@@ -33,7 +35,8 @@
for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
HInstruction* instruction = it.Current();
- if (MakesScalarUnrollingNonBeneficial(instruction)) {
+ if (MakesScalarPeelingUnrollingNonBeneficial(instruction)) {
+ analysis_results->has_instructions_preventing_scalar_peeling_ = true;
analysis_results->has_instructions_preventing_scalar_unrolling_ = true;
}
analysis_results->instr_num_++;
@@ -42,6 +45,22 @@
}
}
+bool LoopAnalysis::HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info) {
+ HGraph* graph = loop_info->GetHeader()->GetGraph();
+ for (uint32_t block_id : loop_info->GetBlocks().Indexes()) {
+ HBasicBlock* block = graph->GetBlocks()[block_id];
+ DCHECK(block != nullptr);
+ if (block->EndsWithIf()) {
+ HIf* hif = block->GetLastInstruction()->AsIf();
+ HInstruction* input = hif->InputAt(0);
+ if (IsLoopExit(loop_info, hif) && !loop_info->Contains(*input->GetBlock())) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
class Arm64LoopHelper : public ArchDefaultLoopHelper {
public:
// Scalar loop unrolling parameters and heuristics.
@@ -60,7 +79,7 @@
// Loop's maximum instruction count. Loops with higher count will not be unrolled.
static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50;
- bool IsLoopTooBigForScalarUnrolling(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
+ bool IsLoopTooBigForScalarPeelingUnrolling(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
size_t instr_num = loop_analysis_info->GetNumberOfInstructions();
size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks();
return (instr_num >= kArm64ScalarHeuristicMaxBodySizeInstr ||
@@ -77,6 +96,8 @@
return desired_unrolling_factor;
}
+ bool IsLoopPeelingEnabled() const OVERRIDE { return true; }
+
uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
int64_t trip_count,
uint32_t max_peel,
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index bad406f..ece9858 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -33,6 +33,7 @@
: bb_num_(0),
instr_num_(0),
exits_num_(0),
+ has_instructions_preventing_scalar_peeling_(false),
has_instructions_preventing_scalar_unrolling_(false),
loop_info_(loop_info) {}
@@ -40,6 +41,10 @@
size_t GetNumberOfInstructions() const { return instr_num_; }
size_t GetNumberOfExits() const { return exits_num_; }
+ bool HasInstructionsPreventingScalarPeeling() const {
+ return has_instructions_preventing_scalar_peeling_;
+ }
+
bool HasInstructionsPreventingScalarUnrolling() const {
return has_instructions_preventing_scalar_unrolling_;
}
@@ -53,6 +58,8 @@
size_t instr_num_;
// Number of loop's exits.
size_t exits_num_;
+ // Whether the loop has instructions which make scalar loop peeling non-beneficial.
+ bool has_instructions_preventing_scalar_peeling_;
// Whether the loop has instructions which make scalar loop unrolling non-beneficial.
bool has_instructions_preventing_scalar_unrolling_;
@@ -71,22 +78,35 @@
static void CalculateLoopBasicProperties(HLoopInformation* loop_info,
LoopAnalysisInfo* analysis_results);
+ // Returns whether the loop has at least one loop invariant exit.
+ static bool HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info);
+
+ // Returns whether HIf's true or false successor is outside the specified loop.
+ //
+ // Prerequisite: HIf must be in the specified loop.
+ static bool IsLoopExit(HLoopInformation* loop_info, const HIf* hif) {
+ DCHECK(loop_info->Contains(*hif->GetBlock()));
+ HBasicBlock* true_succ = hif->IfTrueSuccessor();
+ HBasicBlock* false_succ = hif->IfFalseSuccessor();
+ return (!loop_info->Contains(*true_succ) || !loop_info->Contains(*false_succ));
+ }
+
private:
- // Returns whether an instruction makes scalar loop unrolling non-beneficial.
+ // Returns whether an instruction makes scalar loop peeling/unrolling non-beneficial.
//
// If in the loop body we have a dex/runtime call then its contribution to the whole
- // loop performance will probably prevail. So unrolling optimization will not bring
- // any noticeable performance improvement however will increase the code size.
- static bool MakesScalarUnrollingNonBeneficial(HInstruction* instruction) {
+ // loop performance will probably prevail. So the peeling/unrolling optimization will
+ // not bring any noticeable performance improvement; it will only increase the code size.
+ static bool MakesScalarPeelingUnrollingNonBeneficial(HInstruction* instruction) {
return (instruction->IsNewArray() ||
instruction->IsNewInstance() ||
instruction->IsUnresolvedInstanceFieldGet() ||
instruction->IsUnresolvedInstanceFieldSet() ||
instruction->IsUnresolvedStaticFieldGet() ||
instruction->IsUnresolvedStaticFieldSet() ||
- // TODO: Unroll loops with intrinsified invokes.
+ // TODO: Support loops with intrinsified invokes.
instruction->IsInvoke() ||
- // TODO: Unroll loops with ClinitChecks.
+ // TODO: Support loops with ClinitChecks.
instruction->IsClinitCheck());
}
};
@@ -105,14 +125,14 @@
// doesn't support loop peeling and unrolling.
static ArchDefaultLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
- // Returns whether the loop is too big for loop unrolling by checking its total number of
+ // Returns whether the loop is too big for loop peeling/unrolling by checking its total number of
// basic blocks and instructions.
//
- // If the loop body has too many instructions then unrolling optimization will not bring
+ // If the loop body has too many instructions then peeling/unrolling optimization will not bring
// any noticeable performance improvement however will increase the code size.
//
// Returns 'true' by default, should be overridden by particular target loop helper.
- virtual bool IsLoopTooBigForScalarUnrolling(
+ virtual bool IsLoopTooBigForScalarPeelingUnrolling(
LoopAnalysisInfo* loop_analysis_info ATTRIBUTE_UNUSED) const { return true; }
// Returns optimal scalar unrolling factor for the loop.
@@ -123,6 +143,11 @@
return kNoUnrollingFactor;
}
+ // Returns whether scalar loop peeling is enabled.
+ //
+ // Returns 'false' by default, should be overridden by particular target loop helper.
+ virtual bool IsLoopPeelingEnabled() const { return false; }
+
// Returns optimal SIMD unrolling factor for the loop.
//
// Returns kNoUnrollingFactor by default, should be overridden by particular target loop helper.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index b41b659..c0c721d 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -34,7 +34,7 @@
static constexpr bool kEnableVectorization = true;
-// Enables scalar loop unrolling in the loop optimizer.
-static constexpr bool kEnableScalarUnrolling = false;
+// Enables scalar loop peeling and unrolling in the loop optimizer.
+static constexpr bool kEnableScalarPeelingUnrolling = false;
//
// Static helpers.
@@ -533,6 +533,43 @@
return true;
}
+// Tries to statically evaluate the condition of the specified HIf and to replace other
+// checks of the same condition with the known constant value.
+static void TryToEvaluateIfCondition(HIf* instruction, HGraph* graph) {
+ HInstruction* cond = instruction->InputAt(0);
+
+ // If a condition 'cond' is evaluated in an HIf instruction then in the successors of the
+ // IF_BLOCK we statically know the value of the condition 'cond' (TRUE in TRUE_SUCC, FALSE in
+ // FALSE_SUCC). Using that we can replace another evaluation (use) EVAL of the same 'cond'
+ // with TRUE value (FALSE value) if every path from the ENTRY_BLOCK to EVAL_BLOCK contains the
+ // edge HIF_BLOCK->TRUE_SUCC (HIF_BLOCK->FALSE_SUCC).
+ //   if (cond) {                if (cond) {
+ //     if (cond) {}               if (1) {}
+ //   } else {        =======>   } else {
+ //     if (cond) {}               if (0) {}
+ //   }                          }
+ if (!cond->IsConstant()) {
+ HBasicBlock* true_succ = instruction->IfTrueSuccessor();
+ HBasicBlock* false_succ = instruction->IfFalseSuccessor();
+
+ DCHECK_EQ(true_succ->GetPredecessors().size(), 1u);
+ DCHECK_EQ(false_succ->GetPredecessors().size(), 1u);
+
+ const HUseList<HInstruction*>& uses = cond->GetUses();
+ for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) {
+ HInstruction* user = it->GetUser();
+ size_t index = it->GetIndex();
+ HBasicBlock* user_block = user->GetBlock();
+ // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput().
+ ++it;
+ if (true_succ->Dominates(user_block)) {
+ user->ReplaceInput(graph->GetIntConstant(1), index);
+ } else if (false_succ->Dominates(user_block)) {
+ user->ReplaceInput(graph->GetIntConstant(0), index);
+ }
+ }
+ }
+}
+
//
// Public methods.
//
@@ -851,40 +888,24 @@
bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
return TryOptimizeInnerLoopFinite(node) ||
+ TryPeelingForLoopInvariantExitsElimination(node) ||
TryUnrollingForBranchPenaltyReduction(node);
}
-void HLoopOptimization::PeelOrUnrollOnce(LoopNode* loop_node,
- bool do_unrolling,
- SuperblockCloner::HBasicBlockMap* bb_map,
- SuperblockCloner::HInstructionMap* hir_map) {
- // TODO: peel loop nests.
- DCHECK(loop_node->inner == nullptr);
- // Check that loop info is up-to-date.
- HLoopInformation* loop_info = loop_node->loop_info;
- HBasicBlock* header = loop_info->GetHeader();
- DCHECK(loop_info == header->GetLoopInformation());
-
- PeelUnrollHelper helper(loop_info, bb_map, hir_map);
- DCHECK(helper.IsLoopClonable());
- HBasicBlock* new_header = do_unrolling ? helper.DoUnrolling() : helper.DoPeeling();
- DCHECK(header == new_header);
- DCHECK(loop_info == new_header->GetLoopInformation());
-}
//
// Loop unrolling: generic part methods.
//
-bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node) {
+bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) {
// Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
// as InstructionSet is needed.
- if (!kEnableScalarUnrolling || compiler_driver_ == nullptr) {
+ if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
return false;
}
- HLoopInformation* loop_info = loop_node->loop_info;
+ HLoopInformation* loop_info = node->loop_info;
int64_t trip_count = 0;
// Only unroll loops with a known tripcount.
if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) {
@@ -900,7 +921,7 @@
LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
// Check "IsLoopClonable" last as it can be time-consuming.
- if (arch_loop_helper_->IsLoopTooBigForScalarUnrolling(&loop_analysis_info) ||
+ if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
(loop_analysis_info.GetNumberOfExits() > 1) ||
loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
!PeelUnrollHelper::IsLoopClonable(loop_info)) {
@@ -911,21 +932,57 @@
DCHECK_EQ(unrolling_factor, 2u);
// Perform unrolling.
- ArenaAllocator* arena = loop_info->GetHeader()->GetGraph()->GetAllocator();
- SuperblockCloner::HBasicBlockMap bb_map(
- std::less<HBasicBlock*>(), arena->Adapter(kArenaAllocSuperblockCloner));
- SuperblockCloner::HInstructionMap hir_map(
- std::less<HInstruction*>(), arena->Adapter(kArenaAllocSuperblockCloner));
- PeelOrUnrollOnce(loop_node, /* unrolling */ true, &bb_map, &hir_map);
+ PeelUnrollSimpleHelper helper(loop_info);
+ helper.DoUnrolling();
// Remove the redundant loop check after unrolling.
- HIf* copy_hif = bb_map.Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
+ HIf* copy_hif =
+ helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0;
copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
return true;
}
+bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) {
+ // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+ // as InstructionSet is needed.
+ if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+ return false;
+ }
+
+ HLoopInformation* loop_info = node->loop_info;
+ if (!arch_loop_helper_->IsLoopPeelingEnabled()) {
+ return false;
+ }
+
+ LoopAnalysisInfo loop_analysis_info(loop_info);
+ LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+ // Check "IsLoopClonable" last as it can be time-consuming.
+ if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+ loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+ !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) ||
+ !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+ return false;
+ }
+
+ // Perform peeling.
+ PeelUnrollSimpleHelper helper(loop_info);
+ helper.DoPeeling();
+
+ const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap();
+ for (auto entry : *hir_map) {
+ HInstruction* copy = entry.second;
+ if (copy->IsIf()) {
+ TryToEvaluateIfCondition(copy->AsIf(), graph_);
+ }
+ }
+
+ return true;
+}
+
//
// Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
// "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 0120cff..f9a31a3 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -145,19 +145,14 @@
// Performs optimizations specific to inner loop. Returns true if anything changed.
bool OptimizeInnerLoop(LoopNode* node);
- // Performs loop peeling/unrolling once (depends on the 'do_unrolling'); the transformation
- // preserves the header and the loop info.
- //
- // Note: the function records copying information about blocks and instructions.
- void PeelOrUnrollOnce(LoopNode* loop_node,
- bool do_unrolling,
- SuperblockCloner::HBasicBlockMap* bb_map,
- SuperblockCloner::HInstructionMap* hir_map);
-
// Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
// opportunities. Returns whether transformation happened.
bool TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node);
+ // Tries to apply loop peeling to eliminate loop-invariant exits. Returns whether
+ // transformation happened.
+ bool TryPeelingForLoopInvariantExitsElimination(LoopNode* loop_node);
+
//
// Vectorization analysis and synthesis.
//
diff --git a/compiler/optimizing/superblock_cloner.cc b/compiler/optimizing/superblock_cloner.cc
index ee74f10..fad7729 100644
--- a/compiler/optimizing/superblock_cloner.cc
+++ b/compiler/optimizing/superblock_cloner.cc
@@ -28,11 +28,6 @@
using HBasicBlockSet = SuperblockCloner::HBasicBlockSet;
using HEdgeSet = SuperblockCloner::HEdgeSet;
-// When doing peeling we can choose whether to keep original loop (made of original basic blocks)
-// and form a peeled iteration of the copy blocks (preserve the header) or transfer original loop
-// blocks to the peeled iteration and create new loop from the copy blocks. Similar for unrolling.
-static const bool kPeelUnrollPreserveHeader = true;
-
void HEdge::Dump(std::ostream& stream) const {
stream << "(" << from_ << "->" << to_ << ")";
}
@@ -926,16 +921,12 @@
remap_orig_internal->Insert(e);
remap_copy_internal->Insert(e);
} else {
- if (kPeelUnrollPreserveHeader) {
- remap_copy_internal->Insert(e);
- } else {
- remap_orig_internal->Insert(e);
- }
+ remap_copy_internal->Insert(e);
}
}
// Set up remap_incoming edges set.
- if (to_unroll != kPeelUnrollPreserveHeader) {
+ if (!to_unroll) {
remap_incoming->Insert(HEdge(loop_info->GetPreHeader(), loop_header));
}
}
@@ -992,6 +983,9 @@
DCHECK(!loop_info_->IsIrreducible());
HBasicBlock* loop_header = loop_info_->GetHeader();
+ // Check that loop info is up-to-date.
+ DCHECK(loop_info_ == loop_header->GetLoopInformation());
+
HGraph* graph = loop_header->GetGraph();
ArenaAllocator allocator(graph->GetAllocator()->GetArenaPool());
@@ -1009,7 +1003,10 @@
cloner_.Run();
cloner_.CleanUp();
- return kPeelUnrollPreserveHeader ? loop_header : cloner_.GetBlockCopy(loop_header);
+ // Check that loop info is preserved.
+ DCHECK(loop_info_ == loop_header->GetLoopInformation());
+
+ return loop_header;
}
PeelUnrollSimpleHelper::PeelUnrollSimpleHelper(HLoopInformation* info)
diff --git a/compiler/optimizing/superblock_cloner.h b/compiler/optimizing/superblock_cloner.h
index afd5a5d..e093167 100644
--- a/compiler/optimizing/superblock_cloner.h
+++ b/compiler/optimizing/superblock_cloner.h
@@ -372,6 +372,9 @@
HBasicBlock* DoUnrolling() { return helper_.DoUnrolling(); }
HLoopInformation* GetRegionToBeAdjusted() const { return helper_.GetRegionToBeAdjusted(); }
+ const SuperblockCloner::HBasicBlockMap* GetBasicBlockMap() const { return &bb_map_; }
+ const SuperblockCloner::HInstructionMap* GetInstructionMap() const { return &hir_map_; }
+
private:
SuperblockCloner::HBasicBlockMap bb_map_;
SuperblockCloner::HInstructionMap hir_map_;