diff options
author | 2018-05-16 19:06:32 +0100 | |
---|---|---|
committer | 2018-07-04 13:12:18 +0100 | |
commit | 18ba1dacaaf426cbeb3c0aff6db9c58a752f9a96 (patch) | |
tree | e6d82d3b8856137a1b09a2843ea88165d97afbfe | |
parent | 0e32908d0ee4be5905cdd409dd3c45331fc98465 (diff) |
ART: Implement loop full unrolling.
Performs whole loop unrolling for small loops with small
trip count to eliminate the loop check overhead, to have
more opportunities for inter-iteration optimizations.
caffeinemark/FloatAtom: 1.2x performance on arm64 Cortex-A57.
Test: 530-checker-peel-unroll.
Test: test-art-host, test-art-target.
Change-Id: Idf3fe3cb611376935d176c60db8c49907222e28a
-rw-r--r-- | compiler/optimizing/loop_analysis.cc | 10 | ||||
-rw-r--r-- | compiler/optimizing/loop_analysis.h | 7 | ||||
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 54 | ||||
-rw-r--r-- | compiler/optimizing/loop_optimization.h | 6 | ||||
-rw-r--r-- | test/527-checker-array-access-split/src/Main.java | 8 | ||||
-rw-r--r-- | test/530-checker-peel-unroll/src/Main.java | 40 |
6 files changed, 119 insertions, 6 deletions
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc index efb23e7d3e..d355cedb35 100644 --- a/compiler/optimizing/loop_analysis.cc +++ b/compiler/optimizing/loop_analysis.cc @@ -84,6 +84,8 @@ class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper { static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17; // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled. static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6; + // Maximum number of instructions to be created as a result of full unrolling. + static constexpr uint32_t kScalarHeuristicFullyUnrolledMaxInstrThreshold = 35; bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* analysis_info) const OVERRIDE { return analysis_info->HasLongTypeInstructions() || @@ -108,6 +110,14 @@ class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper { bool IsLoopPeelingEnabled() const OVERRIDE { return true; } + bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info) const OVERRIDE { + int64_t trip_count = analysis_info->GetTripCount(); + // We assume that trip count is known. + DCHECK_NE(trip_count, LoopAnalysisInfo::kUnknownTripCount); + size_t instr_num = analysis_info->GetNumberOfInstructions(); + return (trip_count * instr_num < kScalarHeuristicFullyUnrolledMaxInstrThreshold); + } + protected: bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info, size_t instr_threshold, diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h index bcb7b70494..57509ee410 100644 --- a/compiler/optimizing/loop_analysis.h +++ b/compiler/optimizing/loop_analysis.h @@ -160,6 +160,13 @@ class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> { // Returns 'false' by default, should be overridden by particular target loop helper. virtual bool IsLoopPeelingEnabled() const { return false; } + // Returns whether it is beneficial to fully unroll the loop. + // + // Returns 'false' by default, should be overridden by particular target loop helper. + virtual bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info ATTRIBUTE_UNUSED) const { + return false; + } + // Returns optimal SIMD unrolling factor for the loop. // // Returns kNoUnrollingFactor by default, should be overridden by particular target loop helper. diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 440cd3351e..7d66155b39 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -422,6 +422,15 @@ static void TryToEvaluateIfCondition(HIf* instruction, HGraph* graph) { } } +// Peel the first 'count' iterations of the loop. +static void PeelByCount(HLoopInformation* loop_info, int count) { + for (int i = 0; i < count; i++) { + // Perform peeling. + PeelUnrollSimpleHelper helper(loop_info); + helper.DoPeeling(); + } +} + // // Public methods. // @@ -811,6 +820,45 @@ bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopAnalysisI return true; } +bool HLoopOptimization::TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code) { + // Fully unroll loops with a known and small trip count. + int64_t trip_count = analysis_info->GetTripCount(); + if (!arch_loop_helper_->IsLoopPeelingEnabled() || + trip_count == LoopAnalysisInfo::kUnknownTripCount || + !arch_loop_helper_->IsFullUnrollingBeneficial(analysis_info)) { + return false; + } + + if (generate_code) { + // Peeling of the N first iterations (where N equals to the trip count) will effectively + // eliminate the loop: after peeling we will have N sequential iterations copied into the loop + // preheader and the original loop. The trip count of this loop will be 0 as the sequential + // iterations are executed first and there are exactly N of them. Thus we can statically + // evaluate the loop exit condition to 'false' and fully eliminate it. + // + // Here is an example of full unrolling of a loop with a trip count 2: + // + // loop_cond_1 + // loop_body_1 <- First iteration. + // | + // \ v + // ==\ loop_cond_2 + // ==/ loop_body_2 <- Second iteration. + // / | + // <- v <- + // loop_cond \ loop_cond \ <- This cond is always false. + // loop_body _/ loop_body _/ + // + HLoopInformation* loop_info = analysis_info->GetLoopInfo(); + PeelByCount(loop_info, trip_count); + HIf* loop_hif = loop_info->GetHeader()->GetLastInstruction()->AsIf(); + int32_t constant = loop_info->Contains(*loop_hif->IfTrueSuccessor()) ? 0 : 1; + loop_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u); + } + + return true; +} + bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) { // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests) // as InstructionSet is needed. @@ -828,7 +876,8 @@ bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) { return false; } - if (!TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) && + if (!TryFullUnrolling(&analysis_info, /*generate_code*/ false) && + !TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) && !TryUnrollingForBranchPenaltyReduction(&analysis_info, /*generate_code*/ false)) { return false; } @@ -838,7 +887,8 @@ bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) { return false; } - return TryPeelingForLoopInvariantExitsElimination(&analysis_info) || + return TryFullUnrolling(&analysis_info) || + TryPeelingForLoopInvariantExitsElimination(&analysis_info) || TryUnrollingForBranchPenaltyReduction(&analysis_info); } diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index bc4792458b..644b740ed4 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -155,6 +155,12 @@ class HLoopOptimization : public HOptimization { bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info, bool generate_code = true); + // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate + // the loop check overhead and to have more opportunities for inter-iteration optimizations. + // Returns whether transformation happened. 'generate_code' determines whether the optimization + // should be actually applied. + bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true); + // Tries to apply scalar loop peeling and unrolling. bool TryPeelingAndUnrolling(LoopNode* node); diff --git a/test/527-checker-array-access-split/src/Main.java b/test/527-checker-array-access-split/src/Main.java index a5caa7bce0..935b37858d 100644 --- a/test/527-checker-array-access-split/src/Main.java +++ b/test/527-checker-array-access-split/src/Main.java @@ -400,7 +400,7 @@ public class Main { /// CHECK: ArraySet [<<Address>>,<<Index>>,<<Div>>] public static int canMergeAfterBCE1() { - int[] array = {0, 7, 14, 21}; + int[] array = {0, 7, 14, 21, 28, 35, 42}; for (int i = 0; i < array.length; i++) { array[i] = array[i] / 7; } @@ -513,7 +513,7 @@ public class Main { /// CHECK-NOT: IntermediateAddress public static int canMergeAfterBCE2() { - int[] array = {64, 8, 4, 2 }; + int[] array = {128, 64, 32, 8, 4, 2 }; for (int i = 0; i < array.length - 1; i++) { array[i + 1] = array[i] << array[i + 1]; } @@ -571,8 +571,8 @@ public class Main { accrossGC(array, 0); assertIntEquals(125, array[0]); - assertIntEquals(3, canMergeAfterBCE1()); - assertIntEquals(1048576, canMergeAfterBCE2()); + assertIntEquals(6, canMergeAfterBCE1()); + assertIntEquals(2097152, canMergeAfterBCE2()); assertIntEquals(18, checkLongFloatDouble()); } diff --git a/test/530-checker-peel-unroll/src/Main.java b/test/530-checker-peel-unroll/src/Main.java index 11c29649ff..4d814407a3 100644 --- a/test/530-checker-peel-unroll/src/Main.java +++ b/test/530-checker-peel-unroll/src/Main.java @@ -1067,6 +1067,46 @@ public class Main { } } + /// CHECK-START: void Main.unrollingFull(int[]) loop_optimization (before) + /// CHECK-DAG: <<Param:l\d+>> ParameterValue loop:none + /// CHECK-DAG: <<Const0:i\d+>> IntConstant 0 loop:none + /// CHECK-DAG: <<Const1:i\d+>> IntConstant 1 loop:none + /// CHECK-DAG: <<Limit:i\d+>> IntConstant 2 loop:none + /// CHECK-DAG: <<Phi:i\d+>> Phi [<<Const0>>,{{i\d+}}] loop:<<Loop:B\d+>> outer_loop:none + /// CHECK-DAG: ArrayGet loop:<<Loop>> outer_loop:none + /// CHECK-DAG: ArrayGet loop:<<Loop>> outer_loop:none + /// CHECK-DAG: ArraySet loop:<<Loop>> outer_loop:none + // + /// CHECK-NOT: ArrayGet + /// CHECK-NOT: ArraySet + + /// CHECK-START: void Main.unrollingFull(int[]) loop_optimization (after) + /// CHECK-DAG: <<Param:l\d+>> ParameterValue loop:none + /// CHECK-DAG: <<Const0:i\d+>> IntConstant 0 loop:none + /// CHECK-DAG: <<Const1:i\d+>> IntConstant 1 loop:none + /// CHECK-DAG: <<Limit:i\d+>> IntConstant 2 loop:none + // Two peeled iterations + /// CHECK-DAG: ArrayGet loop:none + /// CHECK-DAG: ArrayGet loop:none + /// CHECK-DAG: ArraySet loop:none + /// CHECK-DAG: ArrayGet loop:none + /// CHECK-DAG: ArrayGet loop:none + /// CHECK-DAG: ArraySet loop:none + // Loop + /// CHECK-DAG: <<Phi:i\d+>> Phi [{{i\d+}},{{i\d+}}] loop:<<Loop:B\d+>> outer_loop:none + /// CHECK-DAG: ArrayGet loop:<<Loop>> outer_loop:none + /// CHECK-DAG: ArrayGet loop:<<Loop>> outer_loop:none + /// CHECK-DAG: ArraySet loop:<<Loop>> outer_loop:none + /// CHECK-DAG: If [<<Const1>>] loop:<<Loop>> outer_loop:none + // + /// CHECK-NOT: ArrayGet + /// CHECK-NOT: ArraySet + private static final void unrollingFull(int[] a) { + for (int i = 0; i < 2; i++) { + a[i] += a[i + 1]; + } + } + private static void expectEquals(int expected, int result) { if (expected != result) { throw new Error("Expected: " + expected + ", found: " + result); |