ART: Implement full loop unrolling.
Performs whole-loop unrolling for small loops with a known, small
trip count, eliminating the loop-check overhead and exposing more
opportunities for inter-iteration optimizations.
caffeinemark/FloatAtom: 1.2x performance improvement on arm64 Cortex-A57.
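
A minimal before/after sketch of the transformation (hypothetical
loop, not taken from this patch):

    // Before: the trip count is statically known to be 4.
    for (int i = 0; i < 4; ++i) {
      sum += a[i];
    }

    // After full unrolling: the induction variable update and the
    // back-edge check are gone, and the iterations can be optimized
    // across.
    sum += a[0];
    sum += a[1];
    sum += a[2];
    sum += a[3];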
Test: 530-checker-peel-unroll.
Test: test-art-host, test-art-target.
Change-Id: Idf3fe3cb611376935d176c60db8c49907222e28a
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index efb23e7..d355ced 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -84,6 +84,8 @@
static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17;
// Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6;
+ // Maximum number of instructions to be created as a result of full unrolling.
+ static constexpr uint32_t kScalarHeuristicFullyUnrolledMaxInstrThreshold = 35;

bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* analysis_info) const OVERRIDE {
return analysis_info->HasLongTypeInstructions() ||
@@ -108,6 +110,14 @@
bool IsLoopPeelingEnabled() const OVERRIDE { return true; }

+ bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info) const OVERRIDE {
+ int64_t trip_count = analysis_info->GetTripCount();
+ // We assume that the trip count is known.
+ DCHECK_NE(trip_count, LoopAnalysisInfo::kUnknownTripCount);
+ size_t instr_num = analysis_info->GetNumberOfInstructions();
+ return (trip_count * instr_num < kScalarHeuristicFullyUnrolledMaxInstrThreshold);
+ }
+
protected:
bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info,
size_t instr_threshold,
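
For reference, a self-contained sketch of the cost check introduced
above, with worked examples (standalone C++; the constant and function
names mirror the patch, everything else is assumed for illustration):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Mirrors kScalarHeuristicFullyUnrolledMaxInstrThreshold: the
    // unrolled body must stay under this instruction budget.
    static constexpr uint32_t kMaxInstrThreshold = 35;

    // Sketch of IsFullUnrollingBeneficial(): full unrolling creates
    // roughly trip_count * instr_num instructions, so take it only
    // when that product stays under the threshold.
    static bool IsFullUnrollingBeneficial(int64_t trip_count, size_t instr_num) {
      // Callers guarantee a known trip count (the DCHECK in the patch).
      return trip_count * static_cast<int64_t>(instr_num) <
             static_cast<int64_t>(kMaxInstrThreshold);
    }

    int main() {
      // 4 iterations x 8 instructions = 32 < 35: unroll fully.
      std::printf("%d\n", IsFullUnrollingBeneficial(4, 8));  // prints 1
      // 5 iterations x 8 instructions = 40 >= 35: keep the loop.
      std::printf("%d\n", IsFullUnrollingBeneficial(5, 8));  // prints 0
      return 0;
    }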