ART: Implement loop full unrolling.

Performs whole loop unrolling for small loops with small
trip count to eliminate the loop check overhead, to have
more opportunities for inter-iteration optimizations.

caffeinemark/FloatAtom: 1.2x performance on arm64 Cortex-A57.

Test: 530-checker-peel-unroll.
Test: test-art-host, test-art-target.
Change-Id: Idf3fe3cb611376935d176c60db8c49907222e28a
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index efb23e7..d355ced 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -84,6 +84,8 @@
   static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17;
   // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
   static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6;
+  // Maximum number of instructions to be created as a result of full unrolling.
+  static constexpr uint32_t kScalarHeuristicFullyUnrolledMaxInstrThreshold = 35;
 
   bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* analysis_info) const OVERRIDE {
     return analysis_info->HasLongTypeInstructions() ||
@@ -108,6 +110,14 @@
 
   bool IsLoopPeelingEnabled() const OVERRIDE { return true; }
 
+  bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info) const OVERRIDE {
+    int64_t trip_count = analysis_info->GetTripCount();
+    // We assume that trip count is known.
+    DCHECK_NE(trip_count, LoopAnalysisInfo::kUnknownTripCount);
+    size_t instr_num = analysis_info->GetNumberOfInstructions();
+    return (trip_count * instr_num < kScalarHeuristicFullyUnrolledMaxInstrThreshold);
+  }
+
  protected:
   bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info,
                     size_t instr_threshold,
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index bcb7b70..57509ee 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -160,6 +160,13 @@
   // Returns 'false' by default, should be overridden by particular target loop helper.
   virtual bool IsLoopPeelingEnabled() const { return false; }
 
+  // Returns whether it is beneficial to fully unroll the loop.
+  //
+  // Returns 'false' by default, should be overridden by particular target loop helper.
+  virtual bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info ATTRIBUTE_UNUSED) const {
+    return false;
+  }
+
   // Returns optimal SIMD unrolling factor for the loop.
   //
   // Returns kNoUnrollingFactor by default, should be overridden by particular target loop helper.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 440cd33..7d66155 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -422,6 +422,15 @@
   }
 }
 
+// Peel the first 'count' iterations of the loop.
+static void PeelByCount(HLoopInformation* loop_info, int count) {
+  for (int i = 0; i < count; i++) {
+    // Perform peeling.
+    PeelUnrollSimpleHelper helper(loop_info);
+    helper.DoPeeling();
+  }
+}
+
 //
 // Public methods.
 //
@@ -811,6 +820,45 @@
   return true;
 }
 
+bool HLoopOptimization::TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code) {
+  // Fully unroll loops with a known and small trip count.
+  int64_t trip_count = analysis_info->GetTripCount();
+  if (!arch_loop_helper_->IsLoopPeelingEnabled() ||
+      trip_count == LoopAnalysisInfo::kUnknownTripCount ||
+      !arch_loop_helper_->IsFullUnrollingBeneficial(analysis_info)) {
+    return false;
+  }
+
+  if (generate_code) {
+    // Peeling of the N first iterations (where N equals to the trip count) will effectively
+    // eliminate the loop: after peeling we will have N sequential iterations copied into the loop
+    // preheader and the original loop. The trip count of this loop will be 0 as the sequential
+    // iterations are executed first and there are exactly N of them. Thus we can statically
+    // evaluate the loop exit condition to 'false' and fully eliminate it.
+    //
+    // Here is an example of full unrolling of a loop with a trip count 2:
+    //
+    //                                           loop_cond_1
+    //                                           loop_body_1        <- First iteration.
+    //                                               |
+    //                             \                 v
+    //                            ==\            loop_cond_2
+    //                            ==/            loop_body_2        <- Second iteration.
+    //                             /                 |
+    //               <-                              v     <-
+    //     loop_cond   \                         loop_cond   \      <- This cond is always false.
+    //     loop_body  _/                         loop_body  _/
+    //
+    HLoopInformation* loop_info = analysis_info->GetLoopInfo();
+    PeelByCount(loop_info, trip_count);
+    HIf* loop_hif = loop_info->GetHeader()->GetLastInstruction()->AsIf();
+    int32_t constant = loop_info->Contains(*loop_hif->IfTrueSuccessor()) ? 0 : 1;
+    loop_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+  }
+
+  return true;
+}
+
 bool HLoopOptimization::TryPeelingAndUnrolling(LoopNode* node) {
   // Don't run peeling/unrolling if compiler_options_ is nullptr (i.e., running under tests)
   // as InstructionSet is needed.
@@ -828,7 +876,8 @@
     return false;
   }
 
-  if (!TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) &&
+  if (!TryFullUnrolling(&analysis_info, /*generate_code*/ false) &&
+      !TryPeelingForLoopInvariantExitsElimination(&analysis_info, /*generate_code*/ false) &&
       !TryUnrollingForBranchPenaltyReduction(&analysis_info, /*generate_code*/ false)) {
     return false;
   }
@@ -838,7 +887,8 @@
     return false;
   }
 
-  return TryPeelingForLoopInvariantExitsElimination(&analysis_info) ||
+  return TryFullUnrolling(&analysis_info) ||
+         TryPeelingForLoopInvariantExitsElimination(&analysis_info) ||
          TryUnrollingForBranchPenaltyReduction(&analysis_info);
 }
 
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index bc47924..644b740 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -155,6 +155,12 @@
   bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
                                                   bool generate_code = true);
 
+  // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
+  // the loop check overhead and to have more opportunities for inter-iteration optimizations.
+  // Returns whether transformation happened. 'generate_code' determines whether the optimization
+  // should be actually applied.
+  bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);
+
   // Tries to apply scalar loop peeling and unrolling.
   bool TryPeelingAndUnrolling(LoopNode* node);