ART: Implement scalar loop unrolling.

Implement scalar loop unrolling (on arm64) for small loops
with a known trip count, in order to reduce the loop-check
and branch penalty and to provide more opportunities for
instruction scheduling.

Note: this functionality is currently turned off by default.

Test: cloner_test.cc
Test: test-art-target, test-art-host

Change-Id: Ic27fd8fb0bc0d7b69251252da37b8b510bc30acc
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 9414e5a..0120cff 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -20,12 +20,15 @@
 #include "base/scoped_arena_allocator.h"
 #include "base/scoped_arena_containers.h"
 #include "induction_var_range.h"
+#include "loop_analysis.h"
 #include "nodes.h"
 #include "optimization.h"
+#include "superblock_cloner.h"
 
 namespace art {
 
 class CompilerDriver;
+class ArchDefaultLoopHelper;
 
 /**
  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
@@ -135,10 +138,26 @@
   void SimplifyInduction(LoopNode* node);
   void SimplifyBlocks(LoopNode* node);
 
-  // Performs optimizations specific to inner loop (empty loop removal,
+  // Performs optimizations specific to inner loop with finite header logic (empty loop removal,
   // unrolling, vectorization). Returns true if anything changed.
+  bool TryOptimizeInnerLoopFinite(LoopNode* node);
+
+  // Performs optimizations specific to inner loop. Returns true if anything changed.
   bool OptimizeInnerLoop(LoopNode* node);
 
+  // Performs loop peeling or unrolling once (unrolling if 'do_unrolling' is true, peeling
+  // otherwise); the transformation preserves the header and the loop info.
+  //
+  // Note: the function records copying information about blocks and instructions.
+  void PeelOrUnrollOnce(LoopNode* loop_node,
+                        bool do_unrolling,
+                        SuperblockCloner::HBasicBlockMap* bb_map,
+                        SuperblockCloner::HInstructionMap* hir_map);
+
+  // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
+  // opportunities. Returns true if the transformation was applied.
+  bool TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node);
+
   //
   // Vectorization analysis and synthesis.
   //
@@ -203,7 +222,6 @@
                             const ArrayReference* peeling_candidate);
   uint32_t MaxNumberPeeled();
   bool IsVectorizationProfitable(int64_t trip_count);
-  uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count);
 
   //
   // Helpers.
@@ -297,6 +315,9 @@
   HBasicBlock* vector_body_;  // body of the new loop
   HInstruction* vector_index_;  // normalized index of the new loop
 
+  // Helper for target-specific behaviour for loop optimizations.
+  ArchDefaultLoopHelper* arch_loop_helper_;
+
   friend class LoopOptimizationTest;
 
   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);