ART: Implement scalar loop unrolling.

Implement scalar loop unrolling for small loops with a
known trip count (on arm64) to reduce the loop-check and
branch penalty and to provide more opportunities for
instruction scheduling.

Note: this functionality is turned off by default for now.
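
For example (a conceptual sketch, not code from this change;
the names n, a, sum are illustrative and the trip count n is
assumed known and even), unrolling by a factor of 2 halves
the number of loop-check branches:

  // before: the loop check and branch execute n times
  for (int i = 0; i < n; i++) {
    sum += a[i];
  }

  // after: two body copies share one check per iteration
  for (int i = 0; i < n; i += 2) {
    sum += a[i];
    sum += a[i + 1];
  }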

Test: cloner_test.cc
Test: test-art-target, test-art-host

Change-Id: Ic27fd8fb0bc0d7b69251252da37b8b510bc30acc
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index e1fb7ac..6908034 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -33,8 +33,8 @@
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
-// No loop unrolling factor (just one copy of the loop-body).
-static constexpr uint32_t kNoUnrollingFactor = 1;
+// Enables scalar loop unrolling in the loop optimizer.
+static constexpr bool kEnableScalarUnrolling = false;
 
 //
 // Static helpers.
@@ -480,7 +480,11 @@
       vector_preheader_(nullptr),
       vector_header_(nullptr),
       vector_body_(nullptr),
-      vector_index_(nullptr) {
+      vector_index_(nullptr),
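+      // kNone is used when there is no compiler driver (e.g. in gtests).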
+      arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr
+                                                          ? compiler_driver_->GetInstructionSet()
+                                                          : InstructionSet::kNone,
+                                                      global_allocator_)) {
 }
 
 void HLoopOptimization::Run() {
@@ -691,7 +695,7 @@
   }
 }
 
-bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
+bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
   // Ensure loop header logic is finite.
@@ -761,6 +765,83 @@
   return false;
 }
 
+bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
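+  // Try the finite-loop optimizations first; fall back to scalar unrolling if they do not apply.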
+  return TryOptimizeInnerLoopFinite(node) ||
+         TryUnrollingForBranchPenaltyReduction(node);
+}
+
+void HLoopOptimization::PeelOrUnrollOnce(LoopNode* loop_node,
+                                         bool do_unrolling,
+                                         SuperblockCloner::HBasicBlockMap* bb_map,
+                                         SuperblockCloner::HInstructionMap* hir_map) {
+  // TODO: peel loop nests.
+  DCHECK(loop_node->inner == nullptr);
+
+  // Check that loop info is up-to-date.
+  HLoopInformation* loop_info = loop_node->loop_info;
+  HBasicBlock* header = loop_info->GetHeader();
+  DCHECK(loop_info == header->GetLoopInformation());
+
+  PeelUnrollHelper helper(loop_info, bb_map, hir_map);
+  DCHECK(helper.IsLoopClonable());
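+  // Peeling places the single copy before the loop; unrolling chains it into the loop body.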
+  HBasicBlock* new_header = do_unrolling ? helper.DoUnrolling() : helper.DoPeeling();
+  DCHECK(header == new_header);
+  DCHECK(loop_info == new_header->GetLoopInformation());
+}
+
+//
+// Loop unrolling: generic part methods.
+//
+
+bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node) {
+  // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., when running under tests),
+  // as the InstructionSet is needed.
+  if (!kEnableScalarUnrolling || compiler_driver_ == nullptr) {
+    return false;
+  }
+
+  HLoopInformation* loop_info = loop_node->loop_info;
+  int64_t trip_count = 0;
+  // Only unroll loops with a known trip count.
+  if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) {
+    return false;
+  }
+
+  uint32_t unrolling_factor = arch_loop_helper_->GetScalarUnrollingFactor(loop_info, trip_count);
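+  // kNoUnrollingFactor is 1, i.e. just a single copy of the loop body.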
+  if (unrolling_factor == kNoUnrollingFactor) {
+    return false;
+  }
+
+  LoopAnalysisInfo loop_analysis_info(loop_info);
+  LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
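+  // The analysis gathers the exit count and instruction info used by the checks below.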
+
+  // Check "IsLoopClonable" last as it can be time-consuming.
+  if (arch_loop_helper_->IsLoopTooBigForScalarUnrolling(&loop_analysis_info) ||
+      (loop_analysis_info.GetNumberOfExits() > 1) ||
+      loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
+      !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+    return false;
+  }
+
+  // TODO: support other unrolling factors.
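+  // A factor of 2 clones the loop body once; the two copies then share a single back-edge check.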
+  DCHECK_EQ(unrolling_factor, 2u);
+
+  // Perform unrolling.
+  ArenaAllocator* arena = loop_info->GetHeader()->GetGraph()->GetAllocator();
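+  // The cloner fills these with the mapping from original to copied blocks and instructions.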
+  SuperblockCloner::HBasicBlockMap bb_map(
+      std::less<HBasicBlock*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+  SuperblockCloner::HInstructionMap hir_map(
+      std::less<HInstruction*>(), arena->Adapter(kArenaAllocSuperblockCloner));
+  PeelOrUnrollOnce(loop_node, /* do_unrolling */ true, &bb_map, &hir_map);
+
+  // Remove the redundant loop check after unrolling.
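+  // Give the copied header's HIf a constant condition so later passes can fold the branch away.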
+  HIf* copy_hif = bb_map.Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
+  int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0;
+  copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
+
+  return true;
+}
+
 //
 // Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
 // "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
@@ -891,7 +972,8 @@
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
   // Pick a loop unrolling factor for the vector loop.
-  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t unroll = arch_loop_helper_->GetSIMDUnrollingFactor(
+      block, trip_count, MaxNumberPeeled(), vector_length_);
   uint32_t chunk = vector_length_ * unroll;
 
   DCHECK(trip_count == 0 || (trip_count >= MaxNumberPeeled() + chunk));
@@ -2174,41 +2256,6 @@
   return true;
 }
 
-static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
-static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
-
-uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  uint32_t max_peel = MaxNumberPeeled();
-  switch (compiler_driver_->GetInstructionSet()) {
-    case InstructionSet::kArm64: {
-      // Don't unroll with insufficient iterations.
-      // TODO: Unroll loops with unknown trip count.
-      DCHECK_NE(vector_length_, 0u);
-      if (trip_count < (2 * vector_length_ + max_peel)) {
-        return kNoUnrollingFactor;
-      }
-      // Don't unroll for large loop body size.
-      uint32_t instruction_count = block->GetInstructions().CountSize();
-      if (instruction_count >= ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE) {
-        return kNoUnrollingFactor;
-      }
-      // Find a beneficial unroll factor with the following restrictions:
-      //  - At least one iteration of the transformed loop should be executed.
-      //  - The loop body shouldn't be "too big" (heuristic).
-      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
-      uint32_t uf2 = (trip_count - max_peel) / vector_length_;
-      uint32_t unroll_factor =
-          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
-      DCHECK_GE(unroll_factor, 1u);
-      return unroll_factor;
-    }
-    case InstructionSet::kX86:
-    case InstructionSet::kX86_64:
-    default:
-      return kNoUnrollingFactor;
-  }
-}
-
 //
 // Helpers.
 //