ART: Enable scalar loop peeling and unrolling.

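Turn on scalar loop peeling and unrolling by default by removing the
kEnableScalarPeelingUnrolling switch from the loop optimizer.

The former base class ArchDefaultLoopHelper is renamed to
ArchNoOptsLoopHelper. A new ArchDefaultLoopHelper enables scalar peeling
and unrolling with conservative heuristics: it rejects loops that reach
17 instructions or 6 basic blocks, and loops that contain instructions
of long type. Arm64LoopHelper keeps its own size thresholds (40
instructions, 8 basic blocks) and its SIMD unrolling support.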

Test: 482-checker-loop-back-edge-use, 530-checker-peel-unroll
Test: test-art-host, test-art-target, boot-to-gui
Change-Id: Ibfe1b54f790a97b281e85396da2985e0f22c2834
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index a0760ef..a212445 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -35,6 +35,9 @@
 
     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
       HInstruction* instruction = it.Current();
+      if (it.Current()->GetType() == DataType::Type::kInt64) {
+        analysis_results->has_long_type_instructions_ = true;
+      }
       if (MakesScalarPeelingUnrollingNonBeneficial(instruction)) {
         analysis_results->has_instructions_preventing_scalar_peeling_ = true;
         analysis_results->has_instructions_preventing_scalar_unrolling_ = true;
@@ -61,34 +64,29 @@
   return false;
 }
 
-class Arm64LoopHelper : public ArchDefaultLoopHelper {
+// Default implementation of loop helper; used for all targets unless a custom implementation
+// is provided. Enables scalar loop peeling and unrolling with the most conservative heuristics.
+class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper {
  public:
   // Scalar loop unrolling parameters and heuristics.
   //
   // Maximum possible unrolling factor.
-  static constexpr uint32_t kArm64ScalarMaxUnrollFactor = 2;
+  static constexpr uint32_t kScalarMaxUnrollFactor = 2;
   // Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled.
-  static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeInstr = 40;
+  static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17;
   // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
-  static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeBlocks = 8;
+  static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6;
 
-  // SIMD loop unrolling parameters and heuristics.
-  //
-  // Maximum possible unrolling factor.
-  static constexpr uint32_t kArm64SimdMaxUnrollFactor = 8;
-  // Loop's maximum instruction count. Loops with higher count will not be unrolled.
-  static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50;
-
-  bool IsLoopTooBigForScalarPeelingUnrolling(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
-    size_t instr_num = loop_analysis_info->GetNumberOfInstructions();
-    size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks();
-    return (instr_num >= kArm64ScalarHeuristicMaxBodySizeInstr ||
-            bb_num >= kArm64ScalarHeuristicMaxBodySizeBlocks);
+  bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
+    return loop_analysis_info->HasLongTypeInstructions() ||
+           IsLoopTooBig(loop_analysis_info,
+                        kScalarHeuristicMaxBodySizeInstr,
+                        kScalarHeuristicMaxBodySizeBlocks);
   }
 
   uint32_t GetScalarUnrollingFactor(HLoopInformation* loop_info ATTRIBUTE_UNUSED,
                                     uint64_t trip_count) const OVERRIDE {
-    uint32_t desired_unrolling_factor = kArm64ScalarMaxUnrollFactor;
+    uint32_t desired_unrolling_factor = kScalarMaxUnrollFactor;
     if (trip_count < desired_unrolling_factor || trip_count % desired_unrolling_factor != 0) {
       return kNoUnrollingFactor;
     }
@@ -98,6 +96,38 @@
 
   bool IsLoopPeelingEnabled() const OVERRIDE { return true; }
 
+ protected:
+  bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info,
+                    size_t instr_threshold,
+                    size_t bb_threshold) const {
+    size_t instr_num = loop_analysis_info->GetNumberOfInstructions();
+    size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks();
+    return (instr_num >= instr_threshold || bb_num >= bb_threshold);
+  }
+};
+
+// Custom implementation of loop helper for arm64 target. Enables heuristics for scalar loop
+// peeling and unrolling and supports SIMD loop unrolling.
+class Arm64LoopHelper : public ArchDefaultLoopHelper {
+ public:
+  // SIMD loop unrolling parameters and heuristics.
+  //
+  // Maximum possible unrolling factor.
+  static constexpr uint32_t kArm64SimdMaxUnrollFactor = 8;
+  // Loop's maximum instruction count. Loops with higher count will not be unrolled.
+  static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50;
+
+  // Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled.
+  static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeInstr = 40;
+  // Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
+  static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeBlocks = 8;
+
+  bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
+    return IsLoopTooBig(loop_analysis_info,
+                        kArm64ScalarHeuristicMaxBodySizeInstr,
+                        kArm64ScalarHeuristicMaxBodySizeBlocks);
+  }
+
   uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
                                   int64_t trip_count,
                                   uint32_t max_peel,
@@ -126,8 +156,8 @@
   }
 };
 
-ArchDefaultLoopHelper* ArchDefaultLoopHelper::Create(InstructionSet isa,
-                                                     ArenaAllocator* allocator) {
+ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(InstructionSet isa,
+                                                   ArenaAllocator* allocator) {
   switch (isa) {
     case InstructionSet::kArm64: {
       return new (allocator) Arm64LoopHelper;
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index ece9858..c09d3ff 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -35,6 +35,7 @@
         exits_num_(0),
         has_instructions_preventing_scalar_peeling_(false),
         has_instructions_preventing_scalar_unrolling_(false),
+        has_long_type_instructions_(false),
         loop_info_(loop_info) {}
 
   size_t GetNumberOfBasicBlocks() const { return bb_num_; }
@@ -49,6 +50,10 @@
     return has_instructions_preventing_scalar_unrolling_;
   }
 
+  bool HasLongTypeInstructions() const {
+    return has_long_type_instructions_;
+  }
+
   const HLoopInformation* GetLoopInfo() const { return loop_info_; }
 
  private:
@@ -62,6 +67,9 @@
   bool has_instructions_preventing_scalar_peeling_;
   // Whether the loop has instructions which make scalar loop unrolling non-beneficial.
   bool has_instructions_preventing_scalar_unrolling_;
+  // Whether the loop has instructions of primitive long type; unrolling these loops will
+  // likely introduce spills/fills on 32-bit targets.
+  bool has_long_type_instructions_;
 
   // Corresponding HLoopInformation.
   const HLoopInformation* loop_info_;
@@ -117,22 +125,21 @@
 // To support peeling/unrolling for a new architecture one needs to create new helper class,
 // inherit it from this and add implementation for the following methods.
 //
-class ArchDefaultLoopHelper : public ArenaObject<kArenaAllocOptimization> {
+class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> {
  public:
-  virtual ~ArchDefaultLoopHelper() {}
+  virtual ~ArchNoOptsLoopHelper() {}
 
   // Creates an instance of specialised helper for the target or default helper if the target
   // doesn't support loop peeling and unrolling.
-  static ArchDefaultLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
+  static ArchNoOptsLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
 
-  // Returns whether the loop is too big for loop peeling/unrolling by checking its total number of
-  // basic blocks and instructions.
+  // Returns whether the loop is not beneficial for loop peeling/unrolling.
   //
-  // If the loop body has too many instructions then peeling/unrolling optimization will not bring
-  // any noticeable performance improvement however will increase the code size.
+  // For example, if the loop body has too many instructions then peeling/unrolling optimization
+  // will not bring any noticeable performance improvement but will increase the code size.
   //
   // Returns 'true' by default, should be overridden by particular target loop helper.
-  virtual bool IsLoopTooBigForScalarPeelingUnrolling(
+  virtual bool IsLoopNonBeneficialForScalarOpts(
       LoopAnalysisInfo* loop_analysis_info ATTRIBUTE_UNUSED) const { return true; }
 
   // Returns optimal scalar unrolling factor for the loop.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 1ce3524..eda6bd1 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -33,9 +33,6 @@
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
-// Enables scalar loop unrolling in the loop optimizer.
-static constexpr bool kEnableScalarPeelingUnrolling = false;
-
 //
 // Static helpers.
 //
@@ -457,7 +454,7 @@
       vector_header_(nullptr),
       vector_body_(nullptr),
       vector_index_(nullptr),
-      arch_loop_helper_(ArchDefaultLoopHelper::Create(compiler_driver_ != nullptr
+      arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_driver_ != nullptr
                                                           ? compiler_driver_->GetInstructionSet()
                                                           : InstructionSet::kNone,
                                                       global_allocator_)) {
@@ -761,7 +758,7 @@
 bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) {
   // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
   // as InstructionSet is needed.
-  if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+  if (compiler_driver_ == nullptr) {
     return false;
   }
 
@@ -781,9 +778,9 @@
   LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
 
   // Check "IsLoopClonable" last as it can be time-consuming.
-  if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+  if (loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
+      arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) ||
       (loop_analysis_info.GetNumberOfExits() > 1) ||
-      loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
       !PeelUnrollHelper::IsLoopClonable(loop_info)) {
     return false;
   }
@@ -807,7 +804,7 @@
 bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) {
   // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
   // as InstructionSet is needed.
-  if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+  if (compiler_driver_ == nullptr) {
     return false;
   }
 
@@ -821,8 +818,8 @@
   LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
 
   // Check "IsLoopClonable" last as it can be time-consuming.
-  if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
-      loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+  if (loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+      arch_loop_helper_->IsLoopNonBeneficialForScalarOpts(&loop_analysis_info) ||
       !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) ||
       !PeelUnrollHelper::IsLoopClonable(loop_info)) {
     return false;
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 7807da1..191a93d 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -28,7 +28,7 @@
 namespace art {
 
 class CompilerDriver;
-class ArchDefaultLoopHelper;
+class ArchNoOptsLoopHelper;
 
 /**
  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
@@ -308,7 +308,7 @@
   HInstruction* vector_index_;  // normalized index of the new loop
 
   // Helper for target-specific behaviour for loop optimizations.
-  ArchDefaultLoopHelper* arch_loop_helper_;
+  ArchNoOptsLoopHelper* arch_loop_helper_;
 
   friend class LoopOptimizationTest;