ART: Implement scalar loop peeling.

Implement scalar loop peeling for invariant exits elimination
(on arm64). If the loop exit condition is loop invariant then
loop peeling + GVN + DCE can eliminate this exit in the loop
body. Note: GVN and DCE aren't applied during loop optimizations.

Note: this functionality is turned off by default now.

Test: test-art-host, test-art-target, boot-to-gui.

Change-Id: I98d20054a431838b452dc06bd25c075eb445960c
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index cd3bdaf..a0760ef 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -16,6 +16,8 @@
 
 #include "loop_analysis.h"
 
+#include "base/bit_vector-inl.h"
+
 namespace art {
 
 void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info,
@@ -33,7 +35,8 @@
 
     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
       HInstruction* instruction = it.Current();
-      if (MakesScalarUnrollingNonBeneficial(instruction)) {
+      if (MakesScalarPeelingUnrollingNonBeneficial(instruction)) {
+        analysis_results->has_instructions_preventing_scalar_peeling_ = true;
         analysis_results->has_instructions_preventing_scalar_unrolling_ = true;
       }
       analysis_results->instr_num_++;
@@ -42,6 +45,22 @@
   }
 }
 
+bool LoopAnalysis::HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info) {
+  HGraph* graph = loop_info->GetHeader()->GetGraph();
+  for (uint32_t block_id : loop_info->GetBlocks().Indexes()) {
+    HBasicBlock* block = graph->GetBlocks()[block_id];
+    DCHECK(block != nullptr);
+    if (block->EndsWithIf()) {
+      HIf* hif = block->GetLastInstruction()->AsIf();
+      HInstruction* input = hif->InputAt(0);
+      if (IsLoopExit(loop_info, hif) && !loop_info->Contains(*input->GetBlock())) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 class Arm64LoopHelper : public ArchDefaultLoopHelper {
  public:
   // Scalar loop unrolling parameters and heuristics.
@@ -60,7 +79,7 @@
   // Loop's maximum instruction count. Loops with higher count will not be unrolled.
   static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50;
 
-  bool IsLoopTooBigForScalarUnrolling(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
+  bool IsLoopTooBigForScalarPeelingUnrolling(LoopAnalysisInfo* loop_analysis_info) const OVERRIDE {
     size_t instr_num = loop_analysis_info->GetNumberOfInstructions();
     size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks();
     return (instr_num >= kArm64ScalarHeuristicMaxBodySizeInstr ||
@@ -77,6 +96,8 @@
     return desired_unrolling_factor;
   }
 
+  bool IsLoopPeelingEnabled() const OVERRIDE { return true; }
+
   uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
                                   int64_t trip_count,
                                   uint32_t max_peel,
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index bad406f..ece9858 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -33,6 +33,7 @@
       : bb_num_(0),
         instr_num_(0),
         exits_num_(0),
+        has_instructions_preventing_scalar_peeling_(false),
         has_instructions_preventing_scalar_unrolling_(false),
         loop_info_(loop_info) {}
 
@@ -40,6 +41,10 @@
   size_t GetNumberOfInstructions() const { return instr_num_; }
   size_t GetNumberOfExits() const { return exits_num_; }
 
+  bool HasInstructionsPreventingScalarPeeling() const {
+    return has_instructions_preventing_scalar_peeling_;
+  }
+
   bool HasInstructionsPreventingScalarUnrolling() const {
     return has_instructions_preventing_scalar_unrolling_;
   }
@@ -53,6 +58,8 @@
   size_t instr_num_;
   // Number of loop's exits.
   size_t exits_num_;
+  // Whether the loop has instructions which make scalar loop peeling non-beneficial.
+  bool has_instructions_preventing_scalar_peeling_;
   // Whether the loop has instructions which make scalar loop unrolling non-beneficial.
   bool has_instructions_preventing_scalar_unrolling_;
 
@@ -71,22 +78,35 @@
   static void CalculateLoopBasicProperties(HLoopInformation* loop_info,
                                            LoopAnalysisInfo* analysis_results);
 
+  // Returns whether the loop has at least one loop invariant exit.
+  static bool HasLoopAtLeastOneInvariantExit(HLoopInformation* loop_info);
+
+  // Returns whether HIf's true or false successor is outside the specified loop.
+  //
+  // Prerequisite: HIf must be in the specified loop.
+  static bool IsLoopExit(HLoopInformation* loop_info, const HIf* hif) {
+    DCHECK(loop_info->Contains(*hif->GetBlock()));
+    HBasicBlock* true_succ = hif->IfTrueSuccessor();
+    HBasicBlock* false_succ = hif->IfFalseSuccessor();
+    return (!loop_info->Contains(*true_succ) || !loop_info->Contains(*false_succ));
+  }
+
  private:
-  // Returns whether an instruction makes scalar loop unrolling non-beneficial.
+  // Returns whether an instruction makes scalar loop peeling/unrolling non-beneficial.
   //
   // If in the loop body we have a dex/runtime call then its contribution to the whole
-  // loop performance will probably prevail. So unrolling optimization will not bring
-  // any noticeable performance improvement however will increase the code size.
-  static bool MakesScalarUnrollingNonBeneficial(HInstruction* instruction) {
+  // loop performance will probably prevail. So peeling/unrolling optimization will not bring
+  // any noticeable performance improvement. It will increase the code size.
+  static bool MakesScalarPeelingUnrollingNonBeneficial(HInstruction* instruction) {
     return (instruction->IsNewArray() ||
         instruction->IsNewInstance() ||
         instruction->IsUnresolvedInstanceFieldGet() ||
         instruction->IsUnresolvedInstanceFieldSet() ||
         instruction->IsUnresolvedStaticFieldGet() ||
         instruction->IsUnresolvedStaticFieldSet() ||
-        // TODO: Unroll loops with intrinsified invokes.
+        // TODO: Support loops with intrinsified invokes.
         instruction->IsInvoke() ||
-        // TODO: Unroll loops with ClinitChecks.
+        // TODO: Support loops with ClinitChecks.
         instruction->IsClinitCheck());
   }
 };
@@ -105,14 +125,14 @@
   // doesn't support loop peeling and unrolling.
   static ArchDefaultLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
 
-  // Returns whether the loop is too big for loop unrolling by checking its total number of
+  // Returns whether the loop is too big for loop peeling/unrolling by checking its total number of
   // basic blocks and instructions.
   //
-  // If the loop body has too many instructions then unrolling optimization will not bring
+  // If the loop body has too many instructions then peeling/unrolling optimization will not bring
   // any noticeable performance improvement however will increase the code size.
   //
   // Returns 'true' by default, should be overridden by particular target loop helper.
-  virtual bool IsLoopTooBigForScalarUnrolling(
+  virtual bool IsLoopTooBigForScalarPeelingUnrolling(
       LoopAnalysisInfo* loop_analysis_info ATTRIBUTE_UNUSED) const { return true; }
 
   // Returns optimal scalar unrolling factor for the loop.
@@ -123,6 +143,11 @@
     return kNoUnrollingFactor;
   }
 
+  // Returns whether scalar loop peeling is enabled.
+  //
+  // Returns 'false' by default, should be overridden by particular target loop helper.
+  virtual bool IsLoopPeelingEnabled() const { return false; }
+
   // Returns optimal SIMD unrolling factor for the loop.
   //
   // Returns kNoUnrollingFactor by default, should be overridden by particular target loop helper.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index b41b659..c0c721d 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -34,7 +34,7 @@
 static constexpr bool kEnableVectorization = true;
 
 // Enables scalar loop unrolling in the loop optimizer.
-static constexpr bool kEnableScalarUnrolling = false;
+static constexpr bool kEnableScalarPeelingUnrolling = false;
 
 //
 // Static helpers.
@@ -533,6 +533,43 @@
   return true;
 }
 
+// Tries to statically evaluate condition of the specified "HIf" for other condition checks.
+static void TryToEvaluateIfCondition(HIf* instruction, HGraph* graph) {
+  HInstruction* cond = instruction->InputAt(0);
+
+  // If a condition 'cond' is evaluated in an HIf instruction then in the successors of the
+  // IF_BLOCK we statically know the value of the condition 'cond' (TRUE in TRUE_SUCC, FALSE in
+  // FALSE_SUCC). Using that we can replace another evaluation (use) EVAL of the same 'cond'
+  // with TRUE value (FALSE value) if every path from the ENTRY_BLOCK to EVAL_BLOCK contains the
+  // edge IF_BLOCK->TRUE_SUCC (IF_BLOCK->FALSE_SUCC).
+  //     if (cond) {               if(cond) {
+  //       if (cond) {}              if (1) {}
+  //     } else {        =======>  } else {
+  //       if (cond) {}              if (0) {}
+  //     }                         }
+  if (!cond->IsConstant()) {
+    HBasicBlock* true_succ = instruction->IfTrueSuccessor();
+    HBasicBlock* false_succ = instruction->IfFalseSuccessor();
+
+    DCHECK_EQ(true_succ->GetPredecessors().size(), 1u);
+    DCHECK_EQ(false_succ->GetPredecessors().size(), 1u);
+
+    const HUseList<HInstruction*>& uses = cond->GetUses();
+    for (auto it = uses.begin(), end = uses.end(); it != end; /* ++it below */) {
+      HInstruction* user = it->GetUser();
+      size_t index = it->GetIndex();
+      HBasicBlock* user_block = user->GetBlock();
+      // Increment `it` now because `*it` may disappear thanks to user->ReplaceInput().
+      ++it;
+      if (true_succ->Dominates(user_block)) {
+        user->ReplaceInput(graph->GetIntConstant(1), index);
+      } else if (false_succ->Dominates(user_block)) {
+        user->ReplaceInput(graph->GetIntConstant(0), index);
+      }
+    }
+  }
+}
+
 //
 // Public methods.
 //
@@ -851,40 +888,24 @@
 
 bool HLoopOptimization::OptimizeInnerLoop(LoopNode* node) {
   return TryOptimizeInnerLoopFinite(node) ||
+         TryPeelingForLoopInvariantExitsElimination(node) ||
          TryUnrollingForBranchPenaltyReduction(node);
 }
 
-void HLoopOptimization::PeelOrUnrollOnce(LoopNode* loop_node,
-                                         bool do_unrolling,
-                                         SuperblockCloner::HBasicBlockMap* bb_map,
-                                         SuperblockCloner::HInstructionMap* hir_map) {
-  // TODO: peel loop nests.
-  DCHECK(loop_node->inner == nullptr);
 
-  // Check that loop info is up-to-date.
-  HLoopInformation* loop_info = loop_node->loop_info;
-  HBasicBlock* header = loop_info->GetHeader();
-  DCHECK(loop_info == header->GetLoopInformation());
-
-  PeelUnrollHelper helper(loop_info, bb_map, hir_map);
-  DCHECK(helper.IsLoopClonable());
-  HBasicBlock* new_header = do_unrolling ? helper.DoUnrolling() : helper.DoPeeling();
-  DCHECK(header == new_header);
-  DCHECK(loop_info == new_header->GetLoopInformation());
-}
 
 //
 // Loop unrolling: generic part methods.
 //
 
-bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node) {
+bool HLoopOptimization::TryUnrollingForBranchPenaltyReduction(LoopNode* node) {
   // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
   // as InstructionSet is needed.
-  if (!kEnableScalarUnrolling || compiler_driver_ == nullptr) {
+  if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
     return false;
   }
 
-  HLoopInformation* loop_info = loop_node->loop_info;
+  HLoopInformation* loop_info = node->loop_info;
   int64_t trip_count = 0;
   // Only unroll loops with a known tripcount.
   if (!induction_range_.HasKnownTripCount(loop_info, &trip_count)) {
@@ -900,7 +921,7 @@
   LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
 
   // Check "IsLoopClonable" last as it can be time-consuming.
-  if (arch_loop_helper_->IsLoopTooBigForScalarUnrolling(&loop_analysis_info) ||
+  if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
       (loop_analysis_info.GetNumberOfExits() > 1) ||
       loop_analysis_info.HasInstructionsPreventingScalarUnrolling() ||
       !PeelUnrollHelper::IsLoopClonable(loop_info)) {
@@ -911,21 +932,57 @@
   DCHECK_EQ(unrolling_factor, 2u);
 
   // Perform unrolling.
-  ArenaAllocator* arena = loop_info->GetHeader()->GetGraph()->GetAllocator();
-  SuperblockCloner::HBasicBlockMap bb_map(
-      std::less<HBasicBlock*>(), arena->Adapter(kArenaAllocSuperblockCloner));
-  SuperblockCloner::HInstructionMap hir_map(
-      std::less<HInstruction*>(), arena->Adapter(kArenaAllocSuperblockCloner));
-  PeelOrUnrollOnce(loop_node, /* unrolling */ true, &bb_map, &hir_map);
+  PeelUnrollSimpleHelper helper(loop_info);
+  helper.DoUnrolling();
 
   // Remove the redundant loop check after unrolling.
-  HIf* copy_hif = bb_map.Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
+  HIf* copy_hif =
+      helper.GetBasicBlockMap()->Get(loop_info->GetHeader())->GetLastInstruction()->AsIf();
   int32_t constant = loop_info->Contains(*copy_hif->IfTrueSuccessor()) ? 1 : 0;
   copy_hif->ReplaceInput(graph_->GetIntConstant(constant), 0u);
 
   return true;
 }
 
+bool HLoopOptimization::TryPeelingForLoopInvariantExitsElimination(LoopNode* node) {
+  // Don't run peeling/unrolling if compiler_driver_ is nullptr (i.e., running under tests)
+  // as InstructionSet is needed.
+  if (!kEnableScalarPeelingUnrolling || compiler_driver_ == nullptr) {
+    return false;
+  }
+
+  HLoopInformation* loop_info = node->loop_info;
+  // Bail out early if the target does not support loop peeling.
+  if (!arch_loop_helper_->IsLoopPeelingEnabled()) {
+    return false;
+  }
+
+  LoopAnalysisInfo loop_analysis_info(loop_info);
+  LoopAnalysis::CalculateLoopBasicProperties(loop_info, &loop_analysis_info);
+
+  // Check "IsLoopClonable" last as it can be time-consuming.
+  if (arch_loop_helper_->IsLoopTooBigForScalarPeelingUnrolling(&loop_analysis_info) ||
+      loop_analysis_info.HasInstructionsPreventingScalarPeeling() ||
+      !LoopAnalysis::HasLoopAtLeastOneInvariantExit(loop_info) ||
+      !PeelUnrollHelper::IsLoopClonable(loop_info)) {
+    return false;
+  }
+
+  // Perform peeling.
+  PeelUnrollSimpleHelper helper(loop_info);
+  helper.DoPeeling();
+
+  const SuperblockCloner::HInstructionMap* hir_map = helper.GetInstructionMap();
+  for (const auto& entry : *hir_map) {
+    HInstruction* copy = entry.second;
+    if (copy->IsIf()) {
+      TryToEvaluateIfCondition(copy->AsIf(), graph_);
+    }
+  }
+
+  return true;
+}
+
 //
 // Loop vectorization. The implementation is based on the book by Aart J.C. Bik:
 // "The Software Vectorization Handbook. Applying Multimedia Extensions for Maximum Performance."
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 0120cff..f9a31a3 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -145,19 +145,14 @@
   // Performs optimizations specific to inner loop. Returns true if anything changed.
   bool OptimizeInnerLoop(LoopNode* node);
 
-  // Performs loop peeling/unrolling once (depends on the 'do_unrolling'); the transformation
-  // preserves the header and the loop info.
-  //
-  // Note: the function records copying information about blocks and instructions.
-  void PeelOrUnrollOnce(LoopNode* loop_node,
-                        bool do_unrolling,
-                        SuperblockCloner::HBasicBlockMap* bb_map,
-                        SuperblockCloner::HInstructionMap* hir_map);
-
   // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
   // opportunities. Returns whether transformation happened.
   bool TryUnrollingForBranchPenaltyReduction(LoopNode* loop_node);
 
+  // Tries to apply loop peeling for loop invariant exits elimination. Returns whether
+  // transformation happened.
+  bool TryPeelingForLoopInvariantExitsElimination(LoopNode* loop_node);
+
   //
   // Vectorization analysis and synthesis.
   //
diff --git a/compiler/optimizing/superblock_cloner.cc b/compiler/optimizing/superblock_cloner.cc
index ee74f10..fad7729 100644
--- a/compiler/optimizing/superblock_cloner.cc
+++ b/compiler/optimizing/superblock_cloner.cc
@@ -28,11 +28,6 @@
 using HBasicBlockSet = SuperblockCloner::HBasicBlockSet;
 using HEdgeSet = SuperblockCloner::HEdgeSet;
 
-// When doing peeling we can choose whether to keep original loop (made of original basic blocks)
-// and form a peeled iteration of the copy blocks (preserve the header) or transfer original loop
-// blocks to the peeled iteration and create new loop from the copy blocks. Similar for unrolling.
-static const bool kPeelUnrollPreserveHeader = true;
-
 void HEdge::Dump(std::ostream& stream) const {
   stream << "(" << from_ << "->" << to_ << ")";
 }
@@ -926,16 +921,12 @@
       remap_orig_internal->Insert(e);
       remap_copy_internal->Insert(e);
     } else {
-      if (kPeelUnrollPreserveHeader) {
-        remap_copy_internal->Insert(e);
-      } else {
-        remap_orig_internal->Insert(e);
-      }
+      remap_copy_internal->Insert(e);
     }
   }
 
   // Set up remap_incoming edges set.
-  if (to_unroll != kPeelUnrollPreserveHeader) {
+  if (!to_unroll) {
     remap_incoming->Insert(HEdge(loop_info->GetPreHeader(), loop_header));
   }
 }
@@ -992,6 +983,9 @@
   DCHECK(!loop_info_->IsIrreducible());
 
   HBasicBlock* loop_header = loop_info_->GetHeader();
+  // Check that loop info is up-to-date.
+  DCHECK(loop_info_ == loop_header->GetLoopInformation());
+
   HGraph* graph = loop_header->GetGraph();
   ArenaAllocator allocator(graph->GetAllocator()->GetArenaPool());
 
@@ -1009,7 +1003,10 @@
   cloner_.Run();
   cloner_.CleanUp();
 
-  return kPeelUnrollPreserveHeader ? loop_header : cloner_.GetBlockCopy(loop_header);
+  // Check that loop info is preserved.
+  DCHECK(loop_info_ == loop_header->GetLoopInformation());
+
+  return loop_header;
 }
 
 PeelUnrollSimpleHelper::PeelUnrollSimpleHelper(HLoopInformation* info)
diff --git a/compiler/optimizing/superblock_cloner.h b/compiler/optimizing/superblock_cloner.h
index afd5a5d..e093167 100644
--- a/compiler/optimizing/superblock_cloner.h
+++ b/compiler/optimizing/superblock_cloner.h
@@ -372,6 +372,9 @@
   HBasicBlock* DoUnrolling() { return helper_.DoUnrolling(); }
   HLoopInformation* GetRegionToBeAdjusted() const { return helper_.GetRegionToBeAdjusted(); }
 
+  const SuperblockCloner::HBasicBlockMap* GetBasicBlockMap() const { return &bb_map_; }
+  const SuperblockCloner::HInstructionMap* GetInstructionMap() const { return &hir_map_; }
+
  private:
   SuperblockCloner::HBasicBlockMap bb_map_;
   SuperblockCloner::HInstructionMap hir_map_;