Complete unrolling of loops with small body and trip count one.

Rationale:
Avoids the unnecessary loop control overhead, suspend check,
and exposes more opportunities for constant folding in the
resulting loop body. Fully unrolls loop in execute() of
the Dhrystone benchmark (3% to 8% improvements).

Test: test-art-host

Change-Id: If30f38caea9e9f87a929df041dfb7ed1c227aba3
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index d5c4c2f..6d8ae75 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -368,10 +368,14 @@
   }
 }
 
-bool InductionVarRange::IsFinite(HLoopInformation* loop) const {
+bool InductionVarRange::IsFinite(HLoopInformation* loop, /*out*/ int64_t* tc) const {
   HInductionVarAnalysis::InductionInfo *trip =
       induction_analysis_->LookupInfo(loop, GetLoopControl(loop));
-  return trip != nullptr && !IsUnsafeTripCount(trip);
+  if (trip != nullptr && !IsUnsafeTripCount(trip)) {
+    IsConstant(trip->op_a, kExact, tc);
+    return true;
+  }
+  return false;
 }
 
 //
diff --git a/compiler/optimizing/induction_var_range.h b/compiler/optimizing/induction_var_range.h
index ba14847..6c424b7 100644
--- a/compiler/optimizing/induction_var_range.h
+++ b/compiler/optimizing/induction_var_range.h
@@ -150,9 +150,9 @@
   }
 
   /**
-   * Checks if header logic of a loop terminates.
+   * Checks if header logic of a loop terminates. Sets trip-count tc if known.
    */
-  bool IsFinite(HLoopInformation* loop) const;
+  bool IsFinite(HLoopInformation* loop, /*out*/ int64_t* tc) const;
 
  private:
   /*
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 9d73e29..9583838 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -161,26 +161,27 @@
 
 void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
   for ( ; node != nullptr; node = node->next) {
+    // Visit inner loops first.
     int current_induction_simplification_count = induction_simplication_count_;
     if (node->inner != nullptr) {
       TraverseLoopsInnerToOuter(node->inner);
     }
-    // Visit loop after its inner loops have been visited. If the induction of any inner
-    // loop has been simplified, recompute the induction information of this loop first.
+    // Recompute induction information of this loop if the induction
+    // of any inner loop has been simplified.
     if (current_induction_simplification_count != induction_simplication_count_) {
       induction_range_.ReVisit(node->loop_info);
     }
-    // Repeat simplifications until no more changes occur. Note that since
-    // each simplification consists of eliminating code (without introducing
-    // new code), this process is always finite.
+    // Repeat simplifications in the body of this loop until no more changes occur.
+    // Note that since each simplification consists of eliminating code (without
+    // introducing new code), this process is always finite.
     do {
       simplified_ = false;
-      SimplifyBlocks(node);
       SimplifyInduction(node);
+      SimplifyBlocks(node);
     } while (simplified_);
-    // Remove inner loops when empty.
+    // Simplify inner loop.
     if (node->inner == nullptr) {
-      RemoveIfEmptyInnerLoop(node);
+      SimplifyInnerLoop(node);
     }
   }
 }
@@ -198,7 +199,7 @@
     iset_->clear();
     int32_t use_count = 0;
     if (IsPhiInduction(phi) &&
-        IsOnlyUsedAfterLoop(node->loop_info, phi, &use_count) &&
+        IsOnlyUsedAfterLoop(node->loop_info, phi, /*collect_loop_uses*/ false, &use_count) &&
         // No uses, or no early-exit with proper replacement.
         (use_count == 0 ||
          (!IsEarlyExit(node->loop_info) && TryReplaceWithLastValue(phi, preheader)))) {
@@ -206,7 +207,6 @@
         RemoveFromCycle(i);
       }
       simplified_ = true;
-      induction_simplication_count_++;
     }
   }
 }
@@ -216,24 +216,14 @@
   for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) {
     HBasicBlock* block = it.Current();
     // Remove dead instructions from the loop-body.
-    for (HBackwardInstructionIterator i(block->GetInstructions()); !i.Done(); i.Advance()) {
-      HInstruction* instruction = i.Current();
-      if (instruction->IsDeadAndRemovable()) {
-        simplified_ = true;
-        block->RemoveInstruction(instruction);
-      }
-    }
+    RemoveDeadInstructions(block->GetPhis());
+    RemoveDeadInstructions(block->GetInstructions());
     // Remove trivial control flow blocks from the loop-body.
-    HBasicBlock* succ = nullptr;
-    if (IsGotoBlock(block, &succ) && succ->GetPredecessors().size() == 1) {
-      // Trivial goto block can be removed.
-      HBasicBlock* pred = block->GetSinglePredecessor();
+    if (block->GetPredecessors().size() == 1 &&
+        block->GetSuccessors().size() == 1 &&
+        block->GetSingleSuccessor()->GetPredecessors().size() == 1) {
       simplified_ = true;
-      pred->ReplaceSuccessor(block, succ);
-      block->RemoveDominatedBlock(succ);
-      block->DisconnectAndDelete();
-      pred->AddDominatedBlock(succ);
-      succ->SetDominator(pred);
+      block->MergeWith(block->GetSingleSuccessor());
     } else if (block->GetSuccessors().size() == 2) {
       // Trivial if block can be bypassed to either branch.
       HBasicBlock* succ0 = block->GetSuccessors()[0];
@@ -258,55 +248,66 @@
   }
 }
 
-void HLoopOptimization::RemoveIfEmptyInnerLoop(LoopNode* node) {
+bool HLoopOptimization::SimplifyInnerLoop(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
   // Ensure loop header logic is finite.
-  if (!induction_range_.IsFinite(node->loop_info)) {
-    return;
+  int64_t tc = 0;
+  if (!induction_range_.IsFinite(node->loop_info, &tc)) {
+    return false;
   }
   // Ensure there is only a single loop-body (besides the header).
   HBasicBlock* body = nullptr;
   for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) {
     if (it.Current() != header) {
       if (body != nullptr) {
-        return;
+        return false;
       }
       body = it.Current();
     }
   }
   // Ensure there is only a single exit point.
   if (header->GetSuccessors().size() != 2) {
-    return;
+    return false;
   }
   HBasicBlock* exit = (header->GetSuccessors()[0] == body)
       ? header->GetSuccessors()[1]
       : header->GetSuccessors()[0];
   // Ensure exit can only be reached by exiting loop.
   if (exit->GetPredecessors().size() != 1) {
-    return;
+    return false;
   }
-  // Detect an empty loop: no side effects other than plain iteration. Replace
-  // subsequent index uses, if any, with the last value and remove the loop.
+  // Detect either an empty loop (no side effects other than plain iteration) or
+  // a trivial loop (just iterating once). Replace subsequent index uses, if any,
+  // with the last value and remove the loop, possibly after unrolling its body.
+  HInstruction* phi = header->GetFirstPhi();
   iset_->clear();
   int32_t use_count = 0;
-  if (IsEmptyHeader(header) &&
-      IsEmptyBody(body) &&
-      IsOnlyUsedAfterLoop(node->loop_info, header->GetFirstPhi(), &use_count) &&
-      // No uses, or proper replacement.
-      (use_count == 0 || TryReplaceWithLastValue(header->GetFirstPhi(), preheader))) {
-    body->DisconnectAndDelete();
-    exit->RemovePredecessor(header);
-    header->RemoveSuccessor(exit);
-    header->RemoveDominatedBlock(exit);
-    header->DisconnectAndDelete();
-    preheader->AddSuccessor(exit);
-    preheader->AddInstruction(new (graph_->GetArena()) HGoto());  // global allocator
-    preheader->AddDominatedBlock(exit);
-    exit->SetDominator(preheader);
-    // Update hierarchy.
-    RemoveLoop(node);
+  if (IsEmptyHeader(header)) {
+    bool is_empty = IsEmptyBody(body);
+    if ((is_empty || tc == 1) &&
+        IsOnlyUsedAfterLoop(node->loop_info, phi, /*collect_loop_uses*/ true, &use_count) &&
+        // No uses, or proper replacement.
+        (use_count == 0 || TryReplaceWithLastValue(phi, preheader))) {
+      if (!is_empty) {
+        // Unroll the loop body, which sees initial value of the index.
+        phi->ReplaceWith(phi->InputAt(0));
+        preheader->MergeInstructionsWith(body);
+      }
+      body->DisconnectAndDelete();
+      exit->RemovePredecessor(header);
+      header->RemoveSuccessor(exit);
+      header->RemoveDominatedBlock(exit);
+      header->DisconnectAndDelete();
+      preheader->AddSuccessor(exit);
+      preheader->AddInstruction(new (graph_->GetArena()) HGoto());  // global allocator
+      preheader->AddDominatedBlock(exit);
+      exit->SetDominator(preheader);
+      RemoveLoop(node);  // update hierarchy
+      return true;
+    }
   }
+  return false;
 }
 
 bool HLoopOptimization::IsPhiInduction(HPhi* phi) {
@@ -374,12 +375,19 @@
 
 bool HLoopOptimization::IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                                             HInstruction* instruction,
+                                            bool collect_loop_uses,
                                             /*out*/ int32_t* use_count) {
   for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) {
     HInstruction* user = use.GetUser();
     if (iset_->find(user) == iset_->end()) {  // not excluded?
       HLoopInformation* other_loop_info = user->GetBlock()->GetLoopInformation();
       if (other_loop_info != nullptr && other_loop_info->IsIn(*loop_info)) {
+        // If collect_loop_uses is set, simply keep adding those uses to the set.
+        // Otherwise, reject uses inside the loop that were not already in the set.
+        if (collect_loop_uses) {
+          iset_->insert(user);
+          continue;
+        }
         return false;
       }
       ++*use_count;
@@ -388,40 +396,48 @@
   return true;
 }
 
-void HLoopOptimization::ReplaceAllUses(HInstruction* instruction, HInstruction* replacement) {
-  const HUseList<HInstruction*>& uses = instruction->GetUses();
-  for (auto it = uses.begin(), end = uses.end(); it != end;) {
-    HInstruction* user = it->GetUser();
-    size_t index = it->GetIndex();
-    ++it;  // increment before replacing
-    if (iset_->find(user) == iset_->end()) {  // not excluded?
-      user->ReplaceInput(replacement, index);
-      induction_range_.Replace(user, instruction, replacement);  // update induction
-    }
-  }
-  const HUseList<HEnvironment*>& env_uses = instruction->GetEnvUses();
-  for (auto it = env_uses.begin(), end = env_uses.end(); it != end;) {
-    HEnvironment* user = it->GetUser();
-    size_t index = it->GetIndex();
-    ++it;  // increment before replacing
-    if (iset_->find(user->GetHolder()) == iset_->end()) {  // not excluded?
-      user->RemoveAsUserOfInput(index);
-      user->SetRawEnvAt(index, replacement);
-      replacement->AddEnvUseAt(user, index);
-    }
-  }
-}
-
 bool HLoopOptimization::TryReplaceWithLastValue(HInstruction* instruction, HBasicBlock* block) {
   // Try to replace outside uses with the last value. Environment uses can consume this
   // value too, since any first true use is outside the loop (although this may imply
   // that de-opting may look "ahead" a bit on the phi value). If there are only environment
   // uses, the value is dropped altogether, since the computations have no effect.
   if (induction_range_.CanGenerateLastValue(instruction)) {
-    ReplaceAllUses(instruction, induction_range_.GenerateLastValue(instruction, graph_, block));
+    HInstruction* replacement = induction_range_.GenerateLastValue(instruction, graph_, block);
+    const HUseList<HInstruction*>& uses = instruction->GetUses();
+    for (auto it = uses.begin(), end = uses.end(); it != end;) {
+      HInstruction* user = it->GetUser();
+      size_t index = it->GetIndex();
+      ++it;  // increment before replacing
+      if (iset_->find(user) == iset_->end()) {  // not excluded?
+        user->ReplaceInput(replacement, index);
+        induction_range_.Replace(user, instruction, replacement);  // update induction
+      }
+    }
+    const HUseList<HEnvironment*>& env_uses = instruction->GetEnvUses();
+    for (auto it = env_uses.begin(), end = env_uses.end(); it != end;) {
+      HEnvironment* user = it->GetUser();
+      size_t index = it->GetIndex();
+      ++it;  // increment before replacing
+      if (iset_->find(user->GetHolder()) == iset_->end()) {  // not excluded?
+        user->RemoveAsUserOfInput(index);
+        user->SetRawEnvAt(index, replacement);
+        replacement->AddEnvUseAt(user, index);
+      }
+    }
+    induction_simplication_count_++;
     return true;
   }
   return false;
 }
 
+void HLoopOptimization::RemoveDeadInstructions(const HInstructionList& list) {
+  for (HBackwardInstructionIterator i(list); !i.Done(); i.Advance()) {
+    HInstruction* instruction = i.Current();
+    if (instruction->IsDeadAndRemovable()) {
+      simplified_ = true;
+      instruction->GetBlock()->RemoveInstructionOrPhi(instruction);
+    }
+  }
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 0f05b24..9ddab41 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -60,19 +60,21 @@
 
   void TraverseLoopsInnerToOuter(LoopNode* node);
 
+  // Simplification.
   void SimplifyInduction(LoopNode* node);
   void SimplifyBlocks(LoopNode* node);
-  void RemoveIfEmptyInnerLoop(LoopNode* node);
+  bool SimplifyInnerLoop(LoopNode* node);
 
+  // Helpers.
   bool IsPhiInduction(HPhi* phi);
   bool IsEmptyHeader(HBasicBlock* block);
   bool IsEmptyBody(HBasicBlock* block);
-
   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                            HInstruction* instruction,
+                           bool collect_loop_uses,
                            /*out*/ int32_t* use_count);
-  void ReplaceAllUses(HInstruction* instruction, HInstruction* replacement);
   bool TryReplaceWithLastValue(HInstruction* instruction, HBasicBlock* block);
+  void RemoveDeadInstructions(const HInstructionList& list);
 
   // Range information based on prior induction variable analysis.
   InductionVarRange induction_range_;
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index d45fa11..a6084eb 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -1853,6 +1853,14 @@
   SetGraph(nullptr);
 }
 
+void HBasicBlock::MergeInstructionsWith(HBasicBlock* other) {
+  DCHECK(EndsWithControlFlowInstruction());
+  RemoveInstruction(GetLastInstruction());
+  instructions_.Add(other->GetInstructions());
+  other->instructions_.SetBlockOfInstructions(this);
+  other->instructions_.Clear();
+}
+
 void HBasicBlock::MergeWith(HBasicBlock* other) {
   DCHECK_EQ(GetGraph(), other->GetGraph());
   DCHECK(ContainsElement(dominated_blocks_, other));
@@ -1861,11 +1869,7 @@
   DCHECK(other->GetPhis().IsEmpty());
 
   // Move instructions from `other` to `this`.
-  DCHECK(EndsWithControlFlowInstruction());
-  RemoveInstruction(GetLastInstruction());
-  instructions_.Add(other->GetInstructions());
-  other->instructions_.SetBlockOfInstructions(this);
-  other->instructions_.Clear();
+  MergeInstructionsWith(other);
 
   // Remove `other` from the loops it is included in.
   for (HLoopInformationOutwardIterator it(*other); !it.Done(); it.Advance()) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index ea9a94c..bf13702 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1097,6 +1097,9 @@
   // with a control flow instruction).
   void ReplaceWith(HBasicBlock* other);
 
+  // Merges the instructions of `other` at the end of `this`.
+  void MergeInstructionsWith(HBasicBlock* other);
+
   // Merge `other` at the end of `this`. This method updates loops, reverse post
   // order, links to predecessors, successors, dominators and deletes the block
   // from the graph. The two blocks must be successive, i.e. `this` the only
diff --git a/test/627-checker-unroll/expected.txt b/test/627-checker-unroll/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/627-checker-unroll/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/627-checker-unroll/info.txt b/test/627-checker-unroll/info.txt
new file mode 100644
index 0000000..d7885f4
--- /dev/null
+++ b/test/627-checker-unroll/info.txt
@@ -0,0 +1 @@
+Test on loop unrolling.
diff --git a/test/627-checker-unroll/src/Main.java b/test/627-checker-unroll/src/Main.java
new file mode 100644
index 0000000..9785bdc
--- /dev/null
+++ b/test/627-checker-unroll/src/Main.java
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Test on loop unrolling. Removes loop control overhead (including suspend
+// checks) and exposes more opportunities for constant folding.
+//
+public class Main {
+
+  static int sA = 0;
+
+  /// CHECK-START: void Main.unroll() loop_optimization (before)
+  /// CHECK-DAG: Phi            loop:<<Loop:B\d+>>
+  /// CHECK-DAG: StaticFieldSet loop:<<Loop>>
+  //
+  /// CHECK-START: void Main.unroll() loop_optimization (after)
+  /// CHECK-DAG: StaticFieldSet loop:none
+  //
+  /// CHECK-START: void Main.unroll() instruction_simplifier$after_bce (after)
+  /// CHECK-DAG: <<Int:i\d+>> IntConstant    68                  loop:none
+  /// CHECK-DAG:              StaticFieldSet [{{l\d+}},<<Int>>]  loop:none
+  //
+  /// CHECK-START: void Main.unroll() loop_optimization (after)
+  /// CHECK-NOT: Phi
+  public static void unroll() {
+    for (int i = 4; i < 5; i++) {
+      sA = 17 * i;
+    }
+  }
+
+  /// CHECK-START: int Main.unrollLV() loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>> Phi              loop:<<Loop:B\d+>>
+  /// CHECK-DAG:              StaticFieldSet   loop:<<Loop>>
+  /// CHECK-DAG:              Return [<<Phi>>] loop:none
+  //
+  /// CHECK-START: int Main.unrollLV() loop_optimization (after)
+  /// CHECK-DAG: StaticFieldSet loop:none
+  //
+  /// CHECK-START: int Main.unrollLV() instruction_simplifier$after_bce (after)
+  /// CHECK-DAG: <<Int1:i\d+>> IntConstant    187                 loop:none
+  /// CHECK-DAG: <<Int2:i\d+>> IntConstant    12                  loop:none
+  /// CHECK-DAG:               StaticFieldSet [{{l\d+}},<<Int1>>] loop:none
+  /// CHECK-DAG:               Return [<<Int2>>]                  loop:none
+  //
+  /// CHECK-START: int Main.unrollLV() loop_optimization (after)
+  /// CHECK-NOT: Phi
+  public static int unrollLV() {
+    int i;
+    for (i = 11; i < 12; i++) {
+      sA = 17 * i;
+    }
+    return i;
+  }
+
+  /// CHECK-START: void Main.unrollNest() loop_optimization (before)
+  /// CHECK-DAG:               SuspendCheck    loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>> Phi             loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG:               SuspendCheck    loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>> Phi             loop:<<Loop2:B\d+>> outer_loop:<<Loop1>>
+  /// CHECK-DAG:               SuspendCheck    loop:<<Loop2>>      outer_loop:<<Loop1>>
+  /// CHECK-DAG: <<Phi3:i\d+>> Phi             loop:<<Loop3:B\d+>> outer_loop:<<Loop2>>
+  /// CHECK-DAG:               SuspendCheck    loop:<<Loop3>>      outer_loop:<<Loop2>>
+  /// CHECK-DAG:               StaticFieldSet  loop:<<Loop3>>      outer_loop:<<Loop2>>
+  //
+  /// CHECK-START: void Main.unrollNest() loop_optimization (after)
+  /// CHECK-DAG: StaticFieldSet loop:none
+  /// CHECK-DAG: SuspendCheck   loop:none
+  /// CHECK-NOT: SuspendCheck
+  //
+  /// CHECK-START: void Main.unrollNest() instruction_simplifier$after_bce (after)
+  /// CHECK-DAG: <<Int:i\d+>> IntConstant    6                   loop:none
+  /// CHECK-DAG:              StaticFieldSet [{{l\d+}},<<Int>>]  loop:none
+  //
+  /// CHECK-START: void Main.unrollNest() loop_optimization (after)
+  /// CHECK-NOT: Phi
+  public static void unrollNest() {
+    // Unrolling each loop in turn ultimately removes the complete nest!
+    for (int i = 4; i < 5; i++) {
+      for (int j = 5; j < 6; j++) {
+        for (int k = 6; k < 7; k++) {
+          sA = k;
+        }
+      }
+    }
+  }
+
+  //
+  // Verifier.
+  //
+
+  public static void main(String[] args) {
+    unroll();
+    expectEquals(68, sA);
+    expectEquals(12, unrollLV());
+    expectEquals(187, sA);
+    unrollNest();
+    expectEquals(6, sA);
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}