Fix intersecting live ranges created by instruction scheduler

When scheduling code like the following:
LOOP:
   v2=phi(v0, v1)
   use(v2)
   v1=...
   goto LOOP

the instruction scheduler can move 'v1=...' before 'use(v2)'. This
causes live ranges of v1 and v2 to intersect and results to a MOV
instruction to be created.

The CL fixes this.

Improvements, Pixel3:
  Little CPU, arm64
    micro/GCCLoops
      Example12       14.1%
      Example10b      11.0%
      Example23       8.1%
      Example24       6.6%
      Example10a      5.0%
    FFT workload      4.7%
    Compress workload 1.2%

  Little CPU, arm32
    micro/GCCLoops
      Example23         7.5%
      Example24         4.3%
    MonteCarlo workload 1.35%

  Big CPU, arm32 and arm64
    No significant improvements

No significant regressions (> 5%) are found.

Test: test.py --host --optimizing --jit --gtest
Test: test.py --target --optimizing --jit
Test: run-gtests.sh
Change-Id: I1e4282af18f2d51fde5325a0c00a57e8bbc4fbed
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index e5ff8a8..f722cf9 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -282,6 +282,36 @@
   }
 }
 
+void SchedulingGraph::AddCrossIterationDependencies(SchedulingNode* node) {
+  for (HInstruction* instruction : node->GetInstruction()->GetInputs()) {
+    // Having a phi-function from a loop header as an input means the current node of the
+    // scheduling graph has a cross-iteration dependency because such phi-functions bring values
+    // from the previous iteration to the current iteration.
+    if (!instruction->IsLoopHeaderPhi()) {
+      continue;
+    }
+    for (HInstruction* phi_input : instruction->GetInputs()) {
+      // As a scheduling graph of the current basic block is built by
+      // processing instructions bottom-up, nullptr returned by GetNode means
+      // an instruction defining a value for the phi is either before the
+      // instruction represented by node or it is in a different basic block.
+      SchedulingNode* def_node = GetNode(phi_input);
+
+      // We don't create a dependency if there are uses besides the use in phi.
+      // In such cases a register to hold phi_input is usually allocated and
+      // a MOV instruction is generated. In cases with multiple uses and no MOV
+      // instruction, reordering creating a MOV instruction can improve
+      // performance more than an attempt to avoid a MOV instruction.
+      if (def_node != nullptr && def_node != node && phi_input->GetUses().HasExactlyOneElement()) {
+        // We have an implicit data dependency between node and def_node.
+        // AddAddDataDependency cannot be used because it is for explicit data dependencies.
+        // So AddOtherDependency is used.
+        AddOtherDependency(def_node, node);
+      }
+    }
+  }
+}
+
 void SchedulingGraph::AddDependencies(SchedulingNode* instruction_node,
                                       bool is_scheduling_barrier) {
   HInstruction* instruction = instruction_node->GetInstruction();
@@ -340,7 +370,11 @@
       if (other_node->IsSchedulingBarrier()) {
         // We have reached a scheduling barrier so we can stop further
         // processing.
-        DCHECK(other_node->HasOtherDependency(instruction_node));
+        //
+        // As a "other" dependency is not set up if a data dependency exists, we need to check that
+        // one of them must exist.
+        DCHECK(other_node->HasOtherDependency(instruction_node)
+               || other_node->HasDataDependency(instruction_node));
         break;
       }
       if (side_effect_dependency_analysis_.HasSideEffectDependency(other, instruction)) {
@@ -372,6 +406,8 @@
       AddOtherDependency(GetNode(use.GetUser()->GetHolder()), instruction_node);
     }
   }
+
+  AddCrossIterationDependencies(instruction_node);
 }
 
 static const std::string InstructionTypeId(const HInstruction* instruction) {
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index a97eda7..f7180a0 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -183,7 +183,9 @@
 
   void AddOtherPredecessor(SchedulingNode* predecessor) {
     // Check whether the predecessor has been added earlier.
-    if (HasOtherDependency(predecessor)) {
+    // As an optimization of the scheduling graph, we don't need to create another dependency if
+    // there is a data dependency between scheduling nodes.
+    if (HasOtherDependency(predecessor) || HasDataDependency(predecessor)) {
       return;
     }
     other_predecessors_.push_back(predecessor);
@@ -362,6 +364,25 @@
     AddDependency(node, dependency, /*is_data_dependency*/false);
   }
 
+  // Analyze whether the scheduling node has cross-iteration dependencies which mean it uses
+  // values defined on the previous iteration.
+  //
+  // Supported cases:
+  //
+  //   L:
+  //     v2 = loop_head_phi(v1)
+  //     instr1(v2)
+  //     v1 = instr2
+  //     goto L
+  //
+  // In such cases moving instr2 before instr1 creates intersecting live ranges
+  // of v1 and v2. As a result a separate register is needed to keep the value
+  // defined by instr2 which is only used on the next iteration.
+  // If instr2 is not moved, no additional register is needed. The register
+  // used by instr1 is reused.
+  // To prevent such a situation a "other" dependency between instr1 and instr2 must be set.
+  void AddCrossIterationDependencies(SchedulingNode* node);
+
   // Add dependencies nodes for the given `SchedulingNode`: inputs, environments, and side-effects.
   void AddDependencies(SchedulingNode* node, bool is_scheduling_barrier = false);
 
diff --git a/test/706-checker-scheduler/src/Main.java b/test/706-checker-scheduler/src/Main.java
index af18193..5a66fbb 100644
--- a/test/706-checker-scheduler/src/Main.java
+++ b/test/706-checker-scheduler/src/Main.java
@@ -322,7 +322,7 @@
   // but has more complex chains of transforming the original references:
   // ParameterValue --> BoundType --> NullCheck --> ArrayGet.
   // ParameterValue --> BoundType --> NullCheck --> IntermediateAddress --> ArraySet.
-  // After using LSA to analyze the orginal references, the scheduler should be able
+  // After using LSA to analyze the original references, the scheduler should be able
   // to find out that 'a' and 'b' may alias, hence unable to schedule these ArraGet/Set.
 
   /// CHECK-START-ARM64: void Main.CrossOverLoop2(java.lang.Object, java.lang.Object) scheduler (before)
@@ -584,9 +584,126 @@
     }
   }
 
+  // Check that instructions having cross iteration dependencies are not
+  // reordered.
+  //
+  /// CHECK-START-{ARM,ARM64}: void Main.testCrossItersDependencies() scheduler (before)
+  /// CHECK:     <<ID1:i\d+>>  Phi [{{i\d+}},<<ID3:i\d+>>]
+  /// CHECK:     <<ID2:i\d+>>  Phi [{{i\d+}},<<ID4:i\d+>>]
+  //
+  /// CHECK:     <<ID3>>  Sub [<<ID1>>,<<ID2>>]
+  /// CHECK:     <<ID4>>  Add [<<ID2>>,{{i\d+}}]
+
+  /// CHECK-START-{ARM,ARM64}: void Main.testCrossItersDependencies() scheduler (after)
+  /// CHECK:     <<ID1:i\d+>>  Phi [{{i\d+}},<<ID3:i\d+>>]
+  /// CHECK:     <<ID2:i\d+>>  Phi [{{i\d+}},<<ID4:i\d+>>]
+  //
+  /// CHECK:     <<ID3>>  Sub [<<ID1>>,<<ID2>>]
+  /// CHECK:     <<ID4>>  Add [<<ID2>>,{{i\d+}}]
+
+  /// CHECK-START-ARM: void Main.testCrossItersDependencies() disassembly (after)
+  /// CHECK:     subs
+  /// CHECK:     add
+  /// CHECK:     adds
+  /// CHECK:     ldrh
+  /// CHECK:     cmp
+  /// CHECK:     beq
+
+  /// CHECK-START-ARM64: void Main.testCrossItersDependencies() disassembly (after)
+  /// CHECK:     sub
+  /// CHECK:     add
+  /// CHECK:     add
+  /// CHECK:     ldrh
+  /// CHECK:     cbz
+  private static void testCrossItersDependencies() {
+    int[] data = {1, 2, 3, 0};
+    int sub = 0;
+    int sum = data[0];
+    for (int i = 1; data[i] != 0; ++i) {
+      sub -= sum;
+      sum += data[i];
+    }
+    expectEquals(sub, -4);
+    expectEquals(sum, 6);
+  }
+
+  // Check instructions defining values for the next iteration don't become
+  // self-dependent in a scheduling graph which prevents valid reordering.
+  //
+  /// CHECK-START-{ARM,ARM64}: void Main.testNoSelfDependantSchedNode(int) scheduler (before)
+  /// CHECK:     IntermediateAddress
+  /// CHECK:     ArrayGet
+  /// CHECK:     LessThanOrEqual
+  /// CHECK:     Select
+  /// CHECK:     IntermediateAddress
+  /// CHECK:     ArraySet
+  /// CHECK:     Add
+
+  /// CHECK-START-{ARM,ARM64}: void Main.testNoSelfDependantSchedNode(int) scheduler (after)
+  /// CHECK:     IntermediateAddress
+  /// CHECK:     ArrayGet
+  /// CHECK:     IntermediateAddress
+  /// CHECK:     LessThanOrEqual
+  /// CHECK:     Select
+  /// CHECK:     ArraySet
+  /// CHECK:     Add
+  //
+  // Parameter n is to prevent unrolling of the main loop.
+  private static void testNoSelfDependantSchedNode(int n) {
+    final int MAX = 2;
+    int[] a = {1, 2, 3};
+    int[] b = new int[a.length];
+    n = Math.min(n, a.length);
+    for (int i = 0; i < n; ++i) {
+      int j = a[i];
+      b[i] = (j > MAX ? MAX : 0);
+    }
+    expectEquals(b[0], 0);
+    expectEquals(b[1], 0);
+    expectEquals(b[2], 2);
+  }
+
+  // In case of cross iteration dependencies when a value for the next iteration is also used on
+  // the current iteration a MOV instruction is generated anyway. In such cases setting dependency
+  // between scheduling nodes will not eliminate MOV.
+  // In the test 'i+1' is such an example.
+  // The test checks that a dependency between scheduling nodes (first ArrayGet and Add) is not
+  // setup and Add is scheduled before ArrayGet.
+  //
+  /// CHECK-START-{ARM,ARM64}: void Main.testNonPreventingSchedulingCrossItersDeps(int) scheduler (before)
+  /// CHECK:          IntermediateAddress
+  /// CHECK-NEXT:     ArrayGet
+  /// CHECK-NEXT:     Add
+  /// CHECK-NEXT:     ArrayGet
+
+  /// CHECK-START-{ARM,ARM64}: void Main.testNonPreventingSchedulingCrossItersDeps(int) scheduler (after)
+  /// CHECK:          IntermediateAddress
+  /// CHECK-NEXT:     Add
+  /// CHECK-NEXT:     ArrayGet
+  /// CHECK-NEXT:     ArrayGet
+  //
+  // Parameter n is to prevent unrolling of the main loop.
+  private static void testNonPreventingSchedulingCrossItersDeps(int n) {
+    int[] a = {1, 2, 3};
+    n = Math.min(n, a.length);
+    for (int i = 0; i < n - 1; ++i) {
+      if (a[i] < a[i + 1]) {
+        int tmp = a[i];
+        a[i] = a[i + 1];
+        a[i + 1] = tmp;
+      }
+    }
+    expectEquals(a[0], 2);
+    expectEquals(a[1], 3);
+    expectEquals(a[2], 1);
+  }
+
   public static void main(String[] args) {
     testVecSetScalars();
     testVecReplicateScalar();
+    testCrossItersDependencies();
+    testNoSelfDependantSchedNode(3);
+    testNonPreventingSchedulingCrossItersDeps(3);
     if ((arrayAccess() + intDiv(10)) != -35) {
       System.out.println("FAIL");
     }