Fix intersecting live ranges created by instruction scheduler
When scheduling code like the following:
LOOP:
v2=phi(v0, v1)
use(v2)
v1=...
goto LOOP
the instruction scheduler can move 'v1=...' before 'use(v2)'. This
causes live ranges of v1 and v2 to intersect and results to a MOV
instruction to be created.
The CL fixes this.
Improvements, Pixel3:
Little CPU, arm64
micro/GCCLoops
Example12 14.1%
Example10b 11.0%
Example23 8.1%
Example24 6.6%
Example10a 5.0%
FFT workload 4.7%
Compress workload 1.2%
Little CPU, arm32
micro/GCCLoops
Example23 7.5%
Example24 4.3%
MonteCarlo workload 1.35%
Big CPU, arm32 and arm64
No significant improvements
No significant regressions (> 5%) are found.
Test: test.py --host --optimizing --jit --gtest
Test: test.py --target --optimizing --jit
Test: run-gtests.sh
Change-Id: I1e4282af18f2d51fde5325a0c00a57e8bbc4fbed
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index e5ff8a8..f722cf9 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -282,6 +282,36 @@
}
}
+void SchedulingGraph::AddCrossIterationDependencies(SchedulingNode* node) {
+ for (HInstruction* instruction : node->GetInstruction()->GetInputs()) {
+ // Having a phi-function from a loop header as an input means the current node of the
+ // scheduling graph has a cross-iteration dependency because such phi-functions bring values
+ // from the previous iteration to the current iteration.
+ if (!instruction->IsLoopHeaderPhi()) {
+ continue;
+ }
+ for (HInstruction* phi_input : instruction->GetInputs()) {
+ // As a scheduling graph of the current basic block is built by
+ // processing instructions bottom-up, nullptr returned by GetNode means
+ // an instruction defining a value for the phi is either before the
+ // instruction represented by node or it is in a different basic block.
+ SchedulingNode* def_node = GetNode(phi_input);
+
+ // We don't create a dependency if there are uses besides the use in phi.
+ // In such cases a register to hold phi_input is usually allocated and
+ // a MOV instruction is generated. In cases with multiple uses and no MOV
+ // instruction, reordering creating a MOV instruction can improve
+ // performance more than an attempt to avoid a MOV instruction.
+ if (def_node != nullptr && def_node != node && phi_input->GetUses().HasExactlyOneElement()) {
+ // We have an implicit data dependency between node and def_node.
+ // AddAddDataDependency cannot be used because it is for explicit data dependencies.
+ // So AddOtherDependency is used.
+ AddOtherDependency(def_node, node);
+ }
+ }
+ }
+}
+
void SchedulingGraph::AddDependencies(SchedulingNode* instruction_node,
bool is_scheduling_barrier) {
HInstruction* instruction = instruction_node->GetInstruction();
@@ -340,7 +370,11 @@
if (other_node->IsSchedulingBarrier()) {
// We have reached a scheduling barrier so we can stop further
// processing.
- DCHECK(other_node->HasOtherDependency(instruction_node));
+ //
+ // As a "other" dependency is not set up if a data dependency exists, we need to check that
+ // one of them must exist.
+ DCHECK(other_node->HasOtherDependency(instruction_node)
+ || other_node->HasDataDependency(instruction_node));
break;
}
if (side_effect_dependency_analysis_.HasSideEffectDependency(other, instruction)) {
@@ -372,6 +406,8 @@
AddOtherDependency(GetNode(use.GetUser()->GetHolder()), instruction_node);
}
}
+
+ AddCrossIterationDependencies(instruction_node);
}
static const std::string InstructionTypeId(const HInstruction* instruction) {
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index a97eda7..f7180a0 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -183,7 +183,9 @@
void AddOtherPredecessor(SchedulingNode* predecessor) {
// Check whether the predecessor has been added earlier.
- if (HasOtherDependency(predecessor)) {
+ // As an optimization of the scheduling graph, we don't need to create another dependency if
+ // there is a data dependency between scheduling nodes.
+ if (HasOtherDependency(predecessor) || HasDataDependency(predecessor)) {
return;
}
other_predecessors_.push_back(predecessor);
@@ -362,6 +364,25 @@
AddDependency(node, dependency, /*is_data_dependency*/false);
}
+ // Analyze whether the scheduling node has cross-iteration dependencies which mean it uses
+ // values defined on the previous iteration.
+ //
+ // Supported cases:
+ //
+ // L:
+ // v2 = loop_head_phi(v1)
+ // instr1(v2)
+ // v1 = instr2
+ // goto L
+ //
+ // In such cases moving instr2 before instr1 creates intersecting live ranges
+ // of v1 and v2. As a result a separate register is needed to keep the value
+ // defined by instr2 which is only used on the next iteration.
+ // If instr2 is not moved, no additional register is needed. The register
+ // used by instr1 is reused.
+ // To prevent such a situation a "other" dependency between instr1 and instr2 must be set.
+ void AddCrossIterationDependencies(SchedulingNode* node);
+
// Add dependencies nodes for the given `SchedulingNode`: inputs, environments, and side-effects.
void AddDependencies(SchedulingNode* node, bool is_scheduling_barrier = false);
diff --git a/test/706-checker-scheduler/src/Main.java b/test/706-checker-scheduler/src/Main.java
index af18193..5a66fbb 100644
--- a/test/706-checker-scheduler/src/Main.java
+++ b/test/706-checker-scheduler/src/Main.java
@@ -322,7 +322,7 @@
// but has more complex chains of transforming the original references:
// ParameterValue --> BoundType --> NullCheck --> ArrayGet.
// ParameterValue --> BoundType --> NullCheck --> IntermediateAddress --> ArraySet.
- // After using LSA to analyze the orginal references, the scheduler should be able
+ // After using LSA to analyze the original references, the scheduler should be able
// to find out that 'a' and 'b' may alias, hence unable to schedule these ArraGet/Set.
/// CHECK-START-ARM64: void Main.CrossOverLoop2(java.lang.Object, java.lang.Object) scheduler (before)
@@ -584,9 +584,126 @@
}
}
+ // Check that instructions having cross iteration dependencies are not
+ // reordered.
+ //
+ /// CHECK-START-{ARM,ARM64}: void Main.testCrossItersDependencies() scheduler (before)
+ /// CHECK: <<ID1:i\d+>> Phi [{{i\d+}},<<ID3:i\d+>>]
+ /// CHECK: <<ID2:i\d+>> Phi [{{i\d+}},<<ID4:i\d+>>]
+ //
+ /// CHECK: <<ID3>> Sub [<<ID1>>,<<ID2>>]
+ /// CHECK: <<ID4>> Add [<<ID2>>,{{i\d+}}]
+
+ /// CHECK-START-{ARM,ARM64}: void Main.testCrossItersDependencies() scheduler (after)
+ /// CHECK: <<ID1:i\d+>> Phi [{{i\d+}},<<ID3:i\d+>>]
+ /// CHECK: <<ID2:i\d+>> Phi [{{i\d+}},<<ID4:i\d+>>]
+ //
+ /// CHECK: <<ID3>> Sub [<<ID1>>,<<ID2>>]
+ /// CHECK: <<ID4>> Add [<<ID2>>,{{i\d+}}]
+
+ /// CHECK-START-ARM: void Main.testCrossItersDependencies() disassembly (after)
+ /// CHECK: subs
+ /// CHECK: add
+ /// CHECK: adds
+ /// CHECK: ldrh
+ /// CHECK: cmp
+ /// CHECK: beq
+
+ /// CHECK-START-ARM64: void Main.testCrossItersDependencies() disassembly (after)
+ /// CHECK: sub
+ /// CHECK: add
+ /// CHECK: add
+ /// CHECK: ldrh
+ /// CHECK: cbz
+ private static void testCrossItersDependencies() {
+ int[] data = {1, 2, 3, 0};
+ int sub = 0;
+ int sum = data[0];
+ for (int i = 1; data[i] != 0; ++i) {
+ sub -= sum;
+ sum += data[i];
+ }
+ expectEquals(sub, -4);
+ expectEquals(sum, 6);
+ }
+
+ // Check instructions defining values for the next iteration don't become
+ // self-dependent in a scheduling graph which prevents valid reordering.
+ //
+ /// CHECK-START-{ARM,ARM64}: void Main.testNoSelfDependantSchedNode(int) scheduler (before)
+ /// CHECK: IntermediateAddress
+ /// CHECK: ArrayGet
+ /// CHECK: LessThanOrEqual
+ /// CHECK: Select
+ /// CHECK: IntermediateAddress
+ /// CHECK: ArraySet
+ /// CHECK: Add
+
+ /// CHECK-START-{ARM,ARM64}: void Main.testNoSelfDependantSchedNode(int) scheduler (after)
+ /// CHECK: IntermediateAddress
+ /// CHECK: ArrayGet
+ /// CHECK: IntermediateAddress
+ /// CHECK: LessThanOrEqual
+ /// CHECK: Select
+ /// CHECK: ArraySet
+ /// CHECK: Add
+ //
+ // Parameter n is to prevent unrolling of the main loop.
+ private static void testNoSelfDependantSchedNode(int n) {
+ final int MAX = 2;
+ int[] a = {1, 2, 3};
+ int[] b = new int[a.length];
+ n = Math.min(n, a.length);
+ for (int i = 0; i < n; ++i) {
+ int j = a[i];
+ b[i] = (j > MAX ? MAX : 0);
+ }
+ expectEquals(b[0], 0);
+ expectEquals(b[1], 0);
+ expectEquals(b[2], 2);
+ }
+
+ // In case of cross iteration dependencies when a value for the next iteration is also used on
+ // the current iteration a MOV instruction is generated anyway. In such cases setting dependency
+ // between scheduling nodes will not eliminate MOV.
+ // In the test 'i+1' is such an example.
+ // The test checks that a dependency between scheduling nodes (first ArrayGet and Add) is not
+ // setup and Add is scheduled before ArrayGet.
+ //
+ /// CHECK-START-{ARM,ARM64}: void Main.testNonPreventingSchedulingCrossItersDeps(int) scheduler (before)
+ /// CHECK: IntermediateAddress
+ /// CHECK-NEXT: ArrayGet
+ /// CHECK-NEXT: Add
+ /// CHECK-NEXT: ArrayGet
+
+ /// CHECK-START-{ARM,ARM64}: void Main.testNonPreventingSchedulingCrossItersDeps(int) scheduler (after)
+ /// CHECK: IntermediateAddress
+ /// CHECK-NEXT: Add
+ /// CHECK-NEXT: ArrayGet
+ /// CHECK-NEXT: ArrayGet
+ //
+ // Parameter n is to prevent unrolling of the main loop.
+ private static void testNonPreventingSchedulingCrossItersDeps(int n) {
+ int[] a = {1, 2, 3};
+ n = Math.min(n, a.length);
+ for (int i = 0; i < n - 1; ++i) {
+ if (a[i] < a[i + 1]) {
+ int tmp = a[i];
+ a[i] = a[i + 1];
+ a[i + 1] = tmp;
+ }
+ }
+ expectEquals(a[0], 2);
+ expectEquals(a[1], 3);
+ expectEquals(a[2], 1);
+ }
+
public static void main(String[] args) {
testVecSetScalars();
testVecReplicateScalar();
+ testCrossItersDependencies();
+ testNoSelfDependantSchedNode(3);
+ testNonPreventingSchedulingCrossItersDeps(3);
if ((arrayAccess() + intDiv(10)) != -35) {
System.out.println("FAIL");
}