ART: Optimize ADD/SUB+ADD_shift into ADDS/SUBS+CINC for HDiv/HRem
HDiv/HRem with a constant divisor are optimized by multiplying the
dividend by a kind of reciprocal of the divisor. The multiplication
result might need some corrections before it is final; the last
correction is to increment the result by 1 if it is negative.
Currently this is done with 'add result, temp_result, temp_result,
lsr #31' (or '#63' for 64-bit operands). Such a shifted ADD usually
has a latency of 2, e.g. on Cortex-A55.
However, if one of the earlier corrections is an ADD or SUB, the sign
can be detected with ADDS/SUBS instead: they set the N flag if the
result is negative, and the arithmetic right shift that may follow
does not change the sign. This allows the final increment to be done
with CINC, which has a latency of 1:
adds temp_result, temp_result, dividend
cinc out, temp_result, mi
This CL implements this optimization.
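
For illustration only (not part of this CL's sources), the new
sequence for 'v / 7' on 32-bit operands is equivalent to the Java
sketch below; the magic constant 0x92492493 and the shift of 2 are
the standard values for a divisor of 7 and are assumed here:

  class DivBy7Sketch {
    // Mirrors the generated sequence: smull/lsr, adds, asr, cinc.
    static int divBy7(int v) {
      final int magic = 0x92492493;                // "reciprocal" of 7
      int q = (int) (((long) magic * v) >> 32);    // high 32 bits of the product
      q += v;     // correction; emitted as ADDS, so the N flag records the sign of q
      q >>= 2;    // final arithmetic shift, which does not change the sign
      return q < 0 ? q + 1 : q;                    // emitted as 'cinc out, q, mi'
    }

    public static void main(String[] args) {
      for (int v : new int[] {0, 1, -1, 6, 7, -7, 100, -100}) {
        System.out.println(v + " / 7 = " + divBy7(v) + " (expected " + (v / 7) + ")");
      }
    }
  }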
Test: test.py --host --optimizing --jit
Test: test.py --target --optimizing --jit
Change-Id: Ia6aac6771908e992c86e32fe1694a82bd1b7af0b
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 11aade4..1e098d7 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -3067,11 +3067,22 @@
Register temp_result,
Register out,
UseScratchRegisterScope* temps_scope) {
+ // The multiplication result might need some corrections before it is final.
+ // The last correction is to increment the result by 1 if it is negative.
+ // Currently this is done with 'add result, temp_result, temp_result, lsr #31 or #63'.
+ // Such a shifted ADD usually has a latency of 2, e.g. on Cortex-A55.
+ // However, if one of the corrections is an ADD or SUB, the sign can be detected with
+ // ADDS/SUBS: they set the N flag if the result is negative, and the arithmetic right
+ // shift that may follow does not change the sign. This allows the use of CINC MI,
+ // which has a latency of 1.
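+ //
+ // For example, for a 32-bit division by 7 (see the checker tests updated in this CL),
+ // the generated sequence changes as follows:
+ //
+ //   Before:                               After:
+ //     add  temp, temp, dividend             adds temp, temp, dividend
+ //     asr  temp, temp, #2                   asr  temp, temp, #2
+ //     add  out, temp, temp, lsr #31         cinc out, temp, mi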
+ bool use_cond_inc = false;
+
// As magic_number can be modified to fit into 32 bits, check whether the correction is needed.
if (NeedToAddDividend(magic_number, divisor)) {
- __ Add(temp_result, temp_result, dividend);
+ __ Adds(temp_result, temp_result, dividend);
+ use_cond_inc = true;
} else if (NeedToSubDividend(magic_number, divisor)) {
- __ Sub(temp_result, temp_result, dividend);
+ __ Subs(temp_result, temp_result, dividend);
+ use_cond_inc = true;
}
if (final_right_shift != 0) {
@@ -3079,7 +3090,11 @@
}
Register& result = (is_rem) ? temp_result : out;
- __ Add(result, temp_result, Operand(temp_result, LSR, temp_result.GetSizeInBits() - 1));
+ if (use_cond_inc) {
+ __ Cinc(result, temp_result, mi);
+ } else {
+ __ Add(result, temp_result, Operand(temp_result, LSR, temp_result.GetSizeInBits() - 1));
+ }
if (is_rem) {
// TODO: Strength reduction for msub.
Register temp_imm = temps_scope->AcquireSameSizeAs(out);
diff --git a/test/411-checker-hdiv-hrem-const/info.txt b/test/411-checker-hdiv-hrem-const/info.txt
index ff8332f..ce88f44 100644
--- a/test/411-checker-hdiv-hrem-const/info.txt
+++ b/test/411-checker-hdiv-hrem-const/info.txt
@@ -1,2 +1,2 @@
-Test the optimization of LSR+ASR into one ASR for integer division and remainder instructions when
+Checker test for optimizations of integer division and remainder instructions when
the denominator is a constant, not power of 2.
diff --git a/test/411-checker-hdiv-hrem-const/src/DivTest.java b/test/411-checker-hdiv-hrem-const/src/DivTest.java
index cb97abd..9a1cd47 100644
--- a/test/411-checker-hdiv-hrem-const/src/DivTest.java
+++ b/test/411-checker-hdiv-hrem-const/src/DivTest.java
@@ -115,11 +115,13 @@
// must be corrected by the 'add' instruction which is between 'lsr' and 'asr'
// instructions. In such a case they cannot be combined into one 'asr'.
//
+ // The test case also checks that 'add' and 'add_shift' are optimized into 'adds' and 'cinc'.
+ //
/// CHECK-START-ARM64: int DivTest.$noinline$IntDivBy7(int) disassembly (after)
/// CHECK: lsr x{{\d+}}, x{{\d+}}, #32
- /// CHECK-NEXT: add w{{\d+}}, w{{\d+}}, w{{\d+}}
+ /// CHECK-NEXT: adds w{{\d+}}, w{{\d+}}, w{{\d+}}
/// CHECK-NEXT: asr w{{\d+}}, w{{\d+}}, #2
- /// CHECK-NEXT: add w{{\d+}}, w{{\d+}}, w{{\d+}}, lsr #31
+ /// CHECK-NEXT: cinc w{{\d+}}, w{{\d+}}, mi
private static int $noinline$IntDivBy7(int v) {
int r = v / 7;
return r;
@@ -130,11 +132,13 @@
// must be corrected. In this case it is a 'sub' instruction which is between 'lsr' and 'asr'
// instructions. So they cannot be combined into one 'asr'.
//
+ // The test case also checks that 'sub' and 'add_shift' are optimized into 'subs' and 'cinc'.
+ //
/// CHECK-START-ARM64: int DivTest.$noinline$IntDivByMinus7(int) disassembly (after)
/// CHECK: lsr x{{\d+}}, x{{\d+}}, #32
- /// CHECK-NEXT: sub w{{\d+}}, w{{\d+}}, w{{\d+}}
+ /// CHECK-NEXT: subs w{{\d+}}, w{{\d+}}, w{{\d+}}
/// CHECK-NEXT: asr w{{\d+}}, w{{\d+}}, #2
- /// CHECK-NEXT: add w{{\d+}}, w{{\d+}}, w{{\d+}}, lsr #31
+ /// CHECK-NEXT: cinc w{{\d+}}, w{{\d+}}, mi
private static int $noinline$IntDivByMinus7(int v) {
int r = v / -7;
return r;
@@ -218,6 +222,22 @@
expectEquals(1L, $noinline$LongDivByMinus6(-6L));
expectEquals(-3L, $noinline$LongDivByMinus6(19L));
expectEquals(3L, $noinline$LongDivByMinus6(-19L));
+
+ expectEquals(0L, $noinline$LongDivBy100(0L));
+ expectEquals(0L, $noinline$LongDivBy100(1L));
+ expectEquals(0L, $noinline$LongDivBy100(-1L));
+ expectEquals(1L, $noinline$LongDivBy100(100L));
+ expectEquals(-1L, $noinline$LongDivBy100(-100L));
+ expectEquals(3L, $noinline$LongDivBy100(301L));
+ expectEquals(-3L, $noinline$LongDivBy100(-301L));
+
+ expectEquals(0L, $noinline$LongDivByMinus100(0L));
+ expectEquals(0L, $noinline$LongDivByMinus100(1L));
+ expectEquals(0L, $noinline$LongDivByMinus100(-1L));
+ expectEquals(-1L, $noinline$LongDivByMinus100(100L));
+ expectEquals(1L, $noinline$LongDivByMinus100(-100L));
+ expectEquals(-3L, $noinline$LongDivByMinus100(301L));
+ expectEquals(3L, $noinline$LongDivByMinus100(-301L));
}
// Test cases for Int64 HDiv/HRem to check that optimizations implemented for Int32 are not
@@ -272,4 +292,28 @@
long r = v / -6L;
return r;
}
+
+ // A test to check that 'add' and 'add_shift' are optimized into 'adds' and 'cinc'.
+ //
+ /// CHECK-START-ARM64: long DivTest.$noinline$LongDivBy100(long) disassembly (after)
+ /// CHECK: smulh x{{\d+}}, x{{\d+}}, x{{\d+}}
+ /// CHECK-NEXT: adds x{{\d+}}, x{{\d+}}, x{{\d+}}
+ /// CHECK-NEXT: asr x{{\d+}}, x{{\d+}}, #6
+ /// CHECK-NEXT: cinc x{{\d+}}, x{{\d+}}, mi
+ private static long $noinline$LongDivBy100(long v) {
+ long r = v / 100L;
+ return r;
+ }
+
+ // A test to check that 'sub' and 'add_shift' are optimized into 'subs' and 'cinc'.
+ //
+ /// CHECK-START-ARM64: long DivTest.$noinline$LongDivByMinus100(long) disassembly (after)
+ /// CHECK: smulh x{{\d+}}, x{{\d+}}, x{{\d+}}
+ /// CHECK-NEXT: subs x{{\d+}}, x{{\d+}}, x{{\d+}}
+ /// CHECK-NEXT: asr x{{\d+}}, x{{\d+}}, #6
+ /// CHECK-NEXT: cinc x{{\d+}}, x{{\d+}}, mi
+ private static long $noinline$LongDivByMinus100(long v) {
+ long r = v / -100L;
+ return r;
+ }
}
diff --git a/test/411-checker-hdiv-hrem-const/src/RemTest.java b/test/411-checker-hdiv-hrem-const/src/RemTest.java
index bcb1834..11889c4 100644
--- a/test/411-checker-hdiv-hrem-const/src/RemTest.java
+++ b/test/411-checker-hdiv-hrem-const/src/RemTest.java
@@ -121,9 +121,9 @@
//
/// CHECK-START-ARM64: int RemTest.$noinline$IntRemBy7(int) disassembly (after)
/// CHECK: lsr x{{\d+}}, x{{\d+}}, #32
- /// CHECK-NEXT: add w{{\d+}}, w{{\d+}}, w{{\d+}}
+ /// CHECK-NEXT: adds w{{\d+}}, w{{\d+}}, w{{\d+}}
/// CHECK-NEXT: asr w{{\d+}}, w{{\d+}}, #2
- /// CHECK-NEXT: add w{{\d+}}, w{{\d+}}, w{{\d+}}, lsr #31
+ /// CHECK-NEXT: cinc w{{\d+}}, w{{\d+}}, mi
/// CHECK-NEXT: mov w{{\d+}}, #0x7
/// CHECK-NEXT: msub w{{\d+}}, w{{\d+}}, w{{\d+}}, w{{\d+}}
private static int $noinline$IntRemBy7(int v) {
@@ -138,9 +138,9 @@
//
/// CHECK-START-ARM64: int RemTest.$noinline$IntRemByMinus7(int) disassembly (after)
/// CHECK: lsr x{{\d+}}, x{{\d+}}, #32
- /// CHECK-NEXT: sub w{{\d+}}, w{{\d+}}, w{{\d+}}
+ /// CHECK-NEXT: subs w{{\d+}}, w{{\d+}}, w{{\d+}}
/// CHECK-NEXT: asr w{{\d+}}, w{{\d+}}, #2
- /// CHECK-NEXT: add w{{\d+}}, w{{\d+}}, w{{\d+}}, lsr #31
+ /// CHECK-NEXT: cinc w{{\d+}}, w{{\d+}}, mi
/// CHECK-NEXT: mov w{{\d+}}, #0xfffffff9
/// CHECK-NEXT: msub w{{\d+}}, w{{\d+}}, w{{\d+}}, w{{\d+}}
private static int $noinline$IntRemByMinus7(int v) {
@@ -230,6 +230,22 @@
expectEquals(0L, $noinline$LongRemByMinus6(-6L));
expectEquals(1L, $noinline$LongRemByMinus6(19L));
expectEquals(-1L, $noinline$LongRemByMinus6(-19L));
+
+ expectEquals(0L, $noinline$LongRemBy100(0L));
+ expectEquals(1L, $noinline$LongRemBy100(1L));
+ expectEquals(-1L, $noinline$LongRemBy100(-1L));
+ expectEquals(0L, $noinline$LongRemBy100(100L));
+ expectEquals(0L, $noinline$LongRemBy100(-100L));
+ expectEquals(1L, $noinline$LongRemBy100(101L));
+ expectEquals(-1L, $noinline$LongRemBy100(-101L));
+
+ expectEquals(0L, $noinline$LongRemByMinus100(0L));
+ expectEquals(1L, $noinline$LongRemByMinus100(1L));
+ expectEquals(-1L, $noinline$LongRemByMinus100(-1L));
+ expectEquals(0L, $noinline$LongRemByMinus100(100L));
+ expectEquals(0L, $noinline$LongRemByMinus100(-100L));
+ expectEquals(1L, $noinline$LongRemByMinus100(101L));
+ expectEquals(-1L, $noinline$LongRemByMinus100(-101L));
}
// Test cases for Int64 HDiv/HRem to check that optimizations implemented for Int32 are not
@@ -296,4 +312,32 @@
long r = v % -6L;
return r;
}
+
+ // A test to check that 'add' and 'add_shift' are optimized into 'adds' and 'cinc'.
+ //
+ /// CHECK-START-ARM64: long RemTest.$noinline$LongRemBy100(long) disassembly (after)
+ /// CHECK: smulh x{{\d+}}, x{{\d+}}, x{{\d+}}
+ /// CHECK-NEXT: adds x{{\d+}}, x{{\d+}}, x{{\d+}}
+ /// CHECK-NEXT: asr x{{\d+}}, x{{\d+}}, #6
+ /// CHECK-NEXT: cinc x{{\d+}}, x{{\d+}}, mi
+ /// CHECK-NEXT: mov x{{\d+}}, #0x64
+ /// CHECK-NEXT: msub x{{\d+}}, x{{\d+}}, x{{\d+}}, x{{\d+}}
+ private static long $noinline$LongRemBy100(long v) {
+ long r = v % 100L;
+ return r;
+ }
+
+ // A test to check that 'sub' and 'add_shift' are optimized into 'subs' and 'cinc'.
+ //
+ /// CHECK-START-ARM64: long RemTest.$noinline$LongRemByMinus100(long) disassembly (after)
+ /// CHECK: smulh x{{\d+}}, x{{\d+}}, x{{\d+}}
+ /// CHECK-NEXT: subs x{{\d+}}, x{{\d+}}, x{{\d+}}
+ /// CHECK-NEXT: asr x{{\d+}}, x{{\d+}}, #6
+ /// CHECK-NEXT: cinc x{{\d+}}, x{{\d+}}, mi
+ /// CHECK-NEXT: mov x{{\d+}}, #0xffffffffffffff9c
+ /// CHECK-NEXT: msub x{{\d+}}, x{{\d+}}, x{{\d+}}, x{{\d+}}
+ private static long $noinline$LongRemByMinus100(long v) {
+ long r = v % -100L;
+ return r;
+ }
}