Improve x86 long divide

Implement inline division for literal and variable divisors.  Use the
general case for dividing by a literal by using a double length multiply
by the appropriate constant with fixups.  This is the Hacker's Delight
algorithm.

Change-Id: I563c250f99d89fca5ff8bcbf13de74de13815cfe
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 79820c9..ccae130 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -284,18 +284,261 @@
   OpCmpImmBranch(ccode, low_reg, val_lo, taken);
 }
 
+void X86Mir2Lir::CalculateMagicAndShift(int divisor, int& magic, int& shift) {
+  // It does not make sense to calculate magic and shift for zero divisor.
+  DCHECK_NE(divisor, 0);
+
+  /* According to H.S.Warren's Hacker's Delight Chapter 10 and
+   * T,Grablund, P.L.Montogomery's Division by invariant integers using multiplication.
+   * The magic number M and shift S can be calculated in the following way:
+   * Let nc be the most positive value of numerator(n) such that nc = kd - 1,
+   * where divisor(d) >=2.
+   * Let nc be the most negative value of numerator(n) such that nc = kd + 1,
+   * where divisor(d) <= -2.
+   * Thus nc can be calculated like:
+   * nc = 2^31 + 2^31 % d - 1, where d >= 2
+   * nc = -2^31 + (2^31 + 1) % d, where d >= 2.
+   *
+   * So the shift p is the smallest p satisfying
+   * 2^p > nc * (d - 2^p % d), where d >= 2
+   * 2^p > nc * (d + 2^p % d), where d <= -2.
+   *
+   * the magic number M is calcuated by
+   * M = (2^p + d - 2^p % d) / d, where d >= 2
+   * M = (2^p - d - 2^p % d) / d, where d <= -2.
+   *
+   * Notice that p is always bigger than or equal to 32, so we just return 32-p as
+   * the shift number S.
+   */
+
+  int32_t p = 31;
+  const uint32_t two31 = 0x80000000U;
+
+  // Initialize the computations.
+  uint32_t abs_d = (divisor >= 0) ? divisor : -divisor;
+  uint32_t tmp = two31 + (static_cast<uint32_t>(divisor) >> 31);
+  uint32_t abs_nc = tmp - 1 - tmp % abs_d;
+  uint32_t quotient1 = two31 / abs_nc;
+  uint32_t remainder1 = two31 % abs_nc;
+  uint32_t quotient2 = two31 / abs_d;
+  uint32_t remainder2 = two31 % abs_d;
+
+  /*
+   * To avoid handling both positive and negative divisor, Hacker's Delight
+   * introduces a method to handle these 2 cases together to avoid duplication.
+   */
+  uint32_t delta;
+  do {
+    p++;
+    quotient1 = 2 * quotient1;
+    remainder1 = 2 * remainder1;
+    if (remainder1 >= abs_nc) {
+      quotient1++;
+      remainder1 = remainder1 - abs_nc;
+    }
+    quotient2 = 2 * quotient2;
+    remainder2 = 2 * remainder2;
+    if (remainder2 >= abs_d) {
+      quotient2++;
+      remainder2 = remainder2 - abs_d;
+    }
+    delta = abs_d - remainder2;
+  } while (quotient1 < delta || (quotient1 == delta && remainder1 == 0));
+
+  magic = (divisor > 0) ? (quotient2 + 1) : (-quotient2 - 1);
+  shift = p - 32;
+}
+
 RegLocation X86Mir2Lir::GenDivRemLit(RegLocation rl_dest, int reg_lo,
                                      int lit, bool is_div) {
   LOG(FATAL) << "Unexpected use of GenDivRemLit for x86";
   return rl_dest;
 }
 
+RegLocation X86Mir2Lir::GenDivRemLit(RegLocation rl_dest, RegLocation rl_src,
+                                     int imm, bool is_div) {
+  // Use a multiply (and fixup) to perform an int div/rem by a constant.
+
+  // We have to use fixed registers, so flush all the temps.
+  FlushAllRegs();
+  LockCallTemps();  // Prepare for explicit register usage.
+
+  // Assume that the result will be in EDX.
+  RegLocation rl_result = {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed,
+                          r2, INVALID_REG, INVALID_SREG, INVALID_SREG};
+
+  // handle 0x80000000 / -1 special case.
+  LIR *minint_branch = 0;
+  if (imm == -1) {
+    if (is_div) {
+      LoadValueDirectFixed(rl_src, r0);
+      OpRegImm(kOpCmp, r0, 0x80000000);
+      minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondEq);
+
+      // for x != MIN_INT, x / -1 == -x.
+      NewLIR1(kX86Neg32R, r0);
+
+      LIR* branch_around = NewLIR1(kX86Jmp8, 0);
+      // The target for cmp/jmp above.
+      minint_branch->target = NewLIR0(kPseudoTargetLabel);
+      // EAX already contains the right value (0x80000000),
+      branch_around->target = NewLIR0(kPseudoTargetLabel);
+    } else {
+      // x % -1 == 0.
+      LoadConstantNoClobber(r0, 0);
+    }
+    // For this case, return the result in EAX.
+    rl_result.low_reg = r0;
+  } else {
+    DCHECK(imm <= -2 || imm >= 2);
+    // Use H.S.Warren's Hacker's Delight Chapter 10 and
+    // T,Grablund, P.L.Montogomery's Division by invariant integers using multiplication.
+    int magic, shift;
+    CalculateMagicAndShift(imm, magic, shift);
+
+    /*
+     * For imm >= 2,
+     *     int(n/imm) = floor(n/imm) = floor(M*n/2^S), while n > 0
+     *     int(n/imm) = ceil(n/imm) = floor(M*n/2^S) +1, while n < 0.
+     * For imm <= -2,
+     *     int(n/imm) = ceil(n/imm) = floor(M*n/2^S) +1 , while n > 0
+     *     int(n/imm) = floor(n/imm) = floor(M*n/2^S), while n < 0.
+     * We implement this algorithm in the following way:
+     * 1. multiply magic number m and numerator n, get the higher 32bit result in EDX
+     * 2. if imm > 0 and magic < 0, add numerator to EDX
+     *    if imm < 0 and magic > 0, sub numerator from EDX
+     * 3. if S !=0, SAR S bits for EDX
+     * 4. add 1 to EDX if EDX < 0
+     * 5. Thus, EDX is the quotient
+     */
+
+    // Numerator into EAX.
+    int numerator_reg = -1;
+    if (!is_div || (imm > 0 && magic < 0) || (imm < 0 && magic > 0)) {
+      // We will need the value later.
+      if (rl_src.location == kLocPhysReg) {
+        // We can use it directly.
+        DCHECK(rl_src.low_reg != r0 && rl_src.low_reg != r2);
+        numerator_reg = rl_src.low_reg;
+      } else {
+        LoadValueDirectFixed(rl_src, r1);
+        numerator_reg = r1;
+      }
+      OpRegCopy(r0, numerator_reg);
+    } else {
+      // Only need this once.  Just put it into EAX.
+      LoadValueDirectFixed(rl_src, r0);
+    }
+
+    // EDX = magic.
+    LoadConstantNoClobber(r2, magic);
+
+    // EDX:EAX = magic & dividend.
+    NewLIR1(kX86Imul32DaR, r2);
+
+    if (imm > 0 && magic < 0) {
+      // Add numerator to EDX.
+      DCHECK_NE(numerator_reg, -1);
+      NewLIR2(kX86Add32RR, r2, numerator_reg);
+    } else if (imm < 0 && magic > 0) {
+      DCHECK_NE(numerator_reg, -1);
+      NewLIR2(kX86Sub32RR, r2, numerator_reg);
+    }
+
+    // Do we need the shift?
+    if (shift != 0) {
+      // Shift EDX by 'shift' bits.
+      NewLIR2(kX86Sar32RI, r2, shift);
+    }
+
+    // Add 1 to EDX if EDX < 0.
+
+    // Move EDX to EAX.
+    OpRegCopy(r0, r2);
+
+    // Move sign bit to bit 0, zeroing the rest.
+    NewLIR2(kX86Shr32RI, r2, 31);
+
+    // EDX = EDX + EAX.
+    NewLIR2(kX86Add32RR, r2, r0);
+
+    // Quotient is in EDX.
+    if (!is_div) {
+      // We need to compute the remainder.
+      // Remainder is divisor - (quotient * imm).
+      DCHECK_NE(numerator_reg, -1);
+      OpRegCopy(r0, numerator_reg);
+
+      // EAX = numerator * imm.
+      OpRegRegImm(kOpMul, r2, r2, imm);
+
+      // EDX -= EAX.
+      NewLIR2(kX86Sub32RR, r0, r2);
+
+      // For this case, return the result in EAX.
+      rl_result.low_reg = r0;
+    }
+  }
+
+  return rl_result;
+}
+
 RegLocation X86Mir2Lir::GenDivRem(RegLocation rl_dest, int reg_lo,
                                   int reg_hi, bool is_div) {
   LOG(FATAL) << "Unexpected use of GenDivRem for x86";
   return rl_dest;
 }
 
+RegLocation X86Mir2Lir::GenDivRem(RegLocation rl_dest, RegLocation rl_src1,
+                                  RegLocation rl_src2, bool is_div, bool check_zero) {
+  // We have to use fixed registers, so flush all the temps.
+  FlushAllRegs();
+  LockCallTemps();  // Prepare for explicit register usage.
+
+  // Load LHS into EAX.
+  LoadValueDirectFixed(rl_src1, r0);
+
+  // Load RHS into EBX.
+  LoadValueDirectFixed(rl_src2, r1);
+
+  // Copy LHS sign bit into EDX.
+  NewLIR0(kx86Cdq32Da);
+
+  if (check_zero) {
+    // Handle division by zero case.
+    GenImmedCheck(kCondEq, r1, 0, kThrowDivZero);
+  }
+
+  // Have to catch 0x80000000/-1 case, or we will get an exception!
+  OpRegImm(kOpCmp, r1, -1);
+  LIR *minus_one_branch = NewLIR2(kX86Jcc8, 0, kX86CondNe);
+
+  // RHS is -1.
+  OpRegImm(kOpCmp, r0, 0x80000000);
+  LIR * minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondNe);
+
+  // In 0x80000000/-1 case.
+  if (!is_div) {
+    // For DIV, EAX is already right. For REM, we need EDX 0.
+    LoadConstantNoClobber(r2, 0);
+  }
+  LIR* done = NewLIR1(kX86Jmp8, 0);
+
+  // Expected case.
+  minus_one_branch->target = NewLIR0(kPseudoTargetLabel);
+  minint_branch->target = minus_one_branch->target;
+  NewLIR1(kX86Idivmod32DaR, r1);
+  done->target = NewLIR0(kPseudoTargetLabel);
+
+  // Result is in EAX for div and EDX for rem.
+  RegLocation rl_result = {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed,
+                          r0, INVALID_REG, INVALID_SREG, INVALID_SREG};
+  if (!is_div) {
+    rl_result.low_reg = r2;
+  }
+  return rl_result;
+}
+
 bool X86Mir2Lir::GenInlinedMinMaxInt(CallInfo* info, bool is_min) {
   DCHECK_EQ(cu_->instruction_set, kX86);