ART: Implement the easy long division/remainder by a constant

Also optimizes long/int divisions by power-of-two values.

Also do some clean-up.

Change-Id: Ie414e64aac251c81361ae107d157c14439e6dab5
Signed-off-by: Alexei Zavjalov <alexei.zavjalov@intel.com>
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index b9abdbf..057639c 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -513,7 +513,7 @@
   OpCondBranch(ccode, taken);
 }
 
-void X86Mir2Lir::CalculateMagicAndShift(int divisor, int& magic, int& shift) {
+void X86Mir2Lir::CalculateMagicAndShift(int64_t divisor, int64_t& magic, int& shift, bool is_long) {
   // It does not make sense to calculate magic and shift for zero divisor.
   DCHECK_NE(divisor, 0);
 
@@ -525,8 +525,8 @@
    * Let nc be the most negative value of numerator(n) such that nc = kd + 1,
    * where divisor(d) <= -2.
    * Thus nc can be calculated like:
-   * nc = 2^31 + 2^31 % d - 1, where d >= 2
-   * nc = -2^31 + (2^31 + 1) % d, where d >= 2.
+   * nc = exp + exp % d - 1, where d >= 2 and exp = 2^31 for int or 2^63 for long
+   * nc = -exp + (exp + 1) % d, where d >= 2 and exp = 2^31 for int or 2^63 for long
    *
    * So the shift p is the smallest p satisfying
    * 2^p > nc * (d - 2^p % d), where d >= 2
@@ -536,27 +536,28 @@
    * M = (2^p + d - 2^p % d) / d, where d >= 2
    * M = (2^p - d - 2^p % d) / d, where d <= -2.
    *
-   * Notice that p is always bigger than or equal to 32, so we just return 32-p as
+   * Notice that p is always bigger than or equal to 32/64, so we just return 32-p/64-p as
    * the shift number S.
    */
 
-  int32_t p = 31;
-  const uint32_t two31 = 0x80000000U;
+  int64_t p = (is_long) ? 63 : 31;
+  const uint64_t exp = (is_long) ? 0x8000000000000000ULL : 0x80000000U;
 
   // Initialize the computations.
-  uint32_t abs_d = (divisor >= 0) ? divisor : -divisor;
-  uint32_t tmp = two31 + (static_cast<uint32_t>(divisor) >> 31);
-  uint32_t abs_nc = tmp - 1 - tmp % abs_d;
-  uint32_t quotient1 = two31 / abs_nc;
-  uint32_t remainder1 = two31 % abs_nc;
-  uint32_t quotient2 = two31 / abs_d;
-  uint32_t remainder2 = two31 % abs_d;
+  uint64_t abs_d = (divisor >= 0) ? divisor : -divisor;
+  uint64_t tmp = exp + ((is_long) ? static_cast<uint64_t>(divisor) >> 63 :
+                                    static_cast<uint32_t>(divisor) >> 31);
+  uint64_t abs_nc = tmp - 1 - tmp % abs_d;
+  uint64_t quotient1 = exp / abs_nc;
+  uint64_t remainder1 = exp % abs_nc;
+  uint64_t quotient2 = exp / abs_d;
+  uint64_t remainder2 = exp % abs_d;
 
   /*
    * To avoid handling both positive and negative divisor, Hacker's Delight
    * introduces a method to handle these 2 cases together to avoid duplication.
    */
-  uint32_t delta;
+  uint64_t delta;
   do {
     p++;
     quotient1 = 2 * quotient1;
@@ -575,7 +576,12 @@
   } while (quotient1 < delta || (quotient1 == delta && remainder1 == 0));
 
   magic = (divisor > 0) ? (quotient2 + 1) : (-quotient2 - 1);
-  shift = p - 32;
+
+  if (!is_long) {
+    magic = static_cast<int>(magic);
+  }
+
+  shift = (is_long) ? p - 64 : p - 32;
 }
 
 RegLocation X86Mir2Lir::GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div) {
@@ -586,52 +592,57 @@
 RegLocation X86Mir2Lir::GenDivRemLit(RegLocation rl_dest, RegLocation rl_src,
                                      int imm, bool is_div) {
   // Use a multiply (and fixup) to perform an int div/rem by a constant.
+  RegLocation rl_result;
 
-  // We have to use fixed registers, so flush all the temps.
-  FlushAllRegs();
-  LockCallTemps();  // Prepare for explicit register usage.
-
-  // Assume that the result will be in EDX.
-  RegLocation rl_result = {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, rs_r2, INVALID_SREG, INVALID_SREG};
-
-  // handle div/rem by 1 special case.
   if (imm == 1) {
+    rl_result = EvalLoc(rl_dest, kCoreReg, true);
     if (is_div) {
       // x / 1 == x.
-      StoreValue(rl_result, rl_src);
+      LoadValueDirectFixed(rl_src, rl_result.reg);
     } else {
       // x % 1 == 0.
-      LoadConstantNoClobber(rs_r0, 0);
-      // For this case, return the result in EAX.
-      rl_result.reg.SetReg(r0);
+      LoadConstantNoClobber(rl_result.reg, 0);
     }
   } else if (imm == -1) {  // handle 0x80000000 / -1 special case.
+    rl_result = EvalLoc(rl_dest, kCoreReg, true);
     if (is_div) {
-      LIR *minint_branch = 0;
-      LoadValueDirectFixed(rl_src, rs_r0);
-      OpRegImm(kOpCmp, rs_r0, 0x80000000);
-      minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondEq);
+      LoadValueDirectFixed(rl_src, rl_result.reg);
+      OpRegImm(kOpCmp, rl_result.reg, 0x80000000);
+      LIR *minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondEq);
 
       // for x != MIN_INT, x / -1 == -x.
-      NewLIR1(kX86Neg32R, r0);
+      NewLIR1(kX86Neg32R, rl_result.reg.GetReg());
 
-      LIR* branch_around = NewLIR1(kX86Jmp8, 0);
-      // The target for cmp/jmp above.
-      minint_branch->target = NewLIR0(kPseudoTargetLabel);
       // EAX already contains the right value (0x80000000),
-      branch_around->target = NewLIR0(kPseudoTargetLabel);
+      minint_branch->target = NewLIR0(kPseudoTargetLabel);
     } else {
       // x % -1 == 0.
-      LoadConstantNoClobber(rs_r0, 0);
+      LoadConstantNoClobber(rl_result.reg, 0);
     }
-    // For this case, return the result in EAX.
-    rl_result.reg.SetReg(r0);
+  } else if (is_div && IsPowerOfTwo(std::abs(imm))) {
+    // Division using shifting.
+    rl_src = LoadValue(rl_src, kCoreReg);
+    rl_result = EvalLoc(rl_dest, kCoreReg, true);
+    if (IsSameReg(rl_result.reg, rl_src.reg)) {
+      RegStorage rs_temp = AllocTypedTemp(false, kCoreReg);
+      rl_result.reg.SetReg(rs_temp.GetReg());
+    }
+    NewLIR3(kX86Lea32RM, rl_result.reg.GetReg(), rl_src.reg.GetReg(), std::abs(imm) - 1);
+    NewLIR2(kX86Test32RR, rl_src.reg.GetReg(), rl_src.reg.GetReg());
+    OpCondRegReg(kOpCmov, kCondPl, rl_result.reg, rl_src.reg);
+    int shift_amount = LowestSetBit(imm);
+    OpRegImm(kOpAsr, rl_result.reg, shift_amount);
+    if (imm < 0) {
+      OpReg(kOpNeg, rl_result.reg);
+    }
   } else {
     CHECK(imm <= -2 || imm >= 2);
+
     // Use H.S.Warren's Hacker's Delight Chapter 10 and
     // T,Grablund, P.L.Montogomery's Division by invariant integers using multiplication.
-    int magic, shift;
-    CalculateMagicAndShift(imm, magic, shift);
+    int64_t magic;
+    int shift;
+    CalculateMagicAndShift((int64_t)imm, magic, shift, false /* is_long */);
 
     /*
      * For imm >= 2,
@@ -649,18 +660,22 @@
      * 5. Thus, EDX is the quotient
      */
 
+    FlushReg(rs_r0);
+    Clobber(rs_r0);
+    LockTemp(rs_r0);
+    FlushReg(rs_r2);
+    Clobber(rs_r2);
+    LockTemp(rs_r2);
+
+    // Assume that the result will be in EDX.
+    rl_result = {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, rs_r2, INVALID_SREG, INVALID_SREG};
+
     // Numerator into EAX.
     RegStorage numerator_reg;
     if (!is_div || (imm > 0 && magic < 0) || (imm < 0 && magic > 0)) {
       // We will need the value later.
-      if (rl_src.location == kLocPhysReg) {
-        // We can use it directly.
-        DCHECK(rl_src.reg.GetReg() != rs_r0.GetReg() && rl_src.reg.GetReg() != rs_r2.GetReg());
-        numerator_reg = rl_src.reg;
-      } else {
-        numerator_reg = rs_r1;
-        LoadValueDirectFixed(rl_src, numerator_reg);
-      }
+      rl_src = LoadValue(rl_src, kCoreReg);
+      numerator_reg = rl_src.reg;
       OpRegCopy(rs_r0, numerator_reg);
     } else {
       // Only need this once.  Just put it into EAX.
@@ -1704,13 +1719,191 @@
   }
 }
 
+void X86Mir2Lir::GenDivRemLongLit(RegLocation rl_dest, RegLocation rl_src,
+                                  int64_t imm, bool is_div) {
+  if (imm == 0) {
+    GenDivZeroException();
+  } else if (imm == 1) {
+    if (is_div) {
+      // x / 1 == x.
+      StoreValueWide(rl_dest, rl_src);
+    } else {
+      // x % 1 == 0.
+      RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+      LoadConstantWide(rl_result.reg, 0);
+      StoreValueWide(rl_dest, rl_result);
+    }
+  } else if (imm == -1) {  // handle 0x8000000000000000 / -1 special case.
+    if (is_div) {
+      rl_src = LoadValueWide(rl_src, kCoreReg);
+      RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+      RegStorage rs_temp = AllocTempWide();
+
+      OpRegCopy(rl_result.reg, rl_src.reg);
+      LoadConstantWide(rs_temp, 0x8000000000000000);
+
+      // If x == MIN_LONG, return MIN_LONG.
+      OpRegReg(kOpCmp, rl_src.reg, rs_temp);
+      LIR *minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondEq);
+
+      // For x != MIN_LONG, x / -1 == -x.
+      OpReg(kOpNeg, rl_result.reg);
+
+      minint_branch->target = NewLIR0(kPseudoTargetLabel);
+      FreeTemp(rs_temp);
+      StoreValueWide(rl_dest, rl_result);
+    } else {
+      // x % -1 == 0.
+      RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+      LoadConstantWide(rl_result.reg, 0);
+      StoreValueWide(rl_dest, rl_result);
+    }
+  } else if (is_div && IsPowerOfTwo(std::abs(imm))) {
+    // Division using shifting.
+    rl_src = LoadValueWide(rl_src, kCoreReg);
+    RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+    if (IsSameReg(rl_result.reg, rl_src.reg)) {
+      RegStorage rs_temp = AllocTypedTempWide(false, kCoreReg);
+      rl_result.reg.SetReg(rs_temp.GetReg());
+    }
+    LoadConstantWide(rl_result.reg, std::abs(imm) - 1);
+    OpRegReg(kOpAdd, rl_result.reg, rl_src.reg);
+    NewLIR2(kX86Test64RR, rl_src.reg.GetReg(), rl_src.reg.GetReg());
+    OpCondRegReg(kOpCmov, kCondPl, rl_result.reg, rl_src.reg);
+    int shift_amount = LowestSetBit(imm);
+    OpRegImm(kOpAsr, rl_result.reg, shift_amount);
+    if (imm < 0) {
+      OpReg(kOpNeg, rl_result.reg);
+    }
+    StoreValueWide(rl_dest, rl_result);
+  } else {
+    CHECK(imm <= -2 || imm >= 2);
+
+    FlushReg(rs_r0q);
+    Clobber(rs_r0q);
+    LockTemp(rs_r0q);
+    FlushReg(rs_r2q);
+    Clobber(rs_r2q);
+    LockTemp(rs_r2q);
+
+    RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, rs_r2q, INVALID_SREG, INVALID_SREG};
+
+    // Use H.S.Warren's Hacker's Delight Chapter 10 and
+    // T,Grablund, P.L.Montogomery's Division by invariant integers using multiplication.
+    int64_t magic;
+    int shift;
+    CalculateMagicAndShift(imm, magic, shift, true /* is_long */);
+
+    /*
+     * For imm >= 2,
+     *     int(n/imm) = floor(n/imm) = floor(M*n/2^S), while n > 0
+     *     int(n/imm) = ceil(n/imm) = floor(M*n/2^S) +1, while n < 0.
+     * For imm <= -2,
+     *     int(n/imm) = ceil(n/imm) = floor(M*n/2^S) +1 , while n > 0
+     *     int(n/imm) = floor(n/imm) = floor(M*n/2^S), while n < 0.
+     * We implement this algorithm in the following way:
+     * 1. multiply magic number m and numerator n, get the higher 64bit result in RDX
+     * 2. if imm > 0 and magic < 0, add numerator to RDX
+     *    if imm < 0 and magic > 0, sub numerator from RDX
+     * 3. if S !=0, SAR S bits for RDX
+     * 4. add 1 to RDX if RDX < 0
+     * 5. Thus, RDX is the quotient
+     */
+
+    // Numerator into RAX.
+    RegStorage numerator_reg;
+    if (!is_div || (imm > 0 && magic < 0) || (imm < 0 && magic > 0)) {
+      // We will need the value later.
+      rl_src = LoadValueWide(rl_src, kCoreReg);
+      numerator_reg = rl_src.reg;
+      OpRegCopyWide(rs_r0q, numerator_reg);
+    } else {
+      // Only need this once.  Just put it into RAX.
+      LoadValueDirectWideFixed(rl_src, rs_r0q);
+    }
+
+    // RDX = magic.
+    LoadConstantWide(rs_r2q, magic);
+
+    // RDX:RAX = magic & dividend.
+    NewLIR1(kX86Imul64DaR, rs_r2q.GetReg());
+
+    if (imm > 0 && magic < 0) {
+      // Add numerator to RDX.
+      DCHECK(numerator_reg.Valid());
+      OpRegReg(kOpAdd, rs_r2q, numerator_reg);
+    } else if (imm < 0 && magic > 0) {
+      DCHECK(numerator_reg.Valid());
+      OpRegReg(kOpSub, rs_r2q, numerator_reg);
+    }
+
+    // Do we need the shift?
+    if (shift != 0) {
+      // Shift RDX by 'shift' bits.
+      OpRegImm(kOpAsr, rs_r2q, shift);
+    }
+
+    // Move RDX to RAX.
+    OpRegCopyWide(rs_r0q, rs_r2q);
+
+    // Move sign bit to bit 0, zeroing the rest.
+    OpRegImm(kOpLsr, rs_r2q, 63);
+
+    // RDX = RDX + RAX.
+    OpRegReg(kOpAdd, rs_r2q, rs_r0q);
+
+    // Quotient is in RDX.
+    if (!is_div) {
+      // We need to compute the remainder.
+      // Remainder is divisor - (quotient * imm).
+      DCHECK(numerator_reg.Valid());
+      OpRegCopyWide(rs_r0q, numerator_reg);
+
+      // Imul doesn't support 64-bit imms.
+      if (imm > std::numeric_limits<int32_t>::max() ||
+          imm < std::numeric_limits<int32_t>::min()) {
+        RegStorage rs_temp = AllocTempWide();
+        LoadConstantWide(rs_temp, imm);
+
+        // RAX = numerator * imm.
+        NewLIR2(kX86Imul64RR, rs_r2q.GetReg(), rs_temp.GetReg());
+
+        FreeTemp(rs_temp);
+      } else {
+        // RAX = numerator * imm.
+        int short_imm = static_cast<int>(imm);
+        NewLIR3(kX86Imul64RRI, rs_r2q.GetReg(), rs_r2q.GetReg(), short_imm);
+      }
+
+      // RDX -= RAX.
+      OpRegReg(kOpSub, rs_r0q, rs_r2q);
+
+      // Store result.
+      OpRegCopyWide(rl_result.reg, rs_r0q);
+    } else {
+      // Store result.
+      OpRegCopyWide(rl_result.reg, rs_r2q);
+    }
+    StoreValueWide(rl_dest, rl_result);
+    FreeTemp(rs_r0q);
+    FreeTemp(rs_r2q);
+  }
+}
+
 void X86Mir2Lir::GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
-                           RegLocation rl_src2, bool is_div) {
+                               RegLocation rl_src2, bool is_div) {
   if (!cu_->target64) {
     LOG(FATAL) << "Unexpected use GenDivRemLong()";
     return;
   }
 
+  if (rl_src2.is_const) {
+    DCHECK(rl_src2.wide);
+    int64_t imm = mir_graph_->ConstantValueWide(rl_src2);
+    GenDivRemLongLit(rl_dest, rl_src1, imm, is_div);
+    return;
+  }
+
   // We have to use fixed registers, so flush all the temps.
   FlushAllRegs();
   LockCallTemps();  // Prepare for explicit register usage.
@@ -1734,7 +1927,7 @@
   // RHS is -1.
   LoadConstantWide(rs_r6q, 0x8000000000000000);
   NewLIR2(kX86Cmp64RR, rs_r0q.GetReg(), rs_r6q.GetReg());
-  LIR * minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondNe);
+  LIR *minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondNe);
 
   // In 0x8000000000000000/-1 case.
   if (!is_div) {