Improve x86 long multiply and shifts

Generate inline code for long shifts by constants and do long
multiplication inline. Convert multiplication by a power-of-two
constant to a shift when we can. Fix some x86 assembler problems and
add the new instructions that were needed (SHLD/SHRD for the 64-bit
shifts).

Change-Id: I6237a31c36159096e399d40d01eb6bfa22ac2772
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
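
For reference, the inline long multiply emitted by GenMulLong below uses
the standard 32x32 decomposition: the hi*hi product only affects bits 64
and up, so it is dropped, and only the low 32 bits of each cross product
survive. A minimal standalone C++ sketch of that identity (illustration
only, not part of the patch; the function name is made up):

#include <cassert>
#include <cstdint>

// Illustration only: the identity behind the inline GenMulLong sequence.
uint64_t MulLongDecomposed(uint64_t a, uint64_t b) {
  uint32_t a_lo = static_cast<uint32_t>(a);
  uint32_t a_hi = static_cast<uint32_t>(a >> 32);
  uint32_t b_lo = static_cast<uint32_t>(b);
  uint32_t b_hi = static_cast<uint32_t>(b >> 32);
  uint32_t cross = a_hi * b_lo + a_lo * b_hi;         // ECX in the emitted code.
  uint64_t low = static_cast<uint64_t>(a_lo) * b_lo;  // EDX:EAX via one-operand MUL.
  return low + (static_cast<uint64_t>(cross) << 32);  // Add cross terms into EDX.
}

int main() {
  uint64_t a = 0x123456789ABCDEF0ULL, b = 0x0FEDCBA987654321ULL;
  assert(MulLongDecomposed(a, b) == a * b);
  return 0;
}
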
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 3668dc0..32673db 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -192,7 +192,6 @@
     MIR* SpecialIPut(BasicBlock** bb, MIR* mir, OpSize size, bool long_or_double, bool is_object);
     MIR* SpecialIdentity(MIR* mir);
     LIR* LoadFPConstantValue(int r_dest, int value);
-    bool BadOverlap(RegLocation rl_src, RegLocation rl_dest);
     void ReplaceFixup(LIR* prev_lir, LIR* orig_lir, LIR* new_lir);
     void InsertFixupBefore(LIR* prev_lir, LIR* orig_lir, LIR* new_lir);
     void AssignDataOffsets();
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 71c3492..150794e 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -794,18 +794,6 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
-
- /*
-  * Check to see if a result pair has a misaligned overlap with an operand pair.  This
-  * is not usual for dx to generate, but it is legal (for now).  In a future rev of
-  * dex, we'll want to make this case illegal.
-  */
-bool ArmMir2Lir::BadOverlap(RegLocation rl_src, RegLocation rl_dest) {
-  DCHECK(rl_src.wide);
-  DCHECK(rl_dest.wide);
-  return (abs(mir_graph_->SRegToVReg(rl_src.s_reg_low) - mir_graph_->SRegToVReg(rl_dest.s_reg_low)) == 1);
-}
-
 void ArmMir2Lir::GenMulLong(Instruction::Code opcode, RegLocation rl_dest,
                             RegLocation rl_src1, RegLocation rl_src2) {
     /*
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 12ecfff..1eb79c9 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1137,4 +1137,28 @@
   new_lir->next->prev = new_lir;
 }
 
+bool Mir2Lir::IsPowerOfTwo(uint64_t x) {
+  return (x & (x - 1)) == 0;
+}
+
+// Returns the index of the lowest set bit in 'x'; 'x' must be non-zero.
+int32_t Mir2Lir::LowestSetBit(uint64_t x) {
+  int bit_posn = 0;
+  while ((x & 0xf) == 0) {
+    bit_posn += 4;
+    x >>= 4;
+  }
+  while ((x & 1) == 0) {
+    bit_posn++;
+    x >>= 1;
+  }
+  return bit_posn;
+}
+
+bool Mir2Lir::BadOverlap(RegLocation rl_src, RegLocation rl_dest) {
+  DCHECK(rl_src.wide);
+  DCHECK(rl_dest.wide);
+  return (abs(mir_graph_->SRegToVReg(rl_src.s_reg_low) - mir_graph_->SRegToVReg(rl_dest.s_reg_low)) == 1);
+}
+
 }  // namespace art
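
The two helpers above come with preconditions worth spelling out:
IsPowerOfTwo(0) reports true, and LowestSetBit would loop forever on
zero, so GenMulLong screens out val == 0 before calling either. A
standalone harness (illustration only; the main() is made up) that
exercises the same code:

#include <cassert>
#include <cstdint>

static bool IsPowerOfTwo(uint64_t x) { return (x & (x - 1)) == 0; }

static int32_t LowestSetBit(uint64_t x) {
  int bit_posn = 0;
  while ((x & 0xf) == 0) {  // Skip four zero bits at a time.
    bit_posn += 4;
    x >>= 4;
  }
  while ((x & 1) == 0) {    // Then walk off the last 0-3 zero bits.
    bit_posn++;
    x >>= 1;
  }
  return bit_posn;
}

int main() {
  assert(IsPowerOfTwo(0x100000000ULL) && LowestSetBit(0x100000000ULL) == 32);
  assert(!IsPowerOfTwo(6) && LowestSetBit(6) == 1);
  return 0;
}
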
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 1f00b2a..d8b9869 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1426,30 +1426,12 @@
  * or produce corresponding Thumb instructions directly.
  */
 
-static bool IsPowerOfTwo(int x) {
-  return (x & (x - 1)) == 0;
-}
-
 // Returns true if no more than two bits are set in 'x'.
 static bool IsPopCountLE2(unsigned int x) {
   x &= x - 1;
   return (x & (x - 1)) == 0;
 }
 
-// Returns the index of the lowest set bit in 'x'.
-static int32_t LowestSetBit(uint32_t x) {
-  int bit_posn = 0;
-  while ((x & 0xf) == 0) {
-    bit_posn += 4;
-    x >>= 4;
-  }
-  while ((x & 1) == 0) {
-    bit_posn++;
-    x >>= 1;
-  }
-  return bit_posn;
-}
-
 // Returns true if it added instructions to 'cu' to divide 'rl_src' by 'lit'
 // and store the result in 'rl_dest'.
 bool Mir2Lir::HandleEasyDivRem(Instruction::Code dalvik_opcode, bool is_div,
@@ -1741,7 +1723,7 @@
       break;
     case Instruction::MUL_LONG:
     case Instruction::MUL_LONG_2ADDR:
-      if (cu_->instruction_set == kThumb2) {
+      if (cu_->instruction_set != kMips) {
         GenMulLong(opcode, rl_dest, rl_src1, rl_src2);
         return;
       } else {
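
The surviving IsPopCountLE2 above relies on the same x &= (x - 1) trick
(clear the lowest set bit) as the relocated helpers: clear one bit, then
a power-of-two test decides whether at most one more remains. A small
check (illustration only; the harness is made up):

#include <cassert>

static bool IsPopCountLE2(unsigned int x) {
  x &= x - 1;                  // Clear the lowest set bit.
  return (x & (x - 1)) == 0;   // At most one bit may remain.
}

int main() {
  assert(IsPopCountLE2(0) && IsPopCountLE2(8) && IsPopCountLE2(0x90));
  assert(!IsPopCountLE2(0x70));  // Three set bits.
  return 0;
}
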
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index b59ec5e..b54bc7b 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -873,6 +873,29 @@
     CompilationUnit* GetCompilationUnit() {
       return cu_;
     }
+    /*
+     * @brief Returns the index of the lowest set bit in 'x' ('x' must be non-zero).
+     * @param x Value to be examined.
+     * @returns The bit number of the lowest bit set in the value.
+     */
+    int32_t LowestSetBit(uint64_t x);
+    /*
+     * @brief Is this value a power of two?
+     * @param x Value to be examined.
+     * @returns 'true' if only 1 bit is set in the value.
+     */
+    bool IsPowerOfTwo(uint64_t x);
+    /*
+     * @brief Do these SRs overlap?
+     * @param rl_op1 One RegLocation
+     * @param rl_op2 The other RegLocation
+     * @return 'true' if the VR pairs overlap
+     *
+     * Check to see if a result pair has a misaligned overlap with an operand pair.  This
+     * is not usual for dx to generate, but it is legal (for now).  In a future rev of
+     * dex, we'll want to make this case illegal.
+     */
+    bool BadOverlap(RegLocation rl_op1, RegLocation rl_op2);
 
     /*
      * @brief Force a location (in a register) into a temporary register
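
The misaligned overlap BadOverlap detects is easiest to see with
concrete VR numbers: wide values occupy adjacent VR pairs, and pairs
offset by exactly one VR alias in a single word, e.g. a source in
{v1, v2} and a destination in {v2, v3}. A hypothetical sketch of the
check (not the compiler's code; names are made up):

#include <cassert>
#include <cstdlib>

// Pairs offset by exactly one VR share one word, so writing the
// destination low word would clobber the source high word.
static bool MisalignedPair(int src_low_vr, int dest_low_vr) {
  return std::abs(src_low_vr - dest_low_vr) == 1;
}

int main() {
  assert(MisalignedPair(1, 2));   // {v1,v2} vs {v2,v3}: alias in v2.
  assert(!MisalignedPair(1, 1));  // Identical pairs are fine (2ADDR form).
  assert(!MisalignedPair(1, 3));  // Disjoint pairs are fine.
  return 0;
}
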
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 5e1c4d1..6d092d6 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -211,6 +211,8 @@
 #undef SHIFT_ENCODING_MAP
 
   { kX86Cmc, kNullary, NO_OPERAND, { 0, 0, 0xF5, 0, 0, 0, 0, 0}, "Cmc", "" },
+  { kX86Shld32RRI,  kRegRegImmRev, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES, { 0,    0, 0x0F, 0xA4, 0, 0, 0, 1}, "Shld32", "!0r,!1r,!2d" },
+  { kX86Shrd32RRI,  kRegRegImmRev, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES, { 0,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd32", "!0r,!1r,!2d" },
 
   { kX86Test8RI,  kRegImm,             IS_BINARY_OP   | REG_USE0  | SETS_CCODES, { 0,    0, 0xF6, 0, 0, 0, 0, 1}, "Test8RI", "!0r,!1d" },
   { kX86Test8MI,  kMemImm,   IS_LOAD | IS_TERTIARY_OP | REG_USE0  | SETS_CCODES, { 0,    0, 0xF6, 0, 0, 0, 0, 1}, "Test8MI", "[!0r+!1d],!2d" },
@@ -422,6 +424,7 @@
     case kThreadImm:  // lir operands - 0: disp, 1: imm
       return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
     case kRegRegImm:  // lir operands - 0: reg, 1: reg, 2: imm
+    case kRegRegImmRev:  // lir operands - 0: reg1, 1: reg2, 2: imm
       return ComputeSize(entry, 0, 0, false);
     case kRegMemImm:  // lir operands - 0: reg, 1: base, 2: disp, 3: imm
       return ComputeSize(entry, lir->operands[1], lir->operands[2], false);
@@ -642,7 +645,6 @@
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
   DCHECK_EQ(0, entry->skeleton.extra_opcode2);
-  DCHECK_NE(rX86_SP, base);
   EmitModrmDisp(entry->skeleton.modrm_opcode, base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
@@ -755,6 +757,22 @@
   EmitImm(entry, imm);
 }
 
+void X86Mir2Lir::EmitRegRegImmRev(const X86EncodingMap* entry,
+                                  uint8_t reg1, uint8_t reg2, int32_t imm) {
+  EmitRegRegImm(entry, reg2, reg1, imm);
+}
+
+void X86Mir2Lir::EmitRegMemImm(const X86EncodingMap* entry,
+                               uint8_t reg, uint8_t base, int disp, int32_t imm) {
+  EmitPrefixAndOpcode(entry);
+  DCHECK(!X86_FPREG(reg));
+  DCHECK_LT(reg, 8);
+  EmitModrmDisp(reg, base, disp);
+  DCHECK_EQ(0, entry->skeleton.modrm_opcode);
+  DCHECK_EQ(0, entry->skeleton.ax_opcode);
+  EmitImm(entry, imm);
+}
+
 void X86Mir2Lir::EmitRegImm(const X86EncodingMap* entry, uint8_t reg, int imm) {
   if (entry->skeleton.prefix1 != 0) {
     code_buffer_.push_back(entry->skeleton.prefix1);
@@ -1186,9 +1204,16 @@
       case kRegRegStore:  // lir operands - 0: reg2, 1: reg1
         EmitRegReg(entry, lir->operands[1], lir->operands[0]);
         break;
+      case kRegRegImmRev:
+        EmitRegRegImmRev(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
+        break;
       case kRegRegImm:
         EmitRegRegImm(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
+      case kRegMemImm:
+        EmitRegMemImm(entry, lir->operands[0], lir->operands[1], lir->operands[2],
+                      lir->operands[3]);
+        break;
       case kRegImm:  // lir operands - 0: reg, 1: immediate
         EmitRegImm(entry, lir->operands[0], lir->operands[1]);
         break;
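
The two new encodings are the x86 double-precision shifts. Their
semantics for shift counts 0 < n < 32, and how GenShiftImmOpLong pairs
them with a plain 32-bit shift to move a 64-bit value held in two
registers, can be modeled as follows (illustration only; the helper
names are made up):

#include <cassert>
#include <cstdint>

// SHLD shifts dst left, filling vacated low bits from the top of src;
// SHRD shifts dst right, filling vacated high bits from the bottom of src.
static uint32_t Shld(uint32_t dst, uint32_t src, int n) {
  return (dst << n) | (src >> (32 - n));
}
static uint32_t Shrd(uint32_t dst, uint32_t src, int n) {
  return (dst >> n) | (src << (32 - n));
}

int main() {
  uint64_t v = 0x0123456789ABCDEFULL;
  uint32_t lo = static_cast<uint32_t>(v);
  uint32_t hi = static_cast<uint32_t>(v >> 32);
  // 64-bit left shift by 5: high = SHLD(high, low, 5); low <<= 5.
  uint64_t shl = (static_cast<uint64_t>(Shld(hi, lo, 5)) << 32) | (lo << 5);
  assert(shl == v << 5);
  // 64-bit logical right shift by 5: low = SHRD(low, high, 5); high >>= 5.
  uint64_t shr = (static_cast<uint64_t>(hi >> 5) << 32) | Shrd(lo, hi, 5);
  assert(shr == v >> 5);
  return 0;
}
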
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 9cc4efd..6280b64 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -245,6 +245,8 @@
     void EmitRegThread(const X86EncodingMap* entry, uint8_t reg, int disp);
     void EmitRegReg(const X86EncodingMap* entry, uint8_t reg1, uint8_t reg2);
     void EmitRegRegImm(const X86EncodingMap* entry, uint8_t reg1, uint8_t reg2, int32_t imm);
+    void EmitRegRegImmRev(const X86EncodingMap* entry, uint8_t reg1, uint8_t reg2, int32_t imm);
+    void EmitRegMemImm(const X86EncodingMap* entry, uint8_t reg1, uint8_t base, int disp, int32_t imm);
     void EmitRegImm(const X86EncodingMap* entry, uint8_t reg, int imm);
     void EmitThreadImm(const X86EncodingMap* entry, int disp, int imm);
     void EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int imm);
@@ -337,6 +339,32 @@
      * @param is_div 'true' if this is a division, 'false' for a remainder.
      */
     RegLocation GenDivRemLit(RegLocation rl_dest, RegLocation rl_src, int lit, bool is_div);
+
+    /*
+     * Generate code to implement long shift operations.
+     * @param opcode The DEX opcode to specify the shift type.
+     * @param rl_dest The destination.
+     * @param rl_src The value to be shifted.
+     * @param shift_amount How much to shift.
+     * @returns the RegLocation of the result.
+     */
+    RegLocation GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
+                                  RegLocation rl_src, int shift_amount);
+    /*
+     * Generate an imul of a register by a constant, or a cheaper sequence when possible.
+     * @param dest Destination Register.
+     * @param src Source Register.
+     * @param val Constant multiplier.
+     */
+    void GenImulRegImm(int dest, int src, int val);
+    /*
+     * Generate an imul of a memory location by a constant, or a cheaper sequence when possible.
+     * @param dest Destination Register.
+     * @param sreg Symbolic register.
+     * @param displacement Stack displacement of the symbolic register.
+     * @param val Constant multiplier.
+     */
+    void GenImulMemImm(int dest, int sreg, int displacement, int val);
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index ccae130..fc60eb9 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -755,9 +755,188 @@
   return NULL;
 }
 
+void X86Mir2Lir::GenImulRegImm(int dest, int src, int val) {
+  switch (val) {
+    case 0:
+      NewLIR2(kX86Xor32RR, dest, dest);
+      break;
+    case 1:
+      OpRegCopy(dest, src);
+      break;
+    default:
+      OpRegRegImm(kOpMul, dest, src, val);
+      break;
+  }
+}
+
+void X86Mir2Lir::GenImulMemImm(int dest, int sreg, int displacement, int val) {
+  LIR *m;
+  switch (val) {
+    case 0:
+      NewLIR2(kX86Xor32RR, dest, dest);
+      break;
+    case 1:
+      LoadBaseDisp(rX86_SP, displacement, dest, kWord, sreg);
+      break;
+    default:
+      m = NewLIR4(IS_SIMM8(val) ? kX86Imul32RMI8 : kX86Imul32RMI, dest, rX86_SP,
+                  displacement, val);
+      AnnotateDalvikRegAccess(m, displacement >> 2, true /* is_load */, true /* is_64bit */);
+      break;
+  }
+}
+
 void X86Mir2Lir::GenMulLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
                             RegLocation rl_src2) {
-  LOG(FATAL) << "Unexpected use of GenX86Long for x86";
+  if (rl_src1.is_const) {
+    std::swap(rl_src1, rl_src2);
+  }
+  // Are we multiplying by a constant?
+  if (rl_src2.is_const) {
+    // Do special compare/branch against simple const operand
+    int64_t val = mir_graph_->ConstantValueWide(rl_src2);
+    if (val == 0) {
+      RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+      OpRegReg(kOpXor, rl_result.low_reg, rl_result.low_reg);
+      OpRegReg(kOpXor, rl_result.high_reg, rl_result.high_reg);
+      StoreValueWide(rl_dest, rl_result);
+      return;
+    } else if (val == 1) {
+      rl_src1 = EvalLocWide(rl_src1, kCoreReg, true);
+      StoreValueWide(rl_dest, rl_src1);
+      return;
+    } else if (val == 2) {
+      GenAddLong(Instruction::ADD_LONG, rl_dest, rl_src1, rl_src1);
+      return;
+    } else if (IsPowerOfTwo(val)) {
+      int shift_amount = LowestSetBit(val);
+      if (!BadOverlap(rl_src1, rl_dest)) {
+        rl_src1 = LoadValueWide(rl_src1, kCoreReg);
+        RegLocation rl_result = GenShiftImmOpLong(Instruction::SHL_LONG, rl_dest,
+                                                  rl_src1, shift_amount);
+        StoreValueWide(rl_dest, rl_result);
+        return;
+      }
+    }
+
+    // Okay, just bite the bullet and do it.
+    int32_t val_lo = Low32Bits(val);
+    int32_t val_hi = High32Bits(val);
+    FlushAllRegs();
+    LockCallTemps();  // Prepare for explicit register usage.
+    rl_src1 = UpdateLocWide(rl_src1);
+    bool src1_in_reg = rl_src1.location == kLocPhysReg;
+    int displacement = SRegOffset(rl_src1.s_reg_low);
+
+    // ECX <- 1H * 2L
+    // EAX <- 1L * 2H
+    if (src1_in_reg) {
+      GenImulRegImm(r1, rl_src1.high_reg, val_lo);
+      GenImulRegImm(r0, rl_src1.low_reg, val_hi);
+    } else {
+      GenImulMemImm(r1, GetSRegHi(rl_src1.s_reg_low), displacement + HIWORD_OFFSET, val_lo);
+      GenImulMemImm(r0, rl_src1.s_reg_low, displacement + LOWORD_OFFSET, val_hi);
+    }
+
+    // ECX <- ECX + EAX  (2H * 1L) + (1H * 2L)
+    NewLIR2(kX86Add32RR, r1, r0);
+
+    // EAX <- 2L
+    LoadConstantNoClobber(r0, val_lo);
+
+    // EDX:EAX <- 2L * 1L (double precision)
+    if (src1_in_reg) {
+      NewLIR1(kX86Mul32DaR, rl_src1.low_reg);
+    } else {
+      LIR *m = NewLIR2(kX86Mul32DaM, rX86_SP, displacement + LOWORD_OFFSET);
+      AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
+                              true /* is_load */, true /* is_64bit */);
+    }
+
+    // EDX <- EDX + ECX (add high words)
+    NewLIR2(kX86Add32RR, r2, r1);
+
+    // Result is EDX:EAX
+    RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r2,
+                             INVALID_SREG, INVALID_SREG};
+    StoreValueWide(rl_dest, rl_result);
+    return;
+  }
+
+  // Nope.  Do it the hard way
+  FlushAllRegs();
+  LockCallTemps();  // Prepare for explicit register usage.
+  rl_src1 = UpdateLocWide(rl_src1);
+  rl_src2 = UpdateLocWide(rl_src2);
+
+  // At this point, the VRs are in their home locations.
+  bool src1_in_reg = rl_src1.location == kLocPhysReg;
+  bool src2_in_reg = rl_src2.location == kLocPhysReg;
+
+  // ECX <- 1H
+  if (src1_in_reg) {
+    NewLIR2(kX86Mov32RR, r1, rl_src1.high_reg);
+  } else {
+    LoadBaseDisp(rX86_SP, SRegOffset(rl_src1.s_reg_low) + HIWORD_OFFSET, r1,
+                 kWord, GetSRegHi(rl_src1.s_reg_low));
+  }
+
+  // EAX <- 2H
+  if (src2_in_reg) {
+    NewLIR2(kX86Mov32RR, r0, rl_src2.high_reg);
+  } else {
+    LoadBaseDisp(rX86_SP, SRegOffset(rl_src2.s_reg_low) + HIWORD_OFFSET, r0,
+                 kWord, GetSRegHi(rl_src2.s_reg_low));
+  }
+
+  // EAX <- EAX * 1L  (2H * 1L)
+  if (src1_in_reg) {
+    NewLIR2(kX86Imul32RR, r0, rl_src1.low_reg);
+  } else {
+    int displacement = SRegOffset(rl_src1.s_reg_low);
+    LIR *m = NewLIR3(kX86Imul32RM, r0, rX86_SP, displacement + LOWORD_OFFSET);
+    AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
+                            true /* is_load */, true /* is_64bit */);
+  }
+
+  // ECX <- ECX * 2L  (1H * 2L)
+  if (src2_in_reg) {
+    NewLIR2(kX86Imul32RR, r1, rl_src2.low_reg);
+  } else {
+    int displacement = SRegOffset(rl_src2.s_reg_low);
+    LIR *m = NewLIR3(kX86Imul32RM, r1, rX86_SP, displacement + LOWORD_OFFSET);
+    AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
+                            true /* is_load */, true /* is_64bit */);
+  }
+
+  // ECX <- ECX + EAX  (2H * 1L) + (1H * 2L)
+  NewLIR2(kX86Add32RR, r1, r0);
+
+  // EAX <- 2L
+  if (src2_in_reg) {
+    NewLIR2(kX86Mov32RR, r0, rl_src2.low_reg);
+  } else {
+    LoadBaseDisp(rX86_SP, SRegOffset(rl_src2.s_reg_low) + LOWORD_OFFSET, r0,
+                 kWord, rl_src2.s_reg_low);
+  }
+
+  // EDX:EAX <- 2L * 1L (double precision)
+  if (src1_in_reg) {
+    NewLIR1(kX86Mul32DaR, rl_src1.low_reg);
+  } else {
+    int displacement = SRegOffset(rl_src1.s_reg_low);
+    LIR *m = NewLIR2(kX86Mul32DaM, rX86_SP, displacement + LOWORD_OFFSET);
+    AnnotateDalvikRegAccess(m, (displacement + LOWORD_OFFSET) >> 2,
+                            true /* is_load */, true /* is_64bit */);
+  }
+
+  // EDX <- EDX + ECX (add high words)
+  NewLIR2(kX86Add32RR, r2, r1);
+
+  // Result is EDX:EAX
+  RegLocation rl_result = {kLocPhysReg, 1, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed, r0, r2,
+                           INVALID_SREG, INVALID_SREG};
+  StoreValueWide(rl_dest, rl_result);
 }
 
 void X86Mir2Lir::GenLongRegOrMemOp(RegLocation rl_dest, RegLocation rl_src,
@@ -1057,10 +1236,89 @@
   }
 }
 
+RegLocation X86Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
+                                          RegLocation rl_src, int shift_amount) {
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  switch (opcode) {
+    case Instruction::SHL_LONG:
+    case Instruction::SHL_LONG_2ADDR:
+      DCHECK_NE(shift_amount, 1);  // Shift by 1 goes through GenAddLong to avoid a double store.
+      if (shift_amount == 32) {
+        OpRegCopy(rl_result.high_reg, rl_src.low_reg);
+        LoadConstant(rl_result.low_reg, 0);
+      } else if (shift_amount > 31) {
+        OpRegCopy(rl_result.high_reg, rl_src.low_reg);
+        FreeTemp(rl_src.high_reg);
+        NewLIR2(kX86Sal32RI, rl_result.high_reg, shift_amount - 32);
+        LoadConstant(rl_result.low_reg, 0);
+      } else {
+        OpRegCopy(rl_result.low_reg, rl_src.low_reg);
+        OpRegCopy(rl_result.high_reg, rl_src.high_reg);
+        NewLIR3(kX86Shld32RRI, rl_result.high_reg, rl_result.low_reg, shift_amount);
+        NewLIR2(kX86Sal32RI, rl_result.low_reg, shift_amount);
+      }
+      break;
+    case Instruction::SHR_LONG:
+    case Instruction::SHR_LONG_2ADDR:
+      if (shift_amount == 32) {
+        OpRegCopy(rl_result.low_reg, rl_src.high_reg);
+        OpRegCopy(rl_result.high_reg, rl_src.high_reg);
+        NewLIR2(kX86Sar32RI, rl_result.high_reg, 31);
+      } else if (shift_amount > 31) {
+        OpRegCopy(rl_result.low_reg, rl_src.high_reg);
+        OpRegCopy(rl_result.high_reg, rl_src.high_reg);
+        NewLIR2(kX86Sar32RI, rl_result.low_reg, shift_amount - 32);
+        NewLIR2(kX86Sar32RI, rl_result.high_reg, 31);
+      } else {
+        OpRegCopy(rl_result.low_reg, rl_src.low_reg);
+        OpRegCopy(rl_result.high_reg, rl_src.high_reg);
+        NewLIR3(kX86Shrd32RRI, rl_result.low_reg, rl_result.high_reg, shift_amount);
+        NewLIR2(kX86Sar32RI, rl_result.high_reg, shift_amount);
+      }
+      break;
+    case Instruction::USHR_LONG:
+    case Instruction::USHR_LONG_2ADDR:
+      if (shift_amount == 32) {
+        OpRegCopy(rl_result.low_reg, rl_src.high_reg);
+        LoadConstant(rl_result.high_reg, 0);
+      } else if (shift_amount > 31) {
+        OpRegCopy(rl_result.low_reg, rl_src.high_reg);
+        NewLIR2(kX86Shr32RI, rl_result.low_reg, shift_amount - 32);
+        LoadConstant(rl_result.high_reg, 0);
+      } else {
+        OpRegCopy(rl_result.low_reg, rl_src.low_reg);
+        OpRegCopy(rl_result.high_reg, rl_src.high_reg);
+        NewLIR3(kX86Shrd32RRI, rl_result.low_reg, rl_result.high_reg, shift_amount);
+        NewLIR2(kX86Shr32RI, rl_result.high_reg, shift_amount);
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unexpected case";
+  }
+  return rl_result;
+}
+
 void X86Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
-                                   RegLocation rl_src1, RegLocation rl_shift) {
-  // Default implementation is just to ignore the constant case.
-  GenShiftOpLong(opcode, rl_dest, rl_src1, rl_shift);
+                                   RegLocation rl_src, RegLocation rl_shift) {
+  // Per spec, we only care about the low 6 bits of the shift amount.
+  int shift_amount = mir_graph_->ConstantValue(rl_shift) & 0x3f;
+  if (shift_amount == 0) {
+    rl_src = LoadValueWide(rl_src, kCoreReg);
+    StoreValueWide(rl_dest, rl_src);
+    return;
+  } else if (shift_amount == 1 &&
+            (opcode == Instruction::SHL_LONG || opcode == Instruction::SHL_LONG_2ADDR)) {
+    // Need to handle this here to avoid calling StoreValueWide twice.
+    GenAddLong(Instruction::ADD_LONG, rl_dest, rl_src, rl_src);
+    return;
+  }
+  if (BadOverlap(rl_src, rl_dest)) {
+    GenShiftOpLong(opcode, rl_dest, rl_src, rl_shift);
+    return;
+  }
+  rl_src = LoadValueWide(rl_src, kCoreReg);
+  RegLocation rl_result = GenShiftImmOpLong(opcode, rl_dest, rl_src, shift_amount);
+  StoreValueWide(rl_dest, rl_result);
 }
 
 void X86Mir2Lir::GenArithImmOpLong(Instruction::Code opcode,
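
The three constant-shift cases above (exactly 32, more than 31, less
than 32) keep every emitted 32-bit shift count within 0..31, matching
what the hardware masks to. A reference model of the USHR flavor
(illustration only; SHL and SHR mirror it with SHLD/SAL and an SAR
sign fill):

#include <cassert>
#include <cstdint>

static uint64_t UshrLong(uint64_t v, int n) {
  uint32_t lo = static_cast<uint32_t>(v);
  uint32_t hi = static_cast<uint32_t>(v >> 32);
  uint32_t r_lo, r_hi;
  if (n == 32) {        // Pure word move: low <- high, high <- 0.
    r_lo = hi;
    r_hi = 0;
  } else if (n > 31) {  // Word move, then a 32-bit SHR of the remainder.
    r_lo = hi >> (n - 32);
    r_hi = 0;
  } else {              // SHRD funnels high bits into the low word.
    r_lo = (lo >> n) | (hi << (32 - n));
    r_hi = hi >> n;
  }
  return (static_cast<uint64_t>(r_hi) << 32) | r_lo;
}

int main() {
  uint64_t v = 0xDEADBEEFCAFEF00DULL;
  for (int n = 1; n < 64; ++n) {
    assert(UshrLong(v, n) == v >> n);
  }
  return 0;
}
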
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index d7f61fc..3a12038 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -304,6 +304,8 @@
   BinaryShiftOpCode(kX86Sar),
 #undef BinaryShiftOpcode
   kX86Cmc,
+  kX86Shld32RRI,
+  kX86Shrd32RRI,
 #define UnaryOpcode(opcode, reg, mem, array) \
   opcode ## 8 ## reg, opcode ## 8 ## mem, opcode ## 8 ## array, \
   opcode ## 16 ## reg, opcode ## 16 ## mem, opcode ## 16 ## array, \
@@ -398,6 +400,7 @@
   kRegImm, kMemImm, kArrayImm, kThreadImm,  // RI, MI, AI and TI instruction kinds.
   kRegRegImm, kRegMemImm, kRegArrayImm,    // RRI, RMI and RAI instruction kinds.
   kMovRegImm,                              // Shorter form move RI.
+  kRegRegImmRev,                           // RRI with the first reg encoded in r/m (SHLD/SHRD).
   kShiftRegImm, kShiftMemImm, kShiftArrayImm,  // Shift opcode with immediate.
   kShiftRegCl, kShiftMemCl, kShiftArrayCl,     // Shift opcode with register CL.
   kRegRegReg, kRegRegMem, kRegRegArray,    // RRR, RRM, RRA instruction kinds.