Fixes to enable TrackLiveTemps optimization on x86.

- Created new kRegRegStore instruction class for Movdrx, where the
  source is first, and the destination is second.

- Reverted neg_float and neg_double implementation to prevent confusion
  of register types when optimizations are performed.

- Swapped order of loads for wide values to prevent base pointer from
  being clobbered when the base pointer equals the low destination reg.

- Implemented opRegCopyWide for general purpose reg source to floating
  point reg destination and vice versa.

- Added more opcode coverage to x86 disassembler.

Change-Id: I4e58eec91742cc51333003fa5a678ba5b23eb575
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index aeee4f0..bcaba10 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -791,8 +791,7 @@
     // Disable some optimizations on X86 for now
     cUnit->disableOpt |= (
         (1 << kLoadStoreElimination) |
-        (1 << kPromoteRegs) |
-        (1 << kTrackLiveTemps));
+        (1 << kPromoteRegs));
   }
   /* Are we generating code for the debugger? */
   if (compiler.IsDebuggingSupported()) {
diff --git a/src/compiler/codegen/x86/Assemble.cc b/src/compiler/codegen/x86/Assemble.cc
index 7bd5c52..63e4cc3 100644
--- a/src/compiler/codegen/x86/Assemble.cc
+++ b/src/compiler/codegen/x86/Assemble.cc
@@ -280,10 +280,13 @@
   EXT_0F_ENCODING_MAP(Divsd,     0xF2, 0x5E, REG_DEF0),
   EXT_0F_ENCODING_MAP(Divss,     0xF3, 0x5E, REG_DEF0),
 
+  { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
   { kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
 
   EXT_0F_ENCODING_MAP(Movdxr,    0x66, 0x6E, REG_DEF0),
-  EXT_0F_ENCODING_MAP(Movdrx,    0x66, 0x7E, REG_DEF0),
+  { kX86MovdrxRR, kRegRegStore, IS_BINARY_OP | REG_DEF0   | REG_USE01,  { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxRR", "!0r,!1r" },
+  { kX86MovdrxMR, kMemReg,      IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxMR", "[!0r+!1d],!2r" },
+  { kX86MovdrxAR, kArrayReg,    IS_STORE | IS_QUIN_OP     | REG_USE014, { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxAR", "[!0r+!1r<<!2d+!3d],!4r" },
 
   { kX86Set8R, kRegCond,              IS_BINARY_OP   | REG_DEF0  | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0 }, "Set8R", "!1c !0r" },
   { kX86Set8M, kMemCond,   IS_STORE | IS_TERTIARY_OP | REG_USE0  | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0 }, "Set8M", "!2c [!0r+!1d]" },
@@ -375,6 +378,8 @@
       return computeSize(entry, lir->operands[0], false);
     case kRegReg:
       return computeSize(entry, 0, false);
+    case kRegRegStore:
+      return computeSize(entry, 0, false);
     case kRegMem: { // lir operands - 0: reg, 1: base, 2: disp
       int base = lir->operands[1];
       return computeSize(entry, lir->operands[2], false) + (base == rSP ? 1 : 0);
@@ -800,6 +805,9 @@
       DCHECK_EQ(0, entry->skeleton.extra_opcode1);
       DCHECK_EQ(0, entry->skeleton.extra_opcode2);
     }
+    if (FPREG(reg)) {
+      reg = reg & FP_REG_MASK;
+    }
     uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
     cUnit->codeBuffer.push_back(modrm);
   }
@@ -1307,6 +1315,9 @@
       case kRegReg:  // lir operands - 0: reg1, 1: reg2
         emitRegReg(cUnit, entry, lir->operands[0], lir->operands[1]);
         break;
+      case kRegRegStore:  // lir operands - 0: reg2, 1: reg1
+        emitRegReg(cUnit, entry, lir->operands[1], lir->operands[0]);
+        break;
       case kRegRegImm:
         emitRegRegImm(cUnit, entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
diff --git a/src/compiler/codegen/x86/FP/X86FP.cc b/src/compiler/codegen/x86/FP/X86FP.cc
index c00b5fc..f2488d0 100644
--- a/src/compiler/codegen/x86/FP/X86FP.cc
+++ b/src/compiler/codegen/x86/FP/X86FP.cc
@@ -21,6 +21,7 @@
                             RegLocation rlSrc2) {
   X86OpCode op = kX86Nop;
   RegLocation rlResult;
+  int tempReg;
 
   /*
    * Don't attempt to optimize register usage since these opcodes call out to
@@ -44,12 +45,13 @@
       op = kX86MulssRR;
       break;
     case Instruction::NEG_FLOAT:
-      // TODO: Make this nicer. Subtracting the source from 0 doesn't work in
-      // the 0 case, and using FCHS is difficult with register promotion. This
-      // code treats the value as a CoreReg to make it easy to manipulate.
-      rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
-      rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-      opRegRegImm(cUnit, kOpAdd, rlResult.lowReg, rlSrc1.lowReg, 0x80000000);
+      // TODO: Make this an XorpsRM where the memory location holds 0x80000000
+      rlSrc1 = loadValue(cUnit, rlSrc1, kFPReg);
+      rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
+      tempReg = oatAllocTemp(cUnit);
+      loadConstant(cUnit, tempReg, 0x80000000);
+      newLIR2(cUnit, kX86MovdxrRR, rlResult.lowReg, tempReg);
+      newLIR2(cUnit, kX86XorpsRR, rlResult.lowReg, rlSrc1.lowReg);
       storeValue(cUnit, rlDest, rlResult);
       return false;
     case Instruction::REM_FLOAT_2ADDR:
@@ -81,6 +83,7 @@
                              RegLocation rlSrc2) {
   X86OpCode op = kX86Nop;
   RegLocation rlResult;
+  int tempReg;
 
   switch (opcode) {
     case Instruction::ADD_DOUBLE_2ADDR:
@@ -100,13 +103,14 @@
       op = kX86MulsdRR;
       break;
     case Instruction::NEG_DOUBLE:
-      // TODO: Make this nicer. Subtracting the source from 0 doesn't work in
-      // the 0 case, and using FCHS is difficult with register promotion. This
-      // code treats the value as a CoreReg to make it easy to manipulate.
-      rlSrc1 = loadValueWide(cUnit, rlSrc1, kCoreReg);
-      rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-      opRegRegImm(cUnit, kOpAdd, rlResult.highReg, rlSrc1.highReg, 0x80000000);
-      opRegCopy(cUnit, rlResult.lowReg, rlSrc1.lowReg);
+      // TODO: Make this an XorpdRM where the memory location holds 0x8000000000000000
+      rlSrc1 = loadValueWide(cUnit, rlSrc1, kFPReg);
+      rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
+      tempReg = oatAllocTemp(cUnit);
+      loadConstant(cUnit, tempReg, 0x80000000);
+      newLIR2(cUnit, kX86MovdxrRR, rlResult.lowReg, tempReg);
+      newLIR2(cUnit, kX86PsllqRI, rlResult.lowReg, 32);
+      newLIR2(cUnit, kX86XorpsRR, rlResult.lowReg, rlSrc1.lowReg);
       storeValueWide(cUnit, rlDest, rlResult);
       return false;
     case Instruction::REM_DOUBLE_2ADDR:
diff --git a/src/compiler/codegen/x86/X86/Factory.cc b/src/compiler/codegen/x86/X86/Factory.cc
index d60d9de..9721038 100644
--- a/src/compiler/codegen/x86/X86/Factory.cc
+++ b/src/compiler/codegen/x86/X86/Factory.cc
@@ -475,9 +475,15 @@
     if (!pair) {
       load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
     } else {
-      load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
-      load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
-                      displacement + HIWORD_OFFSET);
+      if (rBase == rDest) {
+        load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
+                        displacement + HIWORD_OFFSET);
+        load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
+      } else {
+        load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
+        load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
+                        displacement + HIWORD_OFFSET);
+      }
     }
     if (rBase == rSP) {
       annotateDalvikRegAccess(load, (displacement + (pair ? LOWORD_OFFSET : 0))
@@ -492,10 +498,17 @@
       load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
                      displacement + LOWORD_OFFSET);
     } else {
-      load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
-                     displacement + LOWORD_OFFSET);
-      load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
-                      displacement + HIWORD_OFFSET);
+      if (rBase == rDest) {
+        load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
+                        displacement + HIWORD_OFFSET);
+        load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
+                       displacement + LOWORD_OFFSET);
+      } else {
+        load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
+                       displacement + LOWORD_OFFSET);
+        load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
+                        displacement + HIWORD_OFFSET);
+      }
     }
   }
 
diff --git a/src/compiler/codegen/x86/X86/Gen.cc b/src/compiler/codegen/x86/X86/Gen.cc
index b0b6ba8..adad05b 100644
--- a/src/compiler/codegen/x86/X86/Gen.cc
+++ b/src/compiler/codegen/x86/X86/Gen.cc
@@ -352,13 +352,18 @@
     if (srcFP) {
       opRegCopy(cUnit, S2D(destLo, destHi), S2D(srcLo, srcHi));
     } else {
-      UNIMPLEMENTED(WARNING);
-      newLIR0(cUnit, kX86Bkpt);
+      // TODO: Prevent this from happening in the code. The result is often
+      // unused or could have been loaded more easily from memory.
+      newLIR2(cUnit, kX86MovdxrRR, destLo, srcLo);
+      newLIR2(cUnit, kX86MovdxrRR, destHi, srcHi);
+      newLIR2(cUnit, kX86PsllqRI, destHi, 32);
+      newLIR2(cUnit, kX86OrpsRR, destLo, destHi);
     }
   } else {
     if (srcFP) {
-      UNIMPLEMENTED(WARNING);
-      newLIR0(cUnit, kX86Bkpt);
+      newLIR2(cUnit, kX86MovdrxRR, destLo, srcLo);
+      newLIR2(cUnit, kX86PsrlqRI, srcLo, 32);
+      newLIR2(cUnit, kX86MovdrxRR, destHi, srcLo);
     } else {
       // Handle overlap
       if (srcHi == destLo) {
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index 4c44118..c229844 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -110,7 +110,7 @@
 /* Offset to distingish FP regs */
 #define FP_REG_OFFSET 32
 /* Offset to distinguish DP FP regs */
-#define FP_DOUBLE (FP_REG_OFFSET + 32)
+#define FP_DOUBLE (FP_REG_OFFSET + 16)
 /* Offset to distingish the extra regs */
 #define EXTRA_REG_OFFSET (FP_DOUBLE + 16)
 /* Reg types */
@@ -433,9 +433,10 @@
   Binary0fOpCode(kX86Subss),    // float subtract
   Binary0fOpCode(kX86Divsd),    // double divide
   Binary0fOpCode(kX86Divss),    // float divide
-  kX86PsllqRI,                  // shift of floating point registers
+  kX86PsrlqRI,                  // right shift of floating point registers
+  kX86PsllqRI,                  // left shift of floating point registers
   Binary0fOpCode(kX86Movdxr),   // move into xmm from gpr
-  Binary0fOpCode(kX86Movdrx),   // move into reg from xmm
+  kX86MovdrxRR, kX86MovdrxMR, kX86MovdrxAR,// move into reg from xmm
   kX86Set8R, kX86Set8M, kX86Set8A,// set byte depending on condition operand
   kX86Mfence,                   // memory barrier
   Binary0fOpCode(kX86Imul16),   // 16bit multiply
@@ -470,6 +471,7 @@
   kReg, kMem, kArray,                      // R, M and A instruction kinds.
   kMemReg, kArrayReg, kThreadReg,          // MR, AR and TR instruction kinds.
   kRegReg, kRegMem, kRegArray, kRegThread, // RR, RM, RA and RT instruction kinds.
+  kRegRegStore,                            // RR following the store modrm reg-reg encoding rather than the load.
   kRegImm, kMemImm, kArrayImm, kThreadImm, // RI, MI, AI and TI instruction kinds.
   kRegRegImm, kRegMemImm, kRegArrayImm,    // RRI, RMI and RAI instruction kinds.
   kMovRegImm,                              // Shorter form move RI.
diff --git a/src/disassembler_x86.cc b/src/disassembler_x86.cc
index 4aff822..969feb8 100644
--- a/src/disassembler_x86.cc
+++ b/src/disassembler_x86.cc
@@ -225,10 +225,13 @@
       case 0x10: case 0x11:
         if (prefix[0] == 0xF2) {
           opcode << "movsd";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF3) {
           opcode << "movss";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[2] == 0x66) {
           opcode << "movupd";
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else {
           opcode << "movups";
         }
@@ -237,6 +240,71 @@
         load = *instr == 0x10;
         store = !load;
         break;
+      case 0x2A:
+        if (prefix[2] == 0x66) {
+          opcode << "cvtpi2pd";
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else if (prefix[0] == 0xF2) {
+          opcode << "cvtsi2sd";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else if (prefix[0] == 0xF3) {
+          opcode << "cvtsi2ss";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          opcode << "cvtpi2ps";
+        }
+        load = true;
+        has_modrm = true;
+        dst_reg_file = SSE;
+        break;
+      case 0x2C:
+        if (prefix[2] == 0x66) {
+          opcode << "cvttpd2pi";
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else if (prefix[0] == 0xF2) {
+          opcode << "cvttsd2si";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else if (prefix[0] == 0xF3) {
+          opcode << "cvttss2si";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          opcode << "cvttps2pi";
+        }
+        load = true;
+        has_modrm = true;
+        src_reg_file = SSE;
+        break;
+      case 0x2D:
+        if (prefix[2] == 0x66) {
+          opcode << "cvtpd2pi";
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else if (prefix[0] == 0xF2) {
+          opcode << "cvtsd2si";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else if (prefix[0] == 0xF3) {
+          opcode << "cvtss2si";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          opcode << "cvtps2pi";
+        }
+        load = true;
+        has_modrm = true;
+        src_reg_file = SSE;
+        break;
+      case 0x2E:
+        opcode << "u";
+        // FALLTHROUGH
+      case 0x2F:
+        if (prefix[2] == 0x66) {
+          opcode << "comisd";
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          opcode << "comiss";
+        }
+        has_modrm = true;
+        load = true;
+        src_reg_file = dst_reg_file = SSE;
+        break;
       case 0x38:  // 3 byte extended opcode
         opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
         break;
@@ -264,13 +332,13 @@
         }
         if (prefix[2] == 0x66) {
           opcode << "pd";
-          prefix[2] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF2) {
           opcode << "sd";
-          prefix[0] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF3) {
           opcode << "ss";
-          prefix[0] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else {
           opcode << "ps";
         }
@@ -282,13 +350,13 @@
       case 0x5A:
         if (prefix[2] == 0x66) {
           opcode << "cvtpd2ps";
-          prefix[2] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF2) {
           opcode << "cvtsd2ss";
-          prefix[0] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF3) {
           opcode << "cvtss2sd";
-          prefix[0] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else {
           opcode << "cvtps2pd";
         }
@@ -299,12 +367,12 @@
       case 0x5B:
         if (prefix[2] == 0x66) {
           opcode << "cvtps2dq";
-          prefix[2] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF2) {
           opcode << "bad opcode F2 0F 5B";
         } else if (prefix[0] == 0xF3) {
           opcode << "cvttps2dq";
-          prefix[0] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else {
           opcode << "cvtdq2ps";
         }
@@ -315,12 +383,11 @@
       case 0x6E:
         if (prefix[2] == 0x66) {
           dst_reg_file = SSE;
-          opcode << "movq";
-          prefix[2] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else {
           dst_reg_file = MMX;
-          opcode << "movd";
         }
+        opcode << "movd";
         load = true;
         has_modrm = true;
         break;
@@ -328,10 +395,11 @@
         if (prefix[2] == 0x66) {
           dst_reg_file = SSE;
           opcode << "movdqa";
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF3) {
           dst_reg_file = SSE;
           opcode << "movdqu";
-          prefix[0] = 0;  // clear prefix now its served its purpose as part of the opcode
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else {
           dst_reg_file = MMX;
           opcode << "movq";
@@ -339,6 +407,59 @@
         load = true;
         has_modrm = true;
         break;
+      case 0x71:
+        if (prefix[2] == 0x66) {
+          dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          dst_reg_file = MMX;
+        }
+        static const char* x71_opcodes[] = {"unknown-71", "unknown-71", "psrlw", "unknown-71", "psraw", "unknown-71", "psllw", "unknown-71"};
+        modrm_opcodes = x71_opcodes;
+        reg_is_opcode = true;
+        has_modrm = true;
+        store = true;
+        immediate_bytes = 1;
+        break;
+      case 0x72:
+        if (prefix[2] == 0x66) {
+          dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          dst_reg_file = MMX;
+        }
+        static const char* x72_opcodes[] = {"unknown-72", "unknown-72", "psrld", "unknown-72", "psrad", "unknown-72", "pslld", "unknown-72"};
+        modrm_opcodes = x72_opcodes;
+        reg_is_opcode = true;
+        has_modrm = true;
+        store = true;
+        immediate_bytes = 1;
+        break;
+      case 0x73:
+        if (prefix[2] == 0x66) {
+          dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          dst_reg_file = MMX;
+        }
+        static const char* x73_opcodes[] = {"unknown-73", "unknown-73", "psrlq", "unknown-73", "unknown-73", "unknown-73", "psllq", "unknown-73"};
+        modrm_opcodes = x73_opcodes;
+        reg_is_opcode = true;
+        has_modrm = true;
+        store = true;
+        immediate_bytes = 1;
+        break;
+      case 0x7E:
+        if (prefix[2] == 0x66) {
+          src_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = MMX;
+        }
+        opcode << "movd";
+        has_modrm = true;
+        store = true;
+        break;
       case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87:
       case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F:
         opcode << "j" << condition_codes[*instr & 0xF];
@@ -510,7 +631,7 @@
     }
   }
   if (ax) {
-    args << ", ";
+    // If this opcode implicitly uses ax, ax is always the first arg.
     DumpReg(args, rex, 0 /* EAX */, byte_operand, prefix[2], GPR);
   }
   if (cx) {