Enable load store elimination on x86.

Includes a fix to prevent stores from being sunk between cmp and jcc
ops. Also fixes neg-float/double when the source and destination are the
same register. All optimizations are now enabled by default on x86.

Change-Id: Ie6f1a3a5ba94fd1b5298df87779d70d9868e8baa
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index b68233b..7585b77 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -797,10 +797,6 @@
     cUnit->disableOpt |= ~(1 << kSafeOptimizations);
   }
 #endif
-  if (cUnit->instructionSet == kX86) {
-    // Disable some optimizations on X86 for now
-    cUnit->disableOpt |= (1 << kLoadStoreElimination);
-  }
   /* Are we generating code for the debugger? */
   if (compiler.IsDebuggingSupported()) {
     cUnit->genDebugger = true;
diff --git a/src/compiler/codegen/LocalOptimizations.cc b/src/compiler/codegen/LocalOptimizations.cc
index faab3e0..2fc7ae0 100644
--- a/src/compiler/codegen/LocalOptimizations.cc
+++ b/src/compiler/codegen/LocalOptimizations.cc
@@ -226,6 +226,15 @@
       }
 
       if (stopHere == true) {
+#if defined(TARGET_X86)
+        // Prevent stores from being sunk between ops that generate ccodes and
+        // ops that use them.
+        int flags = EncodingMap[checkLIR->opcode].flags;
+        if (sinkDistance > 0 && (flags & IS_BRANCH) && (flags & USES_CCODES)) {
+          checkLIR = PREV_LIR(checkLIR);
+          sinkDistance--;
+        }
+#endif
         DEBUG_OPT(dumpDependentInsnPair(thisLIR, checkLIR, "REG CLOBBERED"));
         /* Only sink store instructions */
         if (sinkDistance && !isThisLIRLoad) {
diff --git a/src/compiler/codegen/x86/FP/X86FP.cc b/src/compiler/codegen/x86/FP/X86FP.cc
index 4e56e1d..6003465 100644
--- a/src/compiler/codegen/x86/FP/X86FP.cc
+++ b/src/compiler/codegen/x86/FP/X86FP.cc
@@ -44,16 +44,23 @@
     case Instruction::MUL_FLOAT:
       op = kX86MulssRR;
       break;
-    case Instruction::NEG_FLOAT:
+    case Instruction::NEG_FLOAT: {
       // TODO: Make this an XorpsRM where the memory location holds 0x80000000
       rlSrc1 = loadValue(cUnit, rlSrc1, kFPReg);
       rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
       tempReg = oatAllocTemp(cUnit);
       loadConstant(cUnit, tempReg, 0x80000000);
-      newLIR2(cUnit, kX86MovdxrRR, rlResult.lowReg, tempReg);
-      newLIR2(cUnit, kX86XorpsRR, rlResult.lowReg, rlSrc1.lowReg);
+      int rDest = rlResult.lowReg;
+      int rSrc1 = rlSrc1.lowReg;
+      if (rDest == rSrc1) {
+        rSrc1 = oatAllocTempFloat(cUnit);
+        opRegCopy(cUnit, rSrc1, rDest);
+      }
+      newLIR2(cUnit, kX86MovdxrRR, rDest, tempReg);
+      newLIR2(cUnit, kX86XorpsRR, rDest, rSrc1);
       storeValue(cUnit, rlDest, rlResult);
       return false;
+    }
     case Instruction::REM_FLOAT_2ADDR:
     case Instruction::REM_FLOAT: {
       return genArithOpFloatPortable(cUnit, opcode, rlDest, rlSrc1, rlSrc2);
@@ -67,7 +74,7 @@
   int rDest = rlResult.lowReg;
   int rSrc1 = rlSrc1.lowReg;
   int rSrc2 = rlSrc2.lowReg;
-  if (rSrc2 == rDest) {
+  if (rDest == rSrc2) {
     rSrc2 = oatAllocTempFloat(cUnit);
     opRegCopy(cUnit, rSrc2, rDest);
   }
@@ -102,17 +109,24 @@
     case Instruction::MUL_DOUBLE:
       op = kX86MulsdRR;
       break;
-    case Instruction::NEG_DOUBLE:
+    case Instruction::NEG_DOUBLE: {
       // TODO: Make this an XorpdRM where the memory location holds 0x8000000000000000
       rlSrc1 = loadValueWide(cUnit, rlSrc1, kFPReg);
       rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
       tempReg = oatAllocTemp(cUnit);
       loadConstant(cUnit, tempReg, 0x80000000);
-      newLIR2(cUnit, kX86MovdxrRR, rlResult.lowReg, tempReg);
-      newLIR2(cUnit, kX86PsllqRI, rlResult.lowReg, 32);
-      newLIR2(cUnit, kX86XorpsRR, rlResult.lowReg, rlSrc1.lowReg);
+      int rDest = S2D(rlResult.lowReg, rlResult.highReg);
+      int rSrc1 = S2D(rlSrc1.lowReg, rlSrc1.highReg);
+      if (rDest == rSrc1) {
+        rSrc1 = oatAllocTempDouble(cUnit) | FP_DOUBLE;
+        opRegCopy(cUnit, rSrc1, rDest);
+      }
+      newLIR2(cUnit, kX86MovdxrRR, rDest, tempReg);
+      newLIR2(cUnit, kX86PsllqRI, rDest, 32);
+      newLIR2(cUnit, kX86XorpsRR, rDest, rSrc1);
       storeValueWide(cUnit, rlDest, rlResult);
       return false;
+    }
     case Instruction::REM_DOUBLE_2ADDR:
     case Instruction::REM_DOUBLE: {
       return genArithOpDoublePortable(cUnit, opcode, rlDest, rlSrc1, rlSrc2);