Fixes to enable TrackLiveTemps optimization on x86.
- Created new kRegRegStore instruction class for Movdrx, where the
source is first, and the destination is second.
- Reverted neg_float and neg_double implementation to prevent confusion
of register types when optimizations are performed.
- Swapped order of loads for wide values to prevent base pointer from
being clobbered when the base pointer equals the low destination reg.
- Implemented opRegCopyWide for general purpose reg source to floating
point reg destination and vice versa.
- Added more opcode coverage to x86 disassembler.
Change-Id: I4e58eec91742cc51333003fa5a678ba5b23eb575
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index aeee4f0..bcaba10 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -791,8 +791,7 @@
// Disable some optimizations on X86 for now
cUnit->disableOpt |= (
(1 << kLoadStoreElimination) |
- (1 << kPromoteRegs) |
- (1 << kTrackLiveTemps));
+ (1 << kPromoteRegs));
}
/* Are we generating code for the debugger? */
if (compiler.IsDebuggingSupported()) {
diff --git a/src/compiler/codegen/x86/Assemble.cc b/src/compiler/codegen/x86/Assemble.cc
index 7bd5c52..63e4cc3 100644
--- a/src/compiler/codegen/x86/Assemble.cc
+++ b/src/compiler/codegen/x86/Assemble.cc
@@ -280,10 +280,13 @@
EXT_0F_ENCODING_MAP(Divsd, 0xF2, 0x5E, REG_DEF0),
EXT_0F_ENCODING_MAP(Divss, 0xF3, 0x5E, REG_DEF0),
+ { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
{ kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
EXT_0F_ENCODING_MAP(Movdxr, 0x66, 0x6E, REG_DEF0),
- EXT_0F_ENCODING_MAP(Movdrx, 0x66, 0x7E, REG_DEF0),
+ { kX86MovdrxRR, kRegRegStore, IS_BINARY_OP | REG_DEF0 | REG_USE01, { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxRR", "!0r,!1r" },
+ { kX86MovdrxMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxMR", "[!0r+!1d],!2r" },
+ { kX86MovdrxAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxAR", "[!0r+!1r<<!2d+!3d],!4r" },
{ kX86Set8R, kRegCond, IS_BINARY_OP | REG_DEF0 | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0 }, "Set8R", "!1c !0r" },
{ kX86Set8M, kMemCond, IS_STORE | IS_TERTIARY_OP | REG_USE0 | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0 }, "Set8M", "!2c [!0r+!1d]" },
@@ -375,6 +378,8 @@
return computeSize(entry, lir->operands[0], false);
case kRegReg:
return computeSize(entry, 0, false);
+ case kRegRegStore:
+ return computeSize(entry, 0, false);
case kRegMem: { // lir operands - 0: reg, 1: base, 2: disp
int base = lir->operands[1];
return computeSize(entry, lir->operands[2], false) + (base == rSP ? 1 : 0);
@@ -800,6 +805,9 @@
DCHECK_EQ(0, entry->skeleton.extra_opcode1);
DCHECK_EQ(0, entry->skeleton.extra_opcode2);
}
+ if (FPREG(reg)) {
+ reg = reg & FP_REG_MASK;
+ }
uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
cUnit->codeBuffer.push_back(modrm);
}
@@ -1307,6 +1315,9 @@
case kRegReg: // lir operands - 0: reg1, 1: reg2
emitRegReg(cUnit, entry, lir->operands[0], lir->operands[1]);
break;
+ case kRegRegStore: // lir operands - 0: reg2, 1: reg1
+ emitRegReg(cUnit, entry, lir->operands[1], lir->operands[0]);
+ break;
case kRegRegImm:
emitRegRegImm(cUnit, entry, lir->operands[0], lir->operands[1], lir->operands[2]);
break;
diff --git a/src/compiler/codegen/x86/FP/X86FP.cc b/src/compiler/codegen/x86/FP/X86FP.cc
index c00b5fc..f2488d0 100644
--- a/src/compiler/codegen/x86/FP/X86FP.cc
+++ b/src/compiler/codegen/x86/FP/X86FP.cc
@@ -21,6 +21,7 @@
RegLocation rlSrc2) {
X86OpCode op = kX86Nop;
RegLocation rlResult;
+ int tempReg;
/*
* Don't attempt to optimize register usage since these opcodes call out to
@@ -44,12 +45,13 @@
op = kX86MulssRR;
break;
case Instruction::NEG_FLOAT:
- // TODO: Make this nicer. Subtracting the source from 0 doesn't work in
- // the 0 case, and using FCHS is difficult with register promotion. This
- // code treats the value as a CoreReg to make it easy to manipulate.
- rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
- rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
- opRegRegImm(cUnit, kOpAdd, rlResult.lowReg, rlSrc1.lowReg, 0x80000000);
+ // TODO: Make this an XorpsRM where the memory location holds 0x80000000
+ rlSrc1 = loadValue(cUnit, rlSrc1, kFPReg);
+ rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
+ tempReg = oatAllocTemp(cUnit);
+ loadConstant(cUnit, tempReg, 0x80000000);
+ newLIR2(cUnit, kX86MovdxrRR, rlResult.lowReg, tempReg);
+ newLIR2(cUnit, kX86XorpsRR, rlResult.lowReg, rlSrc1.lowReg);
storeValue(cUnit, rlDest, rlResult);
return false;
case Instruction::REM_FLOAT_2ADDR:
@@ -81,6 +83,7 @@
RegLocation rlSrc2) {
X86OpCode op = kX86Nop;
RegLocation rlResult;
+ int tempReg;
switch (opcode) {
case Instruction::ADD_DOUBLE_2ADDR:
@@ -100,13 +103,14 @@
op = kX86MulsdRR;
break;
case Instruction::NEG_DOUBLE:
- // TODO: Make this nicer. Subtracting the source from 0 doesn't work in
- // the 0 case, and using FCHS is difficult with register promotion. This
- // code treats the value as a CoreReg to make it easy to manipulate.
- rlSrc1 = loadValueWide(cUnit, rlSrc1, kCoreReg);
- rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
- opRegRegImm(cUnit, kOpAdd, rlResult.highReg, rlSrc1.highReg, 0x80000000);
- opRegCopy(cUnit, rlResult.lowReg, rlSrc1.lowReg);
+ // TODO: Make this an XorpdRM where the memory location holds 0x8000000000000000
+ rlSrc1 = loadValueWide(cUnit, rlSrc1, kFPReg);
+ rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
+ tempReg = oatAllocTemp(cUnit);
+ loadConstant(cUnit, tempReg, 0x80000000);
+ newLIR2(cUnit, kX86MovdxrRR, rlResult.lowReg, tempReg);
+ newLIR2(cUnit, kX86PsllqRI, rlResult.lowReg, 32);
+ newLIR2(cUnit, kX86XorpsRR, rlResult.lowReg, rlSrc1.lowReg);
storeValueWide(cUnit, rlDest, rlResult);
return false;
case Instruction::REM_DOUBLE_2ADDR:
diff --git a/src/compiler/codegen/x86/X86/Factory.cc b/src/compiler/codegen/x86/X86/Factory.cc
index d60d9de..9721038 100644
--- a/src/compiler/codegen/x86/X86/Factory.cc
+++ b/src/compiler/codegen/x86/X86/Factory.cc
@@ -475,9 +475,15 @@
if (!pair) {
load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
} else {
- load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
- load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
- displacement + HIWORD_OFFSET);
+ if (rBase == rDest) {
+ load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
+ displacement + HIWORD_OFFSET);
+ load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
+ } else {
+ load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
+ load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
+ displacement + HIWORD_OFFSET);
+ }
}
if (rBase == rSP) {
annotateDalvikRegAccess(load, (displacement + (pair ? LOWORD_OFFSET : 0))
@@ -492,10 +498,17 @@
load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
displacement + LOWORD_OFFSET);
} else {
- load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
- displacement + LOWORD_OFFSET);
- load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
- displacement + HIWORD_OFFSET);
+ if (rBase == rDest) {
+ load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
+ displacement + HIWORD_OFFSET);
+ load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
+ displacement + LOWORD_OFFSET);
+ } else {
+ load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
+ displacement + LOWORD_OFFSET);
+ load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
+ displacement + HIWORD_OFFSET);
+ }
}
}
diff --git a/src/compiler/codegen/x86/X86/Gen.cc b/src/compiler/codegen/x86/X86/Gen.cc
index b0b6ba8..adad05b 100644
--- a/src/compiler/codegen/x86/X86/Gen.cc
+++ b/src/compiler/codegen/x86/X86/Gen.cc
@@ -352,13 +352,18 @@
if (srcFP) {
opRegCopy(cUnit, S2D(destLo, destHi), S2D(srcLo, srcHi));
} else {
- UNIMPLEMENTED(WARNING);
- newLIR0(cUnit, kX86Bkpt);
+ // TODO: Prevent this from happening in the code. The result is often
+ // unused or could have been loaded more easily from memory.
+ newLIR2(cUnit, kX86MovdxrRR, destLo, srcLo);
+ newLIR2(cUnit, kX86MovdxrRR, destHi, srcHi);
+ newLIR2(cUnit, kX86PsllqRI, destHi, 32);
+ newLIR2(cUnit, kX86OrpsRR, destLo, destHi);
}
} else {
if (srcFP) {
- UNIMPLEMENTED(WARNING);
- newLIR0(cUnit, kX86Bkpt);
+ newLIR2(cUnit, kX86MovdrxRR, destLo, srcLo);
+ newLIR2(cUnit, kX86PsrlqRI, srcLo, 32);
+ newLIR2(cUnit, kX86MovdrxRR, destHi, srcLo);
} else {
// Handle overlap
if (srcHi == destLo) {
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index 4c44118..c229844 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -110,7 +110,7 @@
/* Offset to distingish FP regs */
#define FP_REG_OFFSET 32
/* Offset to distinguish DP FP regs */
-#define FP_DOUBLE (FP_REG_OFFSET + 32)
+#define FP_DOUBLE (FP_REG_OFFSET + 16)
/* Offset to distingish the extra regs */
#define EXTRA_REG_OFFSET (FP_DOUBLE + 16)
/* Reg types */
@@ -433,9 +433,10 @@
Binary0fOpCode(kX86Subss), // float subtract
Binary0fOpCode(kX86Divsd), // double divide
Binary0fOpCode(kX86Divss), // float divide
- kX86PsllqRI, // shift of floating point registers
+ kX86PsrlqRI, // right shift of floating point registers
+ kX86PsllqRI, // left shift of floating point registers
Binary0fOpCode(kX86Movdxr), // move into xmm from gpr
- Binary0fOpCode(kX86Movdrx), // move into reg from xmm
+ kX86MovdrxRR, kX86MovdrxMR, kX86MovdrxAR,// move into reg from xmm
kX86Set8R, kX86Set8M, kX86Set8A,// set byte depending on condition operand
kX86Mfence, // memory barrier
Binary0fOpCode(kX86Imul16), // 16bit multiply
@@ -470,6 +471,7 @@
kReg, kMem, kArray, // R, M and A instruction kinds.
kMemReg, kArrayReg, kThreadReg, // MR, AR and TR instruction kinds.
kRegReg, kRegMem, kRegArray, kRegThread, // RR, RM, RA and RT instruction kinds.
+ kRegRegStore, // RR following the store modrm reg-reg encoding rather than the load.
kRegImm, kMemImm, kArrayImm, kThreadImm, // RI, MI, AI and TI instruction kinds.
kRegRegImm, kRegMemImm, kRegArrayImm, // RRI, RMI and RAI instruction kinds.
kMovRegImm, // Shorter form move RI.
diff --git a/src/disassembler_x86.cc b/src/disassembler_x86.cc
index 4aff822..969feb8 100644
--- a/src/disassembler_x86.cc
+++ b/src/disassembler_x86.cc
@@ -225,10 +225,13 @@
case 0x10: case 0x11:
if (prefix[0] == 0xF2) {
opcode << "movsd";
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[0] == 0xF3) {
opcode << "movss";
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[2] == 0x66) {
opcode << "movupd";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
} else {
opcode << "movups";
}
@@ -237,6 +240,71 @@
load = *instr == 0x10;
store = !load;
break;
+ case 0x2A:
+ if (prefix[2] == 0x66) {
+ opcode << "cvtpi2pd";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0xF2) {
+ opcode << "cvtsi2sd";
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0xF3) {
+ opcode << "cvtsi2ss";
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ opcode << "cvtpi2ps";
+ }
+ load = true;
+ has_modrm = true;
+ dst_reg_file = SSE;
+ break;
+ case 0x2C:
+ if (prefix[2] == 0x66) {
+ opcode << "cvttpd2pi";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0xF2) {
+ opcode << "cvttsd2si";
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0xF3) {
+ opcode << "cvttss2si";
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ opcode << "cvttps2pi";
+ }
+ load = true;
+ has_modrm = true;
+ src_reg_file = SSE;
+ break;
+ case 0x2D:
+ if (prefix[2] == 0x66) {
+ opcode << "cvtpd2pi";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0xF2) {
+ opcode << "cvtsd2si";
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0xF3) {
+ opcode << "cvtss2si";
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ opcode << "cvtps2pi";
+ }
+ load = true;
+ has_modrm = true;
+ src_reg_file = SSE;
+ break;
+ case 0x2E:
+ opcode << "u";
+ // FALLTHROUGH
+ case 0x2F:
+ if (prefix[2] == 0x66) {
+ opcode << "comisd";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ opcode << "comiss";
+ }
+ has_modrm = true;
+ load = true;
+ src_reg_file = dst_reg_file = SSE;
+ break;
case 0x38: // 3 byte extended opcode
opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
break;
@@ -264,13 +332,13 @@
}
if (prefix[2] == 0x66) {
opcode << "pd";
- prefix[2] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[0] == 0xF2) {
opcode << "sd";
- prefix[0] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[0] == 0xF3) {
opcode << "ss";
- prefix[0] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else {
opcode << "ps";
}
@@ -282,13 +350,13 @@
case 0x5A:
if (prefix[2] == 0x66) {
opcode << "cvtpd2ps";
- prefix[2] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[0] == 0xF2) {
opcode << "cvtsd2ss";
- prefix[0] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[0] == 0xF3) {
opcode << "cvtss2sd";
- prefix[0] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else {
opcode << "cvtps2pd";
}
@@ -299,12 +367,12 @@
case 0x5B:
if (prefix[2] == 0x66) {
opcode << "cvtps2dq";
- prefix[2] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[0] == 0xF2) {
opcode << "bad opcode F2 0F 5B";
} else if (prefix[0] == 0xF3) {
opcode << "cvttps2dq";
- prefix[0] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else {
opcode << "cvtdq2ps";
}
@@ -315,12 +383,11 @@
case 0x6E:
if (prefix[2] == 0x66) {
dst_reg_file = SSE;
- opcode << "movq";
- prefix[2] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
} else {
dst_reg_file = MMX;
- opcode << "movd";
}
+ opcode << "movd";
load = true;
has_modrm = true;
break;
@@ -328,10 +395,11 @@
if (prefix[2] == 0x66) {
dst_reg_file = SSE;
opcode << "movdqa";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[0] == 0xF3) {
dst_reg_file = SSE;
opcode << "movdqu";
- prefix[0] = 0; // clear prefix now its served its purpose as part of the opcode
+ prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else {
dst_reg_file = MMX;
opcode << "movq";
@@ -339,6 +407,59 @@
load = true;
has_modrm = true;
break;
+ case 0x71:
+ if (prefix[2] == 0x66) {
+ dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ dst_reg_file = MMX;
+ }
+ static const char* x71_opcodes[] = {"unknown-71", "unknown-71", "psrlw", "unknown-71", "psraw", "unknown-71", "psllw", "unknown-71"};
+ modrm_opcodes = x71_opcodes;
+ reg_is_opcode = true;
+ has_modrm = true;
+ store = true;
+ immediate_bytes = 1;
+ break;
+ case 0x72:
+ if (prefix[2] == 0x66) {
+ dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ dst_reg_file = MMX;
+ }
+ static const char* x72_opcodes[] = {"unknown-72", "unknown-72", "psrld", "unknown-72", "psrad", "unknown-72", "pslld", "unknown-72"};
+ modrm_opcodes = x72_opcodes;
+ reg_is_opcode = true;
+ has_modrm = true;
+ store = true;
+ immediate_bytes = 1;
+ break;
+ case 0x73:
+ if (prefix[2] == 0x66) {
+ dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ dst_reg_file = MMX;
+ }
+ static const char* x73_opcodes[] = {"unknown-73", "unknown-73", "psrlq", "unknown-73", "unknown-73", "unknown-73", "psllq", "unknown-73"};
+ modrm_opcodes = x73_opcodes;
+ reg_is_opcode = true;
+ has_modrm = true;
+ store = true;
+ immediate_bytes = 1;
+ break;
+ case 0x7E:
+ if (prefix[2] == 0x66) {
+ src_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = MMX;
+ }
+ opcode << "movd";
+ has_modrm = true;
+ store = true;
+ break;
case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87:
case 0x88: case 0x89: case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E: case 0x8F:
opcode << "j" << condition_codes[*instr & 0xF];
@@ -510,7 +631,7 @@
}
}
if (ax) {
- args << ", ";
+ // If this opcode implicitly uses ax, ax is always the first arg.
DumpReg(args, rex, 0 /* EAX */, byte_operand, prefix[2], GPR);
}
if (cx) {