Reduce x86 sequence for GP pair to XMM
Added support for punpckldq which is useful for interleaving
32-bit values from two xmm registers.
This new instruction is now used for transfers from GP pairs
to XMM in order to reduce path length.
Change-Id: I70d9b69449dfcfb9a94a628deb74a7cffe96bac7
Signed-off-by: Razvan A Lupusoru <razvan.a.lupusoru@intel.com>
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 5e1c4d1..35bdb0f 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -288,6 +288,7 @@
EXT_0F_ENCODING_MAP(Subss, 0xF3, 0x5C, REG_DEF0),
EXT_0F_ENCODING_MAP(Divsd, 0xF2, 0x5E, REG_DEF0),
EXT_0F_ENCODING_MAP(Divss, 0xF3, 0x5E, REG_DEF0),
+ EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0),
{ kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
{ kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index a2c215c..fcfd885 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -346,8 +346,7 @@
if (val_hi != 0) {
r_dest_hi = AllocTempDouble();
LoadConstantNoClobber(r_dest_hi, val_hi);
- NewLIR2(kX86PsllqRI, r_dest_hi, 32);
- NewLIR2(kX86OrpsRR, r_dest_lo, r_dest_hi);
+ NewLIR2(kX86PunpckldqRR, r_dest_lo, r_dest_hi);
FreeTemp(r_dest_hi);
}
}
@@ -594,8 +593,7 @@
NewLIR2(kX86MovdxrRR, fp_reg, low_reg);
int tmp_reg = AllocTempDouble();
NewLIR2(kX86MovdxrRR, tmp_reg, high_reg);
- NewLIR2(kX86PsllqRI, tmp_reg, 32);
- NewLIR2(kX86OrpsRR, fp_reg, tmp_reg);
+ NewLIR2(kX86PunpckldqRR, fp_reg, tmp_reg);
FreeTemp(tmp_reg);
}
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index d7f61fc..e091a84 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -350,6 +350,7 @@
Binary0fOpCode(kX86Subss), // float subtract
Binary0fOpCode(kX86Divsd), // double divide
Binary0fOpCode(kX86Divss), // float divide
+ Binary0fOpCode(kX86Punpckldq), // Interleave low-order double words
kX86PsrlqRI, // right shift of floating point registers
kX86PsllqRI, // left shift of floating point registers
kX86SqrtsdRR, // sqrt of floating point register
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 6d82f0a..ef83498 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -392,6 +392,17 @@
has_modrm = true;
src_reg_file = dst_reg_file = SSE;
break;
+ case 0x62:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // Clear prefix now. It has served its purpose as part of the opcode.
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "punpckldq";
+ load = true;
+ has_modrm = true;
+ break;
case 0x6E:
if (prefix[2] == 0x66) {
dst_reg_file = SSE;