Reduce x86 sequence for GP pair to XMM

Added support for punpckldq which is useful for interleaving
32-bit values from two xmm registers.

This new instruction is now used for transfers from GP pairs
to XMM in order to reduce path length.

Change-Id: I70d9b69449dfcfb9a94a628deb74a7cffe96bac7
Signed-off-by: Razvan A Lupusoru <razvan.a.lupusoru@intel.com>
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 5e1c4d1..35bdb0f 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -288,6 +288,7 @@
   EXT_0F_ENCODING_MAP(Subss,     0xF3, 0x5C, REG_DEF0),
   EXT_0F_ENCODING_MAP(Divsd,     0xF2, 0x5E, REG_DEF0),
   EXT_0F_ENCODING_MAP(Divss,     0xF3, 0x5E, REG_DEF0),
+  EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0),
 
   { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
   { kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index a2c215c..fcfd885 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -346,8 +346,7 @@
         if (val_hi != 0) {
           r_dest_hi = AllocTempDouble();
           LoadConstantNoClobber(r_dest_hi, val_hi);
-          NewLIR2(kX86PsllqRI, r_dest_hi, 32);
-          NewLIR2(kX86OrpsRR, r_dest_lo, r_dest_hi);
+          NewLIR2(kX86PunpckldqRR, r_dest_lo, r_dest_hi);
           FreeTemp(r_dest_hi);
         }
       }
@@ -594,8 +593,7 @@
   NewLIR2(kX86MovdxrRR, fp_reg, low_reg);
   int tmp_reg = AllocTempDouble();
   NewLIR2(kX86MovdxrRR, tmp_reg, high_reg);
-  NewLIR2(kX86PsllqRI, tmp_reg, 32);
-  NewLIR2(kX86OrpsRR, fp_reg, tmp_reg);
+  NewLIR2(kX86PunpckldqRR, fp_reg, tmp_reg);
   FreeTemp(tmp_reg);
 }
 
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index d7f61fc..e091a84 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -350,6 +350,7 @@
   Binary0fOpCode(kX86Subss),    // float subtract
   Binary0fOpCode(kX86Divsd),    // double divide
   Binary0fOpCode(kX86Divss),    // float divide
+  Binary0fOpCode(kX86Punpckldq),  // Interleave low-order double words
   kX86PsrlqRI,                  // right shift of floating point registers
   kX86PsllqRI,                  // left shift of floating point registers
   kX86SqrtsdRR,                 // sqrt of floating point register
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 6d82f0a..ef83498 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -392,6 +392,17 @@
         has_modrm = true;
         src_reg_file = dst_reg_file = SSE;
         break;
+      case 0x62:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // Clear prefix now. It has served its purpose as part of the opcode.
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "punpckldq";
+        load = true;
+        has_modrm = true;
+        break;
       case 0x6E:
         if (prefix[2] == 0x66) {
           dst_reg_file = SSE;