Merge "Implement all vector instructions for X86"
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 9200106..91a66d3 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -279,6 +279,11 @@
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0 }, #opname "RM", "!0r,[!1r+!2d]" }, \
 { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0 }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
 
+#define EXT_0F_ENCODING2_MAP(opname, prefix, opcode, opcode2, reg_def) /* RR/RM/RA rows like EXT_0F_ENCODING_MAP, with opcode2 in the slot after opcode. */ \
+{ kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RR", "!0r,!1r" }, \
+{ kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RM", "!0r,[!1r+!2d]" }, \
+{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
+
   EXT_0F_ENCODING_MAP(Movsd, 0xF2, 0x10, REG_DEF0),
   { kX86MovsdMR, kMemReg,   IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0xF2, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovsdMR", "[!0r+!1d],!2r" },
   { kX86MovsdAR, kArrayReg, IS_STORE | IS_QUIN_OP     | REG_USE014, { 0xF2, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovsdAR", "[!0r+!1r<<!2d+!3d],!4r" },
@@ -310,10 +315,42 @@
   EXT_0F_ENCODING_MAP(Divsd,     0xF2, 0x5E, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Divss,     0xF3, 0x5E, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Sqrtsd,    0xF2, 0x51, REG_DEF0_USE0),
+  EXT_0F_ENCODING2_MAP(Pmulld,   0x66, 0x38, 0x40, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Pmullw,    0x66, 0xD5, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Mulps,     0x00, 0x59, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Mulpd,     0x66, 0x59, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Paddb,     0x66, 0xFC, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Paddw,     0x66, 0xFD, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Paddd,     0x66, 0xFE, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Addps,     0x00, 0x58, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Addpd,     0x66, 0x58, REG_DEF0_USE0),  // ADDPD is 66 0F 58 /r; an F2 prefix would select scalar ADDSD.
+  EXT_0F_ENCODING_MAP(Psubb,     0x66, 0xF8, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Psubw,     0x66, 0xF9, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Psubd,     0x66, 0xFA, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Subps,     0x00, 0x5C, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Subpd,     0x66, 0x5C, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Pand,      0x66, 0xDB, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Por,       0x66, 0xEB, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Pxor,      0x66, 0xEF, REG_DEF0_USE0),
+  EXT_0F_ENCODING2_MAP(Phaddw,   0x66, 0x38, 0x01, REG_DEF0_USE0),
+  EXT_0F_ENCODING2_MAP(Phaddd,   0x66, 0x38, 0x02, REG_DEF0_USE0),
 
+  { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1 }, "PextrbRRI", "!0r,!1r,!2d" },
+  { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1 }, "PextrwRRI", "!0r,!1r,!2d" },
+  { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1 }, "PextrdRRI", "!0r,!1r,!2d" },
+  // NOTE(review): for Pextrb/Pextrd above (0F 3A 14/16) the XMM source is ModRM.reg and the destination is ModRM.rm, unlike Pextrw (0F C5); confirm kRegRegImm emits that order.
+  { kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1 }, "PshuflwRRI", "!0r,!1r,!2d" },
+  { kX86PshufdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x70, 0, 0, 0, 1 }, "PshufdRRI", "!0r,!1r,!2d" },
+
+  { kX86PsrawRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 4, 0, 1 }, "PsrawRI", "!0r,!1d" },
+  { kX86PsradRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 4, 0, 1 }, "PsradRI", "!0r,!1d" },
+  { kX86PsrlwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 2, 0, 1 }, "PsrlwRI", "!0r,!1d" },
+  { kX86PsrldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 2, 0, 1 }, "PsrldRI", "!0r,!1d" },
   { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
+  { kX86PsllwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 6, 0, 1 }, "PsllwRI", "!0r,!1d" },
+  { kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1 }, "PslldRI", "!0r,!1d" },
   { kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
-  { kX86SqrtsdRR, kRegReg, IS_BINARY_OP | REG_DEF0_USE1, { 0xF2, 0, 0x0F, 0x51, 0, 0, 0, 0 }, "SqrtsdRR", "!0r,!1r" },
 
   { kX86Fild32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDB, 0x00, 0, 0, 0, 0 }, "Fild32M", "[!0r,!1d]" },
   { kX86Fild64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDF, 0x00, 0, 5, 0, 0 }, "Fild64M", "[!0r,!1d]" },
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 72cdbbd..1807d5c 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -429,6 +429,136 @@
     void GenConst128(BasicBlock* bb, MIR* mir);
 
     /*
+     * @brief Copy one 128-bit vector register to another: vB = vC.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpMoveVector.
+     * @note vA: TypeSize
+     * @note vB: destination
+     * @note vC: source
+     */
+    void GenMoveVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed multiply of units in two vector registers: vB = vB .* vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedMultiply.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenMultiplyVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedAddition.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenAddVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedSubtract.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenSubtractVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed shift left of units in a vector register by an immediate: vB = vB .<< vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedShiftLeft.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: immediate
+     */
+    void GenShiftLeftVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed signed shift right of units in a vector register by an immediate: vB = vB .>> vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedSignedShiftRight.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: immediate
+     */
+    void GenSignedShiftRightVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed unsigned shift right of units in a vector register by an immediate: vB = vB .>>> vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedUnsignedShiftRight.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: immediate
+     */
+    void GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedAnd.
+     * @note vA: TypeSize; vB: destination and source; vC: source
+     */
+    void GenAndVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedOr.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenOrVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedXor.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenXorVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Reduce a 128-bit packed element into a single VR by taking lower bits
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedAddReduce.
+     * @details Instruction does a horizontal addition of the packed elements and then adds it to VR.
+     * @note vA: TypeSize
+     * @note vB: destination and source VR (not vector register)
+     * @note vC: source (vector register)
+     */
+    void GenAddReduceVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Extract a packed element into a single VR.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedReduce.
+     * @note vA: TypeSize
+     * @note vB: destination VR (not vector register)
+     * @note vC: source (vector register). NOTE(review): the implementation reads the vector from vB.
+     * @note arg[0]: The index to use for extraction from vector register (which packed element).
+     */
+    void GenReduceVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Create a vector value, with all TypeSize values equal to vC
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedSet.
+     * @note vA: TypeSize.
+     * @note vB: destination vector register.
+     * @note vC: source VR (not vector register).
+     */
+    void GenSetVector(BasicBlock *bb, MIR *mir);
+
+    /*
      * @brief Generate code for a vector opcode.
      * @param bb The basic block in which the MIR is from.
      * @param mir The MIR whose opcode is a non-standard opcode.
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index e7a629a..889ea8b 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -81,6 +81,16 @@
 #endif
 };
 
+static const RegStorage xp_temps_arr_32[] = {  // 128-bit xmm views used when !Gen64Bit().
+    rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
+};
+static const RegStorage xp_temps_arr_64[] = {  // 128-bit xmm views used when Gen64Bit().
+    rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
+#ifdef TARGET_REX_SUPPORT  // NOTE(review): confirm rs_xr8..rs_xr15 are declared somewhere.
+    rs_xr8, rs_xr9, rs_xr10, rs_xr11, rs_xr12, rs_xr13, rs_xr14, rs_xr15
+#endif
+};
+
 static const std::vector<RegStorage> empty_pool;
 static const std::vector<RegStorage> core_regs_32(core_regs_arr_32,
     core_regs_arr_32 + sizeof(core_regs_arr_32) / sizeof(core_regs_arr_32[0]));
@@ -111,6 +121,11 @@
 static const std::vector<RegStorage> dp_temps_64(dp_temps_arr_64,
     dp_temps_arr_64 + sizeof(dp_temps_arr_64) / sizeof(dp_temps_arr_64[0]));
 
+static const std::vector<RegStorage> xp_temps_32(xp_temps_arr_32,  // xmm temps, !Gen64Bit().
+    xp_temps_arr_32 + sizeof(xp_temps_arr_32) / sizeof(xp_temps_arr_32[0]));
+static const std::vector<RegStorage> xp_temps_64(xp_temps_arr_64,  // xmm temps, Gen64Bit().
+    xp_temps_arr_64 + sizeof(xp_temps_arr_64) / sizeof(xp_temps_arr_64[0]));
+
 RegStorage rs_rX86_SP;
 
 X86NativeRegisterPool rX86_ARG0;
@@ -209,7 +224,7 @@
   /* Double registers in x86 are just a single FP register */
   seed = 1;
   /* FP register starts at bit position 16 */
-  shift = reg.IsFloat() ? kX86FPReg0 : 0;
+  shift = (reg.IsFloat() || reg.StorageSize() > 8) ? kX86FPReg0 : 0;
   /* Expand the double register id into single offset */
   shift += reg_id;
   return (seed << shift);
@@ -542,17 +557,31 @@
 
   // Target-specific adjustments.
 
+  // Add in XMM registers.
+  const std::vector<RegStorage> *xp_temps = Gen64Bit() ? &xp_temps_64 : &xp_temps_32;
+  for (RegStorage reg : *xp_temps) {
+    RegisterInfo* info = new (arena_) RegisterInfo(reg, GetRegMaskCommon(reg));
+    reginfo_map_.Put(reg.GetReg(), info);
+    info->SetIsTemp(true);
+  }
+
   // Alias single precision xmm to double xmms.
   // TODO: as needed, add larger vector sizes - alias all to the largest.
   GrowableArray<RegisterInfo*>::Iterator it(&reg_pool_->sp_regs_);
   for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
     int sp_reg_num = info->GetReg().GetRegNum();
+    RegStorage xp_reg = RegStorage::Solo128(sp_reg_num);
+    RegisterInfo* xp_reg_info = GetRegInfo(xp_reg);
+    // 128-bit xmm vector register's master storage should refer to itself.
+    DCHECK_EQ(xp_reg_info, xp_reg_info->Master());
+
+    // Redirect 32-bit vector's master storage to 128-bit vector.
+    info->SetMaster(xp_reg_info);
+
     RegStorage dp_reg = RegStorage::Solo64(RegStorage::kFloatingPoint | sp_reg_num);
     RegisterInfo* dp_reg_info = GetRegInfo(dp_reg);
-    // 64-bit xmm vector register's master storage should refer to itself.
-    DCHECK_EQ(dp_reg_info, dp_reg_info->Master());
-    // Redirect 32-bit vector's master storage to 64-bit vector.
-    info->SetMaster(dp_reg_info);
+    // Redirect 64-bit vector's master storage to 128-bit vector.
+    dp_reg_info->SetMaster(xp_reg_info);
   }
 
   // Don't start allocating temps at r0/s0/d0 or you may clobber return regs in early-exit methods.
@@ -1240,6 +1269,45 @@
     case kMirOpConstVector:
       GenConst128(bb, mir);
       break;
+    case kMirOpMoveVector:
+      GenMoveVector(bb, mir);
+      break;
+    case kMirOpPackedMultiply:
+      GenMultiplyVector(bb, mir);
+      break;
+    case kMirOpPackedAddition:
+      GenAddVector(bb, mir);
+      break;
+    case kMirOpPackedSubtract:
+      GenSubtractVector(bb, mir);
+      break;
+    case kMirOpPackedShiftLeft:
+      GenShiftLeftVector(bb, mir);
+      break;
+    case kMirOpPackedSignedShiftRight:
+      GenSignedShiftRightVector(bb, mir);
+      break;
+    case kMirOpPackedUnsignedShiftRight:
+      GenUnsignedShiftRightVector(bb, mir);
+      break;
+    case kMirOpPackedAnd:
+      GenAndVector(bb, mir);
+      break;
+    case kMirOpPackedOr:
+      GenOrVector(bb, mir);
+      break;
+    case kMirOpPackedXor:
+      GenXorVector(bb, mir);
+      break;
+    case kMirOpPackedAddReduce:
+      GenAddReduceVector(bb, mir);
+      break;
+    case kMirOpPackedReduce:
+      GenReduceVector(bb, mir);
+      break;
+    case kMirOpPackedSet:
+      GenSetVector(bb, mir);
+      break;
     default:
       break;
   }
@@ -1249,9 +1317,9 @@
   int type_size = mir->dalvikInsn.vA;
   // We support 128 bit vectors.
   DCHECK_EQ(type_size & 0xFFFF, 128);
-  int reg = mir->dalvikInsn.vB;
-  DCHECK_LT(reg, 8);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
   uint32_t *args = mir->dalvikInsn.arg;
+  int reg = rs_dest.GetReg();
   // Check for all 0 case.
   if (args[0] == 0 && args[1] == 0 && args[2] == 0 && args[3] == 0) {
     NewLIR2(kX86XorpsRR, reg, reg);
@@ -1277,6 +1345,287 @@
   SetMemRefType(load, true, kLiteral);
 }
 
+void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) {
+  // Only the 128-bit vector width is handled; vB names the destination and vC the source.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  const int dest_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int src_reg = RegStorage::Solo128(mir->dalvikInsn.vC).GetReg();
+  NewLIR2(kX86Mova128RR, dest_reg, src_reg);
+}
+
+void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  const OpSize element_size = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  const int dest_src1_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int src2_reg = RegStorage::Solo128(mir->dalvikInsn.vC).GetReg();
+  int lir_opcode = 0;  // Chosen from the element size below.
+  switch (element_size) {
+    case kSignedHalf:
+      lir_opcode = kX86PmullwRR;
+      break;
+    case k32:
+      lir_opcode = kX86PmulldRR;
+      break;
+    case kSingle:
+      lir_opcode = kX86MulpsRR;
+      break;
+    case kDouble:
+      lir_opcode = kX86MulpdRR;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector multiply " << element_size;
+      break;
+  }
+  NewLIR2(lir_opcode, dest_src1_reg, src2_reg);  // vB = vB * vC, element-wise.
+}
+
+void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  const OpSize element_size = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  const int dest_src1_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int src2_reg = RegStorage::Solo128(mir->dalvikInsn.vC).GetReg();
+  int lir_opcode = 0;  // Chosen from the element size below.
+  switch (element_size) {
+    case kSignedByte:
+    case kUnsignedByte:
+      lir_opcode = kX86PaddbRR;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      lir_opcode = kX86PaddwRR;
+      break;
+    case k32:
+      lir_opcode = kX86PadddRR;
+      break;
+    case kSingle:
+      lir_opcode = kX86AddpsRR;
+      break;
+    case kDouble:
+      lir_opcode = kX86AddpdRR;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector addition " << element_size;
+      break;
+  }
+  NewLIR2(lir_opcode, dest_src1_reg, src2_reg);  // vB = vB + vC, element-wise.
+}
+
+void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  const OpSize element_size = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  const int dest_src1_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int src2_reg = RegStorage::Solo128(mir->dalvikInsn.vC).GetReg();
+  int lir_opcode = 0;  // Chosen from the element size below.
+  switch (element_size) {
+    case kSignedByte:
+    case kUnsignedByte:
+      lir_opcode = kX86PsubbRR;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      lir_opcode = kX86PsubwRR;
+      break;
+    case k32:
+      lir_opcode = kX86PsubdRR;
+      break;
+    case kSingle:
+      lir_opcode = kX86SubpsRR;
+      break;
+    case kDouble:
+      lir_opcode = kX86SubpdRR;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector subtraction " << element_size;
+      break;
+  }
+  NewLIR2(lir_opcode, dest_src1_reg, src2_reg);  // vB = vB - vC, element-wise.
+}
+
+void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  const OpSize element_size = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  const int dest_src_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int shift_amount = mir->dalvikInsn.vC;  // vC is an immediate, not a register.
+  int lir_opcode = 0;
+  switch (element_size) {
+    case kSignedHalf:
+    case kUnsignedHalf:
+      lir_opcode = kX86PsllwRI;
+      break;
+    case k32:
+      lir_opcode = kX86PslldRI;
+      break;
+    case k64:
+      lir_opcode = kX86PsllqRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector shift left " << element_size;
+      break;
+  }
+  NewLIR2(lir_opcode, dest_src_reg, shift_amount);
+}
+
+void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  const OpSize element_size = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  const int dest_src_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int shift_amount = mir->dalvikInsn.vC;  // vC is an immediate, not a register.
+  int lir_opcode = 0;
+  switch (element_size) {
+    case kSignedHalf:
+    case kUnsignedHalf:
+      lir_opcode = kX86PsrawRI;
+      break;
+    case k32:
+      lir_opcode = kX86PsradRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector signed shift right " << element_size;
+      break;
+  }
+  NewLIR2(lir_opcode, dest_src_reg, shift_amount);
+}
+
+void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  const OpSize element_size = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  const int dest_src_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int shift_amount = mir->dalvikInsn.vC;  // vC is an immediate, not a register.
+  int lir_opcode = 0;
+  switch (element_size) {
+    case kSignedHalf:
+    case kUnsignedHalf:
+      lir_opcode = kX86PsrlwRI;
+      break;
+    case k32:
+      lir_opcode = kX86PsrldRI;
+      break;
+    case k64:
+      lir_opcode = kX86PsrlqRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector unsigned shift right " << element_size;
+      break;
+  }
+  NewLIR2(lir_opcode, dest_src_reg, shift_amount);
+}
+
+void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) {
+  // Only the 128-bit vector width is handled; computes vB = vB & vC.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  const int dest_src1_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int src2_reg = RegStorage::Solo128(mir->dalvikInsn.vC).GetReg();
+  NewLIR2(kX86PandRR, dest_src1_reg, src2_reg);
+}
+
+void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) {
+  // Only the 128-bit vector width is handled; computes vB = vB | vC.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  const int dest_src1_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int src2_reg = RegStorage::Solo128(mir->dalvikInsn.vC).GetReg();
+  NewLIR2(kX86PorRR, dest_src1_reg, src2_reg);
+}
+
+void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) {
+  // Only the 128-bit vector width is handled; computes vB = vB ^ vC.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  const int dest_src1_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  const int src2_reg = RegStorage::Solo128(mir->dalvikInsn.vC).GetReg();
+  NewLIR2(kX86PxorRR, dest_src1_reg, src2_reg);
+}
+
+void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {  // Emits one Phaddw/Phaddd RR op.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  int imm = mir->dalvikInsn.vC;  // NOTE(review): header says vC is the source vector - confirm.
+  int opcode = 0;  // NOTE(review): header documents vB as a VR, yet it is wrapped as Solo128.
+  switch (opsize) {
+    case k32:
+      opcode = kX86PhadddRR;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PhaddwRR;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector add reduce " << opsize;
+      break;
+  }
+  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);  // NOTE(review): raw vC fills a register operand.
+}
+
+void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {  // Extract one element to a VR.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);  // NOTE(review): header says vC.
+  int index = mir->dalvikInsn.arg[0];  // Which packed element to extract.
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PextrdRRI;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PextrwRRI;
+      break;
+    case kUnsignedByte:
+    case kSignedByte:
+      opcode = kX86PextrbRRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector reduce " << opsize;
+      break;
+  }
+  // We need to extract to a GPR, so grab a temp from AllocTemp() as the extraction target.
+  RegStorage temp = AllocTemp();
+  NewLIR3(opcode, temp.GetReg(), rs_src.GetReg(), index);
+
+  // Assume that the destination VR is in the def for the mir; the temp is stored into it.
+  RegLocation rl_dest = mir_graph_->GetDest(mir);
+  RegLocation rl_temp =
+    {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, temp, INVALID_SREG, INVALID_SREG};
+  StoreValue(rl_dest, rl_temp);
+}
+
+void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);  // Low 16 bits of vA: vector width.
+  const OpSize element_size = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  const int dest_reg = RegStorage::Solo128(mir->dalvikInsn.vB).GetReg();
+  int first_shuffle = 0, second_shuffle = 0;
+  switch (element_size) {
+    case kUnsignedHalf:
+    case kSignedHalf:
+      // Word shuffle within the low quadword comes first...
+      first_shuffle = kX86PshuflwRRI;
+      // ...followed by a doubleword shuffle to cover the upper quadword.
+      second_shuffle = kX86PshufdRRI;
+      break;
+    case k32:
+      first_shuffle = kX86PshufdRRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector set " << element_size;
+      break;
+  }
+
+  // Fetch the source VR (first MIR source) into a core register.
+  RegLocation rl_value =
+      LoadValue(mir_graph_->GetSrc(mir, 0), kCoreReg);
+
+  // Copy the core register into the XMM destination.
+  NewLIR2(kX86MovdxrRR, dest_reg, rl_value.reg.GetReg());
+
+  // Spread the value across the destination (shuffle immediate 0).
+  NewLIR3(first_shuffle, dest_reg, dest_reg, 0);
+
+  // A second shuffle is only set up for the 16-bit element case.
+  if (second_shuffle != 0) {
+    NewLIR3(second_shuffle, dest_reg, dest_reg, 0);
+  }
+}
+
+
 LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
   int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
   for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index adfed0c..430bc7d 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -151,7 +151,7 @@
   rRET           = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 16,
 #endif
 
-  // xmm registers, single precision view
+  // xmm registers, single precision view.
   fr0  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 0,
   fr1  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 1,
   fr2  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 2,
@@ -161,7 +161,7 @@
   fr6  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 6,
   fr7  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 7,
 
-  // xmm registers, double precision alises
+  // xmm registers, double precision aliases.
   dr0  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 0,
   dr1  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 1,
   dr2  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 2,
@@ -171,15 +171,15 @@
   dr6  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 6,
   dr7  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 7,
 
-  // xmm registers, quad precision alises
-  qr0  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 0,
-  qr1  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 1,
-  qr2  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 2,
-  qr3  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 3,
-  qr4  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 4,
-  qr5  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 5,
-  qr6  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 6,
-  qr7  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 7,
+  // xmm registers aliases.
+  xr0  = RegStorage::k128BitSolo | 0,
+  xr1  = RegStorage::k128BitSolo | 1,
+  xr2  = RegStorage::k128BitSolo | 2,
+  xr3  = RegStorage::k128BitSolo | 3,
+  xr4  = RegStorage::k128BitSolo | 4,
+  xr5  = RegStorage::k128BitSolo | 5,
+  xr6  = RegStorage::k128BitSolo | 6,
+  xr7  = RegStorage::k128BitSolo | 7,
 
   // TODO: as needed, add 256, 512 and 1024-bit xmm views.
 };
@@ -221,14 +221,14 @@
 constexpr RegStorage rs_dr6(RegStorage::kValid | dr6);
 constexpr RegStorage rs_dr7(RegStorage::kValid | dr7);
 
-constexpr RegStorage rs_qr0(RegStorage::kValid | qr0);
-constexpr RegStorage rs_qr1(RegStorage::kValid | qr1);
-constexpr RegStorage rs_qr2(RegStorage::kValid | qr2);
-constexpr RegStorage rs_qr3(RegStorage::kValid | qr3);
-constexpr RegStorage rs_qr4(RegStorage::kValid | qr4);
-constexpr RegStorage rs_qr5(RegStorage::kValid | qr5);
-constexpr RegStorage rs_qr6(RegStorage::kValid | qr6);
-constexpr RegStorage rs_qr7(RegStorage::kValid | qr7);
+constexpr RegStorage rs_xr0(RegStorage::kValid | xr0);  // Valid handles for the 128-bit xr0-xr7.
+constexpr RegStorage rs_xr1(RegStorage::kValid | xr1);
+constexpr RegStorage rs_xr2(RegStorage::kValid | xr2);
+constexpr RegStorage rs_xr3(RegStorage::kValid | xr3);
+constexpr RegStorage rs_xr4(RegStorage::kValid | xr4);
+constexpr RegStorage rs_xr5(RegStorage::kValid | xr5);
+constexpr RegStorage rs_xr6(RegStorage::kValid | xr6);
+constexpr RegStorage rs_xr7(RegStorage::kValid | xr7);
 
 extern X86NativeRegisterPool rX86_ARG0;
 extern X86NativeRegisterPool rX86_ARG1;
@@ -418,9 +418,39 @@
   Binary0fOpCode(kX86Divsd),    // double divide
   Binary0fOpCode(kX86Divss),    // float divide
   Binary0fOpCode(kX86Punpckldq),  // Interleave low-order double words
-  kX86PsrlqRI,                  // right shift of floating point registers
-  kX86PsllqRI,                  // left shift of floating point registers
-  kX86SqrtsdRR,                 // sqrt of floating point register
+  Binary0fOpCode(kX86Sqrtsd),   // square root
+  Binary0fOpCode(kX86Pmulld),   // parallel integer multiply 32 bits x 4
+  Binary0fOpCode(kX86Pmullw),   // parallel integer multiply 16 bits x 8
+  Binary0fOpCode(kX86Mulps),    // parallel FP multiply 32 bits x 4
+  Binary0fOpCode(kX86Mulpd),    // parallel FP multiply 64 bits x 2
+  Binary0fOpCode(kX86Paddb),    // parallel integer addition 8 bits x 16
+  Binary0fOpCode(kX86Paddw),    // parallel integer addition 16 bits x 8
+  Binary0fOpCode(kX86Paddd),    // parallel integer addition 32 bits x 4
+  Binary0fOpCode(kX86Addps),    // parallel FP addition 32 bits x 4
+  Binary0fOpCode(kX86Addpd),    // parallel FP addition 64 bits x 2
+  Binary0fOpCode(kX86Psubb),    // parallel integer subtraction 8 bits x 16
+  Binary0fOpCode(kX86Psubw),    // parallel integer subtraction 16 bits x 8
+  Binary0fOpCode(kX86Psubd),    // parallel integer subtraction 32 bits x 4
+  Binary0fOpCode(kX86Subps),    // parallel FP subtraction 32 bits x 4
+  Binary0fOpCode(kX86Subpd),    // parallel FP subtraction 64 bits x 2
+  Binary0fOpCode(kX86Pand),     // parallel AND 128 bits x 1
+  Binary0fOpCode(kX86Por),      // parallel OR 128 bits x 1
+  Binary0fOpCode(kX86Pxor),     // parallel XOR 128 bits x 1
+  Binary0fOpCode(kX86Phaddw),   // parallel horizontal addition 16 bits x 8
+  Binary0fOpCode(kX86Phaddd),   // parallel horizontal addition 32 bits x 4
+  kX86PextrbRRI,                // Extract 8 bits from XMM into GPR
+  kX86PextrwRRI,                // Extract 16 bits from XMM into GPR
+  kX86PextrdRRI,                // Extract 32 bits from XMM into GPR
+  kX86PshuflwRRI,               // Shuffle 16 bits in lower 64 bits of XMM.
+  kX86PshufdRRI,                // Shuffle 32 bits in XMM.
+  kX86PsrawRI,                  // signed right shift of floating point registers 16 bits x 8
+  kX86PsradRI,                  // signed right shift of floating point registers 32 bits x 4
+  kX86PsrlwRI,                  // logical right shift of floating point registers 16 bits x 8
+  kX86PsrldRI,                  // logical right shift of floating point registers 32 bits x 4
+  kX86PsrlqRI,                  // logical right shift of floating point registers 64 bits x 2
+  kX86PsllwRI,                  // left shift of floating point registers 16 bits x 8
+  kX86PslldRI,                  // left shift of floating point registers 32 bits x 4
+  kX86PsllqRI,                  // left shift of floating point registers 64 bits x 2
   kX86Fild32M,                  // push 32-bit integer on x87 stack
   kX86Fild64M,                  // push 64-bit integer on x87 stack
   kX86Fstp32M,                  // pop top x87 fp stack and do 32-bit store
diff --git a/compiler/dex/reg_storage.h b/compiler/dex/reg_storage.h
index df21343..2f7e701 100644
--- a/compiler/dex/reg_storage.h
+++ b/compiler/dex/reg_storage.h
@@ -280,6 +280,11 @@
     return RegStorage(k32BitSolo, (reg_num & kRegNumMask) | kFloatingPoint);
   }
 
+  // Create a 128-bit solo; reg_num is masked with kRegTypeMask and no kFloatingPoint bit is added.
+  static RegStorage Solo128(int reg_num) {
+    return RegStorage(k128BitSolo, reg_num & kRegTypeMask);
+  }
+
   // Create a 64-bit solo.
   static RegStorage Solo64(int reg_num) {
     return RegStorage(k64BitSolo, reg_num & kRegTypeMask);
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 5cc6acf..cba4ebf 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -363,10 +363,49 @@
         src_reg_file = dst_reg_file = SSE;
         break;
       case 0x38:  // 3 byte extended opcode
-        opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+        instr++;
+        if (prefix[2] == 0x66) {
+          switch (*instr) {
+            case 0x40:
+              opcode << "pmulld";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            default:
+              opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+          }
+        } else {
+          opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+        }
         break;
       case 0x3A:  // 3 byte extended opcode
-        opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+        instr++;
+        if (prefix[2] == 0x66) {
+          switch (*instr) {
+            case 0x14:  // 66 0F 3A 14 /r ib: pextrb r/m8, xmm, imm8
+              opcode << "pextrb";
+              prefix[2] = 0;  // 0x66 is part of the opcode, not an operand-size override
+              has_modrm = true;
+              store = true;  // destination is the r/m operand (GPR or memory)
+              src_reg_file = SSE;  // ModRM.reg is the XMM source; r/m stays in the default file
+              immediate_bytes = 1;
+              break;
+            case 0x16:  // 66 0F 3A 16 /r ib: pextrd r/m32, xmm, imm8
+              opcode << "pextrd";
+              prefix[2] = 0;  // 0x66 is part of the opcode, not an operand-size override
+              has_modrm = true;
+              store = true;  // destination is the r/m operand (GPR or memory)
+              src_reg_file = SSE;  // ModRM.reg is the XMM source; r/m stays in the default file
+              immediate_bytes = 1;
+              break;
+            default:
+              opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+          }
+        } else {
+          opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+        }
         break;
       case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
       case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F:
@@ -467,11 +506,11 @@
         break;
       case 0x6F:
         if (prefix[2] == 0x66) {
-          dst_reg_file = SSE;
+          src_reg_file = dst_reg_file = SSE;
           opcode << "movdqa";
           prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF3) {
-          dst_reg_file = SSE;
+          src_reg_file = dst_reg_file = SSE;
           opcode << "movdqu";
           prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else {
@@ -481,6 +520,25 @@
         load = true;
         has_modrm = true;
         break;
+      case 0x70:  // pshufd / pshuflw xmm1, xmm2/m128, imm8 (prefix selects the variant)
+        if (prefix[2] == 0x66) {
+          opcode << "pshufd";
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+          has_modrm = true;
+          load = true;  // ModRM.reg is the destination, r/m the source (same form as movdqa)
+          src_reg_file = dst_reg_file = SSE;
+          immediate_bytes = 1;
+        } else if (prefix[0] == 0xF2) {
+          opcode << "pshuflw";
+          prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
+          has_modrm = true;
+          load = true;  // ModRM.reg is the destination, r/m the source (same form as movdqa)
+          src_reg_file = dst_reg_file = SSE;
+          immediate_bytes = 1;
+        } else {
+          opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+        }
+        break;
       case 0x71:
         if (prefix[2] == 0x66) {
           dst_reg_file = SSE;
@@ -603,6 +661,18 @@
       case 0xB7: opcode << "movzxw"; has_modrm = true; load = true; break;
       case 0xBE: opcode << "movsxb"; has_modrm = true; load = true; break;
       case 0xBF: opcode << "movsxw"; has_modrm = true; load = true; break;
+      case 0xC5:  // 66 0F C5 /r ib: pextrw r32, xmm, imm8
+        if (prefix[2] == 0x66) {
+          opcode << "pextrw";
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+          has_modrm = true;
+          load = true;  // ModRM.reg is the GPR destination, r/m the XMM source
+          src_reg_file = SSE;  // only the r/m operand is XMM; NOTE(review): assumes GPR default
+          immediate_bytes = 1;
+        } else {
+          opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+        }
+        break;
       case 0xC7:
         static const char* x0FxC7_opcodes[] = { "unknown-0f-c7", "cmpxchg8b", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7" };
         modrm_opcodes = x0FxC7_opcodes;
@@ -614,6 +684,125 @@
         opcode << "bswap";
         reg_in_opcode = true;
         break;
+      case 0xDB:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "pand";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xD5:
+        if (prefix[2] == 0x66) {
+          opcode << "pmullw";
+          prefix[2] = 0;
+          has_modrm = true;
+          load = true;
+          src_reg_file = dst_reg_file = SSE;
+        } else {
+          opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+        }
+        break;
+      case 0xEB:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "por";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xEF:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "pxor";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xF8:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "psubb";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xF9:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "psubw";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xFA:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "psubd";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xFC:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "paddb";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xFD:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "paddw";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xFE:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "paddd";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
       default:
         opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
         break;