Merge "Implement all vector instructions for X86"
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 9200106..91a66d3 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -279,6 +279,11 @@
{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0 }, #opname "RM", "!0r,[!1r+!2d]" }, \
{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0 }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
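+// Same as EXT_0F_ENCODING_MAP above, but with a second opcode byte for the three-byte
+// 0x0F 0x38 opcode space (SSSE3/SSE4.1 forms such as PHADDW and PMULLD).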
+#define EXT_0F_ENCODING2_MAP(opname, prefix, opcode, opcode2, reg_def) \
+{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RR", "!0r,!1r" }, \
+{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RM", "!0r,[!1r+!2d]" }, \
+{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
+
EXT_0F_ENCODING_MAP(Movsd, 0xF2, 0x10, REG_DEF0),
{ kX86MovsdMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0xF2, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovsdMR", "[!0r+!1d],!2r" },
{ kX86MovsdAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0xF2, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovsdAR", "[!0r+!1r<<!2d+!3d],!4r" },
@@ -310,10 +315,42 @@
EXT_0F_ENCODING_MAP(Divsd, 0xF2, 0x5E, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Divss, 0xF3, 0x5E, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Sqrtsd, 0xF2, 0x51, REG_DEF0_USE0),
+ EXT_0F_ENCODING2_MAP(Pmulld, 0x66, 0x38, 0x40, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Pmullw, 0x66, 0xD5, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Mulps, 0x00, 0x59, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Mulpd, 0x66, 0x59, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Paddb, 0x66, 0xFC, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Paddw, 0x66, 0xFD, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Paddd, 0x66, 0xFE, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Addps, 0x00, 0x58, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Addpd, 0x66, 0x58, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Psubb, 0x66, 0xF8, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Psubw, 0x66, 0xF9, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Psubd, 0x66, 0xFA, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Subps, 0x00, 0x5C, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Subpd, 0x66, 0x5C, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Pand, 0x66, 0xDB, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Por, 0x66, 0xEB, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Pxor, 0x66, 0xEF, REG_DEF0_USE0),
+ EXT_0F_ENCODING2_MAP(Phaddw, 0x66, 0x38, 0x01, REG_DEF0_USE0),
+ EXT_0F_ENCODING2_MAP(Phaddd, 0x66, 0x38, 0x02, REG_DEF0_USE0),
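+ // Extract one packed lane from an XMM register into a GPR; the immediate selects the lane.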
+ { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1 }, "PextrbRRI", "!0r,!1r,!2d" },
+ { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1 }, "PextrwRRI", "!0r,!1r,!2d" },
+ { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1 }, "PextrdRRI", "!0r,!1r,!2d" },
+
+ { kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1 }, "PshuflwRRI", "!0r,!1r,!2d" },
+ { kX86PshufdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x70, 0, 0, 0, 1 }, "PshufdRRI", "!0r,!1r,!2d" },
+
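+ // Packed shifts by immediate: the ModRM opcode extension selects the operation (/2 logical right, /4 arithmetic right, /6 left).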
+ { kX86PsrawRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 4, 0, 1 }, "PsrawRI", "!0r,!1d" },
+ { kX86PsradRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 4, 0, 1 }, "PsradRI", "!0r,!1d" },
+ { kX86PsrlwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 2, 0, 1 }, "PsrlwRI", "!0r,!1d" },
+ { kX86PsrldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 2, 0, 1 }, "PsrldRI", "!0r,!1d" },
{ kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
+ { kX86PsllwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 6, 0, 1 }, "PsllwRI", "!0r,!1d" },
+ { kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1 }, "PslldRI", "!0r,!1d" },
{ kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
- { kX86SqrtsdRR, kRegReg, IS_BINARY_OP | REG_DEF0_USE1, { 0xF2, 0, 0x0F, 0x51, 0, 0, 0, 0 }, "SqrtsdRR", "!0r,!1r" },
{ kX86Fild32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDB, 0x00, 0, 0, 0, 0 }, "Fild32M", "[!0r,!1d]" },
{ kX86Fild64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDF, 0x00, 0, 5, 0, 0 }, "Fild64M", "[!0r,!1d]" },
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 72cdbbd..1807d5c 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -429,6 +429,136 @@
void GenConst128(BasicBlock* bb, MIR* mir);
/*
+ * @brief Move one vectorized register to another.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpMoveVector.
+ * @note vA: TypeSize
+ * @note vB: destination
+ * @note vC: source
+ */
+ void GenMoveVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed multiply of units in two vector registers: vB = vB .* vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedMultiply.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: source
+ */
+ void GenMultiplyVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedAddition.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: source
+ */
+ void GenAddVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedSubtract.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: source
+ */
+ void GenSubtractVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed shift left of units in two vector registers: vB = vB .<< vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedShiftLeft.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: immediate
+ */
+ void GenShiftLeftVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed signed shift right of units in two vector registers: vB = vB .>> vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedSignedShiftRight.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: immediate
+ */
+ void GenSignedShiftRightVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed unsigned shift right of units in two vector registers: vB = vB .>>> vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedUnsignedShiftRight.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: immediate
+ */
+ void GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedAnd.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: source
+ */
+ void GenAndVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedOr.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: source
+ */
+ void GenOrVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedXor.
+ * @note vA: TypeSize
+ * @note vB: destination and source
+ * @note vC: source
+ */
+ void GenXorVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Reduce a 128-bit packed vector into a single VR by adding up its packed elements.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedAddReduce.
+ * @details Instruction does a horizontal addition of the packed elements and then adds it to VR.
+ * @note vA: TypeSize
+ * @note vB: destination and source VR (not vector register)
+ * @note vC: source (vector register)
+ */
+ void GenAddReduceVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Extract a packed element into a single VR.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedReduce.
+ * @note vA: TypeSize
+ * @note vB: destination VR (not vector register)
+ * @note vC: source (vector register)
+ * @note arg[0]: The index to use for extraction from vector register (which packed element).
+ */
+ void GenReduceVector(BasicBlock *bb, MIR *mir);
+
+ /*
+ * @brief Create a vector with all elements of width TypeSize set to the value held in vC.
+ * @param bb The basic block in which the MIR is from.
+ * @param mir The MIR whose opcode is kMirOpPackedSet.
+ * @note vA: TypeSize.
+ * @note vB: destination vector register.
+ * @note vC: source VR (not vector register).
+ */
+ void GenSetVector(BasicBlock *bb, MIR *mir);
+
+ /*
* @brief Generate code for a vector opcode.
* @param bb The basic block in which the MIR is from.
* @param mir The MIR whose opcode is a non-standard opcode.
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index e7a629a..889ea8b 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -81,6 +81,16 @@
#endif
};
+static const RegStorage xp_temps_arr_32[] = {
+ rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
+};
+static const RegStorage xp_temps_arr_64[] = {
+ rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
+#ifdef TARGET_REX_SUPPORT
+ rs_xr8, rs_xr9, rs_xr10, rs_xr11, rs_xr12, rs_xr13, rs_xr14, rs_xr15
+#endif
+};
+
static const std::vector<RegStorage> empty_pool;
static const std::vector<RegStorage> core_regs_32(core_regs_arr_32,
core_regs_arr_32 + sizeof(core_regs_arr_32) / sizeof(core_regs_arr_32[0]));
@@ -111,6 +121,11 @@
static const std::vector<RegStorage> dp_temps_64(dp_temps_arr_64,
dp_temps_arr_64 + sizeof(dp_temps_arr_64) / sizeof(dp_temps_arr_64[0]));
+static const std::vector<RegStorage> xp_temps_32(xp_temps_arr_32,
+ xp_temps_arr_32 + sizeof(xp_temps_arr_32) / sizeof(xp_temps_arr_32[0]));
+static const std::vector<RegStorage> xp_temps_64(xp_temps_arr_64,
+ xp_temps_arr_64 + sizeof(xp_temps_arr_64) / sizeof(xp_temps_arr_64[0]));
+
RegStorage rs_rX86_SP;
X86NativeRegisterPool rX86_ARG0;
@@ -209,7 +224,7 @@
/* Double registers in x86 are just a single FP register */
seed = 1;
/* FP register starts at bit position 16 */
- shift = reg.IsFloat() ? kX86FPReg0 : 0;
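+ /* 128-bit xmm views are not flagged as float, so also map registers wider than 8 bytes */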
+ shift = (reg.IsFloat() || reg.StorageSize() > 8) ? kX86FPReg0 : 0;
/* Expand the double register id into single offset */
shift += reg_id;
return (seed << shift);
@@ -542,17 +557,31 @@
// Target-specific adjustments.
+ // Add in XMM registers.
+ const std::vector<RegStorage> *xp_temps = Gen64Bit() ? &xp_temps_64 : &xp_temps_32;
+ for (RegStorage reg : *xp_temps) {
+ RegisterInfo* info = new (arena_) RegisterInfo(reg, GetRegMaskCommon(reg));
+ reginfo_map_.Put(reg.GetReg(), info);
+ info->SetIsTemp(true);
+ }
+
// Alias single precision xmm to double xmms.
// TODO: as needed, add larger vector sizes - alias all to the largest.
GrowableArray<RegisterInfo*>::Iterator it(®_pool_->sp_regs_);
for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
int sp_reg_num = info->GetReg().GetRegNum();
+ RegStorage xp_reg = RegStorage::Solo128(sp_reg_num);
+ RegisterInfo* xp_reg_info = GetRegInfo(xp_reg);
+ // 128-bit xmm vector register's master storage should refer to itself.
+ DCHECK_EQ(xp_reg_info, xp_reg_info->Master());
+
+ // Redirect 32-bit vector's master storage to 128-bit vector.
+ info->SetMaster(xp_reg_info);
+
RegStorage dp_reg = RegStorage::Solo64(RegStorage::kFloatingPoint | sp_reg_num);
RegisterInfo* dp_reg_info = GetRegInfo(dp_reg);
- // 64-bit xmm vector register's master storage should refer to itself.
- DCHECK_EQ(dp_reg_info, dp_reg_info->Master());
- // Redirect 32-bit vector's master storage to 64-bit vector.
- info->SetMaster(dp_reg_info);
+ // Redirect 64-bit vector's master storage to 128-bit vector.
+ dp_reg_info->SetMaster(xp_reg_info);
}
// Don't start allocating temps at r0/s0/d0 or you may clobber return regs in early-exit methods.
@@ -1240,6 +1269,45 @@
case kMirOpConstVector:
GenConst128(bb, mir);
break;
+ case kMirOpMoveVector:
+ GenMoveVector(bb, mir);
+ break;
+ case kMirOpPackedMultiply:
+ GenMultiplyVector(bb, mir);
+ break;
+ case kMirOpPackedAddition:
+ GenAddVector(bb, mir);
+ break;
+ case kMirOpPackedSubtract:
+ GenSubtractVector(bb, mir);
+ break;
+ case kMirOpPackedShiftLeft:
+ GenShiftLeftVector(bb, mir);
+ break;
+ case kMirOpPackedSignedShiftRight:
+ GenSignedShiftRightVector(bb, mir);
+ break;
+ case kMirOpPackedUnsignedShiftRight:
+ GenUnsignedShiftRightVector(bb, mir);
+ break;
+ case kMirOpPackedAnd:
+ GenAndVector(bb, mir);
+ break;
+ case kMirOpPackedOr:
+ GenOrVector(bb, mir);
+ break;
+ case kMirOpPackedXor:
+ GenXorVector(bb, mir);
+ break;
+ case kMirOpPackedAddReduce:
+ GenAddReduceVector(bb, mir);
+ break;
+ case kMirOpPackedReduce:
+ GenReduceVector(bb, mir);
+ break;
+ case kMirOpPackedSet:
+ GenSetVector(bb, mir);
+ break;
default:
break;
}
@@ -1249,9 +1317,9 @@
int type_size = mir->dalvikInsn.vA;
// We support 128 bit vectors.
DCHECK_EQ(type_size & 0xFFFF, 128);
- int reg = mir->dalvikInsn.vB;
- DCHECK_LT(reg, 8);
+ RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
uint32_t *args = mir->dalvikInsn.arg;
+ int reg = rs_dest.GetReg();
// Check for all 0 case.
if (args[0] == 0 && args[1] == 0 && args[2] == 0 && args[3] == 0) {
NewLIR2(kX86XorpsRR, reg, reg);
@@ -1277,6 +1345,287 @@
SetMemRefType(load, true, kLiteral);
}
+void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) {
+ // We only support 128 bit registers.
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vC);
+ NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg());
+}
+
+void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ int opcode = 0;
+ switch (opsize) {
+ case k32:
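+ // PMULLD requires SSE4.1.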
+ opcode = kX86PmulldRR;
+ break;
+ case kSignedHalf:
+ opcode = kX86PmullwRR;
+ break;
+ case kSingle:
+ opcode = kX86MulpsRR;
+ break;
+ case kDouble:
+ opcode = kX86MulpdRR;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector multiply " << opsize;
+ break;
+ }
+ NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ int opcode = 0;
+ switch (opsize) {
+ case k32:
+ opcode = kX86PadddRR;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ opcode = kX86PaddwRR;
+ break;
+ case kUnsignedByte:
+ case kSignedByte:
+ opcode = kX86PaddbRR;
+ break;
+ case kSingle:
+ opcode = kX86AddpsRR;
+ break;
+ case kDouble:
+ opcode = kX86AddpdRR;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector addition " << opsize;
+ break;
+ }
+ NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ int opcode = 0;
+ switch (opsize) {
+ case k32:
+ opcode = kX86PsubdRR;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ opcode = kX86PsubwRR;
+ break;
+ case kUnsignedByte:
+ case kSignedByte:
+ opcode = kX86PsubbRR;
+ break;
+ case kSingle:
+ opcode = kX86SubpsRR;
+ break;
+ case kDouble:
+ opcode = kX86SubpdRR;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector subtraction " << opsize;
+ break;
+ }
+ NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ int imm = mir->dalvikInsn.vC;
+ int opcode = 0;
+ switch (opsize) {
+ case k32:
+ opcode = kX86PslldRI;
+ break;
+ case k64:
+ opcode = kX86PsllqRI;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ opcode = kX86PsllwRI;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector shift left " << opsize;
+ break;
+ }
+ NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+}
+
+void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ int imm = mir->dalvikInsn.vC;
+ int opcode = 0;
+ switch (opsize) {
+ case k32:
+ opcode = kX86PsradRI;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ opcode = kX86PsrawRI;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector signed shift right " << opsize;
+ break;
+ }
+ NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+}
+
+void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ int imm = mir->dalvikInsn.vC;
+ int opcode = 0;
+ switch (opsize) {
+ case k32:
+ opcode = kX86PsrldRI;
+ break;
+ case k64:
+ opcode = kX86PsrlqRI;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ opcode = kX86PsrlwRI;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector unsigned shift right " << opsize;
+ break;
+ }
+ NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+}
+
+void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) {
+ // We only support 128 bit registers.
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) {
+ // We only support 128 bit registers.
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) {
+ // We only support 128 bit registers.
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+ int opcode = 0;
+ switch (opsize) {
+ case k32:
+ opcode = kX86PhadddRR;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ opcode = kX86PhaddwRR;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector add reduce " << opsize;
+ break;
+ }
+ NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
+ int index = mir->dalvikInsn.arg[0];
+ int opcode = 0;
+ switch (opsize) {
+ case k32:
+ opcode = kX86PextrdRRI;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ opcode = kX86PextrwRRI;
+ break;
+ case kUnsignedByte:
+ case kSignedByte:
+ opcode = kX86PextrbRRI;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector reduce " << opsize;
+ break;
+ }
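+ // Note: pextrb/pextrd require SSE4.1; pextrw uses the older SSE2 encoding (0F C5).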
+ // We need to extract to a GPR.
+ RegStorage temp = AllocTemp();
+ NewLIR3(opcode, temp.GetReg(), rs_src.GetReg(), index);
+
+ // Assume that the destination VR is in the def for the mir.
+ RegLocation rl_dest = mir_graph_->GetDest(mir);
+ RegLocation rl_temp =
+ {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, temp, INVALID_SREG, INVALID_SREG};
+ StoreValue(rl_dest, rl_temp);
+}
+
+void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
+ DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+ OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+ RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
+ int op_low = 0, op_high = 0;
+ switch (opsize) {
+ case k32:
+ op_low = kX86PshufdRRI;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ // Handles low quadword.
+ op_low = kX86PshuflwRRI;
+ // Handles upper quadword.
+ op_high = kX86PshufdRRI;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector set " << opsize;
+ break;
+ }
+
+ // Load the value from the VR into a GPR.
+ RegLocation rl_src = mir_graph_->GetSrc(mir, 0);
+ rl_src = LoadValue(rl_src, kCoreReg);
+
+ // Load the value into the XMM register.
+ NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), rl_src.reg.GetReg());
+
+ // Now shuffle the value across the destination.
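+ // An immediate of 0 selects element 0 for every lane, broadcasting the value.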
+ NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+
+ // And then repeat as needed.
+ if (op_high != 0) {
+ NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+ }
+}
+
LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index adfed0c..430bc7d 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -151,7 +151,7 @@
rRET = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 16,
#endif
- // xmm registers, single precision view
+ // xmm registers, single precision view.
fr0 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 0,
fr1 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 1,
fr2 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 2,
@@ -161,7 +161,7 @@
fr6 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 6,
fr7 = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 7,
- // xmm registers, double precision alises
+ // xmm registers, double precision aliases.
dr0 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 0,
dr1 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 1,
dr2 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 2,
@@ -171,15 +171,15 @@
dr6 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 6,
dr7 = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 7,
- // xmm registers, quad precision alises
- qr0 = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 0,
- qr1 = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 1,
- qr2 = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 2,
- qr3 = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 3,
- qr4 = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 4,
- qr5 = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 5,
- qr6 = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 6,
- qr7 = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 7,
+ // xmm registers, 128-bit aliases.
+ xr0 = RegStorage::k128BitSolo | 0,
+ xr1 = RegStorage::k128BitSolo | 1,
+ xr2 = RegStorage::k128BitSolo | 2,
+ xr3 = RegStorage::k128BitSolo | 3,
+ xr4 = RegStorage::k128BitSolo | 4,
+ xr5 = RegStorage::k128BitSolo | 5,
+ xr6 = RegStorage::k128BitSolo | 6,
+ xr7 = RegStorage::k128BitSolo | 7,
// TODO: as needed, add 256, 512 and 1024-bit xmm views.
};
@@ -221,14 +221,14 @@
constexpr RegStorage rs_dr6(RegStorage::kValid | dr6);
constexpr RegStorage rs_dr7(RegStorage::kValid | dr7);
-constexpr RegStorage rs_qr0(RegStorage::kValid | qr0);
-constexpr RegStorage rs_qr1(RegStorage::kValid | qr1);
-constexpr RegStorage rs_qr2(RegStorage::kValid | qr2);
-constexpr RegStorage rs_qr3(RegStorage::kValid | qr3);
-constexpr RegStorage rs_qr4(RegStorage::kValid | qr4);
-constexpr RegStorage rs_qr5(RegStorage::kValid | qr5);
-constexpr RegStorage rs_qr6(RegStorage::kValid | qr6);
-constexpr RegStorage rs_qr7(RegStorage::kValid | qr7);
+constexpr RegStorage rs_xr0(RegStorage::kValid | xr0);
+constexpr RegStorage rs_xr1(RegStorage::kValid | xr1);
+constexpr RegStorage rs_xr2(RegStorage::kValid | xr2);
+constexpr RegStorage rs_xr3(RegStorage::kValid | xr3);
+constexpr RegStorage rs_xr4(RegStorage::kValid | xr4);
+constexpr RegStorage rs_xr5(RegStorage::kValid | xr5);
+constexpr RegStorage rs_xr6(RegStorage::kValid | xr6);
+constexpr RegStorage rs_xr7(RegStorage::kValid | xr7);
extern X86NativeRegisterPool rX86_ARG0;
extern X86NativeRegisterPool rX86_ARG1;
@@ -418,9 +418,39 @@
Binary0fOpCode(kX86Divsd), // double divide
Binary0fOpCode(kX86Divss), // float divide
Binary0fOpCode(kX86Punpckldq), // Interleave low-order double words
- kX86PsrlqRI, // right shift of floating point registers
- kX86PsllqRI, // left shift of floating point registers
- kX86SqrtsdRR, // sqrt of floating point register
+ Binary0fOpCode(kX86Sqrtsd), // square root
+ Binary0fOpCode(kX86Pmulld), // parallel integer multiply 32 bits x 4
+ Binary0fOpCode(kX86Pmullw), // parallel integer multiply 16 bits x 8
+ Binary0fOpCode(kX86Mulps), // parallel FP multiply 32 bits x 4
+ Binary0fOpCode(kX86Mulpd), // parallel FP multiply 64 bits x 2
+ Binary0fOpCode(kX86Paddb), // parallel integer addition 8 bits x 16
+ Binary0fOpCode(kX86Paddw), // parallel integer addition 16 bits x 8
+ Binary0fOpCode(kX86Paddd), // parallel integer addition 32 bits x 4
+ Binary0fOpCode(kX86Addps), // parallel FP addition 32 bits x 4
+ Binary0fOpCode(kX86Addpd), // parallel FP addition 64 bits x 2
+ Binary0fOpCode(kX86Psubb), // parallel integer subtraction 8 bits x 16
+ Binary0fOpCode(kX86Psubw), // parallel integer subtraction 16 bits x 8
+ Binary0fOpCode(kX86Psubd), // parallel integer subtraction 32 bits x 4
+ Binary0fOpCode(kX86Subps), // parallel FP subtraction 32 bits x 4
+ Binary0fOpCode(kX86Subpd), // parallel FP subtraction 64 bits x 2
+ Binary0fOpCode(kX86Pand), // parallel AND 128 bits x 1
+ Binary0fOpCode(kX86Por), // parallel OR 128 bits x 1
+ Binary0fOpCode(kX86Pxor), // parallel XOR 128 bits x 1
+ Binary0fOpCode(kX86Phaddw), // parallel horizontal addition 16 bits x 8
+ Binary0fOpCode(kX86Phaddd), // parallel horizontal addition 32 bits x 4
+ kX86PextrbRRI, // Extract 8 bits from XMM into GPR
+ kX86PextrwRRI, // Extract 16 bits from XMM into GPR
+ kX86PextrdRRI, // Extract 32 bits from XMM into GPR
+ kX86PshuflwRRI, // Shuffle 16 bits in lower 64 bits of XMM.
+ kX86PshufdRRI, // Shuffle 32 bits in XMM.
+ kX86PsrawRI, // signed right shift of floating point registers 16 bits x 8
+ kX86PsradRI, // signed right shift of floating point registers 32 bits x 4
+ kX86PsrlwRI, // logical right shift of floating point registers 16 bits x 8
+ kX86PsrldRI, // logical right shift of floating point registers 32 bits x 4
+ kX86PsrlqRI, // logical right shift of floating point registers 64 bits x 2
+ kX86PsllwRI, // left shift of floating point registers 16 bits x 8
+ kX86PslldRI, // left shift of floating point registers 32 bits x 4
+ kX86PsllqRI, // left shift of floating point registers 64 bits x 2
kX86Fild32M, // push 32-bit integer on x87 stack
kX86Fild64M, // push 64-bit integer on x87 stack
kX86Fstp32M, // pop top x87 fp stack and do 32-bit store
diff --git a/compiler/dex/reg_storage.h b/compiler/dex/reg_storage.h
index df21343..2f7e701 100644
--- a/compiler/dex/reg_storage.h
+++ b/compiler/dex/reg_storage.h
@@ -280,6 +280,11 @@
return RegStorage(k32BitSolo, (reg_num & kRegNumMask) | kFloatingPoint);
}
+ // Create a 128-bit solo.
+ static RegStorage Solo128(int reg_num) {
+ return RegStorage(k128BitSolo, reg_num & kRegTypeMask);
+ }
+
// Create a 64-bit solo.
static RegStorage Solo64(int reg_num) {
return RegStorage(k64BitSolo, reg_num & kRegTypeMask);
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 5cc6acf..cba4ebf 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -363,10 +363,49 @@
src_reg_file = dst_reg_file = SSE;
break;
case 0x38: // 3 byte extended opcode
- opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+ instr++;
+ if (prefix[2] == 0x66) {
+ switch (*instr) {
+ case 0x40:
+ opcode << "pmulld";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ src_reg_file = dst_reg_file = SSE;
+ break;
+ default:
+ opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+ }
+ } else {
+ opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+ }
break;
case 0x3A: // 3 byte extended opcode
- opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+ instr++;
+ if (prefix[2] == 0x66) {
+ switch (*instr) {
+ case 0x14:
+ opcode << "pextrb";
+ prefix[2] = 0;
+ has_modrm = true;
+ store = true;
+ dst_reg_file = SSE;
+ immediate_bytes = 1;
+ break;
+ case 0x16:
+ opcode << "pextrd";
+ prefix[2] = 0;
+ has_modrm = true;
+ store = true;
+ dst_reg_file = SSE;
+ immediate_bytes = 1;
+ break;
+ default:
+ opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+ }
+ } else {
+ opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+ }
break;
case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F:
@@ -467,11 +506,11 @@
break;
case 0x6F:
if (prefix[2] == 0x66) {
- dst_reg_file = SSE;
+ src_reg_file = dst_reg_file = SSE;
opcode << "movdqa";
prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
} else if (prefix[0] == 0xF3) {
- dst_reg_file = SSE;
+ src_reg_file = dst_reg_file = SSE;
opcode << "movdqu";
prefix[0] = 0; // clear prefix now it's served its purpose as part of the opcode
} else {
@@ -481,6 +520,25 @@
load = true;
has_modrm = true;
break;
+ case 0x70:
+ if (prefix[2] == 0x66) {
+ opcode << "pshufd";
+ prefix[2] = 0;
+ has_modrm = true;
+ store = true;
+ src_reg_file = dst_reg_file = SSE;
+ immediate_bytes = 1;
+ } else if (prefix[0] == 0xF2) {
+ opcode << "pshuflw";
+ prefix[0] = 0;
+ has_modrm = true;
+ store = true;
+ src_reg_file = dst_reg_file = SSE;
+ immediate_bytes = 1;
+ } else {
+ opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+ }
+ break;
case 0x71:
if (prefix[2] == 0x66) {
dst_reg_file = SSE;
@@ -603,6 +661,18 @@
case 0xB7: opcode << "movzxw"; has_modrm = true; load = true; break;
case 0xBE: opcode << "movsxb"; has_modrm = true; load = true; break;
case 0xBF: opcode << "movsxw"; has_modrm = true; load = true; break;
+ case 0xC5:
+ if (prefix[2] == 0x66) {
+ opcode << "pextrw";
+ prefix[2] = 0;
+ has_modrm = true;
+ store = true;
+ src_reg_file = dst_reg_file = SSE;
+ immediate_bytes = 1;
+ } else {
+ opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+ }
+ break;
case 0xC7:
static const char* x0FxC7_opcodes[] = { "unknown-0f-c7", "cmpxchg8b", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7" };
modrm_opcodes = x0FxC7_opcodes;
@@ -614,6 +684,125 @@
opcode << "bswap";
reg_in_opcode = true;
break;
+ case 0xD5:
+ if (prefix[2] == 0x66) {
+ opcode << "pmullw";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ src_reg_file = dst_reg_file = SSE;
+ } else {
+ opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+ }
+ break;
+ case 0xDB:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "pand";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
+ case 0xEB:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "por";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
+ case 0xEF:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "pxor";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
+ case 0xF8:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "psubb";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
+ case 0xF9:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "psubw";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
+ case 0xFA:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "psubd";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
+ case 0xFC:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "paddb";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
+ case 0xFD:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "paddw";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
+ case 0xFE:
+ if (prefix[2] == 0x66) {
+ src_reg_file = dst_reg_file = SSE;
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else {
+ src_reg_file = dst_reg_file = MMX;
+ }
+ opcode << "paddd";
+ prefix[2] = 0;
+ has_modrm = true;
+ load = true;
+ break;
default:
opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
break;