Specializing x86 range argument copying
The ARM implementation of range argument copying is specialized for some
cases, while all other architectures fall back to generating a memcpy call.
This patch updates the x86 implementation so that it no longer calls memcpy
and instead generates loads and stores directly, favoring 128-bit chunks.
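
For reference, the chunk-size decision implemented in the gen_invoke.cc hunk
below works roughly as follows (a minimal standalone sketch; the function and
variable names here are illustrative only, not part of the patch):

    #include <cstddef>
    #include <cstdio>

    // Sketch of the x86 copy loop's chunk selection. Offsets are byte
    // offsets from SP; the stack is 16-byte aligned, so (offset & 0xF) == 0
    // means the access itself is 16-byte aligned.
    static void CopyArgWords(int src_offset, int dest_offset, int words_left) {
      while (words_left > 0) {
        bool src16 = (src_offset & 0xF) == 0;
        bool dest16 = (dest_offset & 0xF) == 0;
        size_t bytes;
        if (words_left == 4 || (words_left > 4 && (src16 || dest16))) {
          // 128-bit move via an xmm temp (movaps, movlps+movhps, or movups,
          // depending on the alignment of each side).
          bytes = 16;
          std::printf("128-bit: src+%d -> dest+%d\n", src_offset, dest_offset);
        } else {
          // 32-bit move via a GP register, used to reach 16-byte alignment.
          bytes = 4;
          std::printf(" 32-bit: src+%d -> dest+%d\n", src_offset, dest_offset);
        }
        src_offset += bytes;
        dest_offset += bytes;
        words_left -= bytes >> 2;
      }
    }

    int main() {
      CopyArgWords(/*src_offset=*/20, /*dest_offset=*/16, /*words_left=*/7);
      return 0;
    }

When exactly four words remain there is no later opportunity to re-align, so
a single (possibly unaligned) 128-bit move is still preferred over four
32-bit moves.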
Change-Id: Ic891e5609a4b0e81a47c29cc5a9b301bd10a1933
Signed-off-by: Razvan A Lupusoru <razvan.a.lupusoru@intel.com>
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 18122b3..2bc36a5 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -210,6 +210,22 @@
kOpInvalid,
};
+enum MoveType {
+ kMov8GP, // Move 8-bit general purpose register.
+ kMov16GP, // Move 16-bit general purpose register.
+ kMov32GP, // Move 32-bit general purpose register.
+ kMov64GP, // Move 64-bit general purpose register.
+ kMov32FP, // Move 32-bit FP register.
+ kMov64FP, // Move 64-bit FP register.
+ kMovLo64FP, // Move low 32-bits of 64-bit FP register.
+ kMovHi64FP, // Move high 32-bits of 64-bit FP register.
+ kMovU128FP, // Move 128-bit FP register to/from possibly unaligned region.
+ kMov128FP = kMovU128FP,
+  kMovA128FP,   // Move 128-bit FP register to/from region known to be 16-byte aligned.
+ kMovLo128FP, // Move low 64-bits of 128-bit FP register.
+ kMovHi128FP, // Move high 64-bits of 128-bit FP register.
+};
+
std::ostream& operator<<(std::ostream& os, const OpKind& kind);
enum ConditionCode {
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 32673db..0ed4576 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -154,6 +154,8 @@
LIR* OpRegImm(OpKind op, int r_dest_src1, int value);
LIR* OpRegMem(OpKind op, int r_dest, int rBase, int offset);
LIR* OpRegReg(OpKind op, int r_dest_src1, int r_src2);
+ LIR* OpMovRegMem(int r_dest, int r_base, int offset, MoveType move_type);
+ LIR* OpMovMemReg(int r_base, int offset, int r_src, MoveType move_type);
LIR* OpCondRegReg(OpKind op, ConditionCode cc, int r_dest, int r_src);
LIR* OpRegRegImm(OpKind op, int r_dest, int r_src1, int value);
LIR* OpRegRegReg(OpKind op, int r_dest, int r_src1, int r_src2);
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 07fc6c7..9d3968b 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -367,6 +367,16 @@
return OpRegRegShift(op, r_dest_src1, r_src2, 0);
}
+LIR* ArmMir2Lir::OpMovRegMem(int r_dest, int r_base, int offset, MoveType move_type) {
+ UNIMPLEMENTED(FATAL);
+ return nullptr;
+}
+
+LIR* ArmMir2Lir::OpMovMemReg(int r_base, int offset, int r_src, MoveType move_type) {
+ UNIMPLEMENTED(FATAL);
+ return nullptr;
+}
+
LIR* ArmMir2Lir::OpCondRegReg(OpKind op, ConditionCode cc, int r_dest, int r_src) {
LOG(FATAL) << "Unexpected use of OpCondRegReg for Arm";
return NULL;
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 3823fb3..6382dd6 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -811,42 +811,145 @@
}
}
+ // Logic below assumes that Method pointer is at offset zero from SP.
+ DCHECK_EQ(VRegOffset(static_cast<int>(kVRegMethodPtrBaseReg)), 0);
+
+ // The first 3 arguments are passed via registers.
+  // TODO: For 64-bit, instead of hardcoding 4 for the Method* size, we should use either
+  // the size of uintptr_t or the size of an object reference, according to the model being used.
+ int outs_offset = 4 /* Method* */ + (3 * sizeof(uint32_t));
int start_offset = SRegOffset(info->args[3].s_reg_low);
- int outs_offset = 4 /* Method* */ + (3 * 4);
- if (cu_->instruction_set != kThumb2) {
+ int regs_left_to_pass_via_stack = info->num_arg_words - 3;
+ DCHECK_GT(regs_left_to_pass_via_stack, 0);
+
+ if (cu_->instruction_set == kThumb2 && regs_left_to_pass_via_stack <= 16) {
+ // Use vldm/vstm pair using kArg3 as a temp
+ call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
+ direct_code, direct_method, type);
+ OpRegRegImm(kOpAdd, TargetReg(kArg3), TargetReg(kSp), start_offset);
+ LIR* ld = OpVldm(TargetReg(kArg3), regs_left_to_pass_via_stack);
+ // TUNING: loosen barrier
+ ld->u.m.def_mask = ENCODE_ALL;
+ SetMemRefType(ld, true /* is_load */, kDalvikReg);
+ call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
+ direct_code, direct_method, type);
+ OpRegRegImm(kOpAdd, TargetReg(kArg3), TargetReg(kSp), 4 /* Method* */ + (3 * 4));
+ call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
+ direct_code, direct_method, type);
+ LIR* st = OpVstm(TargetReg(kArg3), regs_left_to_pass_via_stack);
+ SetMemRefType(st, false /* is_load */, kDalvikReg);
+ st->u.m.def_mask = ENCODE_ALL;
+ call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
+ direct_code, direct_method, type);
+ } else if (cu_->instruction_set == kX86) {
+ int current_src_offset = start_offset;
+ int current_dest_offset = outs_offset;
+
+ while (regs_left_to_pass_via_stack > 0) {
+ // This is based on the knowledge that the stack itself is 16-byte aligned.
+ bool src_is_16b_aligned = (current_src_offset & 0xF) == 0;
+ bool dest_is_16b_aligned = (current_dest_offset & 0xF) == 0;
+ size_t bytes_to_move;
+
+ /*
+       * The amount to move defaults to 32-bit. If there are 4 registers left to move, then do
+       * a 128-bit move because we won't get another chance to align. If there are more than
+       * 4 registers left to move, consider a 128-bit move only if either src or dest is aligned.
+ * We do this because we could potentially do a smaller move to align.
+ */
+ if (regs_left_to_pass_via_stack == 4 ||
+ (regs_left_to_pass_via_stack > 4 && (src_is_16b_aligned || dest_is_16b_aligned))) {
+ // Moving 128-bits via xmm register.
+ bytes_to_move = sizeof(uint32_t) * 4;
+
+ // Allocate a free xmm temp. Since we are working through the calling sequence,
+ // we expect to have an xmm temporary available.
+ int temp = AllocTempDouble();
+ CHECK_GT(temp, 0);
+
+ LIR* ld1 = nullptr;
+ LIR* ld2 = nullptr;
+ LIR* st1 = nullptr;
+ LIR* st2 = nullptr;
+
+ /*
+ * The logic is similar for both loads and stores. If we have 16-byte alignment,
+ * do an aligned move. If we have 8-byte alignment, then do the move in two
+ * parts. This approach prevents possible cache line splits. Finally, fall back
+ * to doing an unaligned move. In most cases we likely won't split the cache
+ * line but we cannot prove it and thus take a conservative approach.
+ */
+ bool src_is_8b_aligned = (current_src_offset & 0x7) == 0;
+ bool dest_is_8b_aligned = (current_dest_offset & 0x7) == 0;
+
+ if (src_is_16b_aligned) {
+ ld1 = OpMovRegMem(temp, TargetReg(kSp), current_src_offset, kMovA128FP);
+ } else if (src_is_8b_aligned) {
+ ld1 = OpMovRegMem(temp, TargetReg(kSp), current_src_offset, kMovLo128FP);
+ ld2 = OpMovRegMem(temp, TargetReg(kSp), current_src_offset + (bytes_to_move >> 1), kMovHi128FP);
+ } else {
+ ld1 = OpMovRegMem(temp, TargetReg(kSp), current_src_offset, kMovU128FP);
+ }
+
+ if (dest_is_16b_aligned) {
+ st1 = OpMovMemReg(TargetReg(kSp), current_dest_offset, temp, kMovA128FP);
+ } else if (dest_is_8b_aligned) {
+ st1 = OpMovMemReg(TargetReg(kSp), current_dest_offset, temp, kMovLo128FP);
+ st2 = OpMovMemReg(TargetReg(kSp), current_dest_offset + (bytes_to_move >> 1), temp, kMovHi128FP);
+ } else {
+ st1 = OpMovMemReg(TargetReg(kSp), current_dest_offset, temp, kMovU128FP);
+ }
+
+        // TODO: If we could keep track of aliasing information for memory accesses that are wider
+        // than 64 bits, we wouldn't need to set up a barrier.
+ if (ld1 != nullptr) {
+ if (ld2 != nullptr) {
+ // For 64-bit load we can actually set up the aliasing information.
+ AnnotateDalvikRegAccess(ld1, current_src_offset >> 2, true, true);
+ AnnotateDalvikRegAccess(ld2, (current_src_offset + (bytes_to_move >> 1)) >> 2, true, true);
+ } else {
+ // Set barrier for 128-bit load.
+ SetMemRefType(ld1, true /* is_load */, kDalvikReg);
+ ld1->u.m.def_mask = ENCODE_ALL;
+ }
+ }
+ if (st1 != nullptr) {
+ if (st2 != nullptr) {
+ // For 64-bit store we can actually set up the aliasing information.
+ AnnotateDalvikRegAccess(st1, current_dest_offset >> 2, false, true);
+ AnnotateDalvikRegAccess(st2, (current_dest_offset + (bytes_to_move >> 1)) >> 2, false, true);
+ } else {
+ // Set barrier for 128-bit store.
+ SetMemRefType(st1, false /* is_load */, kDalvikReg);
+ st1->u.m.def_mask = ENCODE_ALL;
+ }
+ }
+
+ // Free the temporary used for the data movement.
+ FreeTemp(temp);
+ } else {
+ // Moving 32-bits via general purpose register.
+ bytes_to_move = sizeof(uint32_t);
+
+ // Instead of allocating a new temp, simply reuse one of the registers being used
+ // for argument passing.
+ int temp = TargetReg(kArg3);
+
+ // Now load the argument VR and store to the outs.
+ LoadWordDisp(TargetReg(kSp), current_src_offset, temp);
+ StoreWordDisp(TargetReg(kSp), current_dest_offset, temp);
+ }
+
+ current_src_offset += bytes_to_move;
+ current_dest_offset += bytes_to_move;
+ regs_left_to_pass_via_stack -= (bytes_to_move >> 2);
+ }
+ } else {
// Generate memcpy
OpRegRegImm(kOpAdd, TargetReg(kArg0), TargetReg(kSp), outs_offset);
OpRegRegImm(kOpAdd, TargetReg(kArg1), TargetReg(kSp), start_offset);
CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(pMemcpy), TargetReg(kArg0),
TargetReg(kArg1), (info->num_arg_words - 3) * 4, false);
- } else {
- if (info->num_arg_words >= 20) {
- // Generate memcpy
- OpRegRegImm(kOpAdd, TargetReg(kArg0), TargetReg(kSp), outs_offset);
- OpRegRegImm(kOpAdd, TargetReg(kArg1), TargetReg(kSp), start_offset);
- CallRuntimeHelperRegRegImm(QUICK_ENTRYPOINT_OFFSET(pMemcpy), TargetReg(kArg0),
- TargetReg(kArg1), (info->num_arg_words - 3) * 4, false);
- } else {
- // Use vldm/vstm pair using kArg3 as a temp
- int regs_left = std::min(info->num_arg_words - 3, 16);
- call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
- direct_code, direct_method, type);
- OpRegRegImm(kOpAdd, TargetReg(kArg3), TargetReg(kSp), start_offset);
- LIR* ld = OpVldm(TargetReg(kArg3), regs_left);
- // TUNING: loosen barrier
- ld->u.m.def_mask = ENCODE_ALL;
- SetMemRefType(ld, true /* is_load */, kDalvikReg);
- call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
- direct_code, direct_method, type);
- OpRegRegImm(kOpAdd, TargetReg(kArg3), TargetReg(kSp), 4 /* Method* */ + (3 * 4));
- call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
- direct_code, direct_method, type);
- LIR* st = OpVstm(TargetReg(kArg3), regs_left);
- SetMemRefType(st, false /* is_load */, kDalvikReg);
- st->u.m.def_mask = ENCODE_ALL;
- call_state = next_call_insn(cu_, info, call_state, target_method, vtable_idx,
- direct_code, direct_method, type);
- }
}
call_state = LoadArgRegs(info, call_state, next_call_insn,
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index aca93f5..11b8f83 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -151,6 +151,8 @@
LIR* OpRegImm(OpKind op, int r_dest_src1, int value);
LIR* OpRegMem(OpKind op, int r_dest, int rBase, int offset);
LIR* OpRegReg(OpKind op, int r_dest_src1, int r_src2);
+ LIR* OpMovRegMem(int r_dest, int r_base, int offset, MoveType move_type);
+ LIR* OpMovMemReg(int r_base, int offset, int r_src, MoveType move_type);
LIR* OpCondRegReg(OpKind op, ConditionCode cc, int r_dest, int r_src);
LIR* OpRegRegImm(OpKind op, int r_dest, int r_src1, int value);
LIR* OpRegRegReg(OpKind op, int r_dest, int r_src1, int r_src2);
diff --git a/compiler/dex/quick/mips/utility_mips.cc b/compiler/dex/quick/mips/utility_mips.cc
index c5e2b36..21c971c 100644
--- a/compiler/dex/quick/mips/utility_mips.cc
+++ b/compiler/dex/quick/mips/utility_mips.cc
@@ -325,6 +325,16 @@
return NewLIR2(opcode, r_dest_src1, r_src2);
}
+LIR* MipsMir2Lir::OpMovRegMem(int r_dest, int r_base, int offset, MoveType move_type) {
+ UNIMPLEMENTED(FATAL);
+ return nullptr;
+}
+
+LIR* MipsMir2Lir::OpMovMemReg(int r_base, int offset, int r_src, MoveType move_type) {
+ UNIMPLEMENTED(FATAL);
+ return nullptr;
+}
+
LIR* MipsMir2Lir::OpCondRegReg(OpKind op, ConditionCode cc, int r_dest, int r_src) {
LOG(FATAL) << "Unexpected use of OpCondRegReg for MIPS";
return NULL;
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 09b501a..3a68044 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -946,6 +946,27 @@
virtual LIR* OpRegReg(OpKind op, int r_dest_src1, int r_src2) = 0;
/**
+ * @brief Used to generate an LIR that does a load from mem to reg.
+ * @param r_dest The destination physical register.
+ * @param r_base The base physical register for memory operand.
+ * @param offset The displacement for memory operand.
+   * @param move_type Specification of the desired move (size, alignment, register kind).
+   * @return Returns the generated move LIR.
+ */
+ virtual LIR* OpMovRegMem(int r_dest, int r_base, int offset, MoveType move_type) = 0;
+
+ /**
+ * @brief Used to generate an LIR that does a store from reg to mem.
+ * @param r_base The base physical register for memory operand.
+ * @param offset The displacement for memory operand.
+   * @param r_src The source physical register.
+   * @param move_type Specification of the desired move (size, alignment, register kind).
+   * @return Returns the generated move LIR.
+ */
+ virtual LIR* OpMovMemReg(int r_base, int offset, int r_src, MoveType move_type) = 0;
+
+ /**
* @brief Used for generating a conditional register to register operation.
* @param op The opcode kind.
* @param cc The condition code that when true will perform the opcode.
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 3058b0c..ae53ddb 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -297,6 +297,24 @@
{ kX86SqrtsdRR, kRegReg, IS_BINARY_OP | REG_DEF0_USE1, { 0xF2, 0, 0x0F, 0x51, 0, 0, 0, 0 }, "SqrtsdRR", "!0r,!1r" },
{ kX86FstpdM, kMem, IS_STORE | IS_BINARY_OP | REG_USE0, { 0x0, 0, 0xDD, 0x00, 0, 3, 0, 0 }, "FstpdM", "[!0r,!1d]" },
+ EXT_0F_ENCODING_MAP(Movups, 0x0, 0x10, REG_DEF0),
+ { kX86MovupsMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x0, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovupsMR", "[!0r+!1d],!2r" },
+ { kX86MovupsAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x0, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovupsAR", "[!0r+!1r<<!2d+!3d],!4r" },
+
+ EXT_0F_ENCODING_MAP(Movaps, 0x0, 0x28, REG_DEF0),
+ { kX86MovapsMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x0, 0, 0x0F, 0x29, 0, 0, 0, 0 }, "MovapsMR", "[!0r+!1d],!2r" },
+ { kX86MovapsAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x0, 0, 0x0F, 0x29, 0, 0, 0, 0 }, "MovapsAR", "[!0r+!1r<<!2d+!3d],!4r" },
+
+ { kX86MovlpsRM, kRegMem, IS_LOAD | IS_TERTIARY_OP | REG_DEF0 | REG_USE01, { 0x0, 0, 0x0F, 0x12, 0, 0, 0, 0 }, "MovlpsRM", "!0r,[!1r+!2d]" },
+ { kX86MovlpsRA, kRegArray, IS_LOAD | IS_QUIN_OP | REG_DEF0 | REG_USE012, { 0x0, 0, 0x0F, 0x12, 0, 0, 0, 0 }, "MovlpsRA", "!0r,[!1r+!2r<<!3d+!4d]" },
+ { kX86MovlpsMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x0, 0, 0x0F, 0x13, 0, 0, 0, 0 }, "MovlpsMR", "[!0r+!1d],!2r" },
+ { kX86MovlpsAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x0, 0, 0x0F, 0x13, 0, 0, 0, 0 }, "MovlpsAR", "[!0r+!1r<<!2d+!3d],!4r" },
+
+ { kX86MovhpsRM, kRegMem, IS_LOAD | IS_TERTIARY_OP | REG_DEF0 | REG_USE01, { 0x0, 0, 0x0F, 0x16, 0, 0, 0, 0 }, "MovhpsRM", "!0r,[!1r+!2d]" },
+ { kX86MovhpsRA, kRegArray, IS_LOAD | IS_QUIN_OP | REG_DEF0 | REG_USE012, { 0x0, 0, 0x0F, 0x16, 0, 0, 0, 0 }, "MovhpsRA", "!0r,[!1r+!2r<<!3d+!4d]" },
+ { kX86MovhpsMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x0, 0, 0x0F, 0x17, 0, 0, 0, 0 }, "MovhpsMR", "[!0r+!1d],!2r" },
+ { kX86MovhpsAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x0, 0, 0x0F, 0x17, 0, 0, 0, 0 }, "MovhpsAR", "[!0r+!1r<<!2d+!3d],!4r" },
+
EXT_0F_ENCODING_MAP(Movdxr, 0x66, 0x6E, REG_DEF0),
{ kX86MovdrxRR, kRegRegStore, IS_BINARY_OP | REG_DEF0 | REG_USE01, { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxRR", "!0r,!1r" },
{ kX86MovdrxMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxMR", "[!0r+!1d],!2r" },
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 6896504..4c1c171 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -207,6 +207,8 @@
LIR* OpMemReg(OpKind op, RegLocation rl_dest, int value);
LIR* OpRegMem(OpKind op, int r_dest, RegLocation value);
LIR* OpRegReg(OpKind op, int r_dest_src1, int r_src2);
+ LIR* OpMovRegMem(int r_dest, int r_base, int offset, MoveType move_type);
+ LIR* OpMovMemReg(int r_base, int offset, int r_src, MoveType move_type);
LIR* OpCondRegReg(OpKind op, ConditionCode cc, int r_dest, int r_src);
LIR* OpRegRegImm(OpKind op, int r_dest, int r_src1, int value);
LIR* OpRegRegReg(OpKind op, int r_dest, int r_src1, int r_src2);
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index 18c5ca8..e2744d0 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -211,6 +211,110 @@
return NewLIR2(opcode, r_dest_src1, r_src2);
}
+LIR* X86Mir2Lir::OpMovRegMem(int r_dest, int r_base, int offset, MoveType move_type) {
+ DCHECK(!(X86_FPREG(r_base)));
+
+ X86OpCode opcode = kX86Nop;
+ switch (move_type) {
+ case kMov8GP:
+ CHECK(!X86_FPREG(r_dest));
+ opcode = kX86Mov8RM;
+ break;
+ case kMov16GP:
+ CHECK(!X86_FPREG(r_dest));
+ opcode = kX86Mov16RM;
+ break;
+ case kMov32GP:
+ CHECK(!X86_FPREG(r_dest));
+ opcode = kX86Mov32RM;
+ break;
+ case kMov32FP:
+ CHECK(X86_FPREG(r_dest));
+ opcode = kX86MovssRM;
+ break;
+ case kMov64FP:
+ CHECK(X86_FPREG(r_dest));
+ opcode = kX86MovsdRM;
+ break;
+ case kMovU128FP:
+ CHECK(X86_FPREG(r_dest));
+ opcode = kX86MovupsRM;
+ break;
+ case kMovA128FP:
+ CHECK(X86_FPREG(r_dest));
+ opcode = kX86MovapsRM;
+ break;
+ case kMovLo128FP:
+ CHECK(X86_FPREG(r_dest));
+ opcode = kX86MovlpsRM;
+ break;
+ case kMovHi128FP:
+ CHECK(X86_FPREG(r_dest));
+ opcode = kX86MovhpsRM;
+ break;
+ case kMov64GP:
+ case kMovLo64FP:
+ case kMovHi64FP:
+ default:
+ LOG(FATAL) << "Bad case in OpMovRegMem";
+ break;
+ }
+
+ return NewLIR3(opcode, r_dest, r_base, offset);
+}
+
+LIR* X86Mir2Lir::OpMovMemReg(int r_base, int offset, int r_src, MoveType move_type) {
+ DCHECK(!(X86_FPREG(r_base)));
+
+ X86OpCode opcode = kX86Nop;
+ switch (move_type) {
+ case kMov8GP:
+ CHECK(!X86_FPREG(r_src));
+ opcode = kX86Mov8MR;
+ break;
+ case kMov16GP:
+ CHECK(!X86_FPREG(r_src));
+ opcode = kX86Mov16MR;
+ break;
+ case kMov32GP:
+ CHECK(!X86_FPREG(r_src));
+ opcode = kX86Mov32MR;
+ break;
+ case kMov32FP:
+ CHECK(X86_FPREG(r_src));
+ opcode = kX86MovssMR;
+ break;
+ case kMov64FP:
+ CHECK(X86_FPREG(r_src));
+ opcode = kX86MovsdMR;
+ break;
+ case kMovU128FP:
+ CHECK(X86_FPREG(r_src));
+ opcode = kX86MovupsMR;
+ break;
+ case kMovA128FP:
+ CHECK(X86_FPREG(r_src));
+ opcode = kX86MovapsMR;
+ break;
+ case kMovLo128FP:
+ CHECK(X86_FPREG(r_src));
+ opcode = kX86MovlpsMR;
+ break;
+ case kMovHi128FP:
+ CHECK(X86_FPREG(r_src));
+ opcode = kX86MovhpsMR;
+ break;
+ case kMov64GP:
+ case kMovLo64FP:
+ case kMovHi64FP:
+ default:
+ LOG(FATAL) << "Bad case in OpMovMemReg";
+ break;
+ }
+
+ return NewLIR3(opcode, r_base, offset, r_src);
+}
+
LIR* X86Mir2Lir::OpCondRegReg(OpKind op, ConditionCode cc, int r_dest, int r_src) {
// The only conditional reg to reg operation supported is Cmov
DCHECK_EQ(op, kOpCmov);
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 7f35d06..6962ff7 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -357,6 +357,14 @@
kX86PsllqRI, // left shift of floating point registers
kX86SqrtsdRR, // sqrt of floating point register
kX86FstpdM, // Store and pop top x87 fp stack
+ Binary0fOpCode(kX86Movups), // load unaligned packed single FP values from xmm2/m128 to xmm1
+ kX86MovupsMR, kX86MovupsAR, // store unaligned packed single FP values from xmm1 to m128
+ Binary0fOpCode(kX86Movaps), // load aligned packed single FP values from xmm2/m128 to xmm1
+ kX86MovapsMR, kX86MovapsAR, // store aligned packed single FP values from xmm1 to m128
+ kX86MovlpsRM, kX86MovlpsRA, // load packed single FP values from m64 to low quadword of xmm
+ kX86MovlpsMR, kX86MovlpsAR, // store packed single FP values from low quadword of xmm to m64
+ kX86MovhpsRM, kX86MovhpsRA, // load packed single FP values from m64 to high quadword of xmm
+ kX86MovhpsMR, kX86MovhpsAR, // store packed single FP values from high quadword of xmm to m64
Binary0fOpCode(kX86Movdxr), // move into xmm from gpr
kX86MovdrxRR, kX86MovdrxMR, kX86MovdrxAR, // move into reg from xmm
kX86Set8R, kX86Set8M, kX86Set8A, // set byte depending on condition operand
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 6c25e0a..903d755 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -246,6 +246,42 @@
load = *instr == 0x10;
store = !load;
break;
+ case 0x12: case 0x13:
+ if (prefix[2] == 0x66) {
+ opcode << "movlpd";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0) {
+ opcode << "movlps";
+ }
+ has_modrm = true;
+ src_reg_file = dst_reg_file = SSE;
+ load = *instr == 0x12;
+ store = !load;
+ break;
+ case 0x16: case 0x17:
+ if (prefix[2] == 0x66) {
+ opcode << "movhpd";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0) {
+ opcode << "movhps";
+ }
+ has_modrm = true;
+ src_reg_file = dst_reg_file = SSE;
+ load = *instr == 0x16;
+ store = !load;
+ break;
+ case 0x28: case 0x29:
+ if (prefix[2] == 0x66) {
+ opcode << "movapd";
+ prefix[2] = 0; // clear prefix now it's served its purpose as part of the opcode
+ } else if (prefix[0] == 0) {
+ opcode << "movaps";
+ }
+ has_modrm = true;
+ src_reg_file = dst_reg_file = SSE;
+ load = *instr == 0x28;
+ store = !load;
+ break;
case 0x2A:
if (prefix[2] == 0x66) {
opcode << "cvtpi2pd";