x86_64: GenInlinedCas must use wide rl_src_offset under 64-bit targets

This patch makes GenInlinedCas use a wide rl_src_offset for both int
and long types on 64-bit targets, and fixes Movzx8 and Movsx8 to apply
r8_form to the second register only.

Change-Id: Ib8c0756609100f9bc5c228f1eb391421416f3af6
Signed-off-by: Chao-ying Fu <chao-ying.fu@intel.com>
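
For reviewers, a minimal sketch of the byte-register rule behind the
assemble_x86.cc change below; the helper and register numbers here are
illustrative, not code from this patch. On x86-64 an empty REX prefix (0x40)
is needed to address SPL/BPL/SIL/DIL (register numbers 4-7) as byte
registers, but the destination of Movzx8/Movsx8 is a full-width register, so
only the byte-sized second operand can require that prefix:

  #include <cstdint>
  #include <cstdio>

  // Illustrative check: when does a reg-reg byte instruction need an empty
  // REX prefix to select SPL/BPL/SIL/DIL instead of AH/CH/DH/BH?
  static bool NeedsEmptyRex(uint8_t dest_num, uint8_t src_num,
                            bool byte_second_operand_only) {
    if (byte_second_operand_only) {
      // Movzx8/Movsx8: the destination is 16/32/64-bit wide, so only the
      // byte-sized source (the second register) matters.
      return src_num >= 4;
    }
    // Ordinary r8,r8 instructions: either operand can be a byte register.
    return dest_num >= 4 || src_num >= 4;
  }

  int main() {
    printf("%d\n", NeedsEmptyRex(0, 6, true));   // movzx eax, sil -> 1 (REX)
    printf("%d\n", NeedsEmptyRex(6, 0, true));   // movzx esi, al  -> 0
    printf("%d\n", NeedsEmptyRex(6, 0, false));  // mov sil, al    -> 1 (REX)
    return 0;
  }

Before this patch the assembler applied the first-operand check to
Movzx8/Movsx8 as well, wrongly treating a full-width destination numbered
4-7 as a byte register.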
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 4e973d8..8df5b6d 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -327,6 +327,13 @@
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
 { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
 
+// Special encoding for Movzx8 and Movsx8: r8_form applies to the second
+// register only.
+#define EXT_0F_R8_FORM_ENCODING_MAP(opname, prefix, opcode, reg_def) \
+{ kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, true }, #opname "RR", "!0r,!1r" }, \
+{ kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
+{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
+
 #define EXT_0F_REX_W_ENCODING_MAP(opname, prefix, opcode, reg_def) \
 { kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \
@@ -488,9 +495,9 @@
   { kX86LockCmpxchg64A, kArray,   IS_STORE | IS_QUAD_OP | REG_USE01 | REG_DEFAD_USEAD | REG_USEC | REG_USEB | SETS_CCODES,  { 0xF0, 0, 0x0F, 0xC7, 0, 1, 0, 0, false }, "Lock Cmpxchg8b", "[!0r+!1r<<!2d+!3d]" },
   { kX86XchgMR, kMemReg,          IS_STORE | IS_LOAD | IS_TERTIARY_OP | REG_DEF2 | REG_USE02,          { 0, 0, 0x87, 0, 0, 0, 0, 0, false }, "Xchg", "[!0r+!1d],!2r" },
 
-  EXT_0F_ENCODING_MAP(Movzx8,  0x00, 0xB6, REG_DEF0),
+  EXT_0F_R8_FORM_ENCODING_MAP(Movzx8,  0x00, 0xB6, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movzx16, 0x00, 0xB7, REG_DEF0),
-  EXT_0F_ENCODING_MAP(Movsx8,  0x00, 0xBE, REG_DEF0),
+  EXT_0F_R8_FORM_ENCODING_MAP(Movsx8,  0x00, 0xBE, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movsx16, 0x00, 0xBF, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movzx8q,  REX_W, 0xB6, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movzx16q, REX_W, 0xB7, REG_DEF0),
@@ -593,6 +600,10 @@
   }
 }
 
+static bool IsByteSecondOperand(const X86EncodingMap* entry) {
+  return StartsWith(entry->name, "Movzx8") || StartsWith(entry->name, "Movsx8");
+}
+
 size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int32_t raw_reg, int32_t raw_index,
                                int32_t raw_base, int32_t displacement) {
   bool has_modrm = HasModrm(entry);
@@ -613,7 +624,8 @@
     bool registers_need_rex_prefix = NeedsRex(raw_reg) || NeedsRex(raw_index) || NeedsRex(raw_base);
     if (r8_form) {
       // Do we need an empty REX prefix to normalize byte registers?
-      registers_need_rex_prefix = registers_need_rex_prefix || (RegStorage::RegNum(raw_reg) >= 4);
+      registers_need_rex_prefix = registers_need_rex_prefix ||
+          (RegStorage::RegNum(raw_reg) >= 4 && !IsByteSecondOperand(entry));
       registers_need_rex_prefix = registers_need_rex_prefix ||
           (modrm_is_reg_reg && (RegStorage::RegNum(raw_base) >= 4));
     }
@@ -877,7 +889,7 @@
   uint8_t rex = 0;
   if (r8_form) {
     // Do we need an empty REX prefix to normalize byte register addressing?
-    if (RegStorage::RegNum(raw_reg_r) >= 4) {
+    if (RegStorage::RegNum(raw_reg_r) >= 4 && !IsByteSecondOperand(entry)) {
       rex |= 0x40;  // REX.0000
     } else if (modrm_is_reg_reg && RegStorage::RegNum(raw_reg_b) >= 4) {
       rex |= 0x40;  // REX.0000
@@ -1167,7 +1179,9 @@
 }
 
 void X86Mir2Lir::EmitRegReg(const X86EncodingMap* entry, int32_t raw_reg1, int32_t raw_reg2) {
-  CheckValidByteRegister(entry, raw_reg1);
+  if (!IsByteSecondOperand(entry)) {
+    CheckValidByteRegister(entry, raw_reg1);
+  }
   CheckValidByteRegister(entry, raw_reg2);
   EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2);
   uint8_t low_reg1 = LowRegisterBits(raw_reg1);
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index b416a7b..f1166f6 100755
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -836,14 +836,12 @@
 
 bool X86Mir2Lir::GenInlinedCas(CallInfo* info, bool is_long, bool is_object) {
   DCHECK(cu_->instruction_set == kX86 || cu_->instruction_set == kX86_64);
-  if (cu_->instruction_set == kX86_64) {
-    return false;  // TODO: Verify working on x86-64.
-  }
-
   // Unused - RegLocation rl_src_unsafe = info->args[0];
   RegLocation rl_src_obj = info->args[1];  // Object - known non-null
   RegLocation rl_src_offset = info->args[2];  // long low
-  rl_src_offset = NarrowRegLoc(rl_src_offset);  // ignore high half in info->args[3]
+  if (!cu_->target64) {
+    rl_src_offset = NarrowRegLoc(rl_src_offset);  // ignore high half in info->args[3]
+  }
   RegLocation rl_src_expected = info->args[4];  // int, long or Object
   // If is_long, high half is in info->args[5]
   RegLocation rl_src_new_value = info->args[is_long ? 6 : 5];  // int, long or Object
@@ -851,21 +849,21 @@
 
   if (is_long && cu_->target64) {
     // RAX must hold expected for CMPXCHG. Neither rl_new_value, nor r_ptr may be in RAX.
-    FlushReg(rs_r0);
-    Clobber(rs_r0);
-    LockTemp(rs_r0);
+    FlushReg(rs_r0q);
+    Clobber(rs_r0q);
+    LockTemp(rs_r0q);
 
     RegLocation rl_object = LoadValue(rl_src_obj, kRefReg);
     RegLocation rl_new_value = LoadValueWide(rl_src_new_value, kCoreReg);
-    RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
-    LoadValueDirectWide(rl_src_expected, rs_r0);
+    RegLocation rl_offset = LoadValueWide(rl_src_offset, kCoreReg);
+    LoadValueDirectWide(rl_src_expected, rs_r0q);
     NewLIR5(kX86LockCmpxchg64AR, rl_object.reg.GetReg(), rl_offset.reg.GetReg(), 0, 0, rl_new_value.reg.GetReg());
 
     // After a store we need to insert barrier in case of potential load. Since the
     // locked cmpxchg has full barrier semantics, only a scheduling barrier will be generated.
     GenMemBarrier(kStoreLoad);
 
-    FreeTemp(rs_r0);
+    FreeTemp(rs_r0q);
   } else if (is_long) {
     // TODO: avoid unnecessary loads of SI and DI when the values are in registers.
     // TODO: CFI support.
@@ -947,7 +945,12 @@
       LockTemp(rs_r0);
     }
 
-    RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
+    RegLocation rl_offset;
+    if (cu_->target64) {
+      rl_offset = LoadValueWide(rl_src_offset, kCoreReg);
+    } else {
+      rl_offset = LoadValue(rl_src_offset, kCoreReg);
+    }
     LoadValueDirect(rl_src_expected, rs_r0);
     NewLIR5(kX86LockCmpxchgAR, rl_object.reg.GetReg(), rl_offset.reg.GetReg(), 0, 0, rl_new_value.reg.GetReg());
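
As background for the GenInlinedCas change above, a sketch of why the offset
must be loaded wide on 64-bit targets (the helper names and values are
illustrative only): the Unsafe offset argument is a Java long, and narrowing
it before forming [obj + offset] would drop the high half of the offset.

  #include <cstdint>
  #include <cstdio>

  // Hypothetical illustration: address computed with a narrowed (32-bit)
  // offset register versus a wide (64-bit) one.
  static uint64_t NarrowedAddress(uint64_t obj, uint64_t offset) {
    return obj + static_cast<uint32_t>(offset);  // high 32 bits of offset lost
  }

  static uint64_t WideAddress(uint64_t obj, uint64_t offset) {
    return obj + offset;  // all 64 bits of the offset participate
  }

  int main() {
    const uint64_t obj = 0x7f0000000000ULL;
    const uint64_t offset = 0x100000010ULL;  // contrived: high half set
    printf("narrow: 0x%llx\n", (unsigned long long) NarrowedAddress(obj, offset));
    printf("wide:   0x%llx\n", (unsigned long long) WideAddress(obj, offset));
    return 0;
  }

The long path likewise switches the RAX bookkeeping to the 64-bit view
(rs_r0q), since CMPXCHG on a long value compares against all of RAX.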