Allow mixing of thread offsets between 32-bit and 64-bit architectures.

Begin a fuller implementation of x86-64 REX prefixes.
Doesn't implement 64-bit thread offset support for the JNI compiler.

Change-Id: If9af2f08a1833c21ddb4b4077f9b03add1a05147
diff --git a/compiler/utils/arm/assembler_arm.cc b/compiler/utils/arm/assembler_arm.cc
index 872a557..59eb98e 100644
--- a/compiler/utils/arm/assembler_arm.cc
+++ b/compiler/utils/arm/assembler_arm.cc
@@ -1577,7 +1577,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void ArmAssembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void ArmAssembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                        ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -1609,18 +1609,18 @@
   return EmitLoad(this, m_dst, SP, src.Int32Value(), size);
 }
 
-void ArmAssembler::Load(ManagedRegister m_dst, ThreadOffset src, size_t size) {
+void ArmAssembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
   return EmitLoad(this, m_dst, TR, src.Int32Value(), size);
 }
 
-void ArmAssembler::LoadRawPtrFromThread(ManagedRegister m_dst, ThreadOffset offs) {
+void ArmAssembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
   ArmManagedRegister dst = m_dst.AsArm();
   CHECK(dst.IsCoreRegister()) << dst;
   LoadFromOffset(kLoadWord, dst.AsCoreRegister(), TR, offs.Int32Value());
 }
 
-void ArmAssembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
+void ArmAssembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                        ThreadOffset<4> thr_offs,
                                         ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -1630,7 +1630,7 @@
                 SP, fr_offs.Int32Value());
 }
 
-void ArmAssembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void ArmAssembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                       FrameOffset fr_offs,
                                       ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
@@ -1641,7 +1641,7 @@
                 TR, thr_offs.Int32Value());
 }
 
-void ArmAssembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void ArmAssembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                             FrameOffset fr_offs,
                                             ManagedRegister mscratch) {
   ArmManagedRegister scratch = mscratch.AsArm();
@@ -1651,7 +1651,7 @@
                 TR, thr_offs.Int32Value());
 }
 
-void ArmAssembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void ArmAssembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   StoreToOffset(kStoreWord, SP, TR, thr_offs.Int32Value());
 }
 
@@ -1844,7 +1844,7 @@
   // TODO: place reference map on call
 }
 
-void ArmAssembler::Call(ThreadOffset /*offset*/, ManagedRegister /*scratch*/) {
+void ArmAssembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL);
 }
 
@@ -1862,7 +1862,7 @@
   ArmExceptionSlowPath* slow = new ArmExceptionSlowPath(scratch, stack_adjust);
   buffer_.EnqueueSlowPath(slow);
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
-                 TR, Thread::ExceptionOffset().Int32Value());
+                 TR, Thread::ExceptionOffset<4>().Int32Value());
   cmp(scratch.AsCoreRegister(), ShifterOperand(0));
   b(slow->Entry(), NE);
 }
@@ -1878,7 +1878,7 @@
   // Don't care about preserving R0 as this call won't return
   __ mov(R0, ShifterOperand(scratch_.AsCoreRegister()));
   // Set up call to Thread::Current()->pDeliverException
-  __ LoadFromOffset(kLoadWord, R12, TR, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  __ LoadFromOffset(kLoadWord, R12, TR, QUICK_ENTRYPOINT_OFFSET(4, pDeliverException).Int32Value());
   __ blx(R12);
   // Call never returns
   __ bkpt(0);
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index bb9207c..f5be04a 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -35,6 +35,7 @@
   // Data-processing operands - Uninitialized
   ShifterOperand() {
     type_ = -1;
+    encoding_ = 0;
   }
 
   // Data-processing operands - Immediate
@@ -210,7 +211,7 @@
 };
 
 
-class ArmAssembler : public Assembler {
+class ArmAssembler FINAL : public Assembler {
  public:
   ArmAssembler() {}
   virtual ~ArmAssembler() {}
@@ -438,127 +439,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister scratch);
+  void MemoryBarrier(ManagedRegister scratch) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   void EmitType01(Condition cond,
@@ -642,12 +632,12 @@
 };
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class ArmExceptionSlowPath : public SlowPath {
+class ArmExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit ArmExceptionSlowPath(ArmManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {
   }
-  virtual void Emit(Assembler *sp_asm);
+  void Emit(Assembler *sp_asm) OVERRIDE;
  private:
   const ArmManagedRegister scratch_;
   const size_t stack_adjust_;
diff --git a/compiler/utils/arm/constants_arm.h b/compiler/utils/arm/constants_arm.h
index cc795b1..058f945 100644
--- a/compiler/utils/arm/constants_arm.h
+++ b/compiler/utils/arm/constants_arm.h
@@ -242,22 +242,22 @@
   }
 
   // Get the raw instruction bits.
-  inline int32_t InstructionBits() const {
+  int32_t InstructionBits() const {
     return *reinterpret_cast<const int32_t*>(this);
   }
 
   // Set the raw instruction bits to value.
-  inline void SetInstructionBits(int32_t value) {
+  void SetInstructionBits(int32_t value) {
     *reinterpret_cast<int32_t*>(this) = value;
   }
 
   // Read one particular bit out of the instruction bits.
-  inline int Bit(int nr) const {
+  int Bit(int nr) const {
     return (InstructionBits() >> nr) & 1;
   }
 
   // Read a bit field out of the instruction bits.
-  inline int Bits(int shift, int count) const {
+  int Bits(int shift, int count) const {
     return (InstructionBits() >> shift) & ((1 << count) - 1);
   }
 
@@ -265,80 +265,80 @@
   // Accessors for the different named fields used in the ARM encoding.
   // The naming of these accessor corresponds to figure A3-1.
   // Generally applicable fields
-  inline Condition ConditionField() const {
+  Condition ConditionField() const {
     return static_cast<Condition>(Bits(kConditionShift, kConditionBits));
   }
-  inline int TypeField() const { return Bits(kTypeShift, kTypeBits); }
+  int TypeField() const { return Bits(kTypeShift, kTypeBits); }
 
-  inline Register RnField() const { return static_cast<Register>(
+  Register RnField() const { return static_cast<Register>(
                                         Bits(kRnShift, kRnBits)); }
-  inline Register RdField() const { return static_cast<Register>(
+  Register RdField() const { return static_cast<Register>(
                                         Bits(kRdShift, kRdBits)); }
 
   // Fields used in Data processing instructions
-  inline Opcode OpcodeField() const {
+  Opcode OpcodeField() const {
     return static_cast<Opcode>(Bits(kOpcodeShift, kOpcodeBits));
   }
-  inline int SField() const { return Bits(kSShift, kSBits); }
+  int SField() const { return Bits(kSShift, kSBits); }
   // with register
-  inline Register RmField() const {
+  Register RmField() const {
     return static_cast<Register>(Bits(kRmShift, kRmBits));
   }
-  inline Shift ShiftField() const { return static_cast<Shift>(
+  Shift ShiftField() const { return static_cast<Shift>(
                                         Bits(kShiftShift, kShiftBits)); }
-  inline int RegShiftField() const { return Bit(4); }
-  inline Register RsField() const {
+  int RegShiftField() const { return Bit(4); }
+  Register RsField() const {
     return static_cast<Register>(Bits(kRsShift, kRsBits));
   }
-  inline int ShiftAmountField() const { return Bits(kShiftImmShift,
+  int ShiftAmountField() const { return Bits(kShiftImmShift,
                                                     kShiftImmBits); }
   // with immediate
-  inline int RotateField() const { return Bits(kRotateShift, kRotateBits); }
-  inline int Immed8Field() const { return Bits(kImmed8Shift, kImmed8Bits); }
+  int RotateField() const { return Bits(kRotateShift, kRotateBits); }
+  int Immed8Field() const { return Bits(kImmed8Shift, kImmed8Bits); }
 
   // Fields used in Load/Store instructions
-  inline int PUField() const { return Bits(23, 2); }
-  inline int  BField() const { return Bit(22); }
-  inline int  WField() const { return Bit(21); }
-  inline int  LField() const { return Bit(20); }
+  int PUField() const { return Bits(23, 2); }
+  int  BField() const { return Bit(22); }
+  int  WField() const { return Bit(21); }
+  int  LField() const { return Bit(20); }
   // with register uses same fields as Data processing instructions above
   // with immediate
-  inline int Offset12Field() const { return Bits(kOffset12Shift,
+  int Offset12Field() const { return Bits(kOffset12Shift,
                                                  kOffset12Bits); }
   // multiple
-  inline int RlistField() const { return Bits(0, 16); }
+  int RlistField() const { return Bits(0, 16); }
   // extra loads and stores
-  inline int SignField() const { return Bit(6); }
-  inline int HField() const { return Bit(5); }
-  inline int ImmedHField() const { return Bits(8, 4); }
-  inline int ImmedLField() const { return Bits(0, 4); }
+  int SignField() const { return Bit(6); }
+  int HField() const { return Bit(5); }
+  int ImmedHField() const { return Bits(8, 4); }
+  int ImmedLField() const { return Bits(0, 4); }
 
   // Fields used in Branch instructions
-  inline int LinkField() const { return Bits(kLinkShift, kLinkBits); }
-  inline int SImmed24Field() const { return ((InstructionBits() << 8) >> 8); }
+  int LinkField() const { return Bits(kLinkShift, kLinkBits); }
+  int SImmed24Field() const { return ((InstructionBits() << 8) >> 8); }
 
   // Fields used in Supervisor Call instructions
-  inline uint32_t SvcField() const { return Bits(0, 24); }
+  uint32_t SvcField() const { return Bits(0, 24); }
 
   // Field used in Breakpoint instruction
-  inline uint16_t BkptField() const {
+  uint16_t BkptField() const {
     return ((Bits(8, 12) << 4) | Bits(0, 4));
   }
 
   // Field used in 16-bit immediate move instructions
-  inline uint16_t MovwField() const {
+  uint16_t MovwField() const {
     return ((Bits(16, 4) << 12) | Bits(0, 12));
   }
 
   // Field used in VFP float immediate move instruction
-  inline float ImmFloatField() const {
+  float ImmFloatField() const {
     uint32_t imm32 = (Bit(19) << 31) | (((1 << 5) - Bit(18)) << 25) |
                      (Bits(16, 2) << 23) | (Bits(0, 4) << 19);
     return bit_cast<float, uint32_t>(imm32);
   }
 
   // Field used in VFP double immediate move instruction
-  inline double ImmDoubleField() const {
+  double ImmDoubleField() const {
     uint64_t imm64 = (Bit(19)*(1LL << 63)) | (((1LL << 8) - Bit(18)) << 54) |
                      (Bits(16, 2)*(1LL << 52)) | (Bits(0, 4)*(1LL << 48));
     return bit_cast<double, uint64_t>(imm64);
@@ -347,7 +347,7 @@
   // Test for data processing instructions of type 0 or 1.
   // See "ARM Architecture Reference Manual ARMv7-A and ARMv7-R edition",
   // section A5.1 "ARM instruction set encoding".
-  inline bool IsDataProcessing() const {
+  bool IsDataProcessing() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bits(20, 5) & 0x19) != 0x10) &&
@@ -359,47 +359,47 @@
   // Tests for special encodings of type 0 instructions (extra loads and stores,
   // as well as multiplications, synchronization primitives, and miscellaneous).
   // Can only be called for a type 0 or 1 instruction.
-  inline bool IsMiscellaneous() const {
+  bool IsMiscellaneous() const {
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bit(25) == 0) && ((Bits(20, 5) & 0x19) == 0x10) && (Bit(7) == 0));
   }
-  inline bool IsMultiplyOrSyncPrimitive() const {
+  bool IsMultiplyOrSyncPrimitive() const {
     CHECK_EQ(Bits(26, 2), 0);  // Type 0 or 1.
     return ((Bit(25) == 0) && (Bits(4, 4) == 9));
   }
 
   // Test for Supervisor Call instruction.
-  inline bool IsSvc() const {
+  bool IsSvc() const {
     return ((InstructionBits() & 0xff000000) == 0xef000000);
   }
 
   // Test for Breakpoint instruction.
-  inline bool IsBkpt() const {
+  bool IsBkpt() const {
     return ((InstructionBits() & 0xfff000f0) == 0xe1200070);
   }
 
   // VFP register fields.
-  inline SRegister SnField() const {
+  SRegister SnField() const {
     return static_cast<SRegister>((Bits(kRnShift, kRnBits) << 1) + Bit(7));
   }
-  inline SRegister SdField() const {
+  SRegister SdField() const {
     return static_cast<SRegister>((Bits(kRdShift, kRdBits) << 1) + Bit(22));
   }
-  inline SRegister SmField() const {
+  SRegister SmField() const {
     return static_cast<SRegister>((Bits(kRmShift, kRmBits) << 1) + Bit(5));
   }
-  inline DRegister DnField() const {
+  DRegister DnField() const {
     return static_cast<DRegister>(Bits(kRnShift, kRnBits) + (Bit(7) << 4));
   }
-  inline DRegister DdField() const {
+  DRegister DdField() const {
     return static_cast<DRegister>(Bits(kRdShift, kRdBits) + (Bit(22) << 4));
   }
-  inline DRegister DmField() const {
+  DRegister DmField() const {
     return static_cast<DRegister>(Bits(kRmShift, kRmBits) + (Bit(5) << 4));
   }
 
   // Test for VFP data processing or single transfer instructions of type 7.
-  inline bool IsVFPDataProcessingOrSingleTransfer() const {
+  bool IsVFPDataProcessingOrSingleTransfer() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 7);
     return ((Bit(24) == 0) && (Bits(9, 3) == 5));
@@ -408,7 +408,7 @@
   }
 
   // Test for VFP 64-bit transfer instructions of type 6.
-  inline bool IsVFPDoubleTransfer() const {
+  bool IsVFPDoubleTransfer() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 6);
     return ((Bits(21, 4) == 2) && (Bits(9, 3) == 5) &&
@@ -416,20 +416,20 @@
   }
 
   // Test for VFP load and store instructions of type 6.
-  inline bool IsVFPLoadStore() const {
+  bool IsVFPLoadStore() const {
     CHECK_NE(ConditionField(), kSpecialCondition);
     CHECK_EQ(TypeField(), 6);
     return ((Bits(20, 5) & 0x12) == 0x10) && (Bits(9, 3) == 5);
   }
 
   // Special accessors that test for existence of a value.
-  inline bool HasS() const { return SField() == 1; }
-  inline bool HasB() const { return BField() == 1; }
-  inline bool HasW() const { return WField() == 1; }
-  inline bool HasL() const { return LField() == 1; }
-  inline bool HasSign() const { return SignField() == 1; }
-  inline bool HasH() const { return HField() == 1; }
-  inline bool HasLink() const { return LinkField() == 1; }
+  bool HasS() const { return SField() == 1; }
+  bool HasB() const { return BField() == 1; }
+  bool HasW() const { return WField() == 1; }
+  bool HasL() const { return LField() == 1; }
+  bool HasSign() const { return SignField() == 1; }
+  bool HasH() const { return HField() == 1; }
+  bool HasLink() const { return LinkField() == 1; }
 
   // Instructions are read out of a code stream. The only way to get a
   // reference to an instruction is to convert a pointer. There is no way
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index f8b91d7..5b2c8ba 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -155,7 +155,7 @@
   StoreToOffset(scratch.AsCoreRegister(), SP, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreImmediateToThread(ThreadOffset offs, uint32_t imm,
+void Arm64Assembler::StoreImmediateToThread32(ThreadOffset<4> offs, uint32_t imm,
                                             ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -163,7 +163,7 @@
   StoreToOffset(scratch.AsCoreRegister(), TR, offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackOffsetToThread(ThreadOffset tr_offs,
+void Arm64Assembler::StoreStackOffsetToThread32(ThreadOffset<4> tr_offs,
                                               FrameOffset fr_offs,
                                               ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
@@ -172,7 +172,7 @@
   StoreToOffset(scratch.AsCoreRegister(), TR, tr_offs.Int32Value());
 }
 
-void Arm64Assembler::StoreStackPointerToThread(ThreadOffset tr_offs) {
+void Arm64Assembler::StoreStackPointerToThread32(ThreadOffset<4> tr_offs) {
   // Arm64 does not support: "str sp, [dest]" therefore we use IP1 as a temp reg.
   ___ Mov(reg_x(IP1), reg_x(SP));
   StoreToOffset(IP1, TR, tr_offs.Int32Value());
@@ -269,7 +269,7 @@
   return Load(m_dst.AsArm64(), SP, src.Int32Value(), size);
 }
 
-void Arm64Assembler::Load(ManagedRegister m_dst, ThreadOffset src, size_t size) {
+void Arm64Assembler::LoadFromThread32(ManagedRegister m_dst, ThreadOffset<4> src, size_t size) {
   return Load(m_dst.AsArm64(), TR, src.Int32Value(), size);
 }
 
@@ -294,7 +294,7 @@
   LoadFromOffset(dst.AsCoreRegister(), base.AsCoreRegister(), offs.Int32Value());
 }
 
-void Arm64Assembler::LoadRawPtrFromThread(ManagedRegister m_dst, ThreadOffset offs) {
+void Arm64Assembler::LoadRawPtrFromThread32(ManagedRegister m_dst, ThreadOffset<4> offs) {
   Arm64ManagedRegister dst = m_dst.AsArm64();
   CHECK(dst.IsCoreRegister()) << dst;
   LoadFromOffset(dst.AsCoreRegister(), TR, offs.Int32Value());
@@ -322,8 +322,8 @@
   }
 }
 
-void Arm64Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                          ThreadOffset tr_offs,
+void Arm64Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                          ThreadOffset<4> tr_offs,
                                           ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -331,7 +331,7 @@
   StoreToOffset(scratch.AsCoreRegister(), SP, fr_offs.Int32Value());
 }
 
-void Arm64Assembler::CopyRawPtrToThread(ThreadOffset tr_offs,
+void Arm64Assembler::CopyRawPtrToThread32(ThreadOffset<4> tr_offs,
                                         FrameOffset fr_offs,
                                         ManagedRegister m_scratch) {
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
@@ -486,7 +486,7 @@
   ___ Blr(reg_x(scratch.AsCoreRegister()));
 }
 
-void Arm64Assembler::Call(ThreadOffset /*offset*/, ManagedRegister /*scratch*/) {
+void Arm64Assembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL) << "Unimplemented Call() variant";
 }
 
@@ -555,7 +555,7 @@
   Arm64ManagedRegister scratch = m_scratch.AsArm64();
   Arm64Exception *current_exception = new Arm64Exception(scratch, stack_adjust);
   exception_blocks_.push_back(current_exception);
-  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset().Int32Value());
+  LoadFromOffset(scratch.AsCoreRegister(), TR, Thread::ExceptionOffset<4>().Int32Value());
   ___ Cmp(reg_x(scratch.AsCoreRegister()), 0);
   ___ B(current_exception->Entry(), COND_OP(NE));
 }
@@ -569,7 +569,7 @@
   // Pass exception object as argument.
   // Don't care about preserving X0 as this won't return.
   ___ Mov(reg_x(X0), reg_x(exception->scratch_.AsCoreRegister()));
-  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  LoadFromOffset(IP1, TR, QUICK_ENTRYPOINT_OFFSET(8, pDeliverException).Int32Value());
   ___ Blr(reg_x(IP1));
   // Call should never return.
   ___ Brk();
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 44eb6ff..3abcaad 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -79,7 +79,7 @@
 
 class Arm64Exception;
 
-class Arm64Assembler : public Assembler {
+class Arm64Assembler FINAL : public Assembler {
  public:
   Arm64Assembler() : vixl_buf_(new byte[BUF_SIZE]),
   vixl_masm_(new vixl::MacroAssembler(vixl_buf_, BUF_SIZE)) {}
@@ -111,105 +111,97 @@
   // Emit code that will create an activation on the stack.
   void BuildFrame(size_t frame_size, ManagedRegister method_reg,
                   const std::vector<ManagedRegister>& callee_save_regs,
-                  const ManagedRegisterEntrySpills& entry_spills);
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack.
-  void RemoveFrame(size_t frame_size,
-                   const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  void IncreaseFrameSize(size_t adjust);
-  void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines.
-  void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  void StoreRef(FrameOffset dest, ManagedRegister src);
-  void StoreRawPtr(FrameOffset dest, ManagedRegister src);
-  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
-  void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
-  void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
-  void StoreStackPointerToThread(ThreadOffset thr_offs);
-  void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines.
-  void Load(ManagedRegister dest, FrameOffset src, size_t size);
-  void Load(ManagedRegister dest, ThreadOffset src, size_t size);
-  void LoadRef(ManagedRegister dest, FrameOffset  src);
-  void LoadRef(ManagedRegister dest, ManagedRegister base,
-               MemberOffset offs);
-  void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                  Offset offs);
-  void LoadRawPtrFromThread(ManagedRegister dest,
-                            ThreadOffset offs);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
+
   // Copying routines.
-  void Move(ManagedRegister dest, ManagedRegister src, size_t size);
-  void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                            ManagedRegister scratch);
-  void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                          ManagedRegister scratch);
-  void CopyRef(FrameOffset dest, FrameOffset src,
-               ManagedRegister scratch);
-  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
-  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-            ManagedRegister scratch, size_t size);
-  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void Copy(ManagedRegister dest, Offset dest_offset,
-            ManagedRegister src, Offset src_offset,
-            ManagedRegister scratch, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
   void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-            ManagedRegister scratch, size_t size);
-  void MemoryBarrier(ManagedRegister scratch);
+            ManagedRegister scratch, size_t size) OVERRIDE;
+  void MemoryBarrier(ManagedRegister scratch) OVERRIDE;
 
   // Sign extension.
-  void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension.
-  void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current().
-  void GetCurrentThread(ManagedRegister tr);
-  void GetCurrentThread(FrameOffset dest_offset,
-                        ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
   void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                       ManagedRegister in_reg, bool null_allowed);
+                       ManagedRegister in_reg, bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
   void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                       ManagedRegister scratch, bool null_allowed);
+                       ManagedRegister scratch, bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst.
-  void LoadReferenceFromSirt(ManagedRegister dst,
-                             ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  void VerifyObject(ManagedRegister src, bool could_be_null);
-  void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset].
-  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch);
-  void Call(FrameOffset base, Offset offset, ManagedRegister scratch);
-  void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Jump to address (not setting link register)
   void JumpTo(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch);
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   static vixl::Register reg_x(int code) {
diff --git a/compiler/utils/arm64/constants_arm64.h b/compiler/utils/arm64/constants_arm64.h
index c05c2f1..ecf9fbe 100644
--- a/compiler/utils/arm64/constants_arm64.h
+++ b/compiler/utils/arm64/constants_arm64.h
@@ -29,7 +29,7 @@
 namespace art {
 namespace arm64 {
 
-  constexpr unsigned int kCalleeSavedRegsSize = 20;
+constexpr unsigned int kCalleeSavedRegsSize = 20;
 
 }  // arm64
 }  // art
diff --git a/compiler/utils/assembler.cc b/compiler/utils/assembler.cc
index 1921b28..26bdceb 100644
--- a/compiler/utils/assembler.cc
+++ b/compiler/utils/assembler.cc
@@ -122,4 +122,78 @@
   }
 }
 
+void Assembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
+                                         ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                         ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::StoreStackPointerToThread64(ThreadOffset<8> thr_offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                                       ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                                       ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                     ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                     ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void Assembler::CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) {
+  UNIMPLEMENTED(FATAL);
+}
+
 }  // namespace art
diff --git a/compiler/utils/assembler.h b/compiler/utils/assembler.h
index c23fd44..219c87f 100644
--- a/compiler/utils/assembler.h
+++ b/compiler/utils/assembler.h
@@ -374,14 +374,20 @@
   virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
                                      ManagedRegister scratch) = 0;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch) = 0;
+  virtual void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
+                                        ManagedRegister scratch);
+  virtual void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                        ManagedRegister scratch);
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch) = 0;
+  virtual void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
+                                          FrameOffset fr_offs,
+                                          ManagedRegister scratch);
+  virtual void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                          FrameOffset fr_offs,
+                                          ManagedRegister scratch);
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs) = 0;
+  virtual void StoreStackPointerToThread32(ThreadOffset<4> thr_offs);
+  virtual void StoreStackPointerToThread64(ThreadOffset<8> thr_offs);
 
   virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
                              FrameOffset in_off, ManagedRegister scratch) = 0;
@@ -389,27 +395,29 @@
   // Load routines
   virtual void Load(ManagedRegister dest, FrameOffset src, size_t size) = 0;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size) = 0;
+  virtual void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size);
+  virtual void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size);
 
   virtual void LoadRef(ManagedRegister dest, FrameOffset  src) = 0;
+  virtual void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) = 0;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs) = 0;
+  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) = 0;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs) = 0;
-
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs) = 0;
+  virtual void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs);
+  virtual void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs);
 
   // Copying routines
   virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size) = 0;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch) = 0;
+  virtual void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                                      ManagedRegister scratch);
+  virtual void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                                      ManagedRegister scratch);
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch) = 0;
+  virtual void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                    ManagedRegister scratch);
+  virtual void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                    ManagedRegister scratch);
 
   virtual void CopyRef(FrameOffset dest, FrameOffset src,
                        ManagedRegister scratch) = 0;
@@ -471,7 +479,8 @@
                     ManagedRegister scratch) = 0;
   virtual void Call(FrameOffset base, Offset offset,
                     ManagedRegister scratch) = 0;
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch) = 0;
+  virtual void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch);
+  virtual void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch);
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index dfd3306..99c29f1 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -633,7 +633,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void MipsAssembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void MipsAssembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                            ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -641,7 +641,7 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), S1, dest.Int32Value());
 }
 
-void MipsAssembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void MipsAssembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                              FrameOffset fr_offs,
                                              ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
@@ -651,7 +651,7 @@
                 S1, thr_offs.Int32Value());
 }
 
-void MipsAssembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void MipsAssembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   StoreToOffset(kStoreWord, SP, S1, thr_offs.Int32Value());
 }
 
@@ -668,7 +668,7 @@
   return EmitLoad(mdest, SP, src.Int32Value(), size);
 }
 
-void MipsAssembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void MipsAssembler::LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) {
   return EmitLoad(mdest, S1, src.Int32Value(), size);
 }
 
@@ -697,8 +697,8 @@
                  base.AsMips().AsCoreRegister(), offs.Int32Value());
 }
 
-void MipsAssembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                         ThreadOffset offs) {
+void MipsAssembler::LoadRawPtrFromThread32(ManagedRegister mdest,
+                                           ThreadOffset<4> offs) {
   MipsManagedRegister dest = mdest.AsMips();
   CHECK(dest.IsCoreRegister());
   LoadFromOffset(kLoadWord, dest.AsCoreRegister(), S1, offs.Int32Value());
@@ -748,8 +748,8 @@
   StoreToOffset(kStoreWord, scratch.AsCoreRegister(), SP, dest.Int32Value());
 }
 
-void MipsAssembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                         ThreadOffset thr_offs,
+void MipsAssembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                         ThreadOffset<4> thr_offs,
                                          ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
   CHECK(scratch.IsCoreRegister()) << scratch;
@@ -759,7 +759,7 @@
                 SP, fr_offs.Int32Value());
 }
 
-void MipsAssembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void MipsAssembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                        FrameOffset fr_offs,
                                        ManagedRegister mscratch) {
   MipsManagedRegister scratch = mscratch.AsMips();
@@ -923,7 +923,7 @@
   // TODO: place reference map on call
 }
 
-void MipsAssembler::Call(ThreadOffset /*offset*/, ManagedRegister /*mscratch*/) {
+void MipsAssembler::CallFromThread32(ThreadOffset<4> /*offset*/, ManagedRegister /*mscratch*/) {
   UNIMPLEMENTED(FATAL) << "no mips implementation";
 }
 
@@ -941,7 +941,7 @@
   MipsExceptionSlowPath* slow = new MipsExceptionSlowPath(scratch, stack_adjust);
   buffer_.EnqueueSlowPath(slow);
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
-                 S1, Thread::ExceptionOffset().Int32Value());
+                 S1, Thread::ExceptionOffset<4>().Int32Value());
   EmitBranch(scratch.AsCoreRegister(), ZERO, slow->Entry(), false);
 }
 
@@ -956,7 +956,7 @@
   // Don't care about preserving A0 as this call won't return
   __ Move(A0, scratch_.AsCoreRegister());
   // Set up call to Thread::Current()->pDeliverException
-  __ LoadFromOffset(kLoadWord, T9, S1, QUICK_ENTRYPOINT_OFFSET(pDeliverException).Int32Value());
+  __ LoadFromOffset(kLoadWord, T9, S1, QUICK_ENTRYPOINT_OFFSET(4, pDeliverException).Int32Value());
   __ Jr(T9);
   // Call never returns
   __ Break();
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 0d1a94c..75ee8b9 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -29,171 +29,6 @@
 
 namespace art {
 namespace mips {
-#if 0
-class Operand {
- public:
-  uint8_t mod() const {
-    return (encoding_at(0) >> 6) & 3;
-  }
-
-  Register rm() const {
-    return static_cast<Register>(encoding_at(0) & 7);
-  }
-
-  ScaleFactor scale() const {
-    return static_cast<ScaleFactor>((encoding_at(1) >> 6) & 3);
-  }
-
-  Register index() const {
-    return static_cast<Register>((encoding_at(1) >> 3) & 7);
-  }
-
-  Register base() const {
-    return static_cast<Register>(encoding_at(1) & 7);
-  }
-
-  int8_t disp8() const {
-    CHECK_GE(length_, 2);
-    return static_cast<int8_t>(encoding_[length_ - 1]);
-  }
-
-  int32_t disp32() const {
-    CHECK_GE(length_, 5);
-    int32_t value;
-    memcpy(&value, &encoding_[length_ - 4], sizeof(value));
-    return value;
-  }
-
-  bool IsRegister(Register reg) const {
-    return ((encoding_[0] & 0xF8) == 0xC0)  // Addressing mode is register only.
-        && ((encoding_[0] & 0x07) == reg);  // Register codes match.
-  }
-
- protected:
-  // Operand can be sub classed (e.g: Address).
-  Operand() : length_(0) { }
-
-  void SetModRM(int mod, Register rm) {
-    CHECK_EQ(mod & ~3, 0);
-    encoding_[0] = (mod << 6) | rm;
-    length_ = 1;
-  }
-
-  void SetSIB(ScaleFactor scale, Register index, Register base) {
-    CHECK_EQ(length_, 1);
-    CHECK_EQ(scale & ~3, 0);
-    encoding_[1] = (scale << 6) | (index << 3) | base;
-    length_ = 2;
-  }
-
-  void SetDisp8(int8_t disp) {
-    CHECK(length_ == 1 || length_ == 2);
-    encoding_[length_++] = static_cast<uint8_t>(disp);
-  }
-
-  void SetDisp32(int32_t disp) {
-    CHECK(length_ == 1 || length_ == 2);
-    int disp_size = sizeof(disp);
-    memmove(&encoding_[length_], &disp, disp_size);
-    length_ += disp_size;
-  }
-
- private:
-  byte length_;
-  byte encoding_[6];
-  byte padding_;
-
-  explicit Operand(Register reg) { SetModRM(3, reg); }
-
-  // Get the operand encoding byte at the given index.
-  uint8_t encoding_at(int index) const {
-    CHECK_GE(index, 0);
-    CHECK_LT(index, length_);
-    return encoding_[index];
-  }
-
-  friend class MipsAssembler;
-
-  DISALLOW_COPY_AND_ASSIGN(Operand);
-};
-
-
-class Address : public Operand {
- public:
-  Address(Register base, int32_t disp) {
-    Init(base, disp);
-  }
-
-  Address(Register base, Offset disp) {
-    Init(base, disp.Int32Value());
-  }
-
-  Address(Register base, FrameOffset disp) {
-    CHECK_EQ(base, ESP);
-    Init(ESP, disp.Int32Value());
-  }
-
-  Address(Register base, MemberOffset disp) {
-    Init(base, disp.Int32Value());
-  }
-
-  void Init(Register base, int32_t disp) {
-    if (disp == 0 && base != EBP) {
-      SetModRM(0, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-    } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-      SetDisp8(disp);
-    } else {
-      SetModRM(2, base);
-      if (base == ESP) SetSIB(TIMES_1, ESP, base);
-      SetDisp32(disp);
-    }
-  }
-
-
-  Address(Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, ESP);  // Illegal addressing mode.
-    SetModRM(0, ESP);
-    SetSIB(scale, index, EBP);
-    SetDisp32(disp);
-  }
-
-  Address(Register base, Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, ESP);  // Illegal addressing mode.
-    if (disp == 0 && base != EBP) {
-      SetModRM(0, ESP);
-      SetSIB(scale, index, base);
-    } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, ESP);
-      SetSIB(scale, index, base);
-      SetDisp8(disp);
-    } else {
-      SetModRM(2, ESP);
-      SetSIB(scale, index, base);
-      SetDisp32(disp);
-    }
-  }
-
-  static Address Absolute(uword addr) {
-    Address result;
-    result.SetModRM(0, EBP);
-    result.SetDisp32(addr);
-    return result;
-  }
-
-  static Address Absolute(ThreadOffset addr) {
-    return Absolute(addr.Int32Value());
-  }
-
- private:
-  Address() {}
-
-  DISALLOW_COPY_AND_ASSIGN(Address);
-};
-
-#endif
 
 enum LoadOperandType {
   kLoadSignedByte,
@@ -215,7 +50,7 @@
   kStoreDWord
 };
 
-class MipsAssembler : public Assembler {
+class MipsAssembler FINAL : public Assembler {
  public:
   MipsAssembler() {}
   virtual ~MipsAssembler() {}
@@ -310,40 +145,6 @@
   void StoreFToOffset(FRegister reg, Register base, int32_t offset);
   void StoreDToOffset(DRegister reg, Register base, int32_t offset);
 
-#if 0
-  MipsAssembler* lock();
-
-  void mfence();
-
-  MipsAssembler* fs();
-
-  //
-  // Macros for High-level operations.
-  //
-
-  void AddImmediate(Register reg, const Immediate& imm);
-
-  void LoadDoubleConstant(XmmRegister dst, double value);
-
-  void DoubleNegate(XmmRegister d);
-  void FloatNegate(XmmRegister f);
-
-  void DoubleAbs(XmmRegister reg);
-
-  void LockCmpxchgl(const Address& address, Register reg) {
-    lock()->cmpxchgl(address, reg);
-  }
-
-  //
-  // Misc. functionality
-  //
-  int PreferredLoopAlignment() { return 16; }
-  void Align(int alignment, int offset);
-
-  // Debugging and bringup support.
-  void Stop(const char* message);
-#endif
-
   // Emit data (e.g. encoded instruction or immediate) to the instruction stream.
   void Emit(int32_t value);
   void EmitBranch(Register rt, Register rs, Label* label, bool equal);
@@ -355,127 +156,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister msrc, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister msrc);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister msrc);
+  void Store(FrameOffset offs, ManagedRegister msrc, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister msrc) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister mscratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister mscratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister mscratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister mscratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister mscratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister mscratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister msrc,
-                             FrameOffset in_off, ManagedRegister mscratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister msrc, FrameOffset in_off,
+                     ManagedRegister mscratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister mdest, FrameOffset src, size_t size);
+  void Load(ManagedRegister mdest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister mdest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister mdest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister mdest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister mdest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister mdest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister mdest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister mdest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister mdest, ManagedRegister msrc, size_t size);
+  void Move(ManagedRegister mdest, ManagedRegister msrc, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister mscratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister mscratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister mscratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                            ManagedRegister mscratch) OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister mscratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister mscratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister mscratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister mscratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister mscratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister mscratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister mscratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister mscratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister mscratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister mscratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister mscratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister mscratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister mscratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister mscratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister mscratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister mscratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister mscratch, size_t stack_adjust) OVERRIDE;
 
  private:
   void EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
@@ -491,11 +281,11 @@
 };
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class MipsExceptionSlowPath : public SlowPath {
+class MipsExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit MipsExceptionSlowPath(MipsManagedRegister scratch, size_t stack_adjust)
       : scratch_(scratch), stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
+  void Emit(Assembler *sp_asm) OVERRIDE;
  private:
   const MipsManagedRegister scratch_;
   const size_t stack_adjust_;
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index ebbb43a..aac8b01 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1478,12 +1478,12 @@
   movl(Address(ESP, dest), Immediate(imm));
 }
 
-void X86Assembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
+void X86Assembler::StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm,
                                           ManagedRegister) {
   fs()->movl(Address::Absolute(dest), Immediate(imm));
 }
 
-void X86Assembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
+void X86Assembler::StoreStackOffsetToThread32(ThreadOffset<4> thr_offs,
                                             FrameOffset fr_offs,
                                             ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
@@ -1492,14 +1492,10 @@
   fs()->movl(Address::Absolute(thr_offs), scratch.AsCpuRegister());
 }
 
-void X86Assembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
+void X86Assembler::StoreStackPointerToThread32(ThreadOffset<4> thr_offs) {
   fs()->movl(Address::Absolute(thr_offs), ESP);
 }
 
-void X86Assembler::StoreLabelToThread(ThreadOffset thr_offs, Label* lbl) {
-  fs()->movl(Address::Absolute(thr_offs), lbl);
-}
-
 void X86Assembler::StoreSpanning(FrameOffset /*dst*/, ManagedRegister /*src*/,
                                  FrameOffset /*in_off*/, ManagedRegister /*scratch*/) {
   UNIMPLEMENTED(FATAL);  // this case only currently exists for ARM
@@ -1532,7 +1528,7 @@
   }
 }
 
-void X86Assembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void X86Assembler::LoadFromThread32(ManagedRegister mdest, ThreadOffset<4> src, size_t size) {
   X86ManagedRegister dest = mdest.AsX86();
   if (dest.IsNoRegister()) {
     CHECK_EQ(0u, size);
@@ -1542,7 +1538,7 @@
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(8u, size);
     fs()->movl(dest.AsRegisterPairLow(), Address::Absolute(src));
-    fs()->movl(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset(src.Int32Value()+4)));
+    fs()->movl(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset<4>(src.Int32Value()+4)));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
       fs()->flds(Address::Absolute(src));
@@ -1582,8 +1578,8 @@
   movl(dest.AsCpuRegister(), Address(base.AsX86().AsCpuRegister(), offs));
 }
 
-void X86Assembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                        ThreadOffset offs) {
+void X86Assembler::LoadRawPtrFromThread32(ManagedRegister mdest,
+                                        ThreadOffset<4> offs) {
   X86ManagedRegister dest = mdest.AsX86();
   CHECK(dest.IsCpuRegister());
   fs()->movl(dest.AsCpuRegister(), Address::Absolute(offs));
@@ -1645,8 +1641,8 @@
   movl(Address(ESP, dest), scratch.AsCpuRegister());
 }
 
-void X86Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
+void X86Assembler::CopyRawPtrFromThread32(FrameOffset fr_offs,
+                                        ThreadOffset<4> thr_offs,
                                         ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
   CHECK(scratch.IsCpuRegister());
@@ -1654,7 +1650,7 @@
   Store(fr_offs, scratch, 4);
 }
 
-void X86Assembler::CopyRawPtrToThread(ThreadOffset thr_offs,
+void X86Assembler::CopyRawPtrToThread32(ThreadOffset<4> thr_offs,
                                       FrameOffset fr_offs,
                                       ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
@@ -1804,26 +1800,26 @@
   call(Address(scratch, offset));
 }
 
-void X86Assembler::Call(ThreadOffset offset, ManagedRegister /*mscratch*/) {
+void X86Assembler::CallFromThread32(ThreadOffset<4> offset, ManagedRegister /*mscratch*/) {
   fs()->call(Address::Absolute(offset));
 }
 
 void X86Assembler::GetCurrentThread(ManagedRegister tr) {
   fs()->movl(tr.AsX86().AsCpuRegister(),
-             Address::Absolute(Thread::SelfOffset()));
+             Address::Absolute(Thread::SelfOffset<4>()));
 }
 
 void X86Assembler::GetCurrentThread(FrameOffset offset,
                                     ManagedRegister mscratch) {
   X86ManagedRegister scratch = mscratch.AsX86();
-  fs()->movl(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset()));
+  fs()->movl(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset<4>()));
   movl(Address(ESP, offset), scratch.AsCpuRegister());
 }
 
 void X86Assembler::ExceptionPoll(ManagedRegister /*scratch*/, size_t stack_adjust) {
   X86ExceptionSlowPath* slow = new X86ExceptionSlowPath(stack_adjust);
   buffer_.EnqueueSlowPath(slow);
-  fs()->cmpl(Address::Absolute(Thread::ExceptionOffset()), Immediate(0));
+  fs()->cmpl(Address::Absolute(Thread::ExceptionOffset<4>()), Immediate(0));
   j(kNotEqual, slow->Entry());
 }
 
@@ -1836,8 +1832,8 @@
     __ DecreaseFrameSize(stack_adjust_);
   }
   // Pass exception as argument in EAX
-  __ fs()->movl(EAX, Address::Absolute(Thread::ExceptionOffset()));
-  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(pDeliverException)));
+  __ fs()->movl(EAX, Address::Absolute(Thread::ExceptionOffset<4>()));
+  __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(4, pDeliverException)));
   // this call should never return
   __ int3();
 #undef __
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index f906a6f..f8fc4c0 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -117,7 +117,6 @@
  private:
   byte length_;
   byte encoding_[6];
-  byte padding_;
 
   explicit Operand(Register reg) { SetModRM(3, reg); }
 
@@ -192,21 +191,15 @@
     }
   }
 
-  static Address Absolute(uword addr, bool has_rip = false) {
+  static Address Absolute(uword addr) {
     Address result;
-    if (has_rip) {
-      result.SetModRM(0, ESP);
-      result.SetSIB(TIMES_1, ESP, EBP);
-      result.SetDisp32(addr);
-    } else {
-      result.SetModRM(0, EBP);
-      result.SetDisp32(addr);
-    }
+    result.SetModRM(0, EBP);
+    result.SetDisp32(addr);
     return result;
   }
 
-  static Address Absolute(ThreadOffset addr, bool has_rip = false) {
-    return Absolute(addr.Int32Value(), has_rip);
+  static Address Absolute(ThreadOffset<4> addr) {
+    return Absolute(addr.Int32Value());
   }
 
  private:
@@ -465,129 +458,116 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread32(ThreadOffset<4> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread32(ThreadOffset<4> thr_offs) OVERRIDE;
 
-  void StoreLabelToThread(ThreadOffset thr_offs, Label* lbl);
-
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread32(ManagedRegister dest, ThreadOffset<4> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread32(ManagedRegister dest, ThreadOffset<4> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread32(FrameOffset fr_offs, ThreadOffset<4> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread32(ThreadOffset<4> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
-  virtual void LoadReferenceFromSirt(ManagedRegister dst,
-                                     ManagedRegister src);
+  void LoadReferenceFromSirt(ManagedRegister dst, ManagedRegister src) OVERRIDE;
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread32(ThreadOffset<4> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
   inline void EmitUint8(uint8_t value);
@@ -637,10 +617,10 @@
 }
 
 // Slowpath entered when Thread::Current()->_exception is non-null
-class X86ExceptionSlowPath : public SlowPath {
+class X86ExceptionSlowPath FINAL : public SlowPath {
  public:
   explicit X86ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
+  void Emit(Assembler *sp_asm) OVERRIDE;
  private:
   const size_t stack_adjust_;
 };
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index fa302c9..52b9382 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -24,23 +24,29 @@
 namespace art {
 namespace x86_64 {
 
+std::ostream& operator<<(std::ostream& os, const CpuRegister& reg) {
+  return os << reg.AsRegister();
+}
+
 std::ostream& operator<<(std::ostream& os, const XmmRegister& reg) {
-  return os << "XMM" << static_cast<int>(reg);
+  return os << reg.AsFloatRegister();
 }
 
 std::ostream& operator<<(std::ostream& os, const X87Register& reg) {
   return os << "ST" << static_cast<int>(reg);
 }
 
-void X86_64Assembler::call(Register reg) {
+void X86_64Assembler::call(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xFF);
-  EmitRegisterOperand(2, reg);
+  EmitRegisterOperand(2, reg.LowBits());
 }
 
 
 void X86_64Assembler::call(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(2, address);
 }
@@ -54,15 +60,16 @@
 }
 
 
-void X86_64Assembler::pushq(Register reg) {
+void X86_64Assembler::pushq(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_rm(reg);
-  EmitUint8(0x50 + reg);
+  EmitOptionalRex32(reg);
+  EmitUint8(0x50 + reg.LowBits());
 }
 
 
 void X86_64Assembler::pushq(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(6, address);
 }
@@ -80,332 +87,335 @@
 }
 
 
-void X86_64Assembler::popq(Register reg) {
+void X86_64Assembler::popq(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_rm(reg);
-  EmitUint8(0x58 + reg);
+  EmitOptionalRex32(reg);
+  EmitUint8(0x58 + reg.LowBits());
 }
 
 
 void X86_64Assembler::popq(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0x8F);
   EmitOperand(0, address);
 }
 
 
-void X86_64Assembler::movq(Register dst, const Immediate& imm) {
+void X86_64Assembler::movq(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
-  EmitUint8(0xB8 + dst);
+  EmitRex64(dst);
+  EmitUint8(0xB8 + dst.LowBits());
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::movl(Register dst, const Immediate& imm) {
+void X86_64Assembler::movl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xB8 + dst);
+  EmitOptionalRex32(dst);
+  EmitUint8(0xB8 + dst.LowBits());
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::movq(Register dst, Register src) {
+void X86_64Assembler::movq(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitRex64(src, dst);  // 0x89 is the MR form (ModRM.reg = src, ModRM.rm = dst); REX must match.
   EmitUint8(0x89);
-  EmitRegisterOperand(src, dst);
+  EmitRegisterOperand(src.LowBits(), dst.LowBits());
 }
 
 
-void X86_64Assembler::movl(Register dst, Register src) {
+void X86_64Assembler::movl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(src, dst);  // 0x89 is the MR form (ModRM.reg = src, ModRM.rm = dst).
   EmitUint8(0x89);
-  EmitRegisterOperand(src, dst);
+  EmitRegisterOperand(src.LowBits(), dst.LowBits());
 }
 
 
-void X86_64Assembler::movq(Register dst, const Address& src) {
+void X86_64Assembler::movq(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 8);
+  EmitRex64(dst, src);
   EmitUint8(0x8B);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movl(Register dst, const Address& src) {
+void X86_64Assembler::movl(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 4);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x8B);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movq(const Address& dst, Register src) {
+void X86_64Assembler::movq(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(src, 8);
+  EmitRex64(src, dst);
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::movl(const Address& dst, Register src) {
+void X86_64Assembler::movl(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(src, 4);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
-
 void X86_64Assembler::movl(const Address& dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitUint8(0xC7);
   EmitOperand(0, dst);
   EmitImmediate(imm);
 }
 
-void X86_64Assembler::movl(const Address& dst, Label* lbl) {
+void X86_64Assembler::movzxb(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xC7);
-  EmitOperand(0, dst);
-  EmitLabel(lbl, dst.length_ + 5);
-}
-
-void X86_64Assembler::movzxb(Register dst, ByteRegister src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB6);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movzxb(Register dst, const Address& src) {
+void X86_64Assembler::movzxb(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB6);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movsxb(Register dst, ByteRegister src) {
+void X86_64Assembler::movsxb(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBE);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movsxb(Register dst, const Address& src) {
+void X86_64Assembler::movsxb(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBE);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movb(Register /*dst*/, const Address& /*src*/) {
+void X86_64Assembler::movb(CpuRegister /*dst*/, const Address& /*src*/) {
   LOG(FATAL) << "Use movzxb or movsxb instead.";
 }
 
 
-void X86_64Assembler::movb(const Address& dst, ByteRegister src) {
+void X86_64Assembler::movb(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalByteRegNormalizingRex32(src, dst);
   EmitUint8(0x88);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movb(const Address& dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xC6);
-  EmitOperand(RAX, dst);
+  EmitOperand(Register::RAX, dst);
   CHECK(imm.is_int8());
   EmitUint8(imm.value() & 0xFF);
 }
 
 
-void X86_64Assembler::movzxw(Register dst, Register src) {
+void X86_64Assembler::movzxw(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB7);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movzxw(Register dst, const Address& src) {
+void X86_64Assembler::movzxw(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xB7);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movsxw(Register dst, Register src) {
+void X86_64Assembler::movsxw(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBF);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::movsxw(Register dst, const Address& src) {
+void X86_64Assembler::movsxw(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xBF);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::movw(Register /*dst*/, const Address& /*src*/) {
+void X86_64Assembler::movw(CpuRegister /*dst*/, const Address& /*src*/) {
   LOG(FATAL) << "Use movzxw or movsxw instead.";
 }
 
 
-void X86_64Assembler::movw(const Address& dst, Register src) {
+void X86_64Assembler::movw(const Address& dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOperandSizeOverride();
+  EmitOptionalRex32(src, dst);  // REX has to be the last prefix, directly before the opcode.
   EmitUint8(0x89);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::leaq(Register dst, const Address& src) {
+void X86_64Assembler::leaq(CpuRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex_reg(dst, 8);
+  EmitRex64(dst, src);
   EmitUint8(0x8D);
-  EmitOperand(dst, src);
-}
-
-
-void X86_64Assembler::cmovl(Condition condition, Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0x40 + condition);
-  EmitRegisterOperand(dst, src);
-}
-
-
-void X86_64Assembler::setb(Condition condition, Register dst) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0x90 + condition);
-  EmitOperand(0, Operand(dst));
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x10);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movss(const Address& dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(src, dst);  // 0x11 is the store (MR) form: src is ModRM.reg, dst is ModRM.rm.
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitXmmRegisterOperand(src, dst);
+  EmitXmmRegisterOperand(src.LowBits(), dst);
 }
 
 
-void X86_64Assembler::movd(XmmRegister dst, Register src) {
+void X86_64Assembler::movd(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x6E);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::movd(Register dst, XmmRegister src) {
+void X86_64Assembler::movd(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x7E);
-  EmitOperand(src, Operand(dst));
+  EmitOperand(src.LowBits(), Operand(dst));
 }
 
 
 void X86_64Assembler::addss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::addss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
@@ -426,258 +436,287 @@
 void X86_64Assembler::movsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x10);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::movsd(const Address& dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(src, dst);
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitOperand(src, dst);
+  EmitOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::movsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(src, dst);  // Store form: same operand order as the ModRM emit below.
   EmitUint8(0x0F);
   EmitUint8(0x11);
-  EmitXmmRegisterOperand(src, dst);
+  EmitXmmRegisterOperand(src.LowBits(), dst);
 }
 
 
 void X86_64Assembler::addsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::addsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x58);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::subsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5C);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::mulsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x59);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::divsd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5E);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvtsi2ss(XmmRegister dst, Register src) {
+void X86_64Assembler::cvtsi2ss(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2A);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::cvtsi2sd(XmmRegister dst, Register src) {
+void X86_64Assembler::cvtsi2sd(XmmRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2A);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::cvtss2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvtss2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2D);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtss2sd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5A);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvtsd2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvtsd2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2D);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvttss2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvttss2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
-void X86_64Assembler::cvttsd2si(Register dst, XmmRegister src) {
+void X86_64Assembler::cvttsd2si(CpuRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x2C);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtsd2ss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x5A);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::cvtdq2pd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xE6);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::comiss(XmmRegister a, XmmRegister b) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(a, b);
   EmitUint8(0x0F);
   EmitUint8(0x2F);
-  EmitXmmRegisterOperand(a, b);
+  EmitXmmRegisterOperand(a.LowBits(), b);
 }
 
 
 void X86_64Assembler::comisd(XmmRegister a, XmmRegister b) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(a, b);
   EmitUint8(0x0F);
   EmitUint8(0x2F);
-  EmitXmmRegisterOperand(a, b);
+  EmitXmmRegisterOperand(a.LowBits(), b);
 }
 
 
 void X86_64Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x51);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::sqrtss(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x51);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorpd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorpd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorps(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::xorps(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x57);
-  EmitXmmRegisterOperand(dst, src);
+  EmitXmmRegisterOperand(dst.LowBits(), src);
 }
 
 
 void X86_64Assembler::andpd(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x66);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0x54);
-  EmitOperand(dst, src);
+  EmitOperand(dst.LowBits(), src);
 }
 
 
@@ -766,92 +805,102 @@
 }
 
 
-void X86_64Assembler::xchgl(Register dst, Register src) {
+void X86_64Assembler::xchgl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x87);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
-void X86_64Assembler::xchgl(Register reg, const Address& address) {
+void X86_64Assembler::xchgl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x87);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::cmpl(Register reg, const Immediate& imm) {
+void X86_64Assembler::cmpl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitComplex(7, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::cmpl(Register reg0, Register reg1) {
+void X86_64Assembler::cmpl(CpuRegister reg0, CpuRegister reg1) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg0, reg1);
   EmitUint8(0x3B);
-  EmitOperand(reg0, Operand(reg1));
+  EmitOperand(reg0.LowBits(), Operand(reg1));
 }
 
 
-void X86_64Assembler::cmpl(Register reg, const Address& address) {
+void X86_64Assembler::cmpl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x3B);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::addl(Register dst, Register src) {
+void X86_64Assembler::addl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x03);
-  EmitRegisterOperand(dst, src);
+  EmitRegisterOperand(dst.LowBits(), src.LowBits());
 }
 
 
-void X86_64Assembler::addl(Register reg, const Address& address) {
+void X86_64Assembler::addl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x03);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::cmpl(const Address& address, Register reg) {
+void X86_64Assembler::cmpl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x39);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
 void X86_64Assembler::cmpl(const Address& address, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitComplex(7, address, imm);
 }
 
 
-void X86_64Assembler::testl(Register reg1, Register reg2) {
+void X86_64Assembler::testl(CpuRegister reg1, CpuRegister reg2) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex(reg1, reg2, 4);
+  EmitOptionalRex32(reg1, reg2);
   EmitUint8(0x85);
-  EmitRegisterOperand(reg1, reg2);
+  EmitRegisterOperand(reg1.LowBits(), reg2.LowBits());
 }
 
 
-void X86_64Assembler::testl(Register reg, const Immediate& immediate) {
+void X86_64Assembler::testl(CpuRegister reg, const Immediate& immediate) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   // For registers that have a byte variant (RAX, RBX, RCX, and RDX)
-  // we only test the byte register to keep the encoding short.
-  if (immediate.is_uint8() && reg < 4) {
+  // we only test the byte register to keep the encoding short.
+  if (immediate.is_uint8() && reg.AsRegister() < 4) {
     // Use zero-extended 8-bit immediate.
-    if (reg == RAX) {
+    if (reg.AsRegister() == RAX) {
       EmitUint8(0xA8);
     } else {
       EmitUint8(0xF6);
-      EmitUint8(0xC0 + reg);
+      EmitUint8(0xC0 + reg.AsRegister());
     }
     EmitUint8(immediate.value() & 0xFF);
-  } else if (reg == RAX) {
+  } else if (reg.AsRegister() == RAX) {
     // Use short form if the destination is RAX.
     EmitUint8(0xA9);
     EmitImmediate(immediate);
   } else {
+    EmitOptionalRex32(reg);
     EmitUint8(0xF7);
     EmitOperand(0, Operand(reg));
     EmitImmediate(immediate);
@@ -859,136 +908,145 @@
 }
 
 
-void X86_64Assembler::andl(Register dst, Register src) {
+void X86_64Assembler::andl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x23);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::andl(Register dst, const Immediate& imm) {
+void X86_64Assembler::andl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitComplex(4, Operand(dst), imm);
 }
 
 
-void X86_64Assembler::orl(Register dst, Register src) {
+void X86_64Assembler::orl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0B);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::orl(Register dst, const Immediate& imm) {
+void X86_64Assembler::orl(CpuRegister dst, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
   EmitComplex(1, Operand(dst), imm);
 }
 
 
-void X86_64Assembler::xorl(Register dst, Register src) {
+void X86_64Assembler::xorl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  rex(dst, src, 4);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x33);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
-void X86_64Assembler::rex_reg(Register &dst, size_t size) {
-  Register src = kNoRegister;
-  rex(dst, src, size);
-}
-
-void X86_64Assembler::rex_rm(Register &src, size_t size) {
-  Register dst = kNoRegister;
-  rex(dst, src, size);
-}
-
-void X86_64Assembler::rex(Register &dst, Register &src, size_t size) {
-  uint8_t rex = 0;
+#if 0
+void X86_64Assembler::rex(bool force, bool w, Register* r, Register* x, Register* b) {
   // REX.WRXB
   // W - 64-bit operand
   // R - MODRM.reg
   // X - SIB.index
   // B - MODRM.rm/SIB.base
-  if (size == 8) {
+  uint8_t rex = force ? 0x40 : 0;
+  if (w) {
     rex |= 0x48;  // REX.W000
   }
-  if (dst >= Register::R8 && dst < Register::kNumberOfCpuRegisters) {
+  if (r != nullptr && *r >= Register::R8 && *r < Register::kNumberOfCpuRegisters) {
     rex |= 0x44;  // REX.0R00
-    dst = static_cast<Register>(dst - 8);
+    *r = static_cast<Register>(*r - 8);
   }
-  if (src >= Register::R8 && src < Register::kNumberOfCpuRegisters) {
+  if (x != nullptr && *x >= Register::R8 && *x < Register::kNumberOfCpuRegisters) {
+    rex |= 0x42;  // REX.00X0
+    *x = static_cast<Register>(*x - 8);
+  }
+  if (b != nullptr && *b >= Register::R8 && *b < Register::kNumberOfCpuRegisters) {
     rex |= 0x41;  // REX.000B
-    src = static_cast<Register>(src - 8);
+    *b = static_cast<Register>(*b - 8);
   }
   if (rex != 0) {
     EmitUint8(rex);
   }
 }
 
-void X86_64Assembler::addl(Register reg, const Immediate& imm) {
+void X86_64Assembler::rex_reg_mem(bool force, bool w, Register* dst, const Address& mem) {
+  // REX.WRXB
+  // W - 64-bit operand
+  // R - MODRM.reg
+  // X - SIB.index
+  // B - MODRM.rm/SIB.base
+  uint8_t rex = mem->rex();
+  if (force) {
+    rex |= 0x40;  // REX.0000
+  }
+  if (w) {
+    rex |= 0x48;  // REX.W000
+  }
+  if (dst != nullptr && *dst >= Register::R8 && *dst < Register::kNumberOfCpuRegisters) {
+    rex |= 0x44;  // REX.0R00
+    *dst = static_cast<Register>(*dst - 8);
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void rex_mem_reg(bool force, bool w, Address* mem, Register* src);
+#endif
+
+void X86_64Assembler::addl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitComplex(0, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::addq(Register reg, const Immediate& imm) {
+void X86_64Assembler::addq(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitRex64(reg);
   EmitComplex(0, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::addl(const Address& address, Register reg) {
+void X86_64Assembler::addl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x01);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
 void X86_64Assembler::addl(const Address& address, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitComplex(0, address, imm);
 }
 
 
-void X86_64Assembler::adcl(Register reg, const Immediate& imm) {
+void X86_64Assembler::subl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(2, Operand(reg), imm);
-}
-
-
-void X86_64Assembler::adcl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x13);
-  EmitOperand(dst, Operand(src));
-}
-
-
-void X86_64Assembler::adcl(Register dst, const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x13);
-  EmitOperand(dst, address);
-}
-
-
-void X86_64Assembler::subl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x2B);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::subl(Register reg, const Immediate& imm) {
+void X86_64Assembler::subl(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48);  // REX.W
+  EmitOptionalRex32(reg);
   EmitComplex(5, Operand(reg), imm);
 }
 
 
-void X86_64Assembler::subl(Register reg, const Address& address) {
+void X86_64Assembler::subl(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x2B);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
@@ -998,39 +1056,44 @@
 }
 
 
-void X86_64Assembler::idivl(Register reg) {
+void X86_64Assembler::idivl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
-  EmitUint8(0xF8 | reg);
+  EmitUint8(0xF8 | reg.LowBits());
 }
 
 
-void X86_64Assembler::imull(Register dst, Register src) {
+void X86_64Assembler::imull(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst, src);
   EmitUint8(0x0F);
   EmitUint8(0xAF);
-  EmitOperand(dst, Operand(src));
+  EmitOperand(dst.LowBits(), Operand(src));
 }
 
 
-void X86_64Assembler::imull(Register reg, const Immediate& imm) {
+void X86_64Assembler::imull(CpuRegister reg, const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, reg);  // reg is both ModRM.reg and ModRM.rm: needs REX.R and REX.B.
   EmitUint8(0x69);
-  EmitOperand(reg, Operand(reg));
+  EmitOperand(reg.LowBits(), Operand(reg));
   EmitImmediate(imm);
 }
 
 
-void X86_64Assembler::imull(Register reg, const Address& address) {
+void X86_64Assembler::imull(CpuRegister reg, const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg, address);
   EmitUint8(0x0F);
   EmitUint8(0xAF);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 
-void X86_64Assembler::imull(Register reg) {
+void X86_64Assembler::imull(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(5, Operand(reg));
 }
@@ -1038,13 +1101,15 @@
 
 void X86_64Assembler::imull(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xF7);
   EmitOperand(5, address);
 }
 
 
-void X86_64Assembler::mull(Register reg) {
+void X86_64Assembler::mull(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(4, Operand(reg));
 }
@@ -1052,106 +1117,56 @@
 
 void X86_64Assembler::mull(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xF7);
   EmitOperand(4, address);
 }
 
 
-void X86_64Assembler::sbbl(Register dst, Register src) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x1B);
-  EmitOperand(dst, Operand(src));
-}
 
-
-void X86_64Assembler::sbbl(Register reg, const Immediate& imm) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitComplex(3, Operand(reg), imm);
-}
-
-
-void X86_64Assembler::sbbl(Register dst, const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x1B);
-  EmitOperand(dst, address);
-}
-
-
-void X86_64Assembler::incl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x40 + reg);
-}
-
-
-void X86_64Assembler::incl(const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xFF);
-  EmitOperand(0, address);
-}
-
-
-void X86_64Assembler::decl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x48 + reg);
-}
-
-
-void X86_64Assembler::decl(const Address& address) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0xFF);
-  EmitOperand(1, address);
-}
-
-
-void X86_64Assembler::shll(Register reg, const Immediate& imm) {
+void X86_64Assembler::shll(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(4, reg, imm);
 }
 
 
-void X86_64Assembler::shll(Register operand, Register shifter) {
+void X86_64Assembler::shll(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(4, operand, shifter);
 }
 
 
-void X86_64Assembler::shrl(Register reg, const Immediate& imm) {
+void X86_64Assembler::shrl(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(5, reg, imm);
 }
 
 
-void X86_64Assembler::shrl(Register operand, Register shifter) {
+void X86_64Assembler::shrl(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(5, operand, shifter);
 }
 
 
-void X86_64Assembler::sarl(Register reg, const Immediate& imm) {
+void X86_64Assembler::sarl(CpuRegister reg, const Immediate& imm) {
   EmitGenericShift(7, reg, imm);
 }
 
 
-void X86_64Assembler::sarl(Register operand, Register shifter) {
+void X86_64Assembler::sarl(CpuRegister operand, CpuRegister shifter) {
   EmitGenericShift(7, operand, shifter);
 }
 
 
-void X86_64Assembler::shld(Register dst, Register src) {
+void X86_64Assembler::negl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  EmitUint8(0x0F);
-  EmitUint8(0xA5);
-  EmitRegisterOperand(src, dst);
-}
-
-
-void X86_64Assembler::negl(Register reg) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
   EmitOperand(3, Operand(reg));
 }
 
 
-void X86_64Assembler::notl(Register reg) {
+void X86_64Assembler::notl(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xF7);
-  EmitUint8(0xD0 | reg);
+  EmitUint8(0xD0 | reg.LowBits());
 }
 
 
@@ -1228,14 +1243,16 @@
 }
 
 
-void X86_64Assembler::jmp(Register reg) {
+void X86_64Assembler::jmp(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(reg);
   EmitUint8(0xFF);
-  EmitRegisterOperand(4, reg);
+  EmitRegisterOperand(4, reg.LowBits());
 }
 
 void X86_64Assembler::jmp(const Address& address) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(address);
   EmitUint8(0xFF);
   EmitOperand(4, address);
 }
@@ -1268,11 +1285,11 @@
 }
 
 
-void X86_64Assembler::cmpxchgl(const Address& address, Register reg) {
+void X86_64Assembler::cmpxchgl(const Address& address, CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x0F);
   EmitUint8(0xB1);
-  EmitOperand(reg, address);
+  EmitOperand(reg.LowBits(), address);
 }
 
 void X86_64Assembler::mfence() {
@@ -1289,19 +1306,12 @@
   return this;
 }
 
-void X86_64Assembler::AddImmediate(Register reg, const Immediate& imm) {
+void X86_64Assembler::AddImmediate(CpuRegister reg, const Immediate& imm) {
   int value = imm.value();
-  if (value > 0) {
-    if (value == 1) {
-      incl(reg);
-    } else if (value != 0) {
+  if (value != 0) {
+    if (value > 0) {
       addl(reg, imm);
-    }
-  } else if (value < 0) {
-    value = -value;
-    if (value == 1) {
-      decl(reg);
-    } else if (value != 0) {
-      subl(reg, Immediate(value));
+    } else {
+      subl(reg, Immediate(-value));  // value < 0 here, so subtract its magnitude.
     }
   }
@@ -1313,8 +1323,8 @@
   int64_t constant = bit_cast<int64_t, double>(value);
   pushq(Immediate(High32Bits(constant)));
   pushq(Immediate(Low32Bits(constant)));
-  movsd(dst, Address(RSP, 0));
-  addq(RSP, Immediate(2 * kWordSize));
+  movsd(dst, Address(CpuRegister(RSP), 0));
+  addq(CpuRegister(RSP), Immediate(2 * kWordSize));
 }
 
 
@@ -1372,7 +1382,7 @@
 }
 
 
-void X86_64Assembler::EmitOperand(int reg_or_opcode, const Operand& operand) {
+void X86_64Assembler::EmitOperand(uint8_t reg_or_opcode, const Operand& operand) {
   CHECK_GE(reg_or_opcode, 0);
   CHECK_LT(reg_or_opcode, 8);
   const int length = operand.length_;
@@ -1392,9 +1402,9 @@
 }
 
 
-void X86_64Assembler::EmitComplex(int reg_or_opcode,
-                               const Operand& operand,
-                               const Immediate& immediate) {
+void X86_64Assembler::EmitComplex(uint8_t reg_or_opcode,
+                                  const Operand& operand,
+                                  const Immediate& immediate) {
   CHECK_GE(reg_or_opcode, 0);
   CHECK_LT(reg_or_opcode, 8);
   if (immediate.is_int8()) {
@@ -1402,7 +1412,7 @@
     EmitUint8(0x83);
     EmitOperand(reg_or_opcode, operand);
     EmitUint8(immediate.value() & 0xFF);
-  } else if (operand.IsRegister(RAX)) {
+  } else if (operand.IsRegister(CpuRegister(RAX))) {
     // Use short form if the destination is eax.
     EmitUint8(0x05 + (reg_or_opcode << 3));
     EmitImmediate(immediate);
@@ -1434,7 +1444,7 @@
 
 
 void X86_64Assembler::EmitGenericShift(int reg_or_opcode,
-                                    Register reg,
+                                    CpuRegister reg,
                                     const Immediate& imm) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   CHECK(imm.is_int8());
@@ -1450,14 +1460,89 @@
 
 
 void X86_64Assembler::EmitGenericShift(int reg_or_opcode,
-                                    Register operand,
-                                    Register shifter) {
+                                    CpuRegister operand,
+                                    CpuRegister shifter) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  CHECK_EQ(shifter, RCX);
+  CHECK_EQ(shifter.AsRegister(), RCX);
   EmitUint8(0xD3);
   EmitOperand(reg_or_opcode, Operand(operand));
 }
 
+void X86_64Assembler::EmitOptionalRex(bool force, bool w, bool r, bool x, bool b) {
+  // REX.WRXB
+  // W - 64-bit operand
+  // R - MODRM.reg
+  // X - SIB.index
+  // B - MODRM.rm/SIB.base
+  uint8_t rex = force ? 0x40 : 0;
+  if (w) {
+    rex |= 0x48;  // REX.W000
+  }
+  if (r) {
+    rex |= 0x44;  // REX.0R00
+  }
+  if (x) {
+    rex |= 0x42;  // REX.00X0
+  }
+  if (b) {
+    rex |= 0x41;  // REX.000B
+  }
+  if (rex != 0) {
+    EmitUint8(rex);
+  }
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister reg) {
+  EmitOptionalRex(false, false, false, false, reg.NeedsRex());  // reg is the rm operand: REX.B.
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, XmmRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, XmmRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalRex32(const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void X86_64Assembler::EmitOptionalRex32(CpuRegister dst, const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void X86_64Assembler::EmitOptionalRex32(XmmRegister dst, const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void X86_64Assembler::EmitRex64(CpuRegister reg) {
+  EmitOptionalRex(false, true, false, false, reg.NeedsRex());  // REX.W, plus REX.B for rm reg.
+}
+void X86_64Assembler::EmitRex64(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(false, true, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitRex64(CpuRegister dst, const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src) {
+  EmitOptionalRex(true, false, dst.NeedsRex(), false, src.NeedsRex());
+}
+
+void X86_64Assembler::EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand) {
+  UNIMPLEMENTED(FATAL);
+}
+
 void X86_64Assembler::BuildFrame(size_t frame_size, ManagedRegister method_reg,
                               const std::vector<ManagedRegister>& spill_regs,
                               const ManagedRegisterEntrySpills& entry_spills) {
@@ -1466,25 +1551,26 @@
     pushq(spill_regs.at(i).AsX86_64().AsCpuRegister());
   }
   // return address then method on stack
-  addq(RSP, Immediate(-frame_size + (spill_regs.size() * kPointerSize) +
-                      kPointerSize /*method*/ + kPointerSize /*return address*/));
+  addq(CpuRegister(RSP), Immediate(-frame_size + (spill_regs.size() * kPointerSize) +
+                                   kPointerSize /*method*/ + kPointerSize /*return address*/));
   pushq(method_reg.AsX86_64().AsCpuRegister());
 
   for (size_t i = 0; i < entry_spills.size(); ++i) {
     ManagedRegisterSpill spill = entry_spills.at(i);
     if (spill.AsX86_64().IsCpuRegister()) {
       if (spill.getSize() == 8) {
-        movq(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
+        movq(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()),
+             spill.AsX86_64().AsCpuRegister());
       } else {
         CHECK_EQ(spill.getSize(), 4);
-        movl(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
+        movl(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsCpuRegister());
       }
     } else {
       if (spill.getSize() == 8) {
-        movsd(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
+        movsd(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
       } else {
         CHECK_EQ(spill.getSize(), 4);
-        movss(Address(RSP, frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
+        movss(Address(CpuRegister(RSP), frame_size + spill.getSpillOffset()), spill.AsX86_64().AsXmmRegister());
       }
     }
   }
@@ -1493,7 +1579,7 @@
 void X86_64Assembler::RemoveFrame(size_t frame_size,
                             const std::vector<ManagedRegister>& spill_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
-  addq(RSP, Immediate(frame_size - (spill_regs.size() * kPointerSize) - kPointerSize));
+  addq(CpuRegister(RSP), Immediate(frame_size - (spill_regs.size() * kPointerSize) - kPointerSize));
   for (size_t i = 0; i < spill_regs.size(); ++i) {
     popq(spill_regs.at(i).AsX86_64().AsCpuRegister());
   }
@@ -1502,12 +1588,12 @@
 
 void X86_64Assembler::IncreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kStackAlignment);
-  addq(RSP, Immediate(-adjust));
+  addq(CpuRegister(RSP), Immediate(-adjust));
 }
 
 void X86_64Assembler::DecreaseFrameSize(size_t adjust) {
   CHECK_ALIGNED(adjust, kStackAlignment);
-  addq(RSP, Immediate(adjust));
+  addq(CpuRegister(RSP), Immediate(adjust));
 }
 
 void X86_64Assembler::Store(FrameOffset offs, ManagedRegister msrc, size_t size) {
@@ -1517,28 +1603,28 @@
   } else if (src.IsCpuRegister()) {
     if (size == 4) {
       CHECK_EQ(4u, size);
-      movl(Address(RSP, offs), src.AsCpuRegister());
+      movl(Address(CpuRegister(RSP), offs), src.AsCpuRegister());
     } else {
       CHECK_EQ(8u, size);
-      movq(Address(RSP, offs), src.AsCpuRegister());
+      movq(Address(CpuRegister(RSP), offs), src.AsCpuRegister());
     }
   } else if (src.IsRegisterPair()) {
     CHECK_EQ(0u, size);
-    movq(Address(RSP, offs), src.AsRegisterPairLow());
-    movq(Address(RSP, FrameOffset(offs.Int32Value()+4)),
+    movq(Address(CpuRegister(RSP), offs), src.AsRegisterPairLow());
+    movq(Address(CpuRegister(RSP), FrameOffset(offs.Int32Value()+4)),
          src.AsRegisterPairHigh());
   } else if (src.IsX87Register()) {
     if (size == 4) {
-      fstps(Address(RSP, offs));
+      fstps(Address(CpuRegister(RSP), offs));
     } else {
-      fstpl(Address(RSP, offs));
+      fstpl(Address(CpuRegister(RSP), offs));
     }
   } else {
     CHECK(src.IsXmmRegister());
     if (size == 4) {
-      movss(Address(RSP, offs), src.AsXmmRegister());
+      movss(Address(CpuRegister(RSP), offs), src.AsXmmRegister());
     } else {
-      movsd(Address(RSP, offs), src.AsXmmRegister());
+      movsd(Address(CpuRegister(RSP), offs), src.AsXmmRegister());
     }
   }
 }
@@ -1546,40 +1632,36 @@
 void X86_64Assembler::StoreRef(FrameOffset dest, ManagedRegister msrc) {
   X86_64ManagedRegister src = msrc.AsX86_64();
   CHECK(src.IsCpuRegister());
-  movq(Address(RSP, dest), src.AsCpuRegister());
+  movq(Address(CpuRegister(RSP), dest), src.AsCpuRegister());
 }
 
 void X86_64Assembler::StoreRawPtr(FrameOffset dest, ManagedRegister msrc) {
   X86_64ManagedRegister src = msrc.AsX86_64();
   CHECK(src.IsCpuRegister());
-  movq(Address(RSP, dest), src.AsCpuRegister());
+  movq(Address(CpuRegister(RSP), dest), src.AsCpuRegister());
 }
 
 void X86_64Assembler::StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                         ManagedRegister) {
-  movl(Address(RSP, dest), Immediate(imm));  // TODO(64) movq?
+                                            ManagedRegister) {
+  movl(Address(CpuRegister(RSP), dest), Immediate(imm));  // TODO(64) movq?
 }
 
-void X86_64Assembler::StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                          ManagedRegister) {
+void X86_64Assembler::StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm,
+                                               ManagedRegister) {
   gs()->movl(Address::Absolute(dest, true), Immediate(imm));  // TODO(64) movq?
 }
 
-void X86_64Assembler::StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                            FrameOffset fr_offs,
-                                            ManagedRegister mscratch) {
+void X86_64Assembler::StoreStackOffsetToThread64(ThreadOffset<8> thr_offs,
+                                                 FrameOffset fr_offs,
+                                                 ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
-  leaq(scratch.AsCpuRegister(), Address(RSP, fr_offs));
+  leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), fr_offs));
   gs()->movq(Address::Absolute(thr_offs, true), scratch.AsCpuRegister());
 }
 
-void X86_64Assembler::StoreStackPointerToThread(ThreadOffset thr_offs) {
-  gs()->movq(Address::Absolute(thr_offs, true), RSP);
-}
-
-void X86_64Assembler::StoreLabelToThread(ThreadOffset thr_offs, Label* lbl) {
-  gs()->movl(Address::Absolute(thr_offs, true), lbl);  // TODO(64) movq?
+void X86_64Assembler::StoreStackPointerToThread64(ThreadOffset<8> thr_offs) {
+  gs()->movq(Address::Absolute(thr_offs, true), CpuRegister(RSP));
 }
 
 void X86_64Assembler::StoreSpanning(FrameOffset /*dst*/, ManagedRegister /*src*/,
@@ -1594,42 +1676,41 @@
   } else if (dest.IsCpuRegister()) {
     if (size == 4) {
       CHECK_EQ(4u, size);
-      movl(dest.AsCpuRegister(), Address(RSP, src));
+      movl(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
     } else {
       CHECK_EQ(8u, size);
-      movq(dest.AsCpuRegister(), Address(RSP, src));
+      movq(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
     }
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(0u, size);
-    movq(dest.AsRegisterPairLow(), Address(RSP, src));
-    movq(dest.AsRegisterPairHigh(), Address(RSP, FrameOffset(src.Int32Value()+4)));
+    movq(dest.AsRegisterPairLow(), Address(CpuRegister(RSP), src));
+    movq(dest.AsRegisterPairHigh(), Address(CpuRegister(RSP), FrameOffset(src.Int32Value()+4)));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
-      flds(Address(RSP, src));
+      flds(Address(CpuRegister(RSP), src));
     } else {
-      fldl(Address(RSP, src));
+      fldl(Address(CpuRegister(RSP), src));
     }
   } else {
     CHECK(dest.IsXmmRegister());
     if (size == 4) {
-      movss(dest.AsXmmRegister(), Address(RSP, src));
+      movss(dest.AsXmmRegister(), Address(CpuRegister(RSP), src));
     } else {
-      movsd(dest.AsXmmRegister(), Address(RSP, src));
+      movsd(dest.AsXmmRegister(), Address(CpuRegister(RSP), src));
     }
   }
 }
 
-void X86_64Assembler::Load(ManagedRegister mdest, ThreadOffset src, size_t size) {
+void X86_64Assembler::LoadFromThread64(ManagedRegister mdest, ThreadOffset<8> src, size_t size) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   if (dest.IsNoRegister()) {
     CHECK_EQ(0u, size);
   } else if (dest.IsCpuRegister()) {
     CHECK_EQ(4u, size);
-    gs()->movq(dest.AsCpuRegister(), Address::Absolute(src, true));
+    gs()->movl(dest.AsCpuRegister(), Address::Absolute(src, true));
   } else if (dest.IsRegisterPair()) {
     CHECK_EQ(8u, size);
     gs()->movq(dest.AsRegisterPairLow(), Address::Absolute(src, true));
-    gs()->movq(dest.AsRegisterPairHigh(), Address::Absolute(ThreadOffset(src.Int32Value()+4), true));
   } else if (dest.IsX87Register()) {
     if (size == 4) {
       gs()->flds(Address::Absolute(src, true));
@@ -1649,7 +1730,7 @@
 void X86_64Assembler::LoadRef(ManagedRegister mdest, FrameOffset  src) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   CHECK(dest.IsCpuRegister());
-  movq(dest.AsCpuRegister(), Address(RSP, src));
+  movq(dest.AsCpuRegister(), Address(CpuRegister(RSP), src));
 }
 
 void X86_64Assembler::LoadRef(ManagedRegister mdest, ManagedRegister base,
@@ -1666,8 +1747,7 @@
   movq(dest.AsCpuRegister(), Address(base.AsX86_64().AsCpuRegister(), offs));
 }
 
-void X86_64Assembler::LoadRawPtrFromThread(ManagedRegister mdest,
-                                        ThreadOffset offs) {
+void X86_64Assembler::LoadRawPtrFromThread64(ManagedRegister mdest, ThreadOffset<8> offs) {
   X86_64ManagedRegister dest = mdest.AsX86_64();
   CHECK(dest.IsCpuRegister());
   gs()->movq(dest.AsCpuRegister(), Address::Absolute(offs, true));
@@ -1678,7 +1758,7 @@
   CHECK(size == 1 || size == 2) << size;
   CHECK(reg.IsCpuRegister()) << reg;
   if (size == 1) {
-    movsxb(reg.AsCpuRegister(), reg.AsByteRegister());
+    movsxb(reg.AsCpuRegister(), reg.AsCpuRegister());
   } else {
     movsxw(reg.AsCpuRegister(), reg.AsCpuRegister());
   }
@@ -1689,7 +1769,7 @@
   CHECK(size == 1 || size == 2) << size;
   CHECK(reg.IsCpuRegister()) << reg;
   if (size == 1) {
-    movzxb(reg.AsCpuRegister(), reg.AsByteRegister());
+    movzxb(reg.AsCpuRegister(), reg.AsCpuRegister());
   } else {
     movzxw(reg.AsCpuRegister(), reg.AsCpuRegister());
   }
@@ -1703,17 +1783,17 @@
       movq(dest.AsCpuRegister(), src.AsCpuRegister());
     } else if (src.IsX87Register() && dest.IsXmmRegister()) {
       // Pass via stack and pop X87 register
-      subl(RSP, Immediate(16));
+      addq(CpuRegister(RSP), Immediate(-16));  // 64-bit op; subl would zero RSP[63:32].
       if (size == 4) {
         CHECK_EQ(src.AsX87Register(), ST0);
-        fstps(Address(RSP, 0));
-        movss(dest.AsXmmRegister(), Address(RSP, 0));
+        fstps(Address(CpuRegister(RSP), 0));
+        movss(dest.AsXmmRegister(), Address(CpuRegister(RSP), 0));
       } else {
         CHECK_EQ(src.AsX87Register(), ST0);
-        fstpl(Address(RSP, 0));
-        movsd(dest.AsXmmRegister(), Address(RSP, 0));
+        fstpl(Address(CpuRegister(RSP), 0));
+        movsd(dest.AsXmmRegister(), Address(CpuRegister(RSP), 0));
       }
-      addq(RSP, Immediate(16));
+      addq(CpuRegister(RSP), Immediate(16));
     } else {
       // TODO: x87, SSE
       UNIMPLEMENTED(FATAL) << ": Move " << dest << ", " << src;
@@ -1725,22 +1805,22 @@
                            ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
-  movl(scratch.AsCpuRegister(), Address(RSP, src));
-  movl(Address(RSP, dest), scratch.AsCpuRegister());
+  movl(scratch.AsCpuRegister(), Address(CpuRegister(RSP), src));
+  movl(Address(CpuRegister(RSP), dest), scratch.AsCpuRegister());
 }
 
-void X86_64Assembler::CopyRawPtrFromThread(FrameOffset fr_offs,
-                                        ThreadOffset thr_offs,
-                                        ManagedRegister mscratch) {
+void X86_64Assembler::CopyRawPtrFromThread64(FrameOffset fr_offs,
+                                             ThreadOffset<8> thr_offs,
+                                             ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
   gs()->movq(scratch.AsCpuRegister(), Address::Absolute(thr_offs, true));
   Store(fr_offs, scratch, 8);
 }
 
-void X86_64Assembler::CopyRawPtrToThread(ThreadOffset thr_offs,
-                                      FrameOffset fr_offs,
-                                      ManagedRegister mscratch) {
+void X86_64Assembler::CopyRawPtrToThread64(ThreadOffset<8> thr_offs,
+                                           FrameOffset fr_offs,
+                                           ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
   CHECK(scratch.IsCpuRegister());
   Load(scratch, fr_offs, 8);
@@ -1771,17 +1851,17 @@
                         ManagedRegister scratch, size_t size) {
   CHECK(scratch.IsNoRegister());
   CHECK_EQ(size, 4u);
-  pushq(Address(RSP, src));
+  pushq(Address(CpuRegister(RSP), src));
   popq(Address(dest_base.AsX86_64().AsCpuRegister(), dest_offset));
 }
 
 void X86_64Assembler::Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
                         ManagedRegister mscratch, size_t size) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
   CHECK_EQ(size, 4u);
-  movq(scratch, Address(RSP, src_base));
+  movq(scratch, Address(CpuRegister(RSP), src_base));
   movq(scratch, Address(scratch, src_offset));
-  movq(Address(RSP, dest), scratch);
+  movq(Address(CpuRegister(RSP), dest), scratch);
 }
 
 void X86_64Assembler::Copy(ManagedRegister dest, Offset dest_offset,
@@ -1795,10 +1875,10 @@
 
 void X86_64Assembler::Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
                         ManagedRegister mscratch, size_t size) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
   CHECK_EQ(size, 4u);
   CHECK_EQ(dest.Int32Value(), src.Int32Value());
-  movq(scratch, Address(RSP, src));
+  movq(scratch, Address(CpuRegister(RSP), src));
   pushq(Address(scratch, src_offset));
   popq(Address(scratch, dest_offset));
 }
@@ -1818,7 +1898,7 @@
     // Use out_reg as indicator of NULL
     in_reg = out_reg;
     // TODO: movzwl
-    movl(in_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    movl(in_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
   CHECK(in_reg.IsCpuRegister());
   CHECK(out_reg.IsCpuRegister());
@@ -1830,10 +1910,10 @@
     }
     testl(in_reg.AsCpuRegister(), in_reg.AsCpuRegister());
     j(kZero, &null_arg);
-    leaq(out_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(out_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     Bind(&null_arg);
   } else {
-    leaq(out_reg.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(out_reg.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
 }
 
@@ -1845,13 +1925,13 @@
   CHECK(scratch.IsCpuRegister());
   if (null_allowed) {
     Label null_arg;
-    movl(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    movl(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     testl(scratch.AsCpuRegister(), scratch.AsCpuRegister());
     j(kZero, &null_arg);
-    leaq(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
     Bind(&null_arg);
   } else {
-    leaq(scratch.AsCpuRegister(), Address(RSP, sirt_offset));
+    leaq(scratch.AsCpuRegister(), Address(CpuRegister(RSP), sirt_offset));
   }
   Store(out_off, scratch, 8);
 }
@@ -1889,35 +1969,42 @@
 }
 
 void X86_64Assembler::Call(FrameOffset base, Offset offset, ManagedRegister mscratch) {
-  Register scratch = mscratch.AsX86_64().AsCpuRegister();
-  movq(scratch, Address(RSP, base));
+  CpuRegister scratch = mscratch.AsX86_64().AsCpuRegister();
+  movq(scratch, Address(CpuRegister(RSP), base));
   call(Address(scratch, offset));
 }
 
-void X86_64Assembler::Call(ThreadOffset offset, ManagedRegister /*mscratch*/) {
+void X86_64Assembler::CallFromThread64(ThreadOffset<8> offset, ManagedRegister /*mscratch*/) {
   gs()->call(Address::Absolute(offset, true));
 }
 
 void X86_64Assembler::GetCurrentThread(ManagedRegister tr) {
-  gs()->movq(tr.AsX86_64().AsCpuRegister(),
-             Address::Absolute(Thread::SelfOffset(), true));
+  gs()->movq(tr.AsX86_64().AsCpuRegister(), Address::Absolute(Thread::SelfOffset<8>(), true));
 }
 
-void X86_64Assembler::GetCurrentThread(FrameOffset offset,
-                                    ManagedRegister mscratch) {
+void X86_64Assembler::GetCurrentThread(FrameOffset offset, ManagedRegister mscratch) {
   X86_64ManagedRegister scratch = mscratch.AsX86_64();
-  gs()->movq(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset(), true));
-  movq(Address(RSP, offset), scratch.AsCpuRegister());
+  gs()->movq(scratch.AsCpuRegister(), Address::Absolute(Thread::SelfOffset<8>(), true));
+  movq(Address(CpuRegister(RSP), offset), scratch.AsCpuRegister());
 }
 
+// Slowpath entered when Thread::Current()->_exception is non-null
+class X86_64ExceptionSlowPath FINAL : public SlowPath {
+ public:
+  explicit X86_64ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
+  virtual void Emit(Assembler *sp_asm) OVERRIDE;
+ private:
+  const size_t stack_adjust_;
+};
+
 void X86_64Assembler::ExceptionPoll(ManagedRegister /*scratch*/, size_t stack_adjust) {
-  X86ExceptionSlowPath* slow = new X86ExceptionSlowPath(stack_adjust);
+  X86_64ExceptionSlowPath* slow = new X86_64ExceptionSlowPath(stack_adjust);
   buffer_.EnqueueSlowPath(slow);
-  gs()->cmpl(Address::Absolute(Thread::ExceptionOffset(), true), Immediate(0));
+  gs()->cmpl(Address::Absolute(Thread::ExceptionOffset<8>(), true), Immediate(0));
   j(kNotEqual, slow->Entry());
 }
 
-void X86ExceptionSlowPath::Emit(Assembler *sasm) {
+void X86_64ExceptionSlowPath::Emit(Assembler *sasm) {
   X86_64Assembler* sp_asm = down_cast<X86_64Assembler*>(sasm);
 #define __ sp_asm->
   __ Bind(&entry_);
@@ -1925,27 +2012,14 @@
   if (stack_adjust_ != 0) {  // Fix up the frame.
     __ DecreaseFrameSize(stack_adjust_);
   }
-  // Pass exception as argument in RAX
-  __ gs()->movq(RAX, Address::Absolute(Thread::ExceptionOffset(), true));  // TODO(64): Pass argument via RDI
-  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(pDeliverException), true));
+  // Pass exception as argument in RDI
+  __ gs()->movq(CpuRegister(RDI), Address::Absolute(Thread::ExceptionOffset<8>(), true));
+  __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(8, pDeliverException), true));
   // this call should never return
   __ int3();
 #undef __
 }
 
-static const char* kRegisterNames[] = {
-  "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
-  "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
-};
-
-std::ostream& operator<<(std::ostream& os, const Register& rhs) {
-  if (rhs >= RAX && rhs <= R15) {
-    os << kRegisterNames[rhs];
-  } else {
-    os << "Register[" << static_cast<int>(rhs) << "]";
-  }
-  return os;
-}
 }  // namespace x86_64
 }  // namespace art
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index d48ba72..1d42d89 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -80,25 +80,30 @@
     return value;
   }
 
-  bool IsRegister(Register reg) const {
+  bool IsRegister(CpuRegister reg) const {
+    CHECK(!reg.NeedsRex()) << "TODO: rex support:" << reg;
     return ((encoding_[0] & 0xF8) == 0xC0)  // Addressing mode is register only.
-        && ((encoding_[0] & 0x07) == reg);  // Register codes match.
+        && ((encoding_[0] & 0x07) == reg.LowBits());  // Register codes match.
   }
 
  protected:
   // Operand can be sub classed (e.g: Address).
   Operand() : length_(0) { }
 
-  void SetModRM(int mod, Register rm) {
+  void SetModRM(int mod, CpuRegister rm) {
     CHECK_EQ(mod & ~3, 0);
-    encoding_[0] = (mod << 6) | rm;
+    CHECK(!rm.NeedsRex());
+    encoding_[0] = (mod << 6) | static_cast<uint8_t>(rm.AsRegister());
     length_ = 1;
   }
 
-  void SetSIB(ScaleFactor scale, Register index, Register base) {
+  void SetSIB(ScaleFactor scale, CpuRegister index, CpuRegister base) {
+    CHECK(!index.NeedsRex()) << "TODO: rex support: " << index;
+    CHECK(!base.NeedsRex()) << "TODO: rex support: " << base;
     CHECK_EQ(length_, 1);
     CHECK_EQ(scale & ~3, 0);
-    encoding_[1] = (scale << 6) | (index << 3) | base;
+    encoding_[1] = (scale << 6) | (static_cast<uint8_t>(index.AsRegister()) << 3) |
+        static_cast<uint8_t>(base.AsRegister());
     length_ = 2;
   }
 
@@ -117,9 +122,8 @@
  private:
   byte length_;
   byte encoding_[6];
-  byte padding_;
 
-  explicit Operand(Register reg) { SetModRM(3, reg); }
+  explicit Operand(CpuRegister reg) { SetModRM(3, reg); }
 
   // Get the operand encoding byte at the given index.
   uint8_t encoding_at(int index) const {
@@ -136,77 +140,85 @@
 
 class Address : public Operand {
  public:
-  Address(Register base, int32_t disp) {
+  Address(CpuRegister base, int32_t disp) {
     Init(base, disp);
   }
 
-  Address(Register base, Offset disp) {
+  Address(CpuRegister base, Offset disp) {
     Init(base, disp.Int32Value());
   }
 
-  Address(Register base, FrameOffset disp) {
-    CHECK_EQ(base, RSP);
-    Init(RSP, disp.Int32Value());
+  Address(CpuRegister base, FrameOffset disp) {
+    CHECK_EQ(base.AsRegister(), RSP);
+    Init(CpuRegister(RSP), disp.Int32Value());
   }
 
-  Address(Register base, MemberOffset disp) {
+  Address(CpuRegister base, MemberOffset disp) {
     Init(base, disp.Int32Value());
   }
 
-  void Init(Register base, int32_t disp) {
-    if (disp == 0 && base != RBP) {
+  void Init(CpuRegister base, int32_t disp) {
+    if (disp == 0 && base.AsRegister() != RBP) {
       SetModRM(0, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.AsRegister() == RSP) {
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
     } else if (disp >= -128 && disp <= 127) {
       SetModRM(1, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.AsRegister() == RSP) {
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
       SetDisp8(disp);
     } else {
       SetModRM(2, base);
-      if (base == RSP) SetSIB(TIMES_1, RSP, base);
+      if (base.AsRegister() == RSP) {
+        SetSIB(TIMES_1, CpuRegister(RSP), base);
+      }
       SetDisp32(disp);
     }
   }
 
 
-  Address(Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, RSP);  // Illegal addressing mode.
-    SetModRM(0, RSP);
-    SetSIB(scale, index, RBP);
+  Address(CpuRegister index, ScaleFactor scale, int32_t disp) {
+    CHECK_NE(index.AsRegister(), RSP);  // Illegal addressing mode.
+    SetModRM(0, CpuRegister(RSP));
+    SetSIB(scale, index, CpuRegister(RBP));
     SetDisp32(disp);
   }
 
-  Address(Register base, Register index, ScaleFactor scale, int32_t disp) {
-    CHECK_NE(index, RSP);  // Illegal addressing mode.
-    if (disp == 0 && base != RBP) {
-      SetModRM(0, RSP);
+  Address(CpuRegister base, CpuRegister index, ScaleFactor scale, int32_t disp) {
+    CHECK_NE(index.AsRegister(), RSP);  // Illegal addressing mode.
+    if (disp == 0 && base.AsRegister() != RBP) {
+      SetModRM(0, CpuRegister(RSP));
       SetSIB(scale, index, base);
     } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, RSP);
+      SetModRM(1, CpuRegister(RSP));
       SetSIB(scale, index, base);
       SetDisp8(disp);
     } else {
-      SetModRM(2, RSP);
+      SetModRM(2, CpuRegister(RSP));
       SetSIB(scale, index, base);
       SetDisp32(disp);
     }
   }
 
-  static Address Absolute(uword addr, bool has_rip = false) {
+  // If no_rip is true then the Absolute address isn't RIP relative.
+  static Address Absolute(uword addr, bool no_rip = false) {
     Address result;
-    if (has_rip) {
-      result.SetModRM(0, RSP);
-      result.SetSIB(TIMES_1, RSP, RBP);
+    if (no_rip) {
+      result.SetModRM(0, CpuRegister(RSP));
+      result.SetSIB(TIMES_1, CpuRegister(RSP), CpuRegister(RBP));
       result.SetDisp32(addr);
     } else {
-      result.SetModRM(0, RBP);
+      result.SetModRM(0, CpuRegister(RBP));
       result.SetDisp32(addr);
     }
     return result;
   }
 
-  static Address Absolute(ThreadOffset addr, bool has_rip = false) {
-    return Absolute(addr.Int32Value(), has_rip);
+  // If no_rip is true then the Absolute address isn't RIP relative.
+  static Address Absolute(ThreadOffset<8> addr, bool no_rip = false) {
+    return Absolute(addr.Int32Value(), no_rip);
   }
 
  private:
@@ -216,7 +228,7 @@
 };
 
 
-class X86_64Assembler : public Assembler {
+class X86_64Assembler FINAL : public Assembler {
  public:
   X86_64Assembler() {}
   virtual ~X86_64Assembler() {}
@@ -224,56 +236,51 @@
   /*
    * Emit Machine Instructions.
    */
-  void call(Register reg);
+  void call(CpuRegister reg);
   void call(const Address& address);
   void call(Label* label);
 
-  void pushq(Register reg);
+  void pushq(CpuRegister reg);
   void pushq(const Address& address);
   void pushq(const Immediate& imm);
 
-  void popq(Register reg);
+  void popq(CpuRegister reg);
   void popq(const Address& address);
 
-  void movq(Register dst, const Immediate& src);
-  void movl(Register dst, const Immediate& src);
-  void movq(Register dst, Register src);
-  void movl(Register dst, Register src);
+  void movq(CpuRegister dst, const Immediate& src);
+  void movl(CpuRegister dst, const Immediate& src);
+  void movq(CpuRegister dst, CpuRegister src);
+  void movl(CpuRegister dst, CpuRegister src);
 
-  void movq(Register dst, const Address& src);
-  void movl(Register dst, const Address& src);
-  void movq(const Address& dst, Register src);
-  void movl(const Address& dst, Register src);
+  void movq(CpuRegister dst, const Address& src);
+  void movl(CpuRegister dst, const Address& src);
+  void movq(const Address& dst, CpuRegister src);
+  void movl(const Address& dst, CpuRegister src);
   void movl(const Address& dst, const Immediate& imm);
-  void movl(const Address& dst, Label* lbl);
 
-  void movzxb(Register dst, ByteRegister src);
-  void movzxb(Register dst, const Address& src);
-  void movsxb(Register dst, ByteRegister src);
-  void movsxb(Register dst, const Address& src);
-  void movb(Register dst, const Address& src);
-  void movb(const Address& dst, ByteRegister src);
+  void movzxb(CpuRegister dst, CpuRegister src);
+  void movzxb(CpuRegister dst, const Address& src);
+  void movsxb(CpuRegister dst, CpuRegister src);
+  void movsxb(CpuRegister dst, const Address& src);
+  void movb(CpuRegister dst, const Address& src);
+  void movb(const Address& dst, CpuRegister src);
   void movb(const Address& dst, const Immediate& imm);
 
-  void movzxw(Register dst, Register src);
-  void movzxw(Register dst, const Address& src);
-  void movsxw(Register dst, Register src);
-  void movsxw(Register dst, const Address& src);
-  void movw(Register dst, const Address& src);
-  void movw(const Address& dst, Register src);
+  void movzxw(CpuRegister dst, CpuRegister src);
+  void movzxw(CpuRegister dst, const Address& src);
+  void movsxw(CpuRegister dst, CpuRegister src);
+  void movsxw(CpuRegister dst, const Address& src);
+  void movw(CpuRegister dst, const Address& src);
+  void movw(const Address& dst, CpuRegister src);
 
-  void leaq(Register dst, const Address& src);
-
-  void cmovl(Condition condition, Register dst, Register src);
-
-  void setb(Condition condition, Register dst);
+  void leaq(CpuRegister dst, const Address& src);
 
   void movss(XmmRegister dst, const Address& src);
   void movss(const Address& dst, XmmRegister src);
   void movss(XmmRegister dst, XmmRegister src);
 
-  void movd(XmmRegister dst, Register src);
-  void movd(Register dst, XmmRegister src);
+  void movd(XmmRegister dst, CpuRegister src);
+  void movd(CpuRegister dst, XmmRegister src);
 
   void addss(XmmRegister dst, XmmRegister src);
   void addss(XmmRegister dst, const Address& src);
@@ -297,17 +304,17 @@
   void divsd(XmmRegister dst, XmmRegister src);
   void divsd(XmmRegister dst, const Address& src);
 
-  void cvtsi2ss(XmmRegister dst, Register src);
-  void cvtsi2sd(XmmRegister dst, Register src);
+  void cvtsi2ss(XmmRegister dst, CpuRegister src);
+  void cvtsi2sd(XmmRegister dst, CpuRegister src);
 
-  void cvtss2si(Register dst, XmmRegister src);
+  void cvtss2si(CpuRegister dst, XmmRegister src);
   void cvtss2sd(XmmRegister dst, XmmRegister src);
 
-  void cvtsd2si(Register dst, XmmRegister src);
+  void cvtsd2si(CpuRegister dst, XmmRegister src);
   void cvtsd2ss(XmmRegister dst, XmmRegister src);
 
-  void cvttss2si(Register dst, XmmRegister src);
-  void cvttsd2si(Register dst, XmmRegister src);
+  void cvttss2si(CpuRegister dst, XmmRegister src);
+  void cvttsd2si(CpuRegister dst, XmmRegister src);
 
   void cvtdq2pd(XmmRegister dst, XmmRegister src);
 
@@ -344,77 +351,62 @@
   void fcos();
   void fptan();
 
-  void xchgl(Register dst, Register src);
-  void xchgl(Register reg, const Address& address);
+  void xchgl(CpuRegister dst, CpuRegister src);
+  void xchgl(CpuRegister reg, const Address& address);
 
-  void cmpl(Register reg, const Immediate& imm);
-  void cmpl(Register reg0, Register reg1);
-  void cmpl(Register reg, const Address& address);
+  void cmpl(CpuRegister reg, const Immediate& imm);
+  void cmpl(CpuRegister reg0, CpuRegister reg1);
+  void cmpl(CpuRegister reg, const Address& address);
 
-  void cmpl(const Address& address, Register reg);
+  void cmpl(const Address& address, CpuRegister reg);
   void cmpl(const Address& address, const Immediate& imm);
 
-  void testl(Register reg1, Register reg2);
-  void testl(Register reg, const Immediate& imm);
+  void testl(CpuRegister reg1, CpuRegister reg2);
+  void testl(CpuRegister reg, const Immediate& imm);
 
-  void andl(Register dst, const Immediate& imm);
-  void andl(Register dst, Register src);
+  void andl(CpuRegister dst, const Immediate& imm);
+  void andl(CpuRegister dst, CpuRegister src);
 
-  void orl(Register dst, const Immediate& imm);
-  void orl(Register dst, Register src);
+  void orl(CpuRegister dst, const Immediate& imm);
+  void orl(CpuRegister dst, CpuRegister src);
 
-  void xorl(Register dst, Register src);
+  void xorl(CpuRegister dst, CpuRegister src);
 
-  void addl(Register dst, Register src);
-  void addq(Register reg, const Immediate& imm);
-  void addl(Register reg, const Immediate& imm);
-  void addl(Register reg, const Address& address);
+  void addl(CpuRegister dst, CpuRegister src);
+  void addq(CpuRegister reg, const Immediate& imm);
+  void addl(CpuRegister reg, const Immediate& imm);
+  void addl(CpuRegister reg, const Address& address);
 
-  void addl(const Address& address, Register reg);
+  void addl(const Address& address, CpuRegister reg);
   void addl(const Address& address, const Immediate& imm);
 
-  void adcl(Register dst, Register src);
-  void adcl(Register reg, const Immediate& imm);
-  void adcl(Register dst, const Address& address);
-
-  void subl(Register dst, Register src);
-  void subl(Register reg, const Immediate& imm);
-  void subl(Register reg, const Address& address);
+  void subl(CpuRegister dst, CpuRegister src);
+  void subl(CpuRegister reg, const Immediate& imm);
+  void subl(CpuRegister reg, const Address& address);
 
   void cdq();
 
-  void idivl(Register reg);
+  void idivl(CpuRegister reg);
 
-  void imull(Register dst, Register src);
-  void imull(Register reg, const Immediate& imm);
-  void imull(Register reg, const Address& address);
+  void imull(CpuRegister dst, CpuRegister src);
+  void imull(CpuRegister reg, const Immediate& imm);
+  void imull(CpuRegister reg, const Address& address);
 
-  void imull(Register reg);
+  void imull(CpuRegister reg);
   void imull(const Address& address);
 
-  void mull(Register reg);
+  void mull(CpuRegister reg);
   void mull(const Address& address);
 
-  void sbbl(Register dst, Register src);
-  void sbbl(Register reg, const Immediate& imm);
-  void sbbl(Register reg, const Address& address);
+  void shll(CpuRegister reg, const Immediate& imm);
+  void shll(CpuRegister operand, CpuRegister shifter);
+  void shrl(CpuRegister reg, const Immediate& imm);
+  void shrl(CpuRegister operand, CpuRegister shifter);
+  void sarl(CpuRegister reg, const Immediate& imm);
+  void sarl(CpuRegister operand, CpuRegister shifter);
 
-  void incl(Register reg);
-  void incl(const Address& address);
-
-  void decl(Register reg);
-  void decl(const Address& address);
-
-  void shll(Register reg, const Immediate& imm);
-  void shll(Register operand, Register shifter);
-  void shrl(Register reg, const Immediate& imm);
-  void shrl(Register operand, Register shifter);
-  void sarl(Register reg, const Immediate& imm);
-  void sarl(Register operand, Register shifter);
-  void shld(Register dst, Register src);
-
-  void negl(Register reg);
-  void notl(Register reg);
+  void negl(CpuRegister reg);
+  void notl(CpuRegister reg);
 
   void enter(const Immediate& imm);
   void leave();
@@ -428,12 +420,12 @@
 
   void j(Condition condition, Label* label);
 
-  void jmp(Register reg);
+  void jmp(CpuRegister reg);
   void jmp(const Address& address);
   void jmp(Label* label);
 
   X86_64Assembler* lock();
-  void cmpxchgl(const Address& address, Register reg);
+  void cmpxchgl(const Address& address, CpuRegister reg);
 
   void mfence();
 
@@ -443,7 +435,7 @@
   // Macros for High-level operations.
   //
 
-  void AddImmediate(Register reg, const Immediate& imm);
+  void AddImmediate(CpuRegister reg, const Immediate& imm);
 
   void LoadDoubleConstant(XmmRegister dst, double value);
 
@@ -452,7 +444,7 @@
 
   void DoubleAbs(XmmRegister reg);
 
-  void LockCmpxchgl(const Address& address, Register reg) {
+  void LockCmpxchgl(const Address& address, CpuRegister reg) {
     lock()->cmpxchgl(address, reg);
   }
 
@@ -468,109 +460,99 @@
   //
 
   // Emit code that will create an activation on the stack
-  virtual void BuildFrame(size_t frame_size, ManagedRegister method_reg,
-                          const std::vector<ManagedRegister>& callee_save_regs,
-                          const ManagedRegisterEntrySpills& entry_spills);
+  void BuildFrame(size_t frame_size, ManagedRegister method_reg,
+                  const std::vector<ManagedRegister>& callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
 
   // Emit code that will remove an activation from the stack
-  virtual void RemoveFrame(size_t frame_size,
-                           const std::vector<ManagedRegister>& callee_save_regs);
+  void RemoveFrame(size_t frame_size, const std::vector<ManagedRegister>& callee_save_regs)
+      OVERRIDE;
 
-  virtual void IncreaseFrameSize(size_t adjust);
-  virtual void DecreaseFrameSize(size_t adjust);
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
 
   // Store routines
-  virtual void Store(FrameOffset offs, ManagedRegister src, size_t size);
-  virtual void StoreRef(FrameOffset dest, ManagedRegister src);
-  virtual void StoreRawPtr(FrameOffset dest, ManagedRegister src);
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
 
-  virtual void StoreImmediateToFrame(FrameOffset dest, uint32_t imm,
-                                     ManagedRegister scratch);
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreImmediateToThread(ThreadOffset dest, uint32_t imm,
-                                      ManagedRegister scratch);
+  void StoreImmediateToThread64(ThreadOffset<8> dest, uint32_t imm, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void StoreStackOffsetToThread(ThreadOffset thr_offs,
-                                        FrameOffset fr_offs,
-                                        ManagedRegister scratch);
+  void StoreStackOffsetToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs,
+                                  ManagedRegister scratch) OVERRIDE;
 
-  virtual void StoreStackPointerToThread(ThreadOffset thr_offs);
+  void StoreStackPointerToThread64(ThreadOffset<8> thr_offs) OVERRIDE;
 
-  void StoreLabelToThread(ThreadOffset thr_offs, Label* lbl);
-
-  virtual void StoreSpanning(FrameOffset dest, ManagedRegister src,
-                             FrameOffset in_off, ManagedRegister scratch);
+  void StoreSpanning(FrameOffset dest, ManagedRegister src, FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
 
   // Load routines
-  virtual void Load(ManagedRegister dest, FrameOffset src, size_t size);
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
 
-  virtual void Load(ManagedRegister dest, ThreadOffset src, size_t size);
+  void LoadFromThread64(ManagedRegister dest, ThreadOffset<8> src, size_t size) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, FrameOffset  src);
+  void LoadRef(ManagedRegister dest, FrameOffset  src) OVERRIDE;
 
-  virtual void LoadRef(ManagedRegister dest, ManagedRegister base,
-                       MemberOffset offs);
+  void LoadRef(ManagedRegister dest, ManagedRegister base, MemberOffset offs) OVERRIDE;
 
-  virtual void LoadRawPtr(ManagedRegister dest, ManagedRegister base,
-                          Offset offs);
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
 
-  virtual void LoadRawPtrFromThread(ManagedRegister dest,
-                                    ThreadOffset offs);
+  void LoadRawPtrFromThread64(ManagedRegister dest, ThreadOffset<8> offs) OVERRIDE;
 
   // Copying routines
-  virtual void Move(ManagedRegister dest, ManagedRegister src, size_t size);
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
 
-  virtual void CopyRawPtrFromThread(FrameOffset fr_offs, ThreadOffset thr_offs,
-                                    ManagedRegister scratch);
+  void CopyRawPtrFromThread64(FrameOffset fr_offs, ThreadOffset<8> thr_offs,
+                              ManagedRegister scratch) OVERRIDE;
 
-  virtual void CopyRawPtrToThread(ThreadOffset thr_offs, FrameOffset fr_offs,
-                                  ManagedRegister scratch);
+  void CopyRawPtrToThread64(ThreadOffset<8> thr_offs, FrameOffset fr_offs, ManagedRegister scratch)
+      OVERRIDE;
 
-  virtual void CopyRef(FrameOffset dest, FrameOffset src,
-                       ManagedRegister scratch);
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, ManagedRegister src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest_base, Offset dest_offset, FrameOffset src, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, FrameOffset src_base, Offset src_offset, ManagedRegister scratch,
+            size_t size) OVERRIDE;
 
-  virtual void Copy(ManagedRegister dest, Offset dest_offset,
-                    ManagedRegister src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(ManagedRegister dest, Offset dest_offset, ManagedRegister src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
-                    ManagedRegister scratch, size_t size);
+  void Copy(FrameOffset dest, Offset dest_offset, FrameOffset src, Offset src_offset,
+            ManagedRegister scratch, size_t size) OVERRIDE;
 
-  virtual void MemoryBarrier(ManagedRegister);
+  void MemoryBarrier(ManagedRegister) OVERRIDE;
 
   // Sign extension
-  virtual void SignExtend(ManagedRegister mreg, size_t size);
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Zero extension
-  virtual void ZeroExtend(ManagedRegister mreg, size_t size);
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
 
   // Exploit fast access in managed code to Thread::Current()
-  virtual void GetCurrentThread(ManagedRegister tr);
-  virtual void GetCurrentThread(FrameOffset dest_offset,
-                                ManagedRegister scratch);
+  void GetCurrentThread(ManagedRegister tr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset, ManagedRegister scratch) OVERRIDE;
 
   // Set up out_reg to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed. in_reg holds a possibly stale reference
   // that can be used to avoid loading the SIRT entry to see if the value is
   // NULL.
-  virtual void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset,
-                               ManagedRegister in_reg, bool null_allowed);
+  void CreateSirtEntry(ManagedRegister out_reg, FrameOffset sirt_offset, ManagedRegister in_reg,
+                       bool null_allowed) OVERRIDE;
 
   // Set up out_off to hold a Object** into the SIRT, or to be NULL if the
   // value is null and null_allowed.
-  virtual void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset,
-                               ManagedRegister scratch, bool null_allowed);
+  void CreateSirtEntry(FrameOffset out_off, FrameOffset sirt_offset, ManagedRegister scratch,
+                       bool null_allowed) OVERRIDE;
 
   // src holds a SIRT entry (Object**) load this into dst
   virtual void LoadReferenceFromSirt(ManagedRegister dst,
@@ -578,40 +560,57 @@
 
   // Heap::VerifyObject on src. In some cases (such as a reference to this) we
   // know that src may not be null.
-  virtual void VerifyObject(ManagedRegister src, bool could_be_null);
-  virtual void VerifyObject(FrameOffset src, bool could_be_null);
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
 
   // Call to address held at [base+offset]
-  virtual void Call(ManagedRegister base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(FrameOffset base, Offset offset,
-                    ManagedRegister scratch);
-  virtual void Call(ThreadOffset offset, ManagedRegister scratch);
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread64(ThreadOffset<8> offset, ManagedRegister scratch) OVERRIDE;
 
   // Generate code to check if Thread::Current()->exception_ is non-null
   // and branch to a ExceptionSlowPath if it is.
-  virtual void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust);
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
 
  private:
-  inline void EmitUint8(uint8_t value);
-  inline void EmitInt32(int32_t value);
-  inline void EmitRegisterOperand(int rm, int reg);
-  inline void EmitXmmRegisterOperand(int rm, XmmRegister reg);
-  inline void EmitFixup(AssemblerFixup* fixup);
-  inline void EmitOperandSizeOverride();
+  void EmitUint8(uint8_t value);
+  void EmitInt32(int32_t value);
+  void EmitRegisterOperand(uint8_t rm, uint8_t reg);
+  void EmitXmmRegisterOperand(uint8_t rm, XmmRegister reg);
+  void EmitFixup(AssemblerFixup* fixup);
+  void EmitOperandSizeOverride();
 
-  void EmitOperand(int rm, const Operand& operand);
+  void EmitOperand(uint8_t rm, const Operand& operand);
   void EmitImmediate(const Immediate& imm);
-  void EmitComplex(int rm, const Operand& operand, const Immediate& immediate);
+  void EmitComplex(uint8_t rm, const Operand& operand, const Immediate& immediate);
   void EmitLabel(Label* label, int instruction_size);
   void EmitLabelLink(Label* label);
   void EmitNearLabelLink(Label* label);
 
-  void EmitGenericShift(int rm, Register reg, const Immediate& imm);
-  void EmitGenericShift(int rm, Register operand, Register shifter);
-  void rex(Register &dst, Register &src, size_t size = 4);
-  void rex_reg(Register &dst, size_t size = 4);
-  void rex_rm(Register &src, size_t size = 4);
+  void EmitGenericShift(int rm, CpuRegister reg, const Immediate& imm);
+  void EmitGenericShift(int rm, CpuRegister operand, CpuRegister shifter);
+
+  // If any input is true, output the necessary REX prefix.
+  void EmitOptionalRex(bool force, bool w, bool r, bool x, bool b);
+
+  // Emit a REX prefix byte if reg needs one, i.e. if reg is in the range R8 to R15.
+  void EmitOptionalRex32(CpuRegister reg);
+  void EmitOptionalRex32(CpuRegister dst, CpuRegister src);
+  void EmitOptionalRex32(XmmRegister dst, XmmRegister src);
+  void EmitOptionalRex32(CpuRegister dst, XmmRegister src);
+  void EmitOptionalRex32(XmmRegister dst, CpuRegister src);
+  void EmitOptionalRex32(const Operand& operand);
+  void EmitOptionalRex32(CpuRegister dst, const Operand& operand);
+  void EmitOptionalRex32(XmmRegister dst, const Operand& operand);
+
+  // Emit a REX.W prefix plus necessary register bit encodings.
+  void EmitRex64(CpuRegister reg);
+  void EmitRex64(CpuRegister dst, CpuRegister src);
+  void EmitRex64(CpuRegister dst, const Operand& operand);
+
+  // Emit a REX prefix to normalize byte registers plus necessary register bit encodings.
+  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, CpuRegister src);
+  void EmitOptionalByteRegNormalizingRex32(CpuRegister dst, const Operand& operand);
 
   DISALLOW_COPY_AND_ASSIGN(X86_64Assembler);
 };
@@ -624,14 +623,14 @@
   buffer_.Emit<int32_t>(value);
 }
 
-inline void X86_64Assembler::EmitRegisterOperand(int rm, int reg) {
+inline void X86_64Assembler::EmitRegisterOperand(uint8_t rm, uint8_t reg) {
   CHECK_GE(rm, 0);
   CHECK_LT(rm, 8);
   buffer_.Emit<uint8_t>(0xC0 + (rm << 3) + reg);
 }
 
-inline void X86_64Assembler::EmitXmmRegisterOperand(int rm, XmmRegister reg) {
-  EmitRegisterOperand(rm, static_cast<Register>(reg));
+inline void X86_64Assembler::EmitXmmRegisterOperand(uint8_t rm, XmmRegister reg) {
+  EmitRegisterOperand(rm, static_cast<uint8_t>(reg.AsFloatRegister()));
 }
 
 inline void X86_64Assembler::EmitFixup(AssemblerFixup* fixup) {
@@ -642,15 +641,6 @@
   EmitUint8(0x66);
 }
 
-// Slowpath entered when Thread::Current()->_exception is non-null
-class X86ExceptionSlowPath : public SlowPath {
- public:
-  explicit X86ExceptionSlowPath(size_t stack_adjust) : stack_adjust_(stack_adjust) {}
-  virtual void Emit(Assembler *sp_asm);
- private:
-  const size_t stack_adjust_;
-};
-
 }  // namespace x86_64
 }  // namespace art
 
diff --git a/compiler/utils/x86_64/constants_x86_64.h b/compiler/utils/x86_64/constants_x86_64.h
index 3340802..58a0379 100644
--- a/compiler/utils/x86_64/constants_x86_64.h
+++ b/compiler/utils/x86_64/constants_x86_64.h
@@ -27,30 +27,37 @@
 namespace art {
 namespace x86_64 {
 
-enum ByteRegister {
-  AL = 0,
-  CL = 1,
-  DL = 2,
-  BL = 3,
-  AH = 4,
-  CH = 5,
-  DH = 6,
-  BH = 7,
-  kNoByteRegister = -1  // Signals an illegal register.
+class CpuRegister {
+ public:
+  explicit CpuRegister(Register r) : reg_(r) {}
+  Register AsRegister() const {
+    return reg_;
+  }
+  uint8_t LowBits() const {
+    return reg_ & 7;
+  }
+  bool NeedsRex() const {
+    return reg_ > 7;
+  }
+ private:
+  const Register reg_;
 };
+std::ostream& operator<<(std::ostream& os, const CpuRegister& reg);
 
-
-enum XmmRegister {
-  _XMM0 = 0,
-  _XMM1 = 1,
-  _XMM2 = 2,
-  _XMM3 = 3,
-  _XMM4 = 4,
-  _XMM5 = 5,
-  _XMM6 = 6,
-  _XMM7 = 7,
-  kNumberOfXmmRegisters = 8,
-  kNoXmmRegister = -1  // Signals an illegal register.
+class XmmRegister {
+ public:
+  explicit XmmRegister(FloatRegister r) : reg_(r) {}
+  FloatRegister AsFloatRegister() const {
+    return reg_;
+  }
+  uint8_t LowBits() const {
+    return reg_ & 7;
+  }
+  bool NeedsRex() const {
+    return reg_ > 7;
+  }
+ private:
+  const FloatRegister reg_;
 };
 std::ostream& operator<<(std::ostream& os, const XmmRegister& reg);
 
diff --git a/compiler/utils/x86_64/managed_register_x86_64.cc b/compiler/utils/x86_64/managed_register_x86_64.cc
index 057a894..b8c2db2 100644
--- a/compiler/utils/x86_64/managed_register_x86_64.cc
+++ b/compiler/utils/x86_64/managed_register_x86_64.cc
@@ -60,8 +60,8 @@
   CHECK(other.IsValidManagedRegister());
   if (Equals(other)) return true;
   if (IsRegisterPair()) {
-    Register low = AsRegisterPairLow();
-    Register high = AsRegisterPairHigh();
+    Register low = AsRegisterPairLow().AsRegister();
+    Register high = AsRegisterPairHigh().AsRegister();
     return X86_64ManagedRegister::FromCpuRegister(low).Overlaps(other) ||
         X86_64ManagedRegister::FromCpuRegister(high).Overlaps(other);
   }
@@ -94,11 +94,11 @@
   if (!IsValidManagedRegister()) {
     os << "No Register";
   } else if (IsXmmRegister()) {
-    os << "XMM: " << static_cast<int>(AsXmmRegister());
+    os << "XMM: " << static_cast<int>(AsXmmRegister().AsFloatRegister());
   } else if (IsX87Register()) {
     os << "X87: " << static_cast<int>(AsX87Register());
   } else if (IsCpuRegister()) {
-    os << "CPU: " << static_cast<int>(AsCpuRegister());
+    os << "CPU: " << static_cast<int>(AsCpuRegister().AsRegister());
   } else if (IsRegisterPair()) {
     os << "Pair: " << AsRegisterPairLow() << ", " << AsRegisterPairHigh();
   } else {
diff --git a/compiler/utils/x86_64/managed_register_x86_64.h b/compiler/utils/x86_64/managed_register_x86_64.h
index d68c59d..822659f 100644
--- a/compiler/utils/x86_64/managed_register_x86_64.h
+++ b/compiler/utils/x86_64/managed_register_x86_64.h
@@ -46,8 +46,8 @@
 const int kNumberOfCpuRegIds = kNumberOfCpuRegisters;
 const int kNumberOfCpuAllocIds = kNumberOfCpuRegisters;
 
-const int kNumberOfXmmRegIds = kNumberOfXmmRegisters;
-const int kNumberOfXmmAllocIds = kNumberOfXmmRegisters;
+const int kNumberOfXmmRegIds = kNumberOfFloatRegisters;
+const int kNumberOfXmmAllocIds = kNumberOfFloatRegisters;
 
 const int kNumberOfX87RegIds = kNumberOfX87Registers;
 const int kNumberOfX87AllocIds = kNumberOfX87Registers;
@@ -87,20 +87,14 @@
 // There is a one-to-one mapping between ManagedRegister and register id.
 class X86_64ManagedRegister : public ManagedRegister {
  public:
-  ByteRegister AsByteRegister() const {
+  CpuRegister AsCpuRegister() const {
     CHECK(IsCpuRegister());
-    CHECK_LT(AsCpuRegister(), RSP);  // RSP, RBP, ESI and RDI cannot be encoded as byte registers.
-    return static_cast<ByteRegister>(id_);
-  }
-
-  Register AsCpuRegister() const {
-    CHECK(IsCpuRegister());
-    return static_cast<Register>(id_);
+    return CpuRegister(static_cast<Register>(id_));
   }
 
   XmmRegister AsXmmRegister() const {
     CHECK(IsXmmRegister());
-    return static_cast<XmmRegister>(id_ - kNumberOfCpuRegIds);
+    return XmmRegister(static_cast<FloatRegister>(id_ - kNumberOfCpuRegIds));
   }
 
   X87Register AsX87Register() const {
@@ -109,13 +103,13 @@
                                     (kNumberOfCpuRegIds + kNumberOfXmmRegIds));
   }
 
-  Register AsRegisterPairLow() const {
+  CpuRegister AsRegisterPairLow() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdLow().
     return FromRegId(AllocIdLow()).AsCpuRegister();
   }
 
-  Register AsRegisterPairHigh() const {
+  CpuRegister AsRegisterPairHigh() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdHigh().
     return FromRegId(AllocIdHigh()).AsCpuRegister();
@@ -157,8 +151,7 @@
     return FromRegId(r);
   }
 
-  static X86_64ManagedRegister FromXmmRegister(XmmRegister r) {
-    CHECK_NE(r, kNoXmmRegister);
+  static X86_64ManagedRegister FromXmmRegister(FloatRegister r) {
     return FromRegId(r + kNumberOfCpuRegIds);
   }