49 files changed, 956 insertions(+), 360 deletions(-)
diff --git a/Android.mk b/Android.mk index 0eaca7f921..3324458bf5 100644 --- a/Android.mk +++ b/Android.mk @@ -97,6 +97,9 @@ include $(art_build_path)/Android.oat.mk ART_HOST_DEPENDENCIES := $(ART_HOST_EXECUTABLES) $(HOST_OUT_JAVA_LIBRARIES)/core-libart-hostdex.jar ART_HOST_DEPENDENCIES += $(HOST_OUT_SHARED_LIBRARIES)/libjavacore$(ART_HOST_SHLIB_EXTENSION) ART_TARGET_DEPENDENCIES := $(ART_TARGET_EXECUTABLES) $(TARGET_OUT_JAVA_LIBRARIES)/core-libart.jar $(TARGET_OUT_SHARED_LIBRARIES)/libjavacore.so +ifdef TARGET_2ND_ARCH +ART_TARGET_DEPENDENCIES += $(2ND_TARGET_OUT_SHARED_LIBRARIES)/libjavacore.so +endif ######################################################################## # test targets diff --git a/build/Android.common.mk b/build/Android.common.mk index 3961219d07..ae54efb061 100644 --- a/build/Android.common.mk +++ b/build/Android.common.mk @@ -147,8 +147,6 @@ ART_TEST_OUT := $(TARGET_OUT_DATA)/art-test 2ND_TARGET_ARCH := $(TARGET_2ND_ARCH) ART_PHONY_TEST_TARGET_SUFFIX := 2ND_ART_PHONY_TEST_TARGET_SUFFIX := -ART_TARGET_BINARY_SUFFIX := -2ND_ART_TARGET_BINARY_SUFFIX := ifdef TARGET_2ND_ARCH art_test_primary_suffix := art_test_secondary_suffix := @@ -156,7 +154,6 @@ ifdef TARGET_2ND_ARCH art_test_primary_suffix := 64 ART_PHONY_TEST_TARGET_SUFFIX := 64 2ND_ART_PHONY_TEST_TARGET_SUFFIX := 32 - ART_TARGET_BINARY_SUFFIX := 64 ART_TARGET_ARCH_32 := $(TARGET_2ND_ARCH) ART_TARGET_ARCH_64 := $(TARGET_ARCH) else diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc index 3bc060ba27..77b5057538 100644 --- a/compiler/dex/frontend.cc +++ b/compiler/dex/frontend.cc @@ -75,6 +75,7 @@ static uint32_t kCompilerDebugFlags = 0 | // Enable debug/testing modes // (1 << kDebugShowSummaryMemoryUsage) | // (1 << kDebugShowFilterStats) | // (1 << kDebugTimings) | + // (1 << kDebugCodegenDump) | 0; CompilationUnit::CompilationUnit(ArenaPool* pool) @@ -852,6 +853,10 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, } } + if (cu.verbose) { + cu.enable_debug |= (1 << kDebugCodegenDump); + } + /* * TODO: rework handling of optimization and debug flags. Should we split out * MIR and backend flags? Need command-line setting as well. @@ -877,6 +882,7 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, if (cu.instruction_set == kArm64) { // TODO(Arm64): enable optimizations once backend is mature enough. cu.disable_opt = ~(uint32_t)0; + cu.enable_debug |= (1 << kDebugCodegenDump); } cu.StartTimingSplit("BuildMIRGraph"); diff --git a/compiler/dex/frontend.h b/compiler/dex/frontend.h index c3b8d120d2..9e376ee7ee 100644 --- a/compiler/dex/frontend.h +++ b/compiler/dex/frontend.h @@ -76,7 +76,8 @@ enum debugControlVector { kDebugVerifyBitcode, kDebugShowSummaryMemoryUsage, kDebugShowFilterStats, - kDebugTimings + kDebugTimings, + kDebugCodegenDump }; class LLVMInfo { diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h index 876419c8b4..2d1c19e2d2 100644 --- a/compiler/dex/quick/arm/codegen_arm.h +++ b/compiler/dex/quick/arm/codegen_arm.h @@ -54,8 +54,6 @@ class ArmMir2Lir FINAL : public Mir2Lir { void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg); // Required for target - register utilities. 
- RegStorage AllocTypedTemp(bool fp_hint, int reg_class); - RegStorage AllocTypedTempWide(bool fp_hint, int reg_class); RegStorage TargetReg(SpecialTargetRegister reg); RegStorage GetArgMappingToPhysicalReg(int arg_num); RegLocation GetReturnAlt(); diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc index f7a7fe89f9..1520c52a7a 100644 --- a/compiler/dex/quick/arm/target_arm.cc +++ b/compiler/dex/quick/arm/target_arm.cc @@ -46,6 +46,7 @@ static const RegStorage sp_temps_arr[] = static const RegStorage dp_temps_arr[] = {rs_dr0, rs_dr1, rs_dr2, rs_dr3, rs_dr4, rs_dr5, rs_dr6, rs_dr7}; +static const std::vector<RegStorage> empty_pool; static const std::vector<RegStorage> core_regs(core_regs_arr, core_regs_arr + sizeof(core_regs_arr) / sizeof(core_regs_arr[0])); static const std::vector<RegStorage> sp_regs(sp_regs_arr, @@ -554,26 +555,11 @@ Mir2Lir* ArmCodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph, return new ArmMir2Lir(cu, mir_graph, arena); } -// Alloc a pair of core registers, or a double. -RegStorage ArmMir2Lir::AllocTypedTempWide(bool fp_hint, int reg_class) { - if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) { - return AllocTempDouble(); - } else { - RegStorage low_reg = AllocTemp(); - RegStorage high_reg = AllocTemp(); - return RegStorage::MakeRegPair(low_reg, high_reg); - } -} - -RegStorage ArmMir2Lir::AllocTypedTemp(bool fp_hint, int reg_class) { - if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) - return AllocTempSingle(); - return AllocTemp(); -} - void ArmMir2Lir::CompilerInitializeRegAlloc() { - reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs, sp_regs, dp_regs, reserved_regs, - core_temps, sp_temps, dp_temps); + reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs, empty_pool /* core64 */, sp_regs, + dp_regs, reserved_regs, empty_pool /* reserved64 */, + core_temps, empty_pool /* core64_temps */, sp_temps, + dp_temps); // Target-specific adjustments. diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h index 33a1c009e7..c3b23fdf60 100644 --- a/compiler/dex/quick/arm64/arm64_lir.h +++ b/compiler/dex/quick/arm64/arm64_lir.h @@ -129,27 +129,32 @@ enum ArmResourceEncodingPos { R(24) R(25) R(26) R(27) R(28) R(29) R(30) R(31) // Registers (integer) values. -// TODO(Arm64): for now we define rx##nr identically to rw##nr. We should rather define rx##nr as -// a k64BitSolo. We should do this once the register allocator is ready. enum A64NativeRegisterPool { # define A64_DEFINE_REGISTERS(nr) \ rw##nr = RegStorage::k32BitSolo | RegStorage::kCoreRegister | nr, \ - rx##nr = RegStorage::k32BitSolo | RegStorage::kCoreRegister | nr, \ + rx##nr = RegStorage::k64BitSolo | RegStorage::kCoreRegister | nr, \ rf##nr = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | nr, \ rd##nr = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | nr, A64_REGISTER_CODE_LIST(A64_DEFINE_REGISTERS) #undef A64_DEFINE_REGISTERS - // TODO(Arm64): can we change the lines below such that rwzr != rwsp && rxzr != rsp? - // This would be desirable to allow detecting usage-errors in the assembler. 
rwzr = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 0x3f, - rxzr = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 0x3f, + rxzr = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 0x3f, rwsp = rw31, rsp = rx31, rA64_SUSPEND = rx19, rA64_SELF = rx18, rA64_SP = rx31, - rA64_LR = rx30 + rA64_LR = rx30, + /* + * FIXME: It's a bit awkward to define both 32 and 64-bit views of these - we'll only ever use + * the 64-bit view. However, for now we'll define a 32-bit view to keep these from being + * allocated as 32-bit temp registers. + */ + rA32_SUSPEND = rw19, + rA32_SELF = rw18, + rA32_SP = rw31, + rA32_LR = rw30 }; #define A64_DEFINE_REGSTORAGES(nr) \ @@ -166,6 +171,11 @@ constexpr RegStorage rs_rA64_SUSPEND(RegStorage::kValid | rA64_SUSPEND); constexpr RegStorage rs_rA64_SELF(RegStorage::kValid | rA64_SELF); constexpr RegStorage rs_rA64_SP(RegStorage::kValid | rA64_SP); constexpr RegStorage rs_rA64_LR(RegStorage::kValid | rA64_LR); +// TODO: eliminate the need for these. +constexpr RegStorage rs_rA32_SUSPEND(RegStorage::kValid | rA32_SUSPEND); +constexpr RegStorage rs_rA32_SELF(RegStorage::kValid | rA32_SELF); +constexpr RegStorage rs_rA32_SP(RegStorage::kValid | rA32_SP); +constexpr RegStorage rs_rA32_LR(RegStorage::kValid | rA32_LR); // RegisterLocation templates return values (following the hard-float calling convention). const RegLocation arm_loc_c_return = diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc index 01fcc0d873..656f8fd6a4 100644 --- a/compiler/dex/quick/arm64/assemble_arm64.cc +++ b/compiler/dex/quick/arm64/assemble_arm64.cc @@ -666,7 +666,7 @@ uint8_t* Arm64Mir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) { expected = "core register"; } else if (want_size_match && (reg.Is64Bit() != want_64_bit)) { expected = (want_64_bit) ? "x-register" : "w-register"; - } else if (reg.GetRegNum() == 31 && is_zero == want_zero) { + } else if (reg.GetRegNum() == 31 && is_zero != want_zero) { expected = (want_zero) ? "zero-register" : "sp-register"; } } diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h index 01b7802d76..350e4830ae 100644 --- a/compiler/dex/quick/arm64/codegen_arm64.h +++ b/compiler/dex/quick/arm64/codegen_arm64.h @@ -54,8 +54,6 @@ class Arm64Mir2Lir : public Mir2Lir { void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg); // Required for target - register utilities. - RegStorage AllocTypedTemp(bool fp_hint, int reg_class); - RegStorage AllocTypedTempWide(bool fp_hint, int reg_class); RegStorage TargetReg(SpecialTargetRegister reg); RegStorage GetArgMappingToPhysicalReg(int arg_num); RegLocation GetReturnAlt(); diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc index 12ac4c8a0f..2b1c5e8386 100644 --- a/compiler/dex/quick/arm64/target_arm64.cc +++ b/compiler/dex/quick/arm64/target_arm64.cc @@ -27,6 +27,12 @@ namespace art { // TODO: rework this when c++11 support allows. 
static const RegStorage core_regs_arr[] = + {rs_w0, rs_w1, rs_w2, rs_w3, rs_w4, rs_w5, rs_w6, rs_w7, + rs_w8, rs_w9, rs_w10, rs_w11, rs_w12, rs_w13, rs_w14, rs_w15, + rs_w16, rs_w17, rs_w18, rs_w19, rs_w20, rs_w21, rs_w22, rs_w23, + rs_w24, rs_w25, rs_w26, rs_w27, rs_w28, rs_w29, rs_w30, rs_w31, + rs_wzr}; +static const RegStorage core64_regs_arr[] = {rs_x0, rs_x1, rs_x2, rs_x3, rs_x4, rs_x5, rs_x6, rs_x7, rs_x8, rs_x9, rs_x10, rs_x11, rs_x12, rs_x13, rs_x14, rs_x15, rs_x16, rs_x17, rs_x18, rs_x19, rs_x20, rs_x21, rs_x22, rs_x23, @@ -43,12 +49,18 @@ static const RegStorage dp_regs_arr[] = rs_d16, rs_d17, rs_d18, rs_d19, rs_d20, rs_d21, rs_d22, rs_d23, rs_d24, rs_d25, rs_d26, rs_d27, rs_d28, rs_d29, rs_d30, rs_d31}; static const RegStorage reserved_regs_arr[] = + {rs_rA32_SUSPEND, rs_rA32_SELF, rs_rA32_SP, rs_rA32_LR, rs_wzr}; +static const RegStorage reserved64_regs_arr[] = {rs_rA64_SUSPEND, rs_rA64_SELF, rs_rA64_SP, rs_rA64_LR, rs_xzr}; // TUNING: Are there too many temp registers and too less promote target? // This definition need to be matched with runtime.cc, quick entry assembly and JNI compiler // Note: we are not able to call to C function directly if it un-match C ABI. // Currently, rs_rA64_SELF is not a callee save register which does not match C ABI. static const RegStorage core_temps_arr[] = + {rs_w0, rs_w1, rs_w2, rs_w3, rs_w4, rs_w5, rs_w6, rs_w7, + rs_w8, rs_w9, rs_w10, rs_w11, rs_w12, rs_w13, rs_w14, rs_w15, rs_w16, + rs_w17}; +static const RegStorage core64_temps_arr[] = {rs_x0, rs_x1, rs_x2, rs_x3, rs_x4, rs_x5, rs_x6, rs_x7, rs_x8, rs_x9, rs_x10, rs_x11, rs_x12, rs_x13, rs_x14, rs_x15, rs_x16, rs_x17}; @@ -63,14 +75,20 @@ static const RegStorage dp_temps_arr[] = static const std::vector<RegStorage> core_regs(core_regs_arr, core_regs_arr + arraysize(core_regs_arr)); +static const std::vector<RegStorage> core64_regs(core64_regs_arr, + core64_regs_arr + arraysize(core64_regs_arr)); static const std::vector<RegStorage> sp_regs(sp_regs_arr, sp_regs_arr + arraysize(sp_regs_arr)); static const std::vector<RegStorage> dp_regs(dp_regs_arr, dp_regs_arr + arraysize(dp_regs_arr)); static const std::vector<RegStorage> reserved_regs(reserved_regs_arr, reserved_regs_arr + arraysize(reserved_regs_arr)); +static const std::vector<RegStorage> reserved64_regs(reserved64_regs_arr, + reserved64_regs_arr + arraysize(reserved64_regs_arr)); static const std::vector<RegStorage> core_temps(core_temps_arr, core_temps_arr + arraysize(core_temps_arr)); +static const std::vector<RegStorage> core64_temps(core64_temps_arr, + core64_temps_arr + arraysize(core64_temps_arr)); static const std::vector<RegStorage> sp_temps(sp_temps_arr, sp_temps_arr + arraysize(sp_temps_arr)); static const std::vector<RegStorage> dp_temps(dp_temps_arr, dp_temps_arr + arraysize(dp_temps_arr)); @@ -578,26 +596,10 @@ Mir2Lir* Arm64CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph return new Arm64Mir2Lir(cu, mir_graph, arena); } -// Alloc a pair of core registers, or a double. 
-RegStorage Arm64Mir2Lir::AllocTypedTempWide(bool fp_hint, int reg_class) { - if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) { - return AllocTempDouble(); - } else { - RegStorage low_reg = AllocTemp(); - RegStorage high_reg = AllocTemp(); - return RegStorage::MakeRegPair(low_reg, high_reg); - } -} - -RegStorage Arm64Mir2Lir::AllocTypedTemp(bool fp_hint, int reg_class) { - if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) - return AllocTempSingle(); - return AllocTemp(); -} - void Arm64Mir2Lir::CompilerInitializeRegAlloc() { - reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs, sp_regs, dp_regs, reserved_regs, - core_temps, sp_temps, dp_temps); + reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs, core64_regs, sp_regs, dp_regs, + reserved_regs, reserved64_regs, core_temps, core64_temps, + sp_temps, dp_temps); // Target-specific adjustments. // Alias single precision float registers to corresponding double registers. diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc index a3c94ed103..256135df71 100644 --- a/compiler/dex/quick/codegen_util.cc +++ b/compiler/dex/quick/codegen_util.cc @@ -1003,7 +1003,7 @@ void Mir2Lir::Materialize() { /* Convert LIR into machine code. */ AssembleLIR(); - if (cu_->verbose) { + if ((cu_->enable_debug & (1 << kDebugCodegenDump)) != 0) { CodegenDump(); } } diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h index 0c59465260..2b57b35443 100644 --- a/compiler/dex/quick/mips/codegen_mips.h +++ b/compiler/dex/quick/mips/codegen_mips.h @@ -54,8 +54,6 @@ class MipsMir2Lir FINAL : public Mir2Lir { void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg); // Required for target - register utilities. - RegStorage AllocTypedTemp(bool fp_hint, int reg_class); - RegStorage AllocTypedTempWide(bool fp_hint, int reg_class); RegStorage TargetReg(SpecialTargetRegister reg); RegStorage GetArgMappingToPhysicalReg(int arg_num); RegLocation GetReturnAlt(); diff --git a/compiler/dex/quick/mips/target_mips.cc b/compiler/dex/quick/mips/target_mips.cc index 7a3da717c2..55cf4344f1 100644 --- a/compiler/dex/quick/mips/target_mips.cc +++ b/compiler/dex/quick/mips/target_mips.cc @@ -46,6 +46,7 @@ static RegStorage sp_temps_arr[] = static RegStorage dp_temps_arr[] = {rs_rD0, rs_rD1, rs_rD2, rs_rD3, rs_rD4, rs_rD5, rs_rD6, rs_rD7}; +static const std::vector<RegStorage> empty_pool; static const std::vector<RegStorage> core_regs(core_regs_arr, core_regs_arr + sizeof(core_regs_arr) / sizeof(core_regs_arr[0])); static const std::vector<RegStorage> sp_regs(sp_regs_arr, @@ -442,27 +443,11 @@ bool MipsMir2Lir::GenMemBarrier(MemBarrierKind barrier_kind) { #endif } -// Alloc a pair of core registers, or a double. 
-RegStorage MipsMir2Lir::AllocTypedTempWide(bool fp_hint, int reg_class) { - if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) { - return AllocTempDouble(); - } - - RegStorage low_reg = AllocTemp(); - RegStorage high_reg = AllocTemp(); - return RegStorage::MakeRegPair(low_reg, high_reg); -} - -RegStorage MipsMir2Lir::AllocTypedTemp(bool fp_hint, int reg_class) { - if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) { - return AllocTempSingle(); - } - return AllocTemp(); -} - void MipsMir2Lir::CompilerInitializeRegAlloc() { - reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs, sp_regs, dp_regs, reserved_regs, - core_temps, sp_temps, dp_temps); + reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs, empty_pool /* core64 */, sp_regs, + dp_regs, reserved_regs, empty_pool /* reserved64 */, + core_temps, empty_pool /* core64_temps */, sp_temps, + dp_temps); // Target-specific adjustments. diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h index 454365ae14..3584c33291 100644 --- a/compiler/dex/quick/mir_to_lir.h +++ b/compiler/dex/quick/mir_to_lir.h @@ -411,10 +411,15 @@ class Mir2Lir : public Backend { class RegisterPool { public: - RegisterPool(Mir2Lir* m2l, ArenaAllocator* arena, const std::vector<RegStorage>& core_regs, - const std::vector<RegStorage>& sp_regs, const std::vector<RegStorage>& dp_regs, + RegisterPool(Mir2Lir* m2l, ArenaAllocator* arena, + const std::vector<RegStorage>& core_regs, + const std::vector<RegStorage>& core64_regs, + const std::vector<RegStorage>& sp_regs, + const std::vector<RegStorage>& dp_regs, const std::vector<RegStorage>& reserved_regs, + const std::vector<RegStorage>& reserved64_regs, const std::vector<RegStorage>& core_temps, + const std::vector<RegStorage>& core64_temps, const std::vector<RegStorage>& sp_temps, const std::vector<RegStorage>& dp_temps); ~RegisterPool() {} @@ -428,6 +433,8 @@ class Mir2Lir : public Backend { } GrowableArray<RegisterInfo*> core_regs_; int next_core_reg_; + GrowableArray<RegisterInfo*> core64_regs_; + int next_core64_reg_; GrowableArray<RegisterInfo*> sp_regs_; // Single precision float. int next_sp_reg_; GrowableArray<RegisterInfo*> dp_regs_; // Double precision float. @@ -674,8 +681,11 @@ class Mir2Lir : public Backend { RegStorage AllocTempBody(GrowableArray<RegisterInfo*> ®s, int* next_temp, bool required); virtual RegStorage AllocFreeTemp(); virtual RegStorage AllocTemp(); + virtual RegStorage AllocTempWide(); virtual RegStorage AllocTempSingle(); virtual RegStorage AllocTempDouble(); + virtual RegStorage AllocTypedTemp(bool fp_hint, int reg_class); + virtual RegStorage AllocTypedTempWide(bool fp_hint, int reg_class); void FlushReg(RegStorage reg); void FlushRegWide(RegStorage reg); RegStorage AllocLiveReg(int s_reg, int reg_class, bool wide); @@ -1089,8 +1099,6 @@ class Mir2Lir : public Backend { virtual void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) = 0; // Required for target - register utilities. 
- virtual RegStorage AllocTypedTemp(bool fp_hint, int reg_class) = 0; - virtual RegStorage AllocTypedTempWide(bool fp_hint, int reg_class) = 0; virtual RegStorage TargetReg(SpecialTargetRegister reg) = 0; virtual RegStorage GetArgMappingToPhysicalReg(int arg_num) = 0; virtual RegLocation GetReturnAlt() = 0; diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc index 06d05e2274..2c51c1f2fd 100644 --- a/compiler/dex/quick/ralloc_util.cc +++ b/compiler/dex/quick/ralloc_util.cc @@ -57,14 +57,19 @@ Mir2Lir::RegisterInfo::RegisterInfo(RegStorage r, uint64_t mask) Mir2Lir::RegisterPool::RegisterPool(Mir2Lir* m2l, ArenaAllocator* arena, const std::vector<RegStorage>& core_regs, + const std::vector<RegStorage>& core64_regs, const std::vector<RegStorage>& sp_regs, const std::vector<RegStorage>& dp_regs, const std::vector<RegStorage>& reserved_regs, + const std::vector<RegStorage>& reserved64_regs, const std::vector<RegStorage>& core_temps, + const std::vector<RegStorage>& core64_temps, const std::vector<RegStorage>& sp_temps, const std::vector<RegStorage>& dp_temps) : - core_regs_(arena, core_regs.size()), next_core_reg_(0), sp_regs_(arena, sp_regs.size()), - next_sp_reg_(0), dp_regs_(arena, dp_regs.size()), next_dp_reg_(0), m2l_(m2l) { + core_regs_(arena, core_regs.size()), next_core_reg_(0), + core64_regs_(arena, core64_regs.size()), next_core64_reg_(0), + sp_regs_(arena, sp_regs.size()), next_sp_reg_(0), + dp_regs_(arena, dp_regs.size()), next_dp_reg_(0), m2l_(m2l) { // Initialize the fast lookup map. m2l_->reginfo_map_.Reset(); if (kIsDebugBuild) { @@ -82,6 +87,11 @@ Mir2Lir::RegisterPool::RegisterPool(Mir2Lir* m2l, ArenaAllocator* arena, m2l_->reginfo_map_.Put(reg.GetReg(), info); core_regs_.Insert(info); } + for (RegStorage reg : core64_regs) { + RegisterInfo* info = new (arena) RegisterInfo(reg, m2l_->GetRegMaskCommon(reg)); + m2l_->reginfo_map_.Put(reg.GetReg(), info); + core64_regs_.Insert(info); + } for (RegStorage reg : sp_regs) { RegisterInfo* info = new (arena) RegisterInfo(reg, m2l_->GetRegMaskCommon(reg)); m2l_->reginfo_map_.Put(reg.GetReg(), info); @@ -97,11 +107,17 @@ Mir2Lir::RegisterPool::RegisterPool(Mir2Lir* m2l, ArenaAllocator* arena, for (RegStorage reg : reserved_regs) { m2l_->MarkInUse(reg); } + for (RegStorage reg : reserved64_regs) { + m2l_->MarkInUse(reg); + } // Mark temp regs - all others not in use can be used for promotion for (RegStorage reg : core_temps) { m2l_->MarkTemp(reg); } + for (RegStorage reg : core64_temps) { + m2l_->MarkTemp(reg); + } for (RegStorage reg : sp_temps) { m2l_->MarkTemp(reg); } @@ -374,6 +390,18 @@ RegStorage Mir2Lir::AllocTemp() { return AllocTempBody(reg_pool_->core_regs_, ®_pool_->next_core_reg_, true); } +RegStorage Mir2Lir::AllocTempWide() { + RegStorage res; + if (reg_pool_->core64_regs_.Size() != 0) { + res = AllocTempBody(reg_pool_->core64_regs_, ®_pool_->next_core64_reg_, true); + } else { + RegStorage low_reg = AllocTemp(); + RegStorage high_reg = AllocTemp(); + res = RegStorage::MakeRegPair(low_reg, high_reg); + } + return res; +} + RegStorage Mir2Lir::AllocTempSingle() { RegStorage res = AllocTempBody(reg_pool_->sp_regs_, ®_pool_->next_sp_reg_, true); DCHECK(res.IsSingle()) << "Reg: 0x" << std::hex << res.GetRawBits(); @@ -386,6 +414,20 @@ RegStorage Mir2Lir::AllocTempDouble() { return res; } +RegStorage Mir2Lir::AllocTypedTempWide(bool fp_hint, int reg_class) { + if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) { + return AllocTempDouble(); + } + return AllocTempWide(); +} + 
+RegStorage Mir2Lir::AllocTypedTemp(bool fp_hint, int reg_class) { + if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) { + return AllocTempSingle(); + } + return AllocTemp(); +} + RegStorage Mir2Lir::FindLiveReg(GrowableArray<RegisterInfo*> ®s, int s_reg) { RegStorage res; GrowableArray<RegisterInfo*>::Iterator it(®s); diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h index e717638fba..3070edd202 100644 --- a/compiler/dex/quick/x86/codegen_x86.h +++ b/compiler/dex/quick/x86/codegen_x86.h @@ -54,8 +54,6 @@ class X86Mir2Lir : public Mir2Lir { void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg); // Required for target - register utilities. - RegStorage AllocTypedTemp(bool fp_hint, int reg_class); - RegStorage AllocTypedTempWide(bool fp_hint, int reg_class); RegStorage TargetReg(SpecialTargetRegister reg); RegStorage GetArgMappingToPhysicalReg(int arg_num); RegLocation GetReturnAlt(); diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 9ddeb68e80..e7a629aa0b 100644 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -81,6 +81,7 @@ static const RegStorage dp_temps_arr_64[] = { #endif }; +static const std::vector<RegStorage> empty_pool; static const std::vector<RegStorage> core_regs_32(core_regs_arr_32, core_regs_arr_32 + sizeof(core_regs_arr_32) / sizeof(core_regs_arr_32[0])); static const std::vector<RegStorage> core_regs_64(core_regs_arr_64, @@ -528,30 +529,15 @@ bool X86Mir2Lir::GenMemBarrier(MemBarrierKind barrier_kind) { #endif } -// Alloc a pair of core registers, or a double. -RegStorage X86Mir2Lir::AllocTypedTempWide(bool fp_hint, int reg_class) { - if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) { - return AllocTempDouble(); - } - RegStorage low_reg = AllocTemp(); - RegStorage high_reg = AllocTemp(); - return RegStorage::MakeRegPair(low_reg, high_reg); -} - -RegStorage X86Mir2Lir::AllocTypedTemp(bool fp_hint, int reg_class) { - if (((reg_class == kAnyReg) && fp_hint) || (reg_class == kFPReg)) { - return AllocTempSingle(); - } - return AllocTemp(); -} - void X86Mir2Lir::CompilerInitializeRegAlloc() { if (Gen64Bit()) { - reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs_64, sp_regs_64, dp_regs_64, reserved_regs_64, - core_temps_64, sp_temps_64, dp_temps_64); + reg_pool_ = new (arena_) RegisterPool(this, arena_, empty_pool, core_regs_64, sp_regs_64, + dp_regs_64, empty_pool, reserved_regs_64, + empty_pool, core_temps_64, sp_temps_64, dp_temps_64); } else { - reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs_32, sp_regs_32, dp_regs_32, reserved_regs_32, - core_temps_32, sp_temps_32, dp_temps_32); + reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs_32, empty_pool, sp_regs_32, + dp_regs_32, reserved_regs_32, empty_pool, + core_temps_32, empty_pool, sp_temps_32, dp_temps_32); } // Target-specific adjustments. 
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index b8c33a07e0..0f41d2b2f6 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -1363,7 +1363,7 @@ class ParallelCompilationManager { self->AssertNoPendingException(); CHECK_GT(work_units, 0U); - index_ = begin; + index_.StoreRelaxed(begin); for (size_t i = 0; i < work_units; ++i) { thread_pool_->AddTask(self, new ForAllClosure(this, end, callback)); } @@ -1378,7 +1378,7 @@ class ParallelCompilationManager { } size_t NextIndex() { - return index_.FetchAndAdd(1); + return index_.FetchAndAddSequentiallyConsistent(1); } private: diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S index 346b08c741..ac922ddecd 100644 --- a/runtime/arch/arm64/quick_entrypoints_arm64.S +++ b/runtime/arch/arm64/quick_entrypoints_arm64.S @@ -508,26 +508,42 @@ NO_ARG_RUNTIME_EXCEPTION art_quick_throw_stack_overflow, artThrowStackOverflowFr ONE_ARG_RUNTIME_EXCEPTION art_quick_throw_no_such_method, artThrowNoSuchMethodFromCode /* - * TODO arm64 specifics need to be fleshed out. * All generated callsites for interface invokes and invocation slow paths will load arguments - * as usual - except instead of loading x0 with the target Method*, x0 will contain - * the method_idx. This wrapper will save x1-x3, load the caller's Method*, align the + * as usual - except instead of loading arg0/x0 with the target Method*, arg0/x0 will contain + * the method_idx. This wrapper will save arg1-arg3, load the caller's Method*, align the * stack and call the appropriate C helper. - * NOTE: "this" is first visible argument of the target, and so can be found in x1. + * NOTE: "this" is first visible argument of the target, and so can be found in arg1/x1. * - * The helper will attempt to locate the target and return a result in x0 consisting + * The helper will attempt to locate the target and return a 128-bit result in x0/x1 consisting * of the target Method* in x0 and method->code_ in x1. * - * If unsuccessful, the helper will return NULL/NULL. There will be a pending exception in the + * If unsuccessful, the helper will return NULL/????. There will be a pending exception in the * thread and we branch to another stub to deliver it. * * On success this wrapper will restore arguments and *jump* to the target, leaving the lr * pointing back to the original caller. + * + * Adapted from ARM32 code. + * + * Clobbers x12. */ .macro INVOKE_TRAMPOLINE c_name, cxx_name .extern \cxx_name ENTRY \c_name - brk 0 + SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME // save callee saves in case allocation triggers GC + // Helper signature is always + // (method_idx, *this_object, *caller_method, *self, sp) + + ldr x2, [sp, #FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE] // pass caller Method* + mov x3, xSELF // pass Thread::Current + mov x4, sp + bl \cxx_name // (method_idx, this, caller, Thread*, SP) + mov x12, x1 // save Method*->code_ + RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME + cbz x0, 1f // did we find the target? if not go to exception delivery + br x12 // tail call to target +1: + DELIVER_PENDING_EXCEPTION END \c_name .endm @@ -1381,8 +1397,17 @@ ENTRY art_quick_proxy_invoke_handler DELIVER_PENDING_EXCEPTION END art_quick_proxy_invoke_handler -UNIMPLEMENTED art_quick_imt_conflict_trampoline - + /* + * Called to resolve an imt conflict. x12 is a hidden argument that holds the target method's + * dex method index. 
+ */ +ENTRY art_quick_imt_conflict_trampoline + ldr x0, [sp, #0] // load caller Method* + ldr w0, [x0, #METHOD_DEX_CACHE_METHODS_OFFSET] // load dex_cache_resolved_methods + add x0, x0, #OBJECT_ARRAY_DATA_OFFSET // get starting address of data + ldr w0, [x0, x12, lsl 2] // load the target method + b art_quick_invoke_interface_trampoline +END art_quick_imt_conflict_trampoline ENTRY art_quick_resolution_trampoline SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc index b22ca82473..fac988310a 100644 --- a/runtime/arch/stub_test.cc +++ b/runtime/arch/stub_test.cc @@ -16,6 +16,8 @@ #include "common_runtime_test.h" #include "mirror/art_field-inl.h" +#include "mirror/art_method-inl.h" +#include "mirror/class-inl.h" #include "mirror/string-inl.h" #include <cstdio> @@ -50,6 +52,7 @@ class StubTest : public CommonRuntimeTest { pair.first = "-Xmx4M"; // Smallest we can go. } } + options->push_back(std::make_pair("-Xint", nullptr)); } // Helper function needed since TEST_F makes a new class. @@ -283,6 +286,234 @@ class StubTest : public CommonRuntimeTest { return result; } + // TODO: Set up a frame according to referrer's specs. + size_t Invoke3WithReferrerAndHidden(size_t arg0, size_t arg1, size_t arg2, uintptr_t code, + Thread* self, mirror::ArtMethod* referrer, size_t hidden) { + // Push a transition back into managed code onto the linked list in thread. + ManagedStack fragment; + self->PushManagedStackFragment(&fragment); + + size_t result; + size_t fpr_result = 0; +#if defined(__i386__) + // TODO: Set the thread? + __asm__ __volatile__( + "movd %[hidden], %%xmm0\n\t" + "pushl %[referrer]\n\t" // Store referrer + "call *%%edi\n\t" // Call the stub + "addl $4, %%esp" // Pop referrer + : "=a" (result) + // Use the result from eax + : "a"(arg0), "c"(arg1), "d"(arg2), "D"(code), [referrer]"m"(referrer), [hidden]"r"(hidden) + // This places code into edi, arg0 into eax, arg1 into ecx, and arg2 into edx + : ); // clobber. + // TODO: Should we clobber the other registers? EBX gets clobbered by some of the stubs, + // but compilation fails when declaring that. +#elif defined(__arm__) + __asm__ __volatile__( + "push {r1-r12, lr}\n\t" // Save state, 13*4B = 52B + ".cfi_adjust_cfa_offset 52\n\t" + "push {r9}\n\t" + ".cfi_adjust_cfa_offset 4\n\t" + "mov r9, %[referrer]\n\n" + "str r9, [sp, #-8]!\n\t" // Push referrer, +8B padding so 16B aligned + ".cfi_adjust_cfa_offset 8\n\t" + "ldr r9, [sp, #8]\n\t" + + // Push everything on the stack, so we don't rely on the order. What a mess. :-( + "sub sp, sp, #24\n\t" + "str %[arg0], [sp]\n\t" + "str %[arg1], [sp, #4]\n\t" + "str %[arg2], [sp, #8]\n\t" + "str %[code], [sp, #12]\n\t" + "str %[self], [sp, #16]\n\t" + "str %[hidden], [sp, #20]\n\t" + "ldr r0, [sp]\n\t" + "ldr r1, [sp, #4]\n\t" + "ldr r2, [sp, #8]\n\t" + "ldr r3, [sp, #12]\n\t" + "ldr r9, [sp, #16]\n\t" + "ldr r12, [sp, #20]\n\t" + "add sp, sp, #24\n\t" + + "blx r3\n\t" // Call the stub + "add sp, sp, #12\n\t" // Pop nullptr and padding + ".cfi_adjust_cfa_offset -12\n\t" + "pop {r1-r12, lr}\n\t" // Restore state + ".cfi_adjust_cfa_offset -52\n\t" + "mov %[result], r0\n\t" // Save the result + : [result] "=r" (result) + // Use the result from r0 + : [arg0] "r"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self), + [referrer] "r"(referrer), [hidden] "r"(hidden) + : ); // clobber. 
+#elif defined(__aarch64__) + __asm__ __volatile__( + // Spill space for d8 - d15 + "sub sp, sp, #64\n\t" + ".cfi_adjust_cfa_offset 64\n\t" + "stp d8, d9, [sp]\n\t" + "stp d10, d11, [sp, #16]\n\t" + "stp d12, d13, [sp, #32]\n\t" + "stp d14, d15, [sp, #48]\n\t" + + "sub sp, sp, #48\n\t" // Reserve stack space, 16B aligned + ".cfi_adjust_cfa_offset 48\n\t" + "stp %[referrer], x1, [sp]\n\t"// referrer, x1 + "stp x2, x3, [sp, #16]\n\t" // Save x2, x3 + "stp x18, x30, [sp, #32]\n\t" // Save x18(xSELF), xLR + + // Push everything on the stack, so we don't rely on the order. What a mess. :-( + "sub sp, sp, #48\n\t" + ".cfi_adjust_cfa_offset 48\n\t" + "str %[arg0], [sp]\n\t" + "str %[arg1], [sp, #8]\n\t" + "str %[arg2], [sp, #16]\n\t" + "str %[code], [sp, #24]\n\t" + "str %[self], [sp, #32]\n\t" + "str %[hidden], [sp, #40]\n\t" + + // Now we definitely have x0-x3 free, use it to garble d8 - d15 + "movk x0, #0xfad0\n\t" + "movk x0, #0xebad, lsl #16\n\t" + "movk x0, #0xfad0, lsl #32\n\t" + "movk x0, #0xebad, lsl #48\n\t" + "fmov d8, x0\n\t" + "add x0, x0, 1\n\t" + "fmov d9, x0\n\t" + "add x0, x0, 1\n\t" + "fmov d10, x0\n\t" + "add x0, x0, 1\n\t" + "fmov d11, x0\n\t" + "add x0, x0, 1\n\t" + "fmov d12, x0\n\t" + "add x0, x0, 1\n\t" + "fmov d13, x0\n\t" + "add x0, x0, 1\n\t" + "fmov d14, x0\n\t" + "add x0, x0, 1\n\t" + "fmov d15, x0\n\t" + + // Load call params + "ldr x0, [sp]\n\t" + "ldr x1, [sp, #8]\n\t" + "ldr x2, [sp, #16]\n\t" + "ldr x3, [sp, #24]\n\t" + "ldr x18, [sp, #32]\n\t" + "ldr x12, [sp, #40]\n\t" + "add sp, sp, #48\n\t" + ".cfi_adjust_cfa_offset -48\n\t" + + + "blr x3\n\t" // Call the stub + + // Test d8 - d15. We can use x1 and x2. + "movk x1, #0xfad0\n\t" + "movk x1, #0xebad, lsl #16\n\t" + "movk x1, #0xfad0, lsl #32\n\t" + "movk x1, #0xebad, lsl #48\n\t" + "fmov x2, d8\n\t" + "cmp x1, x2\n\t" + "b.ne 1f\n\t" + "add x1, x1, 1\n\t" + + "fmov x2, d9\n\t" + "cmp x1, x2\n\t" + "b.ne 1f\n\t" + "add x1, x1, 1\n\t" + + "fmov x2, d10\n\t" + "cmp x1, x2\n\t" + "b.ne 1f\n\t" + "add x1, x1, 1\n\t" + + "fmov x2, d11\n\t" + "cmp x1, x2\n\t" + "b.ne 1f\n\t" + "add x1, x1, 1\n\t" + + "fmov x2, d12\n\t" + "cmp x1, x2\n\t" + "b.ne 1f\n\t" + "add x1, x1, 1\n\t" + + "fmov x2, d13\n\t" + "cmp x1, x2\n\t" + "b.ne 1f\n\t" + "add x1, x1, 1\n\t" + + "fmov x2, d14\n\t" + "cmp x1, x2\n\t" + "b.ne 1f\n\t" + "add x1, x1, 1\n\t" + + "fmov x2, d15\n\t" + "cmp x1, x2\n\t" + "b.ne 1f\n\t" + + "mov %[fpr_result], #0\n\t" + + // Finish up. + "2:\n\t" + "ldp x1, x2, [sp, #8]\n\t" // Restore x1, x2 + "ldp x3, x18, [sp, #24]\n\t" // Restore x3, xSELF + "ldr x30, [sp, #40]\n\t" // Restore xLR + "add sp, sp, #48\n\t" // Free stack space + ".cfi_adjust_cfa_offset -48\n\t" + "mov %[result], x0\n\t" // Save the result + + "ldp d8, d9, [sp]\n\t" // Restore d8 - d15 + "ldp d10, d11, [sp, #16]\n\t" + "ldp d12, d13, [sp, #32]\n\t" + "ldp d14, d15, [sp, #48]\n\t" + "add sp, sp, #64\n\t" + ".cfi_adjust_cfa_offset -64\n\t" + + "b 3f\n\t" // Goto end + + // Failed fpr verification. + "1:\n\t" + "mov %[fpr_result], #1\n\t" + "b 2b\n\t" // Goto finish-up + + // End + "3:\n\t" + : [result] "=r" (result), [fpr_result] "=r" (fpr_result) + // Use the result from r0 + : [arg0] "0"(arg0), [arg1] "r"(arg1), [arg2] "r"(arg2), [code] "r"(code), [self] "r"(self), + [referrer] "r"(referrer), [hidden] "r"(hidden) + : "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17"); // clobber. +#elif defined(__x86_64__) + // Note: Uses the native convention + // TODO: Set the thread? 
+ __asm__ __volatile__( + "movq %[hidden], %%r9\n\t" // No need to save r9, listed as clobbered + "movd %%r9, %%xmm0\n\t" + "pushq %[referrer]\n\t" // Push referrer + "pushq (%%rsp)\n\t" // & 16B alignment padding + ".cfi_adjust_cfa_offset 16\n\t" + "call *%%rax\n\t" // Call the stub + "addq $16, %%rsp\n\t" // Pop nullptr and padding + ".cfi_adjust_cfa_offset -16\n\t" + : "=a" (result) + // Use the result from rax + : "D"(arg0), "S"(arg1), "d"(arg2), "a"(code), [referrer] "m"(referrer), [hidden] "m"(hidden) + // This places arg0 into rdi, arg1 into rsi, arg2 into rdx, and code into rax + : "rbx", "rcx", "rbp", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"); // clobber all + // TODO: Should we clobber the other registers? +#else + LOG(WARNING) << "Was asked to invoke for an architecture I do not understand."; + result = 0; +#endif + // Pop transition. + self->PopManagedStackFragment(fragment); + + fp_result = fpr_result; + EXPECT_EQ(0U, fp_result); + + return result; + } + // Method with 32b arg0, 64b arg1 size_t Invoke3UWithReferrer(size_t arg0, uint64_t arg1, uintptr_t code, Thread* self, mirror::ArtMethod* referrer) { @@ -1448,4 +1679,116 @@ TEST_F(StubTest, Fields64) { TestFields(self, this, Primitive::Type::kPrimLong); } +#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__) +extern "C" void art_quick_imt_conflict_trampoline(void); +#endif + +TEST_F(StubTest, IMT) { +#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || defined(__x86_64__) + TEST_DISABLED_FOR_HEAP_REFERENCE_POISONING(); + + Thread* self = Thread::Current(); + + ScopedObjectAccess soa(self); + StackHandleScope<7> hs(self); + + JNIEnv* env = Thread::Current()->GetJniEnv(); + + // ArrayList + + // Load ArrayList and used methods (JNI). + jclass arraylist_jclass = env->FindClass("java/util/ArrayList"); + ASSERT_NE(nullptr, arraylist_jclass); + jmethodID arraylist_constructor = env->GetMethodID(arraylist_jclass, "<init>", "()V"); + ASSERT_NE(nullptr, arraylist_constructor); + jmethodID contains_jmethod = env->GetMethodID(arraylist_jclass, "contains", "(Ljava/lang/Object;)Z"); + ASSERT_NE(nullptr, contains_jmethod); + jmethodID add_jmethod = env->GetMethodID(arraylist_jclass, "add", "(Ljava/lang/Object;)Z"); + ASSERT_NE(nullptr, add_jmethod); + + // Get mirror representation. + Handle<mirror::ArtMethod> contains_amethod(hs.NewHandle(soa.DecodeMethod(contains_jmethod))); + + // Patch up ArrayList.contains. + if (contains_amethod.Get()->GetEntryPointFromQuickCompiledCode() == nullptr) { + contains_amethod.Get()->SetEntryPointFromQuickCompiledCode(reinterpret_cast<void*>( + GetTlsPtr(self)->quick_entrypoints.pQuickToInterpreterBridge)); + } + + // List + + // Load List and used methods (JNI). + jclass list_jclass = env->FindClass("java/util/List"); + ASSERT_NE(nullptr, list_jclass); + jmethodID inf_contains_jmethod = env->GetMethodID(list_jclass, "contains", "(Ljava/lang/Object;)Z"); + ASSERT_NE(nullptr, inf_contains_jmethod); + + // Get mirror representation. + Handle<mirror::ArtMethod> inf_contains(hs.NewHandle(soa.DecodeMethod(inf_contains_jmethod))); + + // Object + + jclass obj_jclass = env->FindClass("java/lang/Object"); + ASSERT_NE(nullptr, obj_jclass); + jmethodID obj_constructor = env->GetMethodID(obj_jclass, "<init>", "()V"); + ASSERT_NE(nullptr, obj_constructor); + + // Sanity check: check that there is a conflict for List.contains in ArrayList. 
+ + mirror::Class* arraylist_class = soa.Decode<mirror::Class*>(arraylist_jclass); + mirror::ArtMethod* m = arraylist_class->GetImTable()->Get( + inf_contains->GetDexMethodIndex() % ClassLinker::kImtSize); + + if (!m->IsImtConflictMethod()) { + LOG(WARNING) << "Test is meaningless, no IMT conflict in setup: " << + PrettyMethod(m, true); + LOG(WARNING) << "Please update StubTest.IMT."; + return; + } + + // Create instances. + + jobject jarray_list = env->NewObject(arraylist_jclass, arraylist_constructor); + ASSERT_NE(nullptr, jarray_list); + Handle<mirror::Object> array_list(hs.NewHandle(soa.Decode<mirror::Object*>(jarray_list))); + + jobject jobj = env->NewObject(obj_jclass, obj_constructor); + ASSERT_NE(nullptr, jobj); + Handle<mirror::Object> obj(hs.NewHandle(soa.Decode<mirror::Object*>(jobj))); + + // Invoke. + + size_t result = + Invoke3WithReferrerAndHidden(0U, reinterpret_cast<size_t>(array_list.Get()), + reinterpret_cast<size_t>(obj.Get()), + reinterpret_cast<uintptr_t>(&art_quick_imt_conflict_trampoline), + self, contains_amethod.Get(), + static_cast<size_t>(inf_contains.Get()->GetDexMethodIndex())); + + ASSERT_FALSE(self->IsExceptionPending()); + EXPECT_EQ(static_cast<size_t>(JNI_FALSE), result); + + // Add object. + + env->CallBooleanMethod(jarray_list, add_jmethod, jobj); + + ASSERT_FALSE(self->IsExceptionPending()) << PrettyTypeOf(self->GetException(nullptr)); + + // Invoke again. + + result = Invoke3WithReferrerAndHidden(0U, reinterpret_cast<size_t>(array_list.Get()), + reinterpret_cast<size_t>(obj.Get()), + reinterpret_cast<uintptr_t>(&art_quick_imt_conflict_trampoline), + self, contains_amethod.Get(), + static_cast<size_t>(inf_contains.Get()->GetDexMethodIndex())); + + ASSERT_FALSE(self->IsExceptionPending()); + EXPECT_EQ(static_cast<size_t>(JNI_TRUE), result); +#else + LOG(INFO) << "Skipping memcpy as I don't know how to do that on " << kRuntimeISA; + // Force-print to std::cout so it's also outside the logcat. + std::cout << "Skipping memcpy as I don't know how to do that on " << kRuntimeISA << std::endl; +#endif +} + } // namespace art diff --git a/runtime/arch/x86/asm_support_x86.S b/runtime/arch/x86/asm_support_x86.S index 909bd3ece7..f1d07464e3 100644 --- a/runtime/arch/x86/asm_support_x86.S +++ b/runtime/arch/x86/asm_support_x86.S @@ -19,7 +19,7 @@ #include "asm_support_x86.h" -#if defined(__clang__) && (__clang_major__ < 4) && (__clang_minor__ < 5) +#if defined(__APPLE__) || (defined(__clang__) && (__clang_major__ < 4) && (__clang_minor__ < 5)) // Clang's as(1) doesn't let you name macro parameters prior to 3.5. #define MACRO0(macro_name) .macro macro_name #define MACRO1(macro_name, macro_arg1) .macro macro_name @@ -32,8 +32,6 @@ #define PLT_VAR(name, index) SYMBOL($index) #define REG_VAR(name,index) %$index #define CALL_MACRO(name,index) $index - #define FUNCTION_TYPE(name,index) .type $index, @function - #define SIZE(name,index) .size $index, .-$index // The use of $x for arguments mean that literals need to be represented with $$x in macros. 
#define LITERAL(value) $value @@ -56,13 +54,22 @@ #define PLT_VAR(name, index) name&@PLT #define REG_VAR(name,index) %name #define CALL_MACRO(name,index) name& - #define FUNCTION_TYPE(name,index) .type name&, @function - #define SIZE(name,index) .size name, .-name #define LITERAL(value) $value #define MACRO_LITERAL(value) $value #endif +#if defined(__APPLE__) + #define FUNCTION_TYPE(name,index) + #define SIZE(name,index) +#elif defined(__clang__) && (__clang_major__ < 4) && (__clang_minor__ < 5) + #define FUNCTION_TYPE(name,index) .type $index, @function + #define SIZE(name,index) .size $index, .-$index +#else + #define FUNCTION_TYPE(name,index) .type name&, @function + #define SIZE(name,index) .size name, .-name +#endif + // CFI support. #if !defined(__APPLE__) #define CFI_STARTPROC .cfi_startproc diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S index 9c86c75ddd..ed7f246466 100644 --- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S @@ -239,24 +239,45 @@ TWO_ARG_RUNTIME_EXCEPTION art_quick_throw_array_bounds, artThrowArrayBoundsFromC /* * All generated callsites for interface invokes and invocation slow paths will load arguments - * as usual - except instead of loading arg0/r0 with the target Method*, arg0/r0 will contain + * as usual - except instead of loading arg0/rdi with the target Method*, arg0/rdi will contain * the method_idx. This wrapper will save arg1-arg3, load the caller's Method*, align the * stack and call the appropriate C helper. - * NOTE: "this" is first visible argument of the target, and so can be found in arg1/r1. + * NOTE: "this" is first visible argument of the target, and so can be found in arg1/rsi. * - * The helper will attempt to locate the target and return a 64-bit result in r0/r1 consisting - * of the target Method* in r0 and method->code_ in r1. + * The helper will attempt to locate the target and return a 128-bit result in rax/rdx consisting + * of the target Method* in rax and method->code_ in rdx. * - * If unsuccessful, the helper will return NULL/NULL. There will bea pending exception in the + * If unsuccessful, the helper will return NULL/????. There will be a pending exception in the * thread and we branch to another stub to deliver it. * - * On success this wrapper will restore arguments and *jump* to the target, leaving the lr - * pointing back to the original caller. + * On success this wrapper will restore arguments and *jump* to the target, leaving the return + * location on the stack. + * + * Adapted from x86 code. */ MACRO2(INVOKE_TRAMPOLINE, c_name, cxx_name) DEFINE_FUNCTION VAR(c_name, 0) - int3 - int3 + SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME // save callee saves in case allocation triggers GC + // Helper signature is always + // (method_idx, *this_object, *caller_method, *self, sp) + + movq FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE(%rsp), %rdx // pass caller Method* + movq %gs:THREAD_SELF_OFFSET, %rcx // pass Thread + movq %rsp, %r8 // pass SP + + call PLT_VAR(cxx_name, 1) // cxx_name(arg1, arg2, caller method*, Thread*, SP) + // save the code pointer + movq %rax, %rdi + movq %rdx, %rax + RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME + + testq %rdi, %rdi + jz 1f + + // Tail call to intended method. + jmp *%rax +1: + DELIVER_PENDING_EXCEPTION END_FUNCTION VAR(c_name, 0) END_MACRO @@ -977,9 +998,18 @@ DEFINE_FUNCTION art_quick_proxy_invoke_handler END_FUNCTION art_quick_proxy_invoke_handler /* - * Called to resolve an imt conflict. 
+ * Called to resolve an imt conflict. Clobbers %rax (which will be clobbered later anyways). + * + * xmm0 is a hidden argument that holds the target method's dex method index. + * TODO: With proper hard-float support, this needs to be kept in sync with the quick compiler. */ -UNIMPLEMENTED art_quick_imt_conflict_trampoline +DEFINE_FUNCTION art_quick_imt_conflict_trampoline + movq 16(%rsp), %rdi // load caller Method* + movl METHOD_DEX_CACHE_METHODS_OFFSET(%rdi), %edi // load dex_cache_resolved_methods + movd %xmm0, %rax // get target method index stored in xmm0 + movl OBJECT_ARRAY_DATA_OFFSET(%rdi, %rax, 4), %edi // load the target method + jmp art_quick_invoke_interface_trampoline_local +END_FUNCTION art_quick_imt_conflict_trampoline DEFINE_FUNCTION art_quick_resolution_trampoline SETUP_REF_AND_ARGS_CALLEE_SAVE_FRAME diff --git a/runtime/atomic.h b/runtime/atomic.h index 1f975dc0ce..9262db6724 100644 --- a/runtime/atomic.h +++ b/runtime/atomic.h @@ -17,7 +17,15 @@ #ifndef ART_RUNTIME_ATOMIC_H_ #define ART_RUNTIME_ATOMIC_H_ +#ifdef __clang__ +#define ART_HAVE_STDATOMIC 1 +#endif + #include <stdint.h> +#if ART_HAVE_STDATOMIC +#include <atomic> +#endif +#include <limits> #include <vector> #include "base/logging.h" @@ -27,6 +35,76 @@ namespace art { class Mutex; +#if ART_HAVE_STDATOMIC +template<typename T> +class Atomic : public std::atomic<T> { + public: + COMPILE_ASSERT(sizeof(T) == sizeof(std::atomic<T>), + std_atomic_size_differs_from_that_of_underlying_type); + COMPILE_ASSERT(alignof(T) == alignof(std::atomic<T>), + std_atomic_alignment_differs_from_that_of_underlying_type); + + Atomic<T>() : std::atomic<T>() { } + + explicit Atomic<T>(T value) : std::atomic<T>(value) { } + + // Load from memory without ordering or synchronization constraints. + T LoadRelaxed() const { + return this->load(std::memory_order_relaxed); + } + + // Load from memory with a total ordering. + T LoadSequentiallyConsistent() const { + return this->load(std::memory_order_seq_cst); + } + + // Store to memory without ordering or synchronization constraints. + void StoreRelaxed(T desired) { + this->store(desired, std::memory_order_relaxed); + } + + // Store to memory with a total ordering. + void StoreSequentiallyConsistent(T desired) { + this->store(desired, std::memory_order_seq_cst); + } + + // Atomically replace the value with desired value if it matches the expected value. Doesn't + // imply ordering or synchronization constraints. + bool CompareExchangeWeakRelaxed(T expected_value, T desired_value) { + return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_relaxed); + } + + // Atomically replace the value with desired value if it matches the expected value. Prior writes + // made to other memory locations by the thread that did the release become visible in this + // thread. + bool CompareExchangeWeakAcquire(T expected_value, T desired_value) { + return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_acquire); + } + + // Atomically replace the value with desired value if it matches the expected value. prior writes + // to other memory locations become visible to the threads that do a consume or an acquire on the + // same location. + bool CompareExchangeWeakRelease(T expected_value, T desired_value) { + return this->compare_exchange_weak(expected_value, desired_value, std::memory_order_release); + } + + T FetchAndAddSequentiallyConsistent(const T value) { + return this->fetch_add(value, std::memory_order_seq_cst); // Return old_value. 
+ } + + T FetchAndSubSequentiallyConsistent(const T value) { + return this->fetch_sub(value, std::memory_order_seq_cst); // Return old value. + } + + volatile T* Address() { + return reinterpret_cast<T*>(this); + } + + static T MaxValue() { + return std::numeric_limits<T>::max(); + } +}; +#else template<typename T> class Atomic { public: @@ -34,24 +112,54 @@ class Atomic { explicit Atomic<T>(T value) : value_(value) { } - Atomic<T>& operator=(T desired) { - Store(desired); - return *this; + // Load from memory without ordering or synchronization constraints. + T LoadRelaxed() const { + return value_; } - T Load() const { - return value_; + // Load from memory with a total ordering. + T LoadSequentiallyConsistent() const; + + // Store to memory without ordering or synchronization constraints. + void StoreRelaxed(T desired) { + value_ = desired; + } + + // Store to memory with a total ordering. + void StoreSequentiallyConsistent(T desired); + + // Atomically replace the value with desired value if it matches the expected value. Doesn't + // imply ordering or synchronization constraints. + bool CompareExchangeWeakRelaxed(T expected_value, T desired_value) { + // TODO: make this relaxed. + return __sync_bool_compare_and_swap(&value_, expected_value, desired_value); } - operator T() const { - return Load(); + // Atomically replace the value with desired value if it matches the expected value. Prior writes + // made to other memory locations by the thread that did the release become visible in this + // thread. + bool CompareExchangeWeakAcquire(T expected_value, T desired_value) { + // TODO: make this acquire. + return __sync_bool_compare_and_swap(&value_, expected_value, desired_value); + } + + // Atomically replace the value with desired value if it matches the expected value. prior writes + // to other memory locations become visible to the threads that do a consume or an acquire on the + // same location. + bool CompareExchangeWeakRelease(T expected_value, T desired_value) { + // TODO: make this release. + return __sync_bool_compare_and_swap(&value_, expected_value, desired_value); + } + + volatile T* Address() { + return &value_; } - T FetchAndAdd(const T value) { + T FetchAndAddSequentiallyConsistent(const T value) { return __sync_fetch_and_add(&value_, value); // Return old_value. } - T FetchAndSub(const T value) { + T FetchAndSubSequentiallyConsistent(const T value) { return __sync_fetch_and_sub(&value_, value); // Return old value. } @@ -71,22 +179,14 @@ class Atomic { return __sync_fetch_and_sub(&value_, 1); // Return old value. } - bool CompareAndSwap(T expected_value, T desired_value) { - return __sync_bool_compare_and_swap(&value_, expected_value, desired_value); - } - - volatile T* Address() { - return &value_; + static T MaxValue() { + return std::numeric_limits<T>::max(); } private: - // Unsafe = operator for non atomic operations on the integer. 
- void Store(T desired) { - value_ = desired; - } - - volatile T value_; + T value_; }; +#endif typedef Atomic<int32_t> AtomicInteger; @@ -260,6 +360,23 @@ class QuasiAtomic { DISALLOW_COPY_AND_ASSIGN(QuasiAtomic); }; +#if !ART_HAVE_STDATOMIC +template<typename T> +inline T Atomic<T>::LoadSequentiallyConsistent() const { + T result = value_; + QuasiAtomic::MembarLoadLoad(); + return result; +} + +template<typename T> +inline void Atomic<T>::StoreSequentiallyConsistent(T desired) { + QuasiAtomic::MembarStoreStore(); + value_ = desired; + QuasiAtomic::MembarStoreLoad(); +} + +#endif + } // namespace art #endif // ART_RUNTIME_ATOMIC_H_ diff --git a/runtime/barrier_test.cc b/runtime/barrier_test.cc index 331d0c0175..086ef440a3 100644 --- a/runtime/barrier_test.cc +++ b/runtime/barrier_test.cc @@ -77,20 +77,20 @@ TEST_F(BarrierTest, CheckWait) { barrier.Increment(self, num_threads); // At this point each thread should have passed through the barrier. The first count should be // equal to num_threads. - EXPECT_EQ(num_threads, count1); + EXPECT_EQ(num_threads, count1.LoadRelaxed()); // Count 3 should still be zero since no thread should have gone past the second barrier. - EXPECT_EQ(0, count3); + EXPECT_EQ(0, count3.LoadRelaxed()); // Now lets tell the threads to pass again. barrier.Increment(self, num_threads); // Count 2 should be equal to num_threads since each thread must have passed the second barrier // at this point. - EXPECT_EQ(num_threads, count2); + EXPECT_EQ(num_threads, count2.LoadRelaxed()); // Wait for all the threads to finish. thread_pool.Wait(self, true, false); // All three counts should be equal to num_threads now. - EXPECT_EQ(count1, count2); - EXPECT_EQ(count2, count3); - EXPECT_EQ(num_threads, count3); + EXPECT_EQ(count1.LoadRelaxed(), count2.LoadRelaxed()); + EXPECT_EQ(count2.LoadRelaxed(), count3.LoadRelaxed()); + EXPECT_EQ(num_threads, count3.LoadRelaxed()); } class CheckPassTask : public Task { @@ -133,7 +133,7 @@ TEST_F(BarrierTest, CheckPass) { // Wait for all the tasks to complete using the barrier. barrier.Increment(self, expected_total_tasks); // The total number of completed tasks should be equal to expected_total_tasks. - EXPECT_EQ(count, expected_total_tasks); + EXPECT_EQ(count.LoadRelaxed(), expected_total_tasks); } } // namespace art diff --git a/runtime/base/macros.h b/runtime/base/macros.h index 81755144b9..47571f85bf 100644 --- a/runtime/base/macros.h +++ b/runtime/base/macros.h @@ -169,7 +169,7 @@ char (&ArraySizeHelper(T (&array)[N]))[N]; // bionic and glibc both have TEMP_FAILURE_RETRY, but Mac OS' libc doesn't. #ifndef TEMP_FAILURE_RETRY #define TEMP_FAILURE_RETRY(exp) ({ \ - typeof(exp) _rc; \ + decltype(exp) _rc; \ do { \ _rc = (exp); \ } while (_rc == -1 && errno == EINTR); \ diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h index a7e25cb907..adf4c66aa4 100644 --- a/runtime/base/mutex-inl.h +++ b/runtime/base/mutex-inl.h @@ -221,7 +221,7 @@ inline void ReaderWriterMutex::SharedUnlock(Thread* self) { // Reduce state by 1. done = android_atomic_release_cas(cur_state, cur_state - 1, &state_) == 0; if (done && (cur_state - 1) == 0) { // cas may fail due to noise? - if (num_pending_writers_ > 0 || num_pending_readers_ > 0) { + if (num_pending_writers_.LoadRelaxed() > 0 || num_pending_readers_ > 0) { // Wake any exclusive waiters as there are now no readers. 
futex(&state_, FUTEX_WAKE, -1, NULL, NULL, 0); } diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc index 2bc17bf403..6f7f2c1e99 100644 --- a/runtime/base/mutex.cc +++ b/runtime/base/mutex.cc @@ -71,12 +71,12 @@ static bool ComputeRelativeTimeSpec(timespec* result_ts, const timespec& lhs, co class ScopedAllMutexesLock { public: explicit ScopedAllMutexesLock(const BaseMutex* mutex) : mutex_(mutex) { - while (!gAllMutexData->all_mutexes_guard.CompareAndSwap(0, mutex)) { + while (!gAllMutexData->all_mutexes_guard.CompareExchangeWeakAcquire(0, mutex)) { NanoSleep(100); } } ~ScopedAllMutexesLock() { - while (!gAllMutexData->all_mutexes_guard.CompareAndSwap(mutex_, 0)) { + while (!gAllMutexData->all_mutexes_guard.CompareExchangeWeakRelease(mutex_, 0)) { NanoSleep(100); } } @@ -174,34 +174,34 @@ void BaseMutex::RecordContention(uint64_t blocked_tid, uint64_t owner_tid, uint64_t nano_time_blocked) { if (kLogLockContentions) { - ContentionLogData* data = contetion_log_data_; + ContentionLogData* data = contention_log_data_; ++(data->contention_count); data->AddToWaitTime(nano_time_blocked); ContentionLogEntry* log = data->contention_log; // This code is intentionally racy as it is only used for diagnostics. - uint32_t slot = data->cur_content_log_entry; + uint32_t slot = data->cur_content_log_entry.LoadRelaxed(); if (log[slot].blocked_tid == blocked_tid && log[slot].owner_tid == blocked_tid) { ++log[slot].count; } else { uint32_t new_slot; do { - slot = data->cur_content_log_entry; + slot = data->cur_content_log_entry.LoadRelaxed(); new_slot = (slot + 1) % kContentionLogSize; - } while (!data->cur_content_log_entry.CompareAndSwap(slot, new_slot)); + } while (!data->cur_content_log_entry.CompareExchangeWeakRelaxed(slot, new_slot)); log[new_slot].blocked_tid = blocked_tid; log[new_slot].owner_tid = owner_tid; - log[new_slot].count = 1; + log[new_slot].count.StoreRelaxed(1); } } } void BaseMutex::DumpContention(std::ostream& os) const { if (kLogLockContentions) { - const ContentionLogData* data = contetion_log_data_; + const ContentionLogData* data = contention_log_data_; const ContentionLogEntry* log = data->contention_log; uint64_t wait_time = data->wait_time; - uint32_t contention_count = data->contention_count; + uint32_t contention_count = data->contention_count.LoadRelaxed(); if (contention_count == 0) { os << "never contended"; } else { @@ -213,7 +213,7 @@ void BaseMutex::DumpContention(std::ostream& os) const { for (size_t i = 0; i < kContentionLogSize; ++i) { uint64_t blocked_tid = log[i].blocked_tid; uint64_t owner_tid = log[i].owner_tid; - uint32_t count = log[i].count; + uint32_t count = log[i].count.LoadRelaxed(); if (count > 0) { auto it = most_common_blocked.find(blocked_tid); if (it != most_common_blocked.end()) { @@ -261,7 +261,7 @@ Mutex::Mutex(const char* name, LockLevel level, bool recursive) #if ART_USE_FUTEXES state_ = 0; exclusive_owner_ = 0; - num_contenders_ = 0; + DCHECK_EQ(0, num_contenders_.LoadRelaxed()); #elif defined(__BIONIC__) || defined(__APPLE__) // Use recursive mutexes for bionic and Apple otherwise the // non-recursive mutexes don't have TIDs to check lock ownership of. @@ -283,7 +283,8 @@ Mutex::~Mutex() { LOG(shutting_down ? 
WARNING : FATAL) << "destroying mutex with owner: " << exclusive_owner_; } else { CHECK_EQ(exclusive_owner_, 0U) << "unexpectedly found an owner on unlocked mutex " << name_; - CHECK_EQ(num_contenders_, 0) << "unexpectedly found a contender on mutex " << name_; + CHECK_EQ(num_contenders_.LoadRelaxed(), 0) + << "unexpectedly found a contender on mutex " << name_; } #else // We can't use CHECK_MUTEX_CALL here because on shutdown a suspended daemon thread @@ -406,7 +407,7 @@ void Mutex::ExclusiveUnlock(Thread* self) { done = __sync_bool_compare_and_swap(&state_, cur_state, 0 /* new state */); if (LIKELY(done)) { // Spurious fail? // Wake a contender - if (UNLIKELY(num_contenders_ > 0)) { + if (UNLIKELY(num_contenders_.LoadRelaxed() > 0)) { futex(&state_, FUTEX_WAKE, 1, NULL, NULL, 0); } } @@ -459,7 +460,7 @@ ReaderWriterMutex::~ReaderWriterMutex() { CHECK_EQ(state_, 0); CHECK_EQ(exclusive_owner_, 0U); CHECK_EQ(num_pending_readers_, 0); - CHECK_EQ(num_pending_writers_, 0); + CHECK_EQ(num_pending_writers_.LoadRelaxed(), 0); #else // We can't use CHECK_MUTEX_CALL here because on shutdown a suspended daemon thread // may still be using locks. @@ -523,7 +524,7 @@ void ReaderWriterMutex::ExclusiveUnlock(Thread* self) { done = __sync_bool_compare_and_swap(&state_, -1 /* cur_state*/, 0 /* new state */); if (LIKELY(done)) { // cmpxchg may fail due to noise? // Wake any waiters. - if (UNLIKELY(num_pending_readers_ > 0 || num_pending_writers_ > 0)) { + if (UNLIKELY(num_pending_readers_ > 0 || num_pending_writers_.LoadRelaxed() > 0)) { futex(&state_, FUTEX_WAKE, -1, NULL, NULL, 0); } } @@ -646,7 +647,7 @@ std::ostream& operator<<(std::ostream& os, const ReaderWriterMutex& mu) { ConditionVariable::ConditionVariable(const char* name, Mutex& guard) : name_(name), guard_(guard) { #if ART_USE_FUTEXES - sequence_ = 0; + DCHECK_EQ(0, sequence_.LoadRelaxed()); num_waiters_ = 0; #else pthread_condattr_t cond_attrs; @@ -691,7 +692,7 @@ void ConditionVariable::Broadcast(Thread* self) { sequence_++; // Indicate the broadcast occurred. bool done = false; do { - int32_t cur_sequence = sequence_; + int32_t cur_sequence = sequence_.LoadRelaxed(); // Requeue waiters onto mutex. The waiter holds the contender count on the mutex high ensuring // mutex unlocks will awaken the requeued waiter thread. done = futex(sequence_.Address(), FUTEX_CMP_REQUEUE, 0, @@ -740,7 +741,7 @@ void ConditionVariable::WaitHoldingLocks(Thread* self) { // Ensure the Mutex is contended so that requeued threads are awoken. guard_.num_contenders_++; guard_.recursion_count_ = 1; - int32_t cur_sequence = sequence_; + int32_t cur_sequence = sequence_.LoadRelaxed(); guard_.ExclusiveUnlock(self); if (futex(sequence_.Address(), FUTEX_WAIT, cur_sequence, NULL, NULL, 0) != 0) { // Futex failed, check it is an expected error. @@ -754,7 +755,7 @@ void ConditionVariable::WaitHoldingLocks(Thread* self) { CHECK_GE(num_waiters_, 0); num_waiters_--; // We awoke and so no longer require awakes from the guard_'s unlock. - CHECK_GE(guard_.num_contenders_, 0); + CHECK_GE(guard_.num_contenders_.LoadRelaxed(), 0); guard_.num_contenders_--; #else guard_.recursion_count_ = 0; @@ -775,7 +776,7 @@ void ConditionVariable::TimedWait(Thread* self, int64_t ms, int32_t ns) { // Ensure the Mutex is contended so that requeued threads are awoken. 
guard_.num_contenders_++; guard_.recursion_count_ = 1; - int32_t cur_sequence = sequence_; + int32_t cur_sequence = sequence_.LoadRelaxed(); guard_.ExclusiveUnlock(self); if (futex(sequence_.Address(), FUTEX_WAIT, cur_sequence, &rel_ts, NULL, 0) != 0) { if (errno == ETIMEDOUT) { @@ -790,7 +791,7 @@ void ConditionVariable::TimedWait(Thread* self, int64_t ms, int32_t ns) { CHECK_GE(num_waiters_, 0); num_waiters_--; // We awoke and so no longer require awakes from the guard_'s unlock. - CHECK_GE(guard_.num_contenders_, 0); + CHECK_GE(guard_.num_contenders_.LoadRelaxed(), 0); guard_.num_contenders_--; #else #if !defined(__APPLE__) diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h index 3f35670d6c..e13c8d5d62 100644 --- a/runtime/base/mutex.h +++ b/runtime/base/mutex.h @@ -160,12 +160,12 @@ class BaseMutex { void AddToWaitTime(uint64_t value); ContentionLogData() : wait_time(0) {} }; - ContentionLogData contetion_log_data_[kContentionLogDataSize]; + ContentionLogData contention_log_data_[kContentionLogDataSize]; public: bool HasEverContended() const { if (kLogLockContentions) { - return contetion_log_data_->contention_count > 0; + return contention_log_data_->contention_count.LoadSequentiallyConsistent() > 0; } return false; } diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index 4bd86db47f..ee276c1670 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -1602,15 +1602,72 @@ extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self, mirror::ArtMet } } +// The following definitions create return types for two word-sized entities that will be passed +// in registers so that memory operations for the interface trampolines can be avoided. The entities +// are the resolved method and the pointer to the code to be invoked. +// +// On x86, ARM32 and MIPS, this is given for a *scalar* 64bit value. The definition thus *must* be +// uint64_t or long long int. We use the upper 32b for code, and the lower 32b for the method. +// +// On x86_64 and ARM64, structs are decomposed for allocation, so we can create a structs of two +// size_t-sized values. +// +// We need two operations: +// +// 1) A flag value that signals failure. The assembly stubs expect the method part to be "0". +// GetFailureValue() will return a value that has method == 0. +// +// 2) A value that combines a code pointer and a method pointer. +// GetSuccessValue() constructs this. + +#if defined(__i386__) || defined(__arm__) || defined(__mips__) +typedef uint64_t MethodAndCode; + +// Encodes method_ptr==nullptr and code_ptr==nullptr +static constexpr MethodAndCode GetFailureValue() { + return 0; +} + +// Use the lower 32b for the method pointer and the upper 32b for the code pointer. +static MethodAndCode GetSuccessValue(const void* code, mirror::ArtMethod* method) { + uint32_t method_uint = reinterpret_cast<uint32_t>(method); + uint64_t code_uint = reinterpret_cast<uint32_t>(code); + return ((code_uint << 32) | method_uint); +} + +#elif defined(__x86_64__) || defined(__aarch64__) +struct MethodAndCode { + uintptr_t method; + uintptr_t code; +}; + +// Encodes method_ptr==nullptr. Leaves random value in code pointer. +static MethodAndCode GetFailureValue() { + MethodAndCode ret; + ret.method = 0; + return ret; +} + +// Write values into their respective members. 
+static MethodAndCode GetSuccessValue(const void* code, mirror::ArtMethod* method) { + MethodAndCode ret; + ret.method = reinterpret_cast<uintptr_t>(method); + ret.code = reinterpret_cast<uintptr_t>(code); + return ret; +} +#else +#error "Unsupported architecture" +#endif + template<InvokeType type, bool access_check> -static uint64_t artInvokeCommon(uint32_t method_idx, mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, mirror::ArtMethod** sp); +static MethodAndCode artInvokeCommon(uint32_t method_idx, mirror::Object* this_object, + mirror::ArtMethod* caller_method, + Thread* self, mirror::ArtMethod** sp); template<InvokeType type, bool access_check> -static uint64_t artInvokeCommon(uint32_t method_idx, mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, mirror::ArtMethod** sp) { +static MethodAndCode artInvokeCommon(uint32_t method_idx, mirror::Object* this_object, + mirror::ArtMethod* caller_method, + Thread* self, mirror::ArtMethod** sp) { mirror::ArtMethod* method = FindMethodFast(method_idx, this_object, caller_method, access_check, type); if (UNLIKELY(method == nullptr)) { @@ -1630,7 +1687,7 @@ static uint64_t artInvokeCommon(uint32_t method_idx, mirror::Object* this_object if (UNLIKELY(method == NULL)) { CHECK(self->IsExceptionPending()); - return 0; // failure + return GetFailureValue(); // Failure. } } DCHECK(!self->IsExceptionPending()); @@ -1639,24 +1696,17 @@ static uint64_t artInvokeCommon(uint32_t method_idx, mirror::Object* this_object // When we return, the caller will branch to this address, so it had better not be 0! DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method) << " location: " << MethodHelper(method).GetDexFile().GetLocation(); -#ifdef __LP64__ - UNIMPLEMENTED(FATAL); - return 0; -#else - uint32_t method_uint = reinterpret_cast<uint32_t>(method); - uint64_t code_uint = reinterpret_cast<uint32_t>(code); - uint64_t result = ((code_uint << 32) | method_uint); - return result; -#endif + + return GetSuccessValue(code, method); } // Explicit artInvokeCommon template function declarations to please analysis tool. 
#define EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(type, access_check) \ template SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) \ - uint64_t artInvokeCommon<type, access_check>(uint32_t method_idx, \ - mirror::Object* this_object, \ - mirror::ArtMethod* caller_method, \ - Thread* self, mirror::ArtMethod** sp) \ + MethodAndCode artInvokeCommon<type, access_check>(uint32_t method_idx, \ + mirror::Object* this_object, \ + mirror::ArtMethod* caller_method, \ + Thread* self, mirror::ArtMethod** sp) \ EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kVirtual, false); EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kVirtual, true); @@ -1672,57 +1722,57 @@ EXPLICIT_INVOKE_COMMON_TEMPLATE_DECL(kSuper, true); // See comments in runtime_support_asm.S -extern "C" uint64_t artInvokeInterfaceTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - mirror::ArtMethod** sp) +extern "C" MethodAndCode artInvokeInterfaceTrampolineWithAccessCheck(uint32_t method_idx, + mirror::Object* this_object, + mirror::ArtMethod* caller_method, + Thread* self, + mirror::ArtMethod** sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { return artInvokeCommon<kInterface, true>(method_idx, this_object, caller_method, self, sp); } -extern "C" uint64_t artInvokeDirectTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - mirror::ArtMethod** sp) +extern "C" MethodAndCode artInvokeDirectTrampolineWithAccessCheck(uint32_t method_idx, + mirror::Object* this_object, + mirror::ArtMethod* caller_method, + Thread* self, + mirror::ArtMethod** sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { return artInvokeCommon<kDirect, true>(method_idx, this_object, caller_method, self, sp); } -extern "C" uint64_t artInvokeStaticTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - mirror::ArtMethod** sp) +extern "C" MethodAndCode artInvokeStaticTrampolineWithAccessCheck(uint32_t method_idx, + mirror::Object* this_object, + mirror::ArtMethod* caller_method, + Thread* self, + mirror::ArtMethod** sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { return artInvokeCommon<kStatic, true>(method_idx, this_object, caller_method, self, sp); } -extern "C" uint64_t artInvokeSuperTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - mirror::ArtMethod** sp) +extern "C" MethodAndCode artInvokeSuperTrampolineWithAccessCheck(uint32_t method_idx, + mirror::Object* this_object, + mirror::ArtMethod* caller_method, + Thread* self, + mirror::ArtMethod** sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { return artInvokeCommon<kSuper, true>(method_idx, this_object, caller_method, self, sp); } -extern "C" uint64_t artInvokeVirtualTrampolineWithAccessCheck(uint32_t method_idx, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, - mirror::ArtMethod** sp) +extern "C" MethodAndCode artInvokeVirtualTrampolineWithAccessCheck(uint32_t method_idx, + mirror::Object* this_object, + mirror::ArtMethod* caller_method, + Thread* self, + mirror::ArtMethod** sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { return artInvokeCommon<kVirtual, true>(method_idx, this_object, caller_method, self, sp); } // Determine target of interface dispatch. This object is known non-null. 
-extern "C" uint64_t artInvokeInterfaceTrampoline(mirror::ArtMethod* interface_method, - mirror::Object* this_object, - mirror::ArtMethod* caller_method, - Thread* self, mirror::ArtMethod** sp) +extern "C" MethodAndCode artInvokeInterfaceTrampoline(mirror::ArtMethod* interface_method, + mirror::Object* this_object, + mirror::ArtMethod* caller_method, + Thread* self, mirror::ArtMethod** sp) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { mirror::ArtMethod* method; if (LIKELY(interface_method->GetDexMethodIndex() != DexFile::kDexNoIndex)) { @@ -1731,7 +1781,7 @@ extern "C" uint64_t artInvokeInterfaceTrampoline(mirror::ArtMethod* interface_me FinishCalleeSaveFrameSetup(self, sp, Runtime::kRefsAndArgs); ThrowIncompatibleClassChangeErrorClassForInterfaceDispatch(interface_method, this_object, caller_method); - return 0; // Failure. + return GetFailureValue(); // Failure. } } else { FinishCalleeSaveFrameSetup(self, sp, Runtime::kRefsAndArgs); @@ -1828,7 +1878,7 @@ extern "C" uint64_t artInvokeInterfaceTrampoline(mirror::ArtMethod* interface_me if (UNLIKELY(method == nullptr)) { CHECK(self->IsExceptionPending()); - return 0; // Failure. + return GetFailureValue(); // Failure. } } const void* code = method->GetEntryPointFromQuickCompiledCode(); @@ -1836,15 +1886,8 @@ extern "C" uint64_t artInvokeInterfaceTrampoline(mirror::ArtMethod* interface_me // When we return, the caller will branch to this address, so it had better not be 0! DCHECK(code != nullptr) << "Code was NULL in method: " << PrettyMethod(method) << " location: " << MethodHelper(method).GetDexFile().GetLocation(); -#ifdef __LP64__ - UNIMPLEMENTED(FATAL); - return 0; -#else - uint32_t method_uint = reinterpret_cast<uint32_t>(method); - uint64_t code_uint = reinterpret_cast<uint32_t>(code); - uint64_t result = ((code_uint << 32) | method_uint); - return result; -#endif + + return GetSuccessValue(code, method); } } // namespace art diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h index f3ed8d32c0..979970c4e2 100644 --- a/runtime/gc/accounting/atomic_stack.h +++ b/runtime/gc/accounting/atomic_stack.h @@ -46,8 +46,8 @@ class AtomicStack { void Reset() { DCHECK(mem_map_.get() != NULL); DCHECK(begin_ != NULL); - front_index_ = 0; - back_index_ = 0; + front_index_.StoreRelaxed(0); + back_index_.StoreRelaxed(0); debug_is_sorted_ = true; int result = madvise(begin_, sizeof(T) * capacity_, MADV_DONTNEED); if (result == -1) { @@ -64,12 +64,12 @@ class AtomicStack { } int32_t index; do { - index = back_index_; + index = back_index_.LoadRelaxed(); if (UNLIKELY(static_cast<size_t>(index) >= capacity_)) { // Stack overflow. return false; } - } while (!back_index_.CompareAndSwap(index, index + 1)); + } while (!back_index_.CompareExchangeWeakRelaxed(index, index + 1)); begin_[index] = value; return true; } @@ -83,13 +83,13 @@ class AtomicStack { int32_t index; int32_t new_index; do { - index = back_index_; + index = back_index_.LoadRelaxed(); new_index = index + num_slots; if (UNLIKELY(static_cast<size_t>(new_index) >= capacity_)) { // Stack overflow. 
return false; } - } while (!back_index_.CompareAndSwap(index, new_index)); + } while (!back_index_.CompareExchangeWeakRelaxed(index, new_index)); *start_address = &begin_[index]; *end_address = &begin_[new_index]; if (kIsDebugBuild) { @@ -114,31 +114,31 @@ class AtomicStack { if (kIsDebugBuild) { debug_is_sorted_ = false; } - int32_t index = back_index_; + int32_t index = back_index_.LoadRelaxed(); DCHECK_LT(static_cast<size_t>(index), capacity_); - back_index_ = index + 1; + back_index_.StoreRelaxed(index + 1); begin_[index] = value; } T PopBack() { - DCHECK_GT(back_index_, front_index_); + DCHECK_GT(back_index_.LoadRelaxed(), front_index_.LoadRelaxed()); // Decrement the back index non atomically. - back_index_ = back_index_ - 1; - return begin_[back_index_]; + back_index_.StoreRelaxed(back_index_.LoadRelaxed() - 1); + return begin_[back_index_.LoadRelaxed()]; } // Take an item from the front of the stack. T PopFront() { - int32_t index = front_index_; - DCHECK_LT(index, back_index_.Load()); - front_index_ = front_index_ + 1; + int32_t index = front_index_.LoadRelaxed(); + DCHECK_LT(index, back_index_.LoadRelaxed()); + front_index_.StoreRelaxed(index + 1); return begin_[index]; } // Pop a number of elements. void PopBackCount(int32_t n) { DCHECK_GE(Size(), static_cast<size_t>(n)); - back_index_.FetchAndSub(n); + back_index_.FetchAndSubSequentiallyConsistent(n); } bool IsEmpty() const { @@ -146,16 +146,16 @@ class AtomicStack { } size_t Size() const { - DCHECK_LE(front_index_, back_index_); - return back_index_ - front_index_; + DCHECK_LE(front_index_.LoadRelaxed(), back_index_.LoadRelaxed()); + return back_index_.LoadRelaxed() - front_index_.LoadRelaxed(); } T* Begin() const { - return const_cast<T*>(begin_ + front_index_); + return const_cast<T*>(begin_ + front_index_.LoadRelaxed()); } T* End() const { - return const_cast<T*>(begin_ + back_index_); + return const_cast<T*>(begin_ + back_index_.LoadRelaxed()); } size_t Capacity() const { @@ -169,11 +169,11 @@ class AtomicStack { } void Sort() { - int32_t start_back_index = back_index_.Load(); - int32_t start_front_index = front_index_.Load(); + int32_t start_back_index = back_index_.LoadRelaxed(); + int32_t start_front_index = front_index_.LoadRelaxed(); std::sort(Begin(), End()); - CHECK_EQ(start_back_index, back_index_.Load()); - CHECK_EQ(start_front_index, front_index_.Load()); + CHECK_EQ(start_back_index, back_index_.LoadRelaxed()); + CHECK_EQ(start_front_index, front_index_.LoadRelaxed()); if (kIsDebugBuild) { debug_is_sorted_ = true; } diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc index cc258f5a9a..43331c32b5 100644 --- a/runtime/gc/collector/mark_sweep.cc +++ b/runtime/gc/collector/mark_sweep.cc @@ -99,9 +99,10 @@ MarkSweep::MarkSweep(Heap* heap, bool is_concurrent, const std::string& name_pre : GarbageCollector(heap, name_prefix + (is_concurrent ? 
"concurrent mark sweep": "mark sweep")), + current_space_bitmap_(nullptr), mark_bitmap_(nullptr), mark_stack_(nullptr), gc_barrier_(new Barrier(0)), mark_stack_lock_("mark sweep mark stack lock", kMarkSweepMarkStackLock), - is_concurrent_(is_concurrent) { + is_concurrent_(is_concurrent), live_stack_freeze_size_(0) { } void MarkSweep::InitializePhase() { @@ -109,19 +110,19 @@ void MarkSweep::InitializePhase() { mark_stack_ = heap_->GetMarkStack(); DCHECK(mark_stack_ != nullptr); immune_region_.Reset(); - class_count_ = 0; - array_count_ = 0; - other_count_ = 0; - large_object_test_ = 0; - large_object_mark_ = 0; - overhead_time_ = 0; - work_chunks_created_ = 0; - work_chunks_deleted_ = 0; - reference_count_ = 0; - mark_null_count_ = 0; - mark_immune_count_ = 0; - mark_fastpath_count_ = 0; - mark_slowpath_count_ = 0; + class_count_.StoreRelaxed(0); + array_count_.StoreRelaxed(0); + other_count_.StoreRelaxed(0); + large_object_test_.StoreRelaxed(0); + large_object_mark_.StoreRelaxed(0); + overhead_time_ .StoreRelaxed(0); + work_chunks_created_.StoreRelaxed(0); + work_chunks_deleted_.StoreRelaxed(0); + reference_count_.StoreRelaxed(0); + mark_null_count_.StoreRelaxed(0); + mark_immune_count_.StoreRelaxed(0); + mark_fastpath_count_.StoreRelaxed(0); + mark_slowpath_count_.StoreRelaxed(0); { // TODO: I don't think we should need heap bitmap lock to Get the mark bitmap. ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_); @@ -596,7 +597,7 @@ class MarkStackTask : public Task { if (kUseFinger) { android_memory_barrier(); if (reinterpret_cast<uintptr_t>(ref) >= - static_cast<uintptr_t>(mark_sweep_->atomic_finger_)) { + static_cast<uintptr_t>(mark_sweep_->atomic_finger_.LoadRelaxed())) { return; } } @@ -881,7 +882,7 @@ void MarkSweep::RecursiveMark() { // This function does not handle heap end increasing, so we must use the space end. uintptr_t begin = reinterpret_cast<uintptr_t>(space->Begin()); uintptr_t end = reinterpret_cast<uintptr_t>(space->End()); - atomic_finger_ = static_cast<int32_t>(0xFFFFFFFF); + atomic_finger_.StoreRelaxed(AtomicInteger::MaxValue()); // Create a few worker tasks. const size_t n = thread_count * 2; @@ -1214,7 +1215,9 @@ void MarkSweep::ProcessMarkStackParallel(size_t thread_count) { thread_pool->Wait(self, true, true); thread_pool->StopWorkers(self); mark_stack_->Reset(); - CHECK_EQ(work_chunks_created_, work_chunks_deleted_) << " some of the work chunks were leaked"; + CHECK_EQ(work_chunks_created_.LoadSequentiallyConsistent(), + work_chunks_deleted_.LoadSequentiallyConsistent()) + << " some of the work chunks were leaked"; } // Scan anything that's on the mark stack. 
@@ -1269,24 +1272,27 @@ inline bool MarkSweep::IsMarked(const Object* object) const void MarkSweep::FinishPhase() { TimingLogger::ScopedSplit split("FinishPhase", &timings_); if (kCountScannedTypes) { - VLOG(gc) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_ - << " other=" << other_count_; + VLOG(gc) << "MarkSweep scanned classes=" << class_count_.LoadRelaxed() + << " arrays=" << array_count_.LoadRelaxed() << " other=" << other_count_.LoadRelaxed(); } if (kCountTasks) { - VLOG(gc) << "Total number of work chunks allocated: " << work_chunks_created_; + VLOG(gc) << "Total number of work chunks allocated: " << work_chunks_created_.LoadRelaxed(); } if (kMeasureOverhead) { - VLOG(gc) << "Overhead time " << PrettyDuration(overhead_time_); + VLOG(gc) << "Overhead time " << PrettyDuration(overhead_time_.LoadRelaxed()); } if (kProfileLargeObjects) { - VLOG(gc) << "Large objects tested " << large_object_test_ << " marked " << large_object_mark_; + VLOG(gc) << "Large objects tested " << large_object_test_.LoadRelaxed() + << " marked " << large_object_mark_.LoadRelaxed(); } if (kCountJavaLangRefs) { - VLOG(gc) << "References scanned " << reference_count_; + VLOG(gc) << "References scanned " << reference_count_.LoadRelaxed(); } if (kCountMarkedObjects) { - VLOG(gc) << "Marked: null=" << mark_null_count_ << " immune=" << mark_immune_count_ - << " fastpath=" << mark_fastpath_count_ << " slowpath=" << mark_slowpath_count_; + VLOG(gc) << "Marked: null=" << mark_null_count_.LoadRelaxed() + << " immune=" << mark_immune_count_.LoadRelaxed() + << " fastpath=" << mark_fastpath_count_.LoadRelaxed() + << " slowpath=" << mark_slowpath_count_.LoadRelaxed(); } CHECK(mark_stack_->IsEmpty()); // Ensure that the mark stack is empty. mark_stack_->Reset(); diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h index e9a3c3a42b..d73bf3f69f 100644 --- a/runtime/gc/collector/mark_sweep.h +++ b/runtime/gc/collector/mark_sweep.h @@ -305,14 +305,14 @@ class MarkSweep : public GarbageCollector { AtomicInteger mark_fastpath_count_; AtomicInteger mark_slowpath_count_; - // Verification. - size_t live_stack_freeze_size_; - std::unique_ptr<Barrier> gc_barrier_; Mutex mark_stack_lock_ ACQUIRED_AFTER(Locks::classlinker_classes_lock_); const bool is_concurrent_; + // Verification. + size_t live_stack_freeze_size_; + private: friend class AddIfReachesAllocSpaceVisitor; // Used by mod-union table. friend class CardScanTask; diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h index 7cee5a094c..03b72b6870 100644 --- a/runtime/gc/heap-inl.h +++ b/runtime/gc/heap-inl.h @@ -96,7 +96,7 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, mirror::Clas CHECK_LE(obj->SizeOf(), usable_size); } const size_t new_num_bytes_allocated = - static_cast<size_t>(num_bytes_allocated_.FetchAndAdd(bytes_allocated)) + bytes_allocated; + static_cast<size_t>(num_bytes_allocated_.FetchAndAddSequentiallyConsistent(bytes_allocated)) + bytes_allocated; // TODO: Deprecate. if (kInstrumented) { if (Runtime::Current()->HasStatsEnabled()) { @@ -264,7 +264,7 @@ inline Heap::AllocationTimer::~AllocationTimer() { // Only if the allocation succeeded, record the time. 
if (allocated_obj != nullptr) { uint64_t allocation_end_time = NanoTime() / kTimeAdjust; - heap_->total_allocation_time_.FetchAndAdd(allocation_end_time - allocation_start_time_); + heap_->total_allocation_time_.FetchAndAddSequentiallyConsistent(allocation_end_time - allocation_start_time_); } } }; @@ -279,7 +279,7 @@ inline bool Heap::ShouldAllocLargeObject(mirror::Class* c, size_t byte_count) co template <bool kGrow> inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size) { - size_t new_footprint = num_bytes_allocated_ + alloc_size; + size_t new_footprint = num_bytes_allocated_.LoadSequentiallyConsistent() + alloc_size; if (UNLIKELY(new_footprint > max_allowed_footprint_)) { if (UNLIKELY(new_footprint > growth_limit_)) { return true; diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index d37f2ad960..ea1ccdd665 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -116,6 +116,7 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max long_pause_log_threshold_(long_pause_log_threshold), long_gc_log_threshold_(long_gc_log_threshold), ignore_max_footprint_(ignore_max_footprint), + zygote_creation_lock_("zygote creation lock", kZygoteCreationLock), have_zygote_space_(false), large_object_threshold_(std::numeric_limits<size_t>::max()), // Starts out disabled. collector_type_running_(kCollectorTypeNone), @@ -292,7 +293,7 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max } // TODO: Count objects in the image space here. - num_bytes_allocated_ = 0; + num_bytes_allocated_.StoreRelaxed(0); // Default mark stack size in bytes. static const size_t default_mark_stack_size = 64 * KB; @@ -658,13 +659,13 @@ void Heap::RemoveSpace(space::Space* space) { void Heap::RegisterGCAllocation(size_t bytes) { if (this != nullptr) { - gc_memory_overhead_.FetchAndAdd(bytes); + gc_memory_overhead_.FetchAndAddSequentiallyConsistent(bytes); } } void Heap::RegisterGCDeAllocation(size_t bytes) { if (this != nullptr) { - gc_memory_overhead_.FetchAndSub(bytes); + gc_memory_overhead_.FetchAndSubSequentiallyConsistent(bytes); } } @@ -699,7 +700,8 @@ void Heap::DumpGcPerformanceInfo(std::ostream& os) { } collector->ResetMeasurements(); } - uint64_t allocation_time = static_cast<uint64_t>(total_allocation_time_) * kTimeAdjust; + uint64_t allocation_time = + static_cast<uint64_t>(total_allocation_time_.LoadRelaxed()) * kTimeAdjust; if (total_duration != 0) { const double total_seconds = static_cast<double>(total_duration / 1000) / 1000000.0; os << "Total time spent in GC: " << PrettyDuration(total_duration) << "\n"; @@ -719,7 +721,7 @@ void Heap::DumpGcPerformanceInfo(std::ostream& os) { } os << "Total mutator paused time: " << PrettyDuration(total_paused_time) << "\n"; os << "Total time waiting for GC to complete: " << PrettyDuration(total_wait_time_) << "\n"; - os << "Approximate GC data structures memory overhead: " << gc_memory_overhead_; + os << "Approximate GC data structures memory overhead: " << gc_memory_overhead_.LoadRelaxed(); BaseMutex::DumpAll(os); } @@ -1021,7 +1023,7 @@ void Heap::VerifyObjectBody(mirror::Object* obj) { return; } // Ignore early dawn of the universe verifications. 
- if (UNLIKELY(static_cast<size_t>(num_bytes_allocated_.Load()) < 10 * KB)) { + if (UNLIKELY(static_cast<size_t>(num_bytes_allocated_.LoadRelaxed()) < 10 * KB)) { return; } CHECK(IsAligned<kObjectAlignment>(obj)) << "Object isn't aligned: " << obj; @@ -1052,9 +1054,9 @@ void Heap::RecordFree(uint64_t freed_objects, int64_t freed_bytes) { // Use signed comparison since freed bytes can be negative when background compaction foreground // transitions occurs. This is caused by the moving objects from a bump pointer space to a // free list backed space typically increasing memory footprint due to padding and binning. - DCHECK_LE(freed_bytes, static_cast<int64_t>(num_bytes_allocated_.Load())); + DCHECK_LE(freed_bytes, static_cast<int64_t>(num_bytes_allocated_.LoadRelaxed())); // Note: This relies on 2s complement for handling negative freed_bytes. - num_bytes_allocated_.FetchAndSub(static_cast<ssize_t>(freed_bytes)); + num_bytes_allocated_.FetchAndSubSequentiallyConsistent(static_cast<ssize_t>(freed_bytes)); if (Runtime::Current()->HasStatsEnabled()) { RuntimeStats* thread_stats = Thread::Current()->GetStats(); thread_stats->freed_objects += freed_objects; @@ -1312,7 +1314,7 @@ void Heap::TransitionCollector(CollectorType collector_type) { VLOG(heap) << "TransitionCollector: " << static_cast<int>(collector_type_) << " -> " << static_cast<int>(collector_type); uint64_t start_time = NanoTime(); - uint32_t before_allocated = num_bytes_allocated_.Load(); + uint32_t before_allocated = num_bytes_allocated_.LoadSequentiallyConsistent(); ThreadList* tl = Runtime::Current()->GetThreadList(); Thread* self = Thread::Current(); ScopedThreadStateChange tsc(self, kWaitingPerformingGc); @@ -1390,7 +1392,7 @@ void Heap::TransitionCollector(CollectorType collector_type) { uint64_t duration = NanoTime() - start_time; GrowForUtilization(semi_space_collector_); FinishGC(self, collector::kGcTypeFull); - int32_t after_allocated = num_bytes_allocated_.Load(); + int32_t after_allocated = num_bytes_allocated_.LoadSequentiallyConsistent(); int32_t delta_allocated = before_allocated - after_allocated; LOG(INFO) << "Heap transition to " << process_state_ << " took " << PrettyDuration(duration) << " saved at least " << PrettySize(delta_allocated); @@ -1551,7 +1553,6 @@ void Heap::UnBindBitmaps() { void Heap::PreZygoteFork() { CollectGarbageInternal(collector::kGcTypeFull, kGcCauseBackground, false); - static Mutex zygote_creation_lock_("zygote creation lock", kZygoteCreationLock); Thread* self = Thread::Current(); MutexLock mu(self, zygote_creation_lock_); // Try to see if we have any Zygote spaces. @@ -1866,7 +1867,7 @@ class VerifyReferenceVisitor { : heap_(heap), fail_count_(fail_count), verify_referent_(verify_referent) {} size_t GetFailureCount() const { - return fail_count_->Load(); + return fail_count_->LoadSequentiallyConsistent(); } void operator()(mirror::Class* klass, mirror::Reference* ref) const @@ -1903,7 +1904,7 @@ class VerifyReferenceVisitor { // Verify that the reference is live. return true; } - if (fail_count_->FetchAndAdd(1) == 0) { + if (fail_count_->FetchAndAddSequentiallyConsistent(1) == 0) { // Print message on only on first failure to prevent spam. 
LOG(ERROR) << "!!!!!!!!!!!!!!Heap corruption detected!!!!!!!!!!!!!!!!!!!"; } @@ -2019,7 +2020,7 @@ class VerifyObjectVisitor { } size_t GetFailureCount() const { - return fail_count_->Load(); + return fail_count_->LoadSequentiallyConsistent(); } private: @@ -2429,7 +2430,7 @@ bool Heap::IsMovableObject(const mirror::Object* obj) const { } void Heap::UpdateMaxNativeFootprint() { - size_t native_size = native_bytes_allocated_; + size_t native_size = native_bytes_allocated_.LoadRelaxed(); // TODO: Tune the native heap utilization to be a value other than the java heap utilization. size_t target_size = native_size / GetTargetHeapUtilization(); if (target_size > native_size + max_free_) { @@ -2701,21 +2702,22 @@ void Heap::RegisterNativeAllocation(JNIEnv* env, int bytes) { native_need_to_run_finalization_ = false; } // Total number of native bytes allocated. - native_bytes_allocated_.FetchAndAdd(bytes); - if (static_cast<size_t>(native_bytes_allocated_) > native_footprint_gc_watermark_) { + size_t new_native_bytes_allocated = native_bytes_allocated_.FetchAndAddSequentiallyConsistent(bytes); + new_native_bytes_allocated += bytes; + if (new_native_bytes_allocated > native_footprint_gc_watermark_) { collector::GcType gc_type = have_zygote_space_ ? collector::kGcTypePartial : collector::kGcTypeFull; // The second watermark is higher than the gc watermark. If you hit this it means you are // allocating native objects faster than the GC can keep up with. - if (static_cast<size_t>(native_bytes_allocated_) > native_footprint_limit_) { + if (new_native_bytes_allocated > native_footprint_limit_) { if (WaitForGcToComplete(kGcCauseForNativeAlloc, self) != collector::kGcTypeNone) { // Just finished a GC, attempt to run finalizers. RunFinalization(env); CHECK(!env->ExceptionCheck()); } // If we still are over the watermark, attempt a GC for alloc and run finalizers. 
- if (static_cast<size_t>(native_bytes_allocated_) > native_footprint_limit_) { + if (new_native_bytes_allocated > native_footprint_limit_) { CollectGarbageInternal(gc_type, kGcCauseForNativeAlloc, false); RunFinalization(env); native_need_to_run_finalization_ = false; @@ -2737,7 +2739,7 @@ void Heap::RegisterNativeAllocation(JNIEnv* env, int bytes) { void Heap::RegisterNativeFree(JNIEnv* env, int bytes) { int expected_size, new_size; do { - expected_size = native_bytes_allocated_.Load(); + expected_size = native_bytes_allocated_.LoadRelaxed(); new_size = expected_size - bytes; if (UNLIKELY(new_size < 0)) { ScopedObjectAccess soa(env); @@ -2746,7 +2748,7 @@ void Heap::RegisterNativeFree(JNIEnv* env, int bytes) { "registered as allocated", bytes, expected_size).c_str()); break; } - } while (!native_bytes_allocated_.CompareAndSwap(expected_size, new_size)); + } while (!native_bytes_allocated_.CompareExchangeWeakRelaxed(expected_size, new_size)); } size_t Heap::GetTotalMemory() const { @@ -2781,9 +2783,9 @@ void Heap::AddRememberedSet(accounting::RememberedSet* remembered_set) { CHECK(remembered_set != nullptr); space::Space* space = remembered_set->GetSpace(); CHECK(space != nullptr); - CHECK(remembered_sets_.find(space) == remembered_sets_.end()); + CHECK(remembered_sets_.find(space) == remembered_sets_.end()) << space; remembered_sets_.Put(space, remembered_set); - CHECK(remembered_sets_.find(space) != remembered_sets_.end()); + CHECK(remembered_sets_.find(space) != remembered_sets_.end()) << space; } void Heap::RemoveRememberedSet(space::Space* space) { diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index 6fe0dcf24e..887b17eb05 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -372,7 +372,7 @@ class Heap { // Returns the number of bytes currently allocated. size_t GetBytesAllocated() const { - return num_bytes_allocated_; + return num_bytes_allocated_.LoadSequentiallyConsistent(); } // Returns the number of objects currently allocated. @@ -408,7 +408,7 @@ class Heap { // Implements java.lang.Runtime.freeMemory. size_t GetFreeMemory() const { - return GetTotalMemory() - num_bytes_allocated_; + return GetTotalMemory() - num_bytes_allocated_.LoadSequentiallyConsistent(); } // get the space that corresponds to an object's address. Current implementation searches all @@ -778,6 +778,9 @@ class Heap { // useful for benchmarking since it reduces time spent in GC to a low %. const bool ignore_max_footprint_; + // Lock which guards zygote space creation. + Mutex zygote_creation_lock_; + // If we have a zygote space. bool have_zygote_space_; diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h index 497a61f273..71c295e147 100644 --- a/runtime/gc/space/bump_pointer_space-inl.h +++ b/runtime/gc/space/bump_pointer_space-inl.h @@ -48,8 +48,8 @@ inline mirror::Object* BumpPointerSpace::AllocThreadUnsafe(Thread* self, size_t end_ += num_bytes; *bytes_allocated = num_bytes; // Use the CAS free versions as an optimization. 
- objects_allocated_ = objects_allocated_ + 1; - bytes_allocated_ = bytes_allocated_ + num_bytes; + objects_allocated_.StoreRelaxed(objects_allocated_.LoadRelaxed() + 1); + bytes_allocated_.StoreRelaxed(bytes_allocated_.LoadRelaxed() + num_bytes); if (UNLIKELY(usable_size != nullptr)) { *usable_size = num_bytes; } @@ -76,8 +76,8 @@ inline mirror::Object* BumpPointerSpace::AllocNonvirtualWithoutAccounting(size_t inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) { mirror::Object* ret = AllocNonvirtualWithoutAccounting(num_bytes); if (ret != nullptr) { - objects_allocated_.FetchAndAdd(1); - bytes_allocated_.FetchAndAdd(num_bytes); + objects_allocated_.FetchAndAddSequentiallyConsistent(1); + bytes_allocated_.FetchAndAddSequentiallyConsistent(num_bytes); } return ret; } diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc index fcd772bba5..fd0a92d56f 100644 --- a/runtime/gc/space/bump_pointer_space.cc +++ b/runtime/gc/space/bump_pointer_space.cc @@ -68,8 +68,8 @@ void BumpPointerSpace::Clear() { // Reset the end of the space back to the beginning, we move the end forward as we allocate // objects. SetEnd(Begin()); - objects_allocated_ = 0; - bytes_allocated_ = 0; + objects_allocated_.StoreRelaxed(0); + bytes_allocated_.StoreRelaxed(0); growth_end_ = Limit(); { MutexLock mu(Thread::Current(), block_lock_); @@ -204,7 +204,7 @@ accounting::ContinuousSpaceBitmap::SweepCallback* BumpPointerSpace::GetSweepCall uint64_t BumpPointerSpace::GetBytesAllocated() { // Start out pre-determined amount (blocks which are not being allocated into). - uint64_t total = static_cast<uint64_t>(bytes_allocated_.Load()); + uint64_t total = static_cast<uint64_t>(bytes_allocated_.LoadRelaxed()); Thread* self = Thread::Current(); MutexLock mu(self, *Locks::runtime_shutdown_lock_); MutexLock mu2(self, *Locks::thread_list_lock_); @@ -222,7 +222,7 @@ uint64_t BumpPointerSpace::GetBytesAllocated() { uint64_t BumpPointerSpace::GetObjectsAllocated() { // Start out pre-determined amount (blocks which are not being allocated into). 
- uint64_t total = static_cast<uint64_t>(objects_allocated_.Load()); + uint64_t total = static_cast<uint64_t>(objects_allocated_.LoadRelaxed()); Thread* self = Thread::Current(); MutexLock mu(self, *Locks::runtime_shutdown_lock_); MutexLock mu2(self, *Locks::thread_list_lock_); @@ -239,8 +239,8 @@ uint64_t BumpPointerSpace::GetObjectsAllocated() { } void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) { - objects_allocated_.FetchAndAdd(thread->GetThreadLocalObjectsAllocated()); - bytes_allocated_.FetchAndAdd(thread->GetThreadLocalBytesAllocated()); + objects_allocated_.FetchAndAddSequentiallyConsistent(thread->GetThreadLocalObjectsAllocated()); + bytes_allocated_.FetchAndAddSequentiallyConsistent(thread->GetThreadLocalBytesAllocated()); thread->SetTlab(nullptr, nullptr); } diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc index 6ea94a91a2..45fee14519 100644 --- a/runtime/gc/space/image_space.cc +++ b/runtime/gc/space/image_space.cc @@ -239,7 +239,7 @@ ImageSpace* ImageSpace::Init(const char* image_filename, const char* image_locat *error_msg = StringPrintf("Failed to map image bitmap: %s", error_msg->c_str()); return nullptr; } - uint32_t bitmap_index = bitmap_index_.FetchAndAdd(1); + uint32_t bitmap_index = bitmap_index_.FetchAndAddSequentiallyConsistent(1); std::string bitmap_name(StringPrintf("imagespace %s live-bitmap %u", image_filename, bitmap_index)); std::unique_ptr<accounting::ContinuousSpaceBitmap> bitmap( diff --git a/runtime/gc/space/zygote_space.cc b/runtime/gc/space/zygote_space.cc index 046641362d..fb3a12efb5 100644 --- a/runtime/gc/space/zygote_space.cc +++ b/runtime/gc/space/zygote_space.cc @@ -115,7 +115,7 @@ void ZygoteSpace::SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* ar // Need to mark the card since this will update the mod-union table next GC cycle. card_table->MarkCard(ptrs[i]); } - zygote_space->objects_allocated_.FetchAndSub(num_ptrs); + zygote_space->objects_allocated_.FetchAndSubSequentiallyConsistent(num_ptrs); } } // namespace space diff --git a/runtime/gc/space/zygote_space.h b/runtime/gc/space/zygote_space.h index 50fc62b699..5d5fe76b74 100644 --- a/runtime/gc/space/zygote_space.h +++ b/runtime/gc/space/zygote_space.h @@ -65,7 +65,7 @@ class ZygoteSpace FINAL : public ContinuousMemMapAllocSpace { } uint64_t GetObjectsAllocated() { - return objects_allocated_; + return objects_allocated_.LoadSequentiallyConsistent(); } void Clear() OVERRIDE; diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc index 075d225bf4..2dbcc8058d 100644 --- a/runtime/instrumentation.cc +++ b/runtime/instrumentation.cc @@ -522,9 +522,9 @@ void Instrumentation::SetEntrypointsInstrumented(bool instrumented) { void Instrumentation::InstrumentQuickAllocEntryPoints() { // TODO: the read of quick_alloc_entry_points_instrumentation_counter_ is racey and this code // should be guarded by a lock. 
- DCHECK_GE(quick_alloc_entry_points_instrumentation_counter_.Load(), 0); + DCHECK_GE(quick_alloc_entry_points_instrumentation_counter_.LoadSequentiallyConsistent(), 0); const bool enable_instrumentation = - quick_alloc_entry_points_instrumentation_counter_.FetchAndAdd(1) == 0; + quick_alloc_entry_points_instrumentation_counter_.FetchAndAddSequentiallyConsistent(1) == 0; if (enable_instrumentation) { SetEntrypointsInstrumented(true); } @@ -533,9 +533,9 @@ void Instrumentation::InstrumentQuickAllocEntryPoints() { void Instrumentation::UninstrumentQuickAllocEntryPoints() { // TODO: the read of quick_alloc_entry_points_instrumentation_counter_ is racey and this code // should be guarded by a lock. - DCHECK_GT(quick_alloc_entry_points_instrumentation_counter_.Load(), 0); + DCHECK_GT(quick_alloc_entry_points_instrumentation_counter_.LoadSequentiallyConsistent(), 0); const bool disable_instrumentation = - quick_alloc_entry_points_instrumentation_counter_.FetchAndSub(1) == 1; + quick_alloc_entry_points_instrumentation_counter_.FetchAndSubSequentiallyConsistent(1) == 1; if (disable_instrumentation) { SetEntrypointsInstrumented(false); } diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc index e0f9e5f958..9a274f618a 100644 --- a/runtime/interpreter/interpreter_goto_table_impl.cc +++ b/runtime/interpreter/interpreter_goto_table_impl.cc @@ -234,9 +234,9 @@ JValue ExecuteGotoImpl(Thread* self, MethodHelper& mh, const DexFile::CodeItem* HANDLE_INSTRUCTION_END(); HANDLE_INSTRUCTION_START(MOVE_EXCEPTION) { - Throwable* exception = self->GetException(NULL); - self->ClearException(); + Throwable* exception = self->GetException(nullptr); shadow_frame.SetVRegReference(inst->VRegA_11x(inst_data), exception); + self->ClearException(); ADVANCE(1); } HANDLE_INSTRUCTION_END(); diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc index c0275f636a..68759ad65a 100644 --- a/runtime/interpreter/interpreter_switch_impl.cc +++ b/runtime/interpreter/interpreter_switch_impl.cc @@ -163,9 +163,9 @@ JValue ExecuteSwitchImpl(Thread* self, MethodHelper& mh, const DexFile::CodeItem break; case Instruction::MOVE_EXCEPTION: { PREAMBLE(); - Throwable* exception = self->GetException(NULL); - self->ClearException(); + Throwable* exception = self->GetException(nullptr); shadow_frame.SetVRegReference(inst->VRegA_11x(inst_data), exception); + self->ClearException(); inst = inst->Next_1xx(); break; } diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc index 04905a57ca..69e5a84fed 100644 --- a/runtime/mirror/object.cc +++ b/runtime/mirror/object.cc @@ -139,10 +139,10 @@ int32_t Object::GenerateIdentityHashCode() { static AtomicInteger seed(987654321 + std::time(nullptr)); int32_t expected_value, new_value; do { - expected_value = static_cast<uint32_t>(seed.Load()); + expected_value = static_cast<uint32_t>(seed.LoadRelaxed()); new_value = expected_value * 1103515245 + 12345; } while ((expected_value & LockWord::kHashMask) == 0 || - !seed.CompareAndSwap(expected_value, new_value)); + !seed.CompareExchangeWeakRelaxed(expected_value, new_value)); return expected_value & LockWord::kHashMask; } diff --git a/runtime/monitor.cc b/runtime/monitor.cc index 0beb6514ee..f783edbfc3 100644 --- a/runtime/monitor.cc +++ b/runtime/monitor.cc @@ -99,12 +99,12 @@ Monitor::Monitor(Thread* self, Thread* owner, mirror::Object* obj, int32_t hash_ int32_t Monitor::GetHashCode() { while (!HasHashCode()) { - if 
(hash_code_.CompareAndSwap(0, mirror::Object::GenerateIdentityHashCode())) { + if (hash_code_.CompareExchangeWeakRelaxed(0, mirror::Object::GenerateIdentityHashCode())) { break; } } DCHECK(HasHashCode()); - return hash_code_.Load(); + return hash_code_.LoadRelaxed(); } bool Monitor::Install(Thread* self) { @@ -119,7 +119,7 @@ bool Monitor::Install(Thread* self) { break; } case LockWord::kHashCode: { - CHECK_EQ(hash_code_, static_cast<int32_t>(lw.GetHashCode())); + CHECK_EQ(hash_code_.LoadRelaxed(), static_cast<int32_t>(lw.GetHashCode())); break; } case LockWord::kFatLocked: { diff --git a/runtime/monitor.h b/runtime/monitor.h index bc5d2e4bb9..bc1b2ed4eb 100644 --- a/runtime/monitor.h +++ b/runtime/monitor.h @@ -107,7 +107,7 @@ class Monitor { bool IsLocked() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); bool HasHashCode() const { - return hash_code_.Load() != 0; + return hash_code_.LoadRelaxed() != 0; } MonitorId GetMonitorId() const { diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc index 2987393bf2..4330d275a9 100644 --- a/runtime/parsed_options.cc +++ b/runtime/parsed_options.cc @@ -533,7 +533,7 @@ bool ParsedOptions::Parse(const Runtime::Options& options, bool ignore_unrecogni Trace::SetDefaultClockSource(kProfilerClockSourceWall); } else if (option == "-Xprofile:dualclock") { Trace::SetDefaultClockSource(kProfilerClockSourceDual); - } else if (StartsWith(option, "-Xprofile:")) { + } else if (StartsWith(option, "-Xprofile-filename:")) { if (!ParseStringAfterChar(option, ':', &profile_output_filename_)) { return false; } @@ -786,7 +786,7 @@ void ParsedOptions::Usage(const char* fmt, ...) { UsageMessage(stream, " -Xmethod-trace\n"); UsageMessage(stream, " -Xmethod-trace-file:filename"); UsageMessage(stream, " -Xmethod-trace-file-size:integervalue\n"); - UsageMessage(stream, " -Xprofile=filename\n"); + UsageMessage(stream, " -Xprofile-filename:filename\n"); UsageMessage(stream, " -Xprofile-period:integervalue\n"); UsageMessage(stream, " -Xprofile-duration:integervalue\n"); UsageMessage(stream, " -Xprofile-interval:integervalue\n"); diff --git a/runtime/thread_pool_test.cc b/runtime/thread_pool_test.cc index c1a1ad73e0..292c94f64b 100644 --- a/runtime/thread_pool_test.cc +++ b/runtime/thread_pool_test.cc @@ -69,7 +69,7 @@ TEST_F(ThreadPoolTest, CheckRun) { // Wait for tasks to complete. thread_pool.Wait(self, true, false); // Make sure that we finished all the work. - EXPECT_EQ(num_tasks, count); + EXPECT_EQ(num_tasks, count.LoadSequentiallyConsistent()); } TEST_F(ThreadPoolTest, StopStart) { @@ -82,7 +82,7 @@ TEST_F(ThreadPoolTest, StopStart) { } usleep(200); // Check that no threads started prematurely. - EXPECT_EQ(0, count); + EXPECT_EQ(0, count.LoadSequentiallyConsistent()); // Signal the threads to start processing tasks. thread_pool.StartWorkers(self); usleep(200); @@ -91,10 +91,11 @@ TEST_F(ThreadPoolTest, StopStart) { thread_pool.AddTask(self, new CountTask(&bad_count)); usleep(200); // Ensure that the task added after the workers were stopped doesn't get run. - EXPECT_EQ(0, bad_count); + EXPECT_EQ(0, bad_count.LoadSequentiallyConsistent()); // Allow tasks to finish up and delete themselves. 
thread_pool.StartWorkers(self); - while (count.Load() != num_tasks && bad_count.Load() != 1) { + while (count.LoadSequentiallyConsistent() != num_tasks && + bad_count.LoadSequentiallyConsistent() != 1) { usleep(200); } thread_pool.StopWorkers(self); @@ -135,7 +136,7 @@ TEST_F(ThreadPoolTest, RecursiveTest) { thread_pool.AddTask(self, new TreeTask(&thread_pool, &count, depth)); thread_pool.StartWorkers(self); thread_pool.Wait(self, true, false); - EXPECT_EQ((1 << depth) - 1, count); + EXPECT_EQ((1 << depth) - 1, count.LoadSequentiallyConsistent()); } } // namespace art diff --git a/test/Android.mk b/test/Android.mk index 71f6be16a4..8caa033b98 100644 --- a/test/Android.mk +++ b/test/Android.mk @@ -116,7 +116,7 @@ define declare-test-art-oat-targets-impl test-art-target-oat-$(1)$($(2)ART_PHONY_TEST_TARGET_SUFFIX): $(ART_TEST_OUT)/oat-test-dex-$(1).jar test-art-target-sync adb shell touch $(ART_TEST_DIR)/$(TARGET_$(2)ARCH)/$$@ adb shell rm $(ART_TEST_DIR)/$(TARGET_$(2)ARCH)/$$@ - adb shell sh -c "/system/bin/dalvikvm$($(2)ART_TARGET_BINARY_SUFFIX) $(DALVIKVM_FLAGS) -XXlib:libartd.so -Ximage:$(ART_TEST_DIR)/core.art -classpath $(ART_TEST_DIR)/oat-test-dex-$(1).jar -Djava.library.path=$(ART_TEST_DIR)/$(TARGET_$(2)ARCH) $(1) && touch $(ART_TEST_DIR)/$(TARGET_$(2)ARCH)/$$@" + adb shell sh -c "/system/bin/dalvikvm$($(2)ART_PHONY_TEST_TARGET_SUFFIX) $(DALVIKVM_FLAGS) -XXlib:libartd.so -Ximage:$(ART_TEST_DIR)/core.art -classpath $(ART_TEST_DIR)/oat-test-dex-$(1).jar -Djava.library.path=$(ART_TEST_DIR)/$(TARGET_$(2)ARCH) $(1) && touch $(ART_TEST_DIR)/$(TARGET_$(2)ARCH)/$$@" $(hide) (adb pull $(ART_TEST_DIR)/$(TARGET_$(2)ARCH)/$$@ /tmp/ && echo $$@ PASSED) || (echo $$@ FAILED && exit 1) $(hide) rm /tmp/$$@ endef
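
The !ART_HAVE_STDATOMIC fallback added to atomic.h emulates sequentially consistent loads and stores with explicit QuasiAtomic barriers; when ART_HAVE_STDATOMIC is available the same operations presumably map onto std::atomic with seq_cst ordering, roughly as sketched below (this is an illustration, not the patch's own definition):

#include <atomic>
#include <cstdint>

std::atomic<int32_t> counter{0};

int32_t LoadSequentiallyConsistent() {
  // The fallback performs a plain load followed by a load barrier (MembarLoadLoad).
  return counter.load(std::memory_order_seq_cst);
}

void StoreSequentiallyConsistent(int32_t desired) {
  // The fallback brackets a plain store with MembarStoreStore before and MembarStoreLoad after.
  counter.store(desired, std::memory_order_seq_cst);
}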
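
The 32-bit MethodAndCode packing described in the quick_trampoline_entrypoints.cc comments above can be checked in isolation. The sketch below is illustrative only (its helper names are not part of the patch): it assumes a 32-bit target, packs the method pointer into the low 32 bits and the code pointer into the high 32 bits, and treats method == 0 as the failure flag.

#include <cassert>
#include <cstdint>

typedef uint64_t MethodAndCode;

static MethodAndCode Pack(uint32_t code, uint32_t method) {
  return (static_cast<uint64_t>(code) << 32) | method;
}

static uint32_t UnpackMethod(MethodAndCode value) { return static_cast<uint32_t>(value); }        // low 32 bits
static uint32_t UnpackCode(MethodAndCode value) { return static_cast<uint32_t>(value >> 32); }    // high 32 bits

int main() {
  MethodAndCode packed = Pack(0xCAFEBABEu, 0x12345678u);
  assert(UnpackMethod(packed) == 0x12345678u);
  assert(UnpackCode(packed) == 0xCAFEBABEu);
  assert(UnpackMethod(0) == 0);  // The failure value has method == 0, as GetFailureValue() requires.
  return 0;
}

On 64-bit targets the patch returns a two-member struct instead; since x86_64 and aarch64 decompose such structs, the stubs can pick up method and code from two registers without any memory traffic, which is the point of the new return types.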
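
Several of the converted call sites (AtomicStack::PushBack, Heap::RegisterNativeFree, Object::GenerateIdentityHashCode, Monitor::GetHashCode) now spell out the weak compare-exchange retry idiom. A minimal stand-alone version of the same pattern, written here against std::atomic as a stand-in for art::Atomic<int32_t>, looks like this:

#include <atomic>
#include <cstdint>

static std::atomic<int32_t> back_index{0};

// Reserve one slot, mirroring the retry loop in AtomicStack::PushBack: a weak
// compare-exchange may fail spuriously, so it always sits inside a loop, and relaxed
// ordering suffices because the index is not used to publish other data in this sketch.
bool PushIndex(int32_t capacity) {
  int32_t index = back_index.load(std::memory_order_relaxed);
  do {
    if (index >= capacity) {
      return false;  // Stack overflow, as in the original code.
    }
    // On failure, compare_exchange_weak reloads 'index' with the current value and we retry.
  } while (!back_index.compare_exchange_weak(index, index + 1, std::memory_order_relaxed));
  return true;
}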