96 files changed, 2296 insertions, 1460 deletions
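The central refactoring in this diff folds the old LoadBaseDispVolatile()/StoreBaseDispVolatile() entry points into LoadBaseDisp()/StoreBaseDisp(), which now take a VolatileKind argument so each backend emits its own memory barriers. The standalone C++ sketch below is illustrative only and not part of the patch: the Trace/Emit helpers are invented stand-ins for LIR emission; only the VolatileKind enum and the barrier ordering (LoadLoad plus LoadStore after a volatile load, StoreStore before and StoreLoad after a volatile store) come from the changes shown further down.

// Illustrative sketch, not part of the patch: shows where the conservative
// barriers land once LoadBaseDisp()/StoreBaseDisp() take a VolatileKind.
#include <iostream>
#include <string>
#include <vector>

enum VolatileKind { kNotVolatile, kVolatile };

struct Trace {
  std::vector<std::string> ops;
  void Emit(const std::string& op) { ops.push_back(op); }  // stand-in for NewLIR*/GenMemBarrier
};

// Volatile load: issue the load, then both LoadLoad and LoadStore barriers,
// since without context-sensitive analysis either a load or a store may follow.
void LoadBaseDisp(Trace* t, int displacement, VolatileKind is_volatile) {
  t->Emit("ldr [base, #" + std::to_string(displacement) + "]");
  if (is_volatile == kVolatile) {
    t->Emit("dmb (LoadLoad)");
    t->Emit("dmb (LoadStore)");
  }
}

// Volatile store: StoreStore barrier before the store, StoreLoad barrier after it.
void StoreBaseDisp(Trace* t, int displacement, VolatileKind is_volatile) {
  if (is_volatile == kVolatile) {
    t->Emit("dmb (StoreStore)");
  }
  t->Emit("str [base, #" + std::to_string(displacement) + "]");
  if (is_volatile == kVolatile) {
    t->Emit("dmb (StoreLoad)");
  }
}

int main() {
  Trace t;
  LoadBaseDisp(&t, 8, kVolatile);
  StoreBaseDisp(&t, 8, kVolatile);
  for (const std::string& op : t.ops) {
    std::cout << op << "\n";
  }
  return 0;
}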
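The diff also retires the TimingLogger StartSplit()/NewSplit()/EndSplit() interface in favour of explicit StartTiming()/EndTiming() calls and a TimingLogger::ScopedTiming guard (see common_compiler_test.h and dex/frontend.cc below). A minimal RAII stand-in, assuming ScopedTiming simply brackets StartTiming()/EndTiming(), could look like this; MiniTimingLogger is a made-up class, not the real art::TimingLogger interface.

// Hedged sketch of the RAII timing pattern adopted by the patch; MiniTimingLogger
// is a simplified stand-in for art::TimingLogger.
#include <iostream>
#include <string>
#include <vector>

class MiniTimingLogger {
 public:
  void StartTiming(const std::string& label) { events_.push_back("start " + label); }
  void EndTiming() { events_.push_back("end"); }

  // Starts a timing on construction and ends it on destruction, replacing the
  // old manual StartSplit()/NewSplit()/EndSplit() bookkeeping.
  class ScopedTiming {
   public:
    ScopedTiming(const std::string& label, MiniTimingLogger* logger) : logger_(logger) {
      logger_->StartTiming(label);
    }
    ~ScopedTiming() { logger_->EndTiming(); }
   private:
    MiniTimingLogger* logger_;
  };

  void Dump() const {
    for (const std::string& e : events_) std::cout << e << "\n";
  }

 private:
  std::vector<std::string> events_;
};

int main() {
  MiniTimingLogger timings;
  {
    MiniTimingLogger::ScopedTiming t("CompileOne", &timings);
    // ... compile ...
  }
  {
    MiniTimingLogger::ScopedTiming t2("MakeExecutable", &timings);
    // ... make executable ...
  }
  timings.Dump();
  return 0;
}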
diff --git a/build/Android.common.mk b/build/Android.common.mk index 09f34b3092..f916e1ee6f 100644 --- a/build/Android.common.mk +++ b/build/Android.common.mk @@ -134,7 +134,7 @@ endif # Clang on the target: only enabled for ARM64. Target builds use GCC by default. ART_TARGET_CLANG := ART_TARGET_CLANG_arm := -ART_TARGET_CLANG_arm64 := true +ART_TARGET_CLANG_arm64 := ART_TARGET_CLANG_mips := ART_TARGET_CLANG_x86 := ART_TARGET_CLANG_x86_64 := diff --git a/build/Android.oat.mk b/build/Android.oat.mk index fbb7eb36c6..c67a815832 100644 --- a/build/Android.oat.mk +++ b/build/Android.oat.mk @@ -42,6 +42,11 @@ $(HOST_CORE_IMG_OUT): $(HOST_CORE_DEX_FILES) $(DEX2OATD_DEPENDENCY) $(HOST_CORE_OAT_OUT): $(HOST_CORE_IMG_OUT) +IMPLICIT_CHECKS_arm := null,stack +IMPLICIT_CHECKS_arm64 := none +IMPLICIT_CHECKS_x86 := none +IMPLICIT_CHECKS_x86_64 := none +IMPLICIT_CHECKS_mips := none define create-oat-target-targets $$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENCY) @echo "target dex2oat: $$@ ($$?)" @@ -49,6 +54,7 @@ $$($(1)TARGET_CORE_IMG_OUT): $$($(1)TARGET_CORE_DEX_FILES) $$(DEX2OATD_DEPENDENC $$(hide) $$(DEX2OATD) --runtime-arg -Xms16m --runtime-arg -Xmx16m --image-classes=$$(PRELOADED_CLASSES) $$(addprefix \ --dex-file=,$$(TARGET_CORE_DEX_FILES)) $$(addprefix --dex-location=,$$(TARGET_CORE_DEX_LOCATIONS)) --oat-file=$$($(1)TARGET_CORE_OAT_OUT) \ --oat-location=$$($(1)TARGET_CORE_OAT) --image=$$($(1)TARGET_CORE_IMG_OUT) --base=$$(LIBART_IMG_TARGET_BASE_ADDRESS) \ + --implicit-checks=$(IMPLICIT_CHECKS_$($(1)TARGET_ARCH)) \ --instruction-set=$$($(1)TARGET_ARCH) --instruction-set-features=$$(TARGET_INSTRUCTION_SET_FEATURES) --android-root=$$(PRODUCT_OUT)/system # This "renaming" eases declaration in art/Android.mk @@ -58,7 +64,7 @@ $$($(1)TARGET_CORE_OAT_OUT): $$($(1)TARGET_CORE_IMG_OUT) endef ifdef TARGET_2ND_ARCH -$(eval $(call create-oat-target-targets,2ND_)) + $(eval $(call create-oat-target-targets,2ND_)) endif $(eval $(call create-oat-target-targets,)) diff --git a/compiler/common_compiler_test.h b/compiler/common_compiler_test.h index 5050d4eb78..45cf2fba7f 100644 --- a/compiler/common_compiler_test.h +++ b/compiler/common_compiler_test.h @@ -371,10 +371,10 @@ class CommonCompilerTest : public CommonRuntimeTest { void CompileMethod(mirror::ArtMethod* method) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { CHECK(method != nullptr); TimingLogger timings("CommonTest::CompileMethod", false, false); - timings.StartSplit("CompileOne"); + TimingLogger::ScopedTiming t(__FUNCTION__, &timings); compiler_driver_->CompileOne(method, &timings); + TimingLogger::ScopedTiming t2("MakeExecutable", &timings); MakeExecutable(method); - timings.EndSplit(); } void CompileDirectMethod(Handle<mirror::ClassLoader> class_loader, const char* class_name, diff --git a/compiler/compiled_method.cc b/compiler/compiled_method.cc index 7441daccfe..f098a34ea7 100644 --- a/compiler/compiled_method.cc +++ b/compiler/compiled_method.cc @@ -86,7 +86,11 @@ uint32_t CompiledCode::AlignCode(uint32_t offset, InstructionSet instruction_set } size_t CompiledCode::CodeDelta() const { - switch (instruction_set_) { + return CodeDelta(instruction_set_); +} + +size_t CompiledCode::CodeDelta(InstructionSet instruction_set) { + switch (instruction_set) { case kArm: case kArm64: case kMips: @@ -98,7 +102,7 @@ size_t CompiledCode::CodeDelta() const { return 1; } default: - LOG(FATAL) << "Unknown InstructionSet: " << instruction_set_; + LOG(FATAL) << "Unknown InstructionSet: " << instruction_set; return 0; } } diff --git 
a/compiler/compiled_method.h b/compiler/compiled_method.h index 23cd250678..b8cd851a1f 100644 --- a/compiler/compiled_method.h +++ b/compiler/compiled_method.h @@ -67,6 +67,7 @@ class CompiledCode { // returns the difference between the code address and a usable PC. // mainly to cope with kThumb2 where the lower bit must be set. size_t CodeDelta() const; + static size_t CodeDelta(InstructionSet instruction_set); // Returns a pointer suitable for invoking the code at the argument // code_pointer address. Mainly to cope with kThumb2 where the diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h index de9ac4bd01..caecb7a48e 100644 --- a/compiler/dex/compiler_enums.h +++ b/compiler/dex/compiler_enums.h @@ -527,6 +527,13 @@ enum FixupKind { std::ostream& operator<<(std::ostream& os, const FixupKind& kind); +enum VolatileKind { + kNotVolatile, // Load/Store is not volatile + kVolatile // Load/Store is volatile +}; + +std::ostream& operator<<(std::ostream& os, const VolatileKind& kind); + } // namespace art #endif // ART_COMPILER_DEX_COMPILER_ENUMS_H_ diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc index 414d51412e..72990b4be0 100644 --- a/compiler/dex/frontend.cc +++ b/compiler/dex/frontend.cc @@ -114,19 +114,20 @@ CompilationUnit::~CompilationUnit() { void CompilationUnit::StartTimingSplit(const char* label) { if (compiler_driver->GetDumpPasses()) { - timings.StartSplit(label); + timings.StartTiming(label); } } void CompilationUnit::NewTimingSplit(const char* label) { if (compiler_driver->GetDumpPasses()) { - timings.NewSplit(label); + timings.EndTiming(); + timings.StartTiming(label); } } void CompilationUnit::EndTiming() { if (compiler_driver->GetDumpPasses()) { - timings.EndSplit(); + timings.EndTiming(); if (enable_debug & (1 << kDebugTimings)) { LOG(INFO) << "TIMINGS " << PrettyMethod(method_idx, *dex_file); LOG(INFO) << Dumpable<TimingLogger>(timings); @@ -783,10 +784,11 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, uint16_t class_def_idx, uint32_t method_idx, jobject class_loader, const DexFile& dex_file, void* llvm_compilation_unit) { - VLOG(compiler) << "Compiling " << PrettyMethod(method_idx, dex_file) << "..."; + std::string method_name = PrettyMethod(method_idx, dex_file); + VLOG(compiler) << "Compiling " << method_name << "..."; if (code_item->insns_size_in_code_units_ >= 0x10000) { LOG(INFO) << "Method size exceeds compiler limits: " << code_item->insns_size_in_code_units_ - << " in " << PrettyMethod(method_idx, dex_file); + << " in " << method_name; return NULL; } @@ -818,8 +820,7 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, cu.compiler_flip_match = false; bool use_match = !cu.compiler_method_match.empty(); bool match = use_match && (cu.compiler_flip_match ^ - (PrettyMethod(method_idx, dex_file).find(cu.compiler_method_match) != - std::string::npos)); + (method_name.find(cu.compiler_method_match) != std::string::npos)); if (!use_match || match) { cu.disable_opt = kCompilerOptimizerDisableFlags; cu.enable_debug = kCompilerDebugFlags; @@ -830,7 +831,7 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, if (gVerboseMethods.size() != 0) { cu.verbose = false; for (size_t i = 0; i < gVerboseMethods.size(); ++i) { - if (PrettyMethod(method_idx, dex_file).find(gVerboseMethods[i]) + if (method_name.find(gVerboseMethods[i]) != std::string::npos) { cu.verbose = true; break; @@ -864,7 +865,9 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, (1 << kPromoteCompilerTemps)); } else if 
(cu.instruction_set == kX86_64) { // TODO(X86_64): enable optimizations once backend is mature enough. - cu.disable_opt = ~(uint32_t)0; + cu.disable_opt |= ( + (1 << kLoadStoreElimination) | + (1 << kPromoteRegs)); } else if (cu.instruction_set == kArm64) { // TODO(Arm64): enable optimizations once backend is mature enough. cu.disable_opt = ~(uint32_t)0; @@ -885,22 +888,13 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, cu.mir_graph->EnableOpcodeCounting(); } - // Check early if we should skip this compilation if the profiler is enabled. - if (cu.compiler_driver->ProfilePresent()) { - std::string methodname = PrettyMethod(method_idx, dex_file); - if (cu.mir_graph->SkipCompilationByName(methodname)) { - return nullptr; - } - } - /* Build the raw MIR graph */ cu.mir_graph->InlineMethod(code_item, access_flags, invoke_type, class_def_idx, method_idx, class_loader, dex_file); // TODO(Arm64): Remove this when we are able to compile everything. if (!CanCompileMethod(method_idx, dex_file, cu)) { - VLOG(compiler) << cu.instruction_set << ": Cannot compile method : " - << PrettyMethod(method_idx, dex_file); + VLOG(compiler) << cu.instruction_set << ": Cannot compile method : " << method_name; return nullptr; } @@ -908,7 +902,7 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, std::string skip_message; if (cu.mir_graph->SkipCompilation(&skip_message)) { VLOG(compiler) << cu.instruction_set << ": Skipping method : " - << PrettyMethod(method_idx, dex_file) << " Reason = " << skip_message; + << method_name << " Reason = " << skip_message; return nullptr; } @@ -916,6 +910,13 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, PassDriverMEOpts pass_driver(&cu); pass_driver.Launch(); + /* For non-leaf methods check if we should skip compilation when the profiler is enabled. 
*/ + if (cu.compiler_driver->ProfilePresent() + && !cu.mir_graph->MethodIsLeaf() + && cu.mir_graph->SkipCompilationByName(method_name)) { + return nullptr; + } + if (cu.enable_debug & (1 << kDebugDumpCheckStats)) { cu.mir_graph->DumpCheckStats(); } @@ -931,7 +932,7 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, if (cu.enable_debug & (1 << kDebugShowMemoryUsage)) { if (cu.arena_stack.PeakBytesAllocated() > 256 * 1024) { MemStats stack_stats(cu.arena_stack.GetPeakStats()); - LOG(INFO) << PrettyMethod(method_idx, dex_file) << " " << Dumpable<MemStats>(stack_stats); + LOG(INFO) << method_name << " " << Dumpable<MemStats>(stack_stats); } } cu.arena_stack.Reset(); @@ -939,8 +940,7 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, CompiledMethod* result = NULL; if (cu.mir_graph->PuntToInterpreter()) { - VLOG(compiler) << cu.instruction_set << ": Punted method to interpreter: " - << PrettyMethod(method_idx, dex_file); + VLOG(compiler) << cu.instruction_set << ": Punted method to interpreter: " << method_name; return nullptr; } @@ -951,21 +951,21 @@ static CompiledMethod* CompileMethod(CompilerDriver& driver, cu.NewTimingSplit("Cleanup"); if (result) { - VLOG(compiler) << cu.instruction_set << ": Compiled " << PrettyMethod(method_idx, dex_file); + VLOG(compiler) << cu.instruction_set << ": Compiled " << method_name; } else { - VLOG(compiler) << cu.instruction_set << ": Deferred " << PrettyMethod(method_idx, dex_file); + VLOG(compiler) << cu.instruction_set << ": Deferred " << method_name; } if (cu.enable_debug & (1 << kDebugShowMemoryUsage)) { if (cu.arena.BytesAllocated() > (1 * 1024 *1024)) { MemStats mem_stats(cu.arena.GetMemStats()); - LOG(INFO) << PrettyMethod(method_idx, dex_file) << " " << Dumpable<MemStats>(mem_stats); + LOG(INFO) << method_name << " " << Dumpable<MemStats>(mem_stats); } } if (cu.enable_debug & (1 << kDebugShowSummaryMemoryUsage)) { LOG(INFO) << "MEMINFO " << cu.arena.BytesAllocated() << " " << cu.mir_graph->GetNumBlocks() - << " " << PrettyMethod(method_idx, dex_file); + << " " << method_name; } cu.EndTiming(); diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc index 590c7674f6..04d6898e36 100644 --- a/compiler/dex/quick/arm/call_arm.cc +++ b/compiler/dex/quick/arm/call_arm.cc @@ -316,9 +316,9 @@ void ArmMir2Lir::GenMoveException(RegLocation rl_dest) { int ex_offset = Thread::ExceptionOffset<4>().Int32Value(); RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true); RegStorage reset_reg = AllocTempRef(); - LoadRefDisp(rs_rARM_SELF, ex_offset, rl_result.reg); + LoadRefDisp(rs_rARM_SELF, ex_offset, rl_result.reg, kNotVolatile); LoadConstant(reset_reg, 0); - StoreRefDisp(rs_rARM_SELF, ex_offset, reset_reg); + StoreRefDisp(rs_rARM_SELF, ex_offset, reset_reg, kNotVolatile); FreeTemp(reset_reg); StoreValue(rl_dest, rl_result); } diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h index 44998627ca..70dce7f11e 100644 --- a/compiler/dex/quick/arm/codegen_arm.h +++ b/compiler/dex/quick/arm/codegen_arm.h @@ -33,20 +33,16 @@ class ArmMir2Lir FINAL : public Mir2Lir { LIR* CheckSuspendUsingLoad() OVERRIDE; RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE; RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE; - LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; + OpSize size, VolatileKind is_volatile) OVERRIDE; LIR* 
LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale, OpSize size) OVERRIDE; LIR* LoadBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, RegStorage r_dest, OpSize size) OVERRIDE; LIR* LoadConstantNoClobber(RegStorage r_dest, int value); LIR* LoadConstantWide(RegStorage r_dest, int64_t value); - LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) OVERRIDE; LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) OVERRIDE; + OpSize size, VolatileKind is_volatile) OVERRIDE; LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale, OpSize size) OVERRIDE; LIR* StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc index 916c52838a..e34d944ab2 100644 --- a/compiler/dex/quick/arm/int_arm.cc +++ b/compiler/dex/quick/arm/int_arm.cc @@ -723,7 +723,7 @@ bool ArmMir2Lir::GenInlinedPeek(CallInfo* info, OpSize size) { } else { DCHECK(size == kSignedByte || size == kSignedHalf || size == k32); // Unaligned load with LDR and LDRSH is allowed on ARMv7 with SCTLR.A set to 0. - LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size); + LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile); StoreValue(rl_dest, rl_result); } return true; @@ -737,13 +737,13 @@ bool ArmMir2Lir::GenInlinedPoke(CallInfo* info, OpSize size) { if (size == k64) { // Fake unaligned STRD by two unaligned STR instructions on ARMv7 with SCTLR.A set to 0. RegLocation rl_value = LoadValueWide(rl_src_value, kCoreReg); - StoreBaseDisp(rl_address.reg, 0, rl_value.reg.GetLow(), k32); - StoreBaseDisp(rl_address.reg, 4, rl_value.reg.GetHigh(), k32); + StoreBaseDisp(rl_address.reg, 0, rl_value.reg.GetLow(), k32, kNotVolatile); + StoreBaseDisp(rl_address.reg, 4, rl_value.reg.GetHigh(), k32, kNotVolatile); } else { DCHECK(size == kSignedByte || size == kSignedHalf || size == k32); // Unaligned store with STR and STRSH is allowed on ARMv7 with SCTLR.A set to 0. RegLocation rl_value = LoadValue(rl_src_value, kCoreReg); - StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size); + StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile); } return true; } @@ -1230,7 +1230,7 @@ void ArmMir2Lir::GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, } FreeTemp(reg_len); } - LoadBaseDisp(reg_ptr, data_offset, rl_result.reg, size); + LoadBaseDisp(reg_ptr, data_offset, rl_result.reg, size, kNotVolatile); MarkPossibleNullPointerException(opt_flags); if (!constant_index) { FreeTemp(reg_ptr); @@ -1330,7 +1330,7 @@ void ArmMir2Lir::GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, FreeTemp(reg_len); } - StoreBaseDisp(reg_ptr, data_offset, rl_src.reg, size); + StoreBaseDisp(reg_ptr, data_offset, rl_src.reg, size, kNotVolatile); MarkPossibleNullPointerException(opt_flags); } else { /* reg_ptr -> array data */ diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc index b236f99311..bc8f95b1ca 100644 --- a/compiler/dex/quick/arm/utility_arm.cc +++ b/compiler/dex/quick/arm/utility_arm.cc @@ -961,31 +961,37 @@ LIR* ArmMir2Lir::LoadBaseDispBody(RegStorage r_base, int displacement, RegStorag return load; } -LIR* ArmMir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) { - // Only 64-bit load needs special handling. 
- if (UNLIKELY(size == k64 || size == kDouble)) { - DCHECK(!r_dest.IsFloat()); // See RegClassForFieldLoadSave(). - // If the cpu supports LPAE, aligned LDRD is atomic - fall through to LoadBaseDisp(). - if (!cu_->compiler_driver->GetInstructionSetFeatures().HasLpae()) { - // Use LDREXD for the atomic load. (Expect displacement > 0, don't optimize for == 0.) - RegStorage r_ptr = AllocTemp(); - OpRegRegImm(kOpAdd, r_ptr, r_base, displacement); - LIR* lir = NewLIR3(kThumb2Ldrexd, r_dest.GetLowReg(), r_dest.GetHighReg(), r_ptr.GetReg()); - FreeTemp(r_ptr); - return lir; - } - } - return LoadBaseDisp(r_base, displacement, r_dest, size); -} - LIR* ArmMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) { + OpSize size, VolatileKind is_volatile) { // TODO: base this on target. if (size == kWord) { size = k32; } - return LoadBaseDispBody(r_base, displacement, r_dest, size); + LIR* load; + if (UNLIKELY(is_volatile == kVolatile && + (size == k64 || size == kDouble) && + !cu_->compiler_driver->GetInstructionSetFeatures().HasLpae())) { + // Only 64-bit load needs special handling. + // If the cpu supports LPAE, aligned LDRD is atomic - fall through to LoadBaseDisp(). + DCHECK(!r_dest.IsFloat()); // See RegClassForFieldLoadSave(). + // Use LDREXD for the atomic load. (Expect displacement > 0, don't optimize for == 0.) + RegStorage r_ptr = AllocTemp(); + OpRegRegImm(kOpAdd, r_ptr, r_base, displacement); + LIR* lir = NewLIR3(kThumb2Ldrexd, r_dest.GetLowReg(), r_dest.GetHighReg(), r_ptr.GetReg()); + FreeTemp(r_ptr); + return lir; + } else { + load = LoadBaseDispBody(r_base, displacement, r_dest, size); + } + + if (UNLIKELY(is_volatile == kVolatile)) { + // Without context sensitive analysis, we must issue the most conservative barriers. + // In this case, either a load or store may follow so we issue both barriers. + GenMemBarrier(kLoadLoad); + GenMemBarrier(kLoadStore); + } + + return load; } @@ -1081,49 +1087,58 @@ LIR* ArmMir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, RegStora return store; } -LIR* ArmMir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) { - // Only 64-bit store needs special handling. - if (UNLIKELY(size == k64 || size == kDouble)) { - DCHECK(!r_src.IsFloat()); // See RegClassForFieldLoadSave(). +LIR* ArmMir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, + OpSize size, VolatileKind is_volatile) { + if (UNLIKELY(is_volatile == kVolatile)) { + // There might have been a store before this volatile one so insert StoreStore barrier. + GenMemBarrier(kStoreStore); + } + + LIR* store; + if (UNLIKELY(is_volatile == kVolatile && + (size == k64 || size == kDouble) && + !cu_->compiler_driver->GetInstructionSetFeatures().HasLpae())) { + // Only 64-bit store needs special handling. // If the cpu supports LPAE, aligned STRD is atomic - fall through to StoreBaseDisp(). - if (!cu_->compiler_driver->GetInstructionSetFeatures().HasLpae()) { - // Use STREXD for the atomic store. (Expect displacement > 0, don't optimize for == 0.) - RegStorage r_ptr = AllocTemp(); + // Use STREXD for the atomic store. (Expect displacement > 0, don't optimize for == 0.) + DCHECK(!r_src.IsFloat()); // See RegClassForFieldLoadSave(). 
+ RegStorage r_ptr = AllocTemp(); + OpRegRegImm(kOpAdd, r_ptr, r_base, displacement); + LIR* fail_target = NewLIR0(kPseudoTargetLabel); + // We have only 5 temporary registers available and if r_base, r_src and r_ptr already + // take 4, we can't directly allocate 2 more for LDREXD temps. In that case clobber r_ptr + // in LDREXD and recalculate it from r_base. + RegStorage r_temp = AllocTemp(); + RegStorage r_temp_high = AllocFreeTemp(); // We may not have another temp. + if (r_temp_high.Valid()) { + NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_temp_high.GetReg(), r_ptr.GetReg()); + FreeTemp(r_temp_high); + FreeTemp(r_temp); + } else { + // If we don't have another temp, clobber r_ptr in LDREXD and reload it. + NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_ptr.GetReg(), r_ptr.GetReg()); + FreeTemp(r_temp); // May need the temp for kOpAdd. OpRegRegImm(kOpAdd, r_ptr, r_base, displacement); - LIR* fail_target = NewLIR0(kPseudoTargetLabel); - // We have only 5 temporary registers available and if r_base, r_src and r_ptr already - // take 4, we can't directly allocate 2 more for LDREXD temps. In that case clobber r_ptr - // in LDREXD and recalculate it from r_base. - RegStorage r_temp = AllocTemp(); - RegStorage r_temp_high = AllocFreeTemp(); // We may not have another temp. - if (r_temp_high.Valid()) { - NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_temp_high.GetReg(), r_ptr.GetReg()); - FreeTemp(r_temp_high); - FreeTemp(r_temp); - } else { - // If we don't have another temp, clobber r_ptr in LDREXD and reload it. - NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_ptr.GetReg(), r_ptr.GetReg()); - FreeTemp(r_temp); // May need the temp for kOpAdd. - OpRegRegImm(kOpAdd, r_ptr, r_base, displacement); - } - LIR* lir = NewLIR4(kThumb2Strexd, r_temp.GetReg(), r_src.GetLowReg(), r_src.GetHighReg(), - r_ptr.GetReg()); - OpCmpImmBranch(kCondNe, r_temp, 0, fail_target); - FreeTemp(r_ptr); - return lir; } + store = NewLIR4(kThumb2Strexd, r_temp.GetReg(), r_src.GetLowReg(), r_src.GetHighReg(), + r_ptr.GetReg()); + OpCmpImmBranch(kCondNe, r_temp, 0, fail_target); + FreeTemp(r_ptr); + } else { + // TODO: base this on target. + if (size == kWord) { + size = k32; + } + + store = StoreBaseDispBody(r_base, displacement, r_src, size); } - return StoreBaseDisp(r_base, displacement, r_src, size); -} -LIR* ArmMir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) { - // TODO: base this on target. - if (size == kWord) { - size = k32; + if (UNLIKELY(is_volatile == kVolatile)) { + // A load might follow the volatile store so insert a StoreLoad barrier. + GenMemBarrier(kStoreLoad); } - return StoreBaseDispBody(r_base, displacement, r_src, size); + + return store; } LIR* ArmMir2Lir::OpFpRegCopy(RegStorage r_dest, RegStorage r_src) { diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h index 3f32c5194e..1f1a252343 100644 --- a/compiler/dex/quick/arm64/arm64_lir.h +++ b/compiler/dex/quick/arm64/arm64_lir.h @@ -101,6 +101,7 @@ namespace art { // Temporary macros, used to mark code which wants to distinguish betweek zr/sp. 
#define A64_REG_IS_SP(reg_num) ((reg_num) == rwsp || (reg_num) == rsp) #define A64_REG_IS_ZR(reg_num) ((reg_num) == rwzr || (reg_num) == rxzr) +#define A64_REGSTORAGE_IS_SP_OR_ZR(rs) (((rs).GetRegNum() & 0x1f) == 0x1f) enum Arm64ResourceEncodingPos { kArm64GPReg0 = 0, diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc index 2a8da24982..bee64f1d42 100644 --- a/compiler/dex/quick/arm64/assemble_arm64.cc +++ b/compiler/dex/quick/arm64/assemble_arm64.cc @@ -632,19 +632,19 @@ uint8_t* Arm64Mir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) { if (static_cast<unsigned>(kind) < kFmtBitBlt) { bool is_zero = A64_REG_IS_ZR(operand); - if (kIsDebugBuild) { + if (kIsDebugBuild && (kFailOnSizeError || kReportSizeError)) { // Register usage checks: First establish register usage requirements based on the // format in `kind'. bool want_float = false; bool want_64_bit = false; - bool want_size_match = false; + bool want_var_size = true; bool want_zero = false; switch (kind) { case kFmtRegX: want_64_bit = true; // Intentional fall-through. case kFmtRegW: - want_size_match = true; + want_var_size = false; // Intentional fall-through. case kFmtRegR: want_zero = true; @@ -653,7 +653,7 @@ uint8_t* Arm64Mir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) { want_64_bit = true; // Intentional fall-through. case kFmtRegWOrSp: - want_size_match = true; + want_var_size = false; break; case kFmtRegROrSp: break; @@ -661,7 +661,7 @@ uint8_t* Arm64Mir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) { want_64_bit = true; // Intentional fall-through. case kFmtRegS: - want_size_match = true; + want_var_size = false; // Intentional fall-through. case kFmtRegF: want_float = true; @@ -672,21 +672,27 @@ uint8_t* Arm64Mir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) { break; } + // want_var_size == true means kind == kFmtReg{R,F}. In these two cases, we want + // the register size to be coherent with the instruction width. + if (want_var_size) { + want_64_bit = opcode_is_wide; + } + // Now check that the requirements are satisfied. RegStorage reg(operand | RegStorage::kValid); const char *expected = nullptr; if (want_float) { if (!reg.IsFloat()) { expected = "float register"; - } else if (want_size_match && (reg.IsDouble() != want_64_bit)) { + } else if (reg.IsDouble() != want_64_bit) { expected = (want_64_bit) ? "double register" : "single register"; } } else { if (reg.IsFloat()) { expected = "core register"; - } else if (want_size_match && (reg.Is64Bit() != want_64_bit)) { + } else if (reg.Is64Bit() != want_64_bit) { expected = (want_64_bit) ? "x-register" : "w-register"; - } else if (reg.GetRegNum() == 31 && is_zero != want_zero) { + } else if (A64_REGSTORAGE_IS_SP_OR_ZR(reg) && is_zero != want_zero) { expected = (want_zero) ? "zero-register" : "sp-register"; } } @@ -698,8 +704,13 @@ uint8_t* Arm64Mir2Lir::EncodeLIRs(uint8_t* write_pos, LIR* lir) { if (expected != nullptr) { LOG(WARNING) << "Method: " << PrettyMethod(cu_->method_idx, *cu_->dex_file) << " @ 0x" << std::hex << lir->dalvik_offset; - LOG(FATAL) << "Bad argument n. " << i << " of " << encoder->name - << ". Expected " << expected << ", got 0x" << std::hex << operand; + if (kFailOnSizeError) { + LOG(FATAL) << "Bad argument n. " << i << " of " << encoder->name + << ". Expected " << expected << ", got 0x" << std::hex << operand; + } else { + LOG(WARNING) << "Bad argument n. " << i << " of " << encoder->name + << ". 
Expected " << expected << ", got 0x" << std::hex << operand; + } } } diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc index 1df576b743..c3f4711546 100644 --- a/compiler/dex/quick/arm64/call_arm64.cc +++ b/compiler/dex/quick/arm64/call_arm64.cc @@ -267,7 +267,7 @@ void Arm64Mir2Lir::GenMonitorExit(int opt_flags, RegLocation rl_src) { MarkPossibleNullPointerException(opt_flags); LIR* slow_unlock_branch = OpCmpBranch(kCondNe, rs_w1, rs_w2, NULL); GenMemBarrier(kStoreLoad); - Store32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_xzr); + Store32Disp(rs_x0, mirror::Object::MonitorOffset().Int32Value(), rs_wzr); LIR* unlock_success_branch = OpUnconditionalBranch(NULL); LIR* slow_path_target = NewLIR0(kPseudoTargetLabel); @@ -289,8 +289,8 @@ void Arm64Mir2Lir::GenMonitorExit(int opt_flags, RegLocation rl_src) { void Arm64Mir2Lir::GenMoveException(RegLocation rl_dest) { int ex_offset = Thread::ExceptionOffset<8>().Int32Value(); RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true); - LoadRefDisp(rs_rA64_SELF, ex_offset, rl_result.reg); - StoreRefDisp(rs_rA64_SELF, ex_offset, rs_xzr); + LoadRefDisp(rs_rA64_SELF, ex_offset, rl_result.reg, kNotVolatile); + StoreRefDisp(rs_rA64_SELF, ex_offset, rs_xzr, kNotVolatile); StoreValue(rl_dest, rl_result); } diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h index f1270eca01..68fa6f40cb 100644 --- a/compiler/dex/quick/arm64/codegen_arm64.h +++ b/compiler/dex/quick/arm64/codegen_arm64.h @@ -26,6 +26,11 @@ namespace art { class Arm64Mir2Lir : public Mir2Lir { protected: + // If we detect a size error, FATAL out. + static constexpr bool kFailOnSizeError = false && kIsDebugBuild; + // If we detect a size error, report to LOG. + static constexpr bool kReportSizeError = false && kIsDebugBuild; + // TODO: consolidate 64-bit target support. 
class InToRegStorageMapper { public: @@ -69,22 +74,25 @@ class Arm64Mir2Lir : public Mir2Lir { LIR* CheckSuspendUsingLoad() OVERRIDE; RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE; RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE; - LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; + OpSize size, VolatileKind is_volatile) OVERRIDE; + LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest, + VolatileKind is_volatile) + OVERRIDE; LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale, OpSize size) OVERRIDE; + LIR* LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest) OVERRIDE; LIR* LoadBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, RegStorage r_dest, OpSize size) OVERRIDE; LIR* LoadConstantNoClobber(RegStorage r_dest, int value); LIR* LoadConstantWide(RegStorage r_dest, int64_t value); - LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) OVERRIDE; + OpSize size, VolatileKind is_volatile) OVERRIDE; + LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src, + VolatileKind is_volatile) OVERRIDE; LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale, OpSize size) OVERRIDE; + LIR* StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src) OVERRIDE; LIR* StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, RegStorage r_src, OpSize size) OVERRIDE; void MarkGCCard(RegStorage val_reg, RegStorage tgt_addr_reg) OVERRIDE; @@ -283,8 +291,15 @@ class Arm64Mir2Lir : public Mir2Lir { * @see As64BitReg */ RegStorage As32BitReg(RegStorage reg) { - DCHECK(reg.Is64Bit()); DCHECK(!reg.IsPair()); + if ((kFailOnSizeError || kReportSizeError) && !reg.Is64Bit()) { + if (kFailOnSizeError) { + LOG(FATAL) << "Expected 64b register"; + } else { + LOG(WARNING) << "Expected 64b register"; + return reg; + } + } RegStorage ret_val = RegStorage(RegStorage::k32BitSolo, reg.GetRawBits() & RegStorage::kRegTypeMask); DCHECK_EQ(GetRegInfo(reg)->FindMatchingView(RegisterInfo::k32SoloStorageMask) @@ -293,6 +308,18 @@ class Arm64Mir2Lir : public Mir2Lir { return ret_val; } + RegStorage Check32BitReg(RegStorage reg) { + if ((kFailOnSizeError || kReportSizeError) && !reg.Is32Bit()) { + if (kFailOnSizeError) { + LOG(FATAL) << "Checked for 32b register"; + } else { + LOG(WARNING) << "Checked for 32b register"; + return As32BitReg(reg); + } + } + return reg; + } + /** * @brief Given register wNN (sNN), returns register xNN (dNN). * @param reg #RegStorage containing a Solo32 input register (e.g. @c w1 or @c s2). 
@@ -300,8 +327,15 @@ class Arm64Mir2Lir : public Mir2Lir { * @see As32BitReg */ RegStorage As64BitReg(RegStorage reg) { - DCHECK(reg.Is32Bit()); DCHECK(!reg.IsPair()); + if ((kFailOnSizeError || kReportSizeError) && !reg.Is32Bit()) { + if (kFailOnSizeError) { + LOG(FATAL) << "Expected 32b register"; + } else { + LOG(WARNING) << "Expected 32b register"; + return reg; + } + } RegStorage ret_val = RegStorage(RegStorage::k64BitSolo, reg.GetRawBits() & RegStorage::kRegTypeMask); DCHECK_EQ(GetRegInfo(reg)->FindMatchingView(RegisterInfo::k64SoloStorageMask) @@ -310,6 +344,18 @@ class Arm64Mir2Lir : public Mir2Lir { return ret_val; } + RegStorage Check64BitReg(RegStorage reg) { + if ((kFailOnSizeError || kReportSizeError) && !reg.Is64Bit()) { + if (kFailOnSizeError) { + LOG(FATAL) << "Checked for 64b register"; + } else { + LOG(WARNING) << "Checked for 64b register"; + return As64BitReg(reg); + } + } + return reg; + } + LIR* LoadFPConstantValue(RegStorage r_dest, int32_t value); LIR* LoadFPConstantValueWide(RegStorage r_dest, int64_t value); void ReplaceFixup(LIR* prev_lir, LIR* orig_lir, LIR* new_lir); diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc index 2ac4adbadc..1fdbe2dfba 100644 --- a/compiler/dex/quick/arm64/int_arm64.cc +++ b/compiler/dex/quick/arm64/int_arm64.cc @@ -410,7 +410,7 @@ bool Arm64Mir2Lir::GenInlinedPeek(CallInfo* info, OpSize size) { RegLocation rl_address = LoadValue(rl_src_address, kCoreReg); // kRefReg RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true); - LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size); + LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile); if (size == k64) { StoreValueWide(rl_dest, rl_result); } else { @@ -433,7 +433,7 @@ bool Arm64Mir2Lir::GenInlinedPoke(CallInfo* info, OpSize size) { DCHECK(size == kSignedByte || size == kSignedHalf || size == k32); rl_value = LoadValue(rl_src_value, kCoreReg); } - StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size); + StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile); return true; } @@ -747,7 +747,11 @@ void Arm64Mir2Lir::GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, } FreeTemp(reg_len); } - LoadBaseDisp(reg_ptr, data_offset, rl_result.reg, size); + if (rl_result.ref) { + LoadRefDisp(reg_ptr, data_offset, rl_result.reg, kNotVolatile); + } else { + LoadBaseDisp(reg_ptr, data_offset, rl_result.reg, size, kNotVolatile); + } MarkPossibleNullPointerException(opt_flags); if (!constant_index) { FreeTemp(reg_ptr); @@ -768,7 +772,11 @@ void Arm64Mir2Lir::GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, GenArrayBoundsCheck(rl_index.reg, reg_len); FreeTemp(reg_len); } - LoadBaseIndexed(reg_ptr, As64BitReg(rl_index.reg), rl_result.reg, scale, size); + if (rl_result.ref) { + LoadRefIndexed(reg_ptr, As64BitReg(rl_index.reg), rl_result.reg); + } else { + LoadBaseIndexed(reg_ptr, As64BitReg(rl_index.reg), rl_result.reg, scale, size); + } MarkPossibleNullPointerException(opt_flags); FreeTemp(reg_ptr); StoreValue(rl_dest, rl_result); @@ -847,8 +855,11 @@ void Arm64Mir2Lir::GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, } FreeTemp(reg_len); } - - StoreBaseDisp(reg_ptr, data_offset, rl_src.reg, size); + if (rl_src.ref) { + StoreRefDisp(reg_ptr, data_offset, rl_src.reg, kNotVolatile); + } else { + StoreBaseDisp(reg_ptr, data_offset, rl_src.reg, size, kNotVolatile); + } MarkPossibleNullPointerException(opt_flags); } else { /* reg_ptr -> array data */ @@ -858,7 +869,11 @@ void Arm64Mir2Lir::GenArrayPut(int 
opt_flags, OpSize size, RegLocation rl_array, GenArrayBoundsCheck(rl_index.reg, reg_len); FreeTemp(reg_len); } - StoreBaseIndexed(reg_ptr, As64BitReg(rl_index.reg), rl_src.reg, scale, size); + if (rl_src.ref) { + StoreRefIndexed(reg_ptr, As64BitReg(rl_index.reg), rl_src.reg); + } else { + StoreBaseIndexed(reg_ptr, As64BitReg(rl_index.reg), rl_src.reg, scale, size); + } MarkPossibleNullPointerException(opt_flags); } if (allocated_reg_ptr_temp) { diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc index 06e1cda305..dfaa4837d2 100644 --- a/compiler/dex/quick/arm64/target_arm64.cc +++ b/compiler/dex/quick/arm64/target_arm64.cc @@ -789,7 +789,7 @@ RegStorage Arm64Mir2Lir::LoadHelper(ThreadOffset<4> offset) { RegStorage Arm64Mir2Lir::LoadHelper(ThreadOffset<8> offset) { // TODO(Arm64): use LoadWordDisp instead. // e.g. LoadWordDisp(rs_rA64_SELF, offset.Int32Value(), rs_rA64_LR); - LoadBaseDisp(rs_rA64_SELF, offset.Int32Value(), rs_rA64_LR, k64); + LoadBaseDisp(rs_rA64_SELF, offset.Int32Value(), rs_rA64_LR, k64, kNotVolatile); return rs_rA64_LR; } @@ -949,7 +949,7 @@ void Arm64Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) { StoreValue(rl_method, rl_src); // If Method* has been promoted, explicitly flush if (rl_method.location == kLocPhysReg) { - StoreRefDisp(TargetReg(kSp), 0, TargetReg(kArg0)); + StoreRefDisp(TargetReg(kSp), 0, TargetReg(kArg0), kNotVolatile); } if (cu_->num_ins == 0) { @@ -971,7 +971,7 @@ void Arm64Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) { } else if ((v_map->fp_location == kLocPhysReg) && t_loc->fp) { OpRegCopy(RegStorage::Solo32(v_map->FpReg), reg); } else { - StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, op_size); + StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, op_size, kNotVolatile); if (reg.Is64Bit()) { if (SRegOffset(start_vreg + i) + 4 != SRegOffset(start_vreg + i + 1)) { LOG(FATAL) << "64 bit value stored in non-consecutive 4 bytes slots"; @@ -1057,14 +1057,14 @@ int Arm64Mir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state, loc = UpdateLocWide(loc); if (loc.location == kLocPhysReg) { ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k64); + StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k64, kNotVolatile); } next_arg += 2; } else { loc = UpdateLoc(loc); if (loc.location == kLocPhysReg) { ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k32); + StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k32, kNotVolatile); } next_arg++; } @@ -1122,18 +1122,27 @@ int Arm64Mir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state, ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); if (rl_arg.wide) { if (rl_arg.location == kLocPhysReg) { - StoreBaseDisp(TargetReg(kSp), out_offset, rl_arg.reg, k64); + StoreBaseDisp(TargetReg(kSp), out_offset, rl_arg.reg, k64, kNotVolatile); } else { LoadValueDirectWideFixed(rl_arg, regWide); - StoreBaseDisp(TargetReg(kSp), out_offset, regWide, k64); + StoreBaseDisp(TargetReg(kSp), out_offset, regWide, k64, kNotVolatile); } i++; } else { if (rl_arg.location == kLocPhysReg) { - StoreBaseDisp(TargetReg(kSp), out_offset, rl_arg.reg, k32); + if (rl_arg.ref) { + StoreRefDisp(TargetReg(kSp), out_offset, rl_arg.reg, kNotVolatile); + } else { + StoreBaseDisp(TargetReg(kSp), out_offset, rl_arg.reg, k32, kNotVolatile); + } } else { 
- LoadValueDirectFixed(rl_arg, regSingle); - StoreBaseDisp(TargetReg(kSp), out_offset, regSingle, k32); + if (rl_arg.ref) { + LoadValueDirectFixed(rl_arg, regSingle); + StoreRefDisp(TargetReg(kSp), out_offset, regSingle, kNotVolatile); + } else { + LoadValueDirectFixed(rl_arg, As32BitReg(regSingle)); + StoreBaseDisp(TargetReg(kSp), out_offset, As32BitReg(regSingle), k32, kNotVolatile); + } } } } diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc index 672aa8884f..12c2f415d6 100644 --- a/compiler/dex/quick/arm64/utility_arm64.cc +++ b/compiler/dex/quick/arm64/utility_arm64.cc @@ -893,9 +893,7 @@ LIR* Arm64Mir2Lir::LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegSto ArmOpcode opcode = kA64Brk1d; DCHECK(r_base.Is64Bit()); // TODO: need a cleaner handling of index registers here and throughout. - if (r_index.Is32Bit()) { - r_index = As64BitReg(r_index); - } + r_index = Check32BitReg(r_index); if (r_dest.IsFloat()) { if (r_dest.IsDouble()) { @@ -918,12 +916,14 @@ LIR* Arm64Mir2Lir::LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegSto case kDouble: case kWord: case k64: + r_dest = Check64BitReg(r_dest); opcode = WIDE(kA64Ldr4rXxG); expected_scale = 3; break; case kSingle: case k32: case kReference: + r_dest = Check32BitReg(r_dest); opcode = kA64Ldr4rXxG; expected_scale = 2; break; @@ -959,6 +959,10 @@ LIR* Arm64Mir2Lir::LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegSto return load; } +LIR* Arm64Mir2Lir::LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest) { + return LoadBaseIndexed(r_base, r_index, As32BitReg(r_dest), 2, kReference); +} + LIR* Arm64Mir2Lir::StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale, OpSize size) { LIR* store; @@ -966,9 +970,7 @@ LIR* Arm64Mir2Lir::StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegSt ArmOpcode opcode = kA64Brk1d; DCHECK(r_base.Is64Bit()); // TODO: need a cleaner handling of index registers here and throughout. - if (r_index.Is32Bit()) { - r_index = As64BitReg(r_index); - } + r_index = Check32BitReg(r_index); if (r_src.IsFloat()) { if (r_src.IsDouble()) { @@ -991,12 +993,14 @@ LIR* Arm64Mir2Lir::StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegSt case kDouble: // Intentional fall-trough. case kWord: // Intentional fall-trough. case k64: + r_src = Check64BitReg(r_src); opcode = WIDE(kA64Str4rXxG); expected_scale = 3; break; case kSingle: // Intentional fall-trough. case k32: // Intentional fall-trough. case kReference: + r_src = Check32BitReg(r_src); opcode = kA64Str4rXxG; expected_scale = 2; break; @@ -1026,6 +1030,10 @@ LIR* Arm64Mir2Lir::StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegSt return store; } +LIR* Arm64Mir2Lir::StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src) { + return StoreBaseIndexed(r_base, r_index, As32BitReg(r_src), 2, kReference); +} + /* * Load value from base + displacement. Optionally perform null check * on base (which must have an associated s_reg and MIR). If not @@ -1042,6 +1050,7 @@ LIR* Arm64Mir2Lir::LoadBaseDispBody(RegStorage r_base, int displacement, RegStor case kDouble: // Intentional fall-through. case kWord: // Intentional fall-through. case k64: + r_dest = Check64BitReg(r_dest); scale = 3; if (r_dest.IsFloat()) { DCHECK(r_dest.IsDouble()); @@ -1055,6 +1064,7 @@ LIR* Arm64Mir2Lir::LoadBaseDispBody(RegStorage r_base, int displacement, RegStor case kSingle: // Intentional fall-through. case k32: // Intentional fall-trough. 
case kReference: + r_dest = Check32BitReg(r_dest); scale = 2; if (r_dest.IsFloat()) { DCHECK(r_dest.IsSingle()); @@ -1106,18 +1116,27 @@ LIR* Arm64Mir2Lir::LoadBaseDispBody(RegStorage r_base, int displacement, RegStor return load; } -LIR* Arm64Mir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) { +LIR* Arm64Mir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, + OpSize size, VolatileKind is_volatile) { // LoadBaseDisp() will emit correct insn for atomic load on arm64 // assuming r_dest is correctly prepared using RegClassForFieldLoadStore(). - return LoadBaseDisp(r_base, displacement, r_dest, size); -} -LIR* Arm64Mir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) { - return LoadBaseDispBody(r_base, displacement, r_dest, size); + LIR* load = LoadBaseDispBody(r_base, displacement, r_dest, size); + + if (UNLIKELY(is_volatile == kVolatile)) { + // Without context sensitive analysis, we must issue the most conservative barriers. + // In this case, either a load or store may follow so we issue both barriers. + GenMemBarrier(kLoadLoad); + GenMemBarrier(kLoadStore); + } + + return load; } +LIR* Arm64Mir2Lir::LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest, + VolatileKind is_volatile) { + return LoadBaseDisp(r_base, displacement, As32BitReg(r_dest), kReference, is_volatile); +} LIR* Arm64Mir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src, OpSize size) { @@ -1130,6 +1149,7 @@ LIR* Arm64Mir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, RegSto case kDouble: // Intentional fall-through. case kWord: // Intentional fall-through. case k64: + r_src = Check64BitReg(r_src); scale = 3; if (r_src.IsFloat()) { DCHECK(r_src.IsDouble()); @@ -1143,6 +1163,7 @@ LIR* Arm64Mir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, RegSto case kSingle: // Intentional fall-through. case k32: // Intentional fall-trough. case kReference: + r_src = Check32BitReg(r_src); scale = 2; if (r_src.IsFloat()) { DCHECK(r_src.IsSingle()); @@ -1188,16 +1209,29 @@ LIR* Arm64Mir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, RegSto return store; } -LIR* Arm64Mir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) { +LIR* Arm64Mir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, + OpSize size, VolatileKind is_volatile) { + if (UNLIKELY(is_volatile == kVolatile)) { + // There might have been a store before this volatile one so insert StoreStore barrier. + GenMemBarrier(kStoreStore); + } + // StoreBaseDisp() will emit correct insn for atomic store on arm64 // assuming r_dest is correctly prepared using RegClassForFieldLoadStore(). - return StoreBaseDisp(r_base, displacement, r_src, size); + + LIR* store = StoreBaseDispBody(r_base, displacement, r_src, size); + + if (UNLIKELY(is_volatile == kVolatile)) { + // A load might follow the volatile store so insert a StoreLoad barrier. 
+ GenMemBarrier(kStoreLoad); + } + + return store; } -LIR* Arm64Mir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) { - return StoreBaseDispBody(r_base, displacement, r_src, size); +LIR* Arm64Mir2Lir::StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src, + VolatileKind is_volatile) { + return StoreBaseDisp(r_base, displacement, As32BitReg(r_src), kReference, is_volatile); } LIR* Arm64Mir2Lir::OpFpRegCopy(RegStorage r_dest, RegStorage r_src) { diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc index ec0fb43571..f31b670164 100644 --- a/compiler/dex/quick/codegen_util.cc +++ b/compiler/dex/quick/codegen_util.cc @@ -79,6 +79,20 @@ void Mir2Lir::MarkSafepointPC(LIR* inst) { DCHECK(safepoint_pc->u.m.def_mask->Equals(kEncodeAll)); } +void Mir2Lir::MarkSafepointPCAfter(LIR* after) { + DCHECK(!after->flags.use_def_invalid); + after->u.m.def_mask = &kEncodeAll; + // As NewLIR0 uses Append, we need to create the LIR by hand. + LIR* safepoint_pc = RawLIR(current_dalvik_offset_, kPseudoSafepointPC); + if (after->next == nullptr) { + DCHECK_EQ(after, last_lir_insn_); + AppendLIR(safepoint_pc); + } else { + InsertLIRAfter(after, safepoint_pc); + } + DCHECK(safepoint_pc->u.m.def_mask->Equals(kEncodeAll)); +} + /* Remove a LIR from the list. */ void Mir2Lir::UnlinkLIR(LIR* lir) { if (UNLIKELY(lir == first_lir_insn_)) { @@ -1112,7 +1126,7 @@ void Mir2Lir::InsertLIRBefore(LIR* current_lir, LIR* new_lir) { /* * Insert an LIR instruction after the current instruction, which cannot be the - * first instruction. + * last instruction. * * current_lir -> new_lir -> old_next */ diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc index e36b592c74..b00cbeb61d 100644 --- a/compiler/dex/quick/gen_common.cc +++ b/compiler/dex/quick/gen_common.cc @@ -196,6 +196,15 @@ void Mir2Lir::MarkPossibleNullPointerException(int opt_flags) { } } +void Mir2Lir::MarkPossibleNullPointerExceptionAfter(int opt_flags, LIR* after) { + if (!cu_->compiler_driver->GetCompilerOptions().GetExplicitNullChecks()) { + if (!(cu_->disable_opt & (1 << kNullCheckElimination)) && (opt_flags & MIR_IGNORE_NULL_CHECK)) { + return; + } + MarkSafepointPCAfter(after); + } +} + void Mir2Lir::MarkPossibleStackOverflowException() { if (!cu_->compiler_driver->GetCompilerOptions().GetExplicitStackOverflowChecks()) { MarkSafepointPC(last_lir_insn_); @@ -506,7 +515,7 @@ void Mir2Lir::GenFilledNewArray(CallInfo* info) { for (int i = 0; i < elems; i++) { RegLocation rl_arg = LoadValue(info->args[i], kCoreReg); Store32Disp(TargetReg(kRet0), - mirror::Array::DataOffset(component_size).Int32Value() + i * 4, rl_arg.reg); + mirror::Array::DataOffset(component_size).Int32Value() + i * 4, rl_arg.reg); // If the LoadValue caused a temp to be allocated, free it if (IsTemp(rl_arg.reg)) { FreeTemp(rl_arg.reg); @@ -575,7 +584,8 @@ void Mir2Lir::GenSput(MIR* mir, RegLocation rl_src, bool is_long_or_double, // Fast path, static storage base is this method's class RegLocation rl_method = LoadCurrMethod(); r_base = AllocTempRef(); - LoadRefDisp(rl_method.reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), r_base); + LoadRefDisp(rl_method.reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), r_base, + kNotVolatile); if (IsTemp(rl_method.reg)) { FreeTemp(rl_method.reg); } @@ -592,9 +602,10 @@ void Mir2Lir::GenSput(MIR* mir, RegLocation rl_src, bool is_long_or_double, LoadCurrMethodDirect(r_method); r_base = TargetReg(kArg0); LockTemp(r_base); - 
LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base); + LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base, + kNotVolatile); int32_t offset_of_field = ObjArray::OffsetOfElement(field_info.StorageIndex()).Int32Value(); - LoadRefDisp(r_base, offset_of_field, r_base); + LoadRefDisp(r_base, offset_of_field, r_base, kNotVolatile); // r_base now points at static storage (Class*) or NULL if the type is not yet resolved. if (!field_info.IsInitialized() && (mir->optimization_flags & MIR_IGNORE_CLINIT_CHECK) == 0) { @@ -626,14 +637,12 @@ void Mir2Lir::GenSput(MIR* mir, RegLocation rl_src, bool is_long_or_double, } else { rl_src = LoadValue(rl_src, reg_class); } - if (field_info.IsVolatile()) { - // There might have been a store before this volatile one so insert StoreStore barrier. - GenMemBarrier(kStoreStore); - StoreBaseDispVolatile(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, store_size); - // A load might follow the volatile store so insert a StoreLoad barrier. - GenMemBarrier(kStoreLoad); + if (is_object) { + StoreRefDisp(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, + field_info.IsVolatile() ? kVolatile : kNotVolatile); } else { - StoreBaseDisp(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, store_size); + StoreBaseDisp(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg, store_size, + field_info.IsVolatile() ? kVolatile : kNotVolatile); } if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) { MarkGCCard(rl_src.reg, r_base); @@ -672,7 +681,8 @@ void Mir2Lir::GenSget(MIR* mir, RegLocation rl_dest, // Fast path, static storage base is this method's class RegLocation rl_method = LoadCurrMethod(); r_base = AllocTempRef(); - LoadRefDisp(rl_method.reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), r_base); + LoadRefDisp(rl_method.reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), r_base, + kNotVolatile); } else { // Medium path, static storage base in a different class which requires checks that the other // class is initialized @@ -685,9 +695,10 @@ void Mir2Lir::GenSget(MIR* mir, RegLocation rl_dest, LoadCurrMethodDirect(r_method); r_base = TargetReg(kArg0); LockTemp(r_base); - LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base); + LoadRefDisp(r_method, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), r_base, + kNotVolatile); int32_t offset_of_field = ObjArray::OffsetOfElement(field_info.StorageIndex()).Int32Value(); - LoadRefDisp(r_base, offset_of_field, r_base); + LoadRefDisp(r_base, offset_of_field, r_base, kNotVolatile); // r_base now points at static storage (Class*) or NULL if the type is not yet resolved. if (!field_info.IsInitialized() && (mir->optimization_flags & MIR_IGNORE_CLINIT_CHECK) == 0) { @@ -717,14 +728,12 @@ void Mir2Lir::GenSget(MIR* mir, RegLocation rl_dest, RegLocation rl_result = EvalLoc(rl_dest, reg_class, true); int field_offset = field_info.FieldOffset().Int32Value(); - if (field_info.IsVolatile()) { - LoadBaseDispVolatile(r_base, field_offset, rl_result.reg, load_size); - // Without context sensitive analysis, we must issue the most conservative barriers. - // In this case, either a load or store may follow so we issue both barriers. - GenMemBarrier(kLoadLoad); - GenMemBarrier(kLoadStore); + if (is_object) { + LoadRefDisp(r_base, field_offset, rl_result.reg, field_info.IsVolatile() ? 
kVolatile : + kNotVolatile); } else { - LoadBaseDisp(r_base, field_offset, rl_result.reg, load_size); + LoadBaseDisp(r_base, field_offset, rl_result.reg, load_size, field_info.IsVolatile() ? + kVolatile : kNotVolatile); } FreeTemp(r_base); @@ -785,17 +794,15 @@ void Mir2Lir::GenIGet(MIR* mir, int opt_flags, OpSize size, GenNullCheck(rl_obj.reg, opt_flags); RegLocation rl_result = EvalLoc(rl_dest, reg_class, true); int field_offset = field_info.FieldOffset().Int32Value(); - if (field_info.IsVolatile()) { - LoadBaseDispVolatile(rl_obj.reg, field_offset, rl_result.reg, load_size); - MarkPossibleNullPointerException(opt_flags); - // Without context sensitive analysis, we must issue the most conservative barriers. - // In this case, either a load or store may follow so we issue both barriers. - GenMemBarrier(kLoadLoad); - GenMemBarrier(kLoadStore); + LIR* load_lir; + if (is_object) { + load_lir = LoadRefDisp(rl_obj.reg, field_offset, rl_result.reg, field_info.IsVolatile() ? + kVolatile : kNotVolatile); } else { - LoadBaseDisp(rl_obj.reg, field_offset, rl_result.reg, load_size); - MarkPossibleNullPointerException(opt_flags); + load_lir = LoadBaseDisp(rl_obj.reg, field_offset, rl_result.reg, load_size, + field_info.IsVolatile() ? kVolatile : kNotVolatile); } + MarkPossibleNullPointerExceptionAfter(opt_flags, load_lir); if (is_long_or_double) { StoreValueWide(rl_dest, rl_result); } else { @@ -847,17 +854,15 @@ void Mir2Lir::GenIPut(MIR* mir, int opt_flags, OpSize size, } GenNullCheck(rl_obj.reg, opt_flags); int field_offset = field_info.FieldOffset().Int32Value(); - if (field_info.IsVolatile()) { - // There might have been a store before this volatile one so insert StoreStore barrier. - GenMemBarrier(kStoreStore); - StoreBaseDispVolatile(rl_obj.reg, field_offset, rl_src.reg, store_size); - MarkPossibleNullPointerException(opt_flags); - // A load might follow the volatile store so insert a StoreLoad barrier. - GenMemBarrier(kStoreLoad); + LIR* store; + if (is_object) { + store = StoreRefDisp(rl_obj.reg, field_offset, rl_src.reg, field_info.IsVolatile() ? + kVolatile : kNotVolatile); } else { - StoreBaseDisp(rl_obj.reg, field_offset, rl_src.reg, store_size); - MarkPossibleNullPointerException(opt_flags); + store = StoreBaseDisp(rl_obj.reg, field_offset, rl_src.reg, store_size, + field_info.IsVolatile() ? 
kVolatile : kNotVolatile); } + MarkPossibleNullPointerExceptionAfter(opt_flags, store); if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) { MarkGCCard(rl_src.reg, rl_obj.reg); } @@ -916,9 +921,9 @@ void Mir2Lir::GenConstClass(uint32_t type_idx, RegLocation rl_dest) { // We're don't need access checks, load type from dex cache int32_t dex_cache_offset = mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(); - LoadRefDisp(rl_method.reg, dex_cache_offset, res_reg); + LoadRefDisp(rl_method.reg, dex_cache_offset, res_reg, kNotVolatile); int32_t offset_of_type = ClassArray::OffsetOfElement(type_idx).Int32Value(); - LoadRefDisp(res_reg, offset_of_type, rl_result.reg); + LoadRefDisp(res_reg, offset_of_type, rl_result.reg, kNotVolatile); if (!cu_->compiler_driver->CanAssumeTypeIsPresentInDexCache(*cu_->dex_file, type_idx) || SLOW_TYPE_PATH) { // Slow path, at runtime test if type is null and if so initialize @@ -989,10 +994,10 @@ void Mir2Lir::GenConstString(uint32_t string_idx, RegLocation rl_dest) { LoadCurrMethodDirect(r_method); } LoadRefDisp(r_method, mirror::ArtMethod::DexCacheStringsOffset().Int32Value(), - TargetReg(kArg0)); + TargetReg(kArg0), kNotVolatile); // Might call out to helper, which will return resolved string in kRet0 - LoadRefDisp(TargetReg(kArg0), offset_of_string, TargetReg(kRet0)); + LoadRefDisp(TargetReg(kArg0), offset_of_string, TargetReg(kRet0), kNotVolatile); LIR* fromfast = OpCmpImmBranch(kCondEq, TargetReg(kRet0), 0, NULL); LIR* cont = NewLIR0(kPseudoTargetLabel); @@ -1031,8 +1036,9 @@ void Mir2Lir::GenConstString(uint32_t string_idx, RegLocation rl_dest) { RegLocation rl_method = LoadCurrMethod(); RegStorage res_reg = AllocTempRef(); RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true); - LoadRefDisp(rl_method.reg, mirror::ArtMethod::DexCacheStringsOffset().Int32Value(), res_reg); - LoadRefDisp(res_reg, offset_of_string, rl_result.reg); + LoadRefDisp(rl_method.reg, mirror::ArtMethod::DexCacheStringsOffset().Int32Value(), res_reg, + kNotVolatile); + LoadRefDisp(res_reg, offset_of_string, rl_result.reg, kNotVolatile); StoreValue(rl_dest, rl_result); } } @@ -1133,14 +1139,17 @@ void Mir2Lir::GenInstanceofFinal(bool use_declaring_class, uint32_t type_idx, Re LoadCurrMethodDirect(check_class); if (use_declaring_class) { - LoadRefDisp(check_class, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), check_class); - LoadRefDisp(object.reg, mirror::Object::ClassOffset().Int32Value(), object_class); + LoadRefDisp(check_class, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), check_class, + kNotVolatile); + LoadRefDisp(object.reg, mirror::Object::ClassOffset().Int32Value(), object_class, + kNotVolatile); } else { LoadRefDisp(check_class, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), - check_class); - LoadRefDisp(object.reg, mirror::Object::ClassOffset().Int32Value(), object_class); + check_class, kNotVolatile); + LoadRefDisp(object.reg, mirror::Object::ClassOffset().Int32Value(), object_class, + kNotVolatile); int32_t offset_of_type = ClassArray::OffsetOfElement(type_idx).Int32Value(); - LoadRefDisp(check_class, offset_of_type, check_class); + LoadRefDisp(check_class, offset_of_type, check_class, kNotVolatile); } LIR* ne_branchover = NULL; @@ -1196,14 +1205,14 @@ void Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_know } else if (use_declaring_class) { LoadValueDirectFixed(rl_src, TargetReg(kArg0)); // kArg0 <= ref LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DeclaringClassOffset().Int32Value(), - 
class_reg); + class_reg, kNotVolatile); } else { // Load dex cache entry into class_reg (kArg2) LoadValueDirectFixed(rl_src, TargetReg(kArg0)); // kArg0 <= ref LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), - class_reg); + class_reg, kNotVolatile); int32_t offset_of_type = ClassArray::OffsetOfElement(type_idx).Int32Value(); - LoadRefDisp(class_reg, offset_of_type, class_reg); + LoadRefDisp(class_reg, offset_of_type, class_reg, kNotVolatile); if (!can_assume_type_is_in_dex_cache) { // Need to test presence of type in dex cache at runtime LIR* hop_branch = OpCmpImmBranch(kCondNe, class_reg, 0, NULL); @@ -1231,7 +1240,8 @@ void Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_know /* load object->klass_ */ DCHECK_EQ(mirror::Object::ClassOffset().Int32Value(), 0); - LoadRefDisp(TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetReg(kArg1)); + LoadRefDisp(TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetReg(kArg1), + kNotVolatile); /* kArg0 is ref, kArg1 is ref->klass_, kArg2 is class */ LIR* branchover = NULL; if (type_known_final) { @@ -1344,13 +1354,13 @@ void Mir2Lir::GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_ OpRegCopy(class_reg, TargetReg(kRet0)); // Align usage with fast path } else if (use_declaring_class) { LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DeclaringClassOffset().Int32Value(), - class_reg); + class_reg, kNotVolatile); } else { // Load dex cache entry into class_reg (kArg2) LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), - class_reg); + class_reg, kNotVolatile); int32_t offset_of_type = ClassArray::OffsetOfElement(type_idx).Int32Value(); - LoadRefDisp(class_reg, offset_of_type, class_reg); + LoadRefDisp(class_reg, offset_of_type, class_reg, kNotVolatile); if (!cu_->compiler_driver->CanAssumeTypeIsPresentInDexCache(*cu_->dex_file, type_idx)) { // Need to test presence of type in dex cache at runtime LIR* hop_branch = OpCmpImmBranch(kCondEq, class_reg, 0, NULL); @@ -1405,7 +1415,7 @@ void Mir2Lir::GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_ if (load_) { m2l_->LoadRefDisp(m2l_->TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), - m2l_->TargetReg(kArg1)); + m2l_->TargetReg(kArg1), kNotVolatile); } if (m2l_->cu_->target64) { m2l_->CallRuntimeHelperRegReg(QUICK_ENTRYPOINT_OFFSET(8, pCheckCast), m2l_->TargetReg(kArg2), @@ -1436,7 +1446,8 @@ void Mir2Lir::GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_ LIR* branch1 = OpCmpImmBranch(kCondEq, TargetReg(kArg0), 0, NULL); /* load object->klass_ */ DCHECK_EQ(mirror::Object::ClassOffset().Int32Value(), 0); - LoadRefDisp(TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetReg(kArg1)); + LoadRefDisp(TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetReg(kArg1), + kNotVolatile); LIR* branch2 = OpCmpBranch(kCondNe, TargetReg(kArg1), class_reg, NULL); LIR* cont = NewLIR0(kPseudoTargetLabel); diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc index 638c590f2e..008ebfbd71 100644 --- a/compiler/dex/quick/gen_invoke.cc +++ b/compiler/dex/quick/gen_invoke.cc @@ -501,7 +501,7 @@ void Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) { StoreValue(rl_method, rl_src); // If Method* has been promoted, explicitly flush if (rl_method.location == kLocPhysReg) { - StoreRefDisp(TargetReg(kSp), 0, TargetReg(kArg0)); + StoreRefDisp(TargetReg(kSp), 0, 
TargetReg(kArg0), kNotVolatile); } if (cu_->num_ins == 0) { @@ -616,7 +616,8 @@ static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info, case 1: // Get method->dex_cache_resolved_methods_ cg->LoadRefDisp(cg->TargetReg(kArg0), mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(), - cg->TargetReg(kArg0)); + cg->TargetReg(kArg0), + kNotVolatile); // Set up direct code if known. if (direct_code != 0) { if (direct_code != static_cast<uintptr_t>(-1)) { @@ -631,7 +632,8 @@ static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info, CHECK_EQ(cu->dex_file, target_method.dex_file); cg->LoadRefDisp(cg->TargetReg(kArg0), ObjArray::OffsetOfElement(target_method.dex_method_index).Int32Value(), - cg->TargetReg(kArg0)); + cg->TargetReg(kArg0), + kNotVolatile); break; case 3: // Grab the code from the method* if (cu->instruction_set != kX86 && cu->instruction_set != kX86_64) { @@ -676,17 +678,20 @@ static int NextVCallInsn(CompilationUnit* cu, CallInfo* info, cg->GenNullCheck(cg->TargetReg(kArg1), info->opt_flags); // get this->klass_ [use kArg1, set kInvokeTgt] cg->LoadRefDisp(cg->TargetReg(kArg1), mirror::Object::ClassOffset().Int32Value(), - cg->TargetReg(kInvokeTgt)); + cg->TargetReg(kInvokeTgt), + kNotVolatile); cg->MarkPossibleNullPointerException(info->opt_flags); break; case 2: // Get this->klass_->vtable [usr kInvokeTgt, set kInvokeTgt] cg->LoadRefDisp(cg->TargetReg(kInvokeTgt), mirror::Class::VTableOffset().Int32Value(), - cg->TargetReg(kInvokeTgt)); + cg->TargetReg(kInvokeTgt), + kNotVolatile); break; case 3: // Get target method [use kInvokeTgt, set kArg0] cg->LoadRefDisp(cg->TargetReg(kInvokeTgt), ObjArray::OffsetOfElement(method_idx).Int32Value(), - cg->TargetReg(kArg0)); + cg->TargetReg(kArg0), + kNotVolatile); break; case 4: // Get the compiled code address [uses kArg0, sets kInvokeTgt] if (cu->instruction_set != kX86 && cu->instruction_set != kX86_64) { @@ -731,19 +736,22 @@ static int NextInterfaceCallInsn(CompilationUnit* cu, CallInfo* info, int state, cg->GenNullCheck(cg->TargetReg(kArg1), info->opt_flags); // Get this->klass_ [use kArg1, set kInvokeTgt] cg->LoadRefDisp(cg->TargetReg(kArg1), mirror::Object::ClassOffset().Int32Value(), - cg->TargetReg(kInvokeTgt)); + cg->TargetReg(kInvokeTgt), + kNotVolatile); cg->MarkPossibleNullPointerException(info->opt_flags); break; case 3: // Get this->klass_->imtable [use kInvokeTgt, set kInvokeTgt] // NOTE: native pointer. cg->LoadRefDisp(cg->TargetReg(kInvokeTgt), mirror::Class::ImTableOffset().Int32Value(), - cg->TargetReg(kInvokeTgt)); + cg->TargetReg(kInvokeTgt), + kNotVolatile); break; case 4: // Get target method [use kInvokeTgt, set kArg0] // NOTE: native pointer. 
cg->LoadRefDisp(cg->TargetReg(kInvokeTgt), ObjArray::OffsetOfElement(method_idx % ClassLinker::kImtSize).Int32Value(), - cg->TargetReg(kArg0)); + cg->TargetReg(kArg0), + kNotVolatile); break; case 5: // Get the compiled code address [use kArg0, set kInvokeTgt] if (cu->instruction_set != kX86 && cu->instruction_set != kX86_64) { @@ -967,7 +975,7 @@ int Mir2Lir::GenDalvikArgsNoRange(CallInfo* info, { ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); if (rl_arg.wide) { - StoreBaseDisp(TargetReg(kSp), outs_offset, arg_reg, k64); + StoreBaseDisp(TargetReg(kSp), outs_offset, arg_reg, k64, kNotVolatile); next_use += 2; } else { Store32Disp(TargetReg(kSp), outs_offset, arg_reg); @@ -1037,7 +1045,7 @@ int Mir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state, loc = UpdateLocWide(loc); if ((next_arg >= 2) && (loc.location == kLocPhysReg)) { ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k64); + StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k64, kNotVolatile); } next_arg += 2; } else { @@ -1307,7 +1315,7 @@ bool Mir2Lir::GenInlinedCharAt(CallInfo* info) { reg_off = AllocTemp(); reg_ptr = AllocTempRef(); Load32Disp(rl_obj.reg, offset_offset, reg_off); - LoadRefDisp(rl_obj.reg, value_offset, reg_ptr); + LoadRefDisp(rl_obj.reg, value_offset, reg_ptr, kNotVolatile); } if (rl_idx.is_const) { OpRegImm(kOpAdd, reg_off, mir_graph_->ConstantValue(rl_idx.orig_sreg)); @@ -1672,7 +1680,7 @@ bool Mir2Lir::GenInlinedUnsafeGet(CallInfo* info, } else { RegStorage rl_temp_offset = AllocTemp(); OpRegRegReg(kOpAdd, rl_temp_offset, rl_object.reg, rl_offset.reg); - LoadBaseDisp(rl_temp_offset, 0, rl_result.reg, k64); + LoadBaseDisp(rl_temp_offset, 0, rl_result.reg, k64, kNotVolatile); FreeTemp(rl_temp_offset); } } else { @@ -1719,7 +1727,7 @@ bool Mir2Lir::GenInlinedUnsafePut(CallInfo* info, bool is_long, } else { RegStorage rl_temp_offset = AllocTemp(); OpRegRegReg(kOpAdd, rl_temp_offset, rl_object.reg, rl_offset.reg); - StoreBaseDisp(rl_temp_offset, 0, rl_value.reg, k64); + StoreBaseDisp(rl_temp_offset, 0, rl_value.reg, k64, kNotVolatile); FreeTemp(rl_temp_offset); } } else { diff --git a/compiler/dex/quick/gen_loadstore.cc b/compiler/dex/quick/gen_loadstore.cc index 6469d9c4f1..bfb77fc222 100644 --- a/compiler/dex/quick/gen_loadstore.cc +++ b/compiler/dex/quick/gen_loadstore.cc @@ -66,7 +66,7 @@ void Mir2Lir::Workaround7250540(RegLocation rl_dest, RegStorage zero_reg) { } else { // Lives in the frame, need to store. 
ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), temp_reg, k32); + StoreBaseDisp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), temp_reg, k32, kNotVolatile); } if (!zero_reg.Valid()) { FreeTemp(temp_reg); @@ -93,7 +93,7 @@ void Mir2Lir::LoadValueDirect(RegLocation rl_src, RegStorage r_dest) { (rl_src.location == kLocCompilerTemp)); ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); if (rl_src.ref) { - LoadRefDisp(TargetReg(kSp), SRegOffset(rl_src.s_reg_low), r_dest); + LoadRefDisp(TargetReg(kSp), SRegOffset(rl_src.s_reg_low), r_dest, kNotVolatile); } else { Load32Disp(TargetReg(kSp), SRegOffset(rl_src.s_reg_low), r_dest); } @@ -126,7 +126,7 @@ void Mir2Lir::LoadValueDirectWide(RegLocation rl_src, RegStorage r_dest) { DCHECK((rl_src.location == kLocDalvikFrame) || (rl_src.location == kLocCompilerTemp)); ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - LoadBaseDisp(TargetReg(kSp), SRegOffset(rl_src.s_reg_low), r_dest, k64); + LoadBaseDisp(TargetReg(kSp), SRegOffset(rl_src.s_reg_low), r_dest, k64, kNotVolatile); } } @@ -214,7 +214,11 @@ void Mir2Lir::StoreValue(RegLocation rl_dest, RegLocation rl_src) { if (IsDirty(rl_dest.reg) && LiveOut(rl_dest.s_reg_low)) { def_start = last_lir_insn_; ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - Store32Disp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), rl_dest.reg); + if (rl_dest.ref) { + StoreRefDisp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), rl_dest.reg, kNotVolatile); + } else { + Store32Disp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), rl_dest.reg); + } MarkClean(rl_dest); def_end = last_lir_insn_; if (!rl_dest.ref) { @@ -301,7 +305,7 @@ void Mir2Lir::StoreValueWide(RegLocation rl_dest, RegLocation rl_src) { DCHECK_EQ((mir_graph_->SRegToVReg(rl_dest.s_reg_low)+1), mir_graph_->SRegToVReg(GetSRegHi(rl_dest.s_reg_low))); ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), rl_dest.reg, k64); + StoreBaseDisp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), rl_dest.reg, k64, kNotVolatile); MarkClean(rl_dest); def_end = last_lir_insn_; MarkDefWide(rl_dest, def_start, def_end); @@ -365,7 +369,7 @@ void Mir2Lir::StoreFinalValueWide(RegLocation rl_dest, RegLocation rl_src) { DCHECK_EQ((mir_graph_->SRegToVReg(rl_dest.s_reg_low)+1), mir_graph_->SRegToVReg(GetSRegHi(rl_dest.s_reg_low))); ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), rl_dest.reg, k64); + StoreBaseDisp(TargetReg(kSp), SRegOffset(rl_dest.s_reg_low), rl_dest.reg, k64, kNotVolatile); MarkClean(rl_dest); LIR *def_end = last_lir_insn_; MarkDefWide(rl_dest, def_start, def_end); diff --git a/compiler/dex/quick/mips/call_mips.cc b/compiler/dex/quick/mips/call_mips.cc index e53105fc84..26ea6a8ec7 100644 --- a/compiler/dex/quick/mips/call_mips.cc +++ b/compiler/dex/quick/mips/call_mips.cc @@ -264,9 +264,9 @@ void MipsMir2Lir::GenMoveException(RegLocation rl_dest) { int ex_offset = Thread::ExceptionOffset<4>().Int32Value(); RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true); RegStorage reset_reg = AllocTempRef(); - LoadRefDisp(rs_rMIPS_SELF, ex_offset, rl_result.reg); + LoadRefDisp(rs_rMIPS_SELF, ex_offset, rl_result.reg, kNotVolatile); LoadConstant(reset_reg, 0); - StoreRefDisp(rs_rMIPS_SELF, ex_offset, reset_reg); + StoreRefDisp(rs_rMIPS_SELF, ex_offset, reset_reg, kNotVolatile); FreeTemp(reset_reg); StoreValue(rl_dest, 
rl_result); } diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h index 571adaccc1..c0ad9161f6 100644 --- a/compiler/dex/quick/mips/codegen_mips.h +++ b/compiler/dex/quick/mips/codegen_mips.h @@ -33,20 +33,16 @@ class MipsMir2Lir FINAL : public Mir2Lir { LIR* CheckSuspendUsingLoad() OVERRIDE; RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE; RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE; - LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; + OpSize size, VolatileKind is_volatile) OVERRIDE; LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale, OpSize size) OVERRIDE; LIR* LoadBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, RegStorage r_dest, OpSize size) OVERRIDE; LIR* LoadConstantNoClobber(RegStorage r_dest, int value); LIR* LoadConstantWide(RegStorage r_dest, int64_t value); - LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) OVERRIDE; LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) OVERRIDE; + OpSize size, VolatileKind is_volatile) OVERRIDE; LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale, OpSize size) OVERRIDE; LIR* StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc index beaf6bb8ea..903a7709ca 100644 --- a/compiler/dex/quick/mips/int_mips.cc +++ b/compiler/dex/quick/mips/int_mips.cc @@ -294,7 +294,7 @@ bool MipsMir2Lir::GenInlinedPeek(CallInfo* info, OpSize size) { RegLocation rl_address = LoadValue(rl_src_address, kCoreReg); RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true); DCHECK(size == kSignedByte); - LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size); + LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile); StoreValue(rl_dest, rl_result); return true; } @@ -310,7 +310,7 @@ bool MipsMir2Lir::GenInlinedPoke(CallInfo* info, OpSize size) { RegLocation rl_address = LoadValue(rl_src_address, kCoreReg); DCHECK(size == kSignedByte); RegLocation rl_value = LoadValue(rl_src_value, kCoreReg); - StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size); + StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile); return true; } @@ -524,7 +524,7 @@ void MipsMir2Lir::GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array, GenArrayBoundsCheck(rl_index.reg, reg_len); FreeTemp(reg_len); } - LoadBaseDisp(reg_ptr, 0, rl_result.reg, size); + LoadBaseDisp(reg_ptr, 0, rl_result.reg, size, kNotVolatile); FreeTemp(reg_ptr); StoreValueWide(rl_dest, rl_result); @@ -602,7 +602,7 @@ void MipsMir2Lir::GenArrayPut(int opt_flags, OpSize size, RegLocation rl_array, FreeTemp(reg_len); } - StoreBaseDisp(reg_ptr, 0, rl_src.reg, size); + StoreBaseDisp(reg_ptr, 0, rl_src.reg, size, kNotVolatile); } else { rl_src = LoadValue(rl_src, reg_class); if (needs_range_check) { diff --git a/compiler/dex/quick/mips/utility_mips.cc b/compiler/dex/quick/mips/utility_mips.cc index 01b25f9291..b49f43617f 100644 --- a/compiler/dex/quick/mips/utility_mips.cc +++ b/compiler/dex/quick/mips/utility_mips.cc @@ -546,23 +546,31 @@ LIR* MipsMir2Lir::LoadBaseDispBody(RegStorage r_base, int displacement, RegStora return load; } -LIR* MipsMir2Lir::LoadBaseDispVolatile(RegStorage 
r_base, int displacement, RegStorage r_dest, - OpSize size) { - DCHECK(size != k64 && size != kDouble); - return LoadBaseDisp(r_base, displacement, r_dest, size); -} - LIR* MipsMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) { + OpSize size, VolatileKind is_volatile) { + if (is_volatile == kVolatile) { + DCHECK(size != k64 && size != kDouble); + } + // TODO: base this on target. if (size == kWord) { size = k32; } + LIR* load; if (size == k64 || size == kDouble) { - return LoadBaseDispBody(r_base, displacement, r_dest.GetLow(), r_dest.GetHigh(), size); + load = LoadBaseDispBody(r_base, displacement, r_dest.GetLow(), r_dest.GetHigh(), size); } else { - return LoadBaseDispBody(r_base, displacement, r_dest, RegStorage::InvalidReg(), size); + load = LoadBaseDispBody(r_base, displacement, r_dest, RegStorage::InvalidReg(), size); } + + if (UNLIKELY(is_volatile == kVolatile)) { + // Without context sensitive analysis, we must issue the most conservative barriers. + // In this case, either a load or store may follow so we issue both barriers. + GenMemBarrier(kLoadLoad); + GenMemBarrier(kLoadStore); + } + + return load; } // FIXME: don't split r_dest into 2 containers. @@ -648,23 +656,31 @@ LIR* MipsMir2Lir::StoreBaseDispBody(RegStorage r_base, int displacement, return res; } -LIR* MipsMir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) { - DCHECK(size != k64 && size != kDouble); - return StoreBaseDisp(r_base, displacement, r_src, size); -} - LIR* MipsMir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) { + OpSize size, VolatileKind is_volatile) { + if (is_volatile == kVolatile) { + DCHECK(size != k64 && size != kDouble); + // There might have been a store before this volatile one so insert StoreStore barrier. + GenMemBarrier(kStoreStore); + } + // TODO: base this on target. if (size == kWord) { size = k32; } + LIR* store; if (size == k64 || size == kDouble) { - return StoreBaseDispBody(r_base, displacement, r_src.GetLow(), r_src.GetHigh(), size); + store = StoreBaseDispBody(r_base, displacement, r_src.GetLow(), r_src.GetHigh(), size); } else { - return StoreBaseDispBody(r_base, displacement, r_src, RegStorage::InvalidReg(), size); + store = StoreBaseDispBody(r_base, displacement, r_src, RegStorage::InvalidReg(), size); } + + if (UNLIKELY(is_volatile == kVolatile)) { + // A load might follow the volatile store so insert a StoreLoad barrier. + GenMemBarrier(kStoreLoad); + } + + return store; } LIR* MipsMir2Lir::OpThreadMem(OpKind op, ThreadOffset<4> thread_offset) { diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc index 1fc416301c..5d68187d8b 100644 --- a/compiler/dex/quick/mir_to_lir.cc +++ b/compiler/dex/quick/mir_to_lir.cc @@ -92,7 +92,7 @@ RegStorage Mir2Lir::LoadArg(int in_position, RegisterClass reg_class, bool wide) if (!reg_arg.Valid()) { RegStorage new_reg = wide ? AllocTypedTempWide(false, reg_class) : AllocTypedTemp(false, reg_class); - LoadBaseDisp(TargetReg(kSp), offset, new_reg, wide ? k64 : k32); + LoadBaseDisp(TargetReg(kSp), offset, new_reg, wide ? k64 : k32, kNotVolatile); return new_reg; } else { // Check if we need to copy the arg to a different reg_class. @@ -120,7 +120,7 @@ RegStorage Mir2Lir::LoadArg(int in_position, RegisterClass reg_class, bool wide) // If the low part is not in a reg, we allocate a pair. Otherwise, we just load to high reg. 
if (!reg_arg_low.Valid()) { RegStorage new_regs = AllocTypedTempWide(false, reg_class); - LoadBaseDisp(TargetReg(kSp), offset, new_regs, k64); + LoadBaseDisp(TargetReg(kSp), offset, new_regs, k64, kNotVolatile); return new_regs; // The reg_class is OK, we can return. } else { // Assume that no ABI allows splitting a wide fp reg between a narrow fp reg and memory, @@ -193,7 +193,7 @@ void Mir2Lir::LoadArgDirect(int in_position, RegLocation rl_dest) { if (reg.Valid()) { OpRegCopy(rl_dest.reg, reg); } else { - LoadBaseDisp(TargetReg(kSp), offset, rl_dest.reg, k64); + LoadBaseDisp(TargetReg(kSp), offset, rl_dest.reg, k64, kNotVolatile); } return; } @@ -211,7 +211,7 @@ void Mir2Lir::LoadArgDirect(int in_position, RegLocation rl_dest) { OpRegCopy(rl_dest.reg.GetHigh(), reg_arg_high); Load32Disp(TargetReg(kSp), offset, rl_dest.reg.GetLow()); } else { - LoadBaseDisp(TargetReg(kSp), offset, rl_dest.reg, k64); + LoadBaseDisp(TargetReg(kSp), offset, rl_dest.reg, k64, kNotVolatile); } } } @@ -243,14 +243,11 @@ bool Mir2Lir::GenSpecialIGet(MIR* mir, const InlineMethod& special) { r_result = wide ? AllocTypedTempWide(rl_dest.fp, reg_class) : AllocTypedTemp(rl_dest.fp, reg_class); } - if (data.is_volatile) { - LoadBaseDispVolatile(reg_obj, data.field_offset, r_result, size); - // Without context sensitive analysis, we must issue the most conservative barriers. - // In this case, either a load or store may follow so we issue both barriers. - GenMemBarrier(kLoadLoad); - GenMemBarrier(kLoadStore); + if (ref) { + LoadRefDisp(reg_obj, data.field_offset, r_result, data.is_volatile ? kVolatile : kNotVolatile); } else { - LoadBaseDisp(reg_obj, data.field_offset, r_result, size); + LoadBaseDisp(reg_obj, data.field_offset, r_result, size, data.is_volatile ? kVolatile : + kNotVolatile); } if (r_result != rl_dest.reg) { if (wide) { @@ -288,14 +285,11 @@ bool Mir2Lir::GenSpecialIPut(MIR* mir, const InlineMethod& special) { RegStorage reg_obj = LoadArg(data.object_arg, kRefReg); RegisterClass reg_class = RegClassForFieldLoadStore(size, data.is_volatile); RegStorage reg_src = LoadArg(data.src_arg, reg_class, wide); - if (data.is_volatile) { - // There might have been a store before this volatile one so insert StoreStore barrier. - GenMemBarrier(kStoreStore); - StoreBaseDispVolatile(reg_obj, data.field_offset, reg_src, size); - // A load might follow the volatile store so insert a StoreLoad barrier. - GenMemBarrier(kStoreLoad); + if (ref) { + StoreRefDisp(reg_obj, data.field_offset, reg_src, data.is_volatile ? kVolatile : kNotVolatile); } else { - StoreBaseDisp(reg_obj, data.field_offset, reg_src, size); + StoreBaseDisp(reg_obj, data.field_offset, reg_src, size, data.is_volatile ? 
kVolatile : + kNotVolatile); } if (ref) { MarkGCCard(reg_src, reg_obj); diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h index f70087d451..b07c85e2c3 100644 --- a/compiler/dex/quick/mir_to_lir.h +++ b/compiler/dex/quick/mir_to_lir.h @@ -663,6 +663,7 @@ class Mir2Lir : public Backend { virtual void Materialize(); virtual CompiledMethod* GetCompiledMethod(); void MarkSafepointPC(LIR* inst); + void MarkSafepointPCAfter(LIR* after); void SetupResourceMasks(LIR* lir); void SetMemRefType(LIR* lir, bool is_load, int mem_type); void AnnotateDalvikRegAccess(LIR* lir, int reg_id, bool is_load, bool is64bit); @@ -830,6 +831,7 @@ class Mir2Lir : public Backend { void GenArrayBoundsCheck(int32_t index, RegStorage length); LIR* GenNullCheck(RegStorage reg); void MarkPossibleNullPointerException(int opt_flags); + void MarkPossibleNullPointerExceptionAfter(int opt_flags, LIR* after); void MarkPossibleStackOverflowException(); void ForceImplicitNullCheck(RegStorage reg, int opt_flags); LIR* GenImmedCheck(ConditionCode c_code, RegStorage reg, int imm_val, ThrowKind kind); @@ -1007,15 +1009,20 @@ class Mir2Lir : public Backend { virtual LIR* LoadConstant(RegStorage r_dest, int value); // Natural word size. virtual LIR* LoadWordDisp(RegStorage r_base, int displacement, RegStorage r_dest) { - return LoadBaseDisp(r_base, displacement, r_dest, kWord); + return LoadBaseDisp(r_base, displacement, r_dest, kWord, kNotVolatile); } // Load 32 bits, regardless of target. virtual LIR* Load32Disp(RegStorage r_base, int displacement, RegStorage r_dest) { - return LoadBaseDisp(r_base, displacement, r_dest, k32); + return LoadBaseDisp(r_base, displacement, r_dest, k32, kNotVolatile); } // Load a reference at base + displacement and decompress into register. - virtual LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest) { - return LoadBaseDisp(r_base, displacement, r_dest, kReference); + virtual LIR* LoadRefDisp(RegStorage r_base, int displacement, RegStorage r_dest, + VolatileKind is_volatile) { + return LoadBaseDisp(r_base, displacement, r_dest, kReference, is_volatile); + } + // Load a reference at base + index and decompress into register. + virtual LIR* LoadRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest) { + return LoadBaseIndexed(r_base, r_index, r_dest, 2, kReference); } // Load Dalvik value with 32-bit memory storage. If compressed object reference, decompress. virtual RegLocation LoadValue(RegLocation rl_src, RegisterClass op_kind); @@ -1033,15 +1040,20 @@ class Mir2Lir : public Backend { virtual void LoadValueDirectWideFixed(RegLocation rl_src, RegStorage r_dest); // Store an item of natural word size. virtual LIR* StoreWordDisp(RegStorage r_base, int displacement, RegStorage r_src) { - return StoreBaseDisp(r_base, displacement, r_src, kWord); + return StoreBaseDisp(r_base, displacement, r_src, kWord, kNotVolatile); } // Store an uncompressed reference into a compressed 32-bit container. - virtual LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src) { - return StoreBaseDisp(r_base, displacement, r_src, kReference); + virtual LIR* StoreRefDisp(RegStorage r_base, int displacement, RegStorage r_src, + VolatileKind is_volatile) { + return StoreBaseDisp(r_base, displacement, r_src, kReference, is_volatile); + } + // Store an uncompressed reference into a compressed 32-bit container by index. 
+ virtual LIR* StoreRefIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src) { + return StoreBaseIndexed(r_base, r_index, r_src, 2, kReference); } // Store 32 bits, regardless of target. virtual LIR* Store32Disp(RegStorage r_base, int displacement, RegStorage r_src) { - return StoreBaseDisp(r_base, displacement, r_src, k32); + return StoreBaseDisp(r_base, displacement, r_src, k32, kNotVolatile); } /** @@ -1144,20 +1156,16 @@ class Mir2Lir : public Backend { virtual RegStorage LoadHelper(ThreadOffset<4> offset) = 0; virtual RegStorage LoadHelper(ThreadOffset<8> offset) = 0; - virtual LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) = 0; virtual LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) = 0; + OpSize size, VolatileKind is_volatile) = 0; virtual LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale, OpSize size) = 0; virtual LIR* LoadBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, RegStorage r_dest, OpSize size) = 0; virtual LIR* LoadConstantNoClobber(RegStorage r_dest, int value) = 0; virtual LIR* LoadConstantWide(RegStorage r_dest, int64_t value) = 0; - virtual LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) = 0; virtual LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) = 0; + OpSize size, VolatileKind is_volatile) = 0; virtual LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale, OpSize size) = 0; virtual LIR* StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc index 5bb0ee04d4..60eebe4a25 100644 --- a/compiler/dex/quick/ralloc_util.cc +++ b/compiler/dex/quick/ralloc_util.cc @@ -735,7 +735,7 @@ void Mir2Lir::FlushRegWide(RegStorage reg) { } int v_reg = mir_graph_->SRegToVReg(info1->SReg()); ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), VRegOffset(v_reg), reg, k64); + StoreBaseDisp(TargetReg(kSp), VRegOffset(v_reg), reg, k64, kNotVolatile); } } else { RegisterInfo* info = GetRegInfo(reg); @@ -743,7 +743,7 @@ void Mir2Lir::FlushRegWide(RegStorage reg) { info->SetIsDirty(false); int v_reg = mir_graph_->SRegToVReg(info->SReg()); ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), VRegOffset(v_reg), reg, k64); + StoreBaseDisp(TargetReg(kSp), VRegOffset(v_reg), reg, k64, kNotVolatile); } } } @@ -755,7 +755,7 @@ void Mir2Lir::FlushReg(RegStorage reg) { info->SetIsDirty(false); int v_reg = mir_graph_->SRegToVReg(info->SReg()); ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), VRegOffset(v_reg), reg, kWord); + StoreBaseDisp(TargetReg(kSp), VRegOffset(v_reg), reg, kWord, kNotVolatile); } } diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc index 3f54798b7e..f06f08ee52 100644 --- a/compiler/dex/quick/x86/assemble_x86.cc +++ b/compiler/dex/quick/x86/assemble_x86.cc @@ -325,11 +325,21 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, { kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", 
"!0r,[!1r+!2r<<!3d+!4d]" } +#define EXT_0F_REX_NO_PREFIX_ENCODING_MAP(opname, opcode, reg_def) \ +{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \ +{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ +{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { REX, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" } + #define EXT_0F_REX_W_ENCODING_MAP(opname, prefix, opcode, reg_def) \ { kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \ { kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { prefix, REX_W, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" } +#define EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(opname, opcode, reg_def) \ +{ kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \ +{ kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ +{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP | reg_def | REG_USE12, { REX_W, 0x00, 0x0F, opcode, 0, 0, 0, 0, false }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" } + #define EXT_0F_ENCODING2_MAP(opname, prefix, opcode, opcode2, reg_def) \ { kX86 ## opname ## RR, kRegReg, IS_BINARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0, false }, #opname "RR", "!0r,!1r" }, \ { kX86 ## opname ## RM, kRegMem, IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0, false }, #opname "RM", "!0r,[!1r+!2d]" }, \ @@ -481,6 +491,10 @@ ENCODING_MAP(Cmp, IS_LOAD, 0, 0, EXT_0F_ENCODING_MAP(Movzx16, 0x00, 0xB7, REG_DEF0), EXT_0F_ENCODING_MAP(Movsx8, 0x00, 0xBE, REG_DEF0), EXT_0F_ENCODING_MAP(Movsx16, 0x00, 0xBF, REG_DEF0), + EXT_0F_REX_NO_PREFIX_ENCODING_MAP(Movzx8q, 0xB6, REG_DEF0), + EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(Movzx16q, 0xB7, REG_DEF0), + EXT_0F_REX_NO_PREFIX_ENCODING_MAP(Movsx8q, 0xBE, REG_DEF0), + EXT_0F_REX_W_NO_PREFIX_ENCODING_MAP(Movsx16q, 0xBF, REG_DEF0), #undef EXT_0F_ENCODING_MAP { kX86Jcc8, kJcc, IS_BINARY_OP | IS_BRANCH | NEEDS_FIXUP | USES_CCODES, { 0, 0, 0x70, 0, 0, 0, 0, 0, false }, "Jcc8", "!1c !0t" }, @@ -827,7 +841,8 @@ void X86Mir2Lir::CheckValidByteRegister(const X86EncodingMap* entry, int32_t raw CHECK(strchr(entry->name, '8') != nullptr) << entry->name; } else { if (entry->skeleton.immediate_bytes != 1) { // Ignore ...I8 instructions. 
- if (!StartsWith(entry->name, "Movzx8") && !StartsWith(entry->name, "Movsx8")) { + if (!StartsWith(entry->name, "Movzx8") && !StartsWith(entry->name, "Movsx8") + && !StartsWith(entry->name, "Movzx8q") && !StartsWith(entry->name, "Movsx8q")) { CHECK(strchr(entry->name, '8') == nullptr) << entry->name; } } diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc index 28195aba36..425caec177 100644 --- a/compiler/dex/quick/x86/call_x86.cc +++ b/compiler/dex/quick/x86/call_x86.cc @@ -295,7 +295,8 @@ void X86Mir2Lir::GenEntrySequence(RegLocation* ArgLocs, RegLocation rl_method) { setup_method_address_[0] = NewLIR1(kX86StartOfMethod, rs_rX86_ARG0.GetReg()); int displacement = SRegOffset(base_of_code_->s_reg_low); // Native pointer - must be natural word size. - setup_method_address_[1] = StoreBaseDisp(rs_rX86_SP, displacement, rs_rX86_ARG0, Gen64Bit() ? k64 : k32); + setup_method_address_[1] = StoreBaseDisp(rs_rX86_SP, displacement, rs_rX86_ARG0, + Gen64Bit() ? k64 : k32, kNotVolatile); } FreeTemp(rs_rX86_ARG0); diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h index d482e58521..70382c746a 100644 --- a/compiler/dex/quick/x86/codegen_x86.h +++ b/compiler/dex/quick/x86/codegen_x86.h @@ -68,20 +68,16 @@ class X86Mir2Lir : public Mir2Lir { LIR* CheckSuspendUsingLoad() OVERRIDE; RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE; RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE; - LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) OVERRIDE; + OpSize size, VolatileKind is_volatile) OVERRIDE; LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale, OpSize size) OVERRIDE; LIR* LoadBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, RegStorage r_dest, OpSize size) OVERRIDE; LIR* LoadConstantNoClobber(RegStorage r_dest, int value); LIR* LoadConstantWide(RegStorage r_dest, int64_t value); - LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) OVERRIDE; LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, - OpSize size) OVERRIDE; + OpSize size, VolatileKind is_volatile) OVERRIDE; LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale, OpSize size) OVERRIDE; LIR* StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement, diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc index 1f05ab9cf4..f854adb175 100644 --- a/compiler/dex/quick/x86/fp_x86.cc +++ b/compiler/dex/quick/x86/fp_x86.cc @@ -144,7 +144,7 @@ void X86Mir2Lir::GenLongToFP(RegLocation rl_dest, RegLocation rl_src, bool is_do } else { // It must have been register promoted if it is not a temp but is still in physical // register. Since we need it to be in memory to convert, we place it there now. 
- StoreBaseDisp(TargetReg(kSp), src_v_reg_offset, rl_src.reg, k64); + StoreBaseDisp(TargetReg(kSp), src_v_reg_offset, rl_src.reg, k64, kNotVolatile); } } @@ -178,7 +178,7 @@ void X86Mir2Lir::GenLongToFP(RegLocation rl_dest, RegLocation rl_src, bool is_do */ rl_result = EvalLoc(rl_dest, kFPReg, true); if (is_double) { - LoadBaseDisp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg, k64); + LoadBaseDisp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg, k64, kNotVolatile); StoreFinalValueWide(rl_dest, rl_result); } else { @@ -221,7 +221,7 @@ void X86Mir2Lir::GenConversion(Instruction::Code opcode, RegLocation rl_dest, LoadConstant(rl_result.reg, 0x7fffffff); NewLIR2(kX86Cvtsi2ssRR, temp_reg.GetReg(), rl_result.reg.GetReg()); NewLIR2(kX86ComissRR, rl_src.reg.GetReg(), temp_reg.GetReg()); - LIR* branch_pos_overflow = NewLIR2(kX86Jcc8, 0, kX86CondA); + LIR* branch_pos_overflow = NewLIR2(kX86Jcc8, 0, kX86CondAe); LIR* branch_na_n = NewLIR2(kX86Jcc8, 0, kX86CondP); NewLIR2(kX86Cvttss2siRR, rl_result.reg.GetReg(), rl_src.reg.GetReg()); LIR* branch_normal = NewLIR1(kX86Jmp8, 0); @@ -242,7 +242,7 @@ void X86Mir2Lir::GenConversion(Instruction::Code opcode, RegLocation rl_dest, LoadConstant(rl_result.reg, 0x7fffffff); NewLIR2(kX86Cvtsi2sdRR, temp_reg.GetReg(), rl_result.reg.GetReg()); NewLIR2(kX86ComisdRR, rl_src.reg.GetReg(), temp_reg.GetReg()); - LIR* branch_pos_overflow = NewLIR2(kX86Jcc8, 0, kX86CondA); + LIR* branch_pos_overflow = NewLIR2(kX86Jcc8, 0, kX86CondAe); LIR* branch_na_n = NewLIR2(kX86Jcc8, 0, kX86CondP); NewLIR2(kX86Cvttsd2siRR, rl_result.reg.GetReg(), rl_src.reg.GetReg()); LIR* branch_normal = NewLIR1(kX86Jmp8, 0); @@ -281,7 +281,7 @@ void X86Mir2Lir::GenConversion(Instruction::Code opcode, RegLocation rl_dest, LoadConstantWide(rl_result.reg, 0x7fffffffffffffff); NewLIR2(kX86Cvtsqi2ssRR, temp_reg.GetReg(), rl_result.reg.GetReg()); NewLIR2(kX86ComissRR, rl_src.reg.GetReg(), temp_reg.GetReg()); - LIR* branch_pos_overflow = NewLIR2(kX86Jcc8, 0, kX86CondA); + LIR* branch_pos_overflow = NewLIR2(kX86Jcc8, 0, kX86CondAe); LIR* branch_na_n = NewLIR2(kX86Jcc8, 0, kX86CondP); NewLIR2(kX86Cvttss2sqiRR, rl_result.reg.GetReg(), rl_src.reg.GetReg()); LIR* branch_normal = NewLIR1(kX86Jmp8, 0); @@ -306,7 +306,7 @@ void X86Mir2Lir::GenConversion(Instruction::Code opcode, RegLocation rl_dest, LoadConstantWide(rl_result.reg, 0x7fffffffffffffff); NewLIR2(kX86Cvtsqi2sdRR, temp_reg.GetReg(), rl_result.reg.GetReg()); NewLIR2(kX86ComisdRR, rl_src.reg.GetReg(), temp_reg.GetReg()); - LIR* branch_pos_overflow = NewLIR2(kX86Jcc8, 0, kX86CondA); + LIR* branch_pos_overflow = NewLIR2(kX86Jcc8, 0, kX86CondAe); LIR* branch_na_n = NewLIR2(kX86Jcc8, 0, kX86CondP); NewLIR2(kX86Cvttsd2sqiRR, rl_result.reg.GetReg(), rl_src.reg.GetReg()); LIR* branch_normal = NewLIR1(kX86Jmp8, 0); @@ -363,7 +363,8 @@ void X86Mir2Lir::GenRemFP(RegLocation rl_dest, RegLocation rl_src1, RegLocation } else { // It must have been register promoted if it is not a temp but is still in physical // register. Since we need it to be in memory to convert, we place it there now. - StoreBaseDisp(TargetReg(kSp), src1_v_reg_offset, rl_src1.reg, is_double ? k64 : k32); + StoreBaseDisp(TargetReg(kSp), src1_v_reg_offset, rl_src1.reg, is_double ? k64 : k32, + kNotVolatile); } } @@ -373,7 +374,8 @@ void X86Mir2Lir::GenRemFP(RegLocation rl_dest, RegLocation rl_src1, RegLocation FlushSpecificReg(reg_info); ResetDef(rl_src2.reg); } else { - StoreBaseDisp(TargetReg(kSp), src2_v_reg_offset, rl_src2.reg, is_double ? 
k64 : k32); + StoreBaseDisp(TargetReg(kSp), src2_v_reg_offset, rl_src2.reg, is_double ? k64 : k32, + kNotVolatile); } } @@ -433,7 +435,7 @@ void X86Mir2Lir::GenRemFP(RegLocation rl_dest, RegLocation rl_src1, RegLocation if (rl_result.location == kLocPhysReg) { rl_result = EvalLoc(rl_dest, kFPReg, true); if (is_double) { - LoadBaseDisp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg, k64); + LoadBaseDisp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg, k64, kNotVolatile); StoreFinalValueWide(rl_dest, rl_result); } else { Load32Disp(TargetReg(kSp), dest_v_reg_offset, rl_result.reg); diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc index b905312726..350cfb86be 100644 --- a/compiler/dex/quick/x86/int_x86.cc +++ b/compiler/dex/quick/x86/int_x86.cc @@ -35,14 +35,13 @@ void X86Mir2Lir::GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, rl_src1 = LoadValueWide(rl_src1, kCoreReg); rl_src2 = LoadValueWide(rl_src2, kCoreReg); RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true); - OpRegReg(kOpXor, rl_result.reg, rl_result.reg); // result = 0 - OpRegReg(kOpCmp, rl_src1.reg, rl_src2.reg); - NewLIR2(kX86Set8R, rl_result.reg.GetReg(), kX86CondNe); // result = (src1 != src2) ? 1 : result RegStorage temp_reg = AllocTemp(); - OpRegReg(kOpNeg, temp_reg, rl_result.reg); OpRegReg(kOpCmp, rl_src1.reg, rl_src2.reg); - // result = (src1 < src2) ? -result : result - OpCondRegReg(kOpCmov, kCondLt, rl_result.reg, temp_reg); + NewLIR2(kX86Set8R, rl_result.reg.GetReg(), kX86CondG); // result = (src1 > src2) ? 1 : 0 + NewLIR2(kX86Set8R, temp_reg.GetReg(), kX86CondL); // temp = (src1 >= src2) ? 0 : 1 + NewLIR2(kX86Sub8RR, rl_result.reg.GetReg(), temp_reg.GetReg()); + NewLIR2(kX86Movsx8qRR, rl_result.reg.GetReg(), rl_result.reg.GetReg()); + StoreValue(rl_dest, rl_result); FreeTemp(temp_reg); return; @@ -755,7 +754,7 @@ bool X86Mir2Lir::GenInlinedPeek(CallInfo* info, OpSize size) { RegLocation rl_address = LoadValue(rl_src_address, kCoreReg); RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true); // Unaligned access is allowed on x86. - LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size); + LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile); if (size == k64) { StoreValueWide(rl_dest, rl_result); } else { @@ -773,12 +772,12 @@ bool X86Mir2Lir::GenInlinedPoke(CallInfo* info, OpSize size) { if (size == k64) { // Unaligned access is allowed on x86. RegLocation rl_value = LoadValueWide(rl_src_value, kCoreReg); - StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size); + StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile); } else { DCHECK(size == kSignedByte || size == kSignedHalf || size == k32); // Unaligned access is allowed on x86. RegLocation rl_value = LoadValue(rl_src_value, kCoreReg); - StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size); + StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile); } return true; } @@ -1139,7 +1138,7 @@ void X86Mir2Lir::GenImulMemImm(RegStorage dest, int sreg, int displacement, int NewLIR2(kX86Xor32RR, dest.GetReg(), dest.GetReg()); break; case 1: - LoadBaseDisp(rs_rX86_SP, displacement, dest, k32); + LoadBaseDisp(rs_rX86_SP, displacement, dest, k32, kNotVolatile); break; default: m = NewLIR4(IS_SIMM8(val) ? 
kX86Imul32RMI8 : kX86Imul32RMI, dest.GetReg(), @@ -1295,7 +1294,8 @@ void X86Mir2Lir::GenMulLong(Instruction::Code, RegLocation rl_dest, RegLocation if (src1_in_reg) { NewLIR2(kX86Mov32RR, rs_r1.GetReg(), rl_src1.reg.GetHighReg()); } else { - LoadBaseDisp(rs_rX86_SP, SRegOffset(rl_src1.s_reg_low) + HIWORD_OFFSET, rs_r1, k32); + LoadBaseDisp(rs_rX86_SP, SRegOffset(rl_src1.s_reg_low) + HIWORD_OFFSET, rs_r1, k32, + kNotVolatile); } if (is_square) { @@ -1318,7 +1318,8 @@ void X86Mir2Lir::GenMulLong(Instruction::Code, RegLocation rl_dest, RegLocation if (src2_in_reg) { NewLIR2(kX86Mov32RR, rs_r0.GetReg(), rl_src2.reg.GetHighReg()); } else { - LoadBaseDisp(rs_rX86_SP, SRegOffset(rl_src2.s_reg_low) + HIWORD_OFFSET, rs_r0, k32); + LoadBaseDisp(rs_rX86_SP, SRegOffset(rl_src2.s_reg_low) + HIWORD_OFFSET, rs_r0, k32, + kNotVolatile); } // EAX <- EAX * 1L (2H * 1L) @@ -1351,7 +1352,8 @@ void X86Mir2Lir::GenMulLong(Instruction::Code, RegLocation rl_dest, RegLocation if (src2_in_reg) { NewLIR2(kX86Mov32RR, rs_r0.GetReg(), rl_src2.reg.GetLowReg()); } else { - LoadBaseDisp(rs_rX86_SP, SRegOffset(rl_src2.s_reg_low) + LOWORD_OFFSET, rs_r0, k32); + LoadBaseDisp(rs_rX86_SP, SRegOffset(rl_src2.s_reg_low) + LOWORD_OFFSET, rs_r0, k32, + kNotVolatile); } // EDX:EAX <- 2L * 1L (double precision) @@ -2290,21 +2292,21 @@ void X86Mir2Lir::GenInstanceofFinal(bool use_declaring_class, uint32_t type_idx, if (rl_method.location == kLocPhysReg) { if (use_declaring_class) { LoadRefDisp(rl_method.reg, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), - check_class); + check_class, kNotVolatile); } else { LoadRefDisp(rl_method.reg, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), - check_class); - LoadRefDisp(check_class, offset_of_type, check_class); + check_class, kNotVolatile); + LoadRefDisp(check_class, offset_of_type, check_class, kNotVolatile); } } else { LoadCurrMethodDirect(check_class); if (use_declaring_class) { LoadRefDisp(check_class, mirror::ArtMethod::DeclaringClassOffset().Int32Value(), - check_class); + check_class, kNotVolatile); } else { LoadRefDisp(check_class, mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), - check_class); - LoadRefDisp(check_class, offset_of_type, check_class); + check_class, kNotVolatile); + LoadRefDisp(check_class, offset_of_type, check_class, kNotVolatile); } } @@ -2351,16 +2353,16 @@ void X86Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_k } else if (use_declaring_class) { LoadValueDirectFixed(rl_src, TargetReg(kArg0)); LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DeclaringClassOffset().Int32Value(), - class_reg); + class_reg, kNotVolatile); } else { // Load dex cache entry into class_reg (kArg2). LoadValueDirectFixed(rl_src, TargetReg(kArg0)); LoadRefDisp(TargetReg(kArg1), mirror::ArtMethod::DexCacheResolvedTypesOffset().Int32Value(), - class_reg); + class_reg, kNotVolatile); int32_t offset_of_type = mirror::Array::DataOffset(sizeof(mirror::HeapReference<mirror::Class*>)).Int32Value() + (sizeof(mirror::HeapReference<mirror::Class*>) * type_idx); - LoadRefDisp(class_reg, offset_of_type, class_reg); + LoadRefDisp(class_reg, offset_of_type, class_reg, kNotVolatile); if (!can_assume_type_is_in_dex_cache) { // Need to test presence of type in dex cache at runtime. LIR* hop_branch = OpCmpImmBranch(kCondNe, class_reg, 0, NULL); @@ -2393,7 +2395,8 @@ void X86Mir2Lir::GenInstanceofCallingHelper(bool needs_access_check, bool type_k /* Load object->klass_. 
*/ DCHECK_EQ(mirror::Object::ClassOffset().Int32Value(), 0); - LoadRefDisp(TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetReg(kArg1)); + LoadRefDisp(TargetReg(kArg0), mirror::Object::ClassOffset().Int32Value(), TargetReg(kArg1), + kNotVolatile); /* kArg0 is ref, kArg1 is ref->klass_, kArg2 is class. */ LIR* branchover = nullptr; if (type_known_final) { diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 078dd5a73b..e369d26df3 100644 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -1866,7 +1866,7 @@ void X86Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) { StoreValue(rl_method, rl_src); // If Method* has been promoted, explicitly flush if (rl_method.location == kLocPhysReg) { - StoreRefDisp(TargetReg(kSp), 0, TargetReg(kArg0)); + StoreRefDisp(TargetReg(kSp), 0, TargetReg(kArg0), kNotVolatile); } if (cu_->num_ins == 0) { @@ -1916,11 +1916,11 @@ void X86Mir2Lir::FlushIns(RegLocation* ArgLocs, RegLocation rl_method) { } if (need_flush) { if (t_loc->wide && t_loc->fp) { - StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, k64); + StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, k64, kNotVolatile); // Increment i to skip the next one i++; } else if (t_loc->wide && !t_loc->fp) { - StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, k64); + StoreBaseDisp(TargetReg(kSp), SRegOffset(start_vreg + i), reg, k64, kNotVolatile); // Increment i to skip the next one i++; } else { @@ -2018,14 +2018,14 @@ int X86Mir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state, loc = UpdateLocWide(loc); if (loc.location == kLocPhysReg) { ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k64); + StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k64, kNotVolatile); } next_arg += 2; } else { loc = UpdateLoc(loc); if (loc.location == kLocPhysReg) { ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); - StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k32); + StoreBaseDisp(TargetReg(kSp), SRegOffset(loc.s_reg_low), loc.reg, k32, kNotVolatile); } next_arg++; } @@ -2161,18 +2161,18 @@ int X86Mir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state, ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg); if (rl_arg.wide) { if (rl_arg.location == kLocPhysReg) { - StoreBaseDisp(TargetReg(kSp), out_offset, rl_arg.reg, k64); + StoreBaseDisp(TargetReg(kSp), out_offset, rl_arg.reg, k64, kNotVolatile); } else { LoadValueDirectWideFixed(rl_arg, regWide); - StoreBaseDisp(TargetReg(kSp), out_offset, regWide, k64); + StoreBaseDisp(TargetReg(kSp), out_offset, regWide, k64, kNotVolatile); } i++; } else { if (rl_arg.location == kLocPhysReg) { - StoreBaseDisp(TargetReg(kSp), out_offset, rl_arg.reg, k32); + StoreBaseDisp(TargetReg(kSp), out_offset, rl_arg.reg, k32, kNotVolatile); } else { LoadValueDirectFixed(rl_arg, regSingle); - StoreBaseDisp(TargetReg(kSp), out_offset, regSingle, k32); + StoreBaseDisp(TargetReg(kSp), out_offset, regSingle, k32, kNotVolatile); } } } diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc index 46e877f8f9..0352808a7c 100644 --- a/compiler/dex/quick/x86/utility_x86.cc +++ b/compiler/dex/quick/x86/utility_x86.cc @@ -585,7 +585,7 @@ LIR* X86Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) { // value. 
ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral); res = LoadBaseDisp(rl_method.reg, 256 /* bogus */, RegStorage::FloatSolo64(low_reg_val), - kDouble); + kDouble, kNotVolatile); res->target = data_target; res->flags.fixup = kFixupLoad; store_method_addr_used_ = true; @@ -611,8 +611,12 @@ LIR* X86Mir2Lir::LoadConstantWide(RegStorage r_dest, int64_t value) { if (val_lo < 0) { val_hi += 1; } - res = LoadConstantNoClobber(RegStorage::Solo32(r_dest.GetReg()), val_hi); - NewLIR2(kX86Sal64RI, r_dest.GetReg(), 32); + if (val_hi != 0) { + res = LoadConstantNoClobber(RegStorage::Solo32(r_dest.GetReg()), val_hi); + NewLIR2(kX86Sal64RI, r_dest.GetReg(), 32); + } else { + res = NewLIR2(kX86Xor64RR, r_dest.GetReg(), r_dest.GetReg()); + } if (val_lo != 0) { NewLIR2(kX86Add64RI, r_dest.GetReg(), val_lo); } @@ -752,17 +756,22 @@ LIR* X86Mir2Lir::LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStora return LoadBaseIndexedDisp(r_base, r_index, scale, 0, r_dest, size); } -LIR* X86Mir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) { +LIR* X86Mir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, + OpSize size, VolatileKind is_volatile) { // LoadBaseDisp() will emit correct insn for atomic load on x86 // assuming r_dest is correctly prepared using RegClassForFieldLoadStore(). - return LoadBaseDisp(r_base, displacement, r_dest, size); -} -LIR* X86Mir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, - OpSize size) { - return LoadBaseIndexedDisp(r_base, RegStorage::InvalidReg(), 0, displacement, r_dest, - size); + LIR* load = LoadBaseIndexedDisp(r_base, RegStorage::InvalidReg(), 0, displacement, r_dest, + size); + + if (UNLIKELY(is_volatile == kVolatile)) { + // Without context sensitive analysis, we must issue the most conservative barriers. + // In this case, either a load or store may follow so we issue both barriers. + GenMemBarrier(kLoadLoad); + GenMemBarrier(kLoadStore); + } + + return load; } LIR* X86Mir2Lir::StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, @@ -850,20 +859,28 @@ LIR* X86Mir2Lir::StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int /* store value base base + scaled index. */ LIR* X86Mir2Lir::StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, - int scale, OpSize size) { + int scale, OpSize size) { return StoreBaseIndexedDisp(r_base, r_index, scale, 0, r_src, size); } -LIR* X86Mir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, - RegStorage r_src, OpSize size) { +LIR* X86Mir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src, OpSize size, + VolatileKind is_volatile) { + if (UNLIKELY(is_volatile == kVolatile)) { + // There might have been a store before this volatile one so insert StoreStore barrier. + GenMemBarrier(kStoreStore); + } + // StoreBaseDisp() will emit correct insn for atomic store on x86 // assuming r_dest is correctly prepared using RegClassForFieldLoadStore(). - return StoreBaseDisp(r_base, displacement, r_src, size); -} -LIR* X86Mir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, - RegStorage r_src, OpSize size) { - return StoreBaseIndexedDisp(r_base, RegStorage::InvalidReg(), 0, displacement, r_src, size); + LIR* store = StoreBaseIndexedDisp(r_base, RegStorage::InvalidReg(), 0, displacement, r_src, size); + + if (UNLIKELY(is_volatile == kVolatile)) { + // A load might follow the volatile store so insert a StoreLoad barrier. 
+ GenMemBarrier(kStoreLoad); + } + + return store; } LIR* X86Mir2Lir::OpCmpMemImmBranch(ConditionCode cond, RegStorage temp_reg, RegStorage base_reg, diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h index 28b9dca193..17c44bc2c7 100644 --- a/compiler/dex/quick/x86/x86_lir.h +++ b/compiler/dex/quick/x86/x86_lir.h @@ -609,6 +609,10 @@ enum X86OpCode { Binary0fOpCode(kX86Movzx16), // zero-extend 16-bit value Binary0fOpCode(kX86Movsx8), // sign-extend 8-bit value Binary0fOpCode(kX86Movsx16), // sign-extend 16-bit value + Binary0fOpCode(kX86Movzx8q), // zero-extend 8-bit value to quad word + Binary0fOpCode(kX86Movzx16q), // zero-extend 16-bit value to quad word + Binary0fOpCode(kX86Movsx8q), // sign-extend 8-bit value to quad word + Binary0fOpCode(kX86Movsx16q), // sign-extend 16-bit value to quad word #undef Binary0fOpCode kX86Jcc8, kX86Jcc32, // jCC rel8/32; lir operands - 0: rel, 1: CC, target assigned kX86Jmp8, kX86Jmp32, // jmp rel8/32; lir operands - 0: rel, target assigned @@ -707,6 +711,8 @@ struct X86EncodingMap { #define REX_X 0x42 // Extension of the ModR/M r/m field, SIB base field, or Opcode reg field #define REX_B 0x41 +// Extended register set +#define REX 0x40 // Mask extracting the least 3 bits of r0..r15 #define kRegNumMask32 0x07 // Value indicating that base or reg is not used diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index 3e326f0633..96625c5dac 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -697,7 +697,7 @@ void CompilerDriver::LoadImageClasses(TimingLogger* timings) return; } - timings->NewSplit("LoadImageClasses"); + TimingLogger::ScopedTiming t("LoadImageClasses", timings); // Make a first class to load all classes explicitly listed in the file Thread* self = Thread::Current(); ScopedObjectAccess soa(self); @@ -794,8 +794,7 @@ void CompilerDriver::FindClinitImageClassesCallback(mirror::Object* object, void void CompilerDriver::UpdateImageClasses(TimingLogger* timings) { if (IsImage()) { - timings->NewSplit("UpdateImageClasses"); - + TimingLogger::ScopedTiming t("UpdateImageClasses", timings); // Update image_classes_ with classes for objects created by <clinit> methods. Thread* self = Thread::Current(); const char* old_cause = self->StartAssertNoThreadSuspension("ImageWriter"); @@ -1606,11 +1605,11 @@ void CompilerDriver::ResolveDexFile(jobject class_loader, const DexFile& dex_fil if (IsImage()) { // For images we resolve all types, such as array, whereas for applications just those with // classdefs are resolved by ResolveClassFieldsAndMethods. 
- timings->NewSplit("Resolve Types"); + TimingLogger::ScopedTiming t("Resolve Types", timings); context.ForAll(0, dex_file.NumTypeIds(), ResolveType, thread_count_); } - timings->NewSplit("Resolve MethodsAndFields"); + TimingLogger::ScopedTiming t("Resolve MethodsAndFields", timings); context.ForAll(0, dex_file.NumClassDefs(), ResolveClassFieldsAndMethods, thread_count_); } @@ -1672,7 +1671,7 @@ static void VerifyClass(const ParallelCompilationManager* manager, size_t class_ void CompilerDriver::VerifyDexFile(jobject class_loader, const DexFile& dex_file, ThreadPool* thread_pool, TimingLogger* timings) { - timings->NewSplit("Verify Dex File"); + TimingLogger::ScopedTiming t("Verify Dex File", timings); ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); ParallelCompilationManager context(class_linker, class_loader, this, &dex_file, thread_pool); context.ForAll(0, dex_file.NumClassDefs(), VerifyClass, thread_count_); @@ -1765,7 +1764,7 @@ static void InitializeClass(const ParallelCompilationManager* manager, size_t cl void CompilerDriver::InitializeClasses(jobject jni_class_loader, const DexFile& dex_file, ThreadPool* thread_pool, TimingLogger* timings) { - timings->NewSplit("InitializeNoClinit"); + TimingLogger::ScopedTiming t("InitializeNoClinit", timings); ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); ParallelCompilationManager context(class_linker, jni_class_loader, this, &dex_file, thread_pool); size_t thread_count; @@ -1877,7 +1876,7 @@ void CompilerDriver::CompileClass(const ParallelCompilationManager* manager, siz void CompilerDriver::CompileDexFile(jobject class_loader, const DexFile& dex_file, ThreadPool* thread_pool, TimingLogger* timings) { - timings->NewSplit("Compile Dex File"); + TimingLogger::ScopedTiming t("Compile Dex File", timings); ParallelCompilationManager context(Runtime::Current()->GetClassLinker(), class_loader, this, &dex_file, thread_pool); context.ForAll(0, dex_file.NumClassDefs(), CompilerDriver::CompileClass, thread_count_); @@ -2054,7 +2053,9 @@ bool CompilerDriver::SkipCompilation(const std::string& method_name) { ProfileFile::ProfileData data; if (!profile_file_.GetProfileData(&data, method_name)) { // Not in profile, no information can be determined. - VLOG(compiler) << "not compiling " << method_name << " because it's not in the profile"; + if (kIsDebugBuild) { + VLOG(compiler) << "not compiling " << method_name << " because it's not in the profile"; + } return true; } @@ -2063,13 +2064,16 @@ bool CompilerDriver::SkipCompilation(const std::string& method_name) { // falls inside a bucket. 
bool compile = data.GetTopKUsedPercentage() - data.GetUsedPercent() <= compiler_options_->GetTopKProfileThreshold(); - if (compile) { - LOG(INFO) << "compiling method " << method_name << " because its usage is part of top " - << data.GetTopKUsedPercentage() << "% with a percent of " << data.GetUsedPercent() << "%" - << " (topKThreshold=" << compiler_options_->GetTopKProfileThreshold() << ")"; - } else { - VLOG(compiler) << "not compiling method " << method_name << " because it's not part of leading " - << compiler_options_->GetTopKProfileThreshold() << "% samples)"; + if (kIsDebugBuild) { + if (compile) { + LOG(INFO) << "compiling method " << method_name << " because its usage is part of top " + << data.GetTopKUsedPercentage() << "% with a percent of " << data.GetUsedPercent() << "%" + << " (topKThreshold=" << compiler_options_->GetTopKProfileThreshold() << ")"; + } else { + VLOG(compiler) << "not compiling method " << method_name + << " because it's not part of leading " << compiler_options_->GetTopKProfileThreshold() + << "% samples)"; + } } return !compile; } diff --git a/compiler/driver/compiler_driver_test.cc b/compiler/driver/compiler_driver_test.cc index ca956aac36..5325a68b37 100644 --- a/compiler/driver/compiler_driver_test.cc +++ b/compiler/driver/compiler_driver_test.cc @@ -38,12 +38,12 @@ class CompilerDriverTest : public CommonCompilerTest { protected: void CompileAll(jobject class_loader) LOCKS_EXCLUDED(Locks::mutator_lock_) { TimingLogger timings("CompilerDriverTest::CompileAll", false, false); - timings.StartSplit("CompileAll"); + TimingLogger::ScopedTiming t(__FUNCTION__, &timings); compiler_driver_->CompileAll(class_loader, Runtime::Current()->GetCompileTimeClassPath(class_loader), &timings); + t.NewTiming("MakeAllExecutable"); MakeAllExecutable(class_loader); - timings.EndSplit(); } void EnsureCompiled(jobject class_loader, const char* class_name, const char* method, diff --git a/compiler/image_test.cc b/compiler/image_test.cc index 92be147a40..e8bbaef1e9 100644 --- a/compiler/image_test.cc +++ b/compiler/image_test.cc @@ -64,7 +64,7 @@ TEST_F(ImageTest, WriteRead) { jobject class_loader = NULL; ClassLinker* class_linker = Runtime::Current()->GetClassLinker(); TimingLogger timings("ImageTest::WriteRead", false, false); - timings.StartSplit("CompileAll"); + TimingLogger::ScopedTiming t("CompileAll", &timings); if (kUsePortableCompiler) { // TODO: we disable this for portable so the test executes in a reasonable amount of time. // We shouldn't need to do this. @@ -75,6 +75,7 @@ TEST_F(ImageTest, WriteRead) { } compiler_driver_->CompileAll(class_loader, class_linker->GetBootClassPath(), &timings); + t.NewTiming("WriteElf"); ScopedObjectAccess soa(Thread::Current()); OatWriter oat_writer(class_linker->GetBootClassPath(), 0, 0, "", compiler_driver_.get(), &timings); @@ -84,7 +85,6 @@ TEST_F(ImageTest, WriteRead) { &oat_writer, oat_file.GetFile()); ASSERT_TRUE(success); - timings.EndSplit(); } } // Workound bug that mcld::Linker::emit closes oat_file by reopening as dup_oat. 
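The hunks above replace manual StartSplit/EndSplit/NewSplit bookkeeping with the RAII-style TimingLogger::ScopedTiming helper, which opens a named timing when it is constructed and closes it when it goes out of scope, so a split cannot be left open on an early return. Below is a minimal standalone C++ sketch of that pattern, assuming the start/end event model introduced later in runtime/base/timing_logger.cc (a start event carries a label, an end event carries none); SketchTimingLogger and its members are illustrative stand-ins, not ART's actual TimingLogger API.

#include <chrono>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Illustrative stand-in for a scoped timing logger; not the ART TimingLogger.
class SketchTimingLogger {
 public:
  void StartTiming(const char* label) {  // start event: non-null label
    events_.emplace_back(NowNs(), label);
  }
  void EndTiming() {                     // end event: null label
    events_.emplace_back(NowNs(), nullptr);
  }
  void Dump(std::ostream& os) const {
    std::vector<size_t> open;            // indices of starts awaiting their end
    for (size_t i = 0; i < events_.size(); ++i) {
      if (events_[i].second != nullptr) {
        open.push_back(i);
      } else if (!open.empty()) {
        size_t start = open.back();
        open.pop_back();
        os << events_[start].second << ": "
           << (events_[i].first - events_[start].first) << " ns\n";
      }
    }
  }

  // RAII helper: starts a timing on construction and ends it on destruction.
  class ScopedTiming {
   public:
    ScopedTiming(const char* label, SketchTimingLogger* logger) : logger_(logger) {
      logger_->StartTiming(label);
    }
    void NewTiming(const char* label) {  // close the current split, open the next
      logger_->EndTiming();
      logger_->StartTiming(label);
    }
    ~ScopedTiming() { logger_->EndTiming(); }
   private:
    SketchTimingLogger* const logger_;
  };

 private:
  static uint64_t NowNs() {
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
        std::chrono::steady_clock::now().time_since_epoch()).count();
  }
  std::vector<std::pair<uint64_t, const char*>> events_;
};

int main() {
  SketchTimingLogger timings;
  {
    SketchTimingLogger::ScopedTiming t("CompileAll", &timings);
    // ... work for the first split ...
    t.NewTiming("MakeAllExecutable");
    // ... work for the second split ...
  }
  timings.Dump(std::cout);
  return 0;
}

Under this shape, the explicit EndSplit calls removed in compiler_driver_test.cc and image_test.cc are no longer needed: the destructor closes the last split on any exit path, and t.NewTiming("...") switches from one split to the next in a single call.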
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc index c6b9161b63..e1b6992c47 100644 --- a/compiler/oat_writer.cc +++ b/compiler/oat_writer.cc @@ -91,31 +91,31 @@ OatWriter::OatWriter(const std::vector<const DexFile*>& dex_files, size_oat_class_method_offsets_(0) { size_t offset; { - TimingLogger::ScopedSplit split("InitOatHeader", timings); + TimingLogger::ScopedTiming split("InitOatHeader", timings); offset = InitOatHeader(); } { - TimingLogger::ScopedSplit split("InitOatDexFiles", timings); + TimingLogger::ScopedTiming split("InitOatDexFiles", timings); offset = InitOatDexFiles(offset); } { - TimingLogger::ScopedSplit split("InitDexFiles", timings); + TimingLogger::ScopedTiming split("InitDexFiles", timings); offset = InitDexFiles(offset); } { - TimingLogger::ScopedSplit split("InitOatClasses", timings); + TimingLogger::ScopedTiming split("InitOatClasses", timings); offset = InitOatClasses(offset); } { - TimingLogger::ScopedSplit split("InitOatMaps", timings); + TimingLogger::ScopedTiming split("InitOatMaps", timings); offset = InitOatMaps(offset); } { - TimingLogger::ScopedSplit split("InitOatCode", timings); + TimingLogger::ScopedTiming split("InitOatCode", timings); offset = InitOatCode(offset); } { - TimingLogger::ScopedSplit split("InitOatCodeDexFiles", timings); + TimingLogger::ScopedTiming split("InitOatCodeDexFiles", timings); offset = InitOatCodeDexFiles(offset); } size_ = offset; @@ -800,6 +800,7 @@ size_t OatWriter::InitOatMaps(size_t offset) { size_t OatWriter::InitOatCode(size_t offset) { // calculate the offsets within OatHeader to executable code size_t old_offset = offset; + size_t adjusted_offset = offset; // required to be on a new page boundary offset = RoundUp(offset, kPageSize); oat_header_->SetExecutableOffset(offset); @@ -809,7 +810,8 @@ size_t OatWriter::InitOatCode(size_t offset) { #define DO_TRAMPOLINE(field, fn_name) \ offset = CompiledCode::AlignCode(offset, instruction_set); \ - oat_header_->Set ## fn_name ## Offset(offset); \ + adjusted_offset = offset + CompiledCode::CodeDelta(instruction_set); \ + oat_header_->Set ## fn_name ## Offset(adjusted_offset); \ field.reset(compiler_driver_->Create ## fn_name()); \ offset += field->size(); diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc index 521992ad3a..c3a322caee 100644 --- a/compiler/optimizing/builder.cc +++ b/compiler/optimizing/builder.cc @@ -93,15 +93,30 @@ static bool CanHandleCodeItem(const DexFile::CodeItem& code_item) { } template<typename T> -void HGraphBuilder::If_22t(const Instruction& instruction, int32_t dex_offset, bool is_not) { +void HGraphBuilder::If_22t(const Instruction& instruction, int32_t dex_offset) { HInstruction* first = LoadLocal(instruction.VRegA(), Primitive::kPrimInt); HInstruction* second = LoadLocal(instruction.VRegB(), Primitive::kPrimInt); - current_block_->AddInstruction(new (arena_) T(first, second)); - if (is_not) { - current_block_->AddInstruction(new (arena_) HNot(current_block_->GetLastInstruction())); - } - current_block_->AddInstruction(new (arena_) HIf(current_block_->GetLastInstruction())); - HBasicBlock* target = FindBlockStartingAt(instruction.GetTargetOffset() + dex_offset); + T* comparison = new (arena_) T(first, second); + current_block_->AddInstruction(comparison); + HInstruction* ifinst = new (arena_) HIf(comparison); + current_block_->AddInstruction(ifinst); + HBasicBlock* target = FindBlockStartingAt(dex_offset + instruction.GetTargetOffset()); + DCHECK(target != nullptr); + current_block_->AddSuccessor(target); + 
target = FindBlockStartingAt(dex_offset + instruction.SizeInCodeUnits()); + DCHECK(target != nullptr); + current_block_->AddSuccessor(target); + current_block_ = nullptr; +} + +template<typename T> +void HGraphBuilder::If_21t(const Instruction& instruction, int32_t dex_offset) { + HInstruction* value = LoadLocal(instruction.VRegA(), Primitive::kPrimInt); + T* comparison = new (arena_) T(value, GetIntConstant(0)); + current_block_->AddInstruction(comparison); + HInstruction* ifinst = new (arena_) HIf(comparison); + current_block_->AddInstruction(ifinst); + HBasicBlock* target = FindBlockStartingAt(dex_offset + instruction.GetTargetOffset()); DCHECK(target != nullptr); current_block_->AddSuccessor(target); target = FindBlockStartingAt(dex_offset + instruction.SizeInCodeUnits()); @@ -340,16 +355,38 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, int32_ break; } + case Instruction::CONST: { + int32_t register_index = instruction.VRegA(); + HIntConstant* constant = GetIntConstant(instruction.VRegB_31i()); + UpdateLocal(register_index, constant); + break; + } + + case Instruction::CONST_HIGH16: { + int32_t register_index = instruction.VRegA(); + HIntConstant* constant = GetIntConstant(instruction.VRegB_21h() << 16); + UpdateLocal(register_index, constant); + break; + } + case Instruction::CONST_WIDE_16: { int32_t register_index = instruction.VRegA(); - HLongConstant* constant = GetLongConstant(instruction.VRegB_21s()); + // Get 16 bits of constant value, sign extended to 64 bits. + int64_t value = instruction.VRegB_21s(); + value <<= 48; + value >>= 48; + HLongConstant* constant = GetLongConstant(value); UpdateLocal(register_index, constant); break; } case Instruction::CONST_WIDE_32: { int32_t register_index = instruction.VRegA(); - HLongConstant* constant = GetLongConstant(instruction.VRegB_31i()); + // Get 32 bits of constant value, sign extended to 64 bits. + int64_t value = instruction.VRegB_31i(); + value <<= 32; + value >>= 32; + HLongConstant* constant = GetLongConstant(value); UpdateLocal(register_index, constant); break; } @@ -361,27 +398,58 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, int32_ break; } - case Instruction::MOVE: { + case Instruction::CONST_WIDE_HIGH16: { + int32_t register_index = instruction.VRegA(); + int64_t value = static_cast<int64_t>(instruction.VRegB_21h()) << 48; + HLongConstant* constant = GetLongConstant(value); + UpdateLocal(register_index, constant); + break; + } + + // TODO: these instructions are also used to move floating point values, so what is + // the type (int or float)? + case Instruction::MOVE: + case Instruction::MOVE_FROM16: + case Instruction::MOVE_16: { HInstruction* value = LoadLocal(instruction.VRegB(), Primitive::kPrimInt); UpdateLocal(instruction.VRegA(), value); break; } - case Instruction::RETURN_VOID: { - BuildReturn(instruction, Primitive::kPrimVoid); + // TODO: these instructions are also used to move floating point values, so what is + // the type (long or double)? 
+ case Instruction::MOVE_WIDE: + case Instruction::MOVE_WIDE_FROM16: + case Instruction::MOVE_WIDE_16: { + HInstruction* value = LoadLocal(instruction.VRegB(), Primitive::kPrimLong); + UpdateLocal(instruction.VRegA(), value); break; } - case Instruction::IF_EQ: { - If_22t<HEqual>(instruction, dex_offset, false); + case Instruction::MOVE_OBJECT: + case Instruction::MOVE_OBJECT_16: + case Instruction::MOVE_OBJECT_FROM16: { + HInstruction* value = LoadLocal(instruction.VRegB(), Primitive::kPrimNot); + UpdateLocal(instruction.VRegA(), value); break; } - case Instruction::IF_NE: { - If_22t<HEqual>(instruction, dex_offset, true); + case Instruction::RETURN_VOID: { + BuildReturn(instruction, Primitive::kPrimVoid); break; } +#define IF_XX(comparison, cond) \ + case Instruction::IF_##cond: If_22t<comparison>(instruction, dex_offset); break; \ + case Instruction::IF_##cond##Z: If_21t<comparison>(instruction, dex_offset); break + + IF_XX(HEqual, EQ); + IF_XX(HNotEqual, NE); + IF_XX(HLessThan, LT); + IF_XX(HLessThanOrEqual, LE); + IF_XX(HGreaterThan, GT); + IF_XX(HGreaterThanOrEqual, GE); + case Instruction::GOTO: case Instruction::GOTO_16: case Instruction::GOTO_32: { @@ -500,10 +568,10 @@ bool HGraphBuilder::AnalyzeDexInstruction(const Instruction& instruction, int32_ } case Instruction::MOVE_RESULT: - case Instruction::MOVE_RESULT_WIDE: { + case Instruction::MOVE_RESULT_WIDE: + case Instruction::MOVE_RESULT_OBJECT: UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction()); break; - } case Instruction::NOP: break; diff --git a/compiler/optimizing/builder.h b/compiler/optimizing/builder.h index 108514a632..0852a26c55 100644 --- a/compiler/optimizing/builder.h +++ b/compiler/optimizing/builder.h @@ -22,17 +22,11 @@ #include "primitive.h" #include "utils/allocation.h" #include "utils/growable_array.h" +#include "nodes.h" namespace art { -class ArenaAllocator; class Instruction; -class HBasicBlock; -class HGraph; -class HIntConstant; -class HLongConstant; -class HInstruction; -class HLocal; class HGraphBuilder : public ValueObject { public: @@ -90,7 +84,11 @@ class HGraphBuilder : public ValueObject { template<typename T> void Binop_22s(const Instruction& instruction, bool reverse); - template<typename T> void If_22t(const Instruction& instruction, int32_t dex_offset, bool is_not); + template<typename T> + void If_22t(const Instruction& instruction, int32_t dex_offset); + + template<typename T> + void If_21t(const Instruction& instruction, int32_t dex_offset); void BuildReturn(const Instruction& instruction, Primitive::Type type); diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 212a6dc370..c5862dad92 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -34,6 +34,35 @@ arm::ArmManagedRegister Location::AsArm() const { namespace arm { + +inline Condition ARMCondition(IfCondition cond) { + switch (cond) { + case kCondEQ: return EQ; + case kCondNE: return NE; + case kCondLT: return LT; + case kCondLE: return LE; + case kCondGT: return GT; + case kCondGE: return GE; + default: + LOG(FATAL) << "Unknown if condition"; + } + return EQ; // Unreachable. +} + +inline Condition ARMOppositeCondition(IfCondition cond) { + switch (cond) { + case kCondEQ: return NE; + case kCondNE: return EQ; + case kCondLT: return GE; + case kCondLE: return GT; + case kCondGT: return LE; + case kCondGE: return LT; + default: + LOG(FATAL) << "Unknown if condition"; + } + return EQ; // Unreachable. 
+} + static constexpr int kNumberOfPushedRegistersAtEntry = 1; static constexpr int kCurrentMethodStackOffset = 0; @@ -419,33 +448,103 @@ void InstructionCodeGeneratorARM::VisitExit(HExit* exit) { void LocationsBuilderARM::VisitIf(HIf* if_instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr); - locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(0, Location::Any()); if_instr->SetLocations(locations); } void InstructionCodeGeneratorARM::VisitIf(HIf* if_instr) { - // TODO: Generate the input as a condition, instead of materializing in a register. - __ cmp(if_instr->GetLocations()->InAt(0).AsArm().AsCoreRegister(), ShifterOperand(0)); - __ b(codegen_->GetLabelOf(if_instr->IfFalseSuccessor()), EQ); - if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfTrueSuccessor())) { - __ b(codegen_->GetLabelOf(if_instr->IfTrueSuccessor())); + HInstruction* cond = if_instr->InputAt(0); + DCHECK(cond->IsCondition()); + HCondition* condition = cond->AsCondition(); + if (condition->NeedsMaterialization()) { + // Condition has been materialized, compare the output to 0 + if (!if_instr->GetLocations()->InAt(0).IsRegister()) { + LOG(FATAL) << "Materialized condition is not in an ARM register"; + } + __ cmp(if_instr->GetLocations()->InAt(0).AsArm().AsCoreRegister(), + ShifterOperand(0)); + __ b(codegen_->GetLabelOf(if_instr->IfTrueSuccessor()), EQ); + } else { + // Condition has not been materialized, use its inputs as the comparison and its + // condition as the branch condition. + __ cmp(condition->GetLocations()->InAt(0).AsArm().AsCoreRegister(), + ShifterOperand(condition->GetLocations()->InAt(1).AsArm().AsCoreRegister())); + __ b(codegen_->GetLabelOf(if_instr->IfTrueSuccessor()), + ARMCondition(condition->GetCondition())); + } + if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfFalseSuccessor())) { + __ b(codegen_->GetLabelOf(if_instr->IfFalseSuccessor())); } } -void LocationsBuilderARM::VisitEqual(HEqual* equal) { - LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(equal); + +void LocationsBuilderARM::VisitCondition(HCondition* comp) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(comp); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); locations->SetOut(Location::RequiresRegister()); - equal->SetLocations(locations); + comp->SetLocations(locations); +} + +void InstructionCodeGeneratorARM::VisitCondition(HCondition* comp) { + if (comp->NeedsMaterialization()) { + LocationSummary* locations = comp->GetLocations(); + __ cmp(locations->InAt(0).AsArm().AsCoreRegister(), + ShifterOperand(locations->InAt(1).AsArm().AsCoreRegister())); + __ it(ARMCondition(comp->GetCondition()), kItElse); + __ mov(locations->Out().AsArm().AsCoreRegister(), ShifterOperand(1), + ARMCondition(comp->GetCondition())); + __ mov(locations->Out().AsArm().AsCoreRegister(), ShifterOperand(0), + ARMOppositeCondition(comp->GetCondition())); + } +} + +void LocationsBuilderARM::VisitEqual(HEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorARM::VisitEqual(HEqual* comp) { + VisitCondition(comp); +} + +void LocationsBuilderARM::VisitNotEqual(HNotEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorARM::VisitNotEqual(HNotEqual* comp) { + VisitCondition(comp); +} + +void LocationsBuilderARM::VisitLessThan(HLessThan* comp) { + VisitCondition(comp); +} + +void 
InstructionCodeGeneratorARM::VisitLessThan(HLessThan* comp) { + VisitCondition(comp); +} + +void LocationsBuilderARM::VisitLessThanOrEqual(HLessThanOrEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorARM::VisitLessThanOrEqual(HLessThanOrEqual* comp) { + VisitCondition(comp); +} + +void LocationsBuilderARM::VisitGreaterThan(HGreaterThan* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorARM::VisitGreaterThan(HGreaterThan* comp) { + VisitCondition(comp); +} + +void LocationsBuilderARM::VisitGreaterThanOrEqual(HGreaterThanOrEqual* comp) { + VisitCondition(comp); } -void InstructionCodeGeneratorARM::VisitEqual(HEqual* equal) { - LocationSummary* locations = equal->GetLocations(); - __ teq(locations->InAt(0).AsArm().AsCoreRegister(), - ShifterOperand(locations->InAt(1).AsArm().AsCoreRegister())); - __ mov(locations->Out().AsArm().AsCoreRegister(), ShifterOperand(1), EQ); - __ mov(locations->Out().AsArm().AsCoreRegister(), ShifterOperand(0), NE); +void InstructionCodeGeneratorARM::VisitGreaterThanOrEqual(HGreaterThanOrEqual* comp) { + VisitCondition(comp); } void LocationsBuilderARM::VisitLocal(HLocal* local) { diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index 712a24cf67..0e2a079cde 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -20,7 +20,7 @@ #include "code_generator.h" #include "nodes.h" #include "parallel_move_resolver.h" -#include "utils/arm/assembler_arm32.h" +#include "utils/arm/assembler_thumb2.h" namespace art { namespace arm { @@ -180,7 +180,7 @@ class CodeGeneratorARM : public CodeGenerator { LocationsBuilderARM location_builder_; InstructionCodeGeneratorARM instruction_visitor_; ParallelMoveResolverARM move_resolver_; - Arm32Assembler assembler_; + Thumb2Assembler assembler_; DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM); }; diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index f4b12e2d38..a8ee6c061e 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -34,6 +34,20 @@ x86::X86ManagedRegister Location::AsX86() const { namespace x86 { +inline Condition X86Condition(IfCondition cond) { + switch (cond) { + case kCondEQ: return kEqual; + case kCondNE: return kNotEqual; + case kCondLT: return kLess; + case kCondLE: return kLessEqual; + case kCondGT: return kGreater; + case kCondGE: return kGreaterEqual; + default: + LOG(FATAL) << "Unknown if condition"; + } + return kEqual; +} + static constexpr int kNumberOfPushedRegistersAtEntry = 1; static constexpr int kCurrentMethodStackOffset = 0; @@ -421,16 +435,32 @@ void LocationsBuilderX86::VisitIf(HIf* if_instr) { } void InstructionCodeGeneratorX86::VisitIf(HIf* if_instr) { - // TODO: Generate the input as a condition, instead of materializing in a register. 
- Location location = if_instr->GetLocations()->InAt(0); - if (location.IsRegister()) { - __ cmpl(location.AsX86().AsCpuRegister(), Immediate(0)); + HInstruction* cond = if_instr->InputAt(0); + DCHECK(cond->IsCondition()); + HCondition* condition = cond->AsCondition(); + if (condition->NeedsMaterialization()) { + // Materialized condition, compare against 0 + Location lhs = if_instr->GetLocations()->InAt(0); + if (lhs.IsRegister()) { + __ cmpl(lhs.AsX86().AsCpuRegister(), Immediate(0)); + } else { + __ cmpl(Address(ESP, lhs.GetStackIndex()), Immediate(0)); + } + __ j(kEqual, codegen_->GetLabelOf(if_instr->IfTrueSuccessor())); } else { - __ cmpl(Address(ESP, location.GetStackIndex()), Immediate(0)); + Location lhs = condition->GetLocations()->InAt(0); + Location rhs = condition->GetLocations()->InAt(1); + // LHS is guaranteed to be in a register (see LocationsBuilderX86::VisitCondition). + if (rhs.IsRegister()) { + __ cmpl(lhs.AsX86().AsCpuRegister(), rhs.AsX86().AsCpuRegister()); + } else { + __ cmpl(lhs.AsX86().AsCpuRegister(), Address(ESP, rhs.GetStackIndex())); + } + __ j(X86Condition(condition->GetCondition()), + codegen_->GetLabelOf(if_instr->IfTrueSuccessor())); } - __ j(kEqual, codegen_->GetLabelOf(if_instr->IfFalseSuccessor())); - if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfTrueSuccessor())) { - __ jmp(codegen_->GetLabelOf(if_instr->IfTrueSuccessor())); + if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfFalseSuccessor())) { + __ jmp(codegen_->GetLabelOf(if_instr->IfFalseSuccessor())); } } @@ -475,24 +505,74 @@ void LocationsBuilderX86::VisitStoreLocal(HStoreLocal* store) { void InstructionCodeGeneratorX86::VisitStoreLocal(HStoreLocal* store) { } -void LocationsBuilderX86::VisitEqual(HEqual* equal) { - LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(equal); +void LocationsBuilderX86::VisitCondition(HCondition* comp) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(comp); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::Any()); locations->SetOut(Location::SameAsFirstInput()); - equal->SetLocations(locations); + comp->SetLocations(locations); } -void InstructionCodeGeneratorX86::VisitEqual(HEqual* equal) { - LocationSummary* locations = equal->GetLocations(); - if (locations->InAt(1).IsRegister()) { - __ cmpl(locations->InAt(0).AsX86().AsCpuRegister(), - locations->InAt(1).AsX86().AsCpuRegister()); - } else { - __ cmpl(locations->InAt(0).AsX86().AsCpuRegister(), - Address(ESP, locations->InAt(1).GetStackIndex())); +void InstructionCodeGeneratorX86::VisitCondition(HCondition* comp) { + if (comp->NeedsMaterialization()) { + LocationSummary* locations = comp->GetLocations(); + if (locations->InAt(1).IsRegister()) { + __ cmpl(locations->InAt(0).AsX86().AsCpuRegister(), + locations->InAt(1).AsX86().AsCpuRegister()); + } else { + __ cmpl(locations->InAt(0).AsX86().AsCpuRegister(), + Address(ESP, locations->InAt(1).GetStackIndex())); + } + __ setb(X86Condition(comp->GetCondition()), locations->Out().AsX86().AsCpuRegister()); } - __ setb(kEqual, locations->Out().AsX86().AsCpuRegister()); +} + +void LocationsBuilderX86::VisitEqual(HEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86::VisitEqual(HEqual* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86::VisitNotEqual(HNotEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86::VisitNotEqual(HNotEqual* comp) { + VisitCondition(comp); +} + +void 
LocationsBuilderX86::VisitLessThan(HLessThan* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86::VisitLessThan(HLessThan* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86::VisitLessThanOrEqual(HLessThanOrEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86::VisitLessThanOrEqual(HLessThanOrEqual* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86::VisitGreaterThan(HGreaterThan* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86::VisitGreaterThan(HGreaterThan* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86::VisitGreaterThanOrEqual(HGreaterThanOrEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86::VisitGreaterThanOrEqual(HGreaterThanOrEqual* comp) { + VisitCondition(comp); } void LocationsBuilderX86::VisitIntConstant(HIntConstant* constant) { diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index ebeef9dfc1..283f1f5e57 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -35,6 +35,20 @@ x86_64::X86_64ManagedRegister Location::AsX86_64() const { namespace x86_64 { +inline Condition X86_64Condition(IfCondition cond) { + switch (cond) { + case kCondEQ: return kEqual; + case kCondNE: return kNotEqual; + case kCondLT: return kLess; + case kCondLE: return kLessEqual; + case kCondGT: return kGreater; + case kCondGE: return kGreaterEqual; + default: + LOG(FATAL) << "Unknown if condition"; + } + return kEqual; +} + // Some x86_64 instructions require a register to be available as temp. static constexpr Register TMP = R11; @@ -295,16 +309,32 @@ void InstructionCodeGeneratorX86_64::VisitExit(HExit* exit) { void LocationsBuilderX86_64::VisitIf(HIf* if_instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr); - locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(0, Location::Any()); if_instr->SetLocations(locations); } void InstructionCodeGeneratorX86_64::VisitIf(HIf* if_instr) { - // TODO: Generate the input as a condition, instead of materializing in a register. - __ cmpl(if_instr->GetLocations()->InAt(0).AsX86_64().AsCpuRegister(), Immediate(0)); - __ j(kEqual, codegen_->GetLabelOf(if_instr->IfFalseSuccessor())); - if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfTrueSuccessor())) { - __ jmp(codegen_->GetLabelOf(if_instr->IfTrueSuccessor())); + HInstruction* cond = if_instr->InputAt(0); + DCHECK(cond->IsCondition()); + HCondition* condition = cond->AsCondition(); + if (condition->NeedsMaterialization()) { + // Materialized condition, compare against 0. 
+ Location lhs = if_instr->GetLocations()->InAt(0); + if (lhs.IsRegister()) { + __ cmpl(lhs.AsX86_64().AsCpuRegister(), Immediate(0)); + } else { + __ cmpl(Address(CpuRegister(RSP), lhs.GetStackIndex()), Immediate(0)); + } + __ j(kEqual, codegen_->GetLabelOf(if_instr->IfTrueSuccessor())); + } else { + Location lhs = condition->GetLocations()->InAt(0); + Location rhs = condition->GetLocations()->InAt(1); + __ cmpl(lhs.AsX86_64().AsCpuRegister(), rhs.AsX86_64().AsCpuRegister()); + __ j(X86_64Condition(condition->GetCondition()), + codegen_->GetLabelOf(if_instr->IfTrueSuccessor())); + } + if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfFalseSuccessor())) { + __ jmp(codegen_->GetLabelOf(if_instr->IfFalseSuccessor())); } } @@ -349,18 +379,69 @@ void LocationsBuilderX86_64::VisitStoreLocal(HStoreLocal* store) { void InstructionCodeGeneratorX86_64::VisitStoreLocal(HStoreLocal* store) { } -void LocationsBuilderX86_64::VisitEqual(HEqual* equal) { - LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(equal); +void LocationsBuilderX86_64::VisitCondition(HCondition* comp) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(comp); locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RequiresRegister()); locations->SetOut(Location::SameAsFirstInput()); - equal->SetLocations(locations); + comp->SetLocations(locations); +} + +void InstructionCodeGeneratorX86_64::VisitCondition(HCondition* comp) { + if (comp->NeedsMaterialization()) { + __ cmpq(comp->GetLocations()->InAt(0).AsX86_64().AsCpuRegister(), + comp->GetLocations()->InAt(1).AsX86_64().AsCpuRegister()); + __ setcc(X86_64Condition(comp->GetCondition()), + comp->GetLocations()->Out().AsX86_64().AsCpuRegister()); + } +} + +void LocationsBuilderX86_64::VisitEqual(HEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86_64::VisitEqual(HEqual* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86_64::VisitNotEqual(HNotEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86_64::VisitNotEqual(HNotEqual* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86_64::VisitLessThan(HLessThan* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86_64::VisitLessThan(HLessThan* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86_64::VisitLessThanOrEqual(HLessThanOrEqual* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86_64::VisitLessThanOrEqual(HLessThanOrEqual* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86_64::VisitGreaterThan(HGreaterThan* comp) { + VisitCondition(comp); +} + +void InstructionCodeGeneratorX86_64::VisitGreaterThan(HGreaterThan* comp) { + VisitCondition(comp); +} + +void LocationsBuilderX86_64::VisitGreaterThanOrEqual(HGreaterThanOrEqual* comp) { + VisitCondition(comp); } -void InstructionCodeGeneratorX86_64::VisitEqual(HEqual* equal) { - __ cmpq(equal->GetLocations()->InAt(0).AsX86_64().AsCpuRegister(), - equal->GetLocations()->InAt(1).AsX86_64().AsCpuRegister()); - __ setcc(kEqual, equal->GetLocations()->Out().AsX86_64().AsCpuRegister()); +void InstructionCodeGeneratorX86_64::VisitGreaterThanOrEqual(HGreaterThanOrEqual* comp) { + VisitCondition(comp); } void LocationsBuilderX86_64::VisitIntConstant(HIntConstant* constant) { diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index c3baf1a7b7..fd534ced1f 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc 
@@ -51,7 +51,12 @@ class InternalCodeAllocator : public CodeAllocator { static void Run(const InternalCodeAllocator& allocator, bool has_result, int32_t expected) { typedef int32_t (*fptr)(); CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize()); - int32_t result = reinterpret_cast<fptr>(allocator.GetMemory())(); + fptr f = reinterpret_cast<fptr>(allocator.GetMemory()); +#if defined(__arm__) + // For thumb we need the bottom bit set. + f = reinterpret_cast<fptr>(reinterpret_cast<uintptr_t>(f) + 1); +#endif + int32_t result = f(); if (has_result) { CHECK_EQ(result, expected); } diff --git a/compiler/optimizing/graph_test.cc b/compiler/optimizing/graph_test.cc index 371478c9e7..c59f8366fa 100644 --- a/compiler/optimizing/graph_test.cc +++ b/compiler/optimizing/graph_test.cc @@ -30,7 +30,9 @@ static HBasicBlock* createIfBlock(HGraph* graph, ArenaAllocator* allocator) { graph->AddBlock(if_block); HInstruction* instr = new (allocator) HIntConstant(4); if_block->AddInstruction(instr); - instr = new (allocator) HIf(instr); + HInstruction* equal = new (allocator) HEqual(instr, instr); + if_block->AddInstruction(equal); + instr = new (allocator) HIf(equal); if_block->AddInstruction(instr); return if_block; } diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index 2a97fadbaf..490d345826 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -445,4 +445,23 @@ void HGraphVisitor::VisitBasicBlock(HBasicBlock* block) { } } + +bool HCondition::NeedsMaterialization() const { + if (!HasOnlyOneUse()) { + return true; + } + HUseListNode<HInstruction>* uses = GetUses(); + HInstruction* user = uses->GetUser(); + if (!user->IsIf()) { + return true; + } + + // TODO: should we allow intervening instructions with no side-effect between this condition + // and the If instruction? 
+ if (GetNext() != user) { + return true; + } + return false; +} + } // namespace art diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 143d5c9e6f..503f31d990 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -38,6 +38,15 @@ static const int kDefaultNumberOfSuccessors = 2; static const int kDefaultNumberOfPredecessors = 2; static const int kDefaultNumberOfBackEdges = 1; +enum IfCondition { + kCondEQ, + kCondNE, + kCondLT, + kCondLE, + kCondGT, + kCondGE, +}; + class HInstructionList { public: HInstructionList() : first_instruction_(nullptr), last_instruction_(nullptr) {} @@ -66,7 +75,7 @@ class HGraph : public ArenaObject { maximum_number_of_out_vregs_(0), number_of_vregs_(0), number_of_in_vregs_(0), - current_instruction_id_(0) { } + current_instruction_id_(0) {} ArenaAllocator* GetArena() const { return arena_; } const GrowableArray<HBasicBlock*>& GetBlocks() const { return blocks_; } @@ -381,7 +390,13 @@ class HBasicBlock : public ArenaObject { #define FOR_EACH_INSTRUCTION(M) \ M(Add) \ + M(Condition) \ M(Equal) \ + M(NotEqual) \ + M(LessThan) \ + M(LessThanOrEqual) \ + M(GreaterThan) \ + M(GreaterThanOrEqual) \ M(Exit) \ M(Goto) \ M(If) \ @@ -400,6 +415,7 @@ class HBasicBlock : public ArenaObject { M(StoreLocal) \ M(Sub) \ + #define FORWARD_DECLARATION(type) class H##type; FOR_EACH_INSTRUCTION(FORWARD_DECLARATION) #undef FORWARD_DECLARATION @@ -413,7 +429,7 @@ template <typename T> class HUseListNode : public ArenaObject { public: HUseListNode(T* user, size_t index, HUseListNode* tail) - : user_(user), index_(index), tail_(tail) { } + : user_(user), index_(index), tail_(tail) {} HUseListNode* GetTail() const { return tail_; } T* GetUser() const { return user_; } @@ -444,7 +460,7 @@ class HInstruction : public ArenaObject { live_interval_(nullptr), lifetime_position_(kNoLifetime) {} - virtual ~HInstruction() { } + virtual ~HInstruction() {} HInstruction* GetNext() const { return next_; } HInstruction* GetPrevious() const { return previous_; } @@ -507,6 +523,10 @@ class HInstruction : public ArenaObject { void ReplaceWith(HInstruction* instruction); + bool HasOnlyOneUse() const { + return uses_ != nullptr && uses_->GetTail() == nullptr; + } + #define INSTRUCTION_TYPE_CHECK(type) \ bool Is##type() { return (As##type() != nullptr); } \ virtual H##type* As##type() { return nullptr; } @@ -616,7 +636,7 @@ class HEnvironment : public ArenaObject { class HInputIterator : public ValueObject { public: - explicit HInputIterator(HInstruction* instruction) : instruction_(instruction), index_(0) { } + explicit HInputIterator(HInstruction* instruction) : instruction_(instruction), index_(0) {} bool Done() const { return index_ == instruction_->InputCount(); } HInstruction* Current() const { return instruction_->InputAt(index_); } @@ -676,7 +696,7 @@ class HBackwardInstructionIterator : public ValueObject { template<typename T, intptr_t N> class EmbeddedArray { public: - EmbeddedArray() : elements_() { } + EmbeddedArray() : elements_() {} intptr_t GetLength() const { return N; } @@ -721,8 +741,8 @@ class EmbeddedArray<T, 0> { template<intptr_t N> class HTemplateInstruction: public HInstruction { public: - HTemplateInstruction<N>() : inputs_() { } - virtual ~HTemplateInstruction() { } + HTemplateInstruction<N>() : inputs_() {} + virtual ~HTemplateInstruction() {} virtual size_t InputCount() const { return N; } virtual HInstruction* InputAt(size_t i) const { return inputs_[i]; } @@ -738,6 +758,18 @@ class HTemplateInstruction: public HInstruction { 
friend class SsaBuilder; }; +template<intptr_t N> +class HExpression: public HTemplateInstruction<N> { + public: + explicit HExpression<N>(Primitive::Type type) : type_(type) {} + virtual ~HExpression() {} + + virtual Primitive::Type GetType() const { return type_; } + + private: + const Primitive::Type type_; +}; + // Represents dex's RETURN_VOID opcode. A HReturnVoid is a control flow // instruction that branches to the exit block. class HReturnVoid : public HTemplateInstruction<0> { @@ -800,6 +832,7 @@ class HGoto : public HTemplateInstruction<0> { DISALLOW_COPY_AND_ASSIGN(HGoto); }; + // Conditional branch. A block ending with an HIf instruction must have // two successors. class HIf : public HTemplateInstruction<1> { @@ -820,53 +853,143 @@ class HIf : public HTemplateInstruction<1> { DECLARE_INSTRUCTION(If); + virtual bool IsIfInstruction() const { return true; } + private: DISALLOW_COPY_AND_ASSIGN(HIf); }; -class HBinaryOperation : public HTemplateInstruction<2> { +class HBinaryOperation : public HExpression<2> { public: HBinaryOperation(Primitive::Type result_type, HInstruction* left, - HInstruction* right) : result_type_(result_type) { + HInstruction* right) : HExpression(result_type) { SetRawInputAt(0, left); SetRawInputAt(1, right); } HInstruction* GetLeft() const { return InputAt(0); } HInstruction* GetRight() const { return InputAt(1); } - Primitive::Type GetResultType() const { return result_type_; } + Primitive::Type GetResultType() const { return GetType(); } virtual bool IsCommutative() { return false; } - virtual Primitive::Type GetType() const { return GetResultType(); } private: - const Primitive::Type result_type_; - DISALLOW_COPY_AND_ASSIGN(HBinaryOperation); }; - -// Instruction to check if two inputs are equal to each other. -class HEqual : public HBinaryOperation { +class HCondition : public HBinaryOperation { public: - HEqual(HInstruction* first, HInstruction* second) + HCondition(HInstruction* first, HInstruction* second) : HBinaryOperation(Primitive::kPrimBoolean, first, second) {} virtual bool IsCommutative() { return true; } + bool NeedsMaterialization() const; - virtual Primitive::Type GetType() const { return Primitive::kPrimBoolean; } + DECLARE_INSTRUCTION(Condition); + + virtual IfCondition GetCondition() const = 0; + + private: + DISALLOW_COPY_AND_ASSIGN(HCondition); +}; + +// Instruction to check if two inputs are equal to each other. 
+class HEqual : public HCondition { + public: + HEqual(HInstruction* first, HInstruction* second) + : HCondition(first, second) {} DECLARE_INSTRUCTION(Equal); + virtual IfCondition GetCondition() const { + return kCondEQ; + } + private: DISALLOW_COPY_AND_ASSIGN(HEqual); }; +class HNotEqual : public HCondition { + public: + HNotEqual(HInstruction* first, HInstruction* second) + : HCondition(first, second) {} + + DECLARE_INSTRUCTION(NotEqual); + + virtual IfCondition GetCondition() const { + return kCondNE; + } + + private: + DISALLOW_COPY_AND_ASSIGN(HNotEqual); +}; + +class HLessThan : public HCondition { + public: + HLessThan(HInstruction* first, HInstruction* second) + : HCondition(first, second) {} + + DECLARE_INSTRUCTION(LessThan); + + virtual IfCondition GetCondition() const { + return kCondLT; + } + + private: + DISALLOW_COPY_AND_ASSIGN(HLessThan); +}; + +class HLessThanOrEqual : public HCondition { + public: + HLessThanOrEqual(HInstruction* first, HInstruction* second) + : HCondition(first, second) {} + + DECLARE_INSTRUCTION(LessThanOrEqual); + + virtual IfCondition GetCondition() const { + return kCondLE; + } + + private: + DISALLOW_COPY_AND_ASSIGN(HLessThanOrEqual); +}; + +class HGreaterThan : public HCondition { + public: + HGreaterThan(HInstruction* first, HInstruction* second) + : HCondition(first, second) {} + + DECLARE_INSTRUCTION(GreaterThan); + + virtual IfCondition GetCondition() const { + return kCondGT; + } + + private: + DISALLOW_COPY_AND_ASSIGN(HGreaterThan); +}; + +class HGreaterThanOrEqual : public HCondition { + public: + HGreaterThanOrEqual(HInstruction* first, HInstruction* second) + : HCondition(first, second) {} + + DECLARE_INSTRUCTION(GreaterThanOrEqual); + + virtual IfCondition GetCondition() const { + return kCondGE; + } + + private: + DISALLOW_COPY_AND_ASSIGN(HGreaterThanOrEqual); +}; + + // A local in the graph. Corresponds to a Dex register. class HLocal : public HTemplateInstruction<0> { public: - explicit HLocal(uint16_t reg_number) : reg_number_(reg_number) { } + explicit HLocal(uint16_t reg_number) : reg_number_(reg_number) {} DECLARE_INSTRUCTION(Local); @@ -880,21 +1003,17 @@ class HLocal : public HTemplateInstruction<0> { }; // Load a given local. The local is an input of this instruction. -class HLoadLocal : public HTemplateInstruction<1> { +class HLoadLocal : public HExpression<1> { public: - explicit HLoadLocal(HLocal* local, Primitive::Type type) : type_(type) { + explicit HLoadLocal(HLocal* local, Primitive::Type type) : HExpression(type) { SetRawInputAt(0, local); } - virtual Primitive::Type GetType() const { return type_; } - HLocal* GetLocal() const { return reinterpret_cast<HLocal*>(InputAt(0)); } DECLARE_INSTRUCTION(LoadLocal); private: - const Primitive::Type type_; - DISALLOW_COPY_AND_ASSIGN(HLoadLocal); }; @@ -917,12 +1036,11 @@ class HStoreLocal : public HTemplateInstruction<2> { // Constants of the type int. Those can be from Dex instructions, or // synthesized (for example with the if-eqz instruction). 
-class HIntConstant : public HTemplateInstruction<0> { +class HIntConstant : public HExpression<0> { public: - explicit HIntConstant(int32_t value) : value_(value) { } + explicit HIntConstant(int32_t value) : HExpression(Primitive::kPrimInt), value_(value) {} int32_t GetValue() const { return value_; } - virtual Primitive::Type GetType() const { return Primitive::kPrimInt; } DECLARE_INSTRUCTION(IntConstant); @@ -932,9 +1050,9 @@ class HIntConstant : public HTemplateInstruction<0> { DISALLOW_COPY_AND_ASSIGN(HIntConstant); }; -class HLongConstant : public HTemplateInstruction<0> { +class HLongConstant : public HExpression<0> { public: - explicit HLongConstant(int64_t value) : value_(value) { } + explicit HLongConstant(int64_t value) : HExpression(Primitive::kPrimLong), value_(value) {} int64_t GetValue() const { return value_; } @@ -1008,15 +1126,14 @@ class HInvokeStatic : public HInvoke { DISALLOW_COPY_AND_ASSIGN(HInvokeStatic); }; -class HNewInstance : public HTemplateInstruction<0> { +class HNewInstance : public HExpression<0> { public: - HNewInstance(uint32_t dex_pc, uint16_t type_index) : dex_pc_(dex_pc), type_index_(type_index) {} + HNewInstance(uint32_t dex_pc, uint16_t type_index) : HExpression(Primitive::kPrimNot), + dex_pc_(dex_pc), type_index_(type_index) {} uint32_t GetDexPc() const { return dex_pc_; } uint16_t GetTypeIndex() const { return type_index_; } - virtual Primitive::Type GetType() const { return Primitive::kPrimNot; } - // Calls runtime so needs an environment. virtual bool NeedsEnvironment() const { return true; } @@ -1057,15 +1174,13 @@ class HSub : public HBinaryOperation { // The value of a parameter in this method. Its location depends on // the calling convention. -class HParameterValue : public HTemplateInstruction<0> { +class HParameterValue : public HExpression<0> { public: HParameterValue(uint8_t index, Primitive::Type parameter_type) - : index_(index), parameter_type_(parameter_type) {} + : HExpression(parameter_type), index_(index) {} uint8_t GetIndex() const { return index_; } - virtual Primitive::Type GetType() const { return parameter_type_; } - DECLARE_INSTRUCTION(ParameterValue); private: @@ -1073,19 +1188,15 @@ class HParameterValue : public HTemplateInstruction<0> { // than HGraph::number_of_in_vregs_; const uint8_t index_; - const Primitive::Type parameter_type_; - DISALLOW_COPY_AND_ASSIGN(HParameterValue); }; -class HNot : public HTemplateInstruction<1> { +class HNot : public HExpression<1> { public: - explicit HNot(HInstruction* input) { + explicit HNot(HInstruction* input) : HExpression(Primitive::kPrimBoolean) { SetRawInputAt(0, input); } - virtual Primitive::Type GetType() const { return Primitive::kPrimBoolean; } - DECLARE_INSTRUCTION(Not); private: @@ -1210,10 +1321,10 @@ class HParallelMove : public HTemplateInstruction<0> { class HGraphVisitor : public ValueObject { public: - explicit HGraphVisitor(HGraph* graph) : graph_(graph) { } - virtual ~HGraphVisitor() { } + explicit HGraphVisitor(HGraph* graph) : graph_(graph) {} + virtual ~HGraphVisitor() {} - virtual void VisitInstruction(HInstruction* instruction) { } + virtual void VisitInstruction(HInstruction* instruction) {} virtual void VisitBasicBlock(HBasicBlock* block); void VisitInsertionOrder(); diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index ccacbef401..56029aa30b 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -101,10 +101,6 @@ CompiledMethod* 
OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_ite } InstructionSet instruction_set = GetCompilerDriver()->GetInstructionSet(); - // The optimizing compiler currently does not have a Thumb2 assembler. - if (instruction_set == kThumb2) { - instruction_set = kArm; - } CodeGenerator* codegen = CodeGenerator::Create(&arena, graph, instruction_set); if (codegen == nullptr) { if (shouldCompile) { diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc index 348e9d4921..1f4cb41582 100644 --- a/compiler/optimizing/register_allocator.cc +++ b/compiler/optimizing/register_allocator.cc @@ -100,6 +100,9 @@ void RegisterAllocator::BlockRegister(Location location, interval->AddRange(start, end); } +// TODO: make the register allocator understand instructions like HCondition +// that may not need to be materialized. It doesn't need to allocate any +// registers for it. void RegisterAllocator::AllocateRegistersInternal() { number_of_registers_ = processing_core_registers_ ? codegen_->GetNumberOfCoreRegisters() diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc index ac84d6ae40..d5225c1f73 100644 --- a/compiler/trampolines/trampoline_compiler.cc +++ b/compiler/trampolines/trampoline_compiler.cc @@ -30,11 +30,7 @@ namespace art { namespace arm { static const std::vector<uint8_t>* CreateTrampoline(EntryPointCallingConvention abi, ThreadOffset<4> offset) { - // NOTE: the assembler used here is ARM, not Thumb. This is because the address - // returned by this function is a pointer and for thumb we would have to set the - // bottom bit. It doesn't matter since the instructions generated are the same - // size anyway. - std::unique_ptr<ArmAssembler> assembler(static_cast<ArmAssembler*>(Assembler::Create(kArm))); + std::unique_ptr<ArmAssembler> assembler(static_cast<ArmAssembler*>(Assembler::Create(kThumb2))); switch (abi) { case kInterpreterAbi: // Thread* is first argument (R0) in interpreter ABI. diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc index 703d68e0b3..92a9f533ea 100644 --- a/compiler/utils/arm/assembler_thumb2.cc +++ b/compiler/utils/arm/assembler_thumb2.cc @@ -329,7 +329,7 @@ void Thumb2Assembler::ldm(BlockAddressMode am, ++reg; } CHECK_LT(reg, 16); - CHECK(am == DB_W); // Only writeback is supported. + CHECK(am == IA_W); // Only writeback is supported. ldr(static_cast<Register>(reg), Address(base, kRegisterSize, Address::PostIndex), cond); } else { EmitMultiMemOp(cond, am, true, base, regs); @@ -352,8 +352,8 @@ void Thumb2Assembler::stm(BlockAddressMode am, ++reg; } CHECK_LT(reg, 16); - CHECK(am == IA || am == IA_W); - Address::Mode strmode = am == IA ? Address::PreIndex : Address::Offset; + CHECK(am == DB || am == DB_W); + Address::Mode strmode = am == DB_W ? Address::PreIndex : Address::Offset; str(static_cast<Register>(reg), Address(base, -kRegisterSize, strmode), cond); } else { EmitMultiMemOp(cond, am, false, base, regs); @@ -642,6 +642,7 @@ bool Thumb2Assembler::Is32BitDataProcessing(Condition cond, if (imm > (1 << 9)) { // 9 bit immediate. return true; } + return false; // 16 bit good. } else if (opcode == ADD && rd != SP && rn == SP) { // 10 bit immediate. 
if (imm > (1 << 10)) { return true; diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc index a793513191..2d5514063f 100644 --- a/dex2oat/dex2oat.cc +++ b/dex2oat/dex2oat.cc @@ -367,12 +367,12 @@ class Dex2Oat { driver->CompileAll(class_loader, dex_files, &timings); - timings.NewSplit("dex2oat OatWriter"); + TimingLogger::ScopedTiming t2("dex2oat OatWriter", &timings); std::string image_file_location; uint32_t image_file_location_oat_checksum = 0; uintptr_t image_file_location_oat_data_begin = 0; if (!driver->IsImage()) { - TimingLogger::ScopedSplit split("Loading image checksum", &timings); + TimingLogger::ScopedTiming t3("Loading image checksum", &timings); gc::space::ImageSpace* image_space = Runtime::Current()->GetHeap()->GetImageSpace(); image_file_location_oat_checksum = image_space->GetImageHeader().GetOatChecksum(); image_file_location_oat_data_begin = @@ -380,14 +380,13 @@ class Dex2Oat { image_file_location = image_space->GetImageFilename(); } - OatWriter oat_writer(dex_files, - image_file_location_oat_checksum, + OatWriter oat_writer(dex_files, image_file_location_oat_checksum, image_file_location_oat_data_begin, image_file_location, driver.get(), &timings); - TimingLogger::ScopedSplit split("Writing ELF", &timings); + t2.NewTiming("Writing ELF"); if (!driver->WriteElf(android_root, is_host, dex_files, &oat_writer, oat_file)) { LOG(ERROR) << "Failed to write ELF file " << oat_file->GetPath(); return nullptr; @@ -748,6 +747,7 @@ void CheckExplicitCheckOptions(InstructionSet isa, bool* explicit_null_checks, bool* explicit_so_checks, bool* explicit_suspend_checks) { switch (isa) { case kArm: + case kThumb2: break; // All checks implemented, leave as is. default: // No checks implemented, reset all to explicit checks. @@ -1039,8 +1039,8 @@ static int dex2oat(int argc, char** argv) { } else { Usage("--implicit-checks passed non-recognized value %s", val.c_str()); } - has_explicit_checks_options = true; } + has_explicit_checks_options = true; } else { Usage("Unknown argument %s", option.data()); } @@ -1076,7 +1076,7 @@ static int dex2oat(int argc, char** argv) { bool image = (!image_filename.empty()); if (!image && boot_image_filename.empty()) { - boot_image_filename += GetAndroidRoot(); + boot_image_filename += android_root; boot_image_filename += "/framework/boot.art"; } std::string boot_image_option; @@ -1170,6 +1170,7 @@ static int dex2oat(int argc, char** argv) { CheckExplicitCheckOptions(instruction_set, &explicit_null_checks, &explicit_so_checks, &explicit_suspend_checks); + LOG(INFO) << "init compiler options for explicit null: " << explicit_null_checks; CompilerOptions compiler_options(compiler_filter, huge_method_threshold, large_method_threshold, @@ -1211,7 +1212,7 @@ static int dex2oat(int argc, char** argv) { return EXIT_FAILURE; } - timings.StartSplit("dex2oat Setup"); + timings.StartTiming("dex2oat Setup"); LOG(INFO) << CommandLine(); Runtime::Options runtime_options; @@ -1256,7 +1257,17 @@ static int dex2oat(int argc, char** argv) { // TODO: Not sure whether it's a good idea to allow anything else but the runtime option in // this case at all, as we'll have to throw away produced code for a mismatch. 
if (!has_explicit_checks_options) { - if (instruction_set == kRuntimeISA) { + bool cross_compiling = true; + switch (kRuntimeISA) { + case kArm: + case kThumb2: + cross_compiling = instruction_set != kArm && instruction_set != kThumb2; + break; + default: + cross_compiling = instruction_set != kRuntimeISA; + break; + } + if (!cross_compiling) { Runtime* runtime = Runtime::Current(); compiler_options.SetExplicitNullChecks(runtime->ExplicitNullChecks()); compiler_options.SetExplicitStackOverflowChecks(runtime->ExplicitStackOverflowChecks()); @@ -1436,7 +1447,7 @@ static int dex2oat(int argc, char** argv) { // Elf32_Phdr.p_vaddr values by the desired base address. // if (image) { - timings.NewSplit("dex2oat ImageWriter"); + TimingLogger::ScopedTiming t("dex2oat ImageWriter", &timings); bool image_creation_success = dex2oat->CreateImageFile(image_filename, image_base, oat_unstripped, @@ -1449,6 +1460,7 @@ static int dex2oat(int argc, char** argv) { } if (is_host) { + timings.EndTiming(); if (dump_timing || (dump_slow_timing && timings.GetTotalNs() > MsToNs(1000))) { LOG(INFO) << Dumpable<TimingLogger>(timings); } @@ -1461,7 +1473,7 @@ static int dex2oat(int argc, char** argv) { // If we don't want to strip in place, copy from unstripped location to stripped location. // We need to strip after image creation because FixupElf needs to use .strtab. if (oat_unstripped != oat_stripped) { - timings.NewSplit("dex2oat OatFile copy"); + TimingLogger::ScopedTiming t("dex2oat OatFile copy", &timings); oat_file.reset(); std::unique_ptr<File> in(OS::OpenFileForReading(oat_unstripped.c_str())); std::unique_ptr<File> out(OS::CreateEmptyFile(oat_stripped.c_str())); @@ -1496,7 +1508,7 @@ static int dex2oat(int argc, char** argv) { } #endif // ART_USE_PORTABLE_COMPILER - timings.EndSplit(); + timings.EndTiming(); if (dump_timing || (dump_slow_timing && timings.GetTotalNs() > MsToNs(1000))) { LOG(INFO) << Dumpable<TimingLogger>(timings); diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc index 4e4a512713..1f565e504a 100644 --- a/disassembler/disassembler_arm.cc +++ b/disassembler/disassembler_arm.cc @@ -269,18 +269,34 @@ void DisassemblerArm::DumpArm(std::ostream& os, const uint8_t* instr_ptr) { uint32_t op = (instruction >> 21) & 0xf; opcode = kDataProcessingOperations[op]; bool implicit_s = ((op & ~3) == 8); // TST, TEQ, CMP, and CMN. - if (implicit_s) { - // Rd is unused (and not shown), and we don't show the 's' suffix either. - } else { + bool is_mov = op == 0b1101 || op == 0b1111; + if (is_mov) { + // Show only Rd and Rm. if (s) { - suffixes += 's'; - } - args << ArmRegister(instruction, 12) << ", "; - } - if (i) { - args << ArmRegister(instruction, 16) << ", " << ShiftedImmediate(instruction); + suffixes += 's'; + } + args << ArmRegister(instruction, 12) << ", "; + if (i) { + args << ShiftedImmediate(instruction); + } else { + // TODO: Shifted register. + args << ArmRegister(instruction, 16) << ", " << ArmRegister(instruction, 0); + } } else { - args << Rm(instruction); + if (implicit_s) { + // Rd is unused (and not shown), and we don't show the 's' suffix either. + } else { + if (s) { + suffixes += 's'; + } + args << ArmRegister(instruction, 12) << ", "; + } + if (i) { + args << ArmRegister(instruction, 16) << ", " << ShiftedImmediate(instruction); + } else { + // TODO: Shifted register. 
+ args << ArmRegister(instruction, 16) << ", " << ArmRegister(instruction, 0); + } } } break; @@ -1291,7 +1307,7 @@ size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr) int32_t imm32 = (imm8 << 24) >> 24; // sign-extend imm8 if (Rn.r == 13 && P == 1 && U == 0 && W == 1 && imm32 == 4) { opcode << "push"; - args << Rt; + args << "{" << Rt << "}"; } else if (Rn.r == 15 || (P == 0 && W == 0)) { opcode << "UNDEFINED"; } else { @@ -1443,10 +1459,33 @@ size_t DisassemblerArm::DumpThumb32(std::ostream& os, const uint8_t* instr_ptr) } args << "]"; } else { - // LDRT Rt, [Rn, #imm8] - 111 11 00 00 101 nnnn tttt 1110iiiiiiii - uint32_t imm8 = instr & 0xFF; - opcode << "ldrt"; - args << Rt << ", [" << Rn << ", #" << imm8 << "]"; + bool p = (instr & (1 << 10)) != 0; + bool w = (instr & (1 << 8)) != 0; + bool u = (instr & (1 << 9)) != 0; + if (p && u && !w) { + // LDRT Rt, [Rn, #imm8] - 111 11 00 00 101 nnnn tttt 1110iiiiiiii + uint32_t imm8 = instr & 0xFF; + opcode << "ldrt"; + args << Rt << ", [" << Rn << ", #" << imm8 << "]"; + } else if (Rn.r == 13 && !p && u && w && (instr & 0xff) == 4) { + // POP + opcode << "pop"; + args << "{" << Rt << "}"; + } else { + bool wback = !p || w; + uint32_t offset = (instr & 0xff); + opcode << "ldr.w"; + args << Rt << ","; + if (p && !wback) { + args << "[" << Rn << ", #" << offset << "]"; + } else if (p && wback) { + args << "[" << Rn << ", #" << offset << "]!"; + } else if (!p && wback) { + args << "[" << Rn << "], #" << offset; + } else { + LOG(FATAL) << p << " " << w; + } + } } break; } diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc index e6a6860626..b012bc1cc1 100644 --- a/disassembler/disassembler_x86.cc +++ b/disassembler/disassembler_x86.cc @@ -56,10 +56,16 @@ static const char* gReg64Names[] = { "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" }; +// 64-bit opcode REX modifier. +constexpr uint8_t REX_W = 0b1000; +constexpr uint8_t REX_R = 0b0100; +constexpr uint8_t REX_X = 0b0010; +constexpr uint8_t REX_B = 0b0001; + static void DumpReg0(std::ostream& os, uint8_t rex, size_t reg, bool byte_operand, uint8_t size_override) { DCHECK_LT(reg, (rex == 0) ? 8u : 16u); - bool rex_w = (rex & 0b1000) != 0; + bool rex_w = (rex & REX_W) != 0; if (byte_operand) { os << ((rex == 0) ? gReg8Names[reg] : gExtReg8Names[reg]); } else if (rex_w) { @@ -86,14 +92,14 @@ static void DumpAnyReg(std::ostream& os, uint8_t rex, size_t reg, static void DumpReg(std::ostream& os, uint8_t rex, uint8_t reg, bool byte_operand, uint8_t size_override, RegFile reg_file) { - bool rex_r = (rex & 0b0100) != 0; + bool rex_r = (rex & REX_R) != 0; size_t reg_num = rex_r ? (reg + 8) : reg; DumpAnyReg(os, rex, reg_num, byte_operand, size_override, reg_file); } static void DumpRmReg(std::ostream& os, uint8_t rex, uint8_t reg, bool byte_operand, uint8_t size_override, RegFile reg_file) { - bool rex_b = (rex & 0b0001) != 0; + bool rex_b = (rex & REX_B) != 0; size_t reg_num = rex_b ? (reg + 8) : reg; DumpAnyReg(os, rex, reg_num, byte_operand, size_override, reg_file); } @@ -107,19 +113,19 @@ static void DumpAddrReg(std::ostream& os, uint8_t rex, uint8_t reg) { } static void DumpBaseReg(std::ostream& os, uint8_t rex, uint8_t reg) { - bool rex_b = (rex & 0b0001) != 0; + bool rex_b = (rex & REX_B) != 0; size_t reg_num = rex_b ? (reg + 8) : reg; DumpAddrReg(os, rex, reg_num); } static void DumpIndexReg(std::ostream& os, uint8_t rex, uint8_t reg) { - bool rex_x = (rex & 0b0010) != 0; + bool rex_x = (rex & REX_X) != 0; uint8_t reg_num = rex_x ? 
(reg + 8) : reg; DumpAddrReg(os, rex, reg_num); } static void DumpOpcodeReg(std::ostream& os, uint8_t rex, uint8_t reg) { - bool rex_b = (rex & 0b0001) != 0; + bool rex_b = (rex & REX_B) != 0; size_t reg_num = rex_b ? (reg + 8) : reg; DumpReg0(os, rex, reg_num, false, 0); } @@ -896,6 +902,7 @@ DISASSEMBLER_ENTRY(cmp, case 0xB0: case 0xB1: case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6: case 0xB7: opcode << "mov"; immediate_bytes = 1; + byte_operand = true; reg_in_opcode = true; break; case 0xB8: case 0xB9: case 0xBA: case 0xBB: case 0xBC: case 0xBD: case 0xBE: case 0xBF: @@ -916,6 +923,15 @@ DISASSEMBLER_ENTRY(cmp, byte_operand = (*instr == 0xC0); break; case 0xC3: opcode << "ret"; break; + case 0xC6: + static const char* c6_opcodes[] = {"mov", "unknown-c6", "unknown-c6", "unknown-c6", "unknown-c6", "unknown-c6", "unknown-c6", "unknown-c6"}; + modrm_opcodes = c6_opcodes; + store = true; + immediate_bytes = 1; + has_modrm = true; + reg_is_opcode = true; + byte_operand = true; + break; case 0xC7: static const char* c7_opcodes[] = {"mov", "unknown-c7", "unknown-c7", "unknown-c7", "unknown-c7", "unknown-c7", "unknown-c7", "unknown-c7"}; modrm_opcodes = c7_opcodes; @@ -1064,6 +1080,16 @@ DISASSEMBLER_ENTRY(cmp, if (reg_is_opcode && modrm_opcodes != NULL) { opcode << modrm_opcodes[reg_or_opcode]; } + + // Add opcode suffixes to indicate size. + if (byte_operand) { + opcode << 'b'; + } else if ((rex & REX_W) != 0) { + opcode << 'q'; + } else if (prefix[2] == 0x66) { + opcode << 'w'; + } + if (load) { if (!reg_is_opcode) { DumpReg(args, rex, reg_or_opcode, byte_operand, prefix[2], dst_reg_file); diff --git a/runtime/base/histogram-inl.h b/runtime/base/histogram-inl.h index 7c0999992e..4c18ce405c 100644 --- a/runtime/base/histogram-inl.h +++ b/runtime/base/histogram-inl.h @@ -164,18 +164,18 @@ inline void Histogram<Value>::PrintBins(std::ostream& os, const CumulativeData& template <class Value> inline void Histogram<Value>::PrintConfidenceIntervals(std::ostream &os, double interval, const CumulativeData& data) const { + static constexpr size_t kFractionalDigits = 3; DCHECK_GT(interval, 0); DCHECK_LT(interval, 1.0); - - double per_0 = (1.0 - interval) / 2.0; - double per_1 = per_0 + interval; - TimeUnit unit = GetAppropriateTimeUnit(Mean() * kAdjust); - os << Name() << ":\tSum: "; - os << PrettyDuration(Sum() * kAdjust) << " "; - os << (interval * 100) << "% C.I. " << FormatDuration(Percentile(per_0, data) * kAdjust, unit); - os << "-" << FormatDuration(Percentile(per_1, data) * kAdjust, unit) << " "; - os << "Avg: " << FormatDuration(Mean() * kAdjust, unit) << " Max: "; - os << FormatDuration(Max() * kAdjust, unit) << "\n"; + const double per_0 = (1.0 - interval) / 2.0; + const double per_1 = per_0 + interval; + const TimeUnit unit = GetAppropriateTimeUnit(Mean() * kAdjust); + os << Name() << ":\tSum: " << PrettyDuration(Sum() * kAdjust) << " " + << (interval * 100) << "% C.I. 
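// Sketch (assumption, for illustration; relies on the REX_* constants defined above and
// <cstdint>): how the named REX bits widen a 3-bit ModRM field onto the 16 x86-64 registers,
// and how the new suffix logic picks an AT&T-style size letter. REX_B extends rm/base/opcode
// registers, REX_R the reg field, REX_X the SIB index, and REX_W selects 64-bit operands.
static size_t ExtendWithRex(uint8_t rex, uint8_t modrm_field, uint8_t rex_bit) {
  return ((rex & rex_bit) != 0) ? (modrm_field + 8) : modrm_field;  // e.g. field 1 + REX_R -> r9
}
static char SizeSuffix(bool byte_operand, uint8_t rex, uint8_t operand_size_prefix) {
  if (byte_operand) return 'b';                 // 8-bit operand, e.g. movb
  if ((rex & REX_W) != 0) return 'q';           // 64-bit operand, e.g. movq
  if (operand_size_prefix == 0x66) return 'w';  // 16-bit operand, e.g. movw
  return 'l';  // default 32-bit case; the disassembler above simply prints no suffix here
}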
" << FormatDuration(Percentile(per_0, data) * kAdjust, unit, + kFractionalDigits) + << "-" << FormatDuration(Percentile(per_1, data) * kAdjust, unit, kFractionalDigits) << " " + << "Avg: " << FormatDuration(Mean() * kAdjust, unit, kFractionalDigits) << " Max: " + << FormatDuration(Max() * kAdjust, unit, kFractionalDigits) << "\n"; } template <class Value> diff --git a/runtime/base/timing_logger.cc b/runtime/base/timing_logger.cc index a1550028e9..b6a2aaf33b 100644 --- a/runtime/base/timing_logger.cc +++ b/runtime/base/timing_logger.cc @@ -33,6 +33,8 @@ namespace art { constexpr size_t CumulativeLogger::kLowMemoryBucketCount; constexpr size_t CumulativeLogger::kDefaultBucketCount; +constexpr size_t TimingLogger::kIndexNotFound; + CumulativeLogger::CumulativeLogger(const std::string& name) : name_(name), lock_name_("CumulativeLoggerLock" + name), @@ -66,10 +68,12 @@ void CumulativeLogger::Reset() { void CumulativeLogger::AddLogger(const TimingLogger &logger) { MutexLock mu(Thread::Current(), lock_); - for (const TimingLogger::SplitTiming& split : logger.GetSplits()) { - uint64_t split_time = split.first; - const char* split_name = split.second; - AddPair(split_name, split_time); + TimingLogger::TimingData timing_data(logger.CalculateTimingData()); + const std::vector<TimingLogger::Timing>& timings = logger.GetTimings(); + for (size_t i = 0; i < timings.size(); ++i) { + if (timings[i].IsStartTiming()) { + AddPair(timings[i].GetName(), timing_data.GetExclusiveTime(i)); + } } ++iterations_; } @@ -124,166 +128,125 @@ void CumulativeLogger::DumpHistogram(std::ostream &os) const { } TimingLogger::TimingLogger(const char* name, bool precise, bool verbose) - : name_(name), precise_(precise), verbose_(verbose), current_split_(NULL) { + : name_(name), precise_(precise), verbose_(verbose) { } void TimingLogger::Reset() { - current_split_ = NULL; - splits_.clear(); + timings_.clear(); } -void TimingLogger::StartSplit(const char* new_split_label) { - DCHECK(new_split_label != nullptr) << "Starting split with null label."; - TimingLogger::ScopedSplit* explicit_scoped_split = - new TimingLogger::ScopedSplit(new_split_label, this); - explicit_scoped_split->explicit_ = true; +void TimingLogger::StartTiming(const char* label) { + DCHECK(label != nullptr); + timings_.push_back(Timing(NanoTime(), label)); + ATRACE_BEGIN(label); } -void TimingLogger::EndSplit() { - CHECK(current_split_ != nullptr) << "Ending a non-existent split."; - DCHECK(current_split_->label_ != nullptr); - DCHECK(current_split_->explicit_ == true) - << "Explicitly ending scoped split: " << current_split_->label_; - delete current_split_; - // TODO: current_split_ = nullptr; +void TimingLogger::EndTiming() { + timings_.push_back(Timing(NanoTime(), nullptr)); + ATRACE_END(); } -// Ends the current split and starts the one given by the label. 
-void TimingLogger::NewSplit(const char* new_split_label) { - if (current_split_ == nullptr) { - StartSplit(new_split_label); - } else { - DCHECK(new_split_label != nullptr) << "New split (" << new_split_label << ") with null label."; - current_split_->TailInsertSplit(new_split_label); +uint64_t TimingLogger::GetTotalNs() const { + if (timings_.size() < 2) { + return 0; } + return timings_.back().GetTime() - timings_.front().GetTime(); } -uint64_t TimingLogger::GetTotalNs() const { - uint64_t total_ns = 0; - for (const TimingLogger::SplitTiming& split : splits_) { - total_ns += split.first; +size_t TimingLogger::FindTimingIndex(const char* name, size_t start_idx) const { + DCHECK_LT(start_idx, timings_.size()); + for (size_t i = start_idx; i < timings_.size(); ++i) { + if (timings_[i].IsStartTiming() && strcmp(timings_[i].GetName(), name) == 0) { + return i; + } } - return total_ns; + return kIndexNotFound; +} + +TimingLogger::TimingData TimingLogger::CalculateTimingData() const { + TimingLogger::TimingData ret; + ret.data_.resize(timings_.size()); + std::vector<size_t> open_stack; + for (size_t i = 0; i < timings_.size(); ++i) { + if (timings_[i].IsEndTiming()) { + CHECK(!open_stack.empty()) << "No starting split for ending split at index " << i; + size_t open_idx = open_stack.back(); + uint64_t time = timings_[i].GetTime() - timings_[open_idx].GetTime(); + ret.data_[open_idx].exclusive_time += time; + DCHECK_EQ(ret.data_[open_idx].total_time, 0U); + ret.data_[open_idx].total_time += time; + // Each open split has exactly one end. + open_stack.pop_back(); + // If there is a parent node, subtract from the exclusive time. + if (!open_stack.empty()) { + // Note this may go negative, but will work due to 2s complement when we add the value + // total time value later. + ret.data_[open_stack.back()].exclusive_time -= time; + } + } else { + open_stack.push_back(i); + } + } + CHECK(open_stack.empty()) << "Missing ending for timing " + << timings_[open_stack.back()].GetName() << " at index " << open_stack.back(); + return ret; // No need to fear, C++11 move semantics are here. } -void TimingLogger::Dump(std::ostream &os) const { +void TimingLogger::Dump(std::ostream &os, const char* indent_string) const { + static constexpr size_t kFractionalDigits = 3; + TimingLogger::TimingData timing_data(CalculateTimingData()); uint64_t longest_split = 0; - uint64_t total_ns = 0; - for (const SplitTiming& split : splits_) { - uint64_t split_time = split.first; - longest_split = std::max(longest_split, split_time); - total_ns += split_time; + for (size_t i = 0; i < timings_.size(); ++i) { + longest_split = std::max(longest_split, timing_data.GetTotalTime(i)); } // Compute which type of unit we will use for printing the timings. TimeUnit tu = GetAppropriateTimeUnit(longest_split); uint64_t divisor = GetNsToTimeUnitDivisor(tu); + uint64_t mod_fraction = divisor >= 1000 ? divisor / 1000 : 1; // Print formatted splits. - for (const SplitTiming& split : splits_) { - uint64_t split_time = split.first; - if (!precise_ && divisor >= 1000) { - // Make the fractional part 0. - split_time -= split_time % (divisor / 1000); + size_t tab_count = 1; + os << name_ << " [Exclusive time] [Total time]\n"; + for (size_t i = 0; i < timings_.size(); ++i) { + if (timings_[i].IsStartTiming()) { + uint64_t exclusive_time = timing_data.GetExclusiveTime(i); + uint64_t total_time = timing_data.GetTotalTime(i); + if (!precise_) { + // Make the fractional part 0. 
+ exclusive_time -= exclusive_time % mod_fraction; + total_time -= total_time % mod_fraction; + } + for (size_t j = 0; j < tab_count; ++j) { + os << indent_string; + } + os << FormatDuration(exclusive_time, tu, kFractionalDigits); + // If they are the same, just print one value to prevent spam. + if (exclusive_time != total_time) { + os << "/" << FormatDuration(total_time, tu, kFractionalDigits); + } + os << " " << timings_[i].GetName() << "\n"; + ++tab_count; + } else { + --tab_count; } - os << name_ << ": " << std::setw(8) << FormatDuration(split_time, tu) << " " - << split.second << "\n"; } - os << name_ << ": end, " << NsToMs(total_ns) << " ms\n"; + os << name_ << ": end, " << PrettyDuration(GetTotalNs()) << "\n"; } -TimingLogger::ScopedSplit::ScopedSplit(const char* label, TimingLogger* timing_logger) { - DCHECK(label != NULL) << "New scoped split (" << label << ") with null label."; - CHECK(timing_logger != NULL) << "New scoped split (" << label << ") without TimingLogger."; - timing_logger_ = timing_logger; - label_ = label; - running_ns_ = 0; - explicit_ = false; - - // Stash away the current split and pause it. - enclosing_split_ = timing_logger->current_split_; - if (enclosing_split_ != NULL) { - enclosing_split_->Pause(); - } - - timing_logger_->current_split_ = this; - - ATRACE_BEGIN(label_); - - start_ns_ = NanoTime(); - if (timing_logger_->verbose_) { - LOG(INFO) << "Begin: " << label_; - } -} - -TimingLogger::ScopedSplit::~ScopedSplit() { - uint64_t current_time = NanoTime(); - uint64_t split_time = current_time - start_ns_; - running_ns_ += split_time; - ATRACE_END(); - - if (timing_logger_->verbose_) { - LOG(INFO) << "End: " << label_ << " " << PrettyDuration(split_time); - } - - // If one or more enclosed explicitly started splits are not terminated we can - // either fail or "unwind" the stack of splits in the timing logger to 'this' - // (by deleting the intervening scoped splits). This implements the latter. - TimingLogger::ScopedSplit* current = timing_logger_->current_split_; - while ((current != NULL) && (current != this)) { - delete current; - current = timing_logger_->current_split_; - } - - CHECK(current != NULL) << "Missing scoped split (" << this->label_ - << ") in timing logger (" << timing_logger_->name_ << ")."; - CHECK(timing_logger_->current_split_ == this); - - timing_logger_->splits_.push_back(SplitTiming(running_ns_, label_)); - - timing_logger_->current_split_ = enclosing_split_; - if (enclosing_split_ != NULL) { - enclosing_split_->Resume(); +void TimingLogger::Verify() { + size_t counts[2] = { 0 }; + for (size_t i = 0; i < timings_.size(); ++i) { + if (i > 0) { + CHECK_LE(timings_[i - 1].GetTime(), timings_[i].GetTime()); + } + ++counts[timings_[i].IsStartTiming() ? 0 : 1]; } + CHECK_EQ(counts[0], counts[1]) << "Number of StartTiming and EndTiming doesn't match"; } - -void TimingLogger::ScopedSplit::TailInsertSplit(const char* label) { - // Sleight of hand here: Rather than embedding a new scoped split, we're updating the current - // scoped split in place. Basically, it's one way to make explicit and scoped splits compose - // well while maintaining the current semantics of NewSplit. An alternative is to push a new split - // since we unwind the stack of scoped splits in the scoped split destructor. However, this implies - // that the current split is not ended by NewSplit (which calls TailInsertSplit), which would - // be different from what we had before. 
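// Standalone sketch (illustration only; Event and the sample data are made up) of the pass
// CalculateTimingData() performs above: timings are stored as a flat stream of start/end
// records, and a stack of open indices yields each timing's total time and its exclusive
// time (total minus the totals of its direct children).
#include <cstdint>
#include <utility>
#include <vector>

struct Event { uint64_t time; const char* name; };  // name == nullptr marks an end record.

std::vector<std::pair<uint64_t, uint64_t>> TotalsAndExclusives(const std::vector<Event>& events) {
  std::vector<std::pair<uint64_t, uint64_t>> out(events.size(), {0u, 0u});  // {total, exclusive}
  std::vector<size_t> open;
  for (size_t i = 0; i < events.size(); ++i) {
    if (events[i].name != nullptr) { open.push_back(i); continue; }
    size_t start = open.back();
    open.pop_back();
    uint64_t dt = events[i].time - events[start].time;
    out[start].first = dt;    // total time of the timing opened at 'start'
    out[start].second += dt;  // exclusive time; child totals were already subtracted below
    if (!open.empty()) {
      out[open.back()].second -= dt;  // may wrap, repaired when the parent's own dt is added
    }
  }
  return out;
}
// E.g. Start("GC")@0, Start("Sweep")@2, End@7, End@10 gives total(GC)=10, total(Sweep)=5,
// exclusive(GC)=5 -- the same numbers TimingLogger::Dump prints as "[Exclusive time] [Total time]".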
- - uint64_t current_time = NanoTime(); - uint64_t split_time = current_time - start_ns_; - ATRACE_END(); - timing_logger_->splits_.push_back(std::pair<uint64_t, const char*>(split_time, label_)); - - if (timing_logger_->verbose_) { - LOG(INFO) << "End: " << label_ << " " << PrettyDuration(split_time) << "\n" - << "Begin: " << label; +TimingLogger::~TimingLogger() { + if (kIsDebugBuild) { + Verify(); } - - label_ = label; - start_ns_ = current_time; - running_ns_ = 0; - - ATRACE_BEGIN(label); -} - -void TimingLogger::ScopedSplit::Pause() { - uint64_t current_time = NanoTime(); - uint64_t split_time = current_time - start_ns_; - running_ns_ += split_time; - ATRACE_END(); -} - - -void TimingLogger::ScopedSplit::Resume() { - uint64_t current_time = NanoTime(); - - start_ns_ = current_time; - ATRACE_BEGIN(label_); } } // namespace art diff --git a/runtime/base/timing_logger.h b/runtime/base/timing_logger.h index 9b558980c5..b300109e31 100644 --- a/runtime/base/timing_logger.h +++ b/runtime/base/timing_logger.h @@ -77,93 +77,119 @@ class CumulativeLogger { // A timing logger that knows when a split starts for the purposes of logging tools, like systrace. class TimingLogger { public: - // Splits are nanosecond times and split names. - typedef std::pair<uint64_t, const char*> SplitTiming; - typedef std::vector<SplitTiming> SplitTimings; + static constexpr size_t kIndexNotFound = static_cast<size_t>(-1); - explicit TimingLogger(const char* name, bool precise, bool verbose); - ~TimingLogger() { - // TODO: DCHECK(current_split_ == nullptr) << "Forgot to end split: " << current_split_->label_; - } - // Clears current splits and labels. - void Reset(); + class Timing { + public: + Timing(uint64_t time, const char* name) : time_(time), name_(name) { + } + bool IsStartTiming() const { + return !IsEndTiming(); + } + bool IsEndTiming() const { + return name_ == nullptr; + } + uint64_t GetTime() const { + return time_; + } + const char* GetName() const { + return name_; + } - // Starts a split - void StartSplit(const char* new_split_label); + private: + uint64_t time_; + const char* name_; + }; - // Ends the current split and starts the one given by the label. - void NewSplit(const char* new_split_label); + // Extra data that is only calculated when you call dump to prevent excess allocation. + class TimingData { + public: + TimingData() = default; + TimingData(TimingData&& other) { + std::swap(data_, other.data_); + } + TimingData& operator=(TimingData&& other) { + std::swap(data_, other.data_); + return *this; + } + uint64_t GetTotalTime(size_t idx) { + return data_[idx].total_time; + } + uint64_t GetExclusiveTime(size_t idx) { + return data_[idx].exclusive_time; + } - // Ends the current split and records the end time. - void EndSplit(); + private: + // Each begin split has a total time and exclusive time. Exclusive time is total time - total + // time of children nodes. + struct CalculatedDataPoint { + CalculatedDataPoint() : total_time(0), exclusive_time(0) {} + uint64_t total_time; + uint64_t exclusive_time; + }; + std::vector<CalculatedDataPoint> data_; + friend class TimingLogger; + }; + explicit TimingLogger(const char* name, bool precise, bool verbose); + ~TimingLogger(); + // Verify that all open timings have related closed timings. + void Verify(); + // Clears current timings and labels. + void Reset(); + // Starts a timing. + void StartTiming(const char* new_split_label); + // Ends the current timing. + void EndTiming(); + // End the current timing and start a new timing. 
Usage not recommended. + void NewTiming(const char* new_split_label) { + EndTiming(); + StartTiming(new_split_label); + } + // Returns the total duration of the timings (sum of total times). uint64_t GetTotalNs() const; - - void Dump(std::ostream& os) const; + // Find the index of a timing by name. + size_t FindTimingIndex(const char* name, size_t start_idx) const; + void Dump(std::ostream& os, const char* indent_string = " ") const; // Scoped timing splits that can be nested and composed with the explicit split // starts and ends. - class ScopedSplit { - public: - explicit ScopedSplit(const char* label, TimingLogger* timing_logger); - - ~ScopedSplit(); - - friend class TimingLogger; - - private: - // Pauses timing of the split, usually due to nesting of another split. - void Pause(); - - // Resumes timing of the split, usually because a nested split has ended. - void Resume(); - - // Used by new split to swap splits in place in a ScopedSplit instance. - void TailInsertSplit(const char* label); - - // The scoped split immediately enclosing this split. Essentially, we get a - // stack of nested splits through this field. - ScopedSplit* enclosing_split_; - - // Was this created via TimingLogger's StartSplit? - bool explicit_; - - // The split's name. - const char* label_; - - // The current split's latest start time. (It may have been paused and restarted.) - uint64_t start_ns_; - - // The running time, outside of pauses. - uint64_t running_ns_; - - // The timing logger holding this split. - TimingLogger* timing_logger_; + class ScopedTiming { + public: + explicit ScopedTiming(const char* label, TimingLogger* logger) : logger_(logger) { + logger_->StartTiming(label); + } + ~ScopedTiming() { + logger_->EndTiming(); + } + // Closes the current timing and opens a new timing. + void NewTiming(const char* label) { + logger_->NewTiming(label); + } - DISALLOW_COPY_AND_ASSIGN(ScopedSplit); + private: + TimingLogger* const logger_; // The timing logger which the scoped timing is associated with. + DISALLOW_COPY_AND_ASSIGN(ScopedTiming); }; - const SplitTimings& GetSplits() const { - return splits_; + // Return the time points of when each start / end timings start and finish. + const std::vector<Timing>& GetTimings() const { + return timings_; } - friend class ScopedSplit; + TimingData CalculateTimingData() const; + protected: // The name of the timing logger. const char* const name_; - // Do we want to print the exactly recorded split (true) or round down to the time unit being // used (false). const bool precise_; - // Verbose logging. const bool verbose_; - - // The current scoped split is also the 'top' of the stack of splits in progress. - ScopedSplit* current_split_; - - // Splits that have ended. - SplitTimings splits_; + // Timing points that are either start or end points. For each starting point ret[i] = location + // of end split associated with i. If it is and end split ret[i] = i. + std::vector<Timing> timings_; private: DISALLOW_COPY_AND_ASSIGN(TimingLogger); diff --git a/runtime/base/timing_logger_test.cc b/runtime/base/timing_logger_test.cc index 0757751822..35a73d0a76 100644 --- a/runtime/base/timing_logger_test.cc +++ b/runtime/base/timing_logger_test.cc @@ -26,16 +26,14 @@ class TimingLoggerTest : public CommonRuntimeTest {}; TEST_F(TimingLoggerTest, StartEnd) { const char* split1name = "First Split"; - TimingLogger timings("StartEnd", true, false); - - timings.StartSplit(split1name); - - timings.EndSplit(); // Ends split1. 
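// Usage sketch (hypothetical caller, not from the patch) of the replacement API declared
// above: ScopedTiming opens a timing in its constructor and closes it in its destructor, so
// nesting follows C++ scopes, and NewTiming() ends the current timing and immediately opens
// a sibling at the same depth.
void PhaseExample(TimingLogger* logger) {
  TimingLogger::ScopedTiming t("Phase", logger);       // StartTiming("Phase")
  {
    TimingLogger::ScopedTiming t2("SubStep", logger);  // nested timing
  }                                                    // EndTiming() for "SubStep"
  t.NewTiming("Cleanup");                              // ends "Phase", starts "Cleanup"
}                                                      // destructor ends "Cleanup"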
- - const TimingLogger::SplitTimings& splits = timings.GetSplits(); - - EXPECT_EQ(1U, splits.size()); - EXPECT_STREQ(splits[0].second, split1name); + TimingLogger logger("StartEnd", true, false); + logger.StartTiming(split1name); + logger.EndTiming(); // Ends split1. + const auto& timings = logger.GetTimings(); + EXPECT_EQ(2U, timings.size()); // Start, End splits + EXPECT_TRUE(timings[0].IsStartTiming()); + EXPECT_STREQ(timings[0].GetName(), split1name); + EXPECT_TRUE(timings[1].IsEndTiming()); } @@ -43,56 +41,61 @@ TEST_F(TimingLoggerTest, StartNewEnd) { const char* split1name = "First Split"; const char* split2name = "Second Split"; const char* split3name = "Third Split"; - TimingLogger timings("StartNewEnd", true, false); - - timings.StartSplit(split1name); - - timings.NewSplit(split2name); // Ends split1. - - timings.NewSplit(split3name); // Ends split2. - - timings.EndSplit(); // Ends split3. - - const TimingLogger::SplitTimings& splits = timings.GetSplits(); - - EXPECT_EQ(3U, splits.size()); - EXPECT_STREQ(splits[0].second, split1name); - EXPECT_STREQ(splits[1].second, split2name); - EXPECT_STREQ(splits[2].second, split3name); + TimingLogger logger("StartNewEnd", true, false); + logger.StartTiming(split1name); + logger.NewTiming(split2name); + logger.NewTiming(split3name); + logger.EndTiming(); + // Get the timings and verify that they are sane. + const auto& timings = logger.GetTimings(); + // 6 timings in the timing logger at this point. + EXPECT_EQ(6U, timings.size()); + EXPECT_TRUE(timings[0].IsStartTiming()); + EXPECT_STREQ(timings[0].GetName(), split1name); + EXPECT_TRUE(timings[1].IsEndTiming()); + EXPECT_TRUE(timings[2].IsStartTiming()); + EXPECT_STREQ(timings[2].GetName(), split2name); + EXPECT_TRUE(timings[3].IsEndTiming()); + EXPECT_TRUE(timings[4].IsStartTiming()); + EXPECT_STREQ(timings[4].GetName(), split3name); + EXPECT_TRUE(timings[5].IsEndTiming()); } TEST_F(TimingLoggerTest, StartNewEndNested) { - const char* split1name = "First Split"; - const char* split2name = "Second Split"; - const char* split3name = "Third Split"; - const char* split4name = "Fourth Split"; - const char* split5name = "Fifth Split"; - TimingLogger timings("StartNewEndNested", true, false); - - timings.StartSplit(split1name); - - timings.NewSplit(split2name); // Ends split1. - - timings.StartSplit(split3name); - - timings.StartSplit(split4name); - - timings.NewSplit(split5name); // Ends split4. - - timings.EndSplit(); // Ends split5. - - timings.EndSplit(); // Ends split3. - - timings.EndSplit(); // Ends split2. - - const TimingLogger::SplitTimings& splits = timings.GetSplits(); - - EXPECT_EQ(5U, splits.size()); - EXPECT_STREQ(splits[0].second, split1name); - EXPECT_STREQ(splits[1].second, split4name); - EXPECT_STREQ(splits[2].second, split5name); - EXPECT_STREQ(splits[3].second, split3name); - EXPECT_STREQ(splits[4].second, split2name); + const char* name1 = "First Split"; + const char* name2 = "Second Split"; + const char* name3 = "Third Split"; + const char* name4 = "Fourth Split"; + const char* name5 = "Fifth Split"; + TimingLogger logger("StartNewEndNested", true, false); + logger.StartTiming(name1); + logger.NewTiming(name2); // Ends timing1. + logger.StartTiming(name3); + logger.StartTiming(name4); + logger.NewTiming(name5); // Ends timing4. + logger.EndTiming(); // Ends timing5. + logger.EndTiming(); // Ends timing3. + logger.EndTiming(); // Ends timing2. 
+ const auto& timings = logger.GetTimings(); + EXPECT_EQ(10U, timings.size()); + size_t idx_1 = logger.FindTimingIndex(name1, 0); + size_t idx_2 = logger.FindTimingIndex(name2, 0); + size_t idx_3 = logger.FindTimingIndex(name3, 0); + size_t idx_4 = logger.FindTimingIndex(name4, 0); + size_t idx_5 = logger.FindTimingIndex(name5, 0); + size_t idx_6 = logger.FindTimingIndex("Not found", 0); + EXPECT_NE(idx_1, TimingLogger::kIndexNotFound); + EXPECT_NE(idx_2, TimingLogger::kIndexNotFound); + EXPECT_NE(idx_3, TimingLogger::kIndexNotFound); + EXPECT_NE(idx_4, TimingLogger::kIndexNotFound); + EXPECT_NE(idx_5, TimingLogger::kIndexNotFound); + EXPECT_EQ(idx_6, TimingLogger::kIndexNotFound); + TimingLogger::TimingData data = logger.CalculateTimingData(); + EXPECT_STREQ(timings[idx_1].GetName(), name1); + EXPECT_STREQ(timings[idx_2].GetName(), name2); + EXPECT_STREQ(timings[idx_3].GetName(), name3); + EXPECT_STREQ(timings[idx_4].GetName(), name4); + EXPECT_STREQ(timings[idx_5].GetName(), name5); } @@ -101,31 +104,32 @@ TEST_F(TimingLoggerTest, Scoped) { const char* innersplit1 = "Inner Split 1"; const char* innerinnersplit1 = "Inner Inner Split 1"; const char* innersplit2 = "Inner Split 2"; - TimingLogger timings("Scoped", true, false); - + TimingLogger logger("Scoped", true, false); { - TimingLogger::ScopedSplit outer(outersplit, &timings); - - { - TimingLogger::ScopedSplit inner1(innersplit1, &timings); - - { - TimingLogger::ScopedSplit innerinner1(innerinnersplit1, &timings); - } // Ends innerinnersplit1. - } // Ends innersplit1. - + TimingLogger::ScopedTiming outer(outersplit, &logger); + { + TimingLogger::ScopedTiming inner1(innersplit1, &logger); { - TimingLogger::ScopedSplit inner2(innersplit2, &timings); - } // Ends innersplit2. + TimingLogger::ScopedTiming innerinner1(innerinnersplit1, &logger); + } // Ends innerinnersplit1. + } // Ends innersplit1. + { + TimingLogger::ScopedTiming inner2(innersplit2, &logger); + } // Ends innersplit2. } // Ends outersplit. - - const TimingLogger::SplitTimings& splits = timings.GetSplits(); - - EXPECT_EQ(4U, splits.size()); - EXPECT_STREQ(splits[0].second, innerinnersplit1); - EXPECT_STREQ(splits[1].second, innersplit1); - EXPECT_STREQ(splits[2].second, innersplit2); - EXPECT_STREQ(splits[3].second, outersplit); + const size_t idx_outersplit = logger.FindTimingIndex(outersplit, 0); + const size_t idx_innersplit1 = logger.FindTimingIndex(innersplit1, 0); + const size_t idx_innerinnersplit1 = logger.FindTimingIndex(innerinnersplit1, 0); + const size_t idx_innersplit2 = logger.FindTimingIndex(innersplit2, 0); + const auto& timings = logger.GetTimings(); + EXPECT_EQ(8U, timings.size()); // 4 start timings and 4 end timings. 
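// Note (illustration, restating the invariant these assertions rely on): every timing
// contributes exactly one start record and one end record, so the nested test above sees
// 5 + 5 = 10 entries and this one 4 + 4 = 8; for any parent timing,
// GetTotalTime(parent) >= GetTotalTime(child) and GetExclusiveTime(parent) equals
// total(parent) minus the sum of its direct children's totals, which is what the
// EXPECT_GE checks verify.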
+ EXPECT_GE(timings[idx_innerinnersplit1].GetTime(), timings[idx_innersplit1].GetTime()); + EXPECT_GE(timings[idx_innersplit2].GetTime(), timings[idx_innersplit1].GetTime()); + TimingLogger::TimingData data(logger.CalculateTimingData()); + EXPECT_GE(data.GetTotalTime(idx_outersplit), data.GetTotalTime(idx_innerinnersplit1)); + EXPECT_GE(data.GetTotalTime(idx_outersplit), + data.GetTotalTime(idx_innersplit1) + data.GetTotalTime(idx_innersplit2)); + EXPECT_GE(data.GetTotalTime(idx_innersplit1), data.GetTotalTime(idx_innerinnersplit1)); } @@ -134,27 +138,24 @@ TEST_F(TimingLoggerTest, ScopedAndExplicit) { const char* innersplit = "Inner Split"; const char* innerinnersplit1 = "Inner Inner Split 1"; const char* innerinnersplit2 = "Inner Inner Split 2"; - TimingLogger timings("Scoped", true, false); - - timings.StartSplit(outersplit); - + TimingLogger logger("Scoped", true, false); + logger.StartTiming(outersplit); { - TimingLogger::ScopedSplit inner(innersplit, &timings); - - timings.StartSplit(innerinnersplit1); - - timings.NewSplit(innerinnersplit2); // Ends innerinnersplit1. + TimingLogger::ScopedTiming inner(innersplit, &logger); + logger.StartTiming(innerinnersplit1); + logger.NewTiming(innerinnersplit2); // Ends innerinnersplit1. + logger.EndTiming(); } // Ends innerinnersplit2, then innersplit. - - timings.EndSplit(); // Ends outersplit. - - const TimingLogger::SplitTimings& splits = timings.GetSplits(); - - EXPECT_EQ(4U, splits.size()); - EXPECT_STREQ(splits[0].second, innerinnersplit1); - EXPECT_STREQ(splits[1].second, innerinnersplit2); - EXPECT_STREQ(splits[2].second, innersplit); - EXPECT_STREQ(splits[3].second, outersplit); + logger.EndTiming(); // Ends outersplit. + const size_t idx_outersplit = logger.FindTimingIndex(outersplit, 0); + const size_t idx_innersplit = logger.FindTimingIndex(innersplit, 0); + const size_t idx_innerinnersplit1 = logger.FindTimingIndex(innerinnersplit1, 0); + const size_t idx_innerinnersplit2 = logger.FindTimingIndex(innerinnersplit2, 0); + const auto& timings = logger.GetTimings(); + EXPECT_EQ(8U, timings.size()); + EXPECT_LE(timings[idx_outersplit].GetTime(), timings[idx_innersplit].GetTime()); + EXPECT_LE(timings[idx_innersplit].GetTime(), timings[idx_innerinnersplit1].GetTime()); + EXPECT_LE(timings[idx_innerinnersplit1].GetTime(), timings[idx_innerinnersplit2].GetTime()); } } // namespace art diff --git a/runtime/base/unix_file/fd_file.cc b/runtime/base/unix_file/fd_file.cc index 6d5b59cbeb..f29a7ec974 100644 --- a/runtime/base/unix_file/fd_file.cc +++ b/runtime/base/unix_file/fd_file.cc @@ -122,7 +122,9 @@ bool FdFile::ReadFully(void* buffer, size_t byte_count) { char* ptr = static_cast<char*>(buffer); while (byte_count > 0) { ssize_t bytes_read = TEMP_FAILURE_RETRY(read(fd_, ptr, byte_count)); - if (bytes_read == -1) { + if (bytes_read <= 0) { + // 0: end of file + // -1: error return false; } byte_count -= bytes_read; // Reduce the number of remaining bytes. diff --git a/runtime/base/unix_file/fd_file_test.cc b/runtime/base/unix_file/fd_file_test.cc index d620666747..33b3d3e186 100644 --- a/runtime/base/unix_file/fd_file_test.cc +++ b/runtime/base/unix_file/fd_file_test.cc @@ -16,6 +16,7 @@ #include "base/unix_file/fd_file.h" #include "base/unix_file/random_access_file_test.h" +#include "common_runtime_test.h" // For ScratchFile #include "gtest/gtest.h" namespace unix_file { @@ -60,4 +61,15 @@ TEST_F(FdFileTest, OpenClose) { EXPECT_TRUE(file.IsOpened()); } +TEST_F(FdFileTest, ReadFullyEmptyFile) { + // New scratch file, zero-length. 
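// Sketch (illustration only) of why ReadFully() above now checks bytes_read <= 0 rather
// than == -1: read(2) returns 0 at end-of-file, so the old loop never decremented
// byte_count on an empty or truncated file and spun forever. A minimal standalone reader
// with the corrected behaviour:
#include <unistd.h>
bool ReadFullyExample(int fd, void* buffer, size_t byte_count) {
  char* ptr = static_cast<char*>(buffer);
  while (byte_count > 0) {
    ssize_t bytes_read = read(fd, ptr, byte_count);  // ART wraps this in TEMP_FAILURE_RETRY
    if (bytes_read <= 0) {
      return false;  // 0: end of file reached early, -1: I/O error
    }
    byte_count -= static_cast<size_t>(bytes_read);
    ptr += bytes_read;
  }
  return true;
}
// The new ReadFullyEmptyFile test exercises exactly this case: reading 4 bytes from a
// zero-length scratch file must fail instead of hanging.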
+ art::ScratchFile tmp; + FdFile file; + ASSERT_TRUE(file.Open(tmp.GetFilename(), O_RDONLY)); + EXPECT_GE(file.Fd(), 0); + EXPECT_TRUE(file.IsOpened()); + uint8_t buffer[16]; + EXPECT_FALSE(file.ReadFully(&buffer, 4)); +} + } // namespace unix_file diff --git a/runtime/dex_instruction.h b/runtime/dex_instruction.h index edba5020d4..b6810b02b2 100644 --- a/runtime/dex_instruction.h +++ b/runtime/dex_instruction.h @@ -145,28 +145,30 @@ class Instruction { }; enum VerifyFlag { - kVerifyNone = 0x000000, - kVerifyRegA = 0x000001, - kVerifyRegAWide = 0x000002, - kVerifyRegB = 0x000004, - kVerifyRegBField = 0x000008, - kVerifyRegBMethod = 0x000010, - kVerifyRegBNewInstance = 0x000020, - kVerifyRegBString = 0x000040, - kVerifyRegBType = 0x000080, - kVerifyRegBWide = 0x000100, - kVerifyRegC = 0x000200, - kVerifyRegCField = 0x000400, - kVerifyRegCNewArray = 0x000800, - kVerifyRegCType = 0x001000, - kVerifyRegCWide = 0x002000, - kVerifyArrayData = 0x004000, - kVerifyBranchTarget = 0x008000, - kVerifySwitchTargets = 0x010000, - kVerifyVarArg = 0x020000, - kVerifyVarArgRange = 0x040000, - kVerifyRuntimeOnly = 0x080000, - kVerifyError = 0x100000, + kVerifyNone = 0x000000, + kVerifyRegA = 0x000001, + kVerifyRegAWide = 0x000002, + kVerifyRegB = 0x000004, + kVerifyRegBField = 0x000008, + kVerifyRegBMethod = 0x000010, + kVerifyRegBNewInstance = 0x000020, + kVerifyRegBString = 0x000040, + kVerifyRegBType = 0x000080, + kVerifyRegBWide = 0x000100, + kVerifyRegC = 0x000200, + kVerifyRegCField = 0x000400, + kVerifyRegCNewArray = 0x000800, + kVerifyRegCType = 0x001000, + kVerifyRegCWide = 0x002000, + kVerifyArrayData = 0x004000, + kVerifyBranchTarget = 0x008000, + kVerifySwitchTargets = 0x010000, + kVerifyVarArg = 0x020000, + kVerifyVarArgNonZero = 0x040000, + kVerifyVarArgRange = 0x080000, + kVerifyVarArgRangeNonZero = 0x100000, + kVerifyRuntimeOnly = 0x200000, + kVerifyError = 0x400000, }; static constexpr uint32_t kMaxVarArgRegs = 5; @@ -506,7 +508,8 @@ class Instruction { int GetVerifyExtraFlags() const { return (kInstructionVerifyFlags[Opcode()] & (kVerifyArrayData | kVerifyBranchTarget | - kVerifySwitchTargets | kVerifyVarArg | kVerifyVarArgRange | kVerifyError)); + kVerifySwitchTargets | kVerifyVarArg | kVerifyVarArgNonZero | kVerifyVarArgRange | + kVerifyVarArgRangeNonZero | kVerifyError)); } bool GetVerifyIsRuntimeOnly() const { diff --git a/runtime/dex_instruction_list.h b/runtime/dex_instruction_list.h index 4cda58b92e..103b0d74ef 100644 --- a/runtime/dex_instruction_list.h +++ b/runtime/dex_instruction_list.h @@ -128,17 +128,17 @@ V(0x6B, SPUT_BYTE, "sput-byte", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \ V(0x6C, SPUT_CHAR, "sput-char", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \ V(0x6D, SPUT_SHORT, "sput-short", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \ - V(0x6E, INVOKE_VIRTUAL, "invoke-virtual", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \ - V(0x6F, INVOKE_SUPER, "invoke-super", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \ - V(0x70, INVOKE_DIRECT, "invoke-direct", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \ + V(0x6E, INVOKE_VIRTUAL, "invoke-virtual", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero) \ + V(0x6F, 
INVOKE_SUPER, "invoke-super", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero) \ + V(0x70, INVOKE_DIRECT, "invoke-direct", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero) \ V(0x71, INVOKE_STATIC, "invoke-static", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \ - V(0x72, INVOKE_INTERFACE, "invoke-interface", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \ + V(0x72, INVOKE_INTERFACE, "invoke-interface", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgNonZero) \ V(0x73, RETURN_VOID_BARRIER, "return-void-barrier", k10x, false, kNone, kReturn, kVerifyNone) \ - V(0x74, INVOKE_VIRTUAL_RANGE, "invoke-virtual/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRange) \ - V(0x75, INVOKE_SUPER_RANGE, "invoke-super/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRange) \ - V(0x76, INVOKE_DIRECT_RANGE, "invoke-direct/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRange) \ + V(0x74, INVOKE_VIRTUAL_RANGE, "invoke-virtual/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \ + V(0x75, INVOKE_SUPER_RANGE, "invoke-super/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \ + V(0x76, INVOKE_DIRECT_RANGE, "invoke-direct/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \ V(0x77, INVOKE_STATIC_RANGE, "invoke-static/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRange) \ - V(0x78, INVOKE_INTERFACE_RANGE, "invoke-interface/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRange) \ + V(0x78, INVOKE_INTERFACE_RANGE, "invoke-interface/range", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArgRangeNonZero) \ V(0x79, UNUSED_79, "unused-79", k10x, false, kUnknown, 0, kVerifyError) \ V(0x7A, UNUSED_7A, "unused-7a", k10x, false, kUnknown, 0, kVerifyError) \ V(0x7B, NEG_INT, "neg-int", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \ @@ -251,8 +251,8 @@ V(0xE6, IPUT_QUICK, "iput-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \ V(0xE7, IPUT_WIDE_QUICK, "iput-wide-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRuntimeOnly) \ V(0xE8, IPUT_OBJECT_QUICK, "iput-object-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \ - V(0xE9, INVOKE_VIRTUAL_QUICK, "invoke-virtual-quick", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArg | kVerifyRuntimeOnly) \ - V(0xEA, INVOKE_VIRTUAL_RANGE_QUICK, "invoke-virtual/range-quick", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArgRange | kVerifyRuntimeOnly) \ + V(0xE9, INVOKE_VIRTUAL_QUICK, "invoke-virtual-quick", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArgNonZero | kVerifyRuntimeOnly) \ + V(0xEA, INVOKE_VIRTUAL_RANGE_QUICK, "invoke-virtual/range-quick", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, 
kVerifyVarArgRangeNonZero | kVerifyRuntimeOnly) \ V(0xEB, UNUSED_EB, "unused-eb", k10x, false, kUnknown, 0, kVerifyError) \ V(0xEC, UNUSED_EC, "unused-ec", k10x, false, kUnknown, 0, kVerifyError) \ V(0xED, UNUSED_ED, "unused-ed", k10x, false, kUnknown, 0, kVerifyError) \ diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc index 55262f2359..09fb97a5c9 100644 --- a/runtime/gc/allocator/rosalloc.cc +++ b/runtime/gc/allocator/rosalloc.cc @@ -1642,7 +1642,7 @@ void RosAlloc::SetFootprintLimit(size_t new_capacity) { void RosAlloc::RevokeThreadLocalRuns(Thread* thread) { Thread* self = Thread::Current(); // Avoid race conditions on the bulk free bit maps with BulkFree() (GC). - WriterMutexLock wmu(self, bulk_free_lock_); + ReaderMutexLock wmu(self, bulk_free_lock_); for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; idx++) { MutexLock mu(self, *size_bracket_locks_[idx]); Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx)); @@ -1720,7 +1720,7 @@ void RosAlloc::AssertThreadLocalRunsAreRevoked(Thread* thread) { if (kIsDebugBuild) { Thread* self = Thread::Current(); // Avoid race conditions on the bulk free bit maps with BulkFree() (GC). - WriterMutexLock wmu(self, bulk_free_lock_); + ReaderMutexLock wmu(self, bulk_free_lock_); for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; idx++) { MutexLock mu(self, *size_bracket_locks_[idx]); Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx)); @@ -1867,7 +1867,7 @@ void RosAlloc::Verify() { CHECK(Locks::mutator_lock_->IsExclusiveHeld(self)) << "The mutator locks isn't exclusively locked at RosAlloc::Verify()"; MutexLock mu(self, *Locks::thread_list_lock_); - WriterMutexLock wmu(self, bulk_free_lock_); + ReaderMutexLock wmu(self, bulk_free_lock_); std::vector<Run*> runs; { MutexLock mu(self, lock_); diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h index a439188858..13f61ec935 100644 --- a/runtime/gc/allocator/rosalloc.h +++ b/runtime/gc/allocator/rosalloc.h @@ -45,10 +45,7 @@ class RosAlloc { byte magic_num_; // The magic number used for debugging only. bool IsFree() const { - if (kIsDebugBuild) { - return magic_num_ == kMagicNumFree; - } - return true; + return !kIsDebugBuild || magic_num_ == kMagicNumFree; } size_t ByteSize(RosAlloc* rosalloc) const EXCLUSIVE_LOCKS_REQUIRED(rosalloc->lock_) { const byte* fpr_base = reinterpret_cast<const byte*>(this); diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc index a17c36be6d..46d79bf796 100644 --- a/runtime/gc/collector/garbage_collector.cc +++ b/runtime/gc/collector/garbage_collector.cc @@ -31,20 +31,36 @@ namespace art { namespace gc { namespace collector { +Iteration::Iteration() + : duration_ns_(0), timings_("GC iteration timing logger", true, VLOG_IS_ON(heap)) { + Reset(kGcCauseBackground, false); // Reset to some place holder values. +} + +void Iteration::Reset(GcCause gc_cause, bool clear_soft_references) { + timings_.Reset(); + pause_times_.clear(); + duration_ns_ = 0; + clear_soft_references_ = clear_soft_references; + gc_cause_ = gc_cause; + freed_ = ObjectBytePair(); + freed_los_ = ObjectBytePair(); +} + +uint64_t Iteration::GetEstimatedThroughput() const { + // Add 1ms to prevent possible division by 0. 
+ return (static_cast<uint64_t>(freed_.bytes) * 1000) / (NsToMs(GetDurationNs()) + 1); +} + GarbageCollector::GarbageCollector(Heap* heap, const std::string& name) : heap_(heap), name_(name), - gc_cause_(kGcCauseForAlloc), - clear_soft_references_(false), - duration_ns_(0), - timings_(name_.c_str(), true, VLOG_IS_ON(heap)), pause_histogram_((name_ + " paused").c_str(), kPauseBucketSize, kPauseBucketCount), cumulative_timings_(name) { ResetCumulativeStatistics(); } void GarbageCollector::RegisterPause(uint64_t nano_length) { - pause_times_.push_back(nano_length); + GetCurrentIteration()->pause_times_.push_back(nano_length); } void GarbageCollector::ResetCumulativeStatistics() { @@ -59,38 +75,33 @@ void GarbageCollector::Run(GcCause gc_cause, bool clear_soft_references) { ATRACE_BEGIN(StringPrintf("%s %s GC", PrettyCause(gc_cause), GetName()).c_str()); Thread* self = Thread::Current(); uint64_t start_time = NanoTime(); - timings_.Reset(); - pause_times_.clear(); - duration_ns_ = 0; - clear_soft_references_ = clear_soft_references; - gc_cause_ = gc_cause; - // Reset stats. - freed_bytes_ = 0; - freed_large_object_bytes_ = 0; - freed_objects_ = 0; - freed_large_objects_ = 0; + Iteration* current_iteration = GetCurrentIteration(); + current_iteration->Reset(gc_cause, clear_soft_references); RunPhases(); // Run all the GC phases. // Add the current timings to the cumulative timings. - cumulative_timings_.AddLogger(timings_); + cumulative_timings_.AddLogger(*GetTimings()); // Update cumulative statistics with how many bytes the GC iteration freed. - total_freed_objects_ += GetFreedObjects() + GetFreedLargeObjects(); - total_freed_bytes_ += GetFreedBytes() + GetFreedLargeObjectBytes(); + total_freed_objects_ += current_iteration->GetFreedObjects() + + current_iteration->GetFreedLargeObjects(); + total_freed_bytes_ += current_iteration->GetFreedBytes() + + current_iteration->GetFreedLargeObjectBytes(); uint64_t end_time = NanoTime(); - duration_ns_ = end_time - start_time; + current_iteration->SetDurationNs(end_time - start_time); if (Locks::mutator_lock_->IsExclusiveHeld(self)) { // The entire GC was paused, clear the fake pauses which might be in the pause times and add // the whole GC duration. - pause_times_.clear(); - RegisterPause(duration_ns_); + current_iteration->pause_times_.clear(); + RegisterPause(current_iteration->GetDurationNs()); } - total_time_ns_ += GetDurationNs(); - for (uint64_t pause_time : pause_times_) { + total_time_ns_ += current_iteration->GetDurationNs(); + for (uint64_t pause_time : current_iteration->GetPauseTimes()) { pause_histogram_.AddValue(pause_time / 1000); } ATRACE_END(); } void GarbageCollector::SwapBitmaps() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // Swap the live and mark bitmaps for each alloc space. This is needed since sweep re-swaps // these bitmaps. The bitmap swapping is an optimization so that we do not need to clear the live // bits of dead objects in the live bitmap. @@ -125,23 +136,6 @@ uint64_t GarbageCollector::GetEstimatedMeanThroughput() const { return (total_freed_bytes_ * 1000) / (NsToMs(GetCumulativeTimings().GetTotalNs()) + 1); } -uint64_t GarbageCollector::GetEstimatedLastIterationThroughput() const { - // Add 1ms to prevent possible division by 0. 
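// Worked example (numbers invented) for Iteration::GetEstimatedThroughput() above: freeing
// freed_.bytes = 64 MiB during a 40 ms iteration gives
//   (64 * 1024 * 1024 * 1000) / (40 + 1) ≈ 1.6e9 bytes/s ≈ 1.5 GiB/s,
// and the "+ 1" keeps the divisor non-zero for iterations shorter than one millisecond,
// matching the per-collector helper this replaces.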
- return (static_cast<uint64_t>(freed_bytes_) * 1000) / (NsToMs(GetDurationNs()) + 1); -} - -void GarbageCollector::RecordFree(uint64_t freed_objects, int64_t freed_bytes) { - freed_objects_ += freed_objects; - freed_bytes_ += freed_bytes; - GetHeap()->RecordFree(freed_objects, freed_bytes); -} - -void GarbageCollector::RecordFreeLargeObjects(uint64_t freed_objects, int64_t freed_bytes) { - freed_large_objects_ += freed_objects; - freed_large_object_bytes_ += freed_bytes; - GetHeap()->RecordFree(freed_objects, freed_bytes); -} - void GarbageCollector::ResetMeasurements() { cumulative_timings_.Reset(); pause_histogram_.Reset(); @@ -160,6 +154,23 @@ GarbageCollector::ScopedPause::~ScopedPause() { Runtime::Current()->GetThreadList()->ResumeAll(); } +// Returns the current GC iteration and assocated info. +Iteration* GarbageCollector::GetCurrentIteration() { + return heap_->GetCurrentGcIteration(); +} +const Iteration* GarbageCollector::GetCurrentIteration() const { + return heap_->GetCurrentGcIteration(); +} + +void GarbageCollector::RecordFree(const ObjectBytePair& freed) { + GetCurrentIteration()->freed_.Add(freed); + heap_->RecordFree(freed.objects, freed.bytes); +} +void GarbageCollector::RecordFreeLOS(const ObjectBytePair& freed) { + GetCurrentIteration()->freed_los_.Add(freed); + heap_->RecordFree(freed.objects, freed.bytes); +} + } // namespace collector } // namespace gc } // namespace art diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h index f4f9dbb40a..885569efd9 100644 --- a/runtime/gc/collector/garbage_collector.h +++ b/runtime/gc/collector/garbage_collector.h @@ -33,6 +33,78 @@ class Heap; namespace collector { +struct ObjectBytePair { + ObjectBytePair(uint64_t num_objects = 0, int64_t num_bytes = 0) + : objects(num_objects), bytes(num_bytes) {} + void Add(const ObjectBytePair& other) { + objects += other.objects; + bytes += other.bytes; + } + // Number of objects which were freed. + uint64_t objects; + // Freed bytes are signed since the GC can free negative bytes if it promotes objects to a space + // which has a larger allocation size. + int64_t bytes; +}; + +// A information related single garbage collector iteration. Since we only ever have one GC running +// at any given time, we can have a single iteration info. +class Iteration { + public: + Iteration(); + // Returns how long the mutators were paused in nanoseconds. + const std::vector<uint64_t>& GetPauseTimes() const { + return pause_times_; + } + TimingLogger* GetTimings() { + return &timings_; + } + // Returns how long the GC took to complete in nanoseconds. + uint64_t GetDurationNs() const { + return duration_ns_; + } + int64_t GetFreedBytes() const { + return freed_.bytes; + } + int64_t GetFreedLargeObjectBytes() const { + return freed_los_.bytes; + } + uint64_t GetFreedObjects() const { + return freed_.objects; + } + uint64_t GetFreedLargeObjects() const { + return freed_los_.objects; + } + void Reset(GcCause gc_cause, bool clear_soft_references); + // Returns the estimated throughput of the iteration. 
+ uint64_t GetEstimatedThroughput() const; + bool GetClearSoftReferences() const { + return clear_soft_references_; + } + void SetClearSoftReferences(bool clear_soft_references) { + clear_soft_references_ = clear_soft_references; + } + GcCause GetGcCause() const { + return gc_cause_; + } + + private: + void SetDurationNs(uint64_t duration) { + duration_ns_ = duration; + } + + GcCause gc_cause_; + bool clear_soft_references_; + uint64_t duration_ns_; + TimingLogger timings_; + ObjectBytePair freed_; + ObjectBytePair freed_los_; + std::vector<uint64_t> pause_times_; + + friend class GarbageCollector; + DISALLOW_COPY_AND_ASSIGN(Iteration); +}; + class GarbageCollector { public: class SCOPED_LOCKABLE ScopedPause { @@ -62,22 +134,7 @@ class GarbageCollector { Heap* GetHeap() const { return heap_; } - - // Returns how long the mutators were paused in nanoseconds. - const std::vector<uint64_t>& GetPauseTimes() const { - return pause_times_; - } - - // Returns how long the GC took to complete in nanoseconds. - uint64_t GetDurationNs() const { - return duration_ns_; - } - void RegisterPause(uint64_t nano_length); - - TimingLogger& GetTimings() { - return timings_; - } const CumulativeLogger& GetCumulativeTimings() const { return cumulative_timings_; } @@ -87,52 +144,36 @@ class GarbageCollector { // Swap the live and mark bitmaps of spaces that are active for the collector. For partial GC, // this is the allocation space, for full GC then we swap the zygote bitmaps too. void SwapBitmaps() EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); - - int64_t GetFreedBytes() const { - return freed_bytes_; - } - - int64_t GetFreedLargeObjectBytes() const { - return freed_large_object_bytes_; - } - - uint64_t GetFreedObjects() const { - return freed_objects_; - } - - uint64_t GetFreedLargeObjects() const { - return freed_large_objects_; - } - uint64_t GetTotalPausedTimeNs() const { return pause_histogram_.AdjustedSum(); } - int64_t GetTotalFreedBytes() const { return total_freed_bytes_; } - uint64_t GetTotalFreedObjects() const { return total_freed_objects_; } - const Histogram<uint64_t>& GetPauseHistogram() const { return pause_histogram_; } - // Reset the cumulative timings and pause histogram. void ResetMeasurements(); - // Returns the estimated throughput in bytes / second. uint64_t GetEstimatedMeanThroughput() const; - - // Returns the estimated throughput of the last GC iteration. - uint64_t GetEstimatedLastIterationThroughput() const; - // Returns how many GC iterations have been run. - size_t GetIterations() const { + size_t NumberOfIterations() const { return GetCumulativeTimings().GetIterations(); } + // Returns the current GC iteration and assocated info. + Iteration* GetCurrentIteration(); + const Iteration* GetCurrentIteration() const; + TimingLogger* GetTimings() { + return &GetCurrentIteration()->timings_; + } + // Record a free of normal objects. + void RecordFree(const ObjectBytePair& freed); + // Record a free of large objects. + void RecordFreeLOS(const ObjectBytePair& freed); protected: // Run all of the GC phases. @@ -141,40 +182,17 @@ class GarbageCollector { // Revoke all the thread-local buffers. virtual void RevokeAllThreadLocalBuffers() = 0; - // Record that you have freed some objects or large objects, calls Heap::RecordFree. - // TODO: These are not thread safe, add a lock if we get parallel sweeping. 
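// Sketch (hypothetical caller) of the per-iteration accounting introduced above: sweeps now
// hand back an ObjectBytePair and the collector records it into the shared Iteration
// instead of per-collector freed_* counters.
void RecordSweepResultExample(GarbageCollector* collector) {
  ObjectBytePair freed(/*num_objects=*/128, /*num_bytes=*/4096);
  freed.Add(ObjectBytePair(2, -64));  // bytes may be negative when objects are promoted to a
                                      // space with a larger allocation size
  collector->RecordFree(freed);       // adds to Iteration::freed_ and calls Heap::RecordFree
}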
- void RecordFree(uint64_t freed_objects, int64_t freed_bytes); - void RecordFreeLargeObjects(uint64_t freed_objects, int64_t freed_bytes); - static constexpr size_t kPauseBucketSize = 500; static constexpr size_t kPauseBucketCount = 32; Heap* const heap_; - std::string name_; - - GcCause gc_cause_; - bool clear_soft_references_; - - uint64_t duration_ns_; - TimingLogger timings_; - // Cumulative statistics. Histogram<uint64_t> pause_histogram_; uint64_t total_time_ns_; uint64_t total_freed_objects_; int64_t total_freed_bytes_; - - // Single GC statitstics, freed bytes are signed since the GC can free negative bytes if it - // promotes objects to a space which has a larger allocation size. - int64_t freed_bytes_; - int64_t freed_large_object_bytes_; - uint64_t freed_objects_; - uint64_t freed_large_objects_; - CumulativeLogger cumulative_timings_; - - std::vector<uint64_t> pause_times_; }; } // namespace collector diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc index 595dc8f38c..40448524c6 100644 --- a/runtime/gc/collector/mark_compact.cc +++ b/runtime/gc/collector/mark_compact.cc @@ -49,7 +49,6 @@ #include "thread-inl.h" #include "thread_list.h" -using ::art::mirror::Class; using ::art::mirror::Object; namespace art { @@ -57,7 +56,7 @@ namespace gc { namespace collector { void MarkCompact::BindBitmaps() { - timings_.StartSplit("BindBitmaps"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_); // Mark all of the spaces we never collect as immune. for (const auto& space : GetHeap()->GetContinuousSpaces()) { @@ -66,7 +65,6 @@ void MarkCompact::BindBitmaps() { CHECK(immune_region_.AddContinuousSpace(space)) << "Failed to add space " << *space; } } - timings_.EndSplit(); } MarkCompact::MarkCompact(Heap* heap, const std::string& name_prefix) @@ -120,7 +118,7 @@ class CalculateObjectForwardingAddressVisitor { }; void MarkCompact::CalculateObjectForwardingAddresses() { - timings_.NewSplit(__FUNCTION__); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // The bump pointer in the space where the next forwarding address will be. bump_pointer_ = reinterpret_cast<byte*>(space_->Begin()); // Visit all the marked objects in the bitmap. @@ -131,7 +129,7 @@ void MarkCompact::CalculateObjectForwardingAddresses() { } void MarkCompact::InitializePhase() { - TimingLogger::ScopedSplit split("InitializePhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); mark_stack_ = heap_->GetMarkStack(); DCHECK(mark_stack_ != nullptr); immune_region_.Reset(); @@ -143,11 +141,10 @@ void MarkCompact::InitializePhase() { } void MarkCompact::ProcessReferences(Thread* self) { - TimingLogger::ScopedSplit split("ProcessReferences", &timings_); WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); heap_->GetReferenceProcessor()->ProcessReferences( - false, &timings_, clear_soft_references_, &HeapReferenceMarkedCallback, &MarkObjectCallback, - &ProcessMarkStackCallback, this); + false, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), + &HeapReferenceMarkedCallback, &MarkObjectCallback, &ProcessMarkStackCallback, this); } class BitmapSetSlowPathVisitor { @@ -187,6 +184,7 @@ inline void MarkCompact::MarkObject(mirror::Object* obj) { } void MarkCompact::MarkingPhase() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Thread* self = Thread::Current(); // Bitmap which describes which objects we have to move. 
objects_before_forwarding_.reset(accounting::ContinuousSpaceBitmap::Create( @@ -195,21 +193,22 @@ void MarkCompact::MarkingPhase() { objects_with_lockword_.reset(accounting::ContinuousSpaceBitmap::Create( "objects with lock words", space_->Begin(), space_->Size())); CHECK(Locks::mutator_lock_->IsExclusiveHeld(self)); - TimingLogger::ScopedSplit split("MarkingPhase", &timings_); // Assume the cleared space is already empty. BindBitmaps(); + t.NewTiming("ProcessCards"); // Process dirty cards and add dirty cards to mod-union tables. - heap_->ProcessCards(timings_, false); + heap_->ProcessCards(GetTimings(), false); // Clear the whole card table since we can not Get any additional dirty cards during the // paused GC. This saves memory but only works for pause the world collectors. - timings_.NewSplit("ClearCardTable"); + t.NewTiming("ClearCardTable"); heap_->GetCardTable()->ClearCardTable(); // Need to do this before the checkpoint since we don't want any threads to add references to // the live stack during the recursive mark. - timings_.NewSplit("SwapStacks"); if (kUseThreadLocalAllocationStack) { + t.NewTiming("RevokeAllThreadLocalAllocationStacks"); heap_->RevokeAllThreadLocalAllocationStacks(self); } + t.NewTiming("SwapStacks"); heap_->SwapStacks(self); { WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); @@ -227,24 +226,22 @@ void MarkCompact::MarkingPhase() { // Revoke buffers before measuring how many objects were moved since the TLABs need to be revoked // before they are properly counted. RevokeAllThreadLocalBuffers(); - timings_.StartSplit("PreSweepingGcVerification"); // Disabled due to an issue where we have objects in the bump pointer space which reference dead // objects. // heap_->PreSweepingGcVerification(this); - timings_.EndSplit(); } void MarkCompact::UpdateAndMarkModUnion() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); for (auto& space : heap_->GetContinuousSpaces()) { // If the space is immune then we need to mark the references to other spaces. if (immune_region_.ContainsSpace(space)) { accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space); if (table != nullptr) { // TODO: Improve naming. - TimingLogger::ScopedSplit split( + TimingLogger::ScopedTiming t( space->IsZygoteSpace() ? "UpdateAndMarkZygoteModUnionTable" : - "UpdateAndMarkImageModUnionTable", - &timings_); + "UpdateAndMarkImageModUnionTable", GetTimings()); table->UpdateAndMarkReferences(MarkHeapReferenceCallback, this); } } @@ -252,27 +249,28 @@ void MarkCompact::UpdateAndMarkModUnion() { } void MarkCompact::MarkReachableObjects() { - timings_.StartSplit("MarkStackAsLive"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); accounting::ObjectStack* live_stack = heap_->GetLiveStack(); - heap_->MarkAllocStackAsLive(live_stack); + { + TimingLogger::ScopedTiming t2("MarkAllocStackAsLive", GetTimings()); + heap_->MarkAllocStackAsLive(live_stack); + } live_stack->Reset(); // Recursively process the mark stack. ProcessMarkStack(); } void MarkCompact::ReclaimPhase() { - TimingLogger::ScopedSplit split("ReclaimPhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_); // Reclaim unmarked objects. Sweep(false); // Swap the live and mark bitmaps for each space which we modified space. This is an // optimization that enables us to not clear live bits inside of the sweep. Only swaps unbound // bitmaps. 
- timings_.StartSplit("SwapBitmapsAndUnBindBitmaps"); SwapBitmaps(); GetHeap()->UnBindBitmaps(); // Unbind the live and mark bitmaps. Compact(); - timings_.EndSplit(); } void MarkCompact::ResizeMarkStack(size_t new_size) { @@ -340,7 +338,7 @@ class UpdateObjectReferencesVisitor { }; void MarkCompact::UpdateReferences() { - timings_.NewSplit(__FUNCTION__); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Runtime* runtime = Runtime::Current(); // Update roots. runtime->VisitRoots(UpdateRootCallback, this); @@ -350,10 +348,10 @@ void MarkCompact::UpdateReferences() { accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space); if (table != nullptr) { // TODO: Improve naming. - TimingLogger::ScopedSplit split( + TimingLogger::ScopedTiming t( space->IsZygoteSpace() ? "UpdateZygoteModUnionTableReferences" : "UpdateImageModUnionTableReferences", - &timings_); + GetTimings()); table->UpdateAndMarkReferences(&UpdateHeapReferenceCallback, this); } else { // No mod union table, so we need to scan the space using bitmap visit. @@ -381,7 +379,7 @@ void MarkCompact::UpdateReferences() { } void MarkCompact::Compact() { - timings_.NewSplit(__FUNCTION__); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); CalculateObjectForwardingAddresses(); UpdateReferences(); MoveObjects(); @@ -389,9 +387,9 @@ void MarkCompact::Compact() { int64_t objects_freed = space_->GetObjectsAllocated() - live_objects_in_space_; int64_t bytes_freed = reinterpret_cast<int64_t>(space_->End()) - reinterpret_cast<int64_t>(bump_pointer_); - timings_.NewSplit("RecordFree"); + t.NewTiming("RecordFree"); space_->RecordFree(objects_freed, bytes_freed); - RecordFree(objects_freed, bytes_freed); + RecordFree(ObjectBytePair(objects_freed, bytes_freed)); space_->SetEnd(bump_pointer_); // Need to zero out the memory we freed. TODO: Use madvise for pages. memset(bump_pointer_, 0, bytes_freed); @@ -399,7 +397,7 @@ void MarkCompact::Compact() { // Marks all objects in the root set. void MarkCompact::MarkRoots() { - timings_.NewSplit("MarkRoots"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Runtime::Current()->VisitRoots(MarkRootCallback, this); } @@ -483,9 +481,8 @@ bool MarkCompact::HeapReferenceMarkedCallback(mirror::HeapReference<mirror::Obje } void MarkCompact::SweepSystemWeaks() { - timings_.StartSplit("SweepSystemWeaks"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Runtime::Current()->SweepSystemWeaks(IsMarkedCallback, this); - timings_.EndSplit(); } bool MarkCompact::ShouldSweepSpace(space::ContinuousSpace* space) const { @@ -523,7 +520,7 @@ void MarkCompact::MoveObject(mirror::Object* obj, size_t len) { } void MarkCompact::MoveObjects() { - timings_.NewSplit(__FUNCTION__); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // Move the objects in the before forwarding bitmap. MoveObjectVisitor visitor(this); objects_before_forwarding_->VisitMarkedRange(reinterpret_cast<uintptr_t>(space_->Begin()), @@ -533,31 +530,25 @@ void MarkCompact::MoveObjects() { } void MarkCompact::Sweep(bool swap_bitmaps) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); DCHECK(mark_stack_->IsEmpty()); - TimingLogger::ScopedSplit split("Sweep", &timings_); for (const auto& space : GetHeap()->GetContinuousSpaces()) { if (space->IsContinuousMemMapAllocSpace()) { space::ContinuousMemMapAllocSpace* alloc_space = space->AsContinuousMemMapAllocSpace(); if (!ShouldSweepSpace(alloc_space)) { continue; } - TimingLogger::ScopedSplit split( - alloc_space->IsZygoteSpace() ? 
"SweepZygoteSpace" : "SweepAllocSpace", &timings_); - size_t freed_objects = 0; - size_t freed_bytes = 0; - alloc_space->Sweep(swap_bitmaps, &freed_objects, &freed_bytes); - RecordFree(freed_objects, freed_bytes); + TimingLogger::ScopedTiming t( + alloc_space->IsZygoteSpace() ? "SweepZygoteSpace" : "SweepAllocSpace", GetTimings()); + RecordFree(alloc_space->Sweep(swap_bitmaps)); } } SweepLargeObjects(swap_bitmaps); } void MarkCompact::SweepLargeObjects(bool swap_bitmaps) { - TimingLogger::ScopedSplit split("SweepLargeObjects", &timings_); - size_t freed_objects = 0; - size_t freed_bytes = 0; - heap_->GetLargeObjectsSpace()->Sweep(swap_bitmaps, &freed_objects, &freed_bytes); - RecordFreeLargeObjects(freed_objects, freed_bytes); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + RecordFreeLOS(heap_->GetLargeObjectsSpace()->Sweep(swap_bitmaps)); } // Process the "referent" field in a java.lang.ref.Reference. If the referent has not yet been @@ -596,13 +587,12 @@ void MarkCompact::ScanObject(Object* obj) { // Scan anything that's on the mark stack. void MarkCompact::ProcessMarkStack() { - timings_.StartSplit("ProcessMarkStack"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); while (!mark_stack_->IsEmpty()) { Object* obj = mark_stack_->PopBack(); DCHECK(obj != nullptr); ScanObject(obj); } - timings_.EndSplit(); } void MarkCompact::SetSpace(space::BumpPointerSpace* space) { @@ -611,7 +601,7 @@ void MarkCompact::SetSpace(space::BumpPointerSpace* space) { } void MarkCompact::FinishPhase() { - TimingLogger::ScopedSplit split("FinishPhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); space_ = nullptr; CHECK(mark_stack_->IsEmpty()); mark_stack_->Reset(); @@ -624,9 +614,8 @@ void MarkCompact::FinishPhase() { } void MarkCompact::RevokeAllThreadLocalBuffers() { - timings_.StartSplit("(Paused)RevokeAllThreadLocalBuffers"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); GetHeap()->RevokeAllThreadLocalBuffers(); - timings_.EndSplit(); } } // namespace collector diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc index fbb349eea1..7e97b3b16b 100644 --- a/runtime/gc/collector/mark_sweep.cc +++ b/runtime/gc/collector/mark_sweep.cc @@ -81,7 +81,7 @@ static constexpr bool kVerifyRootsMarked = kIsDebugBuild; static constexpr bool kRevokeRosAllocThreadLocalBuffersAtCheckpoint = true; void MarkSweep::BindBitmaps() { - timings_.StartSplit("BindBitmaps"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_); // Mark all of the spaces we never collect as immune. 
for (const auto& space : GetHeap()->GetContinuousSpaces()) { @@ -89,7 +89,6 @@ void MarkSweep::BindBitmaps() { CHECK(immune_region_.AddContinuousSpace(space)) << "Failed to add space " << *space; } } - timings_.EndSplit(); } MarkSweep::MarkSweep(Heap* heap, bool is_concurrent, const std::string& name_prefix) @@ -110,7 +109,7 @@ MarkSweep::MarkSweep(Heap* heap, bool is_concurrent, const std::string& name_pre } void MarkSweep::InitializePhase() { - TimingLogger::ScopedSplit split("InitializePhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); mark_stack_ = heap_->GetMarkStack(); DCHECK(mark_stack_ != nullptr); immune_region_.Reset(); @@ -132,9 +131,9 @@ void MarkSweep::InitializePhase() { ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_); mark_bitmap_ = heap_->GetMarkBitmap(); } - if (!clear_soft_references_) { + if (!GetCurrentIteration()->GetClearSoftReferences()) { // Always clear soft references if a non-sticky collection. - clear_soft_references_ = GetGcType() != collector::kGcTypeSticky; + GetCurrentIteration()->SetClearSoftReferences(GetGcType() != collector::kGcTypeSticky); } } @@ -170,15 +169,14 @@ void MarkSweep::RunPhases() { } void MarkSweep::ProcessReferences(Thread* self) { - TimingLogger::ScopedSplit split("ProcessReferences", &timings_); WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); GetHeap()->GetReferenceProcessor()->ProcessReferences( - true, &timings_, clear_soft_references_, &HeapReferenceMarkedCallback, &MarkObjectCallback, - &ProcessMarkStackCallback, this); + true, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), + &HeapReferenceMarkedCallback, &MarkObjectCallback, &ProcessMarkStackCallback, this); } void MarkSweep::PausePhase() { - TimingLogger::ScopedSplit split("(Paused)PausePhase", &timings_); + TimingLogger::ScopedTiming t("(Paused)PausePhase", GetTimings()); Thread* self = Thread::Current(); Locks::mutator_lock_->AssertExclusiveHeld(self); if (IsConcurrent()) { @@ -190,7 +188,7 @@ void MarkSweep::PausePhase() { RecursiveMarkDirtyObjects(true, accounting::CardTable::kCardDirty); } { - TimingLogger::ScopedSplit split("SwapStacks", &timings_); + TimingLogger::ScopedTiming t2("SwapStacks", GetTimings()); WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); heap_->SwapStacks(self); live_stack_freeze_size_ = heap_->GetLiveStack()->Size(); @@ -198,9 +196,7 @@ void MarkSweep::PausePhase() { // stacks and don't want anybody to allocate into the live stack. RevokeAllThreadLocalAllocationStacks(self); } - timings_.StartSplit("PreSweepingGcVerification"); heap_->PreSweepingGcVerification(this); - timings_.EndSplit(); // Disallow new system weaks to prevent a race which occurs when someone adds a new system // weak before we sweep them. Since this new system weak may not be marked, the GC may // incorrectly sweep it. This also fixes a race where interning may attempt to return a strong @@ -214,10 +210,11 @@ void MarkSweep::PausePhase() { void MarkSweep::PreCleanCards() { // Don't do this for non concurrent GCs since they don't have any dirty cards. if (kPreCleanCards && IsConcurrent()) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Thread* self = Thread::Current(); CHECK(!Locks::mutator_lock_->IsExclusiveHeld(self)); // Process dirty cards and add dirty cards to mod union tables, also ages cards. 
- heap_->ProcessCards(timings_, false); + heap_->ProcessCards(GetTimings(), false); // The checkpoint root marking is required to avoid a race condition which occurs if the // following happens during a reference write: // 1. mutator dirties the card (write barrier) @@ -243,22 +240,19 @@ void MarkSweep::PreCleanCards() { void MarkSweep::RevokeAllThreadLocalAllocationStacks(Thread* self) { if (kUseThreadLocalAllocationStack) { - timings_.NewSplit("RevokeAllThreadLocalAllocationStacks"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Locks::mutator_lock_->AssertExclusiveHeld(self); heap_->RevokeAllThreadLocalAllocationStacks(self); } } void MarkSweep::MarkingPhase() { - TimingLogger::ScopedSplit split("MarkingPhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Thread* self = Thread::Current(); - BindBitmaps(); FindDefaultSpaceBitmap(); - // Process dirty cards and add dirty cards to mod union tables. - heap_->ProcessCards(timings_, false); - + heap_->ProcessCards(GetTimings(), false); WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); MarkRoots(self); MarkReachableObjects(); @@ -271,7 +265,7 @@ void MarkSweep::UpdateAndMarkModUnion() { if (immune_region_.ContainsSpace(space)) { const char* name = space->IsZygoteSpace() ? "UpdateAndMarkZygoteModUnionTable" : "UpdateAndMarkImageModUnionTable"; - TimingLogger::ScopedSplit split(name, &timings_); + TimingLogger::ScopedTiming t(name, GetTimings()); accounting::ModUnionTable* mod_union_table = heap_->FindModUnionTableFromSpace(space); CHECK(mod_union_table != nullptr); mod_union_table->UpdateAndMarkReferences(MarkHeapReferenceCallback, this); @@ -286,7 +280,7 @@ void MarkSweep::MarkReachableObjects() { } void MarkSweep::ReclaimPhase() { - TimingLogger::ScopedSplit split("ReclaimPhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Thread* self = Thread::Current(); // Process the references concurrently. ProcessReferences(self); @@ -294,25 +288,19 @@ void MarkSweep::ReclaimPhase() { Runtime::Current()->AllowNewSystemWeaks(); { WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); - // Reclaim unmarked objects. Sweep(false); - // Swap the live and mark bitmaps for each space which we modified space. This is an // optimization that enables us to not clear live bits inside of the sweep. Only swaps unbound // bitmaps. - timings_.StartSplit("SwapBitmaps"); SwapBitmaps(); - timings_.EndSplit(); - // Unbind the live and mark bitmaps. - TimingLogger::ScopedSplit split("UnBindBitmaps", &timings_); GetHeap()->UnBindBitmaps(); } } void MarkSweep::FindDefaultSpaceBitmap() { - TimingLogger::ScopedSplit split("FindDefaultMarkBitmap", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); for (const auto& space : GetHeap()->GetContinuousSpaces()) { accounting::ContinuousSpaceBitmap* bitmap = space->GetMarkBitmap(); // We want to have the main space instead of non moving if possible. @@ -509,11 +497,10 @@ void MarkSweep::VerifyRoots() { } void MarkSweep::MarkRoots(Thread* self) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); if (Locks::mutator_lock_->IsExclusiveHeld(self)) { // If we exclusively hold the mutator lock, all threads must be suspended. 
- timings_.StartSplit("MarkRoots"); Runtime::Current()->VisitRoots(MarkRootCallback, this); - timings_.EndSplit(); RevokeAllThreadLocalAllocationStacks(self); } else { MarkRootsCheckpoint(self, kRevokeRosAllocThreadLocalBuffersAtCheckpoint); @@ -525,16 +512,14 @@ void MarkSweep::MarkRoots(Thread* self) { } void MarkSweep::MarkNonThreadRoots() { - timings_.StartSplit("MarkNonThreadRoots"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Runtime::Current()->VisitNonThreadRoots(MarkRootCallback, this); - timings_.EndSplit(); } void MarkSweep::MarkConcurrentRoots(VisitRootFlags flags) { - timings_.StartSplit("MarkConcurrentRoots"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // Visit all runtime roots and clear dirty flags. Runtime::Current()->VisitConcurrentRoots(MarkRootCallback, this, flags); - timings_.EndSplit(); } class ScanObjectVisitor { @@ -755,7 +740,8 @@ void MarkSweep::ScanGrayObjects(bool paused, byte minimum_age) { Thread* self = Thread::Current(); // Can't have a different split for each space since multiple spaces can have their cards being // scanned at the same time. - timings_.StartSplit(paused ? "(Paused)ScanGrayObjects" : "ScanGrayObjects"); + TimingLogger::ScopedTiming t(paused ? "(Paused)ScanGrayObjects" : __FUNCTION__, + GetTimings()); // Try to take some of the mark stack since we can pass this off to the worker tasks. Object** mark_stack_begin = mark_stack_->Begin(); Object** mark_stack_end = mark_stack_->End(); @@ -808,28 +794,28 @@ void MarkSweep::ScanGrayObjects(bool paused, byte minimum_age) { thread_pool->StartWorkers(self); thread_pool->Wait(self, true, true); thread_pool->StopWorkers(self); - timings_.EndSplit(); } else { for (const auto& space : GetHeap()->GetContinuousSpaces()) { if (space->GetMarkBitmap() != nullptr) { // Image spaces are handled properly since live == marked for them. + const char* name = nullptr; switch (space->GetGcRetentionPolicy()) { - case space::kGcRetentionPolicyNeverCollect: - timings_.StartSplit(paused ? "(Paused)ScanGrayImageSpaceObjects" : - "ScanGrayImageSpaceObjects"); - break; - case space::kGcRetentionPolicyFullCollect: - timings_.StartSplit(paused ? "(Paused)ScanGrayZygoteSpaceObjects" : - "ScanGrayZygoteSpaceObjects"); - break; - case space::kGcRetentionPolicyAlwaysCollect: - timings_.StartSplit(paused ? "(Paused)ScanGrayAllocSpaceObjects" : - "ScanGrayAllocSpaceObjects"); - break; - } + case space::kGcRetentionPolicyNeverCollect: + name = paused ? "(Paused)ScanGrayImageSpaceObjects" : "ScanGrayImageSpaceObjects"; + break; + case space::kGcRetentionPolicyFullCollect: + name = paused ? "(Paused)ScanGrayZygoteSpaceObjects" : "ScanGrayZygoteSpaceObjects"; + break; + case space::kGcRetentionPolicyAlwaysCollect: + name = paused ? 
"(Paused)ScanGrayAllocSpaceObjects" : "ScanGrayAllocSpaceObjects"; + break; + default: + LOG(FATAL) << "Unreachable"; + } + TimingLogger::ScopedTiming t(name, GetTimings()); ScanObjectVisitor visitor(this); - card_table->Scan(space->GetMarkBitmap(), space->Begin(), space->End(), visitor, minimum_age); - timings_.EndSplit(); + card_table->Scan(space->GetMarkBitmap(), space->Begin(), space->End(), visitor, + minimum_age); } } } @@ -839,9 +825,7 @@ class RecursiveMarkTask : public MarkStackTask<false> { public: RecursiveMarkTask(ThreadPool* thread_pool, MarkSweep* mark_sweep, accounting::ContinuousSpaceBitmap* bitmap, uintptr_t begin, uintptr_t end) - : MarkStackTask<false>(thread_pool, mark_sweep, 0, NULL), - bitmap_(bitmap), - begin_(begin), + : MarkStackTask<false>(thread_pool, mark_sweep, 0, NULL), bitmap_(bitmap), begin_(begin), end_(end) { } @@ -866,7 +850,7 @@ class RecursiveMarkTask : public MarkStackTask<false> { // Populates the mark stack based on the set of marked objects and // recursively marks until the mark stack is emptied. void MarkSweep::RecursiveMark() { - TimingLogger::ScopedSplit split("RecursiveMark", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // RecursiveMark will build the lists of known instances of the Reference classes. See // DelayReferenceReferent for details. if (kUseRecursiveMark) { @@ -933,25 +917,22 @@ void MarkSweep::RecursiveMarkDirtyObjects(bool paused, byte minimum_age) { } void MarkSweep::ReMarkRoots() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Locks::mutator_lock_->AssertExclusiveHeld(Thread::Current()); - timings_.StartSplit("(Paused)ReMarkRoots"); Runtime::Current()->VisitRoots( MarkRootCallback, this, static_cast<VisitRootFlags>(kVisitRootFlagNewRoots | kVisitRootFlagStopLoggingNewRoots | kVisitRootFlagClearRootLog)); - timings_.EndSplit(); if (kVerifyRootsMarked) { - timings_.StartSplit("(Paused)VerifyRoots"); + TimingLogger::ScopedTiming t("(Paused)VerifyRoots", GetTimings()); Runtime::Current()->VisitRoots(VerifyRootMarked, this); - timings_.EndSplit(); } } void MarkSweep::SweepSystemWeaks(Thread* self) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); - timings_.StartSplit("SweepSystemWeaks"); Runtime::Current()->SweepSystemWeaks(IsMarkedCallback, this); - timings_.EndSplit(); } mirror::Object* MarkSweep::VerifySystemWeakIsLiveCallback(Object* obj, void* arg) { @@ -972,6 +953,7 @@ void MarkSweep::VerifyIsLive(const Object* obj) { } void MarkSweep::VerifySystemWeaks() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // Verify system weaks, uses a special object visitor which returns the input object. Runtime::Current()->SweepSystemWeaks(VerifySystemWeakIsLiveCallback, this); } @@ -1008,8 +990,8 @@ class CheckpointMarkThreadRoots : public Closure { void MarkSweep::MarkRootsCheckpoint(Thread* self, bool revoke_ros_alloc_thread_local_buffers_at_checkpoint) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); CheckpointMarkThreadRoots check_point(this, revoke_ros_alloc_thread_local_buffers_at_checkpoint); - timings_.StartSplit("MarkRootsCheckpoint"); ThreadList* thread_list = Runtime::Current()->GetThreadList(); // Request the check point is run on all threads returning a count of the threads that must // run through the barrier including self. 
@@ -1024,19 +1006,16 @@ void MarkSweep::MarkRootsCheckpoint(Thread* self, } Locks::mutator_lock_->SharedLock(self); Locks::heap_bitmap_lock_->ExclusiveLock(self); - timings_.EndSplit(); } void MarkSweep::SweepArray(accounting::ObjectStack* allocations, bool swap_bitmaps) { - timings_.StartSplit("SweepArray"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Thread* self = Thread::Current(); mirror::Object** chunk_free_buffer = reinterpret_cast<mirror::Object**>( sweep_array_free_buffer_mem_map_->BaseBegin()); size_t chunk_free_pos = 0; - size_t freed_bytes = 0; - size_t freed_large_object_bytes = 0; - size_t freed_objects = 0; - size_t freed_large_objects = 0; + ObjectBytePair freed; + ObjectBytePair freed_los; // How many objects are left in the array, modified after each space is swept. Object** objects = allocations->Begin(); size_t count = allocations->Size(); @@ -1077,10 +1056,9 @@ void MarkSweep::SweepArray(accounting::ObjectStack* allocations, bool swap_bitma // if needed. if (!mark_bitmap->Test(obj)) { if (chunk_free_pos >= kSweepArrayChunkFreeSize) { - timings_.StartSplit("FreeList"); - freed_objects += chunk_free_pos; - freed_bytes += alloc_space->FreeList(self, chunk_free_pos, chunk_free_buffer); - timings_.EndSplit(); + TimingLogger::ScopedTiming t("FreeList", GetTimings()); + freed.objects += chunk_free_pos; + freed.bytes += alloc_space->FreeList(self, chunk_free_pos, chunk_free_buffer); chunk_free_pos = 0; } chunk_free_buffer[chunk_free_pos++] = obj; @@ -1090,10 +1068,9 @@ void MarkSweep::SweepArray(accounting::ObjectStack* allocations, bool swap_bitma } } if (chunk_free_pos > 0) { - timings_.StartSplit("FreeList"); - freed_objects += chunk_free_pos; - freed_bytes += alloc_space->FreeList(self, chunk_free_pos, chunk_free_buffer); - timings_.EndSplit(); + TimingLogger::ScopedTiming t("FreeList", GetTimings()); + freed.objects += chunk_free_pos; + freed.bytes += alloc_space->FreeList(self, chunk_free_pos, chunk_free_buffer); chunk_free_pos = 0; } // All of the references which space contained are no longer in the allocation stack, update @@ -1114,58 +1091,47 @@ void MarkSweep::SweepArray(accounting::ObjectStack* allocations, bool swap_bitma continue; } if (!large_mark_objects->Test(obj)) { - ++freed_large_objects; - freed_large_object_bytes += large_object_space->Free(self, obj); + ++freed_los.objects; + freed_los.bytes += large_object_space->Free(self, obj); } } - timings_.EndSplit(); - - timings_.StartSplit("RecordFree"); - VLOG(heap) << "Freed " << freed_objects << "/" << count << " objects with size " - << PrettySize(freed_bytes); - RecordFree(freed_objects, freed_bytes); - RecordFreeLargeObjects(freed_large_objects, freed_large_object_bytes); - timings_.EndSplit(); - - timings_.StartSplit("ResetStack"); - allocations->Reset(); - timings_.EndSplit(); - + { + TimingLogger::ScopedTiming t("RecordFree", GetTimings()); + RecordFree(freed); + RecordFreeLOS(freed_los); + t.NewTiming("ResetStack"); + allocations->Reset(); + } sweep_array_free_buffer_mem_map_->MadviseDontNeedAndZero(); } void MarkSweep::Sweep(bool swap_bitmaps) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // Ensure that nobody inserted items in the live stack after we swapped the stacks. CHECK_GE(live_stack_freeze_size_, GetHeap()->GetLiveStack()->Size()); - // Mark everything allocated since the last as GC live so that we can sweep concurrently, - // knowing that new allocations won't be marked as live. 
- timings_.StartSplit("MarkStackAsLive"); - accounting::ObjectStack* live_stack = heap_->GetLiveStack(); - heap_->MarkAllocStackAsLive(live_stack); - live_stack->Reset(); - timings_.EndSplit(); - - DCHECK(mark_stack_->IsEmpty()); + { + TimingLogger::ScopedTiming t2("MarkAllocStackAsLive", GetTimings()); + // Mark everything allocated since the last as GC live so that we can sweep concurrently, + // knowing that new allocations won't be marked as live. + accounting::ObjectStack* live_stack = heap_->GetLiveStack(); + heap_->MarkAllocStackAsLive(live_stack); + live_stack->Reset(); + DCHECK(mark_stack_->IsEmpty()); + } for (const auto& space : GetHeap()->GetContinuousSpaces()) { if (space->IsContinuousMemMapAllocSpace()) { space::ContinuousMemMapAllocSpace* alloc_space = space->AsContinuousMemMapAllocSpace(); - TimingLogger::ScopedSplit split( - alloc_space->IsZygoteSpace() ? "SweepZygoteSpace" : "SweepMallocSpace", &timings_); - size_t freed_objects = 0; - size_t freed_bytes = 0; - alloc_space->Sweep(swap_bitmaps, &freed_objects, &freed_bytes); - RecordFree(freed_objects, freed_bytes); + TimingLogger::ScopedTiming split( + alloc_space->IsZygoteSpace() ? "SweepZygoteSpace" : "SweepMallocSpace", GetTimings()); + RecordFree(alloc_space->Sweep(swap_bitmaps)); } } SweepLargeObjects(swap_bitmaps); } void MarkSweep::SweepLargeObjects(bool swap_bitmaps) { - TimingLogger::ScopedSplit split("SweepLargeObjects", &timings_); - size_t freed_objects = 0; - size_t freed_bytes = 0; - heap_->GetLargeObjectsSpace()->Sweep(swap_bitmaps, &freed_objects, &freed_bytes); - RecordFreeLargeObjects(freed_objects, freed_bytes); + TimingLogger::ScopedTiming split(__FUNCTION__, GetTimings()); + RecordFreeLOS(heap_->GetLargeObjectsSpace()->Sweep(swap_bitmaps)); } // Process the "referent" field in a java.lang.ref.Reference. If the referent has not yet been @@ -1233,7 +1199,7 @@ void MarkSweep::ProcessMarkStackParallel(size_t thread_count) { // Scan anything that's on the mark stack. void MarkSweep::ProcessMarkStack(bool paused) { - timings_.StartSplit(paused ? "(Paused)ProcessMarkStack" : "ProcessMarkStack"); + TimingLogger::ScopedTiming t(paused ? "(Paused)ProcessMarkStack" : __FUNCTION__, GetTimings()); size_t thread_count = GetThreadCount(paused); if (kParallelProcessMarkStack && thread_count > 1 && mark_stack_->Size() >= kMinimumParallelMarkStackSize) { @@ -1266,7 +1232,6 @@ void MarkSweep::ProcessMarkStack(bool paused) { ScanObject(obj); } } - timings_.EndSplit(); } inline bool MarkSweep::IsMarked(const Object* object) const { @@ -1280,7 +1245,7 @@ inline bool MarkSweep::IsMarked(const Object* object) const { } void MarkSweep::FinishPhase() { - TimingLogger::ScopedSplit split("FinishPhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); if (kCountScannedTypes) { VLOG(gc) << "MarkSweep scanned classes=" << class_count_.LoadRelaxed() << " arrays=" << array_count_.LoadRelaxed() << " other=" << other_count_.LoadRelaxed(); @@ -1317,9 +1282,8 @@ void MarkSweep::RevokeAllThreadLocalBuffers() { // not be in use. 
GetHeap()->AssertAllBumpPointerSpaceThreadLocalBuffersAreRevoked(); } else { - timings_.StartSplit("(Paused)RevokeAllThreadLocalBuffers"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); GetHeap()->RevokeAllThreadLocalBuffers(); - timings_.EndSplit(); } } diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc index 54e77a7fe0..cabfe2176c 100644 --- a/runtime/gc/collector/semi_space.cc +++ b/runtime/gc/collector/semi_space.cc @@ -59,7 +59,7 @@ static constexpr size_t kBytesPromotedThreshold = 4 * MB; static constexpr size_t kLargeObjectBytesAllocatedThreshold = 16 * MB; void SemiSpace::BindBitmaps() { - timings_.StartSplit("BindBitmaps"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_); // Mark all of the spaces we never collect as immune. for (const auto& space : GetHeap()->GetContinuousSpaces()) { @@ -83,7 +83,6 @@ void SemiSpace::BindBitmaps() { // We won't collect the large object space if a bump pointer space only collection. is_large_object_space_immune_ = true; } - timings_.EndSplit(); } SemiSpace::SemiSpace(Heap* heap, bool generational, const std::string& name_prefix) @@ -131,7 +130,7 @@ void SemiSpace::RunPhases() { } void SemiSpace::InitializePhase() { - TimingLogger::ScopedSplit split("InitializePhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); mark_stack_ = heap_->GetMarkStack(); DCHECK(mark_stack_ != nullptr); immune_region_.Reset(); @@ -151,14 +150,14 @@ void SemiSpace::InitializePhase() { } void SemiSpace::ProcessReferences(Thread* self) { - TimingLogger::ScopedSplit split("ProcessReferences", &timings_); WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); GetHeap()->GetReferenceProcessor()->ProcessReferences( - false, &timings_, clear_soft_references_, &HeapReferenceMarkedCallback, - &MarkObjectCallback, &ProcessMarkStackCallback, this); + false, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), + &HeapReferenceMarkedCallback, &MarkObjectCallback, &ProcessMarkStackCallback, this); } void SemiSpace::MarkingPhase() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); CHECK(Locks::mutator_lock_->IsExclusiveHeld(self_)); if (kStoreStackTraces) { Locks::mutator_lock_->AssertExclusiveHeld(self_); @@ -176,8 +175,9 @@ void SemiSpace::MarkingPhase() { // to prevent fragmentation. RevokeAllThreadLocalBuffers(); if (generational_) { - if (gc_cause_ == kGcCauseExplicit || gc_cause_ == kGcCauseForNativeAlloc || - clear_soft_references_) { + if (GetCurrentIteration()->GetGcCause() == kGcCauseExplicit || + GetCurrentIteration()->GetGcCause() == kGcCauseForNativeAlloc || + GetCurrentIteration()->GetClearSoftReferences()) { // If an explicit, native allocation-triggered, or last attempt // collection, collect the whole heap. whole_heap_collection_ = true; @@ -191,21 +191,12 @@ void SemiSpace::MarkingPhase() { } } - if (!clear_soft_references_) { - if (!generational_) { - // If non-generational, always clear soft references. - clear_soft_references_ = true; - } else { - // If generational, clear soft references if a whole heap collection. - if (whole_heap_collection_) { - clear_soft_references_ = true; - } - } + if (!generational_ || whole_heap_collection_) { + // If non-generational, always clear soft references. + // If generational, clear soft references if a whole heap collection. 
+ GetCurrentIteration()->SetClearSoftReferences(true); } - Locks::mutator_lock_->AssertExclusiveHeld(self_); - - TimingLogger::ScopedSplit split("MarkingPhase", &timings_); if (generational_) { // If last_gc_to_space_end_ is out of the bounds of the from-space // (the to-space from last GC), then point it to the beginning of @@ -220,15 +211,16 @@ void SemiSpace::MarkingPhase() { // Assume the cleared space is already empty. BindBitmaps(); // Process dirty cards and add dirty cards to mod-union tables. - heap_->ProcessCards(timings_, kUseRememberedSet && generational_); + heap_->ProcessCards(GetTimings(), kUseRememberedSet && generational_); // Clear the whole card table since we can not Get any additional dirty cards during the // paused GC. This saves memory but only works for pause the world collectors. - timings_.NewSplit("ClearCardTable"); + t.NewTiming("ClearCardTable"); heap_->GetCardTable()->ClearCardTable(); // Need to do this before the checkpoint since we don't want any threads to add references to // the live stack during the recursive mark. - timings_.NewSplit("SwapStacks"); + t.NewTiming("SwapStacks"); if (kUseThreadLocalAllocationStack) { + TimingLogger::ScopedTiming t("RevokeAllThreadLocalAllocationStacks", GetTimings()); heap_->RevokeAllThreadLocalAllocationStacks(self_); } heap_->SwapStacks(self_); @@ -245,7 +237,6 @@ void SemiSpace::MarkingPhase() { ReaderMutexLock mu(self_, *Locks::heap_bitmap_lock_); SweepSystemWeaks(); } - timings_.NewSplit("RecordFree"); // Revoke buffers before measuring how many objects were moved since the TLABs need to be revoked // before they are properly counted. RevokeAllThreadLocalBuffers(); @@ -257,14 +248,12 @@ void SemiSpace::MarkingPhase() { CHECK_LE(to_objects, from_objects); // Note: Freed bytes can be negative if we copy form a compacted space to a free-list backed // space. - RecordFree(from_objects - to_objects, from_bytes - to_bytes); + RecordFree(ObjectBytePair(from_objects - to_objects, from_bytes - to_bytes)); // Clear and protect the from space. from_space_->Clear(); VLOG(heap) << "Protecting from_space_: " << *from_space_; from_space_->GetMemMap()->Protect(kProtectFromSpace ? PROT_NONE : PROT_READ); - timings_.StartSplit("PreSweepingGcVerification"); heap_->PreSweepingGcVerification(this); - timings_.EndSplit(); if (swap_semi_spaces_) { heap_->SwapSemiSpaces(); } @@ -277,10 +266,10 @@ void SemiSpace::UpdateAndMarkModUnion() { accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space); if (table != nullptr) { // TODO: Improve naming. - TimingLogger::ScopedSplit split( + TimingLogger::ScopedTiming t( space->IsZygoteSpace() ? 
"UpdateAndMarkZygoteModUnionTable" : "UpdateAndMarkImageModUnionTable", - &timings_); + GetTimings()); table->UpdateAndMarkReferences(MarkHeapReferenceCallback, this); } else if (heap_->FindRememberedSetFromSpace(space) != nullptr) { DCHECK(kUseRememberedSet); @@ -359,12 +348,14 @@ class SemiSpaceVerifyNoFromSpaceReferencesObjectVisitor { }; void SemiSpace::MarkReachableObjects() { - timings_.StartSplit("MarkStackAsLive"); - accounting::ObjectStack* live_stack = heap_->GetLiveStack(); - heap_->MarkAllocStackAsLive(live_stack); - live_stack->Reset(); - - timings_.NewSplit("UpdateAndMarkRememberedSets"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + { + TimingLogger::ScopedTiming t2("MarkStackAsLive", GetTimings()); + accounting::ObjectStack* live_stack = heap_->GetLiveStack(); + heap_->MarkAllocStackAsLive(live_stack); + live_stack->Reset(); + } + t.NewTiming("UpdateAndMarkRememberedSets"); for (auto& space : heap_->GetContinuousSpaces()) { // If the space is immune and has no mod union table (the // non-moving space when the bump pointer space only collection is @@ -403,7 +394,7 @@ void SemiSpace::MarkReachableObjects() { } if (is_large_object_space_immune_) { - timings_.NewSplit("VisitLargeObjects"); + TimingLogger::ScopedTiming t("VisitLargeObjects", GetTimings()); DCHECK(generational_ && !whole_heap_collection_); // Delay copying the live set to the marked set until here from // BindBitmaps() as the large objects on the allocation stack may @@ -421,31 +412,24 @@ void SemiSpace::MarkReachableObjects() { reinterpret_cast<uintptr_t>(large_object_space->End()), visitor); } - timings_.EndSplit(); // Recursively process the mark stack. ProcessMarkStack(); } void SemiSpace::ReclaimPhase() { - TimingLogger::ScopedSplit split("ReclaimPhase", &timings_); - { - WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_); - // Reclaim unmarked objects. - Sweep(false); - // Swap the live and mark bitmaps for each space which we modified space. This is an - // optimization that enables us to not clear live bits inside of the sweep. Only swaps unbound - // bitmaps. - timings_.StartSplit("SwapBitmaps"); - SwapBitmaps(); - timings_.EndSplit(); - // Unbind the live and mark bitmaps. - TimingLogger::ScopedSplit split("UnBindBitmaps", &timings_); - GetHeap()->UnBindBitmaps(); - } + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_); + // Reclaim unmarked objects. + Sweep(false); + // Swap the live and mark bitmaps for each space which we modified space. This is an + // optimization that enables us to not clear live bits inside of the sweep. Only swaps unbound + // bitmaps. + SwapBitmaps(); + // Unbind the live and mark bitmaps. + GetHeap()->UnBindBitmaps(); if (saved_bytes_ > 0) { VLOG(heap) << "Avoided dirtying " << PrettySize(saved_bytes_); } - if (generational_) { // Record the end (top) of the to space so we can distinguish // between objects that were allocated since the last GC and the @@ -634,8 +618,7 @@ void SemiSpace::MarkRootCallback(Object** root, void* arg, uint32_t /*thread_id* // Marks all objects in the root set. void SemiSpace::MarkRoots() { - timings_.NewSplit("MarkRoots"); - // TODO: Visit up image roots as well? 
+ TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Runtime::Current()->VisitRoots(MarkRootCallback, this); } @@ -660,9 +643,8 @@ mirror::Object* SemiSpace::MarkedForwardingAddressCallback(mirror::Object* objec } void SemiSpace::SweepSystemWeaks() { - timings_.StartSplit("SweepSystemWeaks"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Runtime::Current()->SweepSystemWeaks(MarkedForwardingAddressCallback, this); - timings_.EndSplit(); } bool SemiSpace::ShouldSweepSpace(space::ContinuousSpace* space) const { @@ -670,20 +652,17 @@ bool SemiSpace::ShouldSweepSpace(space::ContinuousSpace* space) const { } void SemiSpace::Sweep(bool swap_bitmaps) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); DCHECK(mark_stack_->IsEmpty()); - TimingLogger::ScopedSplit split("Sweep", &timings_); for (const auto& space : GetHeap()->GetContinuousSpaces()) { if (space->IsContinuousMemMapAllocSpace()) { space::ContinuousMemMapAllocSpace* alloc_space = space->AsContinuousMemMapAllocSpace(); if (!ShouldSweepSpace(alloc_space)) { continue; } - TimingLogger::ScopedSplit split( - alloc_space->IsZygoteSpace() ? "SweepZygoteSpace" : "SweepAllocSpace", &timings_); - size_t freed_objects = 0; - size_t freed_bytes = 0; - alloc_space->Sweep(swap_bitmaps, &freed_objects, &freed_bytes); - RecordFree(freed_objects, freed_bytes); + TimingLogger::ScopedTiming split( + alloc_space->IsZygoteSpace() ? "SweepZygoteSpace" : "SweepAllocSpace", GetTimings()); + RecordFree(alloc_space->Sweep(swap_bitmaps)); } } if (!is_large_object_space_immune_) { @@ -693,11 +672,8 @@ void SemiSpace::Sweep(bool swap_bitmaps) { void SemiSpace::SweepLargeObjects(bool swap_bitmaps) { DCHECK(!is_large_object_space_immune_); - TimingLogger::ScopedSplit split("SweepLargeObjects", &timings_); - size_t freed_objects = 0; - size_t freed_bytes = 0; - heap_->GetLargeObjectsSpace()->Sweep(swap_bitmaps, &freed_objects, &freed_bytes); - RecordFreeLargeObjects(freed_objects, freed_bytes); + TimingLogger::ScopedTiming split("SweepLargeObjects", GetTimings()); + RecordFreeLOS(heap_->GetLargeObjectsSpace()->Sweep(swap_bitmaps)); } // Process the "referent" field in a java.lang.ref.Reference. If the referent has not yet been @@ -737,6 +713,7 @@ void SemiSpace::ScanObject(Object* obj) { // Scan anything that's on the mark stack. void SemiSpace::ProcessMarkStack() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); space::MallocSpace* promo_dest_space = nullptr; accounting::ContinuousSpaceBitmap* live_bitmap = nullptr; if (generational_ && !whole_heap_collection_) { @@ -750,7 +727,6 @@ void SemiSpace::ProcessMarkStack() { DCHECK(mark_bitmap != nullptr); DCHECK_EQ(live_bitmap, mark_bitmap); } - timings_.StartSplit("ProcessMarkStack"); while (!mark_stack_->IsEmpty()) { Object* obj = mark_stack_->PopBack(); if (generational_ && !whole_heap_collection_ && promo_dest_space->HasAddress(obj)) { @@ -761,7 +737,6 @@ void SemiSpace::ProcessMarkStack() { } ScanObject(obj); } - timings_.EndSplit(); } inline Object* SemiSpace::GetMarkedForwardAddress(mirror::Object* obj) const @@ -792,7 +767,7 @@ void SemiSpace::SetFromSpace(space::ContinuousMemMapAllocSpace* from_space) { } void SemiSpace::FinishPhase() { - TimingLogger::ScopedSplit split("FinishPhase", &timings_); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // Null the "to" and "from" spaces since compacting from one to the other isn't valid until // further action is done by the heap. 
to_space_ = nullptr; @@ -833,9 +808,8 @@ void SemiSpace::FinishPhase() { } void SemiSpace::RevokeAllThreadLocalBuffers() { - timings_.StartSplit("(Paused)RevokeAllThreadLocalBuffers"); + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); GetHeap()->RevokeAllThreadLocalBuffers(); - timings_.EndSplit(); } } // namespace collector diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index 1c94d6f224..696728ba9a 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -866,7 +866,10 @@ void Heap::DoPendingTransitionOrTrim() { // about pauses. Runtime* runtime = Runtime::Current(); runtime->GetThreadList()->SuspendAll(); - runtime->GetMonitorList()->DeflateMonitors(); + uint64_t start_time = NanoTime(); + size_t count = runtime->GetMonitorList()->DeflateMonitors(); + VLOG(heap) << "Deflating " << count << " monitors took " + << PrettyDuration(NanoTime() - start_time); runtime->GetThreadList()->ResumeAll(); // Do a heap trim if it is needed. Trim(); @@ -1580,6 +1583,7 @@ class ZygoteCompactingCollector FINAL : public collector::SemiSpace { }; void Heap::UnBindBitmaps() { + TimingLogger::ScopedTiming t("UnBindBitmaps", GetCurrentGcIteration()->GetTimings()); for (const auto& space : GetContinuousSpaces()) { if (space->IsContinuousMemMapAllocSpace()) { space::ContinuousMemMapAllocSpace* alloc_space = space->AsContinuousMemMapAllocSpace(); @@ -1643,8 +1647,8 @@ void Heap::PreZygoteFork() { if (temp_space_ != nullptr) { CHECK(temp_space_->IsEmpty()); } - total_objects_freed_ever_ += semi_space_collector_->GetFreedObjects(); - total_bytes_freed_ever_ += semi_space_collector_->GetFreedBytes(); + total_objects_freed_ever_ += GetCurrentGcIteration()->GetFreedObjects(); + total_bytes_freed_ever_ += GetCurrentGcIteration()->GetFreedBytes(); // Update the end and write out image. non_moving_space_->SetEnd(target_space.End()); non_moving_space_->SetLimit(target_space.Limit()); @@ -1838,17 +1842,17 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, GcCaus << "Could not find garbage collector with collector_type=" << static_cast<size_t>(collector_type_) << " and gc_type=" << gc_type; collector->Run(gc_cause, clear_soft_references || runtime->IsZygote()); - total_objects_freed_ever_ += collector->GetFreedObjects(); - total_bytes_freed_ever_ += collector->GetFreedBytes(); + total_objects_freed_ever_ += GetCurrentGcIteration()->GetFreedObjects(); + total_bytes_freed_ever_ += GetCurrentGcIteration()->GetFreedBytes(); RequestHeapTrim(); // Enqueue cleared references. reference_processor_.EnqueueClearedReferences(self); // Grow the heap so that we know when to perform the next GC. GrowForUtilization(collector); - const size_t duration = collector->GetDurationNs(); - const std::vector<uint64_t>& pause_times = collector->GetPauseTimes(); + const size_t duration = GetCurrentGcIteration()->GetDurationNs(); + const std::vector<uint64_t>& pause_times = GetCurrentGcIteration()->GetPauseTimes(); // Print the GC if it is an explicit GC (e.g. Runtime.gc()) or a slow GC - // (mutator time blocked >= long_pause_log_threshold_). + // (mutator time blocked >= long_pause_log_threshold_). bool log_gc = gc_cause == kGcCauseExplicit; if (!log_gc && CareAboutPauseTimes()) { // GC for alloc pauses the allocating thread, so consider it as a pause. @@ -1868,14 +1872,14 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, GcCaus << ((i != pause_times.size() - 1) ? 
"," : ""); } LOG(INFO) << gc_cause << " " << collector->GetName() - << " GC freed " << collector->GetFreedObjects() << "(" - << PrettySize(collector->GetFreedBytes()) << ") AllocSpace objects, " - << collector->GetFreedLargeObjects() << "(" - << PrettySize(collector->GetFreedLargeObjectBytes()) << ") LOS objects, " + << " GC freed " << current_gc_iteration_.GetFreedObjects() << "(" + << PrettySize(current_gc_iteration_.GetFreedBytes()) << ") AllocSpace objects, " + << current_gc_iteration_.GetFreedLargeObjects() << "(" + << PrettySize(current_gc_iteration_.GetFreedLargeObjectBytes()) << ") LOS objects, " << percent_free << "% free, " << PrettySize(current_heap_size) << "/" << PrettySize(total_memory) << ", " << "paused " << pause_string.str() << " total " << PrettyDuration((duration / 1000) * 1000); - VLOG(heap) << ConstDumpable<TimingLogger>(collector->GetTimings()); + VLOG(heap) << ConstDumpable<TimingLogger>(*current_gc_iteration_.GetTimings()); } FinishGC(self, gc_type); // Inform DDMS that a GC completed. @@ -2313,7 +2317,8 @@ accounting::RememberedSet* Heap::FindRememberedSetFromSpace(space::Space* space) return it->second; } -void Heap::ProcessCards(TimingLogger& timings, bool use_rem_sets) { +void Heap::ProcessCards(TimingLogger* timings, bool use_rem_sets) { + TimingLogger::ScopedTiming t(__FUNCTION__, timings); // Clear cards and keep track of cards cleared in the mod-union table. for (const auto& space : continuous_spaces_) { accounting::ModUnionTable* table = FindModUnionTableFromSpace(space); @@ -2321,15 +2326,15 @@ void Heap::ProcessCards(TimingLogger& timings, bool use_rem_sets) { if (table != nullptr) { const char* name = space->IsZygoteSpace() ? "ZygoteModUnionClearCards" : "ImageModUnionClearCards"; - TimingLogger::ScopedSplit split(name, &timings); + TimingLogger::ScopedTiming t(name, timings); table->ClearCards(); } else if (use_rem_sets && rem_set != nullptr) { DCHECK(collector::SemiSpace::kUseRememberedSet && collector_type_ == kCollectorTypeGSS) << static_cast<int>(collector_type_); - TimingLogger::ScopedSplit split("AllocSpaceRemSetClearCards", &timings); + TimingLogger::ScopedTiming t("AllocSpaceRemSetClearCards", timings); rem_set->ClearCards(); } else if (space->GetType() != space::kSpaceTypeBumpPointerSpace) { - TimingLogger::ScopedSplit split("AllocSpaceClearCards", &timings); + TimingLogger::ScopedTiming t("AllocSpaceClearCards", timings); // No mod union table for the AllocSpace. Age the cards so that the GC knows that these cards // were dirty before the GC started. // TODO: Need to use atomic for the case where aged(cleaning thread) -> dirty(other thread) @@ -2337,7 +2342,8 @@ void Heap::ProcessCards(TimingLogger& timings, bool use_rem_sets) { // The races are we either end up with: Aged card, unaged card. Since we have the checkpoint // roots and then we scan / update mod union tables after. We will always scan either card. // If we end up with the non aged card, we scan it it in the pause. 
- card_table_->ModifyCardsAtomic(space->Begin(), space->End(), AgeCardVisitor(), VoidFunctor()); + card_table_->ModifyCardsAtomic(space->Begin(), space->End(), AgeCardVisitor(), + VoidFunctor()); } } } @@ -2347,9 +2353,10 @@ static void IdentityMarkHeapReferenceCallback(mirror::HeapReference<mirror::Obje void Heap::PreGcVerificationPaused(collector::GarbageCollector* gc) { Thread* const self = Thread::Current(); - TimingLogger* const timings = &gc->GetTimings(); + TimingLogger* const timings = current_gc_iteration_.GetTimings(); + TimingLogger::ScopedTiming t(__FUNCTION__, timings); if (verify_pre_gc_heap_) { - TimingLogger::ScopedSplit split("PreGcVerifyHeapReferences", timings); + TimingLogger::ScopedTiming t("(Paused)PreGcVerifyHeapReferences", timings); ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_); size_t failures = VerifyHeapReferences(); if (failures > 0) { @@ -2359,7 +2366,7 @@ void Heap::PreGcVerificationPaused(collector::GarbageCollector* gc) { } // Check that all objects which reference things in the live stack are on dirty cards. if (verify_missing_card_marks_) { - TimingLogger::ScopedSplit split("PreGcVerifyMissingCardMarks", timings); + TimingLogger::ScopedTiming t("(Paused)PreGcVerifyMissingCardMarks", timings); ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_); SwapStacks(self); // Sort the live stack so that we can quickly binary search it later. @@ -2369,7 +2376,7 @@ void Heap::PreGcVerificationPaused(collector::GarbageCollector* gc) { SwapStacks(self); } if (verify_mod_union_table_) { - TimingLogger::ScopedSplit split("PreGcVerifyModUnionTables", timings); + TimingLogger::ScopedTiming t("(Paused)PreGcVerifyModUnionTables", timings); ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_); for (const auto& table_pair : mod_union_tables_) { accounting::ModUnionTable* mod_union_table = table_pair.second; @@ -2389,17 +2396,18 @@ void Heap::PreGcVerification(collector::GarbageCollector* gc) { void Heap::PrePauseRosAllocVerification(collector::GarbageCollector* gc) { // TODO: Add a new runtime option for this? if (verify_pre_gc_rosalloc_) { - RosAllocVerification(&gc->GetTimings(), "PreGcRosAllocVerification"); + RosAllocVerification(current_gc_iteration_.GetTimings(), "PreGcRosAllocVerification"); } } void Heap::PreSweepingGcVerification(collector::GarbageCollector* gc) { Thread* const self = Thread::Current(); - TimingLogger* const timings = &gc->GetTimings(); + TimingLogger* const timings = current_gc_iteration_.GetTimings(); + TimingLogger::ScopedTiming t(__FUNCTION__, timings); // Called before sweeping occurs since we want to make sure we are not going so reclaim any // reachable objects. if (verify_pre_sweeping_heap_) { - TimingLogger::ScopedSplit split("PostSweepingVerifyHeapReferences", timings); + TimingLogger::ScopedTiming t("(Paused)PostSweepingVerifyHeapReferences", timings); CHECK_NE(self->GetState(), kRunnable); WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); // Swapping bound bitmaps does nothing. @@ -2421,17 +2429,18 @@ void Heap::PreSweepingGcVerification(collector::GarbageCollector* gc) { void Heap::PostGcVerificationPaused(collector::GarbageCollector* gc) { // Only pause if we have to do some verification. 
Thread* const self = Thread::Current(); - TimingLogger* const timings = &gc->GetTimings(); + TimingLogger* const timings = GetCurrentGcIteration()->GetTimings(); + TimingLogger::ScopedTiming t(__FUNCTION__, timings); if (verify_system_weaks_) { ReaderMutexLock mu2(self, *Locks::heap_bitmap_lock_); collector::MarkSweep* mark_sweep = down_cast<collector::MarkSweep*>(gc); mark_sweep->VerifySystemWeaks(); } if (verify_post_gc_rosalloc_) { - RosAllocVerification(timings, "PostGcRosAllocVerification"); + RosAllocVerification(timings, "(Paused)PostGcRosAllocVerification"); } if (verify_post_gc_heap_) { - TimingLogger::ScopedSplit split("PostGcVerifyHeapReferences", timings); + TimingLogger::ScopedTiming t("(Paused)PostGcVerifyHeapReferences", timings); ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_); size_t failures = VerifyHeapReferences(); if (failures > 0) { @@ -2449,7 +2458,7 @@ void Heap::PostGcVerification(collector::GarbageCollector* gc) { } void Heap::RosAllocVerification(TimingLogger* timings, const char* name) { - TimingLogger::ScopedSplit split(name, timings); + TimingLogger::ScopedTiming t(name, timings); for (const auto& space : continuous_spaces_) { if (space->IsRosAllocSpace()) { VLOG(heap) << name << " : " << space->GetName(); @@ -2575,9 +2584,9 @@ void Heap::GrowForUtilization(collector::GarbageCollector* collector_ran) { // We also check that the bytes allocated aren't over the footprint limit in order to prevent a // pathological case where dead objects which aren't reclaimed by sticky could get accumulated // if the sticky GC throughput always remained >= the full/partial throughput. - if (collector_ran->GetEstimatedLastIterationThroughput() * kStickyGcThroughputAdjustment >= + if (current_gc_iteration_.GetEstimatedThroughput() * kStickyGcThroughputAdjustment >= non_sticky_collector->GetEstimatedMeanThroughput() && - non_sticky_collector->GetIterations() > 0 && + non_sticky_collector->NumberOfIterations() > 0 && bytes_allocated <= max_allowed_footprint_) { next_gc_type_ = collector::kGcTypeSticky; } else { @@ -2595,7 +2604,7 @@ void Heap::GrowForUtilization(collector::GarbageCollector* collector_ran) { if (IsGcConcurrent()) { // Calculate when to perform the next ConcurrentGC. // Calculate the estimated GC duration. - const double gc_duration_seconds = NsToMs(collector_ran->GetDurationNs()) / 1000.0; + const double gc_duration_seconds = NsToMs(current_gc_iteration_.GetDurationNs()) / 1000.0; // Estimate how many remaining bytes we will have when we need to start the next GC. size_t remaining_bytes = allocation_rate_ * gc_duration_seconds; remaining_bytes = std::min(remaining_bytes, kMaxConcurrentRemainingBytes); diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index 368a20c5da..a34cd3871d 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -27,6 +27,7 @@ #include "gc/accounting/atomic_stack.h" #include "gc/accounting/card_table.h" #include "gc/gc_cause.h" +#include "gc/collector/garbage_collector.h" #include "gc/collector/gc_type.h" #include "gc/collector_type.h" #include "globals.h" @@ -317,6 +318,13 @@ class Heap { return discontinuous_spaces_; } + const collector::Iteration* GetCurrentGcIteration() const { + return ¤t_gc_iteration_; + } + collector::Iteration* GetCurrentGcIteration() { + return ¤t_gc_iteration_; + } + // Enable verification of object references when the runtime is sufficiently initialized. 
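// Illustrative sketch of the per-GC "Iteration" state the heap.cc/heap.h hunks start reading:
// the Heap owns one current_gc_iteration_ and the running collector records its freed counts,
// pause times and duration into it, so callers stop querying the collector object directly.
// The members below are inferred from the accessors used in the diff; this is not the real class.
#include <cstdint>
#include <vector>

class GcIterationSketch {
 public:
  uint64_t GetFreedObjects() const { return freed_objects_; }
  int64_t GetFreedBytes() const { return freed_bytes_; }
  uint64_t GetDurationNs() const { return duration_ns_; }
  const std::vector<uint64_t>& GetPauseTimes() const { return pause_times_; }
  bool GetClearSoftReferences() const { return clear_soft_references_; }
  void SetClearSoftReferences(bool clear) { clear_soft_references_ = clear; }

  // Called by the collector as the iteration runs.
  void RecordFree(uint64_t objects, int64_t bytes) {
    freed_objects_ += objects;
    freed_bytes_ += bytes;
  }
  void AddPause(uint64_t pause_ns) { pause_times_.push_back(pause_ns); }
  void SetDurationNs(uint64_t ns) { duration_ns_ = ns; }

 private:
  uint64_t freed_objects_ = 0;
  int64_t freed_bytes_ = 0;            // Large-object frees would be tracked separately.
  uint64_t duration_ns_ = 0;
  std::vector<uint64_t> pause_times_;  // One entry per stop-the-world pause in this iteration.
  bool clear_soft_references_ = false;
};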
void EnableObjectValidation() { verify_object_mode_ = kVerifyObjectSupport; @@ -690,7 +698,7 @@ class Heap { void SwapStacks(Thread* self); // Clear cards and update the mod union table. - void ProcessCards(TimingLogger& timings, bool use_rem_sets); + void ProcessCards(TimingLogger* timings, bool use_rem_sets); // Signal the heap trim daemon that there is something to do, either a heap transition or heap // trim. @@ -849,6 +857,9 @@ class Heap { // Data structure GC overhead. Atomic<size_t> gc_memory_overhead_; + // Info related to the current or previous GC iteration. + collector::Iteration current_gc_iteration_; + // Heap verification flags. const bool verify_missing_card_marks_; const bool verify_system_weaks_; diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc index 292781e6e7..e52bc1fd1e 100644 --- a/runtime/gc/reference_processor.cc +++ b/runtime/gc/reference_processor.cc @@ -110,6 +110,7 @@ void ReferenceProcessor::ProcessReferences(bool concurrent, TimingLogger* timing MarkObjectCallback* mark_object_callback, ProcessMarkStackCallback* process_mark_stack_callback, void* arg) { + TimingLogger::ScopedTiming t(concurrent ? __FUNCTION__ : "(Paused)ProcessReferences", timings); Thread* self = Thread::Current(); { MutexLock mu(self, lock_); @@ -118,10 +119,9 @@ void ReferenceProcessor::ProcessReferences(bool concurrent, TimingLogger* timing process_references_args_.arg_ = arg; CHECK_EQ(slow_path_enabled_, concurrent) << "Slow path must be enabled iff concurrent"; } - timings->StartSplit(concurrent ? "ProcessReferences" : "(Paused)ProcessReferences"); // Unless required to clear soft references with white references, preserve some white referents. if (!clear_soft_references) { - TimingLogger::ScopedSplit split(concurrent ? "ForwardSoftReferences" : + TimingLogger::ScopedTiming split(concurrent ? "ForwardSoftReferences" : "(Paused)ForwardSoftReferences", timings); if (concurrent) { StartPreservingReferences(self); @@ -138,7 +138,7 @@ void ReferenceProcessor::ProcessReferences(bool concurrent, TimingLogger* timing soft_reference_queue_.ClearWhiteReferences(&cleared_references_, is_marked_callback, arg); weak_reference_queue_.ClearWhiteReferences(&cleared_references_, is_marked_callback, arg); { - TimingLogger::ScopedSplit split(concurrent ? "EnqueueFinalizerReferences" : + TimingLogger::ScopedTiming t(concurrent ? "EnqueueFinalizerReferences" : "(Paused)EnqueueFinalizerReferences", timings); if (concurrent) { StartPreservingReferences(self); @@ -173,7 +173,6 @@ void ReferenceProcessor::ProcessReferences(bool concurrent, TimingLogger* timing DisableSlowPath(self); } } - timings->EndSplit(); } // Process the "referent" field in a java.lang.ref.Reference. 
If the referent has not yet been diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc index 54a63f065d..abae8ff346 100644 --- a/runtime/gc/space/large_object_space.cc +++ b/runtime/gc/space/large_object_space.cc @@ -411,28 +411,24 @@ void LargeObjectSpace::SweepCallback(size_t num_ptrs, mirror::Object** ptrs, voi bitmap->Clear(ptrs[i]); } } - context->freed_objects += num_ptrs; - context->freed_bytes += space->FreeList(self, num_ptrs, ptrs); + context->freed.objects += num_ptrs; + context->freed.bytes += space->FreeList(self, num_ptrs, ptrs); } -void LargeObjectSpace::Sweep(bool swap_bitmaps, size_t* out_freed_objects, - size_t* out_freed_bytes) { +collector::ObjectBytePair LargeObjectSpace::Sweep(bool swap_bitmaps) { if (Begin() >= End()) { - return; + return collector::ObjectBytePair(0, 0); } accounting::LargeObjectBitmap* live_bitmap = GetLiveBitmap(); accounting::LargeObjectBitmap* mark_bitmap = GetMarkBitmap(); if (swap_bitmaps) { std::swap(live_bitmap, mark_bitmap); } - DCHECK(out_freed_objects != nullptr); - DCHECK(out_freed_bytes != nullptr); - SweepCallbackContext scc(swap_bitmaps, this); + AllocSpace::SweepCallbackContext scc(swap_bitmaps, this); accounting::LargeObjectBitmap::SweepWalk(*live_bitmap, *mark_bitmap, reinterpret_cast<uintptr_t>(Begin()), reinterpret_cast<uintptr_t>(End()), SweepCallback, &scc); - *out_freed_objects += scc.freed_objects; - *out_freed_bytes += scc.freed_bytes; + return scc.freed; } } // namespace space diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h index a84b43a8a1..01982d06ab 100644 --- a/runtime/gc/space/large_object_space.h +++ b/runtime/gc/space/large_object_space.h @@ -73,7 +73,7 @@ class LargeObjectSpace : public DiscontinuousSpace, public AllocSpace { return this; } - void Sweep(bool swap_bitmaps, size_t* out_freed_objects, size_t* out_freed_bytes); + collector::ObjectBytePair Sweep(bool swap_bitmaps); virtual bool CanMoveObjects() const OVERRIDE { return false; diff --git a/runtime/gc/space/malloc_space.cc b/runtime/gc/space/malloc_space.cc index 57ed0bd35c..4d74f3c246 100644 --- a/runtime/gc/space/malloc_space.cc +++ b/runtime/gc/space/malloc_space.cc @@ -242,8 +242,8 @@ void MallocSpace::SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* ar // Use a bulk free, that merges consecutive objects before freeing or free per object? // Documentation suggests better free performance with merging, but this may be at the expensive // of allocation. - context->freed_objects += num_ptrs; - context->freed_bytes += space->FreeList(self, num_ptrs, ptrs); + context->freed.objects += num_ptrs; + context->freed.bytes += space->FreeList(self, num_ptrs, ptrs); } } // namespace space diff --git a/runtime/gc/space/space.cc b/runtime/gc/space/space.cc index 4e2841691e..bff28f6d19 100644 --- a/runtime/gc/space/space.cc +++ b/runtime/gc/space/space.cc @@ -81,14 +81,12 @@ DiscontinuousSpace::DiscontinuousSpace(const std::string& name, CHECK(mark_bitmap_.get() != nullptr); } -void ContinuousMemMapAllocSpace::Sweep(bool swap_bitmaps, size_t* freed_objects, size_t* freed_bytes) { - DCHECK(freed_objects != nullptr); - DCHECK(freed_bytes != nullptr); +collector::ObjectBytePair ContinuousMemMapAllocSpace::Sweep(bool swap_bitmaps) { accounting::ContinuousSpaceBitmap* live_bitmap = GetLiveBitmap(); accounting::ContinuousSpaceBitmap* mark_bitmap = GetMarkBitmap(); // If the bitmaps are bound then sweeping this space clearly won't do anything. 
if (live_bitmap == mark_bitmap) { - return; + return collector::ObjectBytePair(0, 0); } SweepCallbackContext scc(swap_bitmaps, this); if (swap_bitmaps) { @@ -98,8 +96,7 @@ void ContinuousMemMapAllocSpace::Sweep(bool swap_bitmaps, size_t* freed_objects, accounting::ContinuousSpaceBitmap::SweepWalk( *live_bitmap, *mark_bitmap, reinterpret_cast<uintptr_t>(Begin()), reinterpret_cast<uintptr_t>(End()), GetSweepCallback(), reinterpret_cast<void*>(&scc)); - *freed_objects += scc.freed_objects; - *freed_bytes += scc.freed_bytes; + return scc.freed; } // Returns the old mark bitmap. @@ -136,9 +133,8 @@ void ContinuousMemMapAllocSpace::SwapBitmaps() { mark_bitmap_->SetName(temp_name); } -Space::SweepCallbackContext::SweepCallbackContext(bool swap_bitmaps, space::Space* space) - : swap_bitmaps(swap_bitmaps), space(space), self(Thread::Current()), freed_objects(0), - freed_bytes(0) { +AllocSpace::SweepCallbackContext::SweepCallbackContext(bool swap_bitmaps, space::Space* space) + : swap_bitmaps(swap_bitmaps), space(space), self(Thread::Current()) { } } // namespace space diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h index 8415fa18ad..8444a70b9c 100644 --- a/runtime/gc/space/space.h +++ b/runtime/gc/space/space.h @@ -23,6 +23,7 @@ #include "base/macros.h" #include "base/mutex.h" #include "gc/accounting/space_bitmap.h" +#include "gc/collector/garbage_collector.h" #include "globals.h" #include "image.h" #include "mem_map.h" @@ -172,16 +173,6 @@ class Space { std::string name_; protected: - struct SweepCallbackContext { - public: - SweepCallbackContext(bool swap_bitmaps, space::Space* space); - const bool swap_bitmaps; - space::Space* const space; - Thread* const self; - size_t freed_objects; - size_t freed_bytes; - }; - // When should objects within this space be reclaimed? Not constant as we vary it in the case // of Zygote forking. 
GcRetentionPolicy gc_retention_policy_; @@ -232,6 +223,14 @@ class AllocSpace { virtual void RevokeAllThreadLocalBuffers() = 0; protected: + struct SweepCallbackContext { + SweepCallbackContext(bool swap_bitmaps, space::Space* space); + const bool swap_bitmaps; + space::Space* const space; + Thread* const self; + collector::ObjectBytePair freed; + }; + AllocSpace() {} virtual ~AllocSpace() {} @@ -415,7 +414,7 @@ class ContinuousMemMapAllocSpace : public MemMapSpace, public AllocSpace { return mark_bitmap_.get(); } - void Sweep(bool swap_bitmaps, size_t* freed_objects, size_t* freed_bytes); + collector::ObjectBytePair Sweep(bool swap_bitmaps); virtual accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() = 0; protected: diff --git a/runtime/jdwp/jdwp.h b/runtime/jdwp/jdwp.h index 1477324ca0..325b089a48 100644 --- a/runtime/jdwp/jdwp.h +++ b/runtime/jdwp/jdwp.h @@ -294,14 +294,14 @@ struct JdwpState { ObjectId threadId) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void CleanupMatchList(JdwpEvent** match_list, - int match_count) + size_t match_count) EXCLUSIVE_LOCKS_REQUIRED(event_list_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void EventFinish(ExpandBuf* pReq); void FindMatchingEvents(JdwpEventKind eventKind, - ModBasket* basket, + const ModBasket& basket, JdwpEvent** match_list, - int* pMatchCount) + size_t* pMatchCount) EXCLUSIVE_LOCKS_REQUIRED(event_list_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void UnregisterEvent(JdwpEvent* pEvent) diff --git a/runtime/jdwp/jdwp_event.cc b/runtime/jdwp/jdwp_event.cc index cb2c420dbb..86c84e8b0f 100644 --- a/runtime/jdwp/jdwp_event.cc +++ b/runtime/jdwp/jdwp_event.cc @@ -397,7 +397,7 @@ static JdwpEvent** AllocMatchList(size_t event_count) { * Run through the list and remove any entries with an expired "count" mod * from the event list, then free the match list. */ -void JdwpState::CleanupMatchList(JdwpEvent** match_list, int match_count) { +void JdwpState::CleanupMatchList(JdwpEvent** match_list, size_t match_count) { JdwpEvent** ppEvent = match_list; while (match_count--) { @@ -405,7 +405,8 @@ void JdwpState::CleanupMatchList(JdwpEvent** match_list, int match_count) { for (int i = 0; i < pEvent->modCount; i++) { if (pEvent->mods[i].modKind == MK_COUNT && pEvent->mods[i].count.count == 0) { - VLOG(jdwp) << "##### Removing expired event"; + VLOG(jdwp) << StringPrintf("##### Removing expired event (requestId=%#" PRIx32 ")", + pEvent->requestId); UnregisterEvent(pEvent); EventFree(pEvent); break; @@ -445,7 +446,7 @@ static bool PatternMatch(const char* pattern, const std::string& target) { * If we find a Count mod before rejecting an event, we decrement it. We * need to do this even if later mods cause us to ignore the event. 
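// Illustrative sketch of how the relocated SweepCallbackContext is used: the bitmap sweep-walk
// hands batches of dead pointers to a callback, which bulk-frees them and accumulates the result
// into context->freed (an ObjectBytePair) instead of two separate counters. The walk, thread
// argument and object size below are simplified stand-ins, not the real space API.
#include <cstddef>
#include <cstdint>

struct ObjSketch;  // Opaque heap object for the sketch.

struct SweepContextSketch {
  struct { uint64_t objects = 0; int64_t bytes = 0; } freed;
};

// Stand-in for Space::FreeList(): frees a batch and reports the bytes released.
static int64_t FreeListSketch(size_t num_ptrs, ObjSketch** /*ptrs*/) {
  return static_cast<int64_t>(num_ptrs) * 48;  // Assumed average object size for the sketch.
}

// Shape of MallocSpace/LargeObjectSpace::SweepCallback after the change.
static void SweepCallbackSketch(size_t num_ptrs, ObjSketch** ptrs, void* arg) {
  SweepContextSketch* context = static_cast<SweepContextSketch*>(arg);
  context->freed.objects += num_ptrs;
  context->freed.bytes += FreeListSketch(num_ptrs, ptrs);
}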
*/ -static bool ModsMatch(JdwpEvent* pEvent, ModBasket* basket) +static bool ModsMatch(JdwpEvent* pEvent, const ModBasket& basket) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { JdwpEventMod* pMod = pEvent->mods; @@ -462,53 +463,53 @@ static bool ModsMatch(JdwpEvent* pEvent, ModBasket* basket) CHECK(false); // should not be getting these break; case MK_THREAD_ONLY: - if (pMod->threadOnly.threadId != basket->threadId) { + if (pMod->threadOnly.threadId != basket.threadId) { return false; } break; case MK_CLASS_ONLY: - if (!Dbg::MatchType(basket->classId, pMod->classOnly.refTypeId)) { + if (!Dbg::MatchType(basket.classId, pMod->classOnly.refTypeId)) { return false; } break; case MK_CLASS_MATCH: - if (!PatternMatch(pMod->classMatch.classPattern, basket->className)) { + if (!PatternMatch(pMod->classMatch.classPattern, basket.className)) { return false; } break; case MK_CLASS_EXCLUDE: - if (PatternMatch(pMod->classMatch.classPattern, basket->className)) { + if (PatternMatch(pMod->classMatch.classPattern, basket.className)) { return false; } break; case MK_LOCATION_ONLY: - if (pMod->locationOnly.loc != *basket->pLoc) { + if (pMod->locationOnly.loc != *basket.pLoc) { return false; } break; case MK_EXCEPTION_ONLY: - if (pMod->exceptionOnly.refTypeId != 0 && !Dbg::MatchType(basket->excepClassId, pMod->exceptionOnly.refTypeId)) { + if (pMod->exceptionOnly.refTypeId != 0 && !Dbg::MatchType(basket.excepClassId, pMod->exceptionOnly.refTypeId)) { return false; } - if ((basket->caught && !pMod->exceptionOnly.caught) || (!basket->caught && !pMod->exceptionOnly.uncaught)) { + if ((basket.caught && !pMod->exceptionOnly.caught) || (!basket.caught && !pMod->exceptionOnly.uncaught)) { return false; } break; case MK_FIELD_ONLY: - if (pMod->fieldOnly.fieldId != basket->fieldId) { + if (pMod->fieldOnly.fieldId != basket.fieldId) { return false; } - if (!Dbg::MatchType(basket->fieldTypeID, pMod->fieldOnly.refTypeId)) { + if (!Dbg::MatchType(basket.fieldTypeID, pMod->fieldOnly.refTypeId)) { return false; } break; case MK_STEP: - if (pMod->step.threadId != basket->threadId) { + if (pMod->step.threadId != basket.threadId) { return false; } break; case MK_INSTANCE_ONLY: - if (pMod->instanceOnly.objectId != basket->thisPtr) { + if (pMod->instanceOnly.objectId != basket.thisPtr) { return false; } break; @@ -530,19 +531,16 @@ static bool ModsMatch(JdwpEvent* pEvent, ModBasket* basket) * DO NOT call this multiple times for the same eventKind, as Count mods are * decremented during the scan. 
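The ModsMatch hunk above converts every basket access from pointer to const reference. A small sketch of the pattern with illustrative types, not ART's:

    // Sketch: a matcher taking its read-only input by const reference rather than
    // by pointer, mirroring the ModsMatch/FindMatchingEvents signature change.
    #include <cstdint>
    #include <string>

    struct Basket {                    // Illustrative stand-in for JDWP's ModBasket.
      uint64_t thread_id = 0;
      std::string class_name;
    };

    // Before: bool Matches(uint64_t wanted_thread, Basket* basket) -- callers wrote
    // &basket, and nothing ruled out a null pointer or an unintended write.
    bool Matches(uint64_t wanted_thread, const Basket& basket) {
      return basket.thread_id == wanted_thread;   // Read-only access, never null.
    }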
*/ -void JdwpState::FindMatchingEvents(JdwpEventKind eventKind, ModBasket* basket, - JdwpEvent** match_list, int* pMatchCount) { +void JdwpState::FindMatchingEvents(JdwpEventKind eventKind, const ModBasket& basket, + JdwpEvent** match_list, size_t* pMatchCount) { /* start after the existing entries */ match_list += *pMatchCount; - JdwpEvent* pEvent = event_list_; - while (pEvent != NULL) { + for (JdwpEvent* pEvent = event_list_; pEvent != nullptr; pEvent = pEvent->next) { if (pEvent->eventKind == eventKind && ModsMatch(pEvent, basket)) { *match_list++ = pEvent; (*pMatchCount)++; } - - pEvent = pEvent->next; } } @@ -774,6 +772,22 @@ bool JdwpState::PostVMStart() { return true; } +static void LogMatchingEventsAndThread(JdwpEvent** match_list, size_t match_count, + const ModBasket& basket) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + for (size_t i = 0; i < match_count; ++i) { + JdwpEvent* pEvent = match_list[i]; + VLOG(jdwp) << "EVENT #" << i << ": " << pEvent->eventKind + << StringPrintf(" (requestId=%#" PRIx32 ")", pEvent->requestId); + } + std::string thread_name; + JdwpError error = Dbg::GetThreadName(basket.threadId, thread_name); + if (error != JDWP::ERR_NONE) { + thread_name = "<unknown>"; + } + VLOG(jdwp) << StringPrintf(" thread=%#" PRIx64, basket.threadId) << " " << thread_name; +} + /* * A location of interest has been reached. This handles: * Breakpoint @@ -829,39 +843,40 @@ bool JdwpState::PostLocationEvent(const JdwpLocation* pLoc, ObjectId thisPtr, in return false; } - int match_count = 0; + size_t match_count = 0; ExpandBuf* pReq = NULL; JdwpSuspendPolicy suspend_policy = SP_NONE; { MutexLock mu(Thread::Current(), event_list_lock_); JdwpEvent** match_list = AllocMatchList(event_list_size_); if ((eventFlags & Dbg::kBreakpoint) != 0) { - FindMatchingEvents(EK_BREAKPOINT, &basket, match_list, &match_count); + FindMatchingEvents(EK_BREAKPOINT, basket, match_list, &match_count); } if ((eventFlags & Dbg::kSingleStep) != 0) { - FindMatchingEvents(EK_SINGLE_STEP, &basket, match_list, &match_count); + FindMatchingEvents(EK_SINGLE_STEP, basket, match_list, &match_count); } if ((eventFlags & Dbg::kMethodEntry) != 0) { - FindMatchingEvents(EK_METHOD_ENTRY, &basket, match_list, &match_count); + FindMatchingEvents(EK_METHOD_ENTRY, basket, match_list, &match_count); } if ((eventFlags & Dbg::kMethodExit) != 0) { - FindMatchingEvents(EK_METHOD_EXIT, &basket, match_list, &match_count); - FindMatchingEvents(EK_METHOD_EXIT_WITH_RETURN_VALUE, &basket, match_list, &match_count); + FindMatchingEvents(EK_METHOD_EXIT, basket, match_list, &match_count); + FindMatchingEvents(EK_METHOD_EXIT_WITH_RETURN_VALUE, basket, match_list, &match_count); } if (match_count != 0) { - VLOG(jdwp) << "EVENT: " << match_list[0]->eventKind << "(" << match_count << " total) " - << basket.className << "." 
<< Dbg::GetMethodName(pLoc->method_id) - << StringPrintf(" thread=%#" PRIx64 " dex_pc=%#" PRIx64 ")", - basket.threadId, pLoc->dex_pc); - suspend_policy = scanSuspendPolicy(match_list, match_count); - VLOG(jdwp) << " suspend_policy=" << suspend_policy; + + if (VLOG_IS_ON(jdwp)) { + LogMatchingEventsAndThread(match_list, match_count, basket); + VLOG(jdwp) << " location=" << *pLoc; + VLOG(jdwp) << StringPrintf(" this=%#" PRIx64, basket.thisPtr); + VLOG(jdwp) << " suspend_policy=" << suspend_policy; + } pReq = eventPrep(); expandBufAdd1(pReq, suspend_policy); expandBufAdd4BE(pReq, match_count); - for (int i = 0; i < match_count; i++) { + for (size_t i = 0; i < match_count; i++) { expandBufAdd1(pReq, match_list[i]->eventKind); expandBufAdd4BE(pReq, match_list[i]->requestId); expandBufAdd8BE(pReq, basket.threadId); @@ -892,6 +907,8 @@ bool JdwpState::PostFieldEvent(const JdwpLocation* pLoc, RefTypeId typeId, Field basket.fieldTypeID = typeId; basket.fieldId = fieldId; + DCHECK_EQ(fieldValue != nullptr, is_modification); + if (InvokeInProgress()) { VLOG(jdwp) << "Not posting field event during invoke"; return false; @@ -912,7 +929,7 @@ bool JdwpState::PostFieldEvent(const JdwpLocation* pLoc, RefTypeId typeId, Field return false; } - int match_count = 0; + size_t match_count = 0; ExpandBuf* pReq = NULL; JdwpSuspendPolicy suspend_policy = SP_NONE; { @@ -920,24 +937,29 @@ bool JdwpState::PostFieldEvent(const JdwpLocation* pLoc, RefTypeId typeId, Field JdwpEvent** match_list = AllocMatchList(event_list_size_); if (is_modification) { - FindMatchingEvents(EK_FIELD_MODIFICATION, &basket, match_list, &match_count); + FindMatchingEvents(EK_FIELD_MODIFICATION, basket, match_list, &match_count); } else { - FindMatchingEvents(EK_FIELD_ACCESS, &basket, match_list, &match_count); + FindMatchingEvents(EK_FIELD_ACCESS, basket, match_list, &match_count); } if (match_count != 0) { - VLOG(jdwp) << "EVENT: " << match_list[0]->eventKind << "(" << match_count << " total) " - << basket.className << "." << Dbg::GetMethodName(pLoc->method_id) - << StringPrintf(" thread=%#" PRIx64 " dex_pc=%#" PRIx64 ")", - basket.threadId, pLoc->dex_pc); - suspend_policy = scanSuspendPolicy(match_list, match_count); - VLOG(jdwp) << " suspend_policy=" << suspend_policy; + + if (VLOG_IS_ON(jdwp)) { + LogMatchingEventsAndThread(match_list, match_count, basket); + VLOG(jdwp) << " location=" << *pLoc; + VLOG(jdwp) << StringPrintf(" this=%#" PRIx64, basket.thisPtr); + VLOG(jdwp) << StringPrintf(" type=%#" PRIx64, basket.fieldTypeID) << " " + << Dbg::GetClassName(basket.fieldTypeID); + VLOG(jdwp) << StringPrintf(" field=%#" PRIx32, basket.fieldId) << " " + << Dbg::GetFieldName(basket.fieldId); + VLOG(jdwp) << " suspend_policy=" << suspend_policy; + } pReq = eventPrep(); expandBufAdd1(pReq, suspend_policy); expandBufAdd4BE(pReq, match_count); - for (int i = 0; i < match_count; i++) { + for (size_t i = 0; i < match_count; i++) { expandBufAdd1(pReq, match_list[i]->eventKind); expandBufAdd4BE(pReq, match_list[i]->requestId); expandBufAdd8BE(pReq, basket.threadId); @@ -984,30 +1006,31 @@ bool JdwpState::PostThreadChange(ObjectId threadId, bool start) { ExpandBuf* pReq = NULL; JdwpSuspendPolicy suspend_policy = SP_NONE; - int match_count = 0; + size_t match_count = 0; { // Don't allow the list to be updated while we scan it. 
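The PostLocationEvent/PostFieldEvent hunks above move the detailed event logging under if (VLOG_IS_ON(jdwp)), so class, field, and thread names are only looked up when verbose JDWP logging is enabled. A sketch of the general pattern, with a hypothetical flag standing in for ART's VLOG machinery:

    // Sketch: build multi-line diagnostics only when verbose logging is enabled.
    #include <cstdint>
    #include <iostream>
    #include <sstream>
    #include <string>

    static bool g_verbose_jdwp = false;   // Hypothetical stand-in for VLOG_IS_ON(jdwp).

    std::string DescribeEventExpensively(uint32_t request_id) {
      std::ostringstream os;              // Imagine class/field/thread name lookups here.
      os << "event requestId=0x" << std::hex << request_id;
      return os.str();
    }

    void PostEvent(uint32_t request_id) {
      if (g_verbose_jdwp) {               // Skip all of the string building when logging is off.
        std::cerr << DescribeEventExpensively(request_id) << '\n';
      }
      // ... assemble and send the event reply regardless of logging ...
    }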
MutexLock mu(Thread::Current(), event_list_lock_); JdwpEvent** match_list = AllocMatchList(event_list_size_); if (start) { - FindMatchingEvents(EK_THREAD_START, &basket, match_list, &match_count); + FindMatchingEvents(EK_THREAD_START, basket, match_list, &match_count); } else { - FindMatchingEvents(EK_THREAD_DEATH, &basket, match_list, &match_count); + FindMatchingEvents(EK_THREAD_DEATH, basket, match_list, &match_count); } if (match_count != 0) { - VLOG(jdwp) << "EVENT: " << match_list[0]->eventKind << "(" << match_count << " total) " - << StringPrintf("thread=%#" PRIx64, basket.threadId) << ")"; - suspend_policy = scanSuspendPolicy(match_list, match_count); - VLOG(jdwp) << " suspend_policy=" << suspend_policy; + + if (VLOG_IS_ON(jdwp)) { + LogMatchingEventsAndThread(match_list, match_count, basket); + VLOG(jdwp) << " suspend_policy=" << suspend_policy; + } pReq = eventPrep(); expandBufAdd1(pReq, suspend_policy); expandBufAdd4BE(pReq, match_count); - for (int i = 0; i < match_count; i++) { + for (size_t i = 0; i < match_count; i++) { expandBufAdd1(pReq, match_list[i]->eventKind); expandBufAdd4BE(pReq, match_list[i]->requestId); expandBufAdd8BE(pReq, basket.threadId); @@ -1072,33 +1095,35 @@ bool JdwpState::PostException(const JdwpLocation* pThrowLoc, return false; } - int match_count = 0; + size_t match_count = 0; ExpandBuf* pReq = NULL; JdwpSuspendPolicy suspend_policy = SP_NONE; { MutexLock mu(Thread::Current(), event_list_lock_); JdwpEvent** match_list = AllocMatchList(event_list_size_); - FindMatchingEvents(EK_EXCEPTION, &basket, match_list, &match_count); + FindMatchingEvents(EK_EXCEPTION, basket, match_list, &match_count); if (match_count != 0) { - VLOG(jdwp) << "EVENT: " << match_list[0]->eventKind << "(" << match_count << " total)" - << StringPrintf(" thread=%#" PRIx64, basket.threadId) - << StringPrintf(" exceptId=%#" PRIx64, exceptionId) - << " caught=" << basket.caught << ")" - << " throw: " << *pThrowLoc; - if (pCatchLoc->class_id == 0) { - VLOG(jdwp) << " catch: (not caught)"; - } else { - VLOG(jdwp) << " catch: " << *pCatchLoc; - } - suspend_policy = scanSuspendPolicy(match_list, match_count); - VLOG(jdwp) << " suspend_policy=" << suspend_policy; + + if (VLOG_IS_ON(jdwp)) { + LogMatchingEventsAndThread(match_list, match_count, basket); + VLOG(jdwp) << " throwLocation=" << *pThrowLoc; + if (pCatchLoc->class_id == 0) { + VLOG(jdwp) << " catchLocation=uncaught"; + } else { + VLOG(jdwp) << " catchLocation=" << *pCatchLoc; + } + VLOG(jdwp) << StringPrintf(" this=%#" PRIx64, basket.thisPtr); + VLOG(jdwp) << StringPrintf(" exceptionClass=%#" PRIx64, basket.excepClassId) << " " + << Dbg::GetClassName(basket.excepClassId); + VLOG(jdwp) << " suspend_policy=" << suspend_policy; + } pReq = eventPrep(); expandBufAdd1(pReq, suspend_policy); expandBufAdd4BE(pReq, match_count); - for (int i = 0; i < match_count; i++) { + for (size_t i = 0; i < match_count; i++) { expandBufAdd1(pReq, match_list[i]->eventKind); expandBufAdd4BE(pReq, match_list[i]->requestId); expandBufAdd8BE(pReq, basket.threadId); @@ -1142,17 +1167,19 @@ bool JdwpState::PostClassPrepare(JdwpTypeTag tag, RefTypeId refTypeId, const std ExpandBuf* pReq = NULL; JdwpSuspendPolicy suspend_policy = SP_NONE; - int match_count = 0; + size_t match_count = 0; { MutexLock mu(Thread::Current(), event_list_lock_); JdwpEvent** match_list = AllocMatchList(event_list_size_); - FindMatchingEvents(EK_CLASS_PREPARE, &basket, match_list, &match_count); + FindMatchingEvents(EK_CLASS_PREPARE, basket, match_list, &match_count); if (match_count != 
0) { - VLOG(jdwp) << "EVENT: " << match_list[0]->eventKind << "(" << match_count << " total) " - << StringPrintf("thread=%#" PRIx64, basket.threadId) << ") " << signature; - suspend_policy = scanSuspendPolicy(match_list, match_count); - VLOG(jdwp) << " suspend_policy=" << suspend_policy; + + if (VLOG_IS_ON(jdwp)) { + LogMatchingEventsAndThread(match_list, match_count, basket); + VLOG(jdwp) << StringPrintf(" type=%#" PRIx64, basket.classId)<< " " << signature; + VLOG(jdwp) << " suspend_policy=" << suspend_policy; + } if (basket.threadId == debug_thread_id_) { /* @@ -1171,7 +1198,7 @@ bool JdwpState::PostClassPrepare(JdwpTypeTag tag, RefTypeId refTypeId, const std expandBufAdd1(pReq, suspend_policy); expandBufAdd4BE(pReq, match_count); - for (int i = 0; i < match_count; i++) { + for (size_t i = 0; i < match_count; i++) { expandBufAdd1(pReq, match_list[i]->eventKind); expandBufAdd4BE(pReq, match_list[i]->requestId); expandBufAdd8BE(pReq, basket.threadId); diff --git a/runtime/mem_map.cc b/runtime/mem_map.cc index 81a86235ec..8d987dfd9a 100644 --- a/runtime/mem_map.cc +++ b/runtime/mem_map.cc @@ -34,6 +34,13 @@ #ifdef USE_ASHMEM #include <cutils/ashmem.h> +#ifndef ANDROID_OS +#include <sys/resource.h> +#endif +#endif + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON #endif namespace art { @@ -179,20 +186,32 @@ MemMap* MemMap::MapAnonymous(const char* name, byte* expected, size_t byte_count } size_t page_aligned_byte_count = RoundUp(byte_count, kPageSize); + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + ScopedFd fd(-1); + #ifdef USE_ASHMEM - // android_os_Debug.cpp read_mapinfo assumes all ashmem regions associated with the VM are - // prefixed "dalvik-". - std::string debug_friendly_name("dalvik-"); - debug_friendly_name += name; - ScopedFd fd(ashmem_create_region(debug_friendly_name.c_str(), page_aligned_byte_count)); - if (fd.get() == -1) { - *error_msg = StringPrintf("ashmem_create_region failed for '%s': %s", name, strerror(errno)); - return nullptr; - } - int flags = MAP_PRIVATE; +#ifdef HAVE_ANDROID_OS + const bool use_ashmem = true; #else - ScopedFd fd(-1); - int flags = MAP_PRIVATE | MAP_ANONYMOUS; + // When not on Android ashmem is faked using files in /tmp. Ensure that such files won't + // fail due to ulimit restrictions. If they will then use a regular mmap. + struct rlimit rlimit_fsize; + CHECK_EQ(getrlimit(RLIMIT_FSIZE, &rlimit_fsize), 0); + const bool use_ashmem = (rlimit_fsize.rlim_cur == RLIM_INFINITY) || + (page_aligned_byte_count < rlimit_fsize.rlim_cur); +#endif + if (use_ashmem) { + // android_os_Debug.cpp read_mapinfo assumes all ashmem regions associated with the VM are + // prefixed "dalvik-". 
+ std::string debug_friendly_name("dalvik-"); + debug_friendly_name += name; + fd.reset(ashmem_create_region(debug_friendly_name.c_str(), page_aligned_byte_count)); + if (fd.get() == -1) { + *error_msg = StringPrintf("ashmem_create_region failed for '%s': %s", name, strerror(errno)); + return nullptr; + } + flags = MAP_PRIVATE; + } #endif // We need to store and potentially set an error number for pretty printing of errors diff --git a/runtime/monitor.cc b/runtime/monitor.cc index a19445b189..999a9e504b 100644 --- a/runtime/monitor.cc +++ b/runtime/monitor.cc @@ -1115,20 +1115,29 @@ void MonitorList::SweepMonitorList(IsMarkedCallback* callback, void* arg) { } } +struct MonitorDeflateArgs { + MonitorDeflateArgs() : self(Thread::Current()), deflate_count(0) {} + Thread* const self; + size_t deflate_count; +}; + static mirror::Object* MonitorDeflateCallback(mirror::Object* object, void* arg) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - if (Monitor::Deflate(reinterpret_cast<Thread*>(arg), object)) { + MonitorDeflateArgs* args = reinterpret_cast<MonitorDeflateArgs*>(arg); + if (Monitor::Deflate(args->self, object)) { DCHECK_NE(object->GetLockWord(true).GetState(), LockWord::kFatLocked); + ++args->deflate_count; // If we deflated, return nullptr so that the monitor gets removed from the array. return nullptr; } return object; // Monitor was not deflated. } -void MonitorList::DeflateMonitors() { - Thread* self = Thread::Current(); - Locks::mutator_lock_->AssertExclusiveHeld(self); - SweepMonitorList(MonitorDeflateCallback, reinterpret_cast<Thread*>(self)); +size_t MonitorList::DeflateMonitors() { + MonitorDeflateArgs args; + Locks::mutator_lock_->AssertExclusiveHeld(args.self); + SweepMonitorList(MonitorDeflateCallback, &args); + return args.deflate_count; } MonitorInfo::MonitorInfo(mirror::Object* obj) : owner_(NULL), entry_count_(0) { diff --git a/runtime/monitor.h b/runtime/monitor.h index a28823d184..d7552a3fd5 100644 --- a/runtime/monitor.h +++ b/runtime/monitor.h @@ -229,7 +229,8 @@ class MonitorList { LOCKS_EXCLUDED(monitor_list_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); void DisallowNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_); void AllowNewMonitors() LOCKS_EXCLUDED(monitor_list_lock_); - void DeflateMonitors() LOCKS_EXCLUDED(monitor_list_lock_) + // Returns how many monitors were deflated. 
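The MonitorDeflateCallback change above threads a small MonitorDeflateArgs struct through the void* callback argument so the callback can count deflations as well as reach the current Thread. A generic sketch of that pattern (element types and names are illustrative, not ART's):

    // Sketch: pass a context struct through a void* callback argument so the
    // callback can read inputs and accumulate a result.
    #include <cstddef>
    #include <vector>

    struct DeflateContext {
      int owner_id = 0;          // Input the callback needs (stands in for Thread*).
      size_t deflate_count = 0;  // Output accumulated across calls.
    };

    // Callback shape fixed by the sweeping code: element in, element (or null) out.
    static int* DeflateCallback(int* element, void* arg) {
      DeflateContext* ctx = static_cast<DeflateContext*>(arg);
      if (*element % 2 == 0) {   // Pretend even elements can be "deflated".
        ++ctx->deflate_count;
        return nullptr;          // Null tells the caller to drop the entry.
      }
      return element;            // Unchanged entries stay in the list.
    }

    size_t DeflateAll(std::vector<int*>& list) {
      DeflateContext ctx;
      for (int*& element : list) {
        element = DeflateCallback(element, &ctx);
      }
      return ctx.deflate_count;  // The caller can now report how many were deflated.
    }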
+ size_t DeflateMonitors() LOCKS_EXCLUDED(monitor_list_lock_) EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_); private: diff --git a/runtime/utils.cc b/runtime/utils.cc index f60f795e18..e5b8b224df 100644 --- a/runtime/utils.cc +++ b/runtime/utils.cc @@ -468,11 +468,12 @@ std::string PrettySize(int64_t byte_count) { negative_str, byte_count / kBytesPerUnit[i], kUnitStrings[i]); } -std::string PrettyDuration(uint64_t nano_duration) { +std::string PrettyDuration(uint64_t nano_duration, size_t max_fraction_digits) { if (nano_duration == 0) { return "0"; } else { - return FormatDuration(nano_duration, GetAppropriateTimeUnit(nano_duration)); + return FormatDuration(nano_duration, GetAppropriateTimeUnit(nano_duration), + max_fraction_digits); } } @@ -509,45 +510,41 @@ uint64_t GetNsToTimeUnitDivisor(TimeUnit time_unit) { return 0; } -std::string FormatDuration(uint64_t nano_duration, TimeUnit time_unit) { - const char* unit = NULL; +std::string FormatDuration(uint64_t nano_duration, TimeUnit time_unit, + size_t max_fraction_digits) { + const char* unit = nullptr; uint64_t divisor = GetNsToTimeUnitDivisor(time_unit); - uint32_t zero_fill = 1; switch (time_unit) { case kTimeUnitSecond: unit = "s"; - zero_fill = 9; break; case kTimeUnitMillisecond: unit = "ms"; - zero_fill = 6; break; case kTimeUnitMicrosecond: unit = "us"; - zero_fill = 3; break; case kTimeUnitNanosecond: unit = "ns"; - zero_fill = 0; break; } - - uint64_t whole_part = nano_duration / divisor; + const uint64_t whole_part = nano_duration / divisor; uint64_t fractional_part = nano_duration % divisor; if (fractional_part == 0) { return StringPrintf("%" PRIu64 "%s", whole_part, unit); } else { - while ((fractional_part % 1000) == 0) { - zero_fill -= 3; - fractional_part /= 1000; - } - if (zero_fill == 3) { - return StringPrintf("%" PRIu64 ".%03" PRIu64 "%s", whole_part, fractional_part, unit); - } else if (zero_fill == 6) { - return StringPrintf("%" PRIu64 ".%06" PRIu64 "%s", whole_part, fractional_part, unit); - } else { - return StringPrintf("%" PRIu64 ".%09" PRIu64 "%s", whole_part, fractional_part, unit); + static constexpr size_t kMaxDigits = 30; + char fraction_buffer[kMaxDigits]; + char* ptr = fraction_buffer; + uint64_t multiplier = 10; + // This infinite loops if fractional part is 0. + while (fractional_part * multiplier < divisor) { + multiplier *= 10; + *ptr++ = '0'; } + sprintf(ptr, "%" PRIu64, fractional_part); + fraction_buffer[std::min(kMaxDigits - 1, max_fraction_digits)] = '\0'; + return StringPrintf("%" PRIu64 ".%s%s", whole_part, fraction_buffer, unit); } } diff --git a/runtime/utils.h b/runtime/utils.h index 6d52459ec8..a61d30fb43 100644 --- a/runtime/utils.h +++ b/runtime/utils.h @@ -265,10 +265,11 @@ std::string PrettySize(int64_t size_in_bytes); // Returns a human-readable time string which prints every nanosecond while trying to limit the // number of trailing zeros. Prints using the largest human readable unit up to a second. // e.g. "1ms", "1.000000001s", "1.001us" -std::string PrettyDuration(uint64_t nano_duration); +std::string PrettyDuration(uint64_t nano_duration, size_t max_fraction_digits = 3); // Format a nanosecond time to specified units. -std::string FormatDuration(uint64_t nano_duration, TimeUnit time_unit); +std::string FormatDuration(uint64_t nano_duration, TimeUnit time_unit, + size_t max_fraction_digits); // Get the appropriate unit for a nanosecond duration. 
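The FormatDuration rewrite above replaces the fixed zero_fill widths with a computed run of leading zeros followed by truncation to max_fraction_digits. A standalone sketch of the same idea (simplified, not the exact ART implementation):

    // Sketch: format "whole.fraction<unit>" keeping the fraction's leading zeros and
    // truncating it to at most max_fraction_digits.
    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <string>

    std::string FormatWithFraction(uint64_t nanos, uint64_t ns_per_unit,
                                   const char* unit, size_t max_fraction_digits) {
      const uint64_t whole = nanos / ns_per_unit;
      const uint64_t frac = nanos % ns_per_unit;
      if (frac == 0 || max_fraction_digits == 0) {
        return std::to_string(whole) + unit;      // Avoids looping on a zero fraction.
      }
      std::string fraction;
      // Count the fraction's leading zeros relative to the divisor, e.g. for
      // 1s + 1us: frac = 1000 ns, divisor = 1e9, so the full fraction is "000001000".
      for (uint64_t scaled = frac * 10; scaled < ns_per_unit; scaled *= 10) {
        fraction += '0';
      }
      fraction += std::to_string(frac);
      fraction.resize(std::min(fraction.size(), max_fraction_digits));  // Truncate, no rounding.
      return std::to_string(whole) + "." + fraction + unit;
    }

    // FormatWithFraction(1000001000, 1000000000, "s", 6) -> "1.000001s"
    // FormatWithFraction(1001000, 1000000, "ms", 3)      -> "1.001ms"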
TimeUnit GetAppropriateTimeUnit(uint64_t nano_duration); diff --git a/runtime/utils_test.cc b/runtime/utils_test.cc index 4a1e477bbd..7cd5980c44 100644 --- a/runtime/utils_test.cc +++ b/runtime/utils_test.cc @@ -171,14 +171,15 @@ TEST_F(UtilsTest, PrettyDuration) { EXPECT_EQ("10s", PrettyDuration(10 * one_sec)); EXPECT_EQ("100s", PrettyDuration(100 * one_sec)); EXPECT_EQ("1.001s", PrettyDuration(1 * one_sec + one_ms)); - EXPECT_EQ("1.000001s", PrettyDuration(1 * one_sec + one_us)); - EXPECT_EQ("1.000000001s", PrettyDuration(1 * one_sec + 1)); + EXPECT_EQ("1.000001s", PrettyDuration(1 * one_sec + one_us, 6)); + EXPECT_EQ("1.000000001s", PrettyDuration(1 * one_sec + 1, 9)); + EXPECT_EQ("1.000s", PrettyDuration(1 * one_sec + one_us, 3)); EXPECT_EQ("1ms", PrettyDuration(1 * one_ms)); EXPECT_EQ("10ms", PrettyDuration(10 * one_ms)); EXPECT_EQ("100ms", PrettyDuration(100 * one_ms)); EXPECT_EQ("1.001ms", PrettyDuration(1 * one_ms + one_us)); - EXPECT_EQ("1.000001ms", PrettyDuration(1 * one_ms + 1)); + EXPECT_EQ("1.000001ms", PrettyDuration(1 * one_ms + 1, 6)); EXPECT_EQ("1us", PrettyDuration(1 * one_us)); EXPECT_EQ("10us", PrettyDuration(10 * one_us)); diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc index f8e75ea850..89cfcdd1de 100644 --- a/runtime/verifier/method_verifier.cc +++ b/runtime/verifier/method_verifier.cc @@ -717,13 +717,28 @@ bool MethodVerifier::VerifyInstruction(const Instruction* inst, uint32_t code_of case Instruction::kVerifySwitchTargets: result = result && CheckSwitchTargets(code_offset); break; + case Instruction::kVerifyVarArgNonZero: + // Fall-through. case Instruction::kVerifyVarArg: { + if (inst->GetVerifyExtraFlags() == Instruction::kVerifyVarArgNonZero && inst->VRegA() <= 0) { + Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid arg count (" << inst->VRegA() << ") in " + "non-range invoke"; + return false; + } uint32_t args[Instruction::kMaxVarArgRegs]; inst->GetVarArgs(args); result = result && CheckVarArgRegs(inst->VRegA(), args); break; } + case Instruction::kVerifyVarArgRangeNonZero: + // Fall-through. 
case Instruction::kVerifyVarArgRange: + if (inst->GetVerifyExtraFlags() == Instruction::kVerifyVarArgRangeNonZero && + inst->VRegA() <= 0) { + Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invalid arg count (" << inst->VRegA() << ") in " + "range invoke"; + return false; + } result = result && CheckVarArgRangeRegs(inst->VRegA(), inst->VRegC()); break; case Instruction::kVerifyError: diff --git a/test/003-omnibus-opcodes/build b/test/003-omnibus-opcodes/build index 9dff837ab6..f909fb2219 100644 --- a/test/003-omnibus-opcodes/build +++ b/test/003-omnibus-opcodes/build @@ -22,5 +22,5 @@ ${JAVAC} -d classes `find src -name '*.java'` rm classes/UnresClass.class ${JAVAC} -d classes `find src2 -name '*.java'` -${DX} -JXmx256m --debug --dex --dump-to=classes.lst --output=classes.dex classes +${DX} -JXmx256m --debug --dex --output=classes.dex classes zip $TEST_NAME.jar classes.dex diff --git a/test/056-const-string-jumbo/build b/test/056-const-string-jumbo/build index a12c9d32b5..ef286d140e 100644 --- a/test/056-const-string-jumbo/build +++ b/test/056-const-string-jumbo/build @@ -42,5 +42,5 @@ function writeFile(name, start, end) { mkdir classes ${JAVAC} -d classes src/*.java -${DX} -JXmx500m --debug --dex --no-optimize --positions=none --no-locals --dump-to=classes.lst --output=classes.dex classes +${DX} -JXmx500m --debug --dex --no-optimize --positions=none --no-locals --output=classes.dex classes zip $TEST_NAME.jar classes.dex diff --git a/test/302-float-conversion/expected.txt b/test/302-float-conversion/expected.txt index 7d5c1eba62..04230761af 100644 --- a/test/302-float-conversion/expected.txt +++ b/test/302-float-conversion/expected.txt @@ -1,2 +1,3 @@ Iteration Result is as expected inter4:2.0 +max_long:9223372036854775807 diff --git a/test/302-float-conversion/src/Main.java b/test/302-float-conversion/src/Main.java index afc5e976d9..27331353f0 100644 --- a/test/302-float-conversion/src/Main.java +++ b/test/302-float-conversion/src/Main.java @@ -21,6 +21,7 @@ public class Main { public static void main(String args[]) { test1(); test2(); + test3(); } public static void test1() { @@ -55,4 +56,9 @@ public class Main { System.out.println("inter4:" + inter4); } + public static void test3() { + double d = Long.MAX_VALUE; + System.out.println("max_long:" + (long)d); + } + } diff --git a/test/303-verification-stress/build b/test/303-verification-stress/build index 2ef9beafd1..c1935d2e9b 100644 --- a/test/303-verification-stress/build +++ b/test/303-verification-stress/build @@ -24,5 +24,5 @@ gcc -o classes-gen classes-gen.c mkdir classes ${JAVAC} -d classes src/*.java -${DX} --debug --dex --dump-to=classes.lst --output=classes.dex classes +${DX} --debug --dex --output=classes.dex classes zip $TEST_NAME.jar classes.dex diff --git a/test/run-test b/test/run-test index 34b06cc3a1..d1c5bb2360 100755 --- a/test/run-test +++ b/test/run-test @@ -298,6 +298,17 @@ chmod 755 "$run" export TEST_NAME=`basename ${test_dir}` +# To cause tests to fail fast, limit the file sizes created by dx, dex2oat and ART output to 2MB. +file_size_limit=2048 +if echo "$test_dir" | grep 089; then + file_size_limit=5120 +elif echo "$test_dir" | grep 083; then + file_size_limit=5120 +fi +if ! 
ulimit "$file_size_limit"; then + echo "ulimit file size setting failed" +fi + good="no" if [ "$dev_mode" = "yes" ]; then "./${build}" 2>&1 @@ -376,7 +387,7 @@ fi echo '#################### info' cat "${td_info}" | sed 's/^/# /g' echo '#################### diffs' - diff --strip-trailing-cr -u "$expected" "$output" + diff --strip-trailing-cr -u "$expected" "$output" | tail -n 500 echo '####################' echo ' ' fi |