87 files changed, 3006 insertions(+), 770 deletions(-)
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk index bac0ff36fe..0cd90c97a6 100644 --- a/build/Android.common_build.mk +++ b/build/Android.common_build.mk @@ -252,6 +252,7 @@ art_non_debug_cflags := \ art_debug_cflags := \ $(ART_DEBUG_OPT_FLAG) \ -DDYNAMIC_ANNOTATIONS_ENABLED=1 \ + -DVIXL_DEBUG \ -UNDEBUG # Assembler flags for non-debug ART and ART tools. @@ -263,20 +264,46 @@ art_debug_asflags := -UNDEBUG art_host_non_debug_cflags := $(art_non_debug_cflags) art_target_non_debug_cflags := $(art_non_debug_cflags) +### +# Frame size +### + +# Size of the stack-overflow gap. +ART_STACK_OVERFLOW_GAP_arm := 8192 +ART_STACK_OVERFLOW_GAP_arm64 := 8192 +ART_STACK_OVERFLOW_GAP_mips := 16384 +ART_STACK_OVERFLOW_GAP_mips64 := 16384 +ART_STACK_OVERFLOW_GAP_x86 := 8192 +ART_STACK_OVERFLOW_GAP_x86_64 := 8192 +ART_COMMON_STACK_OVERFLOW_DEFINES := \ + -DART_STACK_OVERFLOW_GAP_arm=$(ART_STACK_OVERFLOW_GAP_arm) \ + -DART_STACK_OVERFLOW_GAP_arm64=$(ART_STACK_OVERFLOW_GAP_arm64) \ + -DART_STACK_OVERFLOW_GAP_mips=$(ART_STACK_OVERFLOW_GAP_mips) \ + -DART_STACK_OVERFLOW_GAP_mips64=$(ART_STACK_OVERFLOW_GAP_mips64) \ + -DART_STACK_OVERFLOW_GAP_x86=$(ART_STACK_OVERFLOW_GAP_x86) \ + -DART_STACK_OVERFLOW_GAP_x86_64=$(ART_STACK_OVERFLOW_GAP_x86_64) \ + +# Keep these as small as possible. We have separate values as we have some host vs target +# specific code (and previously GCC vs Clang). +ART_HOST_FRAME_SIZE_LIMIT := 1736 +ART_TARGET_FRAME_SIZE_LIMIT := 1736 + +# Frame size adaptations for instrumented builds. +ifdef SANITIZE_TARGET + ART_TARGET_FRAME_SIZE_LIMIT := 6400 +endif + +# Add frame-size checks for non-debug builds. ifeq ($(HOST_OS),linux) - # Larger frame-size for host clang builds today ifneq ($(ART_COVERAGE),true) ifneq ($(NATIVE_COVERAGE),true) - art_host_non_debug_cflags += -Wframe-larger-than=2700 - ifdef SANITIZE_TARGET - art_target_non_debug_cflags += -Wframe-larger-than=6400 - else - art_target_non_debug_cflags += -Wframe-larger-than=1736 - endif + art_host_non_debug_cflags += -Wframe-larger-than=$(ART_HOST_FRAME_SIZE_LIMIT) + art_target_non_debug_cflags += -Wframe-larger-than=$(ART_TARGET_FRAME_SIZE_LIMIT) endif endif endif + ART_HOST_CFLAGS := $(art_cflags) ART_TARGET_CFLAGS := $(art_cflags) @@ -293,6 +320,10 @@ endif ART_HOST_CFLAGS += -DART_BASE_ADDRESS=$(LIBART_IMG_HOST_BASE_ADDRESS) ART_HOST_CFLAGS += -DART_DEFAULT_INSTRUCTION_SET_FEATURES=default $(art_host_cflags) +ART_HOST_CFLAGS += -DART_FRAME_SIZE_LIMIT=$(ART_HOST_FRAME_SIZE_LIMIT) \ + $(ART_COMMON_STACK_OVERFLOW_DEFINES) + + ifndef LIBART_IMG_TARGET_BASE_ADDRESS $(error LIBART_IMG_TARGET_BASE_ADDRESS unset) endif @@ -300,6 +331,9 @@ endif ART_TARGET_CFLAGS += -DART_TARGET \ -DART_BASE_ADDRESS=$(LIBART_IMG_TARGET_BASE_ADDRESS) \ +ART_TARGET_CFLAGS += -DART_FRAME_SIZE_LIMIT=$(ART_TARGET_FRAME_SIZE_LIMIT) \ + $(ART_COMMON_STACK_OVERFLOW_DEFINES) + ifeq ($(ART_TARGET_LINUX),true) # Setting ART_TARGET_LINUX to true compiles art/ assuming that the target device # will be running linux rather than android. 
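Not part of the commit: the hunk above only plumbs the per-architecture stack-overflow gaps and the frame-size limit through to the compiler as -D preprocessor defines (plus the matching -Wframe-larger-than checks). As a rough illustration of how such defines could be consumed on the C++ side, here is a minimal sketch; the #if chain, kStackOverflowGap, and the final static_assert are assumptions for this sketch, not ART's actual headers, and it only compiles when built with the flags added above.

// Illustrative sketch only -- assumes the ART_STACK_OVERFLOW_GAP_* and
// ART_FRAME_SIZE_LIMIT macros from the build flags above are defined.
#include <cstddef>

// Pick the gap matching the architecture this translation unit targets.
#if defined(__arm__)
static constexpr size_t kStackOverflowGap = ART_STACK_OVERFLOW_GAP_arm;
#elif defined(__aarch64__)
static constexpr size_t kStackOverflowGap = ART_STACK_OVERFLOW_GAP_arm64;
#elif defined(__mips__) && !defined(__LP64__)
static constexpr size_t kStackOverflowGap = ART_STACK_OVERFLOW_GAP_mips;
#elif defined(__mips__) && defined(__LP64__)
static constexpr size_t kStackOverflowGap = ART_STACK_OVERFLOW_GAP_mips64;
#elif defined(__i386__)
static constexpr size_t kStackOverflowGap = ART_STACK_OVERFLOW_GAP_x86;
#elif defined(__x86_64__)
static constexpr size_t kStackOverflowGap = ART_STACK_OVERFLOW_GAP_x86_64;
#else
#error "Unsupported architecture"
#endif

// One plausible consistency check: a single frame (capped by
// -Wframe-larger-than=ART_*_FRAME_SIZE_LIMIT) should fit well inside the
// reserved stack-overflow gap, so one oversized frame cannot skip the gap.
static_assert(static_cast<size_t>(ART_FRAME_SIZE_LIMIT) < kStackOverflowGap,
              "Frame size limit must be smaller than the stack-overflow gap");

The real consumers in the tree may look different; the point is only that both limits are now visible to the preprocessor for host and target builds.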
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk index c79205fca6..3d07fc0ca8 100644 --- a/build/Android.gtest.mk +++ b/build/Android.gtest.mk @@ -634,7 +634,7 @@ define define-art-gtest ifeq ($$(art_target_or_host),target) $$(eval LOCAL_CLANG := $$(ART_TARGET_CLANG)) $$(eval $$(call set-target-local-cflags-vars,debug)) - LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils libvixl-arm64 + LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils libvixld-arm64 LOCAL_MODULE_PATH_32 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_32) LOCAL_MODULE_PATH_64 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_64) LOCAL_MULTILIB := both @@ -678,7 +678,7 @@ valgrind-test-art-target-gtest-$$(art_gtest_name): $$(ART_TEST_TARGET_VALGRIND_G LOCAL_CLANG := $$(ART_HOST_CLANG) LOCAL_CFLAGS += $$(ART_HOST_CFLAGS) $$(ART_HOST_DEBUG_CFLAGS) LOCAL_ASFLAGS += $$(ART_HOST_ASFLAGS) $$(ART_HOST_DEBUG_ASFLAGS) - LOCAL_SHARED_LIBRARIES += libicuuc-host libicui18n-host libnativehelper libziparchive-host libz-host libvixl-arm64 + LOCAL_SHARED_LIBRARIES += libicuuc-host libicui18n-host libnativehelper libziparchive-host libz-host libvixld-arm64 LOCAL_LDLIBS := -lpthread -ldl LOCAL_IS_HOST_MODULE := true LOCAL_MULTILIB := both diff --git a/cmdline/cmdline_types.h b/cmdline/cmdline_types.h index 1146f958ca..b57383b963 100644 --- a/cmdline/cmdline_types.h +++ b/cmdline/cmdline_types.h @@ -24,14 +24,16 @@ // Includes for the types that are being specialized #include <string> -#include "unit.h" -#include "jdwp/jdwp.h" #include "base/logging.h" #include "base/time_utils.h" #include "experimental_flags.h" #include "gc/collector_type.h" #include "gc/space/large_object_space.h" +#include "jdwp/jdwp.h" #include "jit/profile_saver_options.h" +#include "plugin.h" +#include "ti/agent.h" +#include "unit.h" namespace art { @@ -381,6 +383,38 @@ struct CmdlineType<std::string> : CmdlineTypeParser<std::string> { }; template <> +struct CmdlineType<std::vector<Plugin>> : CmdlineTypeParser<std::vector<Plugin>> { + Result Parse(const std::string& args) { + assert(false && "Use AppendValues() for a Plugin vector type"); + return Result::Failure("Unconditional failure: Plugin vector must be appended: " + args); + } + + Result ParseAndAppend(const std::string& args, + std::vector<Plugin>& existing_value) { + existing_value.push_back(Plugin::Create(args)); + return Result::SuccessNoValue(); + } + + static const char* Name() { return "std::vector<Plugin>"; } +}; + +template <> +struct CmdlineType<std::vector<ti::Agent>> : CmdlineTypeParser<std::vector<ti::Agent>> { + Result Parse(const std::string& args) { + assert(false && "Use AppendValues() for an Agent vector type"); + return Result::Failure("Unconditional failure: Agent vector must be appended: " + args); + } + + Result ParseAndAppend(const std::string& args, + std::vector<ti::Agent>& existing_value) { + existing_value.push_back(ti::Agent::Create(args)); + return Result::SuccessNoValue(); + } + + static const char* Name() { return "std::vector<ti::Agent>"; } +}; + +template <> struct CmdlineType<std::vector<std::string>> : CmdlineTypeParser<std::vector<std::string>> { Result Parse(const std::string& args) { assert(false && "Use AppendValues() for a string vector type"); @@ -625,6 +659,8 @@ struct CmdlineType<LogVerbosity> : CmdlineTypeParser<LogVerbosity> { log_verbosity.image = true; } else if (verbose_options[j] == "systrace-locks") { log_verbosity.systrace_lock_logging = true; + } else if (verbose_options[j] == 
"agents") { + log_verbosity.agents = true; } else { return Result::Usage(std::string("Unknown -verbose option ") + verbose_options[j]); } @@ -735,6 +771,10 @@ struct CmdlineType<ExperimentalFlags> : CmdlineTypeParser<ExperimentalFlags> { Result ParseAndAppend(const std::string& option, ExperimentalFlags& existing) { if (option == "none") { existing = ExperimentalFlags::kNone; + } else if (option == "agents") { + existing = existing | ExperimentalFlags::kAgents; + } else if (option == "runtime-plugins") { + existing = existing | ExperimentalFlags::kRuntimePlugins; } else { return Result::Failure(std::string("Unknown option '") + option + "'"); } diff --git a/cmdline/detail/cmdline_parse_argument_detail.h b/cmdline/detail/cmdline_parse_argument_detail.h index 4b56804ea6..84beff59c7 100644 --- a/cmdline/detail/cmdline_parse_argument_detail.h +++ b/cmdline/detail/cmdline_parse_argument_detail.h @@ -497,7 +497,7 @@ namespace art { std::function<void(TArg&)> save_argument_; std::function<TArg&(void)> load_argument_; }; - } // namespace detail // NOLINT [readability/namespace] [5] [whitespace/comments] [2] + } // namespace detail // NOLINT [readability/namespace] [5] } // namespace art #endif // ART_CMDLINE_DETAIL_CMDLINE_PARSE_ARGUMENT_DETAIL_H_ diff --git a/cmdline/detail/cmdline_parser_detail.h b/cmdline/detail/cmdline_parser_detail.h index 9b43bb0f5d..24dbca2642 100644 --- a/cmdline/detail/cmdline_parser_detail.h +++ b/cmdline/detail/cmdline_parser_detail.h @@ -35,7 +35,7 @@ namespace art { private: template <typename TStream, typename T> static std::true_type InsertionOperatorTest(TStream& os, const T& value, - std::remove_reference<decltype(os << value)>* = 0); // NOLINT [whitespace/operators] [3] + std::remove_reference<decltype(os << value)>* = 0); // NOLINT [whitespace/operators] [3] template <typename TStream, typename ... T> static std::false_type InsertionOperatorTest(TStream& os, const T& ... args); @@ -53,7 +53,7 @@ namespace art { private: template <typename TL, typename TR> static std::true_type EqualityOperatorTest(const TL& left, const TR& right, - std::remove_reference<decltype(left == right)>* = 0); // NOLINT [whitespace/operators] [3] + std::remove_reference<decltype(left == right)>* = 0); // NOLINT [whitespace/operators] [3] template <typename TL, typename ... T> static std::false_type EqualityOperatorTest(const TL& left, const T& ... args); diff --git a/compiler/Android.mk b/compiler/Android.mk index 0ede30d03a..6c6d99f616 100644 --- a/compiler/Android.mk +++ b/compiler/Android.mk @@ -284,12 +284,12 @@ $$(ENUM_OPERATOR_OUT_GEN): $$(GENERATED_SRC_DIR)/%_operator_out.cc : $(LOCAL_PAT endif LOCAL_ADDITIONAL_DEPENDENCIES := art/build/Android.common_build.mk LOCAL_ADDITIONAL_DEPENDENCIES += $(LOCAL_PATH)/Android.mk - # Vixl assembly support for ARM64 targets. + # VIXL assembly support for ARM64 targets. 
ifeq ($$(art_ndebug_or_debug),debug) ifeq ($$(art_static_or_shared), static) - LOCAL_WHOLESTATIC_LIBRARIES += libvixl-arm64 + LOCAL_WHOLESTATIC_LIBRARIES += libvixld-arm64 else - LOCAL_SHARED_LIBRARIES += libvixl-arm64 + LOCAL_SHARED_LIBRARIES += libvixld-arm64 endif else ifeq ($$(art_static_or_shared), static) diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc index 828603398b..d0a8335a99 100644 --- a/compiler/driver/compiler_driver.cc +++ b/compiler/driver/compiler_driver.cc @@ -77,10 +77,6 @@ namespace art { static constexpr bool kTimeCompileMethod = !kIsDebugBuild; -// Whether classes-to-compile and methods-to-compile are only applied to the boot image, or, when -// given, too all compilations. -static constexpr bool kRestrictCompilationFiltersToImage = true; - // Print additional info during profile guided compilation. static constexpr bool kDebugProfileGuidedCompilation = false; @@ -946,10 +942,6 @@ bool CompilerDriver::IsImageClass(const char* descriptor) const { } bool CompilerDriver::IsClassToCompile(const char* descriptor) const { - if (kRestrictCompilationFiltersToImage && !IsBootImage()) { - return true; - } - if (classes_to_compile_ == nullptr) { return true; } @@ -957,10 +949,6 @@ bool CompilerDriver::IsClassToCompile(const char* descriptor) const { } bool CompilerDriver::IsMethodToCompile(const MethodReference& method_ref) const { - if (kRestrictCompilationFiltersToImage && !IsBootImage()) { - return true; - } - if (methods_to_compile_ == nullptr) { return true; } diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index ab85c12a1d..4c4128c5f8 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -59,8 +59,8 @@ static constexpr DRegister DTMP = D31; static constexpr uint32_t kPackedSwitchCompareJumpThreshold = 7; -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, x).Int32Value() class NullCheckSlowPathARM : public SlowPathCode { @@ -432,11 +432,6 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCode { (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); - // The read barrier instrumentation of object ArrayGet - // instructions does not support the HIntermediateAddress - // instruction. - DCHECK(!(instruction_->IsArrayGet() && - instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); __ Bind(GetEntryLabel()); // No need to save live registers; it's taken care of by the @@ -517,11 +512,6 @@ class ReadBarrierForHeapReferenceSlowPathARM : public SlowPathCode { (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); - // The read barrier instrumentation of object ArrayGet - // instructions does not support the HIntermediateAddress - // instruction. 
- DCHECK(!(instruction_->IsArrayGet() && - instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); __ Bind(GetEntryLabel()); SaveLiveRegisters(codegen, locations); @@ -706,8 +696,8 @@ class ReadBarrierForRootSlowPathARM : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<ArmAssembler*>(GetAssembler())-> // NOLINT inline Condition ARMCondition(IfCondition cond) { switch (cond) { @@ -4507,6 +4497,8 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { Primitive::Type type = instruction->GetType(); HInstruction* array_instr = instruction->GetArray(); bool has_intermediate_address = array_instr->IsIntermediateAddress(); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier)); switch (type) { case Primitive::kPrimBoolean: @@ -4541,11 +4533,6 @@ void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) { } case Primitive::kPrimNot: { - // The read barrier instrumentation of object ArrayGet - // instructions does not support the HIntermediateAddress - // instruction. - DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier)); - static_assert( sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t), "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes."); @@ -4688,6 +4675,8 @@ void InstructionCodeGeneratorARM::VisitArraySet(HArraySet* instruction) { Location value_loc = locations->InAt(2); HInstruction* array_instr = instruction->GetArray(); bool has_intermediate_address = array_instr->IsIntermediateAddress(); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier)); switch (value_type) { case Primitive::kPrimBoolean: @@ -4952,6 +4941,8 @@ void InstructionCodeGeneratorARM::VisitArrayLength(HArrayLength* instruction) { } void LocationsBuilderARM::VisitIntermediateAddress(HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); @@ -4966,6 +4957,9 @@ void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* Location first = locations->InAt(0); Location second = locations->InAt(1); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); + if (second.IsRegister()) { __ add(out.AsRegister<Register>(), first.AsRegister<Register>(), diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 9ceb3109cd..d95e7df6b4 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -131,8 +131,8 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type retur return ARM64ReturnLocation(return_type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. 
-#define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, x).Int32Value() // Calculate memory accessing operand for save/restore live registers. @@ -598,11 +598,6 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier marking slow path: " << instruction_->DebugName(); - // The read barrier instrumentation of object ArrayGet - // instructions does not support the HIntermediateAddress - // instruction. - DCHECK(!(instruction_->IsArrayGet() && - instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); __ Bind(GetEntryLabel()); // No need to save live registers; it's taken care of by the @@ -685,9 +680,7 @@ class ReadBarrierForHeapReferenceSlowPathARM64 : public SlowPathCodeARM64 { (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified()) << "Unexpected instruction in read barrier for heap reference slow path: " << instruction_->DebugName(); - // The read barrier instrumentation of object ArrayGet - // instructions does not support the HIntermediateAddress - // instruction. + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. DCHECK(!(instruction_->IsArrayGet() && instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); @@ -1990,6 +1983,8 @@ void InstructionCodeGeneratorARM64::VisitArm64DataProcWithShifterOp( } void LocationsBuilderARM64::VisitIntermediateAddress(HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); locations->SetInAt(0, Location::RequiresRegister()); @@ -1997,7 +1992,10 @@ void LocationsBuilderARM64::VisitIntermediateAddress(HIntermediateAddress* instr locations->SetOut(Location::RequiresRegister()); } -void InstructionCodeGeneratorARM64::VisitIntermediateAddress(HIntermediateAddress* instruction) { +void InstructionCodeGeneratorARM64::VisitIntermediateAddress( + HIntermediateAddress* instruction) { + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); __ Add(OutputRegister(instruction), InputRegisterAt(instruction, 0), Operand(InputOperandAt(instruction, 1))); @@ -2093,15 +2091,11 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { // Block pools between `Load` and `MaybeRecordImplicitNullCheck`. BlockPoolsScope block_pools(masm); - // The read barrier instrumentation of object ArrayGet instructions - // does not support the HIntermediateAddress instruction. - DCHECK(!((type == Primitive::kPrimNot) && - instruction->GetArray()->IsIntermediateAddress() && - kEmitCompilerReadBarrier)); - if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Object ArrayGet with Baker's read barrier case. Register temp = temps.AcquireW(); + // The read barrier instrumentation does not support the HIntermediateAddress instruction yet. 
+ DCHECK(!instruction->GetArray()->IsIntermediateAddress()); // Note that a potential implicit null check is handled in the // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. codegen_->GenerateArrayLoadWithBakerReadBarrier( @@ -2115,6 +2109,9 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { } else { Register temp = temps.AcquireSameSizeAs(obj); if (instruction->GetArray()->IsIntermediateAddress()) { + // The read barrier instrumentation does not support the + // HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); // We do not need to compute the intermediate address from the array: the // input instruction has done it already. See the comment in // `TryExtractArrayAccessAddress()`. @@ -2204,6 +2201,9 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { UseScratchRegisterScope temps(masm); Register temp = temps.AcquireSameSizeAs(array); if (instruction->GetArray()->IsIntermediateAddress()) { + // The read barrier instrumentation does not support the + // HIntermediateAddress instruction yet. + DCHECK(!kEmitCompilerReadBarrier); // We do not need to compute the intermediate address from the array: the // input instruction has done it already. See the comment in // `TryExtractArrayAccessAddress()`. @@ -2223,6 +2223,7 @@ void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) { codegen_->Store(value_type, value, destination); codegen_->MaybeRecordImplicitNullCheck(instruction); } else { + DCHECK(needs_write_barrier); DCHECK(!instruction->GetArray()->IsIntermediateAddress()); vixl::aarch64::Label done; SlowPathCodeARM64* slow_path = nullptr; diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 1b5fa857e7..921ce10aaa 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -27,11 +27,11 @@ #include "utils/arm64/assembler_arm64.h" #include "utils/type_reference.h" -// TODO: make vixl clean wrt -Wshadow. +// TODO(VIXL): Make VIXL compile with -Wshadow. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" -#include "a64/disasm-a64.h" -#include "a64/macro-assembler-a64.h" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" #pragma GCC diagnostic pop namespace art { diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index 59e103a3bd..58879bc2f1 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -145,8 +145,8 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type type) return MipsReturnLocation(type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<CodeGeneratorMIPS*>(codegen)->GetAssembler()-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorMIPS*>(codegen)->GetAssembler()-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, x).Int32Value() class BoundsCheckSlowPathMIPS : public SlowPathCodeMIPS { @@ -501,8 +501,8 @@ CodeGeneratorMIPS::CodeGeneratorMIPS(HGraph* graph, } #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<MipsAssembler*>(GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. 
+#define __ down_cast<MipsAssembler*>(GetAssembler())-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, x).Int32Value() void CodeGeneratorMIPS::Finalize(CodeAllocator* allocator) { diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index fe1fddc7bf..4e7a2728b1 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -102,8 +102,8 @@ Location InvokeRuntimeCallingConvention::GetReturnLocation(Primitive::Type type) return Mips64ReturnLocation(type); } -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<CodeGeneratorMIPS64*>(codegen)->GetAssembler()-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<CodeGeneratorMIPS64*>(codegen)->GetAssembler()-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, x).Int32Value() class BoundsCheckSlowPathMIPS64 : public SlowPathCodeMIPS64 { @@ -427,8 +427,8 @@ CodeGeneratorMIPS64::CodeGeneratorMIPS64(HGraph* graph, } #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<Mips64Assembler*>(GetAssembler())-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kMips64PointerSize, x).Int32Value() void CodeGeneratorMIPS64::Finalize(CodeAllocator* allocator) { diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index ade21174f4..7a561bb4ad 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -47,8 +47,8 @@ static constexpr int kC2ConditionMask = 0x400; static constexpr int kFakeReturnRegister = Register(8); -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86Assembler*>(codegen->GetAssembler())-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86PointerSize, x).Int32Value() class NullCheckSlowPathX86 : public SlowPathCode { @@ -729,8 +729,8 @@ class ReadBarrierForRootSlowPathX86 : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86Assembler*>(GetAssembler())-> /* NOLINT */ +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86Assembler*>(GetAssembler())-> // NOLINT inline Condition X86Condition(IfCondition cond) { switch (cond) { @@ -7099,12 +7099,6 @@ void CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift)); - __ andl(temp_reg, Immediate(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); // Load fence to prevent load-load reordering. 
// Note that this is a no-op, thanks to the x86 memory model. @@ -7124,8 +7118,13 @@ void CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ cmpl(temp_reg, Immediate(ReadBarrier::gray_ptr_)); - __ j(kEqual, slow_path->GetEntryLabel()); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with SHR. + static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index eadb431440..cf01a791ee 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -51,8 +51,8 @@ static constexpr FloatRegister kFpuCalleeSaves[] = { XMM12, XMM13, XMM14, XMM15 static constexpr int kC2ConditionMask = 0x400; -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kX86_64PointerSize, x).Int32Value() class NullCheckSlowPathX86_64 : public SlowPathCode { @@ -748,8 +748,8 @@ class ReadBarrierForRootSlowPathX86_64 : public SlowPathCode { }; #undef __ -// NOLINT on __ macro to suppress wrong warning/fix from clang-tidy. -#define __ down_cast<X86_64Assembler*>(GetAssembler())-> // NOLINT +// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy. +#define __ down_cast<X86_64Assembler*>(GetAssembler())-> // NOLINT inline Condition X86_64IntegerCondition(IfCondition cond) { switch (cond) { @@ -6551,12 +6551,6 @@ void CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction // /* LockWord */ lock_word = LockWord(monitor) static_assert(sizeof(LockWord) == sizeof(int32_t), "art::LockWord and int32_t have different sizes."); - // /* uint32_t */ rb_state = lock_word.ReadBarrierState() - __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift)); - __ andl(temp_reg, Immediate(LockWord::kReadBarrierStateMask)); - static_assert( - LockWord::kReadBarrierStateMask == ReadBarrier::rb_ptr_mask_, - "art::LockWord::kReadBarrierStateMask is not equal to art::ReadBarrier::rb_ptr_mask_."); // Load fence to prevent load-load reordering. // Note that this is a no-op, thanks to the x86-64 memory model. @@ -6576,8 +6570,13 @@ void CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction // if (rb_state == ReadBarrier::gray_ptr_) // ref = ReadBarrier::Mark(ref); - __ cmpl(temp_reg, Immediate(ReadBarrier::gray_ptr_)); - __ j(kEqual, slow_path->GetEntryLabel()); + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with SHR. 
+ static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1"); + static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2"); + __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1)); + __ j(kCarrySet, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index fe9a7af250..18db507c48 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -247,7 +247,7 @@ static void RunCode(InstructionSet target_isa, } else if (target_isa == kX86) { std::unique_ptr<const X86InstructionSetFeatures> features_x86( X86InstructionSetFeatures::FromCppDefines()); - x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); + TestCodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options); RunCode(&codegenX86, graph, hook_before_codegen, has_result, expected); } else if (target_isa == kX86_64) { std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64( diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h index af0ee4e197..cc949c5275 100644 --- a/compiler/optimizing/common_arm64.h +++ b/compiler/optimizing/common_arm64.h @@ -22,8 +22,13 @@ #include "nodes.h" #include "utils/arm64/assembler_arm64.h" -#include "a64/disasm-a64.h" -#include "a64/macro-assembler-a64.h" +// TODO(VIXL): Make VIXL compile with -Wshadow. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" +#include "aarch64/simulator-aarch64.h" +#pragma GCC diagnostic pop namespace art { namespace arm64 { diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc index 6632cd9969..8f7778fe68 100644 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ b/compiler/optimizing/instruction_simplifier_shared.cc @@ -231,6 +231,15 @@ bool TryExtractArrayAccessAddress(HInstruction* access, HInstruction* array, HInstruction* index, size_t data_offset) { + if (kEmitCompilerReadBarrier) { + // The read barrier instrumentation does not support the + // HIntermediateAddress instruction yet. + // + // TODO: Handle this case properly in the ARM64 and ARM code generator and + // re-enable this optimization; otherwise, remove this TODO. + // b/26601270 + return false; + } if (index->IsConstant() || (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) { // When the index is a constant all the addressing can be fitted in the @@ -242,13 +251,6 @@ bool TryExtractArrayAccessAddress(HInstruction* access, // The access may require a runtime call or the original array pointer. return false; } - if (kEmitCompilerReadBarrier && - access->IsArrayGet() && - access->AsArrayGet()->GetType() == Primitive::kPrimNot) { - // For object arrays, the read barrier instrumentation requires - // the original array pointer. - return false; - } // Proceed to extract the base address computation. 
HGraph* graph = access->GetBlock()->GetGraph(); diff --git a/compiler/optimizing/intrinsics.h b/compiler/optimizing/intrinsics.h index 3429a8fdbb..1a8eb58857 100644 --- a/compiler/optimizing/intrinsics.h +++ b/compiler/optimizing/intrinsics.h @@ -27,9 +27,6 @@ namespace art { class CompilerDriver; class DexFile; -// Temporary measure until we have caught up with the Java 7 definition of Math.round. b/26327751 -static constexpr bool kRoundIsPlusPointFive = false; - // Positive floating-point infinities. static constexpr uint32_t kPositiveInfinityFloat = 0x7f800000U; static constexpr uint64_t kPositiveInfinityDouble = UINT64_C(0x7ff0000000000000); diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index e7c40e6600..9cfe3ce569 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -29,11 +29,11 @@ using namespace vixl::aarch64; // NOLINT(build/namespaces) -// TODO: make vixl clean wrt -Wshadow. +// TODO(VIXL): Make VIXL compile with -Wshadow. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" -#include "a64/disasm-a64.h" -#include "a64/macro-assembler-a64.h" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" #pragma GCC diagnostic pop namespace art { @@ -1160,8 +1160,10 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); - Register str = XRegisterFrom(locations->InAt(0)); - Register arg = XRegisterFrom(locations->InAt(1)); + Register str = InputRegisterAt(invoke, 0); + Register arg = InputRegisterAt(invoke, 1); + DCHECK(str.IsW()); + DCHECK(arg.IsW()); Register out = OutputRegister(invoke); Register temp0 = WRegisterFrom(locations->GetTemp(0)); @@ -1192,8 +1194,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { __ Subs(out, str, arg); __ B(&end, eq); // Load lengths of this and argument strings. - __ Ldr(temp0, MemOperand(str.X(), count_offset)); - __ Ldr(temp1, MemOperand(arg.X(), count_offset)); + __ Ldr(temp0, HeapOperand(str, count_offset)); + __ Ldr(temp1, HeapOperand(arg, count_offset)); // Return zero if both strings are empty. __ Orr(out, temp0, temp1); __ Cbz(out, &end); @@ -1222,8 +1224,8 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { // Loop to compare 4x16-bit characters at a time (ok because of string data alignment). __ Bind(&loop); - __ Ldr(temp4, MemOperand(str.X(), temp1)); - __ Ldr(temp0, MemOperand(arg.X(), temp1)); + __ Ldr(temp4, MemOperand(str.X(), temp1.X())); + __ Ldr(temp0, MemOperand(arg.X(), temp1.X())); __ Cmp(temp4, temp0); __ B(ne, &find_char_diff); __ Add(temp1, temp1, char_size * 4); @@ -1242,14 +1244,14 @@ void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { __ Clz(temp1, temp1); // If the number of 16-bit chars remaining <= the index where the difference occurs (0-3), then // the difference occurs outside the remaining string data, so just return length diff (out). - __ Cmp(temp2, Operand(temp1, LSR, 4)); + __ Cmp(temp2, Operand(temp1.W(), LSR, 4)); __ B(le, &end); // Extract the characters and calculate the difference. 
__ Bic(temp1, temp1, 0xf); __ Lsr(temp0, temp0, temp1); __ Lsr(temp4, temp4, temp1); __ And(temp4, temp4, 0xffff); - __ Sub(out, temp4, Operand(temp0, UXTH)); + __ Sub(out, temp4.W(), Operand(temp0.W(), UXTH)); __ Bind(&end); diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index dc409c92d6..22f4181b92 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -753,11 +753,6 @@ void IntrinsicCodeGeneratorX86::VisitMathRint(HInvoke* invoke) { } void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) { - // See intrinsics.h. - if (!kRoundIsPlusPointFive) { - return; - } - // Do we have instruction support? if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) { HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect(); @@ -795,7 +790,6 @@ void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) { } XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>(); - Register constant_area = locations->InAt(1).AsRegister<Register>(); XmmRegister t1 = locations->GetTemp(0).AsFpuRegister<XmmRegister>(); XmmRegister t2 = locations->GetTemp(1).AsFpuRegister<XmmRegister>(); Register out = locations->Out().AsRegister<Register>(); @@ -810,10 +804,23 @@ void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) { __ movss(t2, in); __ roundss(t1, in, Immediate(1)); __ subss(t2, t1); - __ comiss(t2, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(0.5f), constant_area)); - __ j(kBelow, &skip_incr); - __ addss(t1, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(1.0f), constant_area)); - __ Bind(&skip_incr); + if (locations->GetInputCount() == 2 && locations->InAt(1).IsValid()) { + // Direct constant area available. + Register constant_area = locations->InAt(1).AsRegister<Register>(); + __ comiss(t2, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(0.5f), constant_area)); + __ j(kBelow, &skip_incr); + __ addss(t1, codegen_->LiteralInt32Address(bit_cast<int32_t, float>(1.0f), constant_area)); + __ Bind(&skip_incr); + } else { + // No constant area: go through stack. + __ pushl(Immediate(bit_cast<int32_t, float>(0.5f))); + __ pushl(Immediate(bit_cast<int32_t, float>(1.0f))); + __ comiss(t2, Address(ESP, 4)); + __ j(kBelow, &skip_incr); + __ addss(t1, Address(ESP, 0)); + __ Bind(&skip_incr); + __ addl(ESP, Immediate(8)); + } // Final conversion to an integer. Unfortunately this also does not have a // direct x86 instruction, since NaN should map to 0 and large positive diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 7dfbfb09be..ab8b05c3d4 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -598,10 +598,6 @@ static void CreateSSE41FPToIntLocations(ArenaAllocator* arena, } void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) { - // See intrinsics.h. - if (!kRoundIsPlusPointFive) { - return; - } CreateSSE41FPToIntLocations(arena_, invoke, codegen_); } @@ -646,10 +642,6 @@ void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) { } void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) { - // See intrinsics.h. 
- if (!kRoundIsPlusPointFive) { - return; - } CreateSSE41FPToIntLocations(arena_, invoke, codegen_); } diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc index 79ca5a0d86..cfdb41ab62 100644 --- a/compiler/optimizing/register_allocator_graph_color.cc +++ b/compiler/optimizing/register_allocator_graph_color.cc @@ -37,6 +37,165 @@ static constexpr size_t kMaxNumRegs = 32; // intervals are split when coloring fails. static constexpr size_t kMaxGraphColoringAttemptsDebug = 100; +// We always want to avoid spilling inside loops. +static constexpr size_t kLoopSpillWeightMultiplier = 10; + +// If we avoid moves in single jump blocks, we can avoid jumps to jumps. +static constexpr size_t kSingleJumpBlockWeightMultiplier = 2; + +// We avoid moves in blocks that dominate the exit block, since these blocks will +// be executed on every path through the method. +static constexpr size_t kDominatesExitBlockWeightMultiplier = 2; + +enum class CoalesceKind { + kAdjacentSibling, // Prevents moves at interval split points. + kFixedOutputSibling, // Prevents moves from a fixed output location. + kFixedInput, // Prevents moves into a fixed input location. + kNonlinearControlFlow, // Prevents moves between blocks. + kPhi, // Prevents phi resolution moves. + kFirstInput, // Prevents a single input move. + kAnyInput, // May lead to better instruction selection / smaller encodings. +}; + +std::ostream& operator<<(std::ostream& os, const CoalesceKind& kind) { + return os << static_cast<typename std::underlying_type<CoalesceKind>::type>(kind); +} + +static size_t LoopDepthAt(HBasicBlock* block) { + HLoopInformation* loop_info = block->GetLoopInformation(); + size_t depth = 0; + while (loop_info != nullptr) { + ++depth; + loop_info = loop_info->GetPreHeader()->GetLoopInformation(); + } + return depth; +} + +// Return the runtime cost of inserting a move instruction at the specified location. +static size_t CostForMoveAt(size_t position, const SsaLivenessAnalysis& liveness) { + HBasicBlock* block = liveness.GetBlockFromPosition(position / 2); + DCHECK(block != nullptr); + size_t cost = 1; + if (block->IsSingleJump()) { + cost *= kSingleJumpBlockWeightMultiplier; + } + if (block->Dominates(block->GetGraph()->GetExitBlock())) { + cost *= kDominatesExitBlockWeightMultiplier; + } + for (size_t loop_depth = LoopDepthAt(block); loop_depth > 0; --loop_depth) { + cost *= kLoopSpillWeightMultiplier; + } + return cost; +} + +// In general, we estimate coalesce priority by whether it will definitely avoid a move, +// and by how likely it is to create an interference graph that's harder to color. +static size_t ComputeCoalescePriority(CoalesceKind kind, + size_t position, + const SsaLivenessAnalysis& liveness) { + if (kind == CoalesceKind::kAnyInput) { + // This type of coalescing can affect instruction selection, but not moves, so we + // give it the lowest priority. + return 0; + } else { + return CostForMoveAt(position, liveness); + } +} + +enum class CoalesceStage { + kWorklist, // Currently in the iterative coalescing worklist. + kActive, // Not in a worklist, but could be considered again during iterative coalescing. + kInactive, // No longer considered until last-chance coalescing. + kDefunct, // Either the two nodes interfere, or have already been coalesced. 
+}; + +std::ostream& operator<<(std::ostream& os, const CoalesceStage& stage) { + return os << static_cast<typename std::underlying_type<CoalesceStage>::type>(stage); +} + +// Represents a coalesce opportunity between two nodes. +struct CoalesceOpportunity : public ArenaObject<kArenaAllocRegisterAllocator> { + CoalesceOpportunity(InterferenceNode* a, + InterferenceNode* b, + CoalesceKind kind, + size_t position, + const SsaLivenessAnalysis& liveness) + : node_a(a), + node_b(b), + stage(CoalesceStage::kWorklist), + priority(ComputeCoalescePriority(kind, position, liveness)) {} + + // Compare two coalesce opportunities based on their priority. + // Return true if lhs has a lower priority than that of rhs. + static bool CmpPriority(const CoalesceOpportunity* lhs, + const CoalesceOpportunity* rhs) { + return lhs->priority < rhs->priority; + } + + InterferenceNode* const node_a; + InterferenceNode* const node_b; + + // The current stage of this coalesce opportunity, indicating whether it is in a worklist, + // and whether it should still be considered. + CoalesceStage stage; + + // The priority of this coalesce opportunity, based on heuristics. + const size_t priority; +}; + +enum class NodeStage { + kInitial, // Uninitialized. + kPrecolored, // Marks fixed nodes. + kSafepoint, // Marks safepoint nodes. + kPrunable, // Marks uncolored nodes in the interference graph. + kSimplifyWorklist, // Marks non-move-related nodes with degree less than the number of registers. + kFreezeWorklist, // Marks move-related nodes with degree less than the number of registers. + kSpillWorklist, // Marks nodes with degree greater or equal to the number of registers. + kPruned // Marks nodes already pruned from the interference graph. +}; + +std::ostream& operator<<(std::ostream& os, const NodeStage& stage) { + return os << static_cast<typename std::underlying_type<NodeStage>::type>(stage); +} + +// Returns the estimated cost of spilling a particular live interval. +static float ComputeSpillWeight(LiveInterval* interval, const SsaLivenessAnalysis& liveness) { + if (interval->HasRegister()) { + // Intervals with a fixed register cannot be spilled. + return std::numeric_limits<float>::min(); + } + + size_t length = interval->GetLength(); + if (length == 1) { + // Tiny intervals should have maximum priority, since they cannot be split any further. + return std::numeric_limits<float>::max(); + } + + size_t use_weight = 0; + if (interval->GetDefinedBy() != nullptr && interval->DefinitionRequiresRegister()) { + // Cost for spilling at a register definition point. + use_weight += CostForMoveAt(interval->GetStart() + 1, liveness); + } + + UsePosition* use = interval->GetFirstUse(); + while (use != nullptr && use->GetPosition() <= interval->GetStart()) { + // Skip uses before the start of this live interval. + use = use->GetNext(); + } + + while (use != nullptr && use->GetPosition() <= interval->GetEnd()) { + if (use->GetUser() != nullptr && use->RequiresRegister()) { + // Cost for spilling at a register use point. + use_weight += CostForMoveAt(use->GetUser()->GetLifetimePosition() - 1, liveness); + } + use = use->GetNext(); + } + + // We divide by the length of the interval because we want to prioritize + // short intervals; we do not benefit much if we split them further. + return static_cast<float>(use_weight) / static_cast<float>(length); +} + // Interference nodes make up the interference graph, which is the primary data structure in // graph coloring register allocation. 
Each node represents a single live interval, and contains // a set of adjacent nodes corresponding to intervals overlapping with its own. To save memory, @@ -58,84 +217,320 @@ static constexpr size_t kMaxGraphColoringAttemptsDebug = 100; // and thus whether it is safe to prune it from the interference graph early on. class InterferenceNode : public ArenaObject<kArenaAllocRegisterAllocator> { public: - InterferenceNode(ArenaAllocator* allocator, LiveInterval* interval, size_t id) - : interval_(interval), - adjacent_nodes_(CmpPtr, allocator->Adapter(kArenaAllocRegisterAllocator)), - out_degree_(0), - id_(id) {} - - // Used to maintain determinism when storing InterferenceNode pointers in sets. - static bool CmpPtr(const InterferenceNode* lhs, const InterferenceNode* rhs) { - return lhs->id_ < rhs->id_; + InterferenceNode(ArenaAllocator* allocator, + LiveInterval* interval, + const SsaLivenessAnalysis& liveness) + : stage(NodeStage::kInitial), + interval_(interval), + adjacent_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + coalesce_opportunities_(allocator->Adapter(kArenaAllocRegisterAllocator)), + out_degree_(interval->HasRegister() ? std::numeric_limits<size_t>::max() : 0), + alias_(this), + spill_weight_(ComputeSpillWeight(interval, liveness)), + requires_color_(interval->RequiresRegister()) { + DCHECK(!interval->IsHighInterval()) << "Pair nodes should be represented by the low interval"; } - void AddInterference(InterferenceNode* other) { - if (adjacent_nodes_.insert(other).second) { + void AddInterference(InterferenceNode* other, bool guaranteed_not_interfering_yet) { + DCHECK(!IsPrecolored()) << "To save memory, fixed nodes should not have outgoing interferences"; + DCHECK_NE(this, other) << "Should not create self loops in the interference graph"; + DCHECK_EQ(this, alias_) << "Should not add interferences to a node that aliases another"; + DCHECK_NE(stage, NodeStage::kPruned); + DCHECK_NE(other->stage, NodeStage::kPruned); + if (guaranteed_not_interfering_yet) { + DCHECK(std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other) + == adjacent_nodes_.end()); + adjacent_nodes_.push_back(other); out_degree_ += EdgeWeightWith(other); + } else { + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + if (it == adjacent_nodes_.end()) { + adjacent_nodes_.push_back(other); + out_degree_ += EdgeWeightWith(other); + } } } void RemoveInterference(InterferenceNode* other) { - if (adjacent_nodes_.erase(other) > 0) { + DCHECK_EQ(this, alias_) << "Should not remove interferences from a coalesced node"; + DCHECK_EQ(other->stage, NodeStage::kPruned) << "Should only remove interferences when pruning"; + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + if (it != adjacent_nodes_.end()) { + adjacent_nodes_.erase(it); out_degree_ -= EdgeWeightWith(other); } } bool ContainsInterference(InterferenceNode* other) const { - return adjacent_nodes_.count(other) > 0; + DCHECK(!IsPrecolored()) << "Should not query fixed nodes for interferences"; + DCHECK_EQ(this, alias_) << "Should not query a coalesced node for interferences"; + auto it = std::find(adjacent_nodes_.begin(), adjacent_nodes_.end(), other); + return it != adjacent_nodes_.end(); } LiveInterval* GetInterval() const { return interval_; } - const ArenaSet<InterferenceNode*, decltype(&CmpPtr)>& GetAdjacentNodes() const { + const ArenaVector<InterferenceNode*>& GetAdjacentNodes() const { return adjacent_nodes_; } size_t GetOutDegree() const { + // Pre-colored nodes have infinite degree. 
+ DCHECK(!IsPrecolored() || out_degree_ == std::numeric_limits<size_t>::max()); return out_degree_; } - size_t GetId() const { - return id_; + void AddCoalesceOpportunity(CoalesceOpportunity* opportunity) { + coalesce_opportunities_.push_back(opportunity); + } + + void ClearCoalesceOpportunities() { + coalesce_opportunities_.clear(); + } + + bool IsMoveRelated() const { + for (CoalesceOpportunity* opportunity : coalesce_opportunities_) { + if (opportunity->stage == CoalesceStage::kWorklist || + opportunity->stage == CoalesceStage::kActive) { + return true; + } + } + return false; + } + + // Return whether this node already has a color. + // Used to find fixed nodes in the interference graph before coloring. + bool IsPrecolored() const { + return interval_->HasRegister(); + } + + bool IsPair() const { + return interval_->HasHighInterval(); + } + + void SetAlias(InterferenceNode* rep) { + DCHECK_NE(rep->stage, NodeStage::kPruned); + DCHECK_EQ(this, alias_) << "Should only set a node's alias once"; + alias_ = rep; + } + + InterferenceNode* GetAlias() { + if (alias_ != this) { + // Recurse in order to flatten tree of alias pointers. + alias_ = alias_->GetAlias(); + } + return alias_; + } + + const ArenaVector<CoalesceOpportunity*>& GetCoalesceOpportunities() const { + return coalesce_opportunities_; + } + + float GetSpillWeight() const { + return spill_weight_; + } + + bool RequiresColor() const { + return requires_color_; } - private: // We give extra weight to edges adjacent to pair nodes. See the general comment on the // interference graph above. - size_t EdgeWeightWith(InterferenceNode* other) const { - return (interval_->HasHighInterval() || other->interval_->HasHighInterval()) ? 2 : 1; + size_t EdgeWeightWith(const InterferenceNode* other) const { + return (IsPair() || other->IsPair()) ? 2 : 1; } + // The current stage of this node, indicating which worklist it belongs to. + NodeStage stage; + + private: // The live interval that this node represents. LiveInterval* const interval_; // All nodes interfering with this one. - // TODO: There is potential to use a cheaper data structure here, especially since - // adjacency sets will usually be small. - ArenaSet<InterferenceNode*, decltype(&CmpPtr)> adjacent_nodes_; + // We use an unsorted vector as a set, since a tree or hash set is too heavy for the + // set sizes that we encounter. Using a vector leads to much better performance. + ArenaVector<InterferenceNode*> adjacent_nodes_; + + // Interference nodes that this node should be coalesced with to reduce moves. + ArenaVector<CoalesceOpportunity*> coalesce_opportunities_; // The maximum number of colors with which this node could interfere. This could be more than // the number of adjacent nodes if this is a pair node, or if some adjacent nodes are pair nodes. // We use "out" degree because incoming edges come from nodes already pruned from the graph, // and do not affect the coloring of this node. + // Pre-colored nodes are treated as having infinite degree. size_t out_degree_; - // A unique identifier for this node, used to maintain determinism when storing - // interference nodes in sets. - const size_t id_; + // The node representing this node in the interference graph. + // Initially set to `this`, and only changed if this node is coalesced into another. + InterferenceNode* alias_; - // TODO: We could cache the result of interval_->RequiresRegister(), since it - // will not change for the lifetime of this node. 
(Currently, RequiresRegister() requires - // iterating through all uses of a live interval.) + // The cost of splitting and spilling this interval to the stack. + // Nodes with a higher spill weight should be prioritized when assigning registers. + // This is essentially based on use density and location; short intervals with many uses inside + // deeply nested loops have a high spill weight. + const float spill_weight_; + + const bool requires_color_; DISALLOW_COPY_AND_ASSIGN(InterferenceNode); }; +// The order in which we color nodes is important. To guarantee forward progress, +// we prioritize intervals that require registers, and after that we prioritize +// short intervals. That way, if we fail to color a node, it either won't require a +// register, or it will be a long interval that can be split in order to make the +// interference graph sparser. +// To improve code quality, we prioritize intervals used frequently in deeply nested loops. +// (This metric is secondary to the forward progress requirements above.) +// TODO: May also want to consider: +// - Constants (since they can be rematerialized) +// - Allocated spill slots +static bool HasGreaterNodePriority(const InterferenceNode* lhs, + const InterferenceNode* rhs) { + // (1) Prioritize the node that requires a color. + if (lhs->RequiresColor() != rhs->RequiresColor()) { + return lhs->RequiresColor(); + } + + // (2) Prioritize the interval that has a higher spill weight. + return lhs->GetSpillWeight() > rhs->GetSpillWeight(); +} + +// A ColoringIteration holds the many data structures needed for a single graph coloring attempt, +// and provides methods for each phase of the attempt. +class ColoringIteration { + public: + ColoringIteration(RegisterAllocatorGraphColor* register_allocator, + ArenaAllocator* allocator, + bool processing_core_regs, + size_t num_regs) + : register_allocator_(register_allocator), + allocator_(allocator), + processing_core_regs_(processing_core_regs), + num_regs_(num_regs), + interval_node_map_(allocator->Adapter(kArenaAllocRegisterAllocator)), + prunable_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + pruned_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + simplify_worklist_(allocator->Adapter(kArenaAllocRegisterAllocator)), + freeze_worklist_(allocator->Adapter(kArenaAllocRegisterAllocator)), + spill_worklist_(HasGreaterNodePriority, allocator->Adapter(kArenaAllocRegisterAllocator)), + coalesce_worklist_(CoalesceOpportunity::CmpPriority, + allocator->Adapter(kArenaAllocRegisterAllocator)) {} + + // Use the intervals collected from instructions to construct an + // interference graph mapping intervals to adjacency lists. + // Also, collect synthesized safepoint nodes, used to keep + // track of live intervals across safepoints. + // TODO: Should build safepoints elsewhere. + void BuildInterferenceGraph(const ArenaVector<LiveInterval*>& intervals, + const ArenaVector<InterferenceNode*>& physical_nodes, + ArenaVector<InterferenceNode*>* safepoints); + + // Add coalesce opportunities to interference nodes. + void FindCoalesceOpportunities(); + + // Prune nodes from the interference graph to be colored later. Build + // a stack (pruned_nodes) containing these intervals in an order determined + // by various heuristics. + void PruneInterferenceGraph(); + + // Process pruned_intervals_ to color the interference graph, spilling when + // necessary. Returns true if successful. Else, some intervals have been + // split, and the interference graph should be rebuilt for another attempt. 
+  bool ColorInterferenceGraph();
+
+  // Return prunable nodes.
+  // The register allocator will need to access prunable nodes after coloring
+  // in order to tell the code generator which registers have been assigned.
+  const ArenaVector<InterferenceNode*>& GetPrunableNodes() const {
+    return prunable_nodes_;
+  }
+
+ private:
+  // Create a coalesce opportunity between two nodes.
+  void CreateCoalesceOpportunity(InterferenceNode* a,
+                                 InterferenceNode* b,
+                                 CoalesceKind kind,
+                                 size_t position);
+
+  // Add an edge in the interference graph, if valid.
+  // Note that `guaranteed_not_interfering_yet` is used to optimize adjacency set insertion
+  // when possible.
+  void AddPotentialInterference(InterferenceNode* from,
+                                InterferenceNode* to,
+                                bool guaranteed_not_interfering_yet,
+                                bool both_directions = true);
+
+  // Invalidate all coalesce opportunities this node has, so that it (and possibly its neighbors)
+  // may be pruned from the interference graph.
+  void FreezeMoves(InterferenceNode* node);
+
+  // Prune a node from the interference graph, updating worklists if necessary.
+  void PruneNode(InterferenceNode* node);
+
+  // Add coalesce opportunities associated with this node to the coalesce worklist.
+  void EnableCoalesceOpportunities(InterferenceNode* node);
+
+  // If needed, move `node` from the freeze worklist to the simplify worklist.
+  void CheckTransitionFromFreezeWorklist(InterferenceNode* node);
+
+  // Return true if `into` is colored, and `from` can be coalesced with `into` conservatively.
+  bool PrecoloredHeuristic(InterferenceNode* from, InterferenceNode* into);
+
+  // Return true if `from` and `into` are uncolored, and can be coalesced conservatively.
+  bool UncoloredHeuristic(InterferenceNode* from, InterferenceNode* into);
+
+  void Coalesce(CoalesceOpportunity* opportunity);
+
+  // Merge `from` into `into` in the interference graph.
+  void Combine(InterferenceNode* from, InterferenceNode* into);
+
+  // A reference to the register allocator instance,
+  // needed to split intervals and assign spill slots.
+  RegisterAllocatorGraphColor* register_allocator_;
+
+  // An arena allocator used for a single graph coloring attempt.
+  ArenaAllocator* allocator_;
+
+  const bool processing_core_regs_;
+
+  const size_t num_regs_;
+
+  // A map from live intervals to interference nodes.
+  ArenaHashMap<LiveInterval*, InterferenceNode*> interval_node_map_;
+
+  // Uncolored nodes that should be pruned from the interference graph.
+  ArenaVector<InterferenceNode*> prunable_nodes_;
+
+  // A stack of nodes pruned from the interference graph, waiting to be colored.
+  ArenaStdStack<InterferenceNode*> pruned_nodes_;
+
+  // A queue containing low degree, non-move-related nodes that can be pruned immediately.
+  ArenaDeque<InterferenceNode*> simplify_worklist_;
+
+  // A queue containing low degree, move-related nodes.
+  ArenaDeque<InterferenceNode*> freeze_worklist_;
+
+  // A queue containing high degree nodes.
+  // If we have to prune from the spill worklist, we cannot guarantee
+  // the pruned node a color, so we order the worklist by priority.
+  ArenaPriorityQueue<InterferenceNode*, decltype(&HasGreaterNodePriority)> spill_worklist_;
+
+  // A queue containing coalesce opportunities.
+  // We order the coalesce worklist by priority, since some coalesce opportunities (e.g., those
+  // inside of loops) are more important than others.
+ ArenaPriorityQueue<CoalesceOpportunity*, + decltype(&CoalesceOpportunity::CmpPriority)> coalesce_worklist_; + + DISALLOW_COPY_AND_ASSIGN(ColoringIteration); +}; + static bool IsCoreInterval(LiveInterval* interval) { - return interval->GetType() != Primitive::kPrimFloat - && interval->GetType() != Primitive::kPrimDouble; + return !Primitive::IsFloatingPointType(interval->GetType()); } static size_t ComputeReservedArtMethodSlots(const CodeGenerator& codegen) { @@ -144,14 +539,16 @@ static size_t ComputeReservedArtMethodSlots(const CodeGenerator& codegen) { RegisterAllocatorGraphColor::RegisterAllocatorGraphColor(ArenaAllocator* allocator, CodeGenerator* codegen, - const SsaLivenessAnalysis& liveness) + const SsaLivenessAnalysis& liveness, + bool iterative_move_coalescing) : RegisterAllocator(allocator, codegen, liveness), + iterative_move_coalescing_(iterative_move_coalescing), core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), temp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), safepoints_(allocator->Adapter(kArenaAllocRegisterAllocator)), - physical_core_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), - physical_fp_intervals_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_core_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), + physical_fp_nodes_(allocator->Adapter(kArenaAllocRegisterAllocator)), int_spill_slot_counter_(0), double_spill_slot_counter_(0), float_spill_slot_counter_(0), @@ -162,17 +559,18 @@ RegisterAllocatorGraphColor::RegisterAllocatorGraphColor(ArenaAllocator* allocat number_of_globally_blocked_core_regs_(0), number_of_globally_blocked_fp_regs_(0), max_safepoint_live_core_regs_(0), - max_safepoint_live_fp_regs_(0), - coloring_attempt_allocator_(nullptr) { + max_safepoint_live_fp_regs_(0) { // Before we ask for blocked registers, set them up in the code generator. codegen->SetupBlockedRegisters(); // Initialize physical core register live intervals and blocked registers. // This includes globally blocked registers, such as the stack pointer. - physical_core_intervals_.resize(codegen->GetNumberOfCoreRegisters(), nullptr); - for (size_t i = 0; i < codegen->GetNumberOfCoreRegisters(); ++i) { + physical_core_nodes_.resize(codegen_->GetNumberOfCoreRegisters(), nullptr); + for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) { LiveInterval* interval = LiveInterval::MakeFixedInterval(allocator_, i, Primitive::kPrimInt); - physical_core_intervals_[i] = interval; + physical_core_nodes_[i] = + new (allocator_) InterferenceNode(allocator_, interval, liveness); + physical_core_nodes_[i]->stage = NodeStage::kPrecolored; core_intervals_.push_back(interval); if (codegen_->IsBlockedCoreRegister(i)) { ++number_of_globally_blocked_core_regs_; @@ -180,10 +578,12 @@ RegisterAllocatorGraphColor::RegisterAllocatorGraphColor(ArenaAllocator* allocat } } // Initialize physical floating point register live intervals and blocked registers. 
- physical_fp_intervals_.resize(codegen->GetNumberOfFloatingPointRegisters(), nullptr); - for (size_t i = 0; i < codegen->GetNumberOfFloatingPointRegisters(); ++i) { + physical_fp_nodes_.resize(codegen_->GetNumberOfFloatingPointRegisters(), nullptr); + for (size_t i = 0; i < codegen_->GetNumberOfFloatingPointRegisters(); ++i) { LiveInterval* interval = LiveInterval::MakeFixedInterval(allocator_, i, Primitive::kPrimFloat); - physical_fp_intervals_[i] = interval; + physical_fp_nodes_[i] = + new (allocator_) InterferenceNode(allocator_, interval, liveness); + physical_fp_nodes_[i]->stage = NodeStage::kPrecolored; fp_intervals_.push_back(interval); if (codegen_->IsBlockedFloatingPointRegister(i)) { ++number_of_globally_blocked_fp_regs_; @@ -213,24 +613,44 @@ void RegisterAllocatorGraphColor::AllocateRegisters() { << "which could be caused by prioritizing the wrong live intervals. (Short intervals " << "should be prioritized over long ones, because they cannot be split further.)"; - // Reset the allocator for the next coloring attempt. + // Many data structures are cleared between graph coloring attempts, so we reduce + // total memory usage by using a new arena allocator for each attempt. ArenaAllocator coloring_attempt_allocator(allocator_->GetArenaPool()); - coloring_attempt_allocator_ = &coloring_attempt_allocator; + ColoringIteration iteration(this, + &coloring_attempt_allocator, + processing_core_regs, + num_registers); - // (2) Build the interference graph. - ArenaVector<InterferenceNode*> prunable_nodes( - coloring_attempt_allocator_->Adapter(kArenaAllocRegisterAllocator)); + // (2) Build the interference graph. Also gather safepoints. ArenaVector<InterferenceNode*> safepoints( - coloring_attempt_allocator_->Adapter(kArenaAllocRegisterAllocator)); - BuildInterferenceGraph(intervals, &prunable_nodes, &safepoints); + coloring_attempt_allocator.Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<InterferenceNode*>& physical_nodes = processing_core_regs + ? physical_core_nodes_ + : physical_fp_nodes_; + iteration.BuildInterferenceGraph(intervals, physical_nodes, &safepoints); + + // (3) Add coalesce opportunities. + // If we have tried coloring the graph a suspiciously high number of times, give + // up on move coalescing, just in case the coalescing heuristics are not conservative. + // (This situation will be caught if DCHECKs are turned on.) + if (iterative_move_coalescing_ && attempt <= kMaxGraphColoringAttemptsDebug) { + iteration.FindCoalesceOpportunities(); + } - // (3) Prune all uncolored nodes from interference graph. - ArenaStdStack<InterferenceNode*> pruned_nodes( - coloring_attempt_allocator_->Adapter(kArenaAllocRegisterAllocator)); - PruneInterferenceGraph(prunable_nodes, num_registers, &pruned_nodes); + // (4) Prune all uncolored nodes from interference graph. + iteration.PruneInterferenceGraph(); - // (4) Color pruned nodes based on interferences. - bool successful = ColorInterferenceGraph(&pruned_nodes, num_registers); + // (5) Color pruned nodes based on interferences. + bool successful = iteration.ColorInterferenceGraph(); + + // We manually clear coalesce opportunities for physical nodes, + // since they persist across coloring attempts. + for (InterferenceNode* node : physical_core_nodes_) { + node->ClearCoalesceOpportunities(); + } + for (InterferenceNode* node : physical_fp_nodes_) { + node->ClearCoalesceOpportunities(); + } if (successful) { // Compute the maximum number of live registers across safepoints. 
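The allocation loop above repeats steps (2) through (5) until coloring succeeds, rebuilding the interference graph from scratch on each attempt. The following is an editorial sketch of that control flow, not code from the patch; `Iteration`, `AllocateOneKind`, `enable_coalescing`, and `max_attempts` are placeholder names standing in for the real ART types and fields.

#include <cstddef>

// Stand-in for ColoringIteration; only the control flow mirrors the patch.
struct Iteration {
  void BuildInterferenceGraph() {}        // also gathers safepoint nodes
  void FindCoalesceOpportunities() {}
  void PruneInterferenceGraph() {}
  bool ColorInterferenceGraph() { return true; }  // false => some interval was split or spilled
};

void AllocateOneKind(bool enable_coalescing, size_t max_attempts) {
  for (size_t attempt = 0; attempt < max_attempts; ++attempt) {
    Iteration iteration;                  // fresh arena and worklists for each attempt
    iteration.BuildInterferenceGraph();
    if (enable_coalescing) {
      iteration.FindCoalesceOpportunities();
    }
    iteration.PruneInterferenceGraph();
    if (iteration.ColorInterferenceGraph()) {
      return;  // success: registers assigned, caller resolves locations next
    }
    // Otherwise the graph was too dense; intervals were split, so rebuild and retry.
  }
}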
@@ -250,7 +670,7 @@ void RegisterAllocatorGraphColor::AllocateRegisters() { // We only look at prunable_nodes because we already told the code generator about // fixed intervals while processing instructions. We also ignore the fixed intervals // placed at the top of catch blocks. - for (InterferenceNode* node : prunable_nodes) { + for (InterferenceNode* node : iteration.GetPrunableNodes()) { LiveInterval* interval = node->GetInterval(); if (interval->HasRegister()) { Location low_reg = processing_core_regs @@ -275,7 +695,7 @@ void RegisterAllocatorGraphColor::AllocateRegisters() { } // while unsuccessful } // for processing_core_instructions - // (5) Resolve locations and deconstruct SSA form. + // (6) Resolve locations and deconstruct SSA form. RegisterAllocationResolver(allocator_, codegen_, liveness_) .Resolve(max_safepoint_live_core_regs_, max_safepoint_live_fp_regs_, @@ -304,11 +724,12 @@ bool RegisterAllocatorGraphColor::Validate(bool log_fatal_on_failure) { } } - ArenaVector<LiveInterval*>& physical_intervals = processing_core_regs - ? physical_core_intervals_ - : physical_fp_intervals_; - for (LiveInterval* fixed : physical_intervals) { - if (fixed->GetFirstRange() != nullptr) { + ArenaVector<InterferenceNode*>& physical_nodes = processing_core_regs + ? physical_core_nodes_ + : physical_fp_nodes_; + for (InterferenceNode* fixed : physical_nodes) { + LiveInterval* interval = fixed->GetInterval(); + if (interval->GetFirstRange() != nullptr) { // Ideally we would check fixed ranges as well, but currently there are times when // two fixed intervals for the same register will overlap. For example, a fixed input // and a fixed output may sometimes share the same register, in which there will be two @@ -358,7 +779,8 @@ void RegisterAllocatorGraphColor::ProcessInstructions() { ProcessInstruction(phi_it.Current()); } - if (block->IsCatchBlock() || (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { + if (block->IsCatchBlock() + || (block->IsLoopHeader() && block->GetLoopInformation()->IsIrreducible())) { // By blocking all registers at the top of each catch block or irreducible loop, we force // intervals belonging to the live-in set of the catch/header block to be spilled. // TODO(ngeoffray): Phis in this block could be allocated in register. @@ -435,7 +857,9 @@ void RegisterAllocatorGraphColor::CheckForFixedInputs(HInstruction* instruction) // TODO: Ideally we would coalesce the physical register with the register // allocated to the input value, but this can be tricky if, e.g., there // could be multiple physical register uses of the same value at the - // same instruction. Need to think about it more. + // same instruction. Furthermore, there's currently no distinction between + // fixed inputs to a call (which will be clobbered) and other fixed inputs (which + // may not be clobbered). LocationSummary* locations = instruction->GetLocations(); size_t position = instruction->GetLifetimePosition(); for (size_t i = 0; i < locations->GetInputCount(); ++i) { @@ -639,8 +1063,8 @@ void RegisterAllocatorGraphColor::BlockRegister(Location location, DCHECK(location.IsRegister() || location.IsFpuRegister()); int reg = location.reg(); LiveInterval* interval = location.IsRegister() - ? physical_core_intervals_[reg] - : physical_fp_intervals_[reg]; + ? physical_core_nodes_[reg]->GetInterval() + : physical_fp_nodes_[reg]->GetInterval(); DCHECK(interval->GetRegister() == reg); bool blocked_by_codegen = location.IsRegister() ? 
codegen_->IsBlockedCoreRegister(reg) @@ -666,28 +1090,105 @@ void RegisterAllocatorGraphColor::BlockRegisters(size_t start, size_t end, bool } } -// Add an interference edge, but only if necessary. -static void AddPotentialInterference(InterferenceNode* from, InterferenceNode* to) { - if (from->GetInterval()->HasRegister()) { +void ColoringIteration::AddPotentialInterference(InterferenceNode* from, + InterferenceNode* to, + bool guaranteed_not_interfering_yet, + bool both_directions) { + if (from->IsPrecolored()) { // We save space by ignoring outgoing edges from fixed nodes. } else if (to->GetInterval()->IsSlowPathSafepoint()) { // Safepoint intervals are only there to count max live registers, // so no need to give them incoming interference edges. // This is also necessary for correctness, because we don't want nodes // to remove themselves from safepoint adjacency sets when they're pruned. + } else if (to->IsPrecolored()) { + // It is important that only a single node represents a given fixed register in the + // interference graph. We retrieve that node here. + const ArenaVector<InterferenceNode*>& physical_nodes = to->GetInterval()->IsFloatingPoint() + ? register_allocator_->physical_fp_nodes_ + : register_allocator_->physical_core_nodes_; + InterferenceNode* physical_node = physical_nodes[to->GetInterval()->GetRegister()]; + from->AddInterference(physical_node, /*guaranteed_not_interfering_yet*/ false); + DCHECK_EQ(to->GetInterval()->GetRegister(), physical_node->GetInterval()->GetRegister()); + DCHECK_EQ(to->GetAlias(), physical_node) << "Fixed nodes should alias the canonical fixed node"; + + // If a node interferes with a fixed pair node, the weight of the edge may + // be inaccurate after using the alias of the pair node, because the alias of the pair node + // is a singular node. + // We could make special pair fixed nodes, but that ends up being too conservative because + // a node could then interfere with both {r1} and {r1,r2}, leading to a degree of + // three rather than two. + // Instead, we explicitly add an interference with the high node of the fixed pair node. + // TODO: This is too conservative at time for pair nodes, but the fact that fixed pair intervals + // can be unaligned on x86 complicates things. + if (to->IsPair()) { + InterferenceNode* high_node = + physical_nodes[to->GetInterval()->GetHighInterval()->GetRegister()]; + DCHECK_EQ(to->GetInterval()->GetHighInterval()->GetRegister(), + high_node->GetInterval()->GetRegister()); + from->AddInterference(high_node, /*guaranteed_not_interfering_yet*/ false); + } } else { - from->AddInterference(to); + // Standard interference between two uncolored nodes. + from->AddInterference(to, guaranteed_not_interfering_yet); + } + + if (both_directions) { + AddPotentialInterference(to, from, guaranteed_not_interfering_yet, /*both_directions*/ false); } } -// TODO: See locations->OutputCanOverlapWithInputs(); we may want to consider -// this when building the interference graph. -void RegisterAllocatorGraphColor::BuildInterferenceGraph( +// Returns true if `in_node` represents an input interval of `out_node`, and the output interval +// is allowed to have the same register as the input interval. +// TODO: Ideally we should just produce correct intervals in liveness analysis. +// We would need to refactor the current live interval layout to do so, which is +// no small task. 
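// Editorial illustration (not part of the patch): a typical case this catches is an arithmetic
// instruction whose input has its last use at the instruction itself. If that input interval
// dies at the definition point and has no sibling starting immediately afterwards, the output
// may safely be assigned the same register, so no interference edge needs to be added.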
+static bool CheckInputOutputCanOverlap(InterferenceNode* in_node, InterferenceNode* out_node) { + LiveInterval* output_interval = out_node->GetInterval(); + HInstruction* defined_by = output_interval->GetDefinedBy(); + if (defined_by == nullptr) { + // This must not be a definition point. + return false; + } + + LocationSummary* locations = defined_by->GetLocations(); + if (locations->OutputCanOverlapWithInputs()) { + // This instruction does not allow the output to reuse a register from an input. + return false; + } + + LiveInterval* input_interval = in_node->GetInterval(); + LiveInterval* next_sibling = input_interval->GetNextSibling(); + size_t def_position = defined_by->GetLifetimePosition(); + size_t use_position = def_position + 1; + if (next_sibling != nullptr && next_sibling->GetStart() == use_position) { + // The next sibling starts at the use position, so reusing the input register in the output + // would clobber the input before it's moved into the sibling interval location. + return false; + } + + if (!input_interval->IsDeadAt(use_position) && input_interval->CoversSlow(use_position)) { + // The input interval is live after the use position. + return false; + } + + HInputsRef inputs = defined_by->GetInputs(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i]->GetLiveInterval()->GetSiblingAt(def_position) == input_interval) { + DCHECK(input_interval->SameRegisterKind(*output_interval)); + return true; + } + } + + // The input interval was not an input for this instruction. + return false; +} + +void ColoringIteration::BuildInterferenceGraph( const ArenaVector<LiveInterval*>& intervals, - ArenaVector<InterferenceNode*>* prunable_nodes, + const ArenaVector<InterferenceNode*>& physical_nodes, ArenaVector<InterferenceNode*>* safepoints) { - size_t interval_id_counter = 0; - + DCHECK(interval_node_map_.Empty() && prunable_nodes_.empty()); // Build the interference graph efficiently by ordering range endpoints // by position and doing a linear sweep to find interferences. (That is, we // jump from endpoint to endpoint, maintaining a set of intervals live at each @@ -701,21 +1202,34 @@ void RegisterAllocatorGraphColor::BuildInterferenceGraph( // For simplicity, we create a tuple for each endpoint, and then sort the tuples. // Tuple contents: (position, is_range_beginning, node). ArenaVector<std::tuple<size_t, bool, InterferenceNode*>> range_endpoints( - coloring_attempt_allocator_->Adapter(kArenaAllocRegisterAllocator)); + allocator_->Adapter(kArenaAllocRegisterAllocator)); + + // We reserve plenty of space to avoid excessive copying. + range_endpoints.reserve(4 * prunable_nodes_.size()); + for (LiveInterval* parent : intervals) { for (LiveInterval* sibling = parent; sibling != nullptr; sibling = sibling->GetNextSibling()) { LiveRange* range = sibling->GetFirstRange(); if (range != nullptr) { - InterferenceNode* node = new (coloring_attempt_allocator_) InterferenceNode( - coloring_attempt_allocator_, sibling, interval_id_counter++); + InterferenceNode* node = new (allocator_) InterferenceNode( + allocator_, sibling, register_allocator_->liveness_); + interval_node_map_.Insert(std::make_pair(sibling, node)); + if (sibling->HasRegister()) { - // Fixed nodes will never be pruned, so no need to keep track of them. + // Fixed nodes should alias the canonical node for the corresponding register. 
+ node->stage = NodeStage::kPrecolored; + InterferenceNode* physical_node = physical_nodes[sibling->GetRegister()]; + node->SetAlias(physical_node); + DCHECK_EQ(node->GetInterval()->GetRegister(), + physical_node->GetInterval()->GetRegister()); } else if (sibling->IsSlowPathSafepoint()) { // Safepoint intervals are synthesized to count max live registers. // They will be processed separately after coloring. + node->stage = NodeStage::kSafepoint; safepoints->push_back(node); } else { - prunable_nodes->push_back(node); + node->stage = NodeStage::kPrunable; + prunable_nodes_.push_back(node); } while (range != nullptr) { @@ -728,11 +1242,18 @@ void RegisterAllocatorGraphColor::BuildInterferenceGraph( } // Sort the endpoints. - std::sort(range_endpoints.begin(), range_endpoints.end()); + // We explicitly ignore the third entry of each tuple (the node pointer) in order + // to maintain determinism. + std::sort(range_endpoints.begin(), range_endpoints.end(), + [] (const std::tuple<size_t, bool, InterferenceNode*>& lhs, + const std::tuple<size_t, bool, InterferenceNode*>& rhs) { + return std::tie(std::get<0>(lhs), std::get<1>(lhs)) + < std::tie(std::get<0>(rhs), std::get<1>(rhs)); + }); // Nodes live at the current position in the linear sweep. - ArenaSet<InterferenceNode*, decltype(&InterferenceNode::CmpPtr)> live( - InterferenceNode::CmpPtr, coloring_attempt_allocator_->Adapter(kArenaAllocRegisterAllocator)); + ArenaVector<InterferenceNode*> live( + allocator_->Adapter(kArenaAllocRegisterAllocator)); // Linear sweep. When we encounter the beginning of a range, we add the corresponding node to the // live set. When we encounter the end of a range, we remove the corresponding node @@ -740,131 +1261,505 @@ void RegisterAllocatorGraphColor::BuildInterferenceGraph( for (auto it = range_endpoints.begin(); it != range_endpoints.end(); ++it) { bool is_range_beginning; InterferenceNode* node; + size_t position; // Extract information from the tuple, including the node this tuple represents. - std::tie(std::ignore, is_range_beginning, node) = *it; + std::tie(position, is_range_beginning, node) = *it; if (is_range_beginning) { + bool guaranteed_not_interfering_yet = position == node->GetInterval()->GetStart(); for (InterferenceNode* conflicting : live) { DCHECK_NE(node, conflicting); - AddPotentialInterference(node, conflicting); - AddPotentialInterference(conflicting, node); + if (CheckInputOutputCanOverlap(conflicting, node)) { + // We do not add an interference, because the instruction represented by `node` allows + // its output to share a register with an input, represented here by `conflicting`. + } else { + AddPotentialInterference(node, conflicting, guaranteed_not_interfering_yet); + } } - DCHECK_EQ(live.count(node), 0u); - live.insert(node); + DCHECK(std::find(live.begin(), live.end(), node) == live.end()); + live.push_back(node); } else { // End of range. - DCHECK_EQ(live.count(node), 1u); - live.erase(node); + auto live_it = std::find(live.begin(), live.end(), node); + DCHECK(live_it != live.end()); + live.erase(live_it); } } DCHECK(live.empty()); } -// The order in which we color nodes is vital to both correctness (forward -// progress) and code quality. Specifically, we must prioritize intervals -// that require registers, and after that we must prioritize short intervals. -// That way, if we fail to color a node, it either won't require a register, -// or it will be a long interval that can be split in order to make the -// interference graph sparser. 
-// TODO: May also want to consider: -// - Loop depth -// - Constants (since they can be rematerialized) -// - Allocated spill slots -static bool GreaterNodePriority(const InterferenceNode* lhs, - const InterferenceNode* rhs) { - LiveInterval* lhs_interval = lhs->GetInterval(); - LiveInterval* rhs_interval = rhs->GetInterval(); +void ColoringIteration::CreateCoalesceOpportunity(InterferenceNode* a, + InterferenceNode* b, + CoalesceKind kind, + size_t position) { + DCHECK_EQ(a->IsPair(), b->IsPair()) + << "Nodes of different memory widths should never be coalesced"; + CoalesceOpportunity* opportunity = + new (allocator_) CoalesceOpportunity(a, b, kind, position, register_allocator_->liveness_); + a->AddCoalesceOpportunity(opportunity); + b->AddCoalesceOpportunity(opportunity); + coalesce_worklist_.push(opportunity); +} - // (1) Choose the interval that requires a register. - if (lhs_interval->RequiresRegister() != rhs_interval->RequiresRegister()) { - return lhs_interval->RequiresRegister(); - } +// When looking for coalesce opportunities, we use the interval_node_map_ to find the node +// corresponding to an interval. Note that not all intervals are in this map, notably the parents +// of constants and stack arguments. (However, these interval should not be involved in coalesce +// opportunities anyway, because they're not going to be in registers.) +void ColoringIteration::FindCoalesceOpportunities() { + DCHECK(coalesce_worklist_.empty()); - // (2) Choose the interval that has a shorter life span. - if (lhs_interval->GetLength() != rhs_interval->GetLength()) { - return lhs_interval->GetLength() < rhs_interval->GetLength(); - } + for (InterferenceNode* node : prunable_nodes_) { + LiveInterval* interval = node->GetInterval(); + + // Coalesce siblings. + LiveInterval* next_sibling = interval->GetNextSibling(); + if (next_sibling != nullptr && interval->GetEnd() == next_sibling->GetStart()) { + auto it = interval_node_map_.Find(next_sibling); + if (it != interval_node_map_.end()) { + InterferenceNode* sibling_node = it->second; + CreateCoalesceOpportunity(node, + sibling_node, + CoalesceKind::kAdjacentSibling, + interval->GetEnd()); + } + } + + // Coalesce fixed outputs with this interval if this interval is an adjacent sibling. + LiveInterval* parent = interval->GetParent(); + if (parent->HasRegister() + && parent->GetNextSibling() == interval + && parent->GetEnd() == interval->GetStart()) { + auto it = interval_node_map_.Find(parent); + if (it != interval_node_map_.end()) { + InterferenceNode* parent_node = it->second; + CreateCoalesceOpportunity(node, + parent_node, + CoalesceKind::kFixedOutputSibling, + parent->GetEnd()); + } + } + + // Try to prevent moves across blocks. + // Note that this does not lead to many succeeding coalesce attempts, so could be removed + // if found to add to compile time. + const SsaLivenessAnalysis& liveness = register_allocator_->liveness_; + if (interval->IsSplit() && liveness.IsAtBlockBoundary(interval->GetStart() / 2)) { + // If the start of this interval is at a block boundary, we look at the + // location of the interval in blocks preceding the block this interval + // starts at. This can avoid a move between the two blocks. 
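// Editorial illustration (not part of the patch): if a split sibling begins exactly at the
// start of block B, and the parent interval already has a sibling live at the end of a
// predecessor of B, coalescing the two lets them share a register and removes the move that
// would otherwise be inserted on that control-flow edge.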
+ HBasicBlock* block = liveness.GetBlockFromPosition(interval->GetStart() / 2); + for (HBasicBlock* predecessor : block->GetPredecessors()) { + size_t position = predecessor->GetLifetimeEnd() - 1; + LiveInterval* existing = interval->GetParent()->GetSiblingAt(position); + if (existing != nullptr) { + auto it = interval_node_map_.Find(existing); + if (it != interval_node_map_.end()) { + InterferenceNode* existing_node = it->second; + CreateCoalesceOpportunity(node, + existing_node, + CoalesceKind::kNonlinearControlFlow, + position); + } + } + } + } + + // Coalesce phi inputs with the corresponding output. + HInstruction* defined_by = interval->GetDefinedBy(); + if (defined_by != nullptr && defined_by->IsPhi()) { + const ArenaVector<HBasicBlock*>& predecessors = defined_by->GetBlock()->GetPredecessors(); + HInputsRef inputs = defined_by->GetInputs(); + + for (size_t i = 0, e = inputs.size(); i < e; ++i) { + // We want the sibling at the end of the appropriate predecessor block. + size_t position = predecessors[i]->GetLifetimeEnd() - 1; + LiveInterval* input_interval = inputs[i]->GetLiveInterval()->GetSiblingAt(position); + + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, input_node, CoalesceKind::kPhi, position); + } + } + } + + // Coalesce output with first input when policy is kSameAsFirstInput. + if (defined_by != nullptr) { + Location out = defined_by->GetLocations()->Out(); + if (out.IsUnallocated() && out.GetPolicy() == Location::kSameAsFirstInput) { + LiveInterval* input_interval + = defined_by->InputAt(0)->GetLiveInterval()->GetSiblingAt(interval->GetStart() - 1); + // TODO: Could we consider lifetime holes here? + if (input_interval->GetEnd() == interval->GetStart()) { + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, + input_node, + CoalesceKind::kFirstInput, + interval->GetStart()); + } + } + } + } + + // An interval that starts an instruction (that is, it is not split), may + // re-use the registers used by the inputs of that instruction, based on the + // location summary. + if (defined_by != nullptr) { + DCHECK(!interval->IsSplit()); + LocationSummary* locations = defined_by->GetLocations(); + if (!locations->OutputCanOverlapWithInputs()) { + HInputsRef inputs = defined_by->GetInputs(); + for (size_t i = 0; i < inputs.size(); ++i) { + size_t def_point = defined_by->GetLifetimePosition(); + // TODO: Getting the sibling at the def_point might not be quite what we want + // for fixed inputs, since the use will be *at* the def_point rather than after. + LiveInterval* input_interval = inputs[i]->GetLiveInterval()->GetSiblingAt(def_point); + if (input_interval != nullptr && + input_interval->HasHighInterval() == interval->HasHighInterval()) { + auto it = interval_node_map_.Find(input_interval); + if (it != interval_node_map_.end()) { + InterferenceNode* input_node = it->second; + CreateCoalesceOpportunity(node, + input_node, + CoalesceKind::kAnyInput, + interval->GetStart()); + } + } + } + } + } + + // Try to prevent moves into fixed input locations. + UsePosition* use = interval->GetFirstUse(); + for (; use != nullptr && use->GetPosition() <= interval->GetStart(); use = use->GetNext()) { + // Skip past uses before the start of this interval. 
+ } + for (; use != nullptr && use->GetPosition() <= interval->GetEnd(); use = use->GetNext()) { + HInstruction* user = use->GetUser(); + if (user == nullptr) { + // User may be null for certain intervals, such as temp intervals. + continue; + } + LocationSummary* locations = user->GetLocations(); + Location input = locations->InAt(use->GetInputIndex()); + if (input.IsRegister() || input.IsFpuRegister()) { + // TODO: Could try to handle pair interval too, but coalescing with fixed pair nodes + // is currently not supported. + InterferenceNode* fixed_node = input.IsRegister() + ? register_allocator_->physical_core_nodes_[input.reg()] + : register_allocator_->physical_fp_nodes_[input.reg()]; + CreateCoalesceOpportunity(node, + fixed_node, + CoalesceKind::kFixedInput, + user->GetLifetimePosition()); + } + } + } // for node in prunable_nodes +} - // (3) Just choose the interval based on a deterministic ordering. - return InterferenceNode::CmpPtr(lhs, rhs); +static bool IsLowDegreeNode(InterferenceNode* node, size_t num_regs) { + return node->GetOutDegree() < num_regs; } -void RegisterAllocatorGraphColor::PruneInterferenceGraph( - const ArenaVector<InterferenceNode*>& prunable_nodes, - size_t num_regs, - ArenaStdStack<InterferenceNode*>* pruned_nodes) { +static bool IsHighDegreeNode(InterferenceNode* node, size_t num_regs) { + return !IsLowDegreeNode(node, num_regs); +} + +void ColoringIteration::PruneInterferenceGraph() { + DCHECK(pruned_nodes_.empty() + && simplify_worklist_.empty() + && freeze_worklist_.empty() + && spill_worklist_.empty()); // When pruning the graph, we refer to nodes with degree less than num_regs as low degree nodes, // and all others as high degree nodes. The distinction is important: low degree nodes are // guaranteed a color, while high degree nodes are not. - // Low-degree nodes are guaranteed a color, so worklist order does not matter. - ArenaDeque<InterferenceNode*> low_degree_worklist( - coloring_attempt_allocator_->Adapter(kArenaAllocRegisterAllocator)); - - // If we have to prune from the high-degree worklist, we cannot guarantee - // the pruned node a color. So, we order the worklist by priority. - ArenaSet<InterferenceNode*, decltype(&GreaterNodePriority)> high_degree_worklist( - GreaterNodePriority, coloring_attempt_allocator_->Adapter(kArenaAllocRegisterAllocator)); - - // Build worklists. - for (InterferenceNode* node : prunable_nodes) { - DCHECK(!node->GetInterval()->HasRegister()) - << "Fixed nodes should never be pruned"; - DCHECK(!node->GetInterval()->IsSlowPathSafepoint()) - << "Safepoint nodes should never be pruned"; - if (node->GetOutDegree() < num_regs) { - low_degree_worklist.push_back(node); - } else { - high_degree_worklist.insert(node); - } - } - - // Helper function to prune an interval from the interference graph, - // which includes updating the worklists. - auto prune_node = [this, - num_regs, - &pruned_nodes, - &low_degree_worklist, - &high_degree_worklist] (InterferenceNode* node) { - DCHECK(!node->GetInterval()->HasRegister()); - pruned_nodes->push(node); - for (InterferenceNode* adjacent : node->GetAdjacentNodes()) { - DCHECK(!adjacent->GetInterval()->IsSlowPathSafepoint()) - << "Nodes should never interfere with synthesized safepoint nodes"; - if (adjacent->GetInterval()->HasRegister()) { - // No effect on pre-colored nodes; they're never pruned. + // Build worklists. Note that the coalesce worklist has already been + // filled by FindCoalesceOpportunities(). 
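// Editorial summary (not part of the patch): the classification below follows the textbook
// scheme. A low-degree node with no associated coalesce opportunities is trivially colorable
// and goes to the simplify worklist; a low-degree, move-related node waits in the freeze
// worklist so coalescing can be attempted first; every high-degree node goes to the spill
// worklist, from which nodes are pruned lowest-priority first, since a node pruned earlier is
// more likely to end up spilled.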
+ for (InterferenceNode* node : prunable_nodes_) { + DCHECK(!node->IsPrecolored()) << "Fixed nodes should never be pruned"; + DCHECK(!node->GetInterval()->IsSlowPathSafepoint()) << "Safepoint nodes should never be pruned"; + if (IsLowDegreeNode(node, num_regs_)) { + if (node->GetCoalesceOpportunities().empty()) { + // Simplify Worklist. + node->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(node); } else { - bool was_high_degree = adjacent->GetOutDegree() >= num_regs; - DCHECK(adjacent->ContainsInterference(node)) - << "Missing incoming interference edge from non-fixed node"; - adjacent->RemoveInterference(node); - if (was_high_degree && adjacent->GetOutDegree() < num_regs) { - // This is a transition from high degree to low degree. - DCHECK_EQ(high_degree_worklist.count(adjacent), 1u); - high_degree_worklist.erase(adjacent); - low_degree_worklist.push_back(adjacent); - } + // Freeze Worklist. + node->stage = NodeStage::kFreezeWorklist; + freeze_worklist_.push_back(node); } + } else { + // Spill worklist. + node->stage = NodeStage::kSpillWorklist; + spill_worklist_.push(node); } - }; + } // Prune graph. - while (!low_degree_worklist.empty() || !high_degree_worklist.empty()) { - while (!low_degree_worklist.empty()) { - InterferenceNode* node = low_degree_worklist.front(); - // TODO: pop_back() should work as well, but it doesn't; we get a + // Note that we do not remove a node from its current worklist if it moves to another, so it may + // be in multiple worklists at once; the node's `phase` says which worklist it is really in. + while (true) { + if (!simplify_worklist_.empty()) { + // Prune low-degree nodes. + // TODO: pop_back() should work as well, but it didn't; we get a // failed check while pruning. We should look into this. - low_degree_worklist.pop_front(); - prune_node(node); - } - if (!high_degree_worklist.empty()) { - // We prune the lowest-priority node, because pruning a node earlier + InterferenceNode* node = simplify_worklist_.front(); + simplify_worklist_.pop_front(); + DCHECK_EQ(node->stage, NodeStage::kSimplifyWorklist) << "Cannot move from simplify list"; + DCHECK_LT(node->GetOutDegree(), num_regs_) << "Nodes in simplify list should be low degree"; + DCHECK(!node->IsMoveRelated()) << "Nodes in simplify list should not be move related"; + PruneNode(node); + } else if (!coalesce_worklist_.empty()) { + // Coalesce. + CoalesceOpportunity* opportunity = coalesce_worklist_.top(); + coalesce_worklist_.pop(); + if (opportunity->stage == CoalesceStage::kWorklist) { + Coalesce(opportunity); + } + } else if (!freeze_worklist_.empty()) { + // Freeze moves and prune a low-degree move-related node. + InterferenceNode* node = freeze_worklist_.front(); + freeze_worklist_.pop_front(); + if (node->stage == NodeStage::kFreezeWorklist) { + DCHECK_LT(node->GetOutDegree(), num_regs_) << "Nodes in freeze list should be low degree"; + DCHECK(node->IsMoveRelated()) << "Nodes in freeze list should be move related"; + FreezeMoves(node); + PruneNode(node); + } + } else if (!spill_worklist_.empty()) { + // We spill the lowest-priority node, because pruning a node earlier // gives it a higher chance of being spilled. 
- InterferenceNode* node = *high_degree_worklist.rbegin(); - high_degree_worklist.erase(node); - prune_node(node); + InterferenceNode* node = spill_worklist_.top(); + spill_worklist_.pop(); + if (node->stage == NodeStage::kSpillWorklist) { + DCHECK_GE(node->GetOutDegree(), num_regs_) << "Nodes in spill list should be high degree"; + FreezeMoves(node); + PruneNode(node); + } + } else { + // Pruning complete. + break; + } + } + DCHECK_EQ(prunable_nodes_.size(), pruned_nodes_.size()); +} + +void ColoringIteration::EnableCoalesceOpportunities(InterferenceNode* node) { + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kActive) { + opportunity->stage = CoalesceStage::kWorklist; + coalesce_worklist_.push(opportunity); + } + } +} + +void ColoringIteration::PruneNode(InterferenceNode* node) { + DCHECK_NE(node->stage, NodeStage::kPruned); + DCHECK(!node->IsPrecolored()); + node->stage = NodeStage::kPruned; + pruned_nodes_.push(node); + + for (InterferenceNode* adj : node->GetAdjacentNodes()) { + DCHECK(!adj->GetInterval()->IsSlowPathSafepoint()) + << "Nodes should never interfere with synthesized safepoint nodes"; + DCHECK_NE(adj->stage, NodeStage::kPruned) << "Should be no interferences with pruned nodes"; + + if (adj->IsPrecolored()) { + // No effect on pre-colored nodes; they're never pruned. + } else { + // Remove the interference. + bool was_high_degree = IsHighDegreeNode(adj, num_regs_); + DCHECK(adj->ContainsInterference(node)) + << "Missing reflexive interference from non-fixed node"; + adj->RemoveInterference(node); + + // Handle transitions from high degree to low degree. + if (was_high_degree && IsLowDegreeNode(adj, num_regs_)) { + EnableCoalesceOpportunities(adj); + for (InterferenceNode* adj_adj : adj->GetAdjacentNodes()) { + EnableCoalesceOpportunities(adj_adj); + } + + DCHECK_EQ(adj->stage, NodeStage::kSpillWorklist); + if (adj->IsMoveRelated()) { + adj->stage = NodeStage::kFreezeWorklist; + freeze_worklist_.push_back(adj); + } else { + adj->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(adj); + } + } + } + } +} + +void ColoringIteration::CheckTransitionFromFreezeWorklist(InterferenceNode* node) { + if (IsLowDegreeNode(node, num_regs_) && !node->IsMoveRelated()) { + DCHECK_EQ(node->stage, NodeStage::kFreezeWorklist); + node->stage = NodeStage::kSimplifyWorklist; + simplify_worklist_.push_back(node); + } +} + +void ColoringIteration::FreezeMoves(InterferenceNode* node) { + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kDefunct) { + // Constrained moves should remain constrained, since they will not be considered + // during last-chance coalescing. + } else { + opportunity->stage = CoalesceStage::kInactive; + } + InterferenceNode* other = opportunity->node_a->GetAlias() == node + ? opportunity->node_b->GetAlias() + : opportunity->node_a->GetAlias(); + if (other != node && other->stage == NodeStage::kFreezeWorklist) { + DCHECK(IsLowDegreeNode(node, num_regs_)); + CheckTransitionFromFreezeWorklist(other); + } + } +} + +bool ColoringIteration::PrecoloredHeuristic(InterferenceNode* from, + InterferenceNode* into) { + if (!into->IsPrecolored()) { + // The uncolored heuristic will cover this case. 
+ return false; + } + if (from->IsPair() || into->IsPair()) { + // TODO: Merging from a pair node is currently not supported, since fixed pair nodes + // are currently represented as two single fixed nodes in the graph, and `into` is + // only one of them. (We may lose the implicit connections to the second one in a merge.) + return false; + } + + // If all adjacent nodes of `from` are "ok", then we can conservatively merge with `into`. + // Reasons an adjacent node `adj` can be "ok": + // (1) If `adj` is low degree, interference with `into` will not affect its existing + // colorable guarantee. (Notice that coalescing cannot increase its degree.) + // (2) If `adj` is pre-colored, it already interferes with `into`. See (3). + // (3) If there's already an interference with `into`, coalescing will not add interferences. + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + if (IsLowDegreeNode(adj, num_regs_) || adj->IsPrecolored() || adj->ContainsInterference(into)) { + // Ok. + } else { + return false; + } + } + return true; +} + +bool ColoringIteration::UncoloredHeuristic(InterferenceNode* from, + InterferenceNode* into) { + if (into->IsPrecolored()) { + // The pre-colored heuristic will handle this case. + return false; + } + + // Arbitrary cap to improve compile time. Tests show that this has negligible affect + // on generated code. + if (from->GetOutDegree() + into->GetOutDegree() > 2 * num_regs_) { + return false; + } + + // It's safe to coalesce two nodes if the resulting node has fewer than `num_regs` neighbors + // of high degree. (Low degree neighbors can be ignored, because they will eventually be + // pruned from the interference graph in the simplify stage.) + size_t high_degree_interferences = 0; + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + if (IsHighDegreeNode(adj, num_regs_)) { + high_degree_interferences += from->EdgeWeightWith(adj); + } + } + for (InterferenceNode* adj : into->GetAdjacentNodes()) { + if (IsHighDegreeNode(adj, num_regs_)) { + if (from->ContainsInterference(adj)) { + // We've already counted this adjacent node. + // Furthermore, its degree will decrease if coalescing succeeds. Thus, it's possible that + // we should not have counted it at all. (This extends the textbook Briggs coalescing test, + // but remains conservative.) + if (adj->GetOutDegree() - into->EdgeWeightWith(adj) < num_regs_) { + high_degree_interferences -= from->EdgeWeightWith(adj); + } + } else { + high_degree_interferences += into->EdgeWeightWith(adj); + } + } + } + + return high_degree_interferences < num_regs_; +} + +void ColoringIteration::Combine(InterferenceNode* from, + InterferenceNode* into) { + from->SetAlias(into); + + // Add interferences. + for (InterferenceNode* adj : from->GetAdjacentNodes()) { + bool was_low_degree = IsLowDegreeNode(adj, num_regs_); + AddPotentialInterference(adj, into, /*guaranteed_not_interfering_yet*/ false); + if (was_low_degree && IsHighDegreeNode(adj, num_regs_)) { + // This is a (temporary) transition to a high degree node. Its degree will decrease again + // when we prune `from`, but it's best to be consistent about the current worklist. + adj->stage = NodeStage::kSpillWorklist; + spill_worklist_.push(adj); + } + } + + // Add coalesce opportunities. + for (CoalesceOpportunity* opportunity : from->GetCoalesceOpportunities()) { + if (opportunity->stage != CoalesceStage::kDefunct) { + into->AddCoalesceOpportunity(opportunity); } } + EnableCoalesceOpportunities(from); + + // Prune and update worklists. 
+  PruneNode(from);
+  if (IsLowDegreeNode(into, num_regs_)) {
+    // Coalesce(...) takes care of checking for a transition to the simplify worklist.
+    DCHECK_EQ(into->stage, NodeStage::kFreezeWorklist);
+  } else if (into->stage == NodeStage::kFreezeWorklist) {
+    // This is a transition to a high degree node.
+    into->stage = NodeStage::kSpillWorklist;
+    spill_worklist_.push(into);
+  } else {
+    DCHECK(into->stage == NodeStage::kSpillWorklist || into->stage == NodeStage::kPrecolored);
+  }
+}
+
+void ColoringIteration::Coalesce(CoalesceOpportunity* opportunity) {
+  InterferenceNode* from = opportunity->node_a->GetAlias();
+  InterferenceNode* into = opportunity->node_b->GetAlias();
+  DCHECK_NE(from->stage, NodeStage::kPruned);
+  DCHECK_NE(into->stage, NodeStage::kPruned);
+
+  if (from->IsPrecolored()) {
+    // If we have one pre-colored node, make sure it's the `into` node.
+    std::swap(from, into);
+  }
+
+  if (from == into) {
+    // These nodes have already been coalesced.
+    opportunity->stage = CoalesceStage::kDefunct;
+    CheckTransitionFromFreezeWorklist(from);
+  } else if (from->IsPrecolored() || from->ContainsInterference(into)) {
+    // These nodes interfere.
+    opportunity->stage = CoalesceStage::kDefunct;
+    CheckTransitionFromFreezeWorklist(from);
+    CheckTransitionFromFreezeWorklist(into);
+  } else if (PrecoloredHeuristic(from, into)
+          || UncoloredHeuristic(from, into)) {
+    // We can coalesce these nodes.
+    opportunity->stage = CoalesceStage::kDefunct;
+    Combine(from, into);
+    CheckTransitionFromFreezeWorklist(into);
+  } else {
+    // We cannot coalesce, but we may be able to later.
+    opportunity->stage = CoalesceStage::kActive;
+  }
 }
 
 // Build a mask with a bit set for each register assigned to some
@@ -888,35 +1783,115 @@ static std::bitset<kMaxNumRegs> BuildConflictMask(Container& intervals) {
   return conflict_mask;
 }
 
-bool RegisterAllocatorGraphColor::ColorInterferenceGraph(
-    ArenaStdStack<InterferenceNode*>* pruned_nodes,
-    size_t num_regs) {
-  DCHECK_LE(num_regs, kMaxNumRegs) << "kMaxNumRegs is too small";
+bool RegisterAllocatorGraphColor::IsCallerSave(size_t reg, bool processing_core_regs) {
+  return processing_core_regs
+      ? !codegen_->IsCoreCalleeSaveRegister(reg)
+      : !codegen_->IsFloatingPointCalleeSaveRegister(reg);
+}
+
+static bool RegisterIsAligned(size_t reg) {
+  return reg % 2 == 0;
+}
+
+static size_t FindFirstZeroInConflictMask(std::bitset<kMaxNumRegs> conflict_mask) {
+  // We use CTZ (count trailing zeros) to quickly find the lowest 0 bit.
+  // Note that CTZ is undefined if all bits are 0, so we special-case it.
+  return conflict_mask.all() ? conflict_mask.size() : CTZ(~conflict_mask.to_ulong());
+}
+
+bool ColoringIteration::ColorInterferenceGraph() {
+  DCHECK_LE(num_regs_, kMaxNumRegs) << "kMaxNumRegs is too small";
   ArenaVector<LiveInterval*> colored_intervals(
-      coloring_attempt_allocator_->Adapter(kArenaAllocRegisterAllocator));
+      allocator_->Adapter(kArenaAllocRegisterAllocator));
   bool successful = true;
 
-  while (!pruned_nodes->empty()) {
-    InterferenceNode* node = pruned_nodes->top();
-    pruned_nodes->pop();
+  while (!pruned_nodes_.empty()) {
+    InterferenceNode* node = pruned_nodes_.top();
+    pruned_nodes_.pop();
     LiveInterval* interval = node->GetInterval();
-
-    // Search for free register(s).
-    // Note that the graph coloring allocator assumes that pair intervals are aligned here,
-    // excluding pre-colored pair intervals (which can currently be unaligned on x86).
- std::bitset<kMaxNumRegs> conflict_mask = BuildConflictMask(node->GetAdjacentNodes()); size_t reg = 0; - if (interval->HasHighInterval()) { - while (reg < num_regs - 1 && (conflict_mask[reg] || conflict_mask[reg + 1])) { - reg += 2; + + InterferenceNode* alias = node->GetAlias(); + if (alias != node) { + // This node was coalesced with another. + LiveInterval* alias_interval = alias->GetInterval(); + if (alias_interval->HasRegister()) { + reg = alias_interval->GetRegister(); + DCHECK(!BuildConflictMask(node->GetAdjacentNodes())[reg]) + << "This node conflicts with the register it was coalesced with"; + } else { + DCHECK(false) << node->GetOutDegree() << " " << alias->GetOutDegree() << " " + << "Move coalescing was not conservative, causing a node to be coalesced " + << "with another node that could not be colored"; + if (interval->RequiresRegister()) { + successful = false; + } } } else { - // We use CTZ (count trailing zeros) to quickly find the lowest available register. - // Note that CTZ is undefined for 0, so we special-case it. - reg = conflict_mask.all() ? conflict_mask.size() : CTZ(~conflict_mask.to_ulong()); + // Search for free register(s). + std::bitset<kMaxNumRegs> conflict_mask = BuildConflictMask(node->GetAdjacentNodes()); + if (interval->HasHighInterval()) { + // Note that the graph coloring allocator assumes that pair intervals are aligned here, + // excluding pre-colored pair intervals (which can currently be unaligned on x86). If we + // change the alignment requirements here, we will have to update the algorithm (e.g., + // be more conservative about the weight of edges adjacent to pair nodes.) + while (reg < num_regs_ - 1 && (conflict_mask[reg] || conflict_mask[reg + 1])) { + reg += 2; + } + + // Try to use a caller-save register first. + for (size_t i = 0; i < num_regs_ - 1; i += 2) { + bool low_caller_save = register_allocator_->IsCallerSave(i, processing_core_regs_); + bool high_caller_save = register_allocator_->IsCallerSave(i + 1, processing_core_regs_); + if (!conflict_mask[i] && !conflict_mask[i + 1]) { + if (low_caller_save && high_caller_save) { + reg = i; + break; + } else if (low_caller_save || high_caller_save) { + reg = i; + // Keep looking to try to get both parts in caller-save registers. + } + } + } + } else { + // Not a pair interval. + reg = FindFirstZeroInConflictMask(conflict_mask); + + // Try to use caller-save registers first. + for (size_t i = 0; i < num_regs_; ++i) { + if (!conflict_mask[i] && register_allocator_->IsCallerSave(i, processing_core_regs_)) { + reg = i; + break; + } + } + } + + // Last-chance coalescing. + for (CoalesceOpportunity* opportunity : node->GetCoalesceOpportunities()) { + if (opportunity->stage == CoalesceStage::kDefunct) { + continue; + } + LiveInterval* other_interval = opportunity->node_a->GetAlias() == node + ? opportunity->node_b->GetAlias()->GetInterval() + : opportunity->node_a->GetAlias()->GetInterval(); + if (other_interval->HasRegister()) { + size_t coalesce_register = other_interval->GetRegister(); + if (interval->HasHighInterval()) { + if (!conflict_mask[coalesce_register] && + !conflict_mask[coalesce_register + 1] && + RegisterIsAligned(coalesce_register)) { + reg = coalesce_register; + break; + } + } else if (!conflict_mask[coalesce_register]) { + reg = coalesce_register; + break; + } + } + } } - if (reg < (interval->HasHighInterval() ? num_regs - 1 : num_regs)) { + if (reg < (interval->HasHighInterval() ? num_regs_ - 1 : num_regs_)) { // Assign register. 
DCHECK(!interval->HasRegister()); interval->SetRegister(reg); @@ -930,12 +1905,12 @@ bool RegisterAllocatorGraphColor::ColorInterferenceGraph( // The interference graph is too dense to color. Make it sparser by // splitting this live interval. successful = false; - SplitAtRegisterUses(interval); + register_allocator_->SplitAtRegisterUses(interval); // We continue coloring, because there may be additional intervals that cannot // be colored, and that we should split. } else { // Spill. - AllocateSpillSlotFor(interval); + register_allocator_->AllocateSpillSlotFor(interval); } } diff --git a/compiler/optimizing/register_allocator_graph_color.h b/compiler/optimizing/register_allocator_graph_color.h index 0b5af96b40..9dddcea685 100644 --- a/compiler/optimizing/register_allocator_graph_color.h +++ b/compiler/optimizing/register_allocator_graph_color.h @@ -34,6 +34,8 @@ class HParallelMove; class Location; class SsaLivenessAnalysis; class InterferenceNode; +struct CoalesceOpportunity; +enum class CoalesceKind; /** * A graph coloring register allocator. @@ -60,6 +62,25 @@ class InterferenceNode; * sparser, so that future coloring attempts may succeed. * - If the node does not require a register, we simply assign it a location on the stack. * + * If iterative move coalescing is enabled, the algorithm also attempts to conservatively + * combine nodes in the graph that would prefer to have the same color. (For example, the output + * of a phi instruction would prefer to have the same register as at least one of its inputs.) + * There are several additional steps involved with this: + * - We look for coalesce opportunities by examining each live interval, a step similar to that + * used by linear scan when looking for register hints. + * - When pruning the graph, we maintain a worklist of coalesce opportunities, as well as a worklist + * of low degree nodes that have associated coalesce opportunities. Only when we run out of + * coalesce opportunities do we start pruning coalesce-associated nodes. + * - When pruning a node, if any nodes transition from high degree to low degree, we add + * associated coalesce opportunities to the worklist, since these opportunities may now succeed. + * - Whether two nodes can be combined is decided by two different heuristics--one used when + * coalescing uncolored nodes, and one used for coalescing an uncolored node with a colored node. + * It is vital that we only combine two nodes if the node that remains is guaranteed to receive + * a color. This is because additionally spilling is more costly than failing to coalesce. + * - Even if nodes are not coalesced while pruning, we keep the coalesce opportunities around + * to be used as last-chance register hints when coloring. If nothing else, we try to use + * caller-save registers before callee-save registers. + * * A good reference for graph coloring register allocation is * "Modern Compiler Implementation in Java" (Andrew W. Appel, 2nd Edition). 
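 *
 * (Editorial note, not part of the original comment: the two combining heuristics mentioned
 * above correspond to the conservative coalescing tests described in that reference. The
 * uncolored/uncolored case is essentially the Briggs test, in which the combined node must be
 * left with fewer than k neighbors of significant degree; the uncolored/pre-colored case is
 * essentially the George test, in which every neighbor of the uncolored node must be low
 * degree, pre-colored, or already interfering with the pre-colored node.)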
*/ @@ -67,7 +88,8 @@ class RegisterAllocatorGraphColor : public RegisterAllocator { public: RegisterAllocatorGraphColor(ArenaAllocator* allocator, CodeGenerator* codegen, - const SsaLivenessAnalysis& analysis); + const SsaLivenessAnalysis& analysis, + bool iterative_move_coalescing = true); ~RegisterAllocatorGraphColor() OVERRIDE {} void AllocateRegisters() OVERRIDE; @@ -116,26 +138,7 @@ class RegisterAllocatorGraphColor : public RegisterAllocator { void BlockRegister(Location location, size_t start, size_t end); void BlockRegisters(size_t start, size_t end, bool caller_save_only = false); - // Use the intervals collected from instructions to construct an - // interference graph mapping intervals to adjacency lists. - // Also, collect synthesized safepoint nodes, used to keep - // track of live intervals across safepoints. - void BuildInterferenceGraph(const ArenaVector<LiveInterval*>& intervals, - ArenaVector<InterferenceNode*>* prunable_nodes, - ArenaVector<InterferenceNode*>* safepoints); - - // Prune nodes from the interference graph to be colored later. Build - // a stack (pruned_nodes) containing these intervals in an order determined - // by various heuristics. - void PruneInterferenceGraph(const ArenaVector<InterferenceNode*>& prunable_nodes, - size_t num_registers, - ArenaStdStack<InterferenceNode*>* pruned_nodes); - - // Process pruned_intervals to color the interference graph, spilling when - // necessary. Return true if successful. Else, split some intervals to make - // the interference graph sparser. - bool ColorInterferenceGraph(ArenaStdStack<InterferenceNode*>* pruned_nodes, - size_t num_registers); + bool IsCallerSave(size_t reg, bool processing_core_regs); // Return the maximum number of registers live at safepoints, // based on the outgoing interference edges of safepoint nodes. @@ -145,6 +148,10 @@ class RegisterAllocatorGraphColor : public RegisterAllocator { // and make sure it's ready to be spilled to the stack. void AllocateSpillSlotFor(LiveInterval* interval); + // Whether iterative move coalescing should be performed. Iterative move coalescing + // improves code quality, but increases compile time. + const bool iterative_move_coalescing_; + // Live intervals, split by kind (core and floating point). // These should not contain high intervals, as those are represented by // the corresponding low interval throughout register allocation. @@ -157,10 +164,10 @@ class RegisterAllocatorGraphColor : public RegisterAllocator { // Safepoints, saved for special handling while processing instructions. ArenaVector<HInstruction*> safepoints_; - // Live intervals for specific registers. These become pre-colored nodes + // Interference nodes representing specific registers. These are "pre-colored" nodes // in the interference graph. - ArenaVector<LiveInterval*> physical_core_intervals_; - ArenaVector<LiveInterval*> physical_fp_intervals_; + ArenaVector<InterferenceNode*> physical_core_nodes_; + ArenaVector<InterferenceNode*> physical_fp_nodes_; // Allocated stack slot counters. size_t int_spill_slot_counter_; @@ -184,10 +191,7 @@ class RegisterAllocatorGraphColor : public RegisterAllocator { size_t max_safepoint_live_core_regs_; size_t max_safepoint_live_fp_regs_; - // An arena allocator used for a single graph coloring attempt. - // Many data structures are cleared between graph coloring attempts, so we reduce - // total memory usage by using a new arena allocator for each attempt. 
- ArenaAllocator* coloring_attempt_allocator_; + friend class ColoringIteration; DISALLOW_COPY_AND_ASSIGN(RegisterAllocatorGraphColor); }; diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h index 346753b775..92788fe6b8 100644 --- a/compiler/optimizing/ssa_liveness_analysis.h +++ b/compiler/optimizing/ssa_liveness_analysis.h @@ -514,7 +514,9 @@ class LiveInterval : public ArenaObject<kArenaAllocSsaLiveness> { // Whether the interval requires a register rather than a stack location. // If needed for performance, this could be cached. - bool RequiresRegister() const { return FirstRegisterUse() != kNoLifetime; } + bool RequiresRegister() const { + return !HasRegister() && FirstRegisterUse() != kNoLifetime; + } size_t FirstUseAfter(size_t position) const { if (is_temp_) { diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h index ff0bbafb9a..86548e153b 100644 --- a/compiler/utils/arm/assembler_arm.h +++ b/compiler/utils/arm/assembler_arm.h @@ -680,6 +680,8 @@ class ArmAssembler : public Assembler { virtual void vpushd(DRegister reg, int nregs, Condition cond = AL) = 0; virtual void vpops(SRegister reg, int nregs, Condition cond = AL) = 0; virtual void vpopd(DRegister reg, int nregs, Condition cond = AL) = 0; + virtual void vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) = 0; + virtual void vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) = 0; // Branch instructions. virtual void b(Label* label, Condition cond = AL) = 0; diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc index 6f9d5f32af..b8eb60c387 100644 --- a/compiler/utils/arm/assembler_arm32.cc +++ b/compiler/utils/arm/assembler_arm32.cc @@ -1106,6 +1106,18 @@ void Arm32Assembler::vpopd(DRegister reg, int nregs, Condition cond) { } +void Arm32Assembler::vldmiad(Register, DRegister, int, Condition) { + LOG(FATAL) << "Unimplemented."; + UNREACHABLE(); +} + + +void Arm32Assembler::vstmiad(Register, DRegister, int, Condition) { + LOG(FATAL) << "Unimplemented."; + UNREACHABLE(); +} + + void Arm32Assembler::EmitVPushPop(uint32_t reg, int nregs, bool push, bool dbl, Condition cond) { CHECK_NE(cond, kNoCondition); CHECK_GT(nregs, 0); diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h index 044eaa1edf..0cb6b171ce 100644 --- a/compiler/utils/arm/assembler_arm32.h +++ b/compiler/utils/arm/assembler_arm32.h @@ -212,6 +212,8 @@ class Arm32Assembler FINAL : public ArmAssembler { void vpushd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE; void vpops(SRegister reg, int nregs, Condition cond = AL) OVERRIDE; void vpopd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE; + void vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE; + void vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE; // Branch instructions. 
void b(Label* label, Condition cond = AL) OVERRIDE; diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc index ee69698ce8..ebdfc98554 100644 --- a/compiler/utils/arm/assembler_thumb2.cc +++ b/compiler/utils/arm/assembler_thumb2.cc @@ -3020,9 +3020,49 @@ void Thumb2Assembler::vpopd(DRegister reg, int nregs, Condition cond) { } +void Thumb2Assembler::vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond) { + int32_t rest = B23; + EmitVLdmOrStm(rest, + static_cast<uint32_t>(reg), + nregs, + base_reg, + /*is_load*/ true, + /*dbl*/ true, + cond); +} + + +void Thumb2Assembler::vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond) { + int32_t rest = B23; + EmitVLdmOrStm(rest, + static_cast<uint32_t>(reg), + nregs, + base_reg, + /*is_load*/ false, + /*dbl*/ true, + cond); +} + + void Thumb2Assembler::EmitVPushPop(uint32_t reg, int nregs, bool push, bool dbl, Condition cond) { + int32_t rest = B21 | (push ? B24 : B23); + EmitVLdmOrStm(rest, reg, nregs, SP, /*is_load*/ !push, dbl, cond); +} + + +void Thumb2Assembler::EmitVLdmOrStm(int32_t rest, + uint32_t reg, + int nregs, + Register rn, + bool is_load, + bool dbl, + Condition cond) { CheckCondition(cond); + DCHECK_GT(nregs, 0); + DCHECK_LE(reg + nregs, 32u); + DCHECK(!dbl || (nregs <= 16)); + uint32_t D; uint32_t Vd; if (dbl) { @@ -3034,14 +3074,17 @@ void Thumb2Assembler::EmitVPushPop(uint32_t reg, int nregs, bool push, bool dbl, D = reg & 1; Vd = (reg >> 1) & 15U /* 0b1111 */; } - int32_t encoding = B27 | B26 | B21 | B19 | B18 | B16 | - B11 | B9 | - (dbl ? B8 : 0) | - (push ? B24 : (B23 | B20)) | - 14U /* 0b1110 */ << 28 | - nregs << (dbl ? 1 : 0) | - D << 22 | - Vd << 12; + + int32_t encoding = rest | + 14U /* 0b1110 */ << 28 | + B27 | B26 | B11 | B9 | + (is_load ? B20 : 0) | + static_cast<int16_t>(rn) << 16 | + D << 22 | + Vd << 12 | + (dbl ? B8 : 0) | + nregs << (dbl ? 1 : 0); + Emit32(encoding); } diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h index 1c1c98b52b..13f3becb6d 100644 --- a/compiler/utils/arm/assembler_thumb2.h +++ b/compiler/utils/arm/assembler_thumb2.h @@ -258,6 +258,8 @@ class Thumb2Assembler FINAL : public ArmAssembler { void vpushd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE; void vpops(SRegister reg, int nregs, Condition cond = AL) OVERRIDE; void vpopd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE; + void vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE; + void vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE; // Branch instructions. void b(Label* label, Condition cond = AL); @@ -748,6 +750,14 @@ class Thumb2Assembler FINAL : public ArmAssembler { SRegister sn, SRegister sm); + void EmitVLdmOrStm(int32_t rest, + uint32_t reg, + int nregs, + Register rn, + bool is_load, + bool dbl, + Condition cond); + void EmitVFPddd(Condition cond, int32_t opcode, DRegister dd, diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc index 3ca37145d5..d0799d6112 100644 --- a/compiler/utils/arm/assembler_thumb2_test.cc +++ b/compiler/utils/arm/assembler_thumb2_test.cc @@ -1611,4 +1611,46 @@ TEST_F(AssemblerThumb2Test, LoadFromShiftedRegOffset) { DriverStr(expected, "LoadFromShiftedRegOffset"); } +TEST_F(AssemblerThumb2Test, VStmLdmPushPop) { + // Different D register numbers are used here, to test register encoding. 
+ // Source register number is encoded as M:Vm, destination register number is encoded as D:Vd, + // For source and destination registers which use D0..D15, the M bit and D bit should be 0. + // For source and destination registers which use D16..D32, the M bit and D bit should be 1. + // Different data types (signed and unsigned) are also tested. + __ vstmiad(arm::R0, arm::D0, 4); + __ vldmiad(arm::R1, arm::D9, 5); + __ vpopd(arm::D0, 4); + __ vpushd(arm::D9, 5); + __ vpops(arm::S0, 4); + __ vpushs(arm::S9, 5); + __ vpushs(arm::S16, 5); + __ vpushd(arm::D0, 16); + __ vpushd(arm::D1, 15); + __ vpushd(arm::D8, 16); + __ vpushd(arm::D31, 1); + __ vpushs(arm::S0, 32); + __ vpushs(arm::S1, 31); + __ vpushs(arm::S16, 16); + __ vpushs(arm::S31, 1); + + std::string expected = + "vstmia r0, {d0 - d3}\n" + "vldmia r1, {d9 - d13}\n" + "vpop {d0 - d3}\n" + "vpush {d9 - d13}\n" + "vpop {s0 - s3}\n" + "vpush {s9 - s13}\n" + "vpush {s16 - s20}\n" + "vpush {d0 - d15}\n" + "vpush {d1 - d15}\n" + "vpush {d8 - d23}\n" + "vpush {d31}\n" + "vpush {s0 - s31}\n" + "vpush {s1 - s31}\n" + "vpush {s16 - s31}\n" + "vpush {s31}\n"; + + DriverStr(expected, "VStmLdmPushPop"); +} + } // namespace art diff --git a/compiler/utils/arm/jni_macro_assembler_arm.cc b/compiler/utils/arm/jni_macro_assembler_arm.cc index c03981653e..af5ebb4ce8 100644 --- a/compiler/utils/arm/jni_macro_assembler_arm.cc +++ b/compiler/utils/arm/jni_macro_assembler_arm.cc @@ -243,14 +243,16 @@ void ArmJNIMacroAssembler::CopyRef(FrameOffset dest, FrameOffset src, ManagedReg } void ArmJNIMacroAssembler::LoadRef(ManagedRegister mdest, - ManagedRegister base, + ManagedRegister mbase, MemberOffset offs, bool unpoison_reference) { + ArmManagedRegister base = mbase.AsArm(); ArmManagedRegister dst = mdest.AsArm(); - CHECK(dst.IsCoreRegister() && dst.IsCoreRegister()) << dst; + CHECK(base.IsCoreRegister()) << base; + CHECK(dst.IsCoreRegister()) << dst; __ LoadFromOffset(kLoadWord, dst.AsCoreRegister(), - base.AsArm().AsCoreRegister(), + base.AsCoreRegister(), offs.Int32Value()); if (unpoison_reference) { __ MaybeUnpoisonHeapReference(dst.AsCoreRegister()); @@ -263,13 +265,16 @@ void ArmJNIMacroAssembler::LoadRef(ManagedRegister mdest, FrameOffset src) { __ LoadFromOffset(kLoadWord, dst.AsCoreRegister(), SP, src.Int32Value()); } -void ArmJNIMacroAssembler::LoadRawPtr(ManagedRegister mdest, ManagedRegister base, - Offset offs) { +void ArmJNIMacroAssembler::LoadRawPtr(ManagedRegister mdest, + ManagedRegister mbase, + Offset offs) { + ArmManagedRegister base = mbase.AsArm(); ArmManagedRegister dst = mdest.AsArm(); - CHECK(dst.IsCoreRegister() && dst.IsCoreRegister()) << dst; + CHECK(base.IsCoreRegister()) << base; + CHECK(dst.IsCoreRegister()) << dst; __ LoadFromOffset(kLoadWord, dst.AsCoreRegister(), - base.AsArm().AsCoreRegister(), + base.AsCoreRegister(), offs.Int32Value()); } @@ -530,8 +535,9 @@ void ArmJNIMacroAssembler::VerifyObject(FrameOffset /*src*/, bool /*could_be_nul // TODO: not validating references. 
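[Editor's note] Returning to the Thumb2 EmitVLdmOrStm helper and the VStmLdmPushPop expectations above: for the 64-bit forms the register list is encoded as D:Vd plus an imm8 of 2*nregs; vldmia/vstmia pass rest = B23 (increment-after, no writeback), while vpush/vpop pass B21 for writeback and pick pre-decrement or post-increment via B24/B23. The snippet below re-assembles the same bit fields for "vldmia r1, {d9 - d13}"; the printed constant is my own reading of the encoding and should be checked against a disassembler, not taken as authoritative.

// Assumed re-statement of the double-precision EmitVLdmOrStm encoding; illustrative only.
#include <cstdint>
#include <cstdio>

constexpr uint32_t B8 = 1u << 8, B9 = 1u << 9, B11 = 1u << 11, B20 = 1u << 20,
                   B23 = 1u << 23, B26 = 1u << 26, B27 = 1u << 27;

uint32_t EncodeVLdmOrStmD(uint32_t rest, uint32_t reg, uint32_t nregs, uint32_t rn, bool is_load) {
  uint32_t D = (reg >> 4) & 1;   // top bit of the D register number (assumed split)
  uint32_t Vd = reg & 15u;       // low four bits
  return rest |
         (14u << 28) |           // fixed 0b1110 prefix (cond = AL equivalent)
         B27 | B26 | B11 | B9 |
         (is_load ? B20 : 0) |
         (rn << 16) |
         (D << 22) |
         (Vd << 12) |
         B8 |                    // dbl: 64-bit registers
         (nregs << 1);           // imm8 = 2 * nregs for D registers
}

int main() {
  // vldmia r1, {d9 - d13}: rest = B23 (increment-after, no writeback).
  uint32_t enc = EncodeVLdmOrStmD(B23, /*reg=*/9, /*nregs=*/5, /*rn=*/1, /*is_load=*/true);
  std::printf("0x%08X\n", enc);  // I expect 0xEC919B0A here; verify before relying on it.
}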
} -void ArmJNIMacroAssembler::Call(ManagedRegister mbase, Offset offset, - ManagedRegister mscratch) { +void ArmJNIMacroAssembler::Call(ManagedRegister mbase, + Offset offset, + ManagedRegister mscratch) { ArmManagedRegister base = mbase.AsArm(); ArmManagedRegister scratch = mscratch.AsArm(); CHECK(base.IsCoreRegister()) << base; diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc index 22221e752a..19450b3a32 100644 --- a/compiler/utils/arm64/assembler_arm64.cc +++ b/compiler/utils/arm64/assembler_arm64.cc @@ -36,7 +36,7 @@ void Arm64Assembler::FinalizeCode() { } size_t Arm64Assembler::CodeSize() const { - return vixl_masm_.GetBufferCapacity() - vixl_masm_.GetRemainingBufferSpace(); + return vixl_masm_.GetSizeOfCodeGenerated(); } const uint8_t* Arm64Assembler::CodeBufferBaseAddress() const { diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h index 4e88e640e5..2847cb86a8 100644 --- a/compiler/utils/arm64/assembler_arm64.h +++ b/compiler/utils/arm64/assembler_arm64.h @@ -27,13 +27,11 @@ #include "utils/assembler.h" #include "offsets.h" -// TODO: make vixl clean wrt -Wshadow, -Wunknown-pragmas, -Wmissing-noreturn +// TODO(VIXL): Make VIXL compile with -Wshadow. #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunknown-pragmas" #pragma GCC diagnostic ignored "-Wshadow" -#pragma GCC diagnostic ignored "-Wmissing-noreturn" -#include "a64/disasm-a64.h" -#include "a64/macro-assembler-a64.h" +#include "aarch64/disasm-aarch64.h" +#include "aarch64/macro-assembler-aarch64.h" #pragma GCC diagnostic pop namespace art { diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h index 79ee441144..b9f6854b01 100644 --- a/compiler/utils/arm64/jni_macro_assembler_arm64.h +++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h @@ -29,12 +29,10 @@ #include "utils/jni_macro_assembler.h" #include "offsets.h" -// TODO: make vixl clean wrt -Wshadow, -Wunknown-pragmas, -Wmissing-noreturn +// TODO(VIXL): Make VIXL compile with -Wshadow. 
#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunknown-pragmas" #pragma GCC diagnostic ignored "-Wshadow" -#pragma GCC diagnostic ignored "-Wmissing-noreturn" -#include "a64/macro-assembler-a64.h" +#include "aarch64/macro-assembler-aarch64.h" #pragma GCC diagnostic pop namespace art { diff --git a/compiler/utils/x86/constants_x86.h b/compiler/utils/x86/constants_x86.h index 2dfb65c479..0bc1560ed7 100644 --- a/compiler/utils/x86/constants_x86.h +++ b/compiler/utils/x86/constants_x86.h @@ -97,6 +97,8 @@ enum Condition { kNotZero = kNotEqual, kNegative = kSign, kPositive = kNotSign, + kCarrySet = kBelow, + kCarryClear = kAboveEqual, kUnordered = kParityEven }; diff --git a/compiler/utils/x86_64/constants_x86_64.h b/compiler/utils/x86_64/constants_x86_64.h index 37db6b1543..cc508a196b 100644 --- a/compiler/utils/x86_64/constants_x86_64.h +++ b/compiler/utils/x86_64/constants_x86_64.h @@ -106,6 +106,8 @@ enum Condition { kNotZero = kNotEqual, kNegative = kSign, kPositive = kNotSign, + kCarrySet = kBelow, + kCarryClear = kAboveEqual, kUnordered = kParityEven }; diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc index 47fb59b1d8..3e687a7758 100644 --- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc +++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc @@ -288,21 +288,27 @@ void X86_64JNIMacroAssembler::LoadRef(ManagedRegister mdest, FrameOffset src) { } void X86_64JNIMacroAssembler::LoadRef(ManagedRegister mdest, - ManagedRegister base, + ManagedRegister mbase, MemberOffset offs, bool unpoison_reference) { + X86_64ManagedRegister base = mbase.AsX86_64(); X86_64ManagedRegister dest = mdest.AsX86_64(); - CHECK(dest.IsCpuRegister() && dest.IsCpuRegister()); - __ movl(dest.AsCpuRegister(), Address(base.AsX86_64().AsCpuRegister(), offs)); + CHECK(base.IsCpuRegister()); + CHECK(dest.IsCpuRegister()); + __ movl(dest.AsCpuRegister(), Address(base.AsCpuRegister(), offs)); if (unpoison_reference) { __ MaybeUnpoisonHeapReference(dest.AsCpuRegister()); } } -void X86_64JNIMacroAssembler::LoadRawPtr(ManagedRegister mdest, ManagedRegister base, Offset offs) { +void X86_64JNIMacroAssembler::LoadRawPtr(ManagedRegister mdest, + ManagedRegister mbase, + Offset offs) { + X86_64ManagedRegister base = mbase.AsX86_64(); X86_64ManagedRegister dest = mdest.AsX86_64(); - CHECK(dest.IsCpuRegister() && dest.IsCpuRegister()); - __ movq(dest.AsCpuRegister(), Address(base.AsX86_64().AsCpuRegister(), offs)); + CHECK(base.IsCpuRegister()); + CHECK(dest.IsCpuRegister()); + __ movq(dest.AsCpuRegister(), Address(base.AsCpuRegister(), offs)); } void X86_64JNIMacroAssembler::LoadRawPtrFromThread(ManagedRegister mdest, ThreadOffset64 offs) { diff --git a/dex2oat/Android.mk b/dex2oat/Android.mk index f5f02cd966..37acef666e 100644 --- a/dex2oat/Android.mk +++ b/dex2oat/Android.mk @@ -62,7 +62,6 @@ DEX2OAT_STATIC_DEPENDENCIES := \ libnativebridge \ libnativeloader \ libsigchain_dummy \ - libvixl-arm64 \ liblog \ libz \ libbacktrace \ @@ -83,14 +82,14 @@ DEX2OAT_STATIC_DEPENDENCIES := \ ifeq ($(ART_BUILD_HOST_NDEBUG),true) $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libart-compiler libsigchain libziparchive-host liblz4,art/compiler,host,ndebug,$(dex2oat_host_arch))) ifeq ($(ART_BUILD_HOST_STATIC),true) - $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libart libart-compiler libart $(DEX2OAT_STATIC_DEPENDENCIES),art/compiler,host,ndebug,$(dex2oat_host_arch),static)) + $(eval $(call 
build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libart libart-compiler libart libvixl-arm64 $(DEX2OAT_STATIC_DEPENDENCIES),art/compiler,host,ndebug,$(dex2oat_host_arch),static)) endif endif ifeq ($(ART_BUILD_HOST_DEBUG),true) $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libartd-compiler libsigchain libziparchive-host liblz4,art/compiler,host,debug,$(dex2oat_host_arch))) ifeq ($(ART_BUILD_HOST_STATIC),true) - $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libartd libartd-compiler libartd $(DEX2OAT_STATIC_DEPENDENCIES),art/compiler,host,debug,$(dex2oat_host_arch),static)) + $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libartd libartd-compiler libartd libvixld-arm64 $(DEX2OAT_STATIC_DEPENDENCIES),art/compiler,host,debug,$(dex2oat_host_arch),static)) endif endif diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc index eb11f6d3d7..cfcfe1c999 100644 --- a/dex2oat/dex2oat.cc +++ b/dex2oat/dex2oat.cc @@ -1576,7 +1576,7 @@ class Dex2Oat FINAL { IsAppImage(), image_classes_.release(), compiled_classes_.release(), - /* compiled_methods */ nullptr, + compiled_methods_.release(), thread_count_, dump_stats_, dump_passes_, diff --git a/disassembler/Android.mk b/disassembler/Android.mk index 778fe8ee96..db327fcdb6 100644 --- a/disassembler/Android.mk +++ b/disassembler/Android.mk @@ -90,9 +90,9 @@ define build-libart-disassembler LOCAL_NATIVE_COVERAGE := $(ART_COVERAGE) # For disassembler_arm64. ifeq ($$(art_ndebug_or_debug),debug) - LOCAL_SHARED_LIBRARIES += libvixl-arm64 + LOCAL_SHARED_LIBRARIES += libvixld-arm64 else - LOCAL_SHARED_LIBRARIES += libvixl-arm64 + LOCAL_SHARED_LIBRARIES += libvixl-arm64 endif ifeq ($$(art_target_or_host),target) include $(BUILD_SHARED_LIBRARY) diff --git a/disassembler/disassembler_arm64.h b/disassembler/disassembler_arm64.h index c64d8eaf9d..7c64792b13 100644 --- a/disassembler/disassembler_arm64.h +++ b/disassembler/disassembler_arm64.h @@ -19,10 +19,11 @@ #include "disassembler.h" +// TODO(VIXL): Make VIXL compile with -Wshadow. 
#pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" -#include "a64/decoder-a64.h" -#include "a64/disasm-a64.h" +#include "aarch64/decoder-aarch64.h" +#include "aarch64/disasm-aarch64.h" #pragma GCC diagnostic pop namespace art { diff --git a/runtime/Android.mk b/runtime/Android.mk index 2f8b11361c..b31eaf60d8 100644 --- a/runtime/Android.mk +++ b/runtime/Android.mk @@ -164,6 +164,7 @@ LIBART_COMMON_SRC_FILES := \ offsets.cc \ os_linux.cc \ parsed_options.cc \ + plugin.cc \ primitive.cc \ quick_exception_handler.cc \ quick/inline_method_analyser.cc \ @@ -177,6 +178,7 @@ LIBART_COMMON_SRC_FILES := \ thread.cc \ thread_list.cc \ thread_pool.cc \ + ti/agent.cc \ trace.cc \ transaction.cc \ type_lookup_table.cc \ @@ -370,6 +372,7 @@ LIBART_ENUM_OPERATOR_OUT_HEADER_FILES := \ stack.h \ thread.h \ thread_state.h \ + ti/agent.h \ verifier/method_verifier.h LIBOPENJDKJVM_SRC_FILES := openjdkjvm/OpenjdkJvm.cc @@ -419,7 +422,7 @@ define build-runtime-library endif ifneq ($(4),libart) ifneq ($(4),libopenjdkjvm) - $$(error expected libart of libopenjdkjvm for argument 4, received $(4)) + $$(error expected libart or libopenjdkjvm for argument 4, received $(4)) endif endif diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc index 0e2a6720ae..492a12d02b 100644 --- a/runtime/arch/arm/entrypoints_init_arm.cc +++ b/runtime/arch/arm/entrypoints_init_arm.cc @@ -133,7 +133,7 @@ void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) { qpoints->pReadBarrierMarkReg09 = art_quick_read_barrier_mark_reg09; qpoints->pReadBarrierMarkReg10 = art_quick_read_barrier_mark_reg10; qpoints->pReadBarrierMarkReg11 = art_quick_read_barrier_mark_reg11; - qpoints->pReadBarrierMarkReg12 = art_quick_read_barrier_mark_reg12; + qpoints->pReadBarrierMarkReg12 = nullptr; // Cannot use register 12 (IP) to pass arguments. qpoints->pReadBarrierMarkReg13 = nullptr; // Cannot use register 13 (SP) to pass arguments. qpoints->pReadBarrierMarkReg14 = nullptr; // Cannot use register 14 (LR) to pass arguments. qpoints->pReadBarrierMarkReg15 = nullptr; // Cannot use register 15 (PC) to pass arguments. diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S index 0fcf866e18..c4ec72685f 100644 --- a/runtime/arch/arm/quick_entrypoints_arm.S +++ b/runtime/arch/arm/quick_entrypoints_arm.S @@ -191,7 +191,7 @@ .cfi_rel_offset r11, 44 .cfi_rel_offset ip, 48 .cfi_rel_offset lr, 52 - vpush {s0-s31} @ 32 words of float args. + vpush {d0-d15} @ 32 words of float args. .cfi_adjust_cfa_offset 128 sub sp, #8 @ 2 words of space, alignment padding and Method* .cfi_adjust_cfa_offset 8 @@ -210,7 +210,7 @@ .macro RESTORE_SAVE_EVERYTHING_FRAME add sp, #8 @ rewind sp .cfi_adjust_cfa_offset -8 - vpop {s0-s31} + vpop {d0-d15} .cfi_adjust_cfa_offset -128 pop {r0-r12, lr} @ 14 words of callee saves .cfi_restore r0 @@ -1246,9 +1246,15 @@ ENTRY art_quick_alloc_object_region_tlab ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT] // Read barrier for class load. 
ldr r3, [r9, #THREAD_IS_GC_MARKING_OFFSET] - cbnz r3, .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path + cbnz r3, .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit: ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path +.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking: + cbz r2, .Lart_quick_alloc_object_region_tlab_slow_path // Null check for loading lock word. + // Check lock word for mark bit, if marked do the allocation. + ldr r3, [r2, MIRROR_OBJECT_LOCK_WORD_OFFSET] + ands r3, #LOCK_WORD_MARK_BIT_MASK_SHIFTED + bne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path: // The read barrier slow path. Mark // the class. @@ -1817,6 +1823,39 @@ ENTRY art_quick_l2f pop {pc} END art_quick_l2f +.macro CONDITIONAL_CBZ reg, reg_if, dest +.ifc \reg, \reg_if + cbz \reg, \dest +.endif +.endm + +.macro CONDITIONAL_CMPBZ reg, reg_if, dest +.ifc \reg, \reg_if + cmp \reg, #0 + beq \dest +.endif +.endm + +// Use CBZ if the register is in {r0, r7} otherwise compare and branch. +.macro SMART_CBZ reg, dest + CONDITIONAL_CBZ \reg, r0, \dest + CONDITIONAL_CBZ \reg, r1, \dest + CONDITIONAL_CBZ \reg, r2, \dest + CONDITIONAL_CBZ \reg, r3, \dest + CONDITIONAL_CBZ \reg, r4, \dest + CONDITIONAL_CBZ \reg, r5, \dest + CONDITIONAL_CBZ \reg, r6, \dest + CONDITIONAL_CBZ \reg, r7, \dest + CONDITIONAL_CMPBZ \reg, r8, \dest + CONDITIONAL_CMPBZ \reg, r9, \dest + CONDITIONAL_CMPBZ \reg, r10, \dest + CONDITIONAL_CMPBZ \reg, r11, \dest + CONDITIONAL_CMPBZ \reg, r12, \dest + CONDITIONAL_CMPBZ \reg, r13, \dest + CONDITIONAL_CMPBZ \reg, r14, \dest + CONDITIONAL_CMPBZ \reg, r15, \dest +.endm + /* * Create a function `name` calling the ReadBarrier::Mark routine, * getting its argument and returning its result through register @@ -1835,28 +1874,25 @@ END art_quick_l2f .macro READ_BARRIER_MARK_REG name, reg ENTRY \name // Null check so that we can load the lock word. - cmp \reg, #0 - beq .Lret_rb_\name - // Check lock word for mark bit, if marked return. - push {r0} - ldr r0, [\reg, MIRROR_OBJECT_LOCK_WORD_OFFSET] - and r0, #LOCK_WORD_MARK_BIT_MASK_SHIFTED - cbz r0, .Lslow_rb_\name - // Restore LR and return. - pop {r0} - bx lr + SMART_CBZ \reg, .Lret_rb_\name + // Check lock word for mark bit, if marked return. Use IP for scratch since it is blocked. + ldr ip, [\reg, MIRROR_OBJECT_LOCK_WORD_OFFSET] + ands ip, #LOCK_WORD_MARK_BIT_MASK_SHIFTED + beq .Lslow_rb_\name + // Already marked, return right away. 
+ bx lr .Lslow_rb_\name: - pop {r0} - push {r0-r4, r9, r12, lr} @ save return address and core caller-save registers + push {r0-r5, r9, lr} @ save return address and core caller-save registers + @ also save callee save r5 for 16 byte alignment .cfi_adjust_cfa_offset 32 .cfi_rel_offset r0, 0 .cfi_rel_offset r1, 4 .cfi_rel_offset r2, 8 .cfi_rel_offset r3, 12 .cfi_rel_offset r4, 16 - .cfi_rel_offset r9, 20 - .cfi_rel_offset r12, 24 + .cfi_rel_offset r5, 20 + .cfi_rel_offset r9, 24 .cfi_rel_offset lr, 28 vpush {s0-s15} @ save floating-point caller-save registers .cfi_adjust_cfa_offset 64 @@ -1865,48 +1901,11 @@ ENTRY \name mov r0, \reg @ pass arg1 - obj from `reg` .endif bl artReadBarrierMark @ r0 <- artReadBarrierMark(obj) - + mov ip, r0 @ Save result in IP vpop {s0-s15} @ restore floating-point registers .cfi_adjust_cfa_offset -64 - @ If `reg` is a caller-save register, save the result to its - @ corresponding stack slot; it will be restored by the "pop" - @ instruction below. Otherwise, move result into `reg`. - @ - @ (Note that saving `reg` to its stack slot will overwrite the value - @ previously stored by the "push" instruction above. That is - @ alright, as in that case we know that `reg` is not a live - @ register, as it is used to pass the argument and return the result - @ of this function.) - .ifc \reg, r0 - PUSH_REG r0, 0 @ copy result to r0's stack location - .else - .ifc \reg, r1 - PUSH_REG r0, 4 @ copy result to r1's stack location - .else - .ifc \reg, r2 - PUSH_REG r0, 8 @ copy result to r2's stack location - .else - .ifc \reg, r3 - PUSH_REG r0, 12 @ copy result to r3's stack location - .else - .ifc \reg, r4 - PUSH_REG r0, 16 @ copy result to r4's stack location - .else - .ifc \reg, r9 - PUSH_REG r0, 20 @ copy result to r9's stack location - .else - .ifc \reg, r12 - PUSH_REG r0, 24 @ copy result to r12's stack location - .else - mov \reg, r0 @ return result into `reg` - .endif - .endif - .endif - .endif - .endif - .endif - .endif - pop {r0-r4, r9, r12, pc} @ restore caller-save registers and return + pop {r0-r5, r9, lr} @ restore caller-save registers + mov \reg, ip @ copy result to reg .Lret_rb_\name: bx lr END \name @@ -1924,4 +1923,3 @@ READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, r8 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, r9 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, r10 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11 -READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg12, r12 diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc index cc5bf29609..55b09c318c 100644 --- a/runtime/arch/arm64/entrypoints_init_arm64.cc +++ b/runtime/arch/arm64/entrypoints_init_arm64.cc @@ -149,7 +149,7 @@ void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) { qpoints->pReadBarrierMarkReg13 = art_quick_read_barrier_mark_reg13; qpoints->pReadBarrierMarkReg14 = art_quick_read_barrier_mark_reg14; qpoints->pReadBarrierMarkReg15 = art_quick_read_barrier_mark_reg15; - qpoints->pReadBarrierMarkReg16 = art_quick_read_barrier_mark_reg16; + qpoints->pReadBarrierMarkReg16 = nullptr; // IP0 is used as a temp by the asm stub. 
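[Editor's note] The reworked READ_BARRIER_MARK_REG above (and the matching arm64 and x86_64 stubs later in this patch) all place the same fast path in front of artReadBarrierMark: after a null check, the object's lock word is tested for the mark bit and, if it is already set, the reference is returned without entering the runtime; only otherwise are caller-saves spilled and the runtime called, with the result carried back in IP so the pop can restore r0-r5/r9 unchanged. SMART_CBZ exists because Thumb-2 cbz/cbnz can only encode the low registers r0-r7, so higher registers fall back to cmp/beq. A rough C++ rendering of the stub logic follows; the types, helpers, and the mark-bit constant are stand-ins, not the real runtime interface.

// Sketch of what each art_quick_read_barrier_mark_regNN stub now does; illustrative only.
#include <cstdint>

struct Object { uint32_t lock_word; };                     // stand-in for mirror::Object
Object* artReadBarrierMark(Object* obj) { return obj; }    // stand-in for the runtime slow path

// Placeholder value; the stubs use the generated LOCK_WORD_MARK_BIT_MASK_SHIFTED constant.
constexpr uint32_t kMarkBitMaskShifted = 1u << 29;

Object* ReadBarrierMarkStub(Object* ref) {
  if (ref == nullptr) {
    return nullptr;                                // null check so the lock word load is safe
  }
  if ((ref->lock_word & kMarkBitMaskShifted) != 0) {
    return ref;                                    // mark bit already set: no runtime call
  }
  return artReadBarrierMark(ref);                  // slow path: spill caller-saves and mark
}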
qpoints->pReadBarrierMarkReg17 = art_quick_read_barrier_mark_reg17; qpoints->pReadBarrierMarkReg18 = art_quick_read_barrier_mark_reg18; qpoints->pReadBarrierMarkReg19 = art_quick_read_barrier_mark_reg19; diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S index bdad966496..4289cabbc6 100644 --- a/runtime/arch/arm64/quick_entrypoints_arm64.S +++ b/runtime/arch/arm64/quick_entrypoints_arm64.S @@ -331,22 +331,23 @@ #endif // Save FP registers. - stp d0, d1, [sp, #8] - stp d2, d3, [sp, #24] - stp d4, d5, [sp, #40] - stp d6, d7, [sp, #56] - stp d8, d9, [sp, #72] - stp d10, d11, [sp, #88] - stp d12, d13, [sp, #104] - stp d14, d15, [sp, #120] - stp d16, d17, [sp, #136] - stp d18, d19, [sp, #152] - stp d20, d21, [sp, #168] - stp d22, d23, [sp, #184] - stp d24, d25, [sp, #200] - stp d26, d27, [sp, #216] - stp d28, d29, [sp, #232] - stp d30, d31, [sp, #248] + str d0, [sp, #8] + stp d1, d2, [sp, #16] + stp d3, d4, [sp, #32] + stp d5, d6, [sp, #48] + stp d7, d8, [sp, #64] + stp d9, d10, [sp, #80] + stp d11, d12, [sp, #96] + stp d13, d14, [sp, #112] + stp d15, d16, [sp, #128] + stp d17, d18, [sp, #144] + stp d19, d20, [sp, #160] + stp d21, d22, [sp, #176] + stp d23, d24, [sp, #192] + stp d25, d26, [sp, #208] + stp d27, d28, [sp, #224] + stp d29, d30, [sp, #240] + str d31, [sp, #256] // Save core registers. str x0, [sp, #264] @@ -430,22 +431,23 @@ .macro RESTORE_SAVE_EVERYTHING_FRAME // Restore FP registers. - ldp d0, d1, [sp, #8] - ldp d2, d3, [sp, #24] - ldp d4, d5, [sp, #40] - ldp d6, d7, [sp, #56] - ldp d8, d9, [sp, #72] - ldp d10, d11, [sp, #88] - ldp d12, d13, [sp, #104] - ldp d14, d15, [sp, #120] - ldp d16, d17, [sp, #136] - ldp d18, d19, [sp, #152] - ldp d20, d21, [sp, #168] - ldp d22, d23, [sp, #184] - ldp d24, d25, [sp, #200] - ldp d26, d27, [sp, #216] - ldp d28, d29, [sp, #232] - ldp d30, d31, [sp, #248] + ldr d0, [sp, #8] + ldp d1, d2, [sp, #16] + ldp d3, d4, [sp, #32] + ldp d5, d6, [sp, #48] + ldp d7, d8, [sp, #64] + ldp d9, d10, [sp, #80] + ldp d11, d12, [sp, #96] + ldp d13, d14, [sp, #112] + ldp d15, d16, [sp, #128] + ldp d17, d18, [sp, #144] + ldp d19, d20, [sp, #160] + ldp d21, d22, [sp, #176] + ldp d23, d24, [sp, #192] + ldp d25, d26, [sp, #208] + ldp d27, d28, [sp, #224] + ldp d29, d30, [sp, #240] + ldr d31, [sp, #256] // Restore core registers. ldr x0, [sp, #264] @@ -1939,10 +1941,13 @@ END art_quick_alloc_object_rosalloc // (for 64 bit alignment). and \xTemp0, \xTemp0, #4 add \xTemp1, \xTemp1, \xTemp0 - and \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED // Round up the object size by the - // object alignment. (addr + 7) & ~7. - // Add by 7 is done above. - + and \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED64 // Apply alignemnt mask + // (addr + 7) & ~7. The mask must + // be 64 bits to keep high bits in + // case of overflow. + // Negative sized arrays are handled here since xCount holds a zero extended 32 bit value. + // Negative ints become large 64 bit unsigned ints which will always be larger than max signed + // 32 bit int. Since the max shift for arrays is 3, it can not become a negative 64 bit int. cmp \xTemp1, #MIN_LARGE_OBJECT_THRESHOLD // Possibly a large object, go slow bhs \slowPathLabel // path. @@ -1956,7 +1961,6 @@ END art_quick_alloc_object_rosalloc sub \xTemp2, \xTemp2, \xTemp0 cmp \xTemp1, \xTemp2 bhi \slowPathLabel - // "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1. // Move old thread_local_pos to x0 // for the return value. 
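[Editor's note] The negative-size comment added in the arm64 array fast path above is the subtle part: the element count arrives zero-extended to 64 bits, so a negative Java length becomes a huge unsigned value, and with the maximum component shift of 3 the computed byte size cannot wrap a 64-bit register; it simply fails the MIN_LARGE_OBJECT_THRESHOLD comparison and falls through to the slow path, which rejects the request. The arithmetic sketch below mirrors that reasoning; the constants are placeholders rather than the real generated values, and the extra 4-byte adjustment for 8-byte-aligned element types is omitted.

// Illustrative arithmetic only; constants are assumed, not ART's generated values.
#include <cassert>
#include <cstdint>

constexpr uint64_t kMinLargeObjectThreshold = 12 * 1024;  // assumed large-object cutoff
constexpr uint64_t kArrayHeaderSize = 16;                 // assumed array data offset
constexpr uint64_t kAlignMask = 7;

// size = ((count << shift) + header + 7) & ~7, computed entirely in 64 bits.
uint64_t ComputeArrayAllocSize(int32_t count, unsigned shift) {
  uint64_t ucount = static_cast<uint32_t>(count);         // zero-extension, as in the stub
  uint64_t size = (ucount << shift) + kArrayHeaderSize + kAlignMask;
  return size & ~kAlignMask;
}

int main() {
  // A negative length such as -1 becomes 0xFFFFFFFF; even shifted by 3 it stays far above
  // the threshold, so the fast path bails out and the slow path reports the bad length.
  assert(ComputeArrayAllocSize(-1, 3) >= kMinLargeObjectThreshold);
  // A small positive request stays below the threshold and can be bump-allocated.
  assert(ComputeArrayAllocSize(16, 2) < kMinLargeObjectThreshold);
}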
@@ -2747,7 +2751,7 @@ READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg12, w12, x12 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg13, w13, x13 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg14, w14, x14 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg15, w15, x15 -READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg16, w16, x16 +// READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg16, w16, x16 ip0 is blocked READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg17, w17, x17 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg18, w18, x18 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg19, w19, x19 diff --git a/runtime/arch/instruction_set.cc b/runtime/arch/instruction_set.cc index 81ca010423..b35e0889e4 100644 --- a/runtime/arch/instruction_set.cc +++ b/runtime/arch/instruction_set.cc @@ -18,6 +18,7 @@ // Explicitly include our own elf.h to avoid Linux and other dependencies. #include "../elf.h" +#include "base/bit_utils.h" #include "globals.h" namespace art { @@ -113,14 +114,44 @@ size_t GetInstructionSetAlignment(InstructionSet isa) { } } -static constexpr size_t kDefaultStackOverflowReservedBytes = 16 * KB; -static constexpr size_t kMipsStackOverflowReservedBytes = kDefaultStackOverflowReservedBytes; -static constexpr size_t kMips64StackOverflowReservedBytes = kDefaultStackOverflowReservedBytes; - -static constexpr size_t kArmStackOverflowReservedBytes = 8 * KB; -static constexpr size_t kArm64StackOverflowReservedBytes = 8 * KB; -static constexpr size_t kX86StackOverflowReservedBytes = 8 * KB; -static constexpr size_t kX86_64StackOverflowReservedBytes = 8 * KB; +#if !defined(ART_STACK_OVERFLOW_GAP_arm) || !defined(ART_STACK_OVERFLOW_GAP_arm64) || \ + !defined(ART_STACK_OVERFLOW_GAP_mips) || !defined(ART_STACK_OVERFLOW_GAP_mips64) || \ + !defined(ART_STACK_OVERFLOW_GAP_x86) || !defined(ART_STACK_OVERFLOW_GAP_x86_64) +#error "Missing defines for stack overflow gap" +#endif + +static constexpr size_t kArmStackOverflowReservedBytes = ART_STACK_OVERFLOW_GAP_arm; +static constexpr size_t kArm64StackOverflowReservedBytes = ART_STACK_OVERFLOW_GAP_arm64; +static constexpr size_t kMipsStackOverflowReservedBytes = ART_STACK_OVERFLOW_GAP_mips; +static constexpr size_t kMips64StackOverflowReservedBytes = ART_STACK_OVERFLOW_GAP_mips64; +static constexpr size_t kX86StackOverflowReservedBytes = ART_STACK_OVERFLOW_GAP_x86; +static constexpr size_t kX86_64StackOverflowReservedBytes = ART_STACK_OVERFLOW_GAP_x86_64; + +static_assert(IsAligned<kPageSize>(kArmStackOverflowReservedBytes), "ARM gap not page aligned"); +static_assert(IsAligned<kPageSize>(kArm64StackOverflowReservedBytes), "ARM64 gap not page aligned"); +static_assert(IsAligned<kPageSize>(kMipsStackOverflowReservedBytes), "Mips gap not page aligned"); +static_assert(IsAligned<kPageSize>(kMips64StackOverflowReservedBytes), + "Mips64 gap not page aligned"); +static_assert(IsAligned<kPageSize>(kX86StackOverflowReservedBytes), "X86 gap not page aligned"); +static_assert(IsAligned<kPageSize>(kX86_64StackOverflowReservedBytes), + "X86_64 gap not page aligned"); + +#if !defined(ART_FRAME_SIZE_LIMIT) +#error "ART frame size limit missing" +#endif + +// TODO: Should we require an extra page (RoundUp(SIZE) + kPageSize)? 
+static_assert(ART_FRAME_SIZE_LIMIT < kArmStackOverflowReservedBytes, "Frame size limit too large"); +static_assert(ART_FRAME_SIZE_LIMIT < kArm64StackOverflowReservedBytes, + "Frame size limit too large"); +static_assert(ART_FRAME_SIZE_LIMIT < kMipsStackOverflowReservedBytes, + "Frame size limit too large"); +static_assert(ART_FRAME_SIZE_LIMIT < kMips64StackOverflowReservedBytes, + "Frame size limit too large"); +static_assert(ART_FRAME_SIZE_LIMIT < kX86StackOverflowReservedBytes, + "Frame size limit too large"); +static_assert(ART_FRAME_SIZE_LIMIT < kX86_64StackOverflowReservedBytes, + "Frame size limit too large"); size_t GetStackOverflowReservedBytes(InstructionSet isa) { switch (isa) { diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S index ac8f5233da..32768b0263 100644 --- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S @@ -910,7 +910,20 @@ MACRO0(RETURN_OR_DELIVER_PENDING_EXCEPTION) END_MACRO // Generate the allocation entrypoints for each allocator. -GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR +GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS +// Comment out allocators that have x86_64 specific asm. +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_region_tlab, RegionTLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB) // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc). DEFINE_FUNCTION art_quick_alloc_object_rosalloc @@ -1003,6 +1016,14 @@ END_FUNCTION art_quick_alloc_object_rosalloc MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel) testl %edx, %edx // Check null class jz RAW_VAR(slowPathLabel) + ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH(RAW_VAR(slowPathLabel)) +END_MACRO + +// The common fast path code for art_quick_alloc_object_resolved_region_tlab. +// +// RDI: type_idx, RSI: ArtMethod*, RDX/EDX: the class, RAX: return value. +// RCX: scratch, r8: Thread::Current(). +MACRO1(ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH, slowPathLabel) // Check class status. cmpl LITERAL(MIRROR_CLASS_STATUS_INITIALIZED), MIRROR_CLASS_STATUS_OFFSET(%rdx) jne RAW_VAR(slowPathLabel) @@ -1014,26 +1035,73 @@ MACRO1(ALLOC_OBJECT_TLAB_FAST_PATH, slowPathLabel) // kAccClassIsFinalizable testl LITERAL(ACCESS_FLAGS_CLASS_IS_FINALIZABLE), MIRROR_CLASS_ACCESS_FLAGS_OFFSET(%rdx) jnz RAW_VAR(slowPathLabel) - movq %gs:THREAD_SELF_OFFSET, %r8 // r8 = thread - movq THREAD_LOCAL_END_OFFSET(%r8), %rax // Load thread_local_end. - subq THREAD_LOCAL_POS_OFFSET(%r8), %rax // Compute the remaining buffer size. - movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %ecx // Load the object size. - cmpq %rax, %rcx // Check if it fits. 
OK to do this - // before rounding up the object size - // assuming the buf size alignment. + ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH(RAW_VAR(slowPathLabel)) +END_MACRO + +// The fast path code for art_quick_alloc_object_initialized_region_tlab. +// +// RDI: type_idx, RSI: ArtMethod*, RDX/EDX: the class, RAX: return value. +// RCX: scratch, r8: Thread::Current(). +MACRO1(ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH, slowPathLabel) + movq %gs:THREAD_SELF_OFFSET, %r8 // r8 = thread + movl MIRROR_CLASS_OBJECT_SIZE_OFFSET(%rdx), %ecx // Load the object size. + movq THREAD_LOCAL_POS_OFFSET(%r8), %rax + leaq OBJECT_ALIGNMENT_MASK(%rax, %rcx), %rcx // Add size to pos, note that these + // are both 32 bit ints, overflow + // will cause the add to be past the + // end of the thread local region. + // Also sneak in alignment mask add. + andq LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED64), %rcx // Align the size by 8. (addr + 7) & + // ~7. + cmpq THREAD_LOCAL_END_OFFSET(%r8), %rcx // Check if it fits. ja RAW_VAR(slowPathLabel) - addl LITERAL(OBJECT_ALIGNMENT_MASK), %ecx // Align the size by 8. (addr + 7) & ~7. - andl LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED), %ecx - movq THREAD_LOCAL_POS_OFFSET(%r8), %rax // Load thread_local_pos - // as allocated object. - addq %rax, %rcx // Add the object size. - movq %rcx, THREAD_LOCAL_POS_OFFSET(%r8) // Update thread_local_pos. - addq LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%r8) // Increase thread_local_objects. - // Store the class pointer in the header. - // No fence needed for x86. + movq %rcx, THREAD_LOCAL_POS_OFFSET(%r8) // Update thread_local_pos. + addq LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%r8) // Increase thread_local_objects. + // Store the class pointer in the + // header. + // No fence needed for x86. POISON_HEAP_REF edx movl %edx, MIRROR_OBJECT_CLASS_OFFSET(%rax) - ret // Fast path succeeded. + ret // Fast path succeeded. +END_MACRO + +// The fast path code for art_quick_alloc_array_region_tlab. +// Inputs: RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod* method +// Temps: RCX: the class, r8, r9 +// Output: RAX: return value. +MACRO1(ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED, slowPathLabel) + movq %rcx, %r8 // Save class for later + movl MIRROR_CLASS_COMPONENT_TYPE_OFFSET(%rcx), %ecx // Load component type. + UNPOISON_HEAP_REF ecx + movl MIRROR_CLASS_OBJECT_PRIMITIVE_TYPE_OFFSET(%rcx), %ecx // Load primitive type. + shrq LITERAL(PRIMITIVE_TYPE_SIZE_SHIFT_SHIFT), %rcx // Get component size shift. + movq %rsi, %r9 + salq %cl, %r9 // Calculate array count shifted. + // Add array header + alignment rounding. + addq LITERAL(MIRROR_INT_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK), %r9 + // Add 4 extra bytes if we are doing a long array. + addq LITERAL(1), %rcx + andq LITERAL(4), %rcx + addq %rcx, %r9 + movq %gs:THREAD_SELF_OFFSET, %rcx // rcx = thread +#if MIRROR_LONG_ARRAY_DATA_OFFSET != MIRROR_INT_ARRAY_DATA_OFFSET + 4 +#error Long array data offset must be 4 greater than int array data offset. +#endif + // Mask out the unaligned part to make sure we are 8 byte aligned. + andq LITERAL(OBJECT_ALIGNMENT_MASK_TOGGLED64), %r9 + movq THREAD_LOCAL_POS_OFFSET(%rcx), %rax + addq %rax, %r9 + cmpq THREAD_LOCAL_END_OFFSET(%rcx), %r9 // Check if it fits. + ja RAW_VAR(slowPathLabel) + movq %r9, THREAD_LOCAL_POS_OFFSET(%rcx) // Update thread_local_pos. + addq LITERAL(1), THREAD_LOCAL_OBJECTS_OFFSET(%rcx) // Increase thread_local_objects. + // Store the class pointer in the + // header. + // No fence needed for x86. 
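[Editor's note] Both the object and array fast paths being introduced here use the same bump-pointer sequence: a single leaq adds the object size and the alignment mask to thread_local_pos, a 64-bit andq clears the low bits, and because pos and size are 32-bit quantities the addition cannot wrap the 64-bit register, so an over-large size simply compares above thread_local_end and takes the slow path. A rough C++ rendering follows; the structure and field names are stand-ins for the real Thread offsets, not ART code.

// Illustrative bump-pointer TLAB fast path; names and layout are assumptions.
#include <cstdint>

struct ThreadTlab {
  uint64_t pos;       // stand-in for THREAD_LOCAL_POS_OFFSET
  uint64_t end;       // stand-in for THREAD_LOCAL_END_OFFSET
  uint64_t objects;   // stand-in for THREAD_LOCAL_OBJECTS_OFFSET
};

constexpr uint64_t kObjectAlignmentMask = 7;

// Returns the new object's address, or 0 to signal "take the slow path".
uint64_t AllocObjectTlabFastPath(ThreadTlab* self, uint32_t object_size) {
  uint64_t old_pos = self->pos;
  // leaq OBJECT_ALIGNMENT_MASK(pos, size) followed by andq ~mask:
  // round (pos + size) up to the next 8-byte boundary in one go.
  uint64_t new_pos = (old_pos + object_size + kObjectAlignmentMask) & ~kObjectAlignmentMask;
  if (new_pos > self->end) {
    return 0;                       // does not fit in the TLAB: slow path
  }
  self->pos = new_pos;              // bump the allocation pointer
  self->objects += 1;               // bookkeeping, as in the stub
  // The stub then stores the (possibly poisoned) class pointer into the object header;
  // no fence is needed on x86 for this publication.
  return old_pos;                   // the old thread_local_pos is the new object
}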
+ POISON_HEAP_REF r8d + movl %r8d, MIRROR_OBJECT_CLASS_OFFSET(%rax) + movl %esi, MIRROR_ARRAY_LENGTH_OFFSET(%rax) + ret // Fast path succeeded. END_MACRO // The common slow path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab. @@ -1046,6 +1114,16 @@ MACRO1(ALLOC_OBJECT_TLAB_SLOW_PATH, cxx_name) RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception END_MACRO +// The slow path code for art_quick_alloc_array_region_tlab. +MACRO1(ALLOC_ARRAY_TLAB_SLOW_PATH, cxx_name) + SETUP_SAVE_REFS_ONLY_FRAME // save ref containing registers for GC + // Outgoing argument set up + movq %gs:THREAD_SELF_OFFSET, %rcx // pass Thread::Current() + call CALLVAR(cxx_name) // cxx_name(arg0, arg1, arg2, Thread*) + RESTORE_SAVE_REFS_ONLY_FRAME // restore frame up to return address + RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception +END_MACRO + // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). DEFINE_FUNCTION art_quick_alloc_object_tlab // Fast path tlab allocation. @@ -1065,6 +1143,82 @@ DEFINE_FUNCTION art_quick_alloc_object_tlab ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeTLAB END_FUNCTION art_quick_alloc_object_tlab +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_region_tlab, RegionTLAB). +DEFINE_FUNCTION art_quick_alloc_array_region_tlab + // Fast path region tlab allocation. + // RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod* + // RCX: klass, R8, R9: free. RAX: return val. +#if !defined(USE_READ_BARRIER) + int3 + int3 +#endif + movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx // Load dex cache resolved types array + movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx // Load the class + // Null check so that we can load the lock word. + testl %ecx, %ecx + jz .Lart_quick_alloc_array_region_tlab_slow_path + + cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET + jne .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking +.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit: + ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_region_tlab_slow_path +.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking: + // Check the mark bit, if it is 1 return. + testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx) + jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit +.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path: + // The read barrier slow path. Mark the class. + PUSH rdi + PUSH rsi + PUSH rdx + // Outgoing argument set up + movq %rcx, %rdi // Pass the class as the first param. + call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj) + movq %rax, %rcx + POP rdx + POP rsi + POP rdi + jmp .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit +.Lart_quick_alloc_array_region_tlab_slow_path: + ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeRegionTLAB +END_FUNCTION art_quick_alloc_array_region_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_region_tlab, RegionTLAB). +DEFINE_FUNCTION art_quick_alloc_array_resolved_region_tlab + // Fast path region tlab allocation. + // RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod* + // RCX: mirror::Class* klass, R8, R9: free. RAX: return val. +#if !defined(USE_READ_BARRIER) + int3 + int3 +#endif + movq %rdi, %rcx + // Already resolved, no null check. 
+ cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET + jne .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking +.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit: + ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_region_tlab_slow_path +.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking: + // Check the mark bit, if it is 1 return. + testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx) + jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit +.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path: + // The read barrier slow path. Mark the class. + PUSH rdi + PUSH rsi + PUSH rdx + // Outgoing argument set up + movq %rcx, %rdi // Pass the class as the first param. + call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj) + movq %rax, %rcx + POP rdx + POP rsi + POP rdi + jmp .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit +.Lart_quick_alloc_array_resolved_region_tlab_slow_path: + ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedRegionTLAB +END_FUNCTION art_quick_alloc_array_resolved_region_tlab + // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB). DEFINE_FUNCTION art_quick_alloc_object_region_tlab // Fast path region tlab allocation. @@ -1074,29 +1228,30 @@ DEFINE_FUNCTION art_quick_alloc_object_region_tlab int3 int3 #endif - // Might need a special macro since rsi and edx is 32b/64b mismatched. movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array - // Might need to break down into multiple instructions to get the base address in a register. - // Load the class - movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx - cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET - jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit + movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx // Load the class // Null check so that we can load the lock word. testl %edx, %edx - jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit - // Check the mark bit, if it is 1 return. - testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx) - jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path + jz .Lart_quick_alloc_object_region_tlab_slow_path + // Test if the GC is marking. + cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET + jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit: ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path +.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking: + // Check the mark bit, if it is 1 avoid the read barrier. + testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx) + jnz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path: // The read barrier slow path. Mark the class. PUSH rdi PUSH rsi + subq LITERAL(8), %rsp // 16 byte alignment // Outgoing argument set up movq %rdx, %rdi // Pass the class as the first param. 
call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj) movq %rax, %rdx + addq LITERAL(8), %rsp POP rsi POP rdi jmp .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit @@ -1104,6 +1259,77 @@ DEFINE_FUNCTION art_quick_alloc_object_region_tlab ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeRegionTLAB END_FUNCTION art_quick_alloc_object_region_tlab +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB). +DEFINE_FUNCTION art_quick_alloc_object_resolved_region_tlab + // Fast path region tlab allocation. + // RDI: mirror::Class* klass, RSI: ArtMethod* + // RDX, RCX, R8, R9: free. RAX: return val. +#if !defined(USE_READ_BARRIER) + int3 + int3 +#endif + movq %rdi, %rdx + cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET + jne .Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_marking +.Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_slow_path_exit: + ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path +.Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_marking: + // Check the mark bit, if it is 1 avoid the read barrier. + testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx) + jnz .Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_slow_path_exit +.Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_slow_path: + // The read barrier slow path. Mark the class. + PUSH rdi + PUSH rsi + subq LITERAL(8), %rsp // 16 byte alignment + // Outgoing argument set up + movq %rdx, %rdi // Pass the class as the first param. + call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj) + movq %rax, %rdx + addq LITERAL(8), %rsp + POP rsi + POP rdi + jmp .Lart_quick_alloc_object_resolved_region_tlab_class_load_read_barrier_slow_path_exit +.Lart_quick_alloc_object_resolved_region_tlab_slow_path: + ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedRegionTLAB +END_FUNCTION art_quick_alloc_object_resolved_region_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB). +DEFINE_FUNCTION art_quick_alloc_object_initialized_region_tlab + // Fast path region tlab allocation. + // RDI: mirror::Class* klass, RSI: ArtMethod* + // RDX, RCX, R8, R9: free. RAX: return val. +#if !defined(USE_READ_BARRIER) + int3 + int3 +#endif + // Might need a special macro since rsi and edx is 32b/64b mismatched. + movq %rdi, %rdx + cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET + jne .Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_marking +.Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_slow_path_exit: + ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_region_tlab_slow_path +.Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_marking: + // Check the mark bit, if it is 1 avoid the read barrier. + testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx) + jnz .Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_slow_path +.Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_slow_path: + // The read barrier slow path. Mark the class. + PUSH rdi + PUSH rsi + subq LITERAL(8), %rsp // 16 byte alignment + // Outgoing argument set up + movq %rdx, %rdi // Pass the class as the first param. 
+ call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj) + movq %rax, %rdx + addq LITERAL(8), %rsp + POP rsi + POP rdi + jmp .Lart_quick_alloc_object_initialized_region_tlab_class_load_read_barrier_slow_path_exit +.Lart_quick_alloc_object_initialized_region_tlab_slow_path: + ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeInitializedRegionTLAB +END_FUNCTION art_quick_alloc_object_initialized_region_tlab + ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER ONE_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER diff --git a/runtime/base/logging.h b/runtime/base/logging.h index 6323eee53a..ac21a3f0ea 100644 --- a/runtime/base/logging.h +++ b/runtime/base/logging.h @@ -57,6 +57,7 @@ struct LogVerbosity { bool verifier; bool image; bool systrace_lock_logging; // Enabled with "-verbose:sys-locks". + bool agents; }; // Global log verbosity setting, initialized by InitLogging. diff --git a/runtime/base/macros.h b/runtime/base/macros.h index 3c43253e67..5a50247f5a 100644 --- a/runtime/base/macros.h +++ b/runtime/base/macros.h @@ -75,7 +75,7 @@ template<typename T> ART_FRIEND_TEST(test_set_name, individual_test) ALWAYS_INLINE void* operator new(size_t, void* ptr) noexcept { return ptr; } \ ALWAYS_INLINE void operator delete(void*, void*) noexcept { } \ private: \ - void* operator new(size_t) = delete // NOLINT + void* operator new(size_t) = delete // NOLINT // The arraysize(arr) macro returns the # of elements in an array arr. // The expression is a compile-time constant, and therefore can be @@ -135,13 +135,13 @@ char (&ArraySizeHelper(T (&array)[N]))[N]; #define ARRAYSIZE_UNSAFE(a) \ ((sizeof(a) / sizeof(*(a))) / static_cast<size_t>(!(sizeof(a) % sizeof(*(a))))) -#define SIZEOF_MEMBER(t, f) sizeof((reinterpret_cast<t*>(4096))->f) // NOLINT +#define SIZEOF_MEMBER(t, f) sizeof((reinterpret_cast<t*>(4096))->f) // NOLINT #define OFFSETOF_MEMBER(t, f) \ - (reinterpret_cast<uintptr_t>(&reinterpret_cast<t*>(16)->f) - static_cast<uintptr_t>(16u)) // NOLINT + (reinterpret_cast<uintptr_t>(&reinterpret_cast<t*>(16)->f) - static_cast<uintptr_t>(16u)) // NOLINT #define OFFSETOF_MEMBERPTR(t, f) \ - (reinterpret_cast<uintptr_t>(&(reinterpret_cast<t*>(16)->*f)) - static_cast<uintptr_t>(16)) // NOLINT + (reinterpret_cast<uintptr_t>(&(reinterpret_cast<t*>(16)->*f)) - static_cast<uintptr_t>(16)) // NOLINT #define PACKED(x) __attribute__ ((__aligned__(x), __packed__)) diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc index 46722ecad7..4d48da6a83 100644 --- a/runtime/class_linker.cc +++ b/runtime/class_linker.cc @@ -4630,18 +4630,23 @@ bool ClassLinker::InitializeClass(Thread* self, Handle<mirror::Class> klass, } else { value_it.ReadValueToField<false>(field); } + if (self->IsExceptionPending()) { + break; + } DCHECK(!value_it.HasNext() || field_it.HasNextStaticField()); } } } - ArtMethod* clinit = klass->FindClassInitializer(image_pointer_size_); - if (clinit != nullptr) { - CHECK(can_init_statics); - JValue result; - clinit->Invoke(self, nullptr, 0, &result, "V"); - } + if (!self->IsExceptionPending()) { + ArtMethod* clinit = klass->FindClassInitializer(image_pointer_size_); + if (clinit != nullptr) { + CHECK(can_init_statics); + JValue result; + clinit->Invoke(self, nullptr, 0, &result, "V"); + } + } self->AllowThreadSuspension(); uint64_t t1 = 
NanoTime(); diff --git a/runtime/experimental_flags.h b/runtime/experimental_flags.h index fde1a5f3ab..7faa2dc7e3 100644 --- a/runtime/experimental_flags.h +++ b/runtime/experimental_flags.h @@ -26,6 +26,8 @@ struct ExperimentalFlags { // The actual flag values. enum { kNone = 0x0000, + kAgents = 0x0001, // 0b00000001 + kRuntimePlugins = 0x0002, // 0b00000010 }; constexpr ExperimentalFlags() : value_(0x0000) {} @@ -61,9 +63,19 @@ struct ExperimentalFlags { uint32_t value_; }; -inline std::ostream& operator<<(std::ostream& stream, - const ExperimentalFlags& e ATTRIBUTE_UNUSED) { - stream << "kNone"; +inline std::ostream& operator<<(std::ostream& stream, const ExperimentalFlags& e) { + bool started = false; + if (e & ExperimentalFlags::kAgents) { + stream << (started ? "|" : "") << "kAgents"; + started = true; + } + if (e & ExperimentalFlags::kRuntimePlugins) { + stream << (started ? "|" : "") << "kRuntimePlugins"; + started = true; + } + if (!started) { + stream << "kNone"; + } return stream; } diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index 88fbf781bc..b574c3bf3a 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -230,6 +230,9 @@ Heap::Heap(size_t initial_size, total_wait_time_(0), verify_object_mode_(kVerifyObjectModeDisabled), disable_moving_gc_count_(0), + semi_space_collector_(nullptr), + mark_compact_collector_(nullptr), + concurrent_copying_collector_(nullptr), is_running_on_memory_tool_(Runtime::Current()->IsRunningOnMemoryTool()), use_tlab_(use_tlab), main_space_backup_(nullptr), diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc index c2e2a1edd2..6fcad295bb 100644 --- a/runtime/gc/space/image_space.cc +++ b/runtime/gc/space/image_space.cc @@ -384,30 +384,7 @@ ImageSpace* ImageSpace::CreateBootImage(const char* image_location, &has_system, &cache_filename, &dalvik_cache_exists, &has_cache, &is_global_cache); - // If we're starting with the global cache, and we're the zygote, try to see whether there are - // OTA artifacts from the A/B OTA preopting to move over. - // (It is structurally simpler to check this here, instead of complicating the compile/relocate - // logic below.) const bool is_zygote = Runtime::Current()->IsZygote(); - if (is_global_cache && is_zygote) { - VLOG(startup) << "Checking for A/B OTA data."; - TryMoveOTAArtifacts(cache_filename, dalvik_cache_exists); - - // Retry. There are two cases where the old info is outdated: - // * There wasn't a boot image before (e.g., some failure on boot), but now the OTA preopted - // image has been moved in-place. - // * There was a boot image before, and we tried to move the OTA preopted image, but a failure - // happened and there is no file anymore. - found_image = FindImageFilename(image_location, - image_isa, - &system_filename, - &has_system, - &cache_filename, - &dalvik_cache_exists, - &has_cache, - &is_global_cache); - } - if (is_zygote && !secondary_image) { MarkZygoteStart(image_isa, Runtime::Current()->GetZygoteMaxFailedBoots()); } @@ -529,6 +506,17 @@ ImageSpace* ImageSpace::CreateBootImage(const char* image_location, error_msg); } if (space != nullptr) { + // Check whether there is enough space left over in the data partition. Even if we can load + // the image, we need to be conservative, as some parts of the platform are not very tolerant + // of space constraints. + // ImageSpace doesn't know about the data partition per se, it relies on the FindImageFilename + // helper (which relies on GetDalvikCache). 
So for now, if we load an image out of /system, + // ignore the check (as it would test for free space in /system instead). + if (!is_system && !CheckSpace(*image_filename, error_msg)) { + // No. Delete the generated image and try to run out of the dex files. + PruneDalvikCache(image_isa); + return nullptr; + } return space; } diff --git a/runtime/gc/space/image_space_fs.h b/runtime/gc/space/image_space_fs.h index 8e852fa54b..fa941c0376 100644 --- a/runtime/gc/space/image_space_fs.h +++ b/runtime/gc/space/image_space_fs.h @@ -79,115 +79,6 @@ static void DeleteDirectoryContents(const std::string& dir, bool recurse) { CHECK_EQ(0, closedir(c_dir)) << "Unable to close directory."; } -static bool HasContent(const char* dir) { - if (!OS::DirectoryExists(dir)) { - return false; - } - DIR* c_dir = opendir(dir); - if (c_dir == nullptr) { - PLOG(WARNING) << "Unable to open " << dir << " to delete it if empty"; - return false; - } - - for (struct dirent* de = readdir(c_dir); de != nullptr; de = readdir(c_dir)) { - const char* name = de->d_name; - if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0) { - continue; - } - // Something here. - CHECK_EQ(0, closedir(c_dir)) << "Unable to close directory."; - return true; - } - CHECK_EQ(0, closedir(c_dir)) << "Unable to close directory."; - return false; -} - -// Delete this directory, if empty. Then repeat with the parents. Skips non-existing directories. -// If stop_at isn't null, the recursion will stop when a directory with the given name is found. -static void DeleteEmptyDirectoriesUpTo(const std::string& dir, const char* stop_at) { - if (HasContent(dir.c_str())) { - return; - } - if (stop_at != nullptr) { - // This check isn't precise, but good enough in practice. - if (EndsWith(dir, stop_at)) { - return; - } - } - if (OS::DirectoryExists(dir.c_str())) { - if (rmdir(dir.c_str()) != 0) { - PLOG(ERROR) << "Unable to rmdir " << dir; - return; - } - } - size_t last_slash = dir.rfind('/'); - if (last_slash != std::string::npos) { - DeleteEmptyDirectoriesUpTo(dir.substr(0, last_slash), stop_at); - } -} - -static void MoveOTAArtifacts(const char* src, const char* trg) { - DCHECK(OS::DirectoryExists(src)); - DCHECK(OS::DirectoryExists(trg)); - - if (HasContent(trg)) { - LOG(WARNING) << "We do not support merging caches, but the target isn't empty: " << src - << " to " << trg; - return; - } - - if (rename(src, trg) != 0) { - PLOG(ERROR) << "Could not rename OTA cache " << src << " to target " << trg; - } -} - -// This is some dlopen/dlsym and hardcoded data to avoid a dependency on libselinux. Make sure -// this stays in sync! -static bool RelabelOTAFiles(const std::string& dalvik_cache_dir) { - // We only expect selinux on devices. Don't even attempt this on the host. - if (!kIsTargetBuild) { - return true; - } - - // Custom deleter, so we can use std::unique_ptr. - struct HandleDeleter { - void operator()(void* in) { - if (in != nullptr && dlclose(in) != 0) { - PLOG(ERROR) << "Could not close selinux handle."; - } - } - }; - - // Look for selinux library. - std::unique_ptr<void, HandleDeleter> selinux_handle(dlopen("libselinux.so", RTLD_NOW)); - if (selinux_handle == nullptr) { - // Assume everything's OK if we can't open the library. - return true; - } - dlerror(); // Clean dlerror string. - - void* restorecon_ptr = dlsym(selinux_handle.get(), "selinux_android_restorecon"); - if (restorecon_ptr == nullptr) { - // Can't find the relabel function. That's bad. Make sure the zygote fails, as we have no - // other recourse to make this error obvious. 
- const char* error_string = dlerror(); - LOG(FATAL) << "Could not find selinux restorecon function: " - << ((error_string != nullptr) ? error_string : "(unknown error)"); - UNREACHABLE(); - } - - using RestoreconFn = int (*)(const char*, unsigned int); - constexpr unsigned int kRecursive = 4U; - - RestoreconFn restorecon_fn = reinterpret_cast<RestoreconFn>(restorecon_ptr); - if (restorecon_fn(dalvik_cache_dir.c_str(), kRecursive) != 0) { - LOG(ERROR) << "Failed to restorecon " << dalvik_cache_dir; - return false; - } - - return true; -} - } // namespace impl @@ -226,8 +117,21 @@ static void MarkZygoteStart(const InstructionSet isa, const uint32_t max_failed_ file.reset(OS::CreateEmptyFile(file_name)); if (file.get() == nullptr) { + int saved_errno = errno; PLOG(WARNING) << "Failed to create boot marker."; - return; + if (saved_errno != ENOSPC) { + return; + } + + LOG(WARNING) << "Pruning dalvik cache because of low-memory situation."; + impl::DeleteDirectoryContents(isa_subdir, false); + + // Try once more. + file.reset(OS::OpenFileReadWrite(file_name)); + if (file == nullptr) { + PLOG(WARNING) << "Failed to create boot marker."; + return; + } } } else { if (!file->ReadFully(&num_failed_boots, sizeof(num_failed_boots))) { @@ -262,53 +166,6 @@ static void MarkZygoteStart(const InstructionSet isa, const uint32_t max_failed_ } } -static void TryMoveOTAArtifacts(const std::string& cache_filename, bool dalvik_cache_exists) { - // We really assume here global means /data/dalvik-cache, and we'll inject 'ota.' Make sure - // that's true. - CHECK(StartsWith(cache_filename, "/data/dalvik-cache")) << cache_filename; - - // Inject ota subdirectory. - std::string ota_filename(cache_filename); - ota_filename = ota_filename.insert(strlen("/data/"), "ota/"); - CHECK(StartsWith(ota_filename, "/data/ota/dalvik-cache")) << ota_filename; - - // See if the file exists. - if (OS::FileExists(ota_filename.c_str())) { - VLOG(startup) << "OTA directory does exist, checking for artifacts"; - - size_t last_slash = ota_filename.rfind('/'); - CHECK_NE(last_slash, std::string::npos); - std::string ota_source_dir = ota_filename.substr(0, last_slash); - - // We need the dalvik cache now, really. - if (dalvik_cache_exists) { - size_t last_cache_slash = cache_filename.rfind('/'); - DCHECK_NE(last_cache_slash, std::string::npos); - std::string dalvik_cache_target_dir = cache_filename.substr(0, last_cache_slash); - - // First clean the target cache. - impl::DeleteDirectoryContents(dalvik_cache_target_dir.c_str(), false); - - // Now move things over. - impl::MoveOTAArtifacts(ota_source_dir.c_str(), dalvik_cache_target_dir.c_str()); - - // Last step: ensure the files have the right selinux label. - if (!impl::RelabelOTAFiles(dalvik_cache_target_dir)) { - // This isn't good. We potentially moved files, but they have the wrong label. Delete the - // files. - LOG(WARNING) << "Could not relabel files, must delete dalvik-cache."; - impl::DeleteDirectoryContents(dalvik_cache_target_dir.c_str(), false); - } - } - - // Cleanup. 
- impl::DeleteDirectoryContents(ota_source_dir.c_str(), true); - impl::DeleteEmptyDirectoriesUpTo(ota_source_dir, "ota"); - } else { - VLOG(startup) << "No OTA directory."; - } -} - } // namespace space } // namespace gc } // namespace art diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h index 96924722d8..716c23d1b0 100644 --- a/runtime/generated/asm_support_gen.h +++ b/runtime/generated/asm_support_gen.h @@ -98,6 +98,8 @@ DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_MARK_BIT_MASK_SHIFTED), (static_ DEFINE_CHECK_EQ(static_cast<size_t>(OBJECT_ALIGNMENT_MASK), (static_cast<size_t>(art::kObjectAlignment - 1))) #define OBJECT_ALIGNMENT_MASK_TOGGLED 0xfffffff8 DEFINE_CHECK_EQ(static_cast<uint32_t>(OBJECT_ALIGNMENT_MASK_TOGGLED), (static_cast<uint32_t>(~static_cast<uint32_t>(art::kObjectAlignment - 1)))) +#define OBJECT_ALIGNMENT_MASK_TOGGLED64 0xfffffffffffffff8 +DEFINE_CHECK_EQ(static_cast<uint64_t>(OBJECT_ALIGNMENT_MASK_TOGGLED64), (static_cast<uint64_t>(~static_cast<uint64_t>(art::kObjectAlignment - 1)))) #define ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE 128 DEFINE_CHECK_EQ(static_cast<int32_t>(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE), (static_cast<int32_t>((art::gc::allocator::RosAlloc::kMaxThreadLocalBracketSize)))) #define ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT 3 diff --git a/runtime/java_vm_ext.cc b/runtime/java_vm_ext.cc index c644cde5db..2401bec9f3 100644 --- a/runtime/java_vm_ext.cc +++ b/runtime/java_vm_ext.cc @@ -48,7 +48,7 @@ static size_t gGlobalsMax = 51200; // Arbitrary sanity check. (Must fit in 16 b static const size_t kWeakGlobalsInitial = 16; // Arbitrary. static const size_t kWeakGlobalsMax = 51200; // Arbitrary sanity check. (Must fit in 16 bits.) -static bool IsBadJniVersion(int version) { +bool JavaVMExt::IsBadJniVersion(int version) { // We don't support JNI_VERSION_1_1. These are the only other valid versions. return version != JNI_VERSION_1_2 && version != JNI_VERSION_1_4 && version != JNI_VERSION_1_6; } @@ -344,13 +344,6 @@ class JII { } static jint GetEnv(JavaVM* vm, void** env, jint version) { - // GetEnv always returns a JNIEnv* for the most current supported JNI version, - // and unlike other calls that take a JNI version doesn't care if you supply - // JNI_VERSION_1_1, which we don't otherwise support. - if (IsBadJniVersion(version) && version != JNI_VERSION_1_1) { - LOG(ERROR) << "Bad JNI version passed to GetEnv: " << version; - return JNI_EVERSION; - } if (vm == nullptr || env == nullptr) { return JNI_ERR; } @@ -359,8 +352,8 @@ class JII { *env = nullptr; return JNI_EDETACHED; } - *env = thread->GetJniEnv(); - return JNI_OK; + JavaVMExt* raw_vm = reinterpret_cast<JavaVMExt*>(vm); + return raw_vm->HandleGetEnv(env, version); } private: @@ -388,7 +381,7 @@ class JII { const char* thread_name = nullptr; jobject thread_group = nullptr; if (args != nullptr) { - if (IsBadJniVersion(args->version)) { + if (JavaVMExt::IsBadJniVersion(args->version)) { LOG(ERROR) << "Bad JNI version passed to " << (as_daemon ? 
"AttachCurrentThreadAsDaemon" : "AttachCurrentThread") << ": " << args->version; @@ -436,7 +429,8 @@ JavaVMExt::JavaVMExt(Runtime* runtime, const RuntimeArgumentMap& runtime_options weak_globals_lock_("JNI weak global reference table lock", kJniWeakGlobalsLock), weak_globals_(kWeakGlobalsInitial, kWeakGlobalsMax, kWeakGlobal), allow_accessing_weak_globals_(true), - weak_globals_add_condition_("weak globals add condition", weak_globals_lock_) { + weak_globals_add_condition_("weak globals add condition", weak_globals_lock_), + env_hooks_() { functions = unchecked_functions_; SetCheckJniEnabled(runtime_options.Exists(RuntimeArgumentMap::CheckJni)); } @@ -444,6 +438,26 @@ JavaVMExt::JavaVMExt(Runtime* runtime, const RuntimeArgumentMap& runtime_options JavaVMExt::~JavaVMExt() { } +jint JavaVMExt::HandleGetEnv(/*out*/void** env, jint version) { + for (GetEnvHook hook : env_hooks_) { + jint res = hook(this, env, version); + if (res == JNI_OK) { + return JNI_OK; + } else if (res != JNI_EVERSION) { + LOG(ERROR) << "Error returned from a plugin GetEnv handler! " << res; + return res; + } + } + LOG(ERROR) << "Bad JNI version passed to GetEnv: " << version; + return JNI_EVERSION; +} + +// Add a hook to handle getting environments from the GetEnv call. +void JavaVMExt::AddEnvironmentHook(GetEnvHook hook) { + CHECK(hook != nullptr) << "environment hooks shouldn't be null!"; + env_hooks_.push_back(hook); +} + void JavaVMExt::JniAbort(const char* jni_function_name, const char* msg) { Thread* self = Thread::Current(); ScopedObjectAccess soa(self); @@ -866,7 +880,7 @@ bool JavaVMExt::LoadNativeLibrary(JNIEnv* env, if (version == JNI_ERR) { StringAppendF(error_msg, "JNI_ERR returned from JNI_OnLoad in \"%s\"", path.c_str()); - } else if (IsBadJniVersion(version)) { + } else if (JavaVMExt::IsBadJniVersion(version)) { StringAppendF(error_msg, "Bad JNI version returned from JNI_OnLoad in \"%s\": %d", path.c_str(), version); // It's unwise to call dlclose() here, but we can mark it @@ -939,7 +953,7 @@ void JavaVMExt::VisitRoots(RootVisitor* visitor) { extern "C" jint JNI_CreateJavaVM(JavaVM** p_vm, JNIEnv** p_env, void* vm_args) { ScopedTrace trace(__FUNCTION__); const JavaVMInitArgs* args = static_cast<JavaVMInitArgs*>(vm_args); - if (IsBadJniVersion(args->version)) { + if (JavaVMExt::IsBadJniVersion(args->version)) { LOG(ERROR) << "Bad JNI version passed to CreateJavaVM: " << args->version; return JNI_EVERSION; } diff --git a/runtime/java_vm_ext.h b/runtime/java_vm_ext.h index 3d055cd7ce..ed9d3abfe2 100644 --- a/runtime/java_vm_ext.h +++ b/runtime/java_vm_ext.h @@ -36,6 +36,10 @@ class ParsedOptions; class Runtime; struct RuntimeArgumentMap; +class JavaVMExt; +// Hook definition for runtime plugins. +using GetEnvHook = jint (*)(JavaVMExt* vm, /*out*/void** new_env, jint version); + class JavaVMExt : public JavaVM { public: JavaVMExt(Runtime* runtime, const RuntimeArgumentMap& runtime_options); @@ -171,6 +175,12 @@ class JavaVMExt : public JavaVM { void TrimGlobals() SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!globals_lock_); + jint HandleGetEnv(/*out*/void** env, jint version); + + void AddEnvironmentHook(GetEnvHook hook); + + static bool IsBadJniVersion(int version); + private: // Return true if self can currently access weak globals. 
bool MayAccessWeakGlobalsUnlocked(Thread* self) const SHARED_REQUIRES(Locks::mutator_lock_); @@ -215,6 +225,9 @@ class JavaVMExt : public JavaVM { Atomic<bool> allow_accessing_weak_globals_; ConditionVariable weak_globals_add_condition_ GUARDED_BY(weak_globals_lock_); + // TODO Maybe move this to Runtime. + std::vector<GetEnvHook> env_hooks_; + DISALLOW_COPY_AND_ASSIGN(JavaVMExt); }; diff --git a/runtime/jit/profile_saver.cc b/runtime/jit/profile_saver.cc index 5a469e51b4..b35c958b0b 100644 --- a/runtime/jit/profile_saver.cc +++ b/runtime/jit/profile_saver.cc @@ -176,14 +176,13 @@ void ProfileSaver::NotifyJitActivityInternal() { MutexLock wait_mutex(Thread::Current(), wait_lock_); if ((NanoTime() - last_time_ns_saver_woke_up_) > MsToNs(options_.GetMinSavePeriodMs())) { WakeUpSaver(); + } else if (jit_activity_notifications_ > options_.GetMaxNotificationBeforeWake()) { + // Make sure to wake up the saver if we see a spike in the number of notifications. + // This is a precaution to avoid losing a big number of methods in case + // this is a spike with no jit after. + total_number_of_hot_spikes_++; + WakeUpSaver(); } - } else if (jit_activity_notifications_ > options_.GetMaxNotificationBeforeWake()) { - // Make sure to wake up the saver if we see a spike in the number of notifications. - // This is a precaution to avoid "loosing" a big number of methods in case - // this is a spike with no jit after. - total_number_of_hot_spikes_++; - MutexLock wait_mutex(Thread::Current(), wait_lock_); - WakeUpSaver(); } } diff --git a/runtime/jni_env_ext.cc b/runtime/jni_env_ext.cc index 1ee1611ef7..40efc898b8 100644 --- a/runtime/jni_env_ext.cc +++ b/runtime/jni_env_ext.cc @@ -45,6 +45,20 @@ static bool CheckLocalsValid(JNIEnvExt* in) NO_THREAD_SAFETY_ANALYSIS { return in->locals.IsValid(); } +jint JNIEnvExt::GetEnvHandler(JavaVMExt* vm, /*out*/void** env, jint version) { + UNUSED(vm); + // GetEnv always returns a JNIEnv* for the most current supported JNI version, + // and unlike other calls that take a JNI version doesn't care if you supply + // JNI_VERSION_1_1, which we don't otherwise support. 
+ if (JavaVMExt::IsBadJniVersion(version) && version != JNI_VERSION_1_1) { + return JNI_EVERSION; + } + Thread* thread = Thread::Current(); + CHECK(thread != nullptr); + *env = thread->GetJniEnv(); + return JNI_OK; +} + JNIEnvExt* JNIEnvExt::Create(Thread* self_in, JavaVMExt* vm_in) { std::unique_ptr<JNIEnvExt> ret(new JNIEnvExt(self_in, vm_in)); if (CheckLocalsValid(ret.get())) { diff --git a/runtime/jni_env_ext.h b/runtime/jni_env_ext.h index d4accc342b..ac287d488a 100644 --- a/runtime/jni_env_ext.h +++ b/runtime/jni_env_ext.h @@ -54,6 +54,8 @@ struct JNIEnvExt : public JNIEnv { static Offset LocalRefCookieOffset(size_t pointer_size); static Offset SelfOffset(size_t pointer_size); + static jint GetEnvHandler(JavaVMExt* vm, /*out*/void** out, jint version); + jobject NewLocalRef(mirror::Object* obj) SHARED_REQUIRES(Locks::mutator_lock_); void DeleteLocalRef(jobject obj) SHARED_REQUIRES(Locks::mutator_lock_); diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc index c7e4f8b343..174da79030 100644 --- a/runtime/parsed_options.cc +++ b/runtime/parsed_options.cc @@ -23,6 +23,7 @@ #include "gc/heap.h" #include "monitor.h" #include "runtime.h" +#include "ti/agent.h" #include "trace.h" #include "utils.h" @@ -90,6 +91,13 @@ std::unique_ptr<RuntimeParser> ParsedOptions::MakeParser(bool ignore_unrecognize .Define({"-Xrunjdwp:_", "-agentlib:jdwp=_"}) .WithType<JDWP::JdwpOptions>() .IntoKey(M::JdwpOptions) + // TODO Re-enable -agentlib: once I have a good way to transform the values. + // .Define("-agentlib:_") + // .WithType<std::vector<ti::Agent>>().AppendValues() + // .IntoKey(M::AgentLib) + .Define("-agentpath:_") + .WithType<std::vector<ti::Agent>>().AppendValues() + .IntoKey(M::AgentPath) .Define("-Xms_") .WithType<MemoryKiB>() .IntoKey(M::MemoryInitialSize) @@ -289,6 +297,9 @@ std::unique_ptr<RuntimeParser> ParsedOptions::MakeParser(bool ignore_unrecognize .IntoKey(M::Experimental) .Define("-Xforce-nb-testing") .IntoKey(M::ForceNativeBridge) + .Define("-Xplugin:_") + .WithType<std::vector<Plugin>>().AppendValues() + .IntoKey(M::Plugins) .Ignore({ "-ea", "-da", "-enableassertions", "-disableassertions", "--runtime-arg", "-esa", "-dsa", "-enablesystemassertions", "-disablesystemassertions", "-Xrs", "-Xint:_", @@ -583,6 +594,42 @@ bool ParsedOptions::DoParse(const RuntimeOptions& options, args.Set(M::HeapGrowthLimit, args.GetOrDefault(M::MemoryMaximumSize)); } + if (args.GetOrDefault(M::Experimental) & ExperimentalFlags::kRuntimePlugins) { + LOG(WARNING) << "Experimental runtime plugin support has been enabled. No guarantees are made " + << "about stability or usage of this plugin support. Use at your own risk. Do " + << "not attempt to write shipping code that relies on the implementation of " + << "runtime plugins."; + } else if (!args.GetOrDefault(M::Plugins).empty()) { + LOG(WARNING) << "Experimental runtime plugin support has not been enabled. Ignored options: "; + for (auto& op : args.GetOrDefault(M::Plugins)) { + LOG(WARNING) << " -plugin:" << op.GetLibrary(); + } + } + + if (args.GetOrDefault(M::Experimental) & ExperimentalFlags::kAgents) { + LOG(WARNING) << "Experimental runtime agent support has been enabled. No guarantees are made " + << "the completeness, accuracy, reliability, or stability of the agent " + << "implementation. Use at your own risk. 
Do not attempt to write shipping code " + << "that relies on the implementation of any part of this api."; + } else if (!args.GetOrDefault(M::AgentLib).empty() || !args.GetOrDefault(M::AgentPath).empty()) { + LOG(WARNING) << "agent support has not been enabled. Enable experimental agent " + << " support with '-XExperimental:agent'. Ignored options are:"; + for (auto op : args.GetOrDefault(M::AgentLib)) { + if (op.HasArgs()) { + LOG(WARNING) << " -agentlib:" << op.GetName() << "=" << op.GetArgs(); + } else { + LOG(WARNING) << " -agentlib:" << op.GetName(); + } + } + for (auto op : args.GetOrDefault(M::AgentPath)) { + if (op.HasArgs()) { + LOG(WARNING) << " -agentpath:" << op.GetName() << "=" << op.GetArgs(); + } else { + LOG(WARNING) << " -agentpath:" << op.GetName(); + } + } + } + *runtime_options = std::move(args); return true; } @@ -627,6 +674,11 @@ void ParsedOptions::Usage(const char* fmt, ...) { UsageMessage(stream, " -showversion\n"); UsageMessage(stream, " -help\n"); UsageMessage(stream, " -agentlib:jdwp=options\n"); + // TODO add back in once -agentlib actually does something. + // UsageMessage(stream, " -agentlib:library=options (Experimental feature, " + // "requires -Xexperimental:agent, some features might not be supported)\n"); + UsageMessage(stream, " -agentpath:library_path=options (Experimental feature, " + "requires -Xexperimental:agent, some features might not be supported)\n"); UsageMessage(stream, "\n"); UsageMessage(stream, "The following extended options are supported:\n"); @@ -703,6 +755,12 @@ void ParsedOptions::Usage(const char* fmt, ...) { UsageMessage(stream, " -X[no]image-dex2oat (Whether to create and use a boot image)\n"); UsageMessage(stream, " -Xno-dex-file-fallback " "(Don't fall back to dex files without oat files)\n"); + UsageMessage(stream, " -Xplugin:<library.so> " + "(Load a runtime plugin, requires -Xexperimental:runtime-plugins)\n"); + UsageMessage(stream, " -Xexperimental:runtime-plugins" + "(Enable new and experimental agent support)\n"); + UsageMessage(stream, " -Xexperimental:agents" + "(Enable new and experimental agent support)\n"); UsageMessage(stream, "\n"); UsageMessage(stream, "The following previously supported Dalvik options are ignored:\n"); diff --git a/runtime/plugin.cc b/runtime/plugin.cc new file mode 100644 index 0000000000..481b1caa15 --- /dev/null +++ b/runtime/plugin.cc @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "plugin.h" + +#include <dlfcn.h> +#include "base/stringprintf.h" +#include "base/logging.h" + +namespace art { + +const char* PLUGIN_INITIALIZATION_FUNCTION_NAME = "ArtPlugin_Initialize"; +const char* PLUGIN_DEINITIALIZATION_FUNCTION_NAME = "ArtPlugin_Deinitialize"; + +Plugin::Plugin(const Plugin& other) : library_(other.library_), dlopen_handle_(nullptr) { + if (other.IsLoaded()) { + std::string err; + Load(&err); + } +} + +bool Plugin::Load(/*out*/std::string* error_msg) { + DCHECK(!IsLoaded()); + void* res = dlopen(library_.c_str(), RTLD_LAZY); + if (res == nullptr) { + *error_msg = StringPrintf("dlopen failed: %s", dlerror()); + return false; + } + // Get the initializer function + PluginInitializationFunction init = reinterpret_cast<PluginInitializationFunction>( + dlsym(res, PLUGIN_INITIALIZATION_FUNCTION_NAME)); + if (init != nullptr) { + if (!init()) { + dlclose(res); + *error_msg = StringPrintf("Initialization of plugin failed"); + return false; + } + } else { + LOG(WARNING) << this << " does not include an initialization function"; + } + dlopen_handle_ = res; + return true; +} + +bool Plugin::Unload() { + DCHECK(IsLoaded()); + bool ret = true; + void* handle = dlopen_handle_; + PluginDeinitializationFunction deinit = reinterpret_cast<PluginDeinitializationFunction>( + dlsym(handle, PLUGIN_DEINITIALIZATION_FUNCTION_NAME)); + if (deinit != nullptr) { + if (!deinit()) { + LOG(WARNING) << this << " failed deinitialization"; + ret = false; + } + } else { + LOG(WARNING) << this << " does not include a deinitialization function"; + } + dlopen_handle_ = nullptr; + if (dlclose(handle) != 0) { + LOG(ERROR) << this << " failed to dlclose: " << dlerror(); + ret = false; + } + return ret; +} + +std::ostream& operator<<(std::ostream &os, const Plugin* m) { + return os << *m; +} + +std::ostream& operator<<(std::ostream &os, Plugin const& m) { + return os << "Plugin { library=\"" << m.library_ << "\", handle=" << m.dlopen_handle_ << " }"; +} + +} // namespace art diff --git a/runtime/plugin.h b/runtime/plugin.h new file mode 100644 index 0000000000..18f3977bd5 --- /dev/null +++ b/runtime/plugin.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_RUNTIME_PLUGIN_H_ +#define ART_RUNTIME_PLUGIN_H_ + +#include <string> +#include "base/logging.h" + +namespace art { + +// This function is loaded from the plugin (if present) and called during runtime initialization. +// By the time this has been called the runtime has been fully initialized but not other native +// libraries have been loaded yet. Failure to initialize is considered a fatal error. +// TODO might want to give initialization function some arguments +using PluginInitializationFunction = bool (*)(); +using PluginDeinitializationFunction = bool (*)(); + +// A class encapsulating a plugin. There is no stable plugin ABI or API and likely never will be. 
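
[Editor's aside] The entry points a plugin exports are the two functions named in plugin.cc above, ArtPlugin_Initialize and ArtPlugin_Deinitialize, both returning bool; Plugin::Load()/Plugin::Unload() look them up with dlsym and treat a false return as failure. Test 900 later in this change exercises exactly this interface. A minimal, hedged sketch of such a shared library (file and message names are hypothetical):

// plugin_example.cc - hypothetical minimal runtime plugin, built as a shared
// library and loaded with -Xplugin:<library.so> (requires
// -Xexperimental:runtime-plugins).
#include <cstdio>

extern "C" bool ArtPlugin_Initialize() {
  // Called once the runtime is largely initialized, before other native
  // libraries are loaded; a real plugin could register a GetEnv hook here.
  std::printf("example plugin initialized\n");
  return true;  // Returning false makes Plugin::Load() fail.
}

extern "C" bool ArtPlugin_Deinitialize() {
  std::printf("example plugin deinitialized\n");
  return true;  // Returning false is logged as a failed deinitialization.
}
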
+// TODO Might want to put some locking in this but ATM we only load these at initialization in a +// single-threaded fashion so not much need +class Plugin { + public: + static Plugin Create(std::string lib) { + return Plugin(lib); + } + + bool IsLoaded() const { + return dlopen_handle_ != nullptr; + } + + const std::string& GetLibrary() const { + return library_; + } + + bool Load(/*out*/std::string* error_msg); + bool Unload(); + + + ~Plugin() { + if (IsLoaded() && !Unload()) { + LOG(ERROR) << "Error unloading " << this; + } + } + + Plugin(const Plugin& other); + + // Create move constructor for putting this in a list + Plugin(Plugin&& other) + : library_(other.library_), + dlopen_handle_(other.dlopen_handle_) { + other.dlopen_handle_ = nullptr; + } + + private: + explicit Plugin(std::string library) : library_(library), dlopen_handle_(nullptr) { } + + std::string library_; + void* dlopen_handle_; + + friend std::ostream& operator<<(std::ostream &os, Plugin const& m); +}; + +std::ostream& operator<<(std::ostream &os, Plugin const& m); +std::ostream& operator<<(std::ostream &os, const Plugin* m); + +} // namespace art + +#endif // ART_RUNTIME_PLUGIN_H_ diff --git a/runtime/runtime.cc b/runtime/runtime.cc index 68fa0d32be..ddcfb6d5aa 100644 --- a/runtime/runtime.cc +++ b/runtime/runtime.cc @@ -130,6 +130,7 @@ #include "signal_set.h" #include "thread.h" #include "thread_list.h" +#include "ti/agent.h" #include "trace.h" #include "transaction.h" #include "utils.h" @@ -281,6 +282,16 @@ Runtime::~Runtime() { jit_->StopProfileSaver(); } + // TODO Maybe do some locking. + for (auto& agent : agents_) { + agent.Unload(); + } + + // TODO Maybe do some locking + for (auto& plugin : plugins_) { + plugin.Unload(); + } + // Make sure our internal threads are dead before we start tearing down things they're using. Dbg::StopJdwp(); delete signal_catcher_; @@ -960,6 +971,16 @@ bool Runtime::Init(RuntimeArgumentMap&& runtime_options_in) { experimental_flags_ = runtime_options.GetOrDefault(Opt::Experimental); is_low_memory_mode_ = runtime_options.Exists(Opt::LowMemoryMode); + if (experimental_flags_ & ExperimentalFlags::kRuntimePlugins) { + plugins_ = runtime_options.ReleaseOrDefault(Opt::Plugins); + } + if (experimental_flags_ & ExperimentalFlags::kAgents) { + agents_ = runtime_options.ReleaseOrDefault(Opt::AgentPath); + // TODO Add back in -agentlib + // for (auto lib : runtime_options.ReleaseOrDefault(Opt::AgentLib)) { + // agents_.push_back(lib); + // } + } XGcOption xgc_option = runtime_options.GetOrDefault(Opt::GcOption); heap_ = new gc::Heap(runtime_options.GetOrDefault(Opt::MemoryInitialSize), runtime_options.GetOrDefault(Opt::HeapGrowthLimit), @@ -1084,6 +1105,10 @@ bool Runtime::Init(RuntimeArgumentMap&& runtime_options_in) { java_vm_ = new JavaVMExt(this, runtime_options); + // Add the JniEnv handler. + // TODO Refactor this stuff. + java_vm_->AddEnvironmentHook(JNIEnvExt::GetEnvHandler); + Thread::Startup(); // ClassLinker needs an attached thread, but we can't fully attach a thread without creating @@ -1200,6 +1225,16 @@ bool Runtime::Init(RuntimeArgumentMap&& runtime_options_in) { pre_allocated_NoClassDefFoundError_ = GcRoot<mirror::Throwable>(self->GetException()); self->ClearException(); + // Runtime initialization is largely done now. + // We load plugins first since that can modify the runtime state slightly. 
+ // Load all plugins + for (auto& plugin : plugins_) { + std::string err; + if (!plugin.Load(&err)) { + LOG(FATAL) << plugin << " failed to load: " << err; + } + } + // Look for a native bridge. // // The intended flow here is, in the case of a running system: @@ -1232,6 +1267,20 @@ bool Runtime::Init(RuntimeArgumentMap&& runtime_options_in) { is_native_bridge_loaded_ = LoadNativeBridge(native_bridge_file_name); } + // Startup agents + // TODO Maybe we should start a new thread to run these on. Investigate RI behavior more. + for (auto& agent : agents_) { + // TODO Check err + int res = 0; + std::string err = ""; + ti::Agent::LoadError result = agent.Load(&res, &err); + if (result == ti::Agent::kInitializationError) { + LOG(FATAL) << "Unable to initialize agent!"; + } else if (result != ti::Agent::kNoError) { + LOG(ERROR) << "Unable to load an agent: " << err; + } + } + VLOG(startup) << "Runtime::Init exiting"; return true; diff --git a/runtime/runtime.h b/runtime/runtime.h index c971646195..6da60f27a3 100644 --- a/runtime/runtime.h +++ b/runtime/runtime.h @@ -63,6 +63,9 @@ namespace mirror { class String; class Throwable; } // namespace mirror +namespace ti { + class Agent; +} // namespace ti namespace verifier { class MethodVerifier; enum class VerifyMode : int8_t; @@ -80,6 +83,7 @@ class MonitorList; class MonitorPool; class NullPointerHandler; class OatFileManager; +class Plugin; struct RuntimeArgumentMap; class SignalCatcher; class StackOverflowHandler; @@ -698,6 +702,9 @@ class Runtime { std::string class_path_string_; std::vector<std::string> properties_; + std::vector<ti::Agent> agents_; + std::vector<Plugin> plugins_; + // The default stack size for managed threads created by the runtime. size_t default_stack_size_; diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def index b95dfad550..146afc7ad8 100644 --- a/runtime/runtime_options.def +++ b/runtime/runtime_options.def @@ -117,7 +117,10 @@ RUNTIME_OPTIONS_KEY (unsigned int, ZygoteMaxFailedBoots, 10) RUNTIME_OPTIONS_KEY (Unit, NoDexFileFallback) RUNTIME_OPTIONS_KEY (std::string, CpuAbiList) RUNTIME_OPTIONS_KEY (std::string, Fingerprint) -RUNTIME_OPTIONS_KEY (ExperimentalFlags, Experimental, ExperimentalFlags::kNone) // -Xexperimental:{none} +RUNTIME_OPTIONS_KEY (ExperimentalFlags, Experimental, ExperimentalFlags::kNone) // -Xexperimental:{none, agents} +RUNTIME_OPTIONS_KEY (std::vector<ti::Agent>, AgentLib) // -agentlib:<libname>=<options>, Requires -Xexperimental:agents +RUNTIME_OPTIONS_KEY (std::vector<ti::Agent>, AgentPath) // -agentpath:<libname>=<options>, Requires -Xexperimental:agents +RUNTIME_OPTIONS_KEY (std::vector<Plugin>, Plugins) // -Xplugin:<library> Requires -Xexperimental:runtime-plugins // Not parse-able from command line, but can be provided explicitly. // (Do not add anything here that is defined in ParsedOptions::MakeParser) diff --git a/runtime/simulator/Android.mk b/runtime/simulator/Android.mk index 953a37733d..a34a84100a 100644 --- a/runtime/simulator/Android.mk +++ b/runtime/simulator/Android.mk @@ -88,9 +88,9 @@ define build-libart-simulator LOCAL_NATIVE_COVERAGE := $(ART_COVERAGE) # For simulator_arm64. 
ifeq ($$(art_ndebug_or_debug),debug) - LOCAL_SHARED_LIBRARIES += libvixl-arm64 + LOCAL_SHARED_LIBRARIES += libvixld-arm64 else - LOCAL_SHARED_LIBRARIES += libvixl-arm64 + LOCAL_SHARED_LIBRARIES += libvixl-arm64 endif ifeq ($$(art_target_or_host),target) include $(BUILD_SHARED_LIBRARY) diff --git a/runtime/simulator/code_simulator_arm64.h b/runtime/simulator/code_simulator_arm64.h index 69388b122c..59ea34fb80 100644 --- a/runtime/simulator/code_simulator_arm64.h +++ b/runtime/simulator/code_simulator_arm64.h @@ -20,10 +20,10 @@ #include "memory" #include "simulator/code_simulator.h" -// TODO: make vixl clean wrt -Wshadow. +// TODO(VIXL): Make VIXL compile with -Wshadow. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" -#include "a64/simulator-a64.h" +#include "aarch64/simulator-aarch64.h" #pragma GCC diagnostic pop namespace art { diff --git a/runtime/stack_map.h b/runtime/stack_map.h index 4647d67699..dd7e53100f 100644 --- a/runtime/stack_map.h +++ b/runtime/stack_map.h @@ -1050,7 +1050,7 @@ struct CodeInfoEncoding { inline_info_encoding = *reinterpret_cast<const InlineInfoEncoding*>(ptr); ptr += sizeof(InlineInfoEncoding); } else { - inline_info_encoding = InlineInfoEncoding{}; // NOLINT. + inline_info_encoding = InlineInfoEncoding{}; // NOLINT. } header_size = dchecked_integral_cast<uint8_t>(ptr - reinterpret_cast<const uint8_t*>(data)); } diff --git a/runtime/ti/agent.cc b/runtime/ti/agent.cc new file mode 100644 index 0000000000..41a21f70f3 --- /dev/null +++ b/runtime/ti/agent.cc @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "agent.h" +#include "java_vm_ext.h" +#include "runtime.h" + +namespace art { +namespace ti { + +const char* AGENT_ON_LOAD_FUNCTION_NAME = "Agent_OnLoad"; +const char* AGENT_ON_ATTACH_FUNCTION_NAME = "Agent_OnAttach"; +const char* AGENT_ON_UNLOAD_FUNCTION_NAME = "Agent_OnUnload"; + +Agent Agent::Create(std::string arg) { + size_t eq = arg.find_first_of('='); + if (eq == std::string::npos) { + return Agent(arg, ""); + } else { + return Agent(arg.substr(0, eq), arg.substr(eq + 1, arg.length())); + } +} + +// TODO We need to acquire some locks probably. +Agent::LoadError Agent::Load(/*out*/jint* call_res, /*out*/ std::string* error_msg) { + DCHECK(call_res != nullptr); + DCHECK(error_msg != nullptr); + if (IsStarted()) { + *error_msg = StringPrintf("the agent at %s has already been started!", name_.c_str()); + VLOG(agents) << "err: " << *error_msg; + return kAlreadyStarted; + } + LoadError err = DoDlOpen(error_msg); + if (err != kNoError) { + VLOG(agents) << "err: " << *error_msg; + return err; + } + if (onload_ == nullptr) { + *error_msg = StringPrintf("Unable to start agent %s: No Agent_OnLoad function found", + name_.c_str()); + VLOG(agents) << "err: " << *error_msg; + return kLoadingError; + } + // TODO Need to do some checks that we are at a good spot etc. 
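
[Editor's aside] Agent::Create() above splits the raw -agentpath value at the first '=': the part before it becomes the library name handed to dlopen and the rest becomes the argument string later forwarded to Agent_OnLoad. A small, hypothetical usage sketch (library name and options invented for illustration; assumes "ti/agent.h"):

// #include "ti/agent.h"
art::ti::Agent agent = art::ti::Agent::Create("libtiexample.so=opt1,opt2");
// agent.GetName() == "libtiexample.so"
// agent.GetArgs() == "opt1,opt2"
// agent.HasArgs() == true
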
+ *call_res = onload_(static_cast<JavaVM*>(Runtime::Current()->GetJavaVM()), + args_.c_str(), + nullptr); + if (*call_res != 0) { + *error_msg = StringPrintf("Initialization of %s returned non-zero value of %d", + name_.c_str(), *call_res); + VLOG(agents) << "err: " << *error_msg; + return kInitializationError; + } else { + return kNoError; + } +} + +Agent::LoadError Agent::DoDlOpen(/*out*/std::string* error_msg) { + DCHECK(error_msg != nullptr); + dlopen_handle_ = dlopen(name_.c_str(), RTLD_LAZY); + if (dlopen_handle_ == nullptr) { + *error_msg = StringPrintf("Unable to dlopen %s: %s", name_.c_str(), dlerror()); + return kLoadingError; + } + + onload_ = reinterpret_cast<AgentOnLoadFunction>(dlsym(dlopen_handle_, + AGENT_ON_LOAD_FUNCTION_NAME)); + if (onload_ == nullptr) { + VLOG(agents) << "Unable to find 'Agent_OnLoad' symbol in " << this; + } + onattach_ = reinterpret_cast<AgentOnAttachFunction>(dlsym(dlopen_handle_, + AGENT_ON_ATTACH_FUNCTION_NAME)); + if (onattach_ == nullptr) { + VLOG(agents) << "Unable to find 'Agent_OnAttach' symbol in " << this; + } + onunload_= reinterpret_cast<AgentOnUnloadFunction>(dlsym(dlopen_handle_, + AGENT_ON_UNLOAD_FUNCTION_NAME)); + if (onunload_ == nullptr) { + VLOG(agents) << "Unable to find 'Agent_OnUnload' symbol in " << this; + } + return kNoError; +} + +// TODO Lock some stuff probably. +void Agent::Unload() { + if (dlopen_handle_ != nullptr) { + if (onunload_ != nullptr) { + onunload_(Runtime::Current()->GetJavaVM()); + } + dlclose(dlopen_handle_); + dlopen_handle_ = nullptr; + } else { + VLOG(agents) << this << " is not currently loaded!"; + } +} + +Agent::Agent(const Agent& other) + : name_(other.name_), + args_(other.args_), + dlopen_handle_(other.dlopen_handle_), + onload_(other.onload_), + onattach_(other.onattach_), + onunload_(other.onunload_) { + if (other.dlopen_handle_ != nullptr) { + dlopen(other.name_.c_str(), 0); + } +} + +Agent::~Agent() { + if (dlopen_handle_ != nullptr) { + dlclose(dlopen_handle_); + } +} + +std::ostream& operator<<(std::ostream &os, const Agent* m) { + return os << *m; +} + +std::ostream& operator<<(std::ostream &os, Agent const& m) { + return os << "Agent { name=\"" << m.name_ << "\", args=\"" << m.args_ << "\", handle=" + << m.dlopen_handle_ << " }"; +} + +} // namespace ti +} // namespace art diff --git a/runtime/ti/agent.h b/runtime/ti/agent.h new file mode 100644 index 0000000000..521e21e4e4 --- /dev/null +++ b/runtime/ti/agent.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ART_RUNTIME_TI_AGENT_H_ +#define ART_RUNTIME_TI_AGENT_H_ + +#include <dlfcn.h> +#include <jni.h> // for jint, JavaVM* etc declarations + +#include "base/stringprintf.h" +#include "runtime.h" +#include "utils.h" + +namespace art { +namespace ti { + +using AgentOnLoadFunction = jint (*)(JavaVM*, const char*, void*); +using AgentOnAttachFunction = jint (*)(JavaVM*, const char*, void*); +using AgentOnUnloadFunction = void (*)(JavaVM*); + +class Agent { + public: + enum LoadError { + kNoError, // No error occurred.. + kAlreadyStarted, // The agent has already been loaded. + kLoadingError, // dlopen or dlsym returned an error. + kInitializationError, // The entrypoint did not return 0. This might require an abort. + }; + + bool IsStarted() const { + return dlopen_handle_ != nullptr; + } + + const std::string& GetName() const { + return name_; + } + + const std::string& GetArgs() const { + return args_; + } + + bool HasArgs() const { + return !GetArgs().empty(); + } + + // TODO We need to acquire some locks probably. + LoadError Load(/*out*/jint* call_res, /*out*/std::string* error_msg); + + // TODO We need to acquire some locks probably. + void Unload(); + + // Tries to attach the agent using its OnAttach method. Returns true on success. + // TODO We need to acquire some locks probably. + LoadError Attach(std::string* error_msg) { + // TODO + *error_msg = "Attach has not yet been implemented!"; + return kLoadingError; + } + + static Agent Create(std::string arg); + + static Agent Create(std::string name, std::string args) { + return Agent(name, args); + } + + ~Agent(); + + // We need move constructor and copy for vectors + Agent(const Agent& other); + + Agent(Agent&& other) + : name_(other.name_), + args_(other.args_), + dlopen_handle_(nullptr), + onload_(nullptr), + onattach_(nullptr), + onunload_(nullptr) { + other.dlopen_handle_ = nullptr; + other.onload_ = nullptr; + other.onattach_ = nullptr; + other.onunload_ = nullptr; + } + + // We don't need an operator= + void operator=(const Agent&) = delete; + + private: + Agent(std::string name, std::string args) + : name_(name), + args_(args), + dlopen_handle_(nullptr), + onload_(nullptr), + onattach_(nullptr), + onunload_(nullptr) { } + + LoadError DoDlOpen(/*out*/std::string* error_msg); + + const std::string name_; + const std::string args_; + void* dlopen_handle_; + + // The entrypoints. 
+ AgentOnLoadFunction onload_; + AgentOnAttachFunction onattach_; + AgentOnUnloadFunction onunload_; + + friend std::ostream& operator<<(std::ostream &os, Agent const& m); +}; + +std::ostream& operator<<(std::ostream &os, Agent const& m); +std::ostream& operator<<(std::ostream &os, const Agent* m); + +} // namespace ti +} // namespace art + +#endif // ART_RUNTIME_TI_AGENT_H_ + diff --git a/test/617-clinit-oome/expected.txt b/test/617-clinit-oome/expected.txt new file mode 100644 index 0000000000..c1d33ff9e6 --- /dev/null +++ b/test/617-clinit-oome/expected.txt @@ -0,0 +1 @@ +Filling heap diff --git a/test/617-clinit-oome/info.txt b/test/617-clinit-oome/info.txt new file mode 100644 index 0000000000..ece35b28cd --- /dev/null +++ b/test/617-clinit-oome/info.txt @@ -0,0 +1 @@ +Regression test for encoded static strings caussing OOME b/30690988 diff --git a/test/617-clinit-oome/src/Main.java b/test/617-clinit-oome/src/Main.java new file mode 100644 index 0000000000..749a2325ef --- /dev/null +++ b/test/617-clinit-oome/src/Main.java @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class Main { + public static void main(String[] args) { + Class klass = Other.class; + Object[] data = new Object[100000]; + try { + System.out.println("Filling heap"); + int size = 256 * 1024 * 1024; + int index = 0; + while (true) { + try { + data[index] = new byte[size]; + index++; + } catch (OutOfMemoryError e) { + size /= 2; + if (size == 0) { + break; + } + } + } + // Initialize now that the heap is full. + Other.print(); + } catch (OutOfMemoryError e) { + } catch (Exception e) { + System.err.println(e); + } + } +} diff --git a/test/617-clinit-oome/src/Other.java b/test/617-clinit-oome/src/Other.java new file mode 100644 index 0000000000..20306ee4c4 --- /dev/null +++ b/test/617-clinit-oome/src/Other.java @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public final class Other { + public static final String string1 = "ABCDEFG1"; + public static final String string2 = "ABCDEFG2"; + public static final String string3 = "ABCDEFG3"; + public static final String string4 = "ABCDEFG4"; + public static final String string5 = "ABCDEFG5"; + public static final int int1 = 12; + + public static void print() { + System.out.println(string2); + } +} diff --git a/test/900-hello-plugin/build b/test/900-hello-plugin/build new file mode 100755 index 0000000000..898e2e54a2 --- /dev/null +++ b/test/900-hello-plugin/build @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Copyright 2016 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +./default-build "$@" --experimental agents diff --git a/test/900-hello-plugin/expected.txt b/test/900-hello-plugin/expected.txt new file mode 100644 index 0000000000..43db31c722 --- /dev/null +++ b/test/900-hello-plugin/expected.txt @@ -0,0 +1,8 @@ +ArtPlugin_Initialize called in test 900 +Agent_OnLoad called with options "test_900" +GetEnvHandler called in test 900 +GetEnvHandler called with version 0x900fffff +GetEnv returned '900' environment! +Hello, world! +Agent_OnUnload called +ArtPlugin_Deinitialize called in test 900 diff --git a/test/900-hello-plugin/info.txt b/test/900-hello-plugin/info.txt new file mode 100644 index 0000000000..47b15c2e6a --- /dev/null +++ b/test/900-hello-plugin/info.txt @@ -0,0 +1,2 @@ +Tests that agents and plugins are loaded. + diff --git a/test/900-hello-plugin/load_unload.cc b/test/900-hello-plugin/load_unload.cc new file mode 100644 index 0000000000..a38cc3d6ac --- /dev/null +++ b/test/900-hello-plugin/load_unload.cc @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <jni.h> +#include <stdio.h> + +#include "art_method-inl.h" +#include "base/logging.h" +#include "base/macros.h" + +namespace art { + +constexpr jint TEST_900_ENV_VERSION_NUMBER = 0x900FFFFF; +constexpr uintptr_t ENV_VALUE = 900; + +// Allow this library to be used as a plugin too so we can test the stack. 
+static jint GetEnvHandler(JavaVMExt* vm ATTRIBUTE_UNUSED, void** new_env, jint version) { + printf("%s called in test 900\n", __func__); + if (version != TEST_900_ENV_VERSION_NUMBER) { + return JNI_EVERSION; + } + printf("GetEnvHandler called with version 0x%x\n", version); + *new_env = reinterpret_cast<void*>(ENV_VALUE); + return JNI_OK; +} + +extern "C" bool ArtPlugin_Initialize() { + printf("%s called in test 900\n", __func__); + Runtime::Current()->GetJavaVM()->AddEnvironmentHook(GetEnvHandler); + return true; +} + +extern "C" bool ArtPlugin_Deinitialize() { + printf("%s called in test 900\n", __func__); + return true; +} + +extern "C" JNIEXPORT jint JNICALL Agent_OnLoad(JavaVM* vm, + char* options, + void* reserved ATTRIBUTE_UNUSED) { + printf("Agent_OnLoad called with options \"%s\"\n", options); + uintptr_t env = 0; + jint res = vm->GetEnv(reinterpret_cast<void**>(&env), TEST_900_ENV_VERSION_NUMBER); + if (res != JNI_OK) { + printf("GetEnv(TEST_900_ENV_VERSION_NUMBER) returned non-zero\n"); + } + printf("GetEnv returned '%" PRIdPTR "' environment!\n", env); + return 0; +} + +extern "C" JNIEXPORT void JNICALL Agent_OnUnload(JavaVM* vm ATTRIBUTE_UNUSED) { + printf("Agent_OnUnload called\n"); +} + +} // namespace art diff --git a/test/900-hello-plugin/run b/test/900-hello-plugin/run new file mode 100755 index 0000000000..35b08715a1 --- /dev/null +++ b/test/900-hello-plugin/run @@ -0,0 +1,24 @@ +#!/bin/bash +# +# Copyright 2016 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +plugin=libartagentd.so +if [[ "$@" == *"-O"* ]]; then + plugin=libartagent.so +fi +./default-run "$@" --experimental agents \ + --experimental runtime-plugins \ + --runtime-option -agentpath:${plugin}=test_900 \ + --android-runtime-option -Xplugin:${plugin} diff --git a/test/900-hello-plugin/src/Main.java b/test/900-hello-plugin/src/Main.java new file mode 100644 index 0000000000..1ef6289559 --- /dev/null +++ b/test/900-hello-plugin/src/Main.java @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class Main { + public static void main(String[] args) { + System.out.println("Hello, world!"); + } +} diff --git a/test/Android.libartagent.mk b/test/Android.libartagent.mk new file mode 100644 index 0000000000..729de3f7ae --- /dev/null +++ b/test/Android.libartagent.mk @@ -0,0 +1,101 @@ +# +# Copyright (C) 2016 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +LOCAL_PATH := $(call my-dir) + +include art/build/Android.common_build.mk + +LIBARTAGENT_COMMON_SRC_FILES := \ + 900-hello-plugin/load_unload.cc + +# $(1): target or host +# $(2): debug or <empty> +define build-libartagent + ifneq ($(1),target) + ifneq ($(1),host) + $$(error expected target or host for argument 1, received $(1)) + endif + endif + ifneq ($(2),debug) + ifneq ($(2),) + $$(error d or empty for argument 2, received $(2)) + endif + suffix := d + else + suffix := + endif + + art_target_or_host := $(1) + + include $(CLEAR_VARS) + LOCAL_CPP_EXTENSION := $(ART_CPP_EXTENSION) + LOCAL_MODULE := libartagent$$(suffix) + ifeq ($$(art_target_or_host),target) + LOCAL_MODULE_TAGS := tests + endif + LOCAL_SRC_FILES := $(LIBARTAGENT_COMMON_SRC_FILES) + LOCAL_SHARED_LIBRARIES += libart$$(suffix) libbacktrace libnativehelper + LOCAL_C_INCLUDES += $(ART_C_INCLUDES) art/runtime + LOCAL_ADDITIONAL_DEPENDENCIES := art/build/Android.common_build.mk + LOCAL_ADDITIONAL_DEPENDENCIES += $(LOCAL_PATH)/Android.libartagent.mk + ifeq ($$(art_target_or_host),target) + $(call set-target-local-clang-vars) + ifeq ($$(suffix),d) + $(call set-target-local-cflags-vars,debug) + else + $(call set-target-local-cflags-vars,ndebug) + endif + LOCAL_SHARED_LIBRARIES += libdl + LOCAL_MULTILIB := both + LOCAL_MODULE_PATH_32 := $(ART_TARGET_TEST_OUT)/$(ART_TARGET_ARCH_32) + LOCAL_MODULE_PATH_64 := $(ART_TARGET_TEST_OUT)/$(ART_TARGET_ARCH_64) + LOCAL_MODULE_TARGET_ARCH := $(ART_SUPPORTED_ARCH) + include $(BUILD_SHARED_LIBRARY) + else # host + LOCAL_CLANG := $(ART_HOST_CLANG) + LOCAL_CFLAGS := $(ART_HOST_CFLAGS) + LOCAL_ASFLAGS := $(ART_HOST_ASFLAGS) + ifeq ($$(suffix),d) + LOCAL_CFLAGS += $(ART_HOST_DEBUG_CFLAGS) + LOCAL_ASFLAGS += $(ART_HOST_DEBUG_ASFLAGS) + else + LOCAL_CFLAGS += $(ART_HOST_NON_DEBUG_CFLAGS) + LOCAL_ASFLAGS += $(ART_HOST_NON_DEBUG_ASFLAGS) + endif + LOCAL_LDLIBS := $(ART_HOST_LDLIBS) -ldl -lpthread + LOCAL_IS_HOST_MODULE := true + LOCAL_MULTILIB := both + include $(BUILD_HOST_SHARED_LIBRARY) + endif + + # Clear locally used variables. + art_target_or_host := + suffix := +endef + +ifeq ($(ART_BUILD_TARGET),true) + $(eval $(call build-libartagent,target,)) + $(eval $(call build-libartagent,target,debug)) +endif +ifeq ($(ART_BUILD_HOST),true) + $(eval $(call build-libartagent,host,)) + $(eval $(call build-libartagent,host,debug)) +endif + +# Clear locally used variables. 
+LOCAL_PATH := +LIBARTAGENT_COMMON_SRC_FILES := diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk index bba6f8e721..b87e142811 100644 --- a/test/Android.run-test.mk +++ b/test/Android.run-test.mk @@ -555,10 +555,13 @@ TEST_ART_BROKEN_INTERPRETER_READ_BARRIER_RUN_TESTS := # Tests that should fail in the read barrier configuration with the Optimizing compiler (AOT). # 484: Baker's fast path based read barrier compiler instrumentation generates code containing # more parallel moves on x86, thus some Checker assertions may fail. +# 527: On ARM64 and ARM, the read barrier instrumentation does not support the HIntermediateAddress +# instruction yet (b/26601270). # 537: Expects an array copy to be intrinsified on x86-64, but calling-on-slowpath intrinsics are # not yet handled in the read barrier configuration. TEST_ART_BROKEN_OPTIMIZING_READ_BARRIER_RUN_TESTS := \ 484-checker-register-hints \ + 527-checker-array-access-split \ 537-checker-arraycopy # Tests that should fail in the read barrier configuration with JIT (Optimizing compiler). @@ -653,6 +656,14 @@ $(foreach target, $(TARGET_TYPES), \ # only once). TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_EXECUTABLES) $(TARGET_CORE_IMG_OUTS) +# Also need libartagent. +TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libartagent.so +TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libartagentd.so +ifdef TARGET_2ND_ARCH +TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_TEST_OUT)/$(TARGET_2ND_ARCH)/libartagent.so +TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_TEST_OUT)/$(TARGET_2ND_ARCH)/libartagentd.so +endif + # Also need libarttest. TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libarttest.so TEST_ART_TARGET_SYNC_DEPS += $(ART_TARGET_TEST_OUT)/$(TARGET_ARCH)/libarttestd.so @@ -671,6 +682,8 @@ endif # specific version depending on the compiler. 
ART_TEST_HOST_RUN_TEST_DEPENDENCIES := \ $(ART_HOST_EXECUTABLES) \ + $(ART_HOST_OUT_SHARED_LIBRARIES)/libartagent$(ART_HOST_SHLIB_EXTENSION) \ + $(ART_HOST_OUT_SHARED_LIBRARIES)/libartagentd$(ART_HOST_SHLIB_EXTENSION) \ $(ART_HOST_OUT_SHARED_LIBRARIES)/libarttest$(ART_HOST_SHLIB_EXTENSION) \ $(ART_HOST_OUT_SHARED_LIBRARIES)/libarttestd$(ART_HOST_SHLIB_EXTENSION) \ $(ART_HOST_OUT_SHARED_LIBRARIES)/libnativebridgetest$(ART_HOST_SHLIB_EXTENSION) \ @@ -680,6 +693,8 @@ ART_TEST_HOST_RUN_TEST_DEPENDENCIES := \ ifneq ($(HOST_PREFER_32_BIT),true) ART_TEST_HOST_RUN_TEST_DEPENDENCIES += \ + $(2ND_ART_HOST_OUT_SHARED_LIBRARIES)/libartagent$(ART_HOST_SHLIB_EXTENSION) \ + $(2ND_ART_HOST_OUT_SHARED_LIBRARIES)/libartagentd$(ART_HOST_SHLIB_EXTENSION) \ $(2ND_ART_HOST_OUT_SHARED_LIBRARIES)/libarttest$(ART_HOST_SHLIB_EXTENSION) \ $(2ND_ART_HOST_OUT_SHARED_LIBRARIES)/libarttestd$(ART_HOST_SHLIB_EXTENSION) \ $(2ND_ART_HOST_OUT_SHARED_LIBRARIES)/libnativebridgetest$(ART_HOST_SHLIB_EXTENSION) \ @@ -1092,5 +1107,9 @@ ALL_ADDRESS_SIZES := RUN_TYPES := DEBUGGABLE_TYPES := -include $(LOCAL_PATH)/Android.libarttest.mk -include art/test/Android.libnativebridgetest.mk +MY_LOCAL_PATH := $(LOCAL_PATH) +include $(MY_LOCAL_PATH)/Android.libartagent.mk +include $(MY_LOCAL_PATH)/Android.libarttest.mk +include $(MY_LOCAL_PATH)/Android.libnativebridgetest.mk +MY_LOCAL_PATH := +LOCAL_PATH := diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar index c6c9380412..d12bd79b3a 100755 --- a/test/etc/run-test-jar +++ b/test/etc/run-test-jar @@ -21,6 +21,7 @@ DEX2OAT="" EXPERIMENTAL="" FALSE_BIN="/system/bin/false" FLAGS="" +ANDROID_FLAGS="" GDB="" GDB_ARGS="" GDB_SERVER="gdbserver" @@ -59,6 +60,9 @@ while true; do if [ "x$1" = "x--quiet" ]; then QUIET="y" shift + elif [ "x$1" = "x-O" ]; then + # Ignore this option. + shift elif [ "x$1" = "x--lib" ]; then shift if [ "x$1" = "x" ]; then @@ -93,6 +97,11 @@ while true; do FLAGS="${FLAGS} -Xcompiler-option $option" COMPILE_FLAGS="${COMPILE_FLAGS} $option" shift + elif [ "x$1" = "x--android-runtime-option" ]; then + shift + option="$1" + ANDROID_FLAGS="${ANDROID_FLAGS} $option" + shift elif [ "x$1" = "x--runtime-option" ]; then shift option="$1" @@ -233,6 +242,7 @@ while true; do done if [ "$USE_JVM" = "n" ]; then + FLAGS="${FLAGS} ${ANDROID_FLAGS}" for feature in ${EXPERIMENTAL}; do FLAGS="${FLAGS} -Xexperimental:${feature} -Xcompiler-option --runtime-arg -Xcompiler-option -Xexperimental:${feature}" COMPILE_FLAGS="${COMPILE_FLAGS} --runtime-arg -Xexperimental:${feature}" @@ -469,12 +479,12 @@ if [ "$HOST" = "n" ]; then adb push $TEST_NAME-ex.jar $DEX_LOCATION >/dev/null 2>&1 fi - LD_LIBRARY_PATH= + LD_LIBRARY_PATH=/data/art-test/$ISA if [ "$ANDROID_ROOT" != "/system" ]; then # Current default installation is dalvikvm 64bits and dex2oat 32bits, # so we can only use LD_LIBRARY_PATH when testing on a local # installation. 
- LD_LIBRARY_PATH=$ANDROID_ROOT/$LIBRARY_DIRECTORY + LD_LIBRARY_PATH=$ANDROID_ROOT/$LIBRARY_DIRECTORY:$LD_LIBRARY_PATH fi PUBLIC_LIBS=libart.so:libartd.so diff --git a/test/run-test b/test/run-test index edee4ae31f..621fc24a26 100755 --- a/test/run-test +++ b/test/run-test @@ -165,6 +165,7 @@ while true; do elif [ "x$1" = "x-O" ]; then lib="libart.so" testlib="arttest" + run_args="${run_args} -O" shift elif [ "x$1" = "x--dalvik" ]; then lib="libdvm.so" diff --git a/tools/cpp-define-generator/constant_globals.def b/tools/cpp-define-generator/constant_globals.def index 1e24d64dda..a3ccc72bb6 100644 --- a/tools/cpp-define-generator/constant_globals.def +++ b/tools/cpp-define-generator/constant_globals.def @@ -25,6 +25,7 @@ DEFINE_OBJECT_EXPR(ALIGNMENT_MASK, size_t, art::kObjectAlignment - 1) DEFINE_OBJECT_EXPR(ALIGNMENT_MASK_TOGGLED, uint32_t, ~static_cast<uint32_t>(art::kObjectAlignment - 1)) +DEFINE_OBJECT_EXPR(ALIGNMENT_MASK_TOGGLED64, uint64_t, ~static_cast<uint64_t>(art::kObjectAlignment - 1)) #undef DEFINE_OBJECT_EXPR |
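
[Editor's aside] As a sanity check on the new OBJECT_ALIGNMENT_MASK_TOGGLED64 constant: with art::kObjectAlignment equal to 8, which is what the existing 32-bit value 0xfffffff8 implies, the 64-bit toggled mask comes out as 0xfffffffffffffff8. A hedged compile-time sketch (not part of the change, alignment value assumed from the generated constants above):

#include <cstdint>

// Assumes kObjectAlignment == 8, matching the 32-bit OBJECT_ALIGNMENT_MASK_TOGGLED above.
constexpr std::uint64_t kAssumedObjectAlignment = 8;
constexpr std::uint64_t kMaskToggled64 =
    ~static_cast<std::uint64_t>(kAssumedObjectAlignment - 1);
static_assert(kMaskToggled64 == UINT64_C(0xfffffffffffffff8),
              "matches OBJECT_ALIGNMENT_MASK_TOGGLED64");
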