Merge "ARM64: Make runtime invokes use InvokeRuntime()."
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index 1e2cfa3..b5d41d9 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -136,7 +136,6 @@
 # Base set of cflags used by all things ART.
 art_cflags += \
   -fno-rtti \
-  -std=gnu++11 \
   -ggdb3 \
   -Wall \
   -Werror \
@@ -152,7 +151,7 @@
 
 # The architectures the compiled tools are able to run on. Setting this to 'all' will cause all
 # architectures to be included.
-ART_TARGET_CODEGEN_ARCHS ?= all
+ART_TARGET_CODEGEN_ARCHS ?= svelte
 ART_HOST_CODEGEN_ARCHS ?= all
 
 ifeq ($(ART_TARGET_CODEGEN_ARCHS),all)
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index c538c4f..fecf0ba 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -373,7 +373,6 @@
 
 COMPILER_GTEST_HOST_SRC_FILES_arm := \
   $(COMPILER_GTEST_COMMON_SRC_FILES_arm) \
-  compiler/utils/arm/assembler_arm32_test.cc \
   compiler/utils/arm/assembler_thumb2_test.cc \
   compiler/utils/assembler_thumb_test.cc \
 
@@ -634,7 +633,7 @@
   ifeq ($$(art_target_or_host),target)
     $$(eval LOCAL_CLANG := $$(ART_TARGET_CLANG))
     $$(eval $$(call set-target-local-cflags-vars,debug))
-    LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils libvixld-arm64
+    LOCAL_SHARED_LIBRARIES += libdl libicuuc libicui18n libnativehelper libz libcutils libvixld-arm libvixld-arm64
     LOCAL_MODULE_PATH_32 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_32)
     LOCAL_MODULE_PATH_64 := $$(ART_TARGET_NATIVETEST_OUT)/$$(ART_TARGET_ARCH_64)
     LOCAL_MULTILIB := both
@@ -678,7 +677,7 @@
     LOCAL_CLANG := $$(ART_HOST_CLANG)
     LOCAL_CFLAGS += $$(ART_HOST_CFLAGS) $$(ART_HOST_DEBUG_CFLAGS)
     LOCAL_ASFLAGS += $$(ART_HOST_ASFLAGS) $$(ART_HOST_DEBUG_ASFLAGS)
-    LOCAL_SHARED_LIBRARIES += libicuuc-host libicui18n-host libnativehelper libziparchive-host libz-host libvixld-arm64
+    LOCAL_SHARED_LIBRARIES += libicuuc-host libicui18n-host libnativehelper libziparchive-host libz-host libvixld-arm libvixld-arm64
     LOCAL_LDLIBS := -lpthread -ldl
     LOCAL_IS_HOST_MODULE := true
     LOCAL_MULTILIB := both
diff --git a/compiler/Android.mk b/compiler/Android.mk
index 410b2d0..08fd309 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -92,11 +92,14 @@
 	linker/arm/relative_patcher_thumb2.cc \
 	optimizing/code_generator_arm.cc \
 	optimizing/dex_cache_array_fixups_arm.cc \
+	optimizing/instruction_simplifier_arm.cc \
+	optimizing/instruction_simplifier_shared.cc \
 	optimizing/intrinsics_arm.cc \
 	utils/arm/assembler_arm.cc \
-	utils/arm/assembler_arm32.cc \
+	utils/arm/assembler_arm_vixl.cc \
 	utils/arm/assembler_thumb2.cc \
 	utils/arm/jni_macro_assembler_arm.cc \
+	utils/arm/jni_macro_assembler_arm_vixl.cc \
 	utils/arm/managed_register_arm.cc \
 
 # TODO We should really separate out those files that are actually needed for both variants of an
@@ -109,7 +112,6 @@
 	linker/arm64/relative_patcher_arm64.cc \
 	optimizing/nodes_arm64.cc \
 	optimizing/code_generator_arm64.cc \
-	optimizing/instruction_simplifier_arm.cc \
 	optimizing/instruction_simplifier_arm64.cc \
 	optimizing/instruction_simplifier_shared.cc \
 	optimizing/intrinsics_arm64.cc \
@@ -287,15 +289,15 @@
   # VIXL assembly support for ARM64 targets.
   ifeq ($$(art_ndebug_or_debug),debug)
     ifeq ($$(art_static_or_shared), static)
-      LOCAL_WHOLESTATIC_LIBRARIES += libvixld-arm64
+      LOCAL_WHOLESTATIC_LIBRARIES += libvixld-arm libvixld-arm64
     else
-      LOCAL_SHARED_LIBRARIES += libvixld-arm64
+      LOCAL_SHARED_LIBRARIES += libvixld-arm libvixld-arm64
     endif
   else
     ifeq ($$(art_static_or_shared), static)
-      LOCAL_WHOLE_STATIC_LIBRARIES += libvixl-arm64
+      LOCAL_WHOLE_STATIC_LIBRARIES += libvixl-arm libvixl-arm64
     else
-      LOCAL_SHARED_LIBRARIES += libvixl-arm64
+      LOCAL_SHARED_LIBRARIES += libvixl-arm libvixl-arm64
     endif
   endif
 
diff --git a/compiler/cfi_test.h b/compiler/cfi_test.h
index f8b7460..c754e55 100644
--- a/compiler/cfi_test.h
+++ b/compiler/cfi_test.h
@@ -22,11 +22,13 @@
 #include <sstream>
 
 #include "arch/instruction_set.h"
+#include "base/enums.h"
 #include "debug/dwarf/dwarf_constants.h"
 #include "debug/dwarf/dwarf_test.h"
 #include "debug/dwarf/headers.h"
 #include "disassembler/disassembler.h"
 #include "gtest/gtest.h"
+#include "thread.h"
 
 namespace art {
 
@@ -57,7 +59,13 @@
     // Pretty-print assembly.
     const uint8_t* asm_base = actual_asm.data();
     const uint8_t* asm_end = asm_base + actual_asm.size();
-    auto* opts = new DisassemblerOptions(false, asm_base, asm_end, true);
+    auto* opts = new DisassemblerOptions(false,
+                                         asm_base,
+                                         asm_end,
+                                         true,
+                                         is64bit
+                                             ? &Thread::DumpThreadOffset<PointerSize::k64>
+                                             : &Thread::DumpThreadOffset<PointerSize::k32>);
     std::unique_ptr<Disassembler> disasm(Disassembler::Create(isa, opts));
     std::stringstream stream;
     const uint8_t* base = actual_asm.data() + (isa == kThumb2 ? 1 : 0);
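
The cfi_test.h change above extends DisassemblerOptions with a thread-offset dump callback, picking between the PointerSize::k64 and PointerSize::k32 instantiations of Thread::DumpThreadOffset at run time. Below is a minimal standalone sketch of that selection pattern; DumpThreadOffset and PointerSize here are toy stand-ins, not the real ART declarations.

    #include <cstdint>
    #include <iostream>

    enum class PointerSize { k32, k64 };

    // Toy stand-in for Thread::DumpThreadOffset<PointerSize>.
    template <PointerSize kSize>
    void DumpThreadOffset(std::ostream& os, uint32_t offset) {
      os << (kSize == PointerSize::k64 ? "64-bit" : "32-bit")
         << " thread offset " << offset;
    }

    int main() {
      bool is64bit = true;
      // A runtime flag selects one of the two template instantiations; the
      // resulting function pointer is what an options struct would store.
      auto* dump_fn = is64bit ? &DumpThreadOffset<PointerSize::k64>
                              : &DumpThreadOffset<PointerSize::k32>;
      dump_fn(std::cout, 0x48);
      std::cout << "\n";
    }
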
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 758cd93..77ec4b7 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -372,7 +372,7 @@
       method_inliner_map_(method_inliner_map),
       compiler_(Compiler::Create(this, compiler_kind)),
       compiler_kind_(compiler_kind),
-      instruction_set_(instruction_set),
+      instruction_set_(instruction_set == kArm ? kThumb2 : instruction_set),
       instruction_set_features_(instruction_set_features),
       requires_constructor_barrier_lock_("constructor barrier lock"),
       compiled_classes_lock_("compiled classes lock"),
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index 91579e9..e1ee0d2 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -188,6 +188,7 @@
   }
 
   uint64_t image_file_size;
+  size_t image_size;
   {
     std::unique_ptr<File> file(OS::OpenFileForReading(image_file.GetFilename().c_str()));
     ASSERT_TRUE(file.get() != nullptr);
@@ -206,6 +207,7 @@
     ASSERT_TRUE(space->IsMallocSpace());
 
     image_file_size = file->GetLength();
+    image_size = image_header.GetImageSize();
   }
 
   ASSERT_TRUE(compiler_driver_->GetImageClasses() != nullptr);
@@ -255,10 +257,10 @@
   ASSERT_TRUE(image_space != nullptr);
   if (storage_mode == ImageHeader::kStorageModeUncompressed) {
     // Uncompressed, image should be smaller than file.
-    ASSERT_LE(image_space->Size(), image_file_size);
+    ASSERT_LE(image_size, image_file_size);
   } else {
     // Compressed, file should be smaller than image.
-    ASSERT_LE(image_file_size, image_space->Size());
+    ASSERT_LE(image_file_size, image_size);
   }
 
   image_space->VerifyImageAllocations();
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 404f044..6d9c55c 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -6377,7 +6377,7 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path used to mark the GC root `root`.
+      // Slow path marking the GC root `root`.
       SlowPathCode* slow_path =
           new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root);
       codegen_->AddSlowPath(slow_path);
@@ -6518,7 +6518,7 @@
   // Object* ref = ref_addr->AsMirrorPtr()
   __ MaybeUnpoisonHeapReference(ref_reg);
 
-  // Slow path used to mark the object `ref` when it is gray.
+  // Slow path marking the object `ref` when it is gray.
   SlowPathCode* slow_path =
       new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref);
   AddSlowPath(slow_path);
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index f3ebcf4..1101edf 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -46,16 +46,20 @@
 
 namespace arm64 {
 
+using helpers::ARM64EncodableConstantOrRegister;
+using helpers::ArtVixlRegCodeCoherentForRegSet;
 using helpers::CPURegisterFrom;
 using helpers::DRegisterFrom;
 using helpers::FPRegisterFrom;
 using helpers::HeapOperand;
 using helpers::HeapOperandFrom;
 using helpers::InputCPURegisterAt;
+using helpers::InputCPURegisterOrZeroRegAt;
 using helpers::InputFPRegisterAt;
-using helpers::InputRegisterAt;
 using helpers::InputOperandAt;
+using helpers::InputRegisterAt;
 using helpers::Int64ConstantFrom;
+using helpers::IsConstantZeroBitPattern;
 using helpers::LocationFrom;
 using helpers::OperandFromMemOperand;
 using helpers::OutputCPURegister;
@@ -66,8 +70,6 @@
 using helpers::VIXLRegCodeFromART;
 using helpers::WRegisterFrom;
 using helpers::XRegisterFrom;
-using helpers::ARM64EncodableConstantOrRegister;
-using helpers::ArtVixlRegCodeCoherentForRegSet;
 
 static constexpr int kCurrentMethodStackOffset = 0;
 // The compare/jump sequence will generate about (1.5 * num_entries + 3) instructions. While jump
@@ -1457,12 +1459,18 @@
       break;
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
-      DCHECK(src.IsFPRegister());
       DCHECK_EQ(src.Is64Bits(), Primitive::Is64BitType(type));
+      Register temp_src;
+      if (src.IsZero()) {
+        // The zero register is used to avoid synthesizing zero constants.
+        temp_src = Register(src);
+      } else {
+        DCHECK(src.IsFPRegister());
+        temp_src = src.Is64Bits() ? temps.AcquireX() : temps.AcquireW();
+        __ Fmov(temp_src, FPRegister(src));
+      }
 
-      Register temp = src.Is64Bits() ? temps.AcquireX() : temps.AcquireW();
-      __ Fmov(temp, FPRegister(src));
-      __ Stlr(temp, base);
+      __ Stlr(temp_src, base);
       break;
     }
     case Primitive::kPrimVoid:
@@ -1693,7 +1701,9 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
-  if (Primitive::IsFloatingPointType(instruction->InputAt(1)->GetType())) {
+  if (IsConstantZeroBitPattern(instruction->InputAt(1))) {
+    locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant()));
+  } else if (Primitive::IsFloatingPointType(instruction->InputAt(1)->GetType())) {
     locations->SetInAt(1, Location::RequiresFpuRegister());
   } else {
     locations->SetInAt(1, Location::RequiresRegister());
@@ -1707,7 +1717,7 @@
   BlockPoolsScope block_pools(GetVIXLAssembler());
 
   Register obj = InputRegisterAt(instruction, 0);
-  CPURegister value = InputCPURegisterAt(instruction, 1);
+  CPURegister value = InputCPURegisterOrZeroRegAt(instruction, 1);
   CPURegister source = value;
   Offset offset = field_info.GetFieldOffset();
   Primitive::Type field_type = field_info.GetFieldType();
@@ -2155,7 +2165,9 @@
           LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-  if (Primitive::IsFloatingPointType(value_type)) {
+  if (IsConstantZeroBitPattern(instruction->InputAt(2))) {
+    locations->SetInAt(2, Location::ConstantLocation(instruction->InputAt(2)->AsConstant()));
+  } else if (Primitive::IsFloatingPointType(value_type)) {
     locations->SetInAt(2, Location::RequiresFpuRegister());
   } else {
     locations->SetInAt(2, Location::RequiresRegister());
@@ -2170,7 +2182,7 @@
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
 
   Register array = InputRegisterAt(instruction, 0);
-  CPURegister value = InputCPURegisterAt(instruction, 2);
+  CPURegister value = InputCPURegisterOrZeroRegAt(instruction, 2);
   CPURegister source = value;
   Location index = locations->InAt(1);
   size_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(value_type)).Uint32Value();
@@ -5012,7 +5024,7 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path used to mark the GC root `root`.
+      // Slow path marking the GC root `root`.
       SlowPathCodeARM64* slow_path =
           new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root);
       codegen_->AddSlowPath(slow_path);
@@ -5210,7 +5222,7 @@
   // Object* ref = ref_addr->AsMirrorPtr()
   GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
 
-  // Slow path used to mark the object `ref` when it is gray.
+  // Slow path marking the object `ref` when it is gray.
   SlowPathCodeARM64* slow_path =
       new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref);
   AddSlowPath(slow_path);
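
The arm64 hunks above let a field or array store of a constant whose bit pattern is all zeroes take its value straight from the zero register (wzr/xzr) instead of occupying an FP register first and being Fmov-ed into a core temp for the Stlr. A small standalone sketch of the bit-pattern test that the IsConstantZeroBitPattern check relies on; the helper name below is illustrative, not the real ART API. Only +0.0 qualifies, since -0.0 carries the sign bit.

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Illustrative helper: a double can be stored straight from xzr only when
    // its bit pattern is all zeroes.
    bool IsZeroBitPattern(double value) {
      uint64_t bits;
      std::memcpy(&bits, &value, sizeof(bits));
      return bits == 0;
    }

    int main() {
      std::cout << IsZeroBitPattern(0.0) << "\n";   // 1: the zero register can be used
      std::cout << IsZeroBitPattern(-0.0) << "\n";  // 0: sign bit set, needs an FP register
      std::cout << IsZeroBitPattern(1.0) << "\n";   // 0
    }
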
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index a7fbc84..8a2f90d 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1833,11 +1833,19 @@
   }
 }
 
+auto InstructionCodeGeneratorMIPS::GetImplicitNullChecker(HInstruction* instruction) {
+  auto null_checker = [this, instruction]() {
+    this->codegen_->MaybeRecordImplicitNullCheck(instruction);
+  };
+  return null_checker;
+}
+
 void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
   Register obj = locations->InAt(0).AsRegister<Register>();
   Location index = locations->InAt(1);
   uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
+  auto null_checker = GetImplicitNullChecker(instruction);
 
   Primitive::Type type = instruction->GetType();
   switch (type) {
@@ -1846,10 +1854,10 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
-        __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset);
+        __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset, null_checker);
       } else {
         __ Addu(TMP, obj, index.AsRegister<Register>());
-        __ LoadFromOffset(kLoadUnsignedByte, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadUnsignedByte, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1859,10 +1867,10 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
-        __ LoadFromOffset(kLoadSignedByte, out, obj, offset);
+        __ LoadFromOffset(kLoadSignedByte, out, obj, offset, null_checker);
       } else {
         __ Addu(TMP, obj, index.AsRegister<Register>());
-        __ LoadFromOffset(kLoadSignedByte, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadSignedByte, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1872,11 +1880,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
-        __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset);
+        __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_2);
         __ Addu(TMP, obj, TMP);
-        __ LoadFromOffset(kLoadSignedHalfword, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadSignedHalfword, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1886,11 +1894,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
-        __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset);
+        __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_2);
         __ Addu(TMP, obj, TMP);
-        __ LoadFromOffset(kLoadUnsignedHalfword, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadUnsignedHalfword, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1902,11 +1910,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-        __ LoadFromOffset(kLoadWord, out, obj, offset);
+        __ LoadFromOffset(kLoadWord, out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_4);
         __ Addu(TMP, obj, TMP);
-        __ LoadFromOffset(kLoadWord, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadWord, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1916,11 +1924,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ LoadFromOffset(kLoadDoubleword, out, obj, offset);
+        __ LoadFromOffset(kLoadDoubleword, out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_8);
         __ Addu(TMP, obj, TMP);
-        __ LoadFromOffset(kLoadDoubleword, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadDoubleword, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1930,11 +1938,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-        __ LoadSFromOffset(out, obj, offset);
+        __ LoadSFromOffset(out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_4);
         __ Addu(TMP, obj, TMP);
-        __ LoadSFromOffset(out, TMP, data_offset);
+        __ LoadSFromOffset(out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1944,11 +1952,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ LoadDFromOffset(out, obj, offset);
+        __ LoadDFromOffset(out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_8);
         __ Addu(TMP, obj, TMP);
-        __ LoadDFromOffset(out, TMP, data_offset);
+        __ LoadDFromOffset(out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1957,7 +1965,6 @@
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
   }
-  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderMIPS::VisitArrayLength(HArrayLength* instruction) {
@@ -2004,6 +2011,7 @@
   bool needs_runtime_call = locations->WillCall();
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
+  auto null_checker = GetImplicitNullChecker(instruction);
 
   switch (value_type) {
     case Primitive::kPrimBoolean:
@@ -2013,10 +2021,10 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
-        __ StoreToOffset(kStoreByte, value, obj, offset);
+        __ StoreToOffset(kStoreByte, value, obj, offset, null_checker);
       } else {
         __ Addu(TMP, obj, index.AsRegister<Register>());
-        __ StoreToOffset(kStoreByte, value, TMP, data_offset);
+        __ StoreToOffset(kStoreByte, value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2028,11 +2036,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
-        __ StoreToOffset(kStoreHalfword, value, obj, offset);
+        __ StoreToOffset(kStoreHalfword, value, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_2);
         __ Addu(TMP, obj, TMP);
-        __ StoreToOffset(kStoreHalfword, value, TMP, data_offset);
+        __ StoreToOffset(kStoreHalfword, value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2045,14 +2053,13 @@
         if (index.IsConstant()) {
           size_t offset =
               (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-          __ StoreToOffset(kStoreWord, value, obj, offset);
+          __ StoreToOffset(kStoreWord, value, obj, offset, null_checker);
         } else {
           DCHECK(index.IsRegister()) << index;
           __ Sll(TMP, index.AsRegister<Register>(), TIMES_4);
           __ Addu(TMP, obj, TMP);
-          __ StoreToOffset(kStoreWord, value, TMP, data_offset);
+          __ StoreToOffset(kStoreWord, value, TMP, data_offset, null_checker);
         }
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
         if (needs_write_barrier) {
           DCHECK_EQ(value_type, Primitive::kPrimNot);
           codegen_->MarkGCCard(obj, value);
@@ -2075,11 +2082,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ StoreToOffset(kStoreDoubleword, value, obj, offset);
+        __ StoreToOffset(kStoreDoubleword, value, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_8);
         __ Addu(TMP, obj, TMP);
-        __ StoreToOffset(kStoreDoubleword, value, TMP, data_offset);
+        __ StoreToOffset(kStoreDoubleword, value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2091,11 +2098,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-        __ StoreSToOffset(value, obj, offset);
+        __ StoreSToOffset(value, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_4);
         __ Addu(TMP, obj, TMP);
-        __ StoreSToOffset(value, TMP, data_offset);
+        __ StoreSToOffset(value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2107,11 +2114,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ StoreDToOffset(value, obj, offset);
+        __ StoreDToOffset(value, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_8);
         __ Addu(TMP, obj, TMP);
-        __ StoreDToOffset(value, TMP, data_offset);
+        __ StoreDToOffset(value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2120,11 +2127,6 @@
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
   }
-
-  // Ints and objects are handled in the switch.
-  if (value_type != Primitive::kPrimInt && value_type != Primitive::kPrimNot) {
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
-  }
 }
 
 void LocationsBuilderMIPS::VisitBoundsCheck(HBoundsCheck* instruction) {
@@ -3589,6 +3591,7 @@
   LoadOperandType load_type = kLoadUnsignedByte;
   bool is_volatile = field_info.IsVolatile();
   uint32_t offset = field_info.GetFieldOffset().Uint32Value();
+  auto null_checker = GetImplicitNullChecker(instruction);
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -3654,34 +3657,20 @@
       if (type == Primitive::kPrimLong) {
         DCHECK(locations->Out().IsRegisterPair());
         dst = locations->Out().AsRegisterPairLow<Register>();
-        Register dst_high = locations->Out().AsRegisterPairHigh<Register>();
-        if (obj == dst) {
-          __ LoadFromOffset(kLoadWord, dst_high, obj, offset + kMipsWordSize);
-          codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ LoadFromOffset(kLoadWord, dst, obj, offset);
-        } else {
-          __ LoadFromOffset(kLoadWord, dst, obj, offset);
-          codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ LoadFromOffset(kLoadWord, dst_high, obj, offset + kMipsWordSize);
-        }
       } else {
         DCHECK(locations->Out().IsRegister());
         dst = locations->Out().AsRegister<Register>();
-        __ LoadFromOffset(load_type, dst, obj, offset);
       }
+      __ LoadFromOffset(load_type, dst, obj, offset, null_checker);
     } else {
       DCHECK(locations->Out().IsFpuRegister());
       FRegister dst = locations->Out().AsFpuRegister<FRegister>();
       if (type == Primitive::kPrimFloat) {
-        __ LoadSFromOffset(dst, obj, offset);
+        __ LoadSFromOffset(dst, obj, offset, null_checker);
       } else {
-        __ LoadDFromOffset(dst, obj, offset);
+        __ LoadDFromOffset(dst, obj, offset, null_checker);
       }
     }
-    // Longs are handled earlier.
-    if (type != Primitive::kPrimLong) {
-      codegen_->MaybeRecordImplicitNullCheck(instruction);
-    }
   }
 
   if (is_volatile) {
@@ -3729,6 +3718,7 @@
   StoreOperandType store_type = kStoreByte;
   bool is_volatile = field_info.IsVolatile();
   uint32_t offset = field_info.GetFieldOffset().Uint32Value();
+  auto null_checker = GetImplicitNullChecker(instruction);
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -3800,28 +3790,20 @@
       if (type == Primitive::kPrimLong) {
         DCHECK(locations->InAt(1).IsRegisterPair());
         src = locations->InAt(1).AsRegisterPairLow<Register>();
-        Register src_high = locations->InAt(1).AsRegisterPairHigh<Register>();
-        __ StoreToOffset(kStoreWord, src, obj, offset);
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
-        __ StoreToOffset(kStoreWord, src_high, obj, offset + kMipsWordSize);
       } else {
         DCHECK(locations->InAt(1).IsRegister());
         src = locations->InAt(1).AsRegister<Register>();
-        __ StoreToOffset(store_type, src, obj, offset);
       }
+      __ StoreToOffset(store_type, src, obj, offset, null_checker);
     } else {
       DCHECK(locations->InAt(1).IsFpuRegister());
       FRegister src = locations->InAt(1).AsFpuRegister<FRegister>();
       if (type == Primitive::kPrimFloat) {
-        __ StoreSToOffset(src, obj, offset);
+        __ StoreSToOffset(src, obj, offset, null_checker);
       } else {
-        __ StoreDToOffset(src, obj, offset);
+        __ StoreDToOffset(src, obj, offset, null_checker);
       }
     }
-    // Longs are handled earlier.
-    if (type != Primitive::kPrimLong) {
-      codegen_->MaybeRecordImplicitNullCheck(instruction);
-    }
   }
 
   // TODO: memory barriers?
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index 63a0345..46810d6 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -257,6 +257,7 @@
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
+  auto GetImplicitNullChecker(HInstruction* instruction);
 
   MipsAssembler* const assembler_;
   CodeGeneratorMIPS* const codegen_;
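
The MIPS hunks above thread a null_checker lambda through the assembler's load/store helpers so that MaybeRecordImplicitNullCheck is invoked right at the instruction that can fault, rather than after a sequence that may have expanded into several instructions (for example the paired word accesses of a 64-bit value). Below is a minimal standalone sketch of that callback pattern, assuming toy Codegen/Assembler types in place of CodeGeneratorMIPS and MipsAssembler.

    #include <iostream>

    // Toy stand-ins for CodeGeneratorMIPS and MipsAssembler.
    struct Codegen {
      void MaybeRecordImplicitNullCheck(int instruction_id) {
        std::cout << "record implicit null check for instruction " << instruction_id << "\n";
      }
    };

    struct Assembler {
      // The helper runs the callback immediately after emitting the first
      // instruction that can fault, so the recorded PC stays correct even if
      // more instructions follow in the same helper.
      template <typename NullChecker>
      void LoadFromOffset(int dst, int base, int offset, NullChecker&& null_checker) {
        std::cout << "lw $" << dst << ", " << offset << "($" << base << ")\n";
        null_checker();
      }
    };

    int main() {
      Codegen codegen;
      Assembler assembler;
      const int instruction_id = 42;  // stands in for the HInstruction*
      auto null_checker = [&codegen, instruction_id]() {
        codegen.MaybeRecordImplicitNullCheck(instruction_id);
      };
      assembler.LoadFromOffset(/*dst=*/2, /*base=*/4, /*offset=*/8, null_checker);
    }
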
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 7aca16f..f50eb5c 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -445,8 +445,8 @@
 // Slow path marking an object during a read barrier.
 class ReadBarrierMarkSlowPathX86 : public SlowPathCode {
  public:
-  ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location obj)
-      : SlowPathCode(instruction), obj_(obj) {
+  ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location obj, bool unpoison)
+      : SlowPathCode(instruction), obj_(obj), unpoison_(unpoison) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -470,6 +470,10 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
+    if (unpoison_) {
+      // Object* ref = ref_addr->AsMirrorPtr()
+      __ MaybeUnpoisonHeapReference(reg);
+    }
     // No need to save live registers; it's taken care of by the
     // entrypoint. Also, there is no need to update the stack mask,
     // as this runtime call will not trigger a garbage collection.
@@ -499,6 +503,7 @@
 
  private:
   const Location obj_;
+  const bool unpoison_;
 
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86);
 };
@@ -4631,10 +4636,6 @@
     // load the temp into the XMM and then copy the XMM into the
     // output, 32 bits at a time).
     locations->AddTemp(Location::RequiresFpuRegister());
-  } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in CodeGeneratorX86::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
@@ -4678,11 +4679,10 @@
     case Primitive::kPrimNot: {
       // /* HeapReference<Object> */ out = *(base + offset)
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp_loc = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorX86::GenerateFieldLoadWithBakerReadBarrier call.
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            instruction, out, base, offset, temp_loc, /* needs_null_check */ true);
+            instruction, out, base, offset, /* needs_null_check */ true);
         if (is_volatile) {
           codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
         }
@@ -5093,11 +5093,6 @@
             Location::kOutputOverlap :
             Location::kNoOutputOverlap);
   }
-  // We need a temporary register for the read barrier marking slow
-  // path in CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier.
-  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void InstructionCodeGeneratorX86::VisitArrayGet(HArrayGet* instruction) {
@@ -5172,11 +5167,10 @@
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier call.
         codegen_->GenerateArrayLoadWithBakerReadBarrier(
-            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
+            instruction, out_loc, obj, data_offset, index, /* needs_null_check */ true);
       } else {
         Register out = out_loc.AsRegister<Register>();
         if (index.IsConstant()) {
@@ -6281,8 +6275,8 @@
 
 static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) {
   return kEmitCompilerReadBarrier &&
-      (kUseBakerReadBarrier ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+      !kUseBakerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
        type_check_kind == TypeCheckKind::kArrayObjectCheck);
 }
@@ -6343,7 +6337,7 @@
   }
 
   // /* HeapReference<Class> */ out = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc);
+  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck: {
@@ -6565,7 +6559,7 @@
   }
 
   // /* HeapReference<Class> */ temp = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+  GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
@@ -6601,8 +6595,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
 
       __ Bind(&compare_classes);
@@ -6641,8 +6634,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
       break;
     }
@@ -6674,8 +6666,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
 
       __ Bind(&check_non_primitive_component_type);
@@ -6683,8 +6674,7 @@
       __ j(kEqual, &done);
       // Same comment as above regarding `temp` and the slow path.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
       break;
     }
@@ -6875,17 +6865,17 @@
                                                                    Location maybe_temp) {
   Register out_reg = out.AsRegister<Register>();
   if (kEmitCompilerReadBarrier) {
-    DCHECK(maybe_temp.IsRegister()) << maybe_temp;
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          instruction, out, out_reg, offset, maybe_temp, /* needs_null_check */ false);
+          instruction, out, out_reg, offset, /* needs_null_check */ false);
     } else {
       // Load with slow path based read barrier.
       // Save the value of `out` into `maybe_temp` before overwriting it
       // in the following move operation, as we will need it for the
       // read barrier below.
+      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       __ movl(maybe_temp.AsRegister<Register>(), out_reg);
       // /* HeapReference<Object> */ out = *(out + offset)
       __ movl(out_reg, Address(out_reg, offset));
@@ -6902,17 +6892,15 @@
 void InstructionCodeGeneratorX86::GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                                                     Location out,
                                                                     Location obj,
-                                                                    uint32_t offset,
-                                                                    Location maybe_temp) {
+                                                                    uint32_t offset) {
   Register out_reg = out.AsRegister<Register>();
   Register obj_reg = obj.AsRegister<Register>();
   if (kEmitCompilerReadBarrier) {
     if (kUseBakerReadBarrier) {
-      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          instruction, out, obj_reg, offset, maybe_temp, /* needs_null_check */ false);
+          instruction, out, obj_reg, offset, /* needs_null_check */ false);
     } else {
       // Load with slow path based read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -6955,9 +6943,9 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path used to mark the GC root `root`.
-      SlowPathCode* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, root);
+      // Slow path marking the GC root `root`.
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(
+          instruction, root, /* unpoison */ false);
       codegen_->AddSlowPath(slow_path);
 
       __ fs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86PointerSize>().Int32Value()),
@@ -6991,14 +6979,13 @@
                                                              Location ref,
                                                              Register obj,
                                                              uint32_t offset,
-                                                             Location temp,
                                                              bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Address src(obj, offset);
-  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, temp, needs_null_check);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, needs_null_check);
 }
 
 void CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -7006,7 +6993,6 @@
                                                              Register obj,
                                                              uint32_t data_offset,
                                                              Location index,
-                                                             Location temp,
                                                              bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -7019,14 +7005,13 @@
   Address src = index.IsConstant() ?
       Address(obj, (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset) :
       Address(obj, index.AsRegister<Register>(), TIMES_4, data_offset);
-  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, temp, needs_null_check);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, needs_null_check);
 }
 
 void CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                  Location ref,
                                                                  Register obj,
                                                                  const Address& src,
-                                                                 Location temp,
                                                                  bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -7056,17 +7041,23 @@
   //   performance reasons.
 
   Register ref_reg = ref.AsRegister<Register>();
-  Register temp_reg = temp.AsRegister<Register>();
   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  // /* int32_t */ monitor = obj->monitor_
-  __ movl(temp_reg, Address(obj, monitor_offset));
+  // Given the numeric representation, it's enough to check the low bit of the rb_state.
+  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+  constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+  constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
+  constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
+
+  // if (rb_state == ReadBarrier::gray_ptr_)
+  //   ref = ReadBarrier::Mark(ref);
+  // At this point, just do the "if" and make sure that flags are preserved until the branch.
+  __ testb(Address(obj, monitor_offset + gray_byte_position), Immediate(test_value));
   if (needs_null_check) {
     MaybeRecordImplicitNullCheck(instruction);
   }
-  // /* LockWord */ lock_word = LockWord(monitor)
-  static_assert(sizeof(LockWord) == sizeof(int32_t),
-                "art::LockWord and int32_t have different sizes.");
 
   // Load fence to prevent load-load reordering.
   // Note that this is a no-op, thanks to the x86 memory model.
@@ -7074,25 +7065,20 @@
 
   // The actual reference load.
   // /* HeapReference<Object> */ ref = *src
-  __ movl(ref_reg, src);
+  __ movl(ref_reg, src);  // Flags are unaffected.
+
+  // Note: Reference unpoisoning modifies the flags, so we need to delay it until after the branch.
+  // Slow path marking the object `ref` when it is gray.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(
+      instruction, ref, /* unpoison */ true);
+  AddSlowPath(slow_path);
+
+  // We have done the "if" of the gray bit check above, now branch based on the flags.
+  __ j(kNotZero, slow_path->GetEntryLabel());
 
   // Object* ref = ref_addr->AsMirrorPtr()
   __ MaybeUnpoisonHeapReference(ref_reg);
 
-  // Slow path used to mark the object `ref` when it is gray.
-  SlowPathCode* slow_path =
-      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, ref);
-  AddSlowPath(slow_path);
-
-  // if (rb_state == ReadBarrier::gray_ptr_)
-  //   ref = ReadBarrier::Mark(ref);
-  // Given the numeric representation, it's enough to check the low bit of the
-  // rb_state. We do that by shifting the bit out of the lock word with SHR.
-  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
-  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
-  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
-  __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1));
-  __ j(kCarrySet, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
 
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 894f2e8..c644e40 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -254,8 +254,7 @@
   void GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                          Location out,
                                          Location obj,
-                                         uint32_t offset,
-                                         Location maybe_temp);
+                                         uint32_t offset);
   // Generate a GC root reference load:
   //
   //   root <- *address
@@ -487,7 +486,6 @@
                                              Location ref,
                                              Register obj,
                                              uint32_t offset,
-                                             Location temp,
                                              bool needs_null_check);
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference array load when Baker's read barriers are used.
@@ -496,7 +494,6 @@
                                              Register obj,
                                              uint32_t data_offset,
                                              Location index,
-                                             Location temp,
                                              bool needs_null_check);
   // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
   // and GenerateArrayLoadWithBakerReadBarrier.
@@ -504,7 +501,6 @@
                                                  Location ref,
                                                  Register obj,
                                                  const Address& src,
-                                                 Location temp,
                                                  bool needs_null_check);
 
   // Generate a read barrier for a heap reference within `instruction`
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 0c55ae4..ec37e5d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -466,8 +466,8 @@
 // Slow path marking an object during a read barrier.
 class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode {
  public:
-  ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location obj)
-      : SlowPathCode(instruction), obj_(obj) {
+  ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location obj, bool unpoison)
+      : SlowPathCode(instruction), obj_(obj), unpoison_(unpoison) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -491,6 +491,10 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
+    if (unpoison_) {
+      // Object* ref = ref_addr->AsMirrorPtr()
+      __ MaybeUnpoisonHeapReference(obj_.AsRegister<CpuRegister>());
+    }
     // No need to save live registers; it's taken care of by the
     // entrypoint. Also, there is no need to update the stack mask,
     // as this runtime call will not trigger a garbage collection.
@@ -520,6 +524,7 @@
 
  private:
   const Location obj_;
+  const bool unpoison_;
 
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86_64);
 };
@@ -4152,11 +4157,6 @@
         Location::RequiresRegister(),
         object_field_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in CodeGeneratorX86_64::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void InstructionCodeGeneratorX86_64::HandleFieldGet(HInstruction* instruction,
@@ -4200,11 +4200,10 @@
     case Primitive::kPrimNot: {
       // /* HeapReference<Object> */ out = *(base + offset)
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp_loc = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorX86::GenerateFieldLoadWithBakerReadBarrier call.
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            instruction, out, base, offset, temp_loc, /* needs_null_check */ true);
+            instruction, out, base, offset, /* needs_null_check */ true);
         if (is_volatile) {
           codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
         }
@@ -4588,11 +4587,6 @@
         Location::RequiresRegister(),
         object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  // We need a temporary register for the read barrier marking slow
-  // path in CodeGeneratorX86_64::GenerateArrayLoadWithBakerReadBarrier.
-  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitArrayGet(HArrayGet* instruction) {
@@ -4667,11 +4661,10 @@
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier call.
         codegen_->GenerateArrayLoadWithBakerReadBarrier(
-            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
+            instruction, out_loc, obj, data_offset, index, /* needs_null_check */ true);
       } else {
         CpuRegister out = out_loc.AsRegister<CpuRegister>();
         if (index.IsConstant()) {
@@ -5687,8 +5680,8 @@
 
 static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) {
   return kEmitCompilerReadBarrier &&
-      (kUseBakerReadBarrier ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+      !kUseBakerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
        type_check_kind == TypeCheckKind::kArrayObjectCheck);
 }
@@ -5749,7 +5742,7 @@
   }
 
   // /* HeapReference<Class> */ out = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc);
+  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck: {
@@ -5979,8 +5972,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<CpuRegister>());
@@ -6004,8 +5996,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
@@ -6025,8 +6016,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
 
       __ Bind(&compare_classes);
@@ -6050,8 +6040,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
@@ -6077,8 +6066,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
       __ Bind(&done);
       break;
@@ -6097,8 +6085,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       // Do an exact check.
       NearLabel check_non_primitive_component_type;
@@ -6126,8 +6113,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
 
       __ Bind(&check_non_primitive_component_type);
@@ -6135,8 +6121,7 @@
       __ j(kEqual, &done);
       // Same comment as above regarding `temp` and the slow path.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
       __ Bind(&done);
       break;
@@ -6152,8 +6137,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       // We always go into the type check slow path for the unresolved
       // and interface check cases.
@@ -6321,17 +6305,17 @@
                                                                       Location maybe_temp) {
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
   if (kEmitCompilerReadBarrier) {
-    DCHECK(maybe_temp.IsRegister()) << maybe_temp;
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          instruction, out, out_reg, offset, maybe_temp, /* needs_null_check */ false);
+          instruction, out, out_reg, offset, /* needs_null_check */ false);
     } else {
       // Load with slow path based read barrier.
       // Save the value of `out` into `maybe_temp` before overwriting it
       // in the following move operation, as we will need it for the
       // read barrier below.
+      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       __ movl(maybe_temp.AsRegister<CpuRegister>(), out_reg);
       // /* HeapReference<Object> */ out = *(out + offset)
       __ movl(out_reg, Address(out_reg, offset));
@@ -6348,17 +6332,15 @@
 void InstructionCodeGeneratorX86_64::GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                                                        Location out,
                                                                        Location obj,
-                                                                       uint32_t offset,
-                                                                       Location maybe_temp) {
+                                                                       uint32_t offset) {
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
   CpuRegister obj_reg = obj.AsRegister<CpuRegister>();
   if (kEmitCompilerReadBarrier) {
     if (kUseBakerReadBarrier) {
-      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          instruction, out, obj_reg, offset, maybe_temp, /* needs_null_check */ false);
+          instruction, out, obj_reg, offset, /* needs_null_check */ false);
     } else {
       // Load with slow path based read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -6401,9 +6383,9 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path used to mark the GC root `root`.
-      SlowPathCode* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, root);
+      // Slow path marking the GC root `root`.
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(
+          instruction, root, /* unpoison */ false);
       codegen_->AddSlowPath(slow_path);
 
       __ gs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86_64PointerSize>().Int32Value(),
@@ -6438,14 +6420,13 @@
                                                                 Location ref,
                                                                 CpuRegister obj,
                                                                 uint32_t offset,
-                                                                Location temp,
                                                                 bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Address src(obj, offset);
-  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, temp, needs_null_check);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, needs_null_check);
 }
 
 void CodeGeneratorX86_64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -6453,7 +6434,6 @@
                                                                 CpuRegister obj,
                                                                 uint32_t data_offset,
                                                                 Location index,
-                                                                Location temp,
                                                                 bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -6466,14 +6446,13 @@
   Address src = index.IsConstant() ?
       Address(obj, (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset) :
       Address(obj, index.AsRegister<CpuRegister>(), TIMES_4, data_offset);
-  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, temp, needs_null_check);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, needs_null_check);
 }
 
 void CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                     Location ref,
                                                                     CpuRegister obj,
                                                                     const Address& src,
-                                                                    Location temp,
                                                                     bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -6503,17 +6482,23 @@
   //   performance reasons.
 
   CpuRegister ref_reg = ref.AsRegister<CpuRegister>();
-  CpuRegister temp_reg = temp.AsRegister<CpuRegister>();
   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  // /* int32_t */ monitor = obj->monitor_
-  __ movl(temp_reg, Address(obj, monitor_offset));
+  // Given the numeric representation, it's enough to check the low bit of the rb_state.
+  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+  constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+  constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
+  constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
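+  // For illustration (assuming, e.g., kReadBarrierStateShift == 28): gray_byte_position == 3,
+  // gray_bit_position == 4 and test_value == 0x10, so the testb below checks the rb_state bit
+  // directly in the most significant byte of the lock word in memory.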
+
+  // if (rb_state == ReadBarrier::gray_ptr_)
+  //   ref = ReadBarrier::Mark(ref);
+  // At this point, just do the "if" and make sure that flags are preserved until the branch.
+  __ testb(Address(obj, monitor_offset + gray_byte_position), Immediate(test_value));
   if (needs_null_check) {
     MaybeRecordImplicitNullCheck(instruction);
   }
-  // /* LockWord */ lock_word = LockWord(monitor)
-  static_assert(sizeof(LockWord) == sizeof(int32_t),
-                "art::LockWord and int32_t have different sizes.");
 
   // Load fence to prevent load-load reordering.
   // Note that this is a no-op, thanks to the x86-64 memory model.
@@ -6521,25 +6506,20 @@
 
   // The actual reference load.
   // /* HeapReference<Object> */ ref = *src
-  __ movl(ref_reg, src);
+  __ movl(ref_reg, src);  // Flags are unaffected.
+
+  // Note: Reference unpoisoning modifies the flags, so it must be delayed until after the branch.
+  // Slow path marking the object `ref` when it is gray.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(
+      instruction, ref, /* unpoison */ true);
+  AddSlowPath(slow_path);
+
+  // We have done the "if" of the gray bit check above; now branch based on the flags.
+  __ j(kNotZero, slow_path->GetEntryLabel());
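+  // The fast path unpoisons the reference below; the slow path was created with
+  // /* unpoison */ true above, so it unpoisons the reference itself before marking it.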
 
   // Object* ref = ref_addr->AsMirrorPtr()
   __ MaybeUnpoisonHeapReference(ref_reg);
 
-  // Slow path used to mark the object `ref` when it is gray.
-  SlowPathCode* slow_path =
-      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, ref);
-  AddSlowPath(slow_path);
-
-  // if (rb_state == ReadBarrier::gray_ptr_)
-  //   ref = ReadBarrier::Mark(ref);
-  // Given the numeric representation, it's enough to check the low bit of the
-  // rb_state. We do that by shifting the bit out of the lock word with SHR.
-  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
-  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
-  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
-  __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1));
-  __ j(kCarrySet, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
 
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 4e0e34c..44844ac 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -248,8 +248,7 @@
   void GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                          Location out,
                                          Location obj,
-                                         uint32_t offset,
-                                         Location maybe_temp);
+                                         uint32_t offset);
   // Generate a GC root reference load:
   //
   //   root <- *address
@@ -427,7 +426,6 @@
                                              Location ref,
                                              CpuRegister obj,
                                              uint32_t offset,
-                                             Location temp,
                                              bool needs_null_check);
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference array load when Baker's read barriers are used.
@@ -436,7 +434,6 @@
                                              CpuRegister obj,
                                              uint32_t data_offset,
                                              Location index,
-                                             Location temp,
                                              bool needs_null_check);
   // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
   // and GenerateArrayLoadWithBakerReadBarrier.
@@ -444,7 +441,6 @@
                                                  Location ref,
                                                  CpuRegister obj,
                                                  const Address& src,
-                                                 Location temp,
                                                  bool needs_null_check);
 
   // Generate a read barrier for a heap reference within `instruction`
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index cc949c5..cea4a7e 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -124,6 +124,18 @@
       : static_cast<vixl::aarch64::CPURegister>(InputRegisterAt(instr, index));
 }
 
+static inline vixl::aarch64::CPURegister InputCPURegisterOrZeroRegAt(HInstruction* instr,
+                                                                     int index) {
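+  // If the input is a constant whose bit pattern is all zeros, return the zero register
+  // (wzr or xzr, sized to match the input) instead of materializing the constant;
+  // otherwise defer to InputCPURegisterAt().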
+  HInstruction* input = instr->InputAt(index);
+  Primitive::Type input_type = input->GetType();
+  if (input->IsConstant() && input->AsConstant()->IsZeroBitPattern()) {
+    return (Primitive::ComponentSize(input_type) >= vixl::aarch64::kXRegSizeInBytes)
+        ?  vixl::aarch64::xzr
+        : vixl::aarch64::wzr;
+  }
+  return InputCPURegisterAt(instr, index);
+}
+
 static inline int64_t Int64ConstantFrom(Location location) {
   HConstant* instr = location.GetConstant();
   if (instr->IsIntConstant()) {
@@ -339,6 +351,10 @@
   return instruction->IsAdd() || instruction->IsSub();
 }
 
+static inline bool IsConstantZeroBitPattern(const HInstruction* instruction) {
+  return instruction->IsConstant() && instruction->AsConstant()->IsZeroBitPattern();
+}
+
 }  // namespace helpers
 }  // namespace arm64
 }  // namespace art
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 89d80cc..b3d5341 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -122,7 +122,10 @@
             new DisassemblerOptions(/* absolute_addresses */ false,
                                     base_address,
                                     end_address,
-                                    /* can_read_literals */ true)));
+                                    /* can_read_literals */ true,
+                                    Is64BitInstructionSet(instruction_set)
+                                        ? &Thread::DumpThreadOffset<PointerSize::k64>
+                                        : &Thread::DumpThreadOffset<PointerSize::k32>)));
   }
 
   ~HGraphVisualizerDisassembler() {
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 55e1ab2..6e5eb66 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2456,16 +2456,18 @@
   __ FloorWS(FTMP, in);
   __ Mfc1(out, FTMP);
 
-  __ LoadConst32(TMP, 1);
+  if (!IsR6()) {
+    __ LoadConst32(TMP, -1);
+  }
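+  // On R6, the compare result is moved into TMP via Mfc1 below; on pre-R6, TMP is preloaded
+  // with -1 here and cleared by Movf when the FP condition flag is false.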
 
-  // TMP = (out = java.lang.Integer.MAX_VALUE) ? 1 : 0;
+  // TMP = (out == java.lang.Integer.MAX_VALUE) ? -1 : 0;
   __ LoadConst32(AT, std::numeric_limits<int32_t>::max());
   __ Bne(AT, out, &finite);
 
   __ Mtc1(ZERO, FTMP);
   if (IsR6()) {
     __ CmpLtS(FTMP, in, FTMP);
-    __ Mfc1(AT, FTMP);
+    __ Mfc1(TMP, FTMP);
   } else {
     __ ColtS(in, FTMP);
   }
@@ -2474,28 +2476,26 @@
 
   __ Bind(&finite);
 
-  // TMP = (0.5f <= (in - out)) ? 1 : 0;
+  // TMP = (0.5f <= (in - out)) ? -1 : 0;
   __ Cvtsw(FTMP, FTMP);  // Convert output of floor.w.s back to "float".
   __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
   __ SubS(FTMP, in, FTMP);
   __ Mtc1(AT, half);
   if (IsR6()) {
     __ CmpLeS(FTMP, half, FTMP);
-    __ Mfc1(AT, FTMP);
+    __ Mfc1(TMP, FTMP);
   } else {
     __ ColeS(half, FTMP);
   }
 
   __ Bind(&add);
 
-  if (IsR6()) {
-    __ Selnez(TMP, TMP, AT);
-  } else {
+  if (!IsR6()) {
     __ Movf(TMP, ZERO);
   }
 
-  // Return out += TMP.
-  __ Addu(out, out, TMP);
+  // Return out -= TMP.
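+  // TMP is either 0 or -1 at this point, so subtracting it either leaves `out` unchanged
+  // or increments it by one.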
+  __ Subu(out, out, TMP);
 
   __ Bind(&done);
 }
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 49d6c19..cf4a040 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -1934,10 +1934,9 @@
       Register output = output_loc.AsRegister<Register>();
       if (kEmitCompilerReadBarrier) {
         if (kUseBakerReadBarrier) {
-          Location temp = locations->GetTemp(0);
           Address src(base, offset, ScaleFactor::TIMES_1, 0);
           codegen->GenerateReferenceLoadWithBakerReadBarrier(
-              invoke, output_loc, base, src, temp, /* needs_null_check */ false);
+              invoke, output_loc, base, src, /* needs_null_check */ false);
         } else {
           __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
           codegen->GenerateReadBarrierSlow(
@@ -2000,11 +1999,6 @@
     locations->SetOut(Location::RequiresRegister(),
                       can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in InstructionCodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void IntrinsicLocationsBuilderX86::VisitUnsafeGet(HInvoke* invoke) {
@@ -2933,11 +2927,11 @@
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
         // /* HeapReference<Class> */ temp1 = src->klass_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false);
+            invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
         // Bail out if the source is not a non primitive array.
         // /* HeapReference<Class> */ temp1 = temp1->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+            invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
         __ testl(temp1, temp1);
         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
         // If heap poisoning is enabled, `temp1` has been unpoisoned
@@ -2970,7 +2964,7 @@
 
       // /* HeapReference<Class> */ temp1 = dest->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false);
+          invoke, temp1_loc, dest, class_offset, /* needs_null_check */ false);
 
       if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
         // Bail out if the destination is not a non primitive array.
@@ -2982,7 +2976,7 @@
         // temporaries such as `temp1`.
         // /* HeapReference<Class> */ temp2 = temp1->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false);
+            invoke, temp2_loc, temp1, component_offset, /* needs_null_check */ false);
         __ testl(temp2, temp2);
         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
         // If heap poisoning is enabled, `temp2` has been unpoisoned
@@ -2995,7 +2989,7 @@
       // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
       // /* HeapReference<Class> */ temp2 = src->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false);
+          invoke, temp2_loc, src, class_offset, /* needs_null_check */ false);
       // Note: if heap poisoning is on, we are comparing two unpoisoned references here.
       __ cmpl(temp1, temp2);
 
@@ -3004,7 +2998,7 @@
         __ j(kEqual, &do_copy);
         // /* HeapReference<Class> */ temp1 = temp1->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+            invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
         // We do not need to emit a read barrier for the following
         // heap reference load, as `temp1` is only used in a
         // comparison with null below, and this reference is not
@@ -3058,10 +3052,10 @@
     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
       // /* HeapReference<Class> */ temp1 = src->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false);
+          invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
       // /* HeapReference<Class> */ temp1 = temp1->component_type_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+          invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
       __ testl(temp1, temp1);
       __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
       // If heap poisoning is enabled, `temp1` has been unpoisoned
@@ -3139,11 +3133,18 @@
     __ cmpl(temp1, temp3);
     __ j(kEqual, &done);
 
-    // /* int32_t */ monitor = src->monitor_
-    __ movl(temp2, Address(src, monitor_offset));
-    // /* LockWord */ lock_word = LockWord(monitor)
-    static_assert(sizeof(LockWord) == sizeof(int32_t),
-                  "art::LockWord and int32_t have different sizes.");
+    // Given the numeric representation, it's enough to check the low bit of the rb_state.
+    static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+    static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+    constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+    constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
+    constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
+
+    // if (rb_state == ReadBarrier::gray_ptr_)
+    //   goto slow_path;
+    // At this point, just do the "if" and make sure that flags are preserved until the branch.
+    __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));
 
     // Load fence to prevent load-load reordering.
     // Note that this is a no-op, thanks to the x86 memory model.
@@ -3154,13 +3155,8 @@
         new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86(invoke);
     codegen_->AddSlowPath(read_barrier_slow_path);
 
-    // Given the numeric representation, it's enough to check the low bit of the
-    // rb_state. We do that by shifting the bit out of the lock word with SHR.
-    static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
-    static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
-    static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
-    __ shrl(temp2, Immediate(LockWord::kReadBarrierStateShift + 1));
-    __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel());
+    // We have done the "if" of the gray bit check above; now branch based on the flags.
+    __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());
 
     // Fast-path copy.
 
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 311e1cd..a4ee546 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -1241,7 +1241,7 @@
     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
       // /* HeapReference<Class> */ temp1 = dest->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          invoke, temp1_loc, dest, class_offset, temp3_loc, /* needs_null_check */ false);
+          invoke, temp1_loc, dest, class_offset, /* needs_null_check */ false);
       // Register `temp1` is not trashed by the read barrier emitted
       // by GenerateFieldLoadWithBakerReadBarrier below, as that
       // method produces a call to a ReadBarrierMarkRegX entry point,
@@ -1249,7 +1249,7 @@
       // temporaries such as `temp1`.
       // /* HeapReference<Class> */ temp2 = src->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false);
+          invoke, temp2_loc, src, class_offset, /* needs_null_check */ false);
       // If heap poisoning is enabled, `temp1` and `temp2` have been
       // unpoisoned by the previous calls to
       // GenerateFieldLoadWithBakerReadBarrier.
@@ -1273,7 +1273,7 @@
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
         // /* HeapReference<Class> */ TMP = temp1->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            invoke, TMP_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false);
+            invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
         __ testl(CpuRegister(TMP), CpuRegister(TMP));
         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
         // If heap poisoning is enabled, `TMP` has been unpoisoned by
@@ -1296,7 +1296,7 @@
         // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
         // /* HeapReference<Class> */ TMP = temp2->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            invoke, TMP_loc, temp2, component_offset, temp3_loc, /* needs_null_check */ false);
+            invoke, TMP_loc, temp2, component_offset, /* needs_null_check */ false);
         __ testl(CpuRegister(TMP), CpuRegister(TMP));
         __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
         // If heap poisoning is enabled, `TMP` has been unpoisoned by
@@ -1320,7 +1320,7 @@
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
         // /* HeapReference<Class> */ temp1 = temp1->component_type_
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            invoke, temp1_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false);
+            invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
         // We do not need to emit a read barrier for the following
         // heap reference load, as `temp1` is only used in a
         // comparison with null below, and this reference is not
@@ -1348,10 +1348,10 @@
     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
       // /* HeapReference<Class> */ temp1 = src->klass_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          invoke, temp1_loc, src, class_offset, temp3_loc, /* needs_null_check */ false);
+          invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
       // /* HeapReference<Class> */ TMP = temp1->component_type_
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          invoke, TMP_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false);
+          invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
       __ testl(CpuRegister(TMP), CpuRegister(TMP));
       __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
     } else {
@@ -1421,11 +1421,18 @@
     __ cmpl(temp1, temp3);
     __ j(kEqual, &done);
 
-    // /* int32_t */ monitor = src->monitor_
-    __ movl(CpuRegister(TMP), Address(src, monitor_offset));
-    // /* LockWord */ lock_word = LockWord(monitor)
-    static_assert(sizeof(LockWord) == sizeof(int32_t),
-                  "art::LockWord and int32_t have different sizes.");
+    // Given the numeric representation, it's enough to check the low bit of the rb_state.
+    static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+    static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+    constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+    constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
+    constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
+
+    // if (rb_state == ReadBarrier::gray_ptr_)
+    //   goto slow_path;
+    // At this point, just do the "if" and make sure that flags are preserved until the branch.
+    __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));
 
     // Load fence to prevent load-load reordering.
     // Note that this is a no-op, thanks to the x86-64 memory model.
@@ -1436,13 +1443,8 @@
         new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke);
     codegen_->AddSlowPath(read_barrier_slow_path);
 
-    // Given the numeric representation, it's enough to check the low bit of the
-    // rb_state. We do that by shifting the bit out of the lock word with SHR.
-    static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
-    static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
-    static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
-    __ shrl(CpuRegister(TMP), Immediate(LockWord::kReadBarrierStateShift + 1));
-    __ j(kCarrySet, read_barrier_slow_path->GetEntryLabel());
+    // We have done the "if" of the gray bit check above; now branch based on the flags.
+    __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());
 
     // Fast-path copy.
     // Iterate over the arrays and do a raw copy of the objects. We don't need to
@@ -2087,10 +2089,9 @@
     case Primitive::kPrimNot: {
       if (kEmitCompilerReadBarrier) {
         if (kUseBakerReadBarrier) {
-          Location temp = locations->GetTemp(0);
           Address src(base, offset, ScaleFactor::TIMES_1, 0);
           codegen->GenerateReferenceLoadWithBakerReadBarrier(
-              invoke, output_loc, base, src, temp, /* needs_null_check */ false);
+              invoke, output_loc, base, src, /* needs_null_check */ false);
         } else {
           __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
           codegen->GenerateReadBarrierSlow(
@@ -2113,9 +2114,7 @@
   }
 }
 
-static void CreateIntIntIntToIntLocations(ArenaAllocator* arena,
-                                          HInvoke* invoke,
-                                          Primitive::Type type) {
+static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
   bool can_call = kEmitCompilerReadBarrier &&
       (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
        invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
@@ -2129,30 +2128,25 @@
   locations->SetInAt(2, Location::RequiresRegister());
   locations->SetOut(Location::RequiresRegister(),
                     can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap);
-  if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in InstructionCodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 
 
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index a1da20b..f7c325e 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -428,8 +428,14 @@
       || instruction_set == kX86_64;
 }
 
+// Strip pass name suffix to get optimization name.
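+// For example, a pass name such as "dead_code_elimination$initial" maps to the optimization
+// name "dead_code_elimination" (assuming kPassNameSeparator is "$").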
+static std::string ConvertPassNameToOptimizationName(const std::string& pass_name) {
+  size_t pos = pass_name.find(kPassNameSeparator);
+  return pos == std::string::npos ? pass_name : pass_name.substr(0, pos);
+}
+
 static HOptimization* BuildOptimization(
-    const std::string& opt_name,
+    const std::string& pass_name,
     ArenaAllocator* arena,
     HGraph* graph,
     OptimizingCompilerStats* stats,
@@ -439,6 +445,7 @@
     StackHandleScopeCollection* handles,
     SideEffectsAnalysis* most_recent_side_effects,
     HInductionVarAnalysis* most_recent_induction) {
+  std::string opt_name = ConvertPassNameToOptimizationName(pass_name);
   if (opt_name == BoundsCheckElimination::kBoundsCheckEliminationPassName) {
     CHECK(most_recent_side_effects != nullptr && most_recent_induction != nullptr);
     return new (arena) BoundsCheckElimination(graph,
@@ -446,11 +453,11 @@
                                               most_recent_induction);
   } else if (opt_name == GVNOptimization::kGlobalValueNumberingPassName) {
     CHECK(most_recent_side_effects != nullptr);
-    return new (arena) GVNOptimization(graph, *most_recent_side_effects);
+    return new (arena) GVNOptimization(graph, *most_recent_side_effects, pass_name.c_str());
   } else if (opt_name == HConstantFolding::kConstantFoldingPassName) {
-    return new (arena) HConstantFolding(graph);
+    return new (arena) HConstantFolding(graph, pass_name.c_str());
   } else if (opt_name == HDeadCodeElimination::kDeadCodeEliminationPassName) {
-    return new (arena) HDeadCodeElimination(graph, stats);
+    return new (arena) HDeadCodeElimination(graph, stats, pass_name.c_str());
   } else if (opt_name == HInliner::kInlinerPassName) {
     size_t number_of_dex_registers = dex_compilation_unit.GetCodeItem()->registers_size_;
     return new (arena) HInliner(graph,                   // outer_graph
@@ -470,7 +477,7 @@
   } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
     return new (arena) HInductionVarAnalysis(graph);
   } else if (opt_name == InstructionSimplifier::kInstructionSimplifierPassName) {
-    return new (arena) InstructionSimplifier(graph, stats);
+    return new (arena) InstructionSimplifier(graph, stats, pass_name.c_str());
   } else if (opt_name == IntrinsicsRecognizer::kIntrinsicsRecognizerPassName) {
     return new (arena) IntrinsicsRecognizer(graph, driver, stats);
   } else if (opt_name == LICM::kLoopInvariantCodeMotionPassName) {
@@ -522,12 +529,9 @@
   SideEffectsAnalysis* most_recent_side_effects = nullptr;
   HInductionVarAnalysis* most_recent_induction = nullptr;
   ArenaVector<HOptimization*> ret(arena->Adapter());
-  for (std::string pass_name : pass_names) {
-    size_t pos = pass_name.find(kPassNameSeparator);    // Strip suffix to get base pass name.
-    std::string opt_name = pos == std::string::npos ? pass_name : pass_name.substr(0, pos);
-
+  for (const std::string& pass_name : pass_names) {
     HOptimization* opt = BuildOptimization(
-        opt_name,
+        pass_name,
         arena,
         graph,
         stats,
@@ -540,6 +544,7 @@
     CHECK(opt != nullptr) << "Couldn't build optimization: \"" << pass_name << "\"";
     ret.push_back(opt);
 
+    std::string opt_name = ConvertPassNameToOptimizationName(pass_name);
     if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) {
       most_recent_side_effects = down_cast<SideEffectsAnalysis*>(opt);
     } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
@@ -833,9 +838,7 @@
 
   // Always use the Thumb-2 assembler: some runtime functionality
   // (like implicit stack overflow checks) assumes Thumb-2.
-  if (instruction_set == kArm) {
-    instruction_set = kThumb2;
-  }
+  DCHECK_NE(instruction_set, kArm);
 
   // Do not attempt to compile on architectures we do not support.
   if (!IsInstructionSetSupported(instruction_set)) {
diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc
index 304e56b..55835e7 100644
--- a/compiler/trampolines/trampoline_compiler.cc
+++ b/compiler/trampolines/trampoline_compiler.cc
@@ -20,7 +20,7 @@
 #include "jni_env_ext.h"
 
 #ifdef ART_ENABLE_CODEGEN_arm
-#include "utils/arm/assembler_thumb2.h"
+#include "utils/arm/assembler_arm_vixl.h"
 #endif
 
 #ifdef ART_ENABLE_CODEGEN_arm64
@@ -49,22 +49,37 @@
 
 #ifdef ART_ENABLE_CODEGEN_arm
 namespace arm {
+
+#ifdef ___
+#error "ARM Assembler macro already defined."
+#else
+#define ___ assembler.GetVIXLAssembler()->
+#endif
+
 static std::unique_ptr<const std::vector<uint8_t>> CreateTrampoline(
     ArenaAllocator* arena, EntryPointCallingConvention abi, ThreadOffset32 offset) {
-  Thumb2Assembler assembler(arena);
+  using vixl::aarch32::MemOperand;
+  using vixl::aarch32::pc;
+  using vixl::aarch32::r0;
+  ArmVIXLAssembler assembler(arena);
 
   switch (abi) {
     case kInterpreterAbi:  // Thread* is first argument (R0) in interpreter ABI.
-      __ LoadFromOffset(kLoadWord, PC, R0, offset.Int32Value());
+      ___ Ldr(pc, MemOperand(r0, offset.Int32Value()));
       break;
-    case kJniAbi:  // Load via Thread* held in JNIEnv* in first argument (R0).
-      __ LoadFromOffset(kLoadWord, IP, R0, JNIEnvExt::SelfOffset(4).Int32Value());
-      __ LoadFromOffset(kLoadWord, PC, IP, offset.Int32Value());
+    case kJniAbi: {  // Load via Thread* held in JNIEnv* in first argument (R0).
+      vixl::aarch32::UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
+      const vixl::aarch32::Register temp_reg = temps.Acquire();
+
+      // VIXL will use the destination as a scratch register if
+      // the offset is not encodable as an immediate operand.
+      ___ Ldr(temp_reg, MemOperand(r0, JNIEnvExt::SelfOffset(4).Int32Value()));
+      ___ Ldr(pc, MemOperand(temp_reg, offset.Int32Value()));
       break;
-    case kQuickAbi:  // R9 holds Thread*.
-      __ LoadFromOffset(kLoadWord, PC, R9, offset.Int32Value());
+    }
+    case kQuickAbi:  // TR holds Thread*.
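+      // `tr` is the thread register alias (R9); the alias is assumed to be provided by the
+      // ART VIXL assembler headers.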
+      ___ Ldr(pc, MemOperand(tr, offset.Int32Value()));
   }
-  __ bkpt(0);
 
   __ FinalizeCode();
   size_t cs = __ CodeSize();
@@ -74,6 +89,9 @@
 
   return std::move(entry_stub);
 }
+
+#undef ___
+
 }  // namespace arm
 #endif  // ART_ENABLE_CODEGEN_arm
 
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index c52a5a9..3084e6e 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -28,6 +28,7 @@
 #include "base/stl_util.h"
 #include "base/value_object.h"
 #include "constants_arm.h"
+#include "utils/arm/assembler_arm_shared.h"
 #include "utils/arm/managed_register_arm.h"
 #include "utils/assembler.h"
 #include "utils/jni_macro_assembler.h"
@@ -36,7 +37,6 @@
 namespace art {
 namespace arm {
 
-class Arm32Assembler;
 class Thumb2Assembler;
 
 // Assembler literal is a value embedded in code, retrieved using a PC-relative load.
@@ -208,7 +208,6 @@
   uint32_t rotate_;
   uint32_t immed_;
 
-  friend class Arm32Assembler;
   friend class Thumb2Assembler;
 
 #ifdef SOURCE_ASSEMBLER_SUPPORT
@@ -216,29 +215,6 @@
 #endif
 };
 
-
-enum LoadOperandType {
-  kLoadSignedByte,
-  kLoadUnsignedByte,
-  kLoadSignedHalfword,
-  kLoadUnsignedHalfword,
-  kLoadWord,
-  kLoadWordPair,
-  kLoadSWord,
-  kLoadDWord
-};
-
-
-enum StoreOperandType {
-  kStoreByte,
-  kStoreHalfword,
-  kStoreWord,
-  kStoreWordPair,
-  kStoreSWord,
-  kStoreDWord
-};
-
-
 // Load/store multiple addressing mode.
 enum BlockAddressMode {
   // bit encoding P U W
@@ -419,13 +395,6 @@
   kItE = kItElse
 };
 
-// Set condition codes request.
-enum SetCc {
-  kCcDontCare,  // Allows prioritizing 16-bit instructions on Thumb2 whether they set CCs or not.
-  kCcSet,
-  kCcKeep,
-};
-
 constexpr uint32_t kNoItCondition = 3;
 constexpr uint32_t kInvalidModifiedImmediate = -1;
 
diff --git a/compiler/utils/arm/assembler_arm32.cc b/compiler/utils/arm/assembler_arm32.cc
deleted file mode 100644
index b8eb60c..0000000
--- a/compiler/utils/arm/assembler_arm32.cc
+++ /dev/null
@@ -1,1725 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "assembler_arm32.h"
-
-#include "base/bit_utils.h"
-#include "base/logging.h"
-#include "entrypoints/quick/quick_entrypoints.h"
-#include "offsets.h"
-#include "thread.h"
-
-namespace art {
-namespace arm {
-
-bool Arm32Assembler::ShifterOperandCanHoldArm32(uint32_t immediate, ShifterOperand* shifter_op) {
-  // Avoid the more expensive test for frequent small immediate values.
-  if (immediate < (1 << kImmed8Bits)) {
-    shifter_op->type_ = ShifterOperand::kImmediate;
-    shifter_op->is_rotate_ = true;
-    shifter_op->rotate_ = 0;
-    shifter_op->immed_ = immediate;
-    return true;
-  }
-  // Note that immediate must be unsigned for the test to work correctly.
-  for (int rot = 0; rot < 16; rot++) {
-    uint32_t imm8 = (immediate << 2*rot) | (immediate >> (32 - 2*rot));
-    if (imm8 < (1 << kImmed8Bits)) {
-      shifter_op->type_ = ShifterOperand::kImmediate;
-      shifter_op->is_rotate_ = true;
-      shifter_op->rotate_ = rot;
-      shifter_op->immed_ = imm8;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool Arm32Assembler::ShifterOperandCanAlwaysHold(uint32_t immediate) {
-  ShifterOperand shifter_op;
-  return ShifterOperandCanHoldArm32(immediate, &shifter_op);
-}
-
-bool Arm32Assembler::ShifterOperandCanHold(Register rd ATTRIBUTE_UNUSED,
-                                           Register rn ATTRIBUTE_UNUSED,
-                                           Opcode opcode ATTRIBUTE_UNUSED,
-                                           uint32_t immediate,
-                                           SetCc set_cc ATTRIBUTE_UNUSED,
-                                           ShifterOperand* shifter_op) {
-  return ShifterOperandCanHoldArm32(immediate, shifter_op);
-}
-
-void Arm32Assembler::and_(Register rd, Register rn, const ShifterOperand& so,
-                          Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), AND, set_cc, rn, rd, so);
-}
-
-
-void Arm32Assembler::eor(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), EOR, set_cc, rn, rd, so);
-}
-
-
-void Arm32Assembler::sub(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), SUB, set_cc, rn, rd, so);
-}
-
-void Arm32Assembler::rsb(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), RSB, set_cc, rn, rd, so);
-}
-
-void Arm32Assembler::add(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), ADD, set_cc, rn, rd, so);
-}
-
-
-void Arm32Assembler::adc(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), ADC, set_cc, rn, rd, so);
-}
-
-
-void Arm32Assembler::sbc(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), SBC, set_cc, rn, rd, so);
-}
-
-
-void Arm32Assembler::rsc(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), RSC, set_cc, rn, rd, so);
-}
-
-
-void Arm32Assembler::tst(Register rn, const ShifterOperand& so, Condition cond) {
-  CHECK_NE(rn, PC);  // Reserve tst pc instruction for exception handler marker.
-  EmitType01(cond, so.type(), TST, kCcSet, rn, R0, so);
-}
-
-
-void Arm32Assembler::teq(Register rn, const ShifterOperand& so, Condition cond) {
-  CHECK_NE(rn, PC);  // Reserve teq pc instruction for exception handler marker.
-  EmitType01(cond, so.type(), TEQ, kCcSet, rn, R0, so);
-}
-
-
-void Arm32Assembler::cmp(Register rn, const ShifterOperand& so, Condition cond) {
-  EmitType01(cond, so.type(), CMP, kCcSet, rn, R0, so);
-}
-
-
-void Arm32Assembler::cmn(Register rn, const ShifterOperand& so, Condition cond) {
-  EmitType01(cond, so.type(), CMN, kCcSet, rn, R0, so);
-}
-
-
-void Arm32Assembler::orr(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), ORR, set_cc, rn, rd, so);
-}
-
-
-void Arm32Assembler::orn(Register rd ATTRIBUTE_UNUSED,
-                         Register rn ATTRIBUTE_UNUSED,
-                         const ShifterOperand& so ATTRIBUTE_UNUSED,
-                         Condition cond ATTRIBUTE_UNUSED,
-                         SetCc set_cc ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "orn is not supported on ARM32";
-}
-
-
-void Arm32Assembler::mov(Register rd, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), MOV, set_cc, R0, rd, so);
-}
-
-
-void Arm32Assembler::bic(Register rd, Register rn, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), BIC, set_cc, rn, rd, so);
-}
-
-
-void Arm32Assembler::mvn(Register rd, const ShifterOperand& so,
-                         Condition cond, SetCc set_cc) {
-  EmitType01(cond, so.type(), MVN, set_cc, R0, rd, so);
-}
-
-
-void Arm32Assembler::mul(Register rd, Register rn, Register rm, Condition cond) {
-  // Assembler registers rd, rn, rm are encoded as rn, rm, rs.
-  EmitMulOp(cond, 0, R0, rd, rn, rm);
-}
-
-
-void Arm32Assembler::mla(Register rd, Register rn, Register rm, Register ra,
-                         Condition cond) {
-  // Assembler registers rd, rn, rm, ra are encoded as rn, rm, rs, rd.
-  EmitMulOp(cond, B21, ra, rd, rn, rm);
-}
-
-
-void Arm32Assembler::mls(Register rd, Register rn, Register rm, Register ra,
-                         Condition cond) {
-  // Assembler registers rd, rn, rm, ra are encoded as rn, rm, rs, rd.
-  EmitMulOp(cond, B22 | B21, ra, rd, rn, rm);
-}
-
-
-void Arm32Assembler::smull(Register rd_lo, Register rd_hi, Register rn,
-                           Register rm, Condition cond) {
-  // Assembler registers rd_lo, rd_hi, rn, rm are encoded as rd, rn, rm, rs.
-  EmitMulOp(cond, B23 | B22, rd_lo, rd_hi, rn, rm);
-}
-
-
-void Arm32Assembler::umull(Register rd_lo, Register rd_hi, Register rn,
-                           Register rm, Condition cond) {
-  // Assembler registers rd_lo, rd_hi, rn, rm are encoded as rd, rn, rm, rs.
-  EmitMulOp(cond, B23, rd_lo, rd_hi, rn, rm);
-}
-
-
-void Arm32Assembler::sdiv(Register rd, Register rn, Register rm, Condition cond) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(rm, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = B26 | B25 | B24 | B20 |
-      B15 | B14 | B13 | B12 |
-      (static_cast<int32_t>(cond) << kConditionShift) |
-      (static_cast<int32_t>(rn) << 0) |
-      (static_cast<int32_t>(rd) << 16) |
-      (static_cast<int32_t>(rm) << 8) |
-      B4;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::udiv(Register rd, Register rn, Register rm, Condition cond) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(rm, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = B26 | B25 | B24 | B21 | B20 |
-      B15 | B14 | B13 | B12 |
-      (static_cast<int32_t>(cond) << kConditionShift) |
-      (static_cast<int32_t>(rn) << 0) |
-      (static_cast<int32_t>(rd) << 16) |
-      (static_cast<int32_t>(rm) << 8) |
-      B4;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::sbfx(Register rd, Register rn, uint32_t lsb, uint32_t width, Condition cond) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  CHECK_LE(lsb, 31U);
-  CHECK(1U <= width && width <= 32U) << width;
-  uint32_t widthminus1 = width - 1;
-
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-      B26 | B25 | B24 | B23 | B21 |
-      (widthminus1 << 16) |
-      (static_cast<uint32_t>(rd) << 12) |
-      (lsb << 7) |
-      B6 | B4 |
-      static_cast<uint32_t>(rn);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::ubfx(Register rd, Register rn, uint32_t lsb, uint32_t width, Condition cond) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  CHECK_LE(lsb, 31U);
-  CHECK(1U <= width && width <= 32U) << width;
-  uint32_t widthminus1 = width - 1;
-
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-      B26 | B25 | B24 | B23 | B22 | B21 |
-      (widthminus1 << 16) |
-      (static_cast<uint32_t>(rd) << 12) |
-      (lsb << 7) |
-      B6 | B4 |
-      static_cast<uint32_t>(rn);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::ldr(Register rd, const Address& ad, Condition cond) {
-  EmitMemOp(cond, true, false, rd, ad);
-}
-
-
-void Arm32Assembler::str(Register rd, const Address& ad, Condition cond) {
-  EmitMemOp(cond, false, false, rd, ad);
-}
-
-
-void Arm32Assembler::ldrb(Register rd, const Address& ad, Condition cond) {
-  EmitMemOp(cond, true, true, rd, ad);
-}
-
-
-void Arm32Assembler::strb(Register rd, const Address& ad, Condition cond) {
-  EmitMemOp(cond, false, true, rd, ad);
-}
-
-
-void Arm32Assembler::ldrh(Register rd, const Address& ad, Condition cond) {
-  EmitMemOpAddressMode3(cond, L | B7 | H | B4, rd, ad);
-}
-
-
-void Arm32Assembler::strh(Register rd, const Address& ad, Condition cond) {
-  EmitMemOpAddressMode3(cond, B7 | H | B4, rd, ad);
-}
-
-
-void Arm32Assembler::ldrsb(Register rd, const Address& ad, Condition cond) {
-  EmitMemOpAddressMode3(cond, L | B7 | B6 | B4, rd, ad);
-}
-
-
-void Arm32Assembler::ldrsh(Register rd, const Address& ad, Condition cond) {
-  EmitMemOpAddressMode3(cond, L | B7 | B6 | H | B4, rd, ad);
-}
-
-
-void Arm32Assembler::ldrd(Register rd, const Address& ad, Condition cond) {
-  CHECK_EQ(rd % 2, 0);
-  EmitMemOpAddressMode3(cond, B7 | B6 | B4, rd, ad);
-}
-
-
-void Arm32Assembler::strd(Register rd, const Address& ad, Condition cond) {
-  CHECK_EQ(rd % 2, 0);
-  EmitMemOpAddressMode3(cond, B7 | B6 | B5 | B4, rd, ad);
-}
-
-
-void Arm32Assembler::ldm(BlockAddressMode am,
-                       Register base,
-                       RegList regs,
-                       Condition cond) {
-  EmitMultiMemOp(cond, am, true, base, regs);
-}
-
-
-void Arm32Assembler::stm(BlockAddressMode am,
-                       Register base,
-                       RegList regs,
-                       Condition cond) {
-  EmitMultiMemOp(cond, am, false, base, regs);
-}
-
-
-void Arm32Assembler::vmovs(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B6, sd, S0, sm);
-}
-
-
-void Arm32Assembler::vmovd(DRegister dd, DRegister dm, Condition cond) {
-  EmitVFPddd(cond, B23 | B21 | B20 | B6, dd, D0, dm);
-}
-
-
-bool Arm32Assembler::vmovs(SRegister sd, float s_imm, Condition cond) {
-  uint32_t imm32 = bit_cast<uint32_t, float>(s_imm);
-  if (((imm32 & ((1 << 19) - 1)) == 0) &&
-      ((((imm32 >> 25) & ((1 << 6) - 1)) == (1 << 5)) ||
-       (((imm32 >> 25) & ((1 << 6) - 1)) == ((1 << 5) -1)))) {
-    uint8_t imm8 = ((imm32 >> 31) << 7) | (((imm32 >> 29) & 1) << 6) |
-        ((imm32 >> 19) & ((1 << 6) -1));
-    EmitVFPsss(cond, B23 | B21 | B20 | ((imm8 >> 4)*B16) | (imm8 & 0xf),
-               sd, S0, S0);
-    return true;
-  }
-  return false;
-}
-
-
-bool Arm32Assembler::vmovd(DRegister dd, double d_imm, Condition cond) {
-  uint64_t imm64 = bit_cast<uint64_t, double>(d_imm);
-  if (((imm64 & ((1LL << 48) - 1)) == 0) &&
-      ((((imm64 >> 54) & ((1 << 9) - 1)) == (1 << 8)) ||
-       (((imm64 >> 54) & ((1 << 9) - 1)) == ((1 << 8) -1)))) {
-    uint8_t imm8 = ((imm64 >> 63) << 7) | (((imm64 >> 61) & 1) << 6) |
-        ((imm64 >> 48) & ((1 << 6) -1));
-    EmitVFPddd(cond, B23 | B21 | B20 | ((imm8 >> 4)*B16) | B8 | (imm8 & 0xf),
-               dd, D0, D0);
-    return true;
-  }
-  return false;
-}
-
-
-void Arm32Assembler::vadds(SRegister sd, SRegister sn, SRegister sm,
-                           Condition cond) {
-  EmitVFPsss(cond, B21 | B20, sd, sn, sm);
-}
-
-
-void Arm32Assembler::vaddd(DRegister dd, DRegister dn, DRegister dm,
-                           Condition cond) {
-  EmitVFPddd(cond, B21 | B20, dd, dn, dm);
-}
-
-
-void Arm32Assembler::vsubs(SRegister sd, SRegister sn, SRegister sm,
-                           Condition cond) {
-  EmitVFPsss(cond, B21 | B20 | B6, sd, sn, sm);
-}
-
-
-void Arm32Assembler::vsubd(DRegister dd, DRegister dn, DRegister dm,
-                           Condition cond) {
-  EmitVFPddd(cond, B21 | B20 | B6, dd, dn, dm);
-}
-
-
-void Arm32Assembler::vmuls(SRegister sd, SRegister sn, SRegister sm,
-                           Condition cond) {
-  EmitVFPsss(cond, B21, sd, sn, sm);
-}
-
-
-void Arm32Assembler::vmuld(DRegister dd, DRegister dn, DRegister dm,
-                           Condition cond) {
-  EmitVFPddd(cond, B21, dd, dn, dm);
-}
-
-
-void Arm32Assembler::vmlas(SRegister sd, SRegister sn, SRegister sm,
-                           Condition cond) {
-  EmitVFPsss(cond, 0, sd, sn, sm);
-}
-
-
-void Arm32Assembler::vmlad(DRegister dd, DRegister dn, DRegister dm,
-                           Condition cond) {
-  EmitVFPddd(cond, 0, dd, dn, dm);
-}
-
-
-void Arm32Assembler::vmlss(SRegister sd, SRegister sn, SRegister sm,
-                           Condition cond) {
-  EmitVFPsss(cond, B6, sd, sn, sm);
-}
-
-
-void Arm32Assembler::vmlsd(DRegister dd, DRegister dn, DRegister dm,
-                           Condition cond) {
-  EmitVFPddd(cond, B6, dd, dn, dm);
-}
-
-
-void Arm32Assembler::vdivs(SRegister sd, SRegister sn, SRegister sm,
-                           Condition cond) {
-  EmitVFPsss(cond, B23, sd, sn, sm);
-}
-
-
-void Arm32Assembler::vdivd(DRegister dd, DRegister dn, DRegister dm,
-                           Condition cond) {
-  EmitVFPddd(cond, B23, dd, dn, dm);
-}
-
-
-void Arm32Assembler::vabss(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B7 | B6, sd, S0, sm);
-}
-
-
-void Arm32Assembler::vabsd(DRegister dd, DRegister dm, Condition cond) {
-  EmitVFPddd(cond, B23 | B21 | B20 | B7 | B6, dd, D0, dm);
-}
-
-
-void Arm32Assembler::vnegs(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B16 | B6, sd, S0, sm);
-}
-
-
-void Arm32Assembler::vnegd(DRegister dd, DRegister dm, Condition cond) {
-  EmitVFPddd(cond, B23 | B21 | B20 | B16 | B6, dd, D0, dm);
-}
-
-
-void Arm32Assembler::vsqrts(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B16 | B7 | B6, sd, S0, sm);
-}
-
-void Arm32Assembler::vsqrtd(DRegister dd, DRegister dm, Condition cond) {
-  EmitVFPddd(cond, B23 | B21 | B20 | B16 | B7 | B6, dd, D0, dm);
-}
-
-
-void Arm32Assembler::vcvtsd(SRegister sd, DRegister dm, Condition cond) {
-  EmitVFPsd(cond, B23 | B21 | B20 | B18 | B17 | B16 | B8 | B7 | B6, sd, dm);
-}
-
-
-void Arm32Assembler::vcvtds(DRegister dd, SRegister sm, Condition cond) {
-  EmitVFPds(cond, B23 | B21 | B20 | B18 | B17 | B16 | B7 | B6, dd, sm);
-}
-
-
-void Arm32Assembler::vcvtis(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B19 | B18 | B16 | B7 | B6, sd, S0, sm);
-}
-
-
-void Arm32Assembler::vcvtid(SRegister sd, DRegister dm, Condition cond) {
-  EmitVFPsd(cond, B23 | B21 | B20 | B19 | B18 | B16 | B8 | B7 | B6, sd, dm);
-}
-
-
-void Arm32Assembler::vcvtsi(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B19 | B7 | B6, sd, S0, sm);
-}
-
-
-void Arm32Assembler::vcvtdi(DRegister dd, SRegister sm, Condition cond) {
-  EmitVFPds(cond, B23 | B21 | B20 | B19 | B8 | B7 | B6, dd, sm);
-}
-
-
-void Arm32Assembler::vcvtus(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B19 | B18 | B7 | B6, sd, S0, sm);
-}
-
-
-void Arm32Assembler::vcvtud(SRegister sd, DRegister dm, Condition cond) {
-  EmitVFPsd(cond, B23 | B21 | B20 | B19 | B18 | B8 | B7 | B6, sd, dm);
-}
-
-
-void Arm32Assembler::vcvtsu(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B19 | B6, sd, S0, sm);
-}
-
-
-void Arm32Assembler::vcvtdu(DRegister dd, SRegister sm, Condition cond) {
-  EmitVFPds(cond, B23 | B21 | B20 | B19 | B8 | B6, dd, sm);
-}
-
-
-void Arm32Assembler::vcmps(SRegister sd, SRegister sm, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B18 | B6, sd, S0, sm);
-}
-
-
-void Arm32Assembler::vcmpd(DRegister dd, DRegister dm, Condition cond) {
-  EmitVFPddd(cond, B23 | B21 | B20 | B18 | B6, dd, D0, dm);
-}
-
-
-void Arm32Assembler::vcmpsz(SRegister sd, Condition cond) {
-  EmitVFPsss(cond, B23 | B21 | B20 | B18 | B16 | B6, sd, S0, S0);
-}
-
-
-void Arm32Assembler::vcmpdz(DRegister dd, Condition cond) {
-  EmitVFPddd(cond, B23 | B21 | B20 | B18 | B16 | B6, dd, D0, D0);
-}
-
-void Arm32Assembler::b(Label* label, Condition cond) {
-  EmitBranch(cond, label, false);
-}
-
-
-void Arm32Assembler::bl(Label* label, Condition cond) {
-  EmitBranch(cond, label, true);
-}
-
-
-void Arm32Assembler::MarkExceptionHandler(Label* label) {
-  EmitType01(AL, 1, TST, kCcSet, PC, R0, ShifterOperand(0));
-  Label l;
-  b(&l);
-  EmitBranch(AL, label, false);
-  Bind(&l);
-}
-
-
-void Arm32Assembler::Emit(int32_t value) {
-  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
-  buffer_.Emit<int32_t>(value);
-}
-
-
-void Arm32Assembler::EmitType01(Condition cond,
-                                int type,
-                                Opcode opcode,
-                                SetCc set_cc,
-                                Register rn,
-                                Register rd,
-                                const ShifterOperand& so) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = static_cast<int32_t>(cond) << kConditionShift |
-                     type << kTypeShift |
-                     static_cast<int32_t>(opcode) << kOpcodeShift |
-                     (set_cc == kCcSet ? 1 : 0) << kSShift |
-                     static_cast<int32_t>(rn) << kRnShift |
-                     static_cast<int32_t>(rd) << kRdShift |
-                     so.encodingArm();
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitType5(Condition cond, int offset, bool link) {
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = static_cast<int32_t>(cond) << kConditionShift |
-                     5 << kTypeShift |
-                     (link ? 1 : 0) << kLinkShift;
-  Emit(Arm32Assembler::EncodeBranchOffset(offset, encoding));
-}
-
-
-void Arm32Assembler::EmitMemOp(Condition cond,
-                               bool load,
-                               bool byte,
-                               Register rd,
-                               const Address& ad) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  const Address& addr = static_cast<const Address&>(ad);
-
-  int32_t encoding = 0;
-  if (!ad.IsImmediate() && ad.GetRegisterOffset() == PC) {
-    // PC relative LDR(literal)
-    int32_t offset = ad.GetOffset();
-    int32_t u = B23;
-    if (offset < 0) {
-      offset = -offset;
-      u = 0;
-    }
-    CHECK_LT(offset, (1 << 12));
-    encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-         B26 | B24 | u | B20 |
-         (load ? L : 0) |
-         (byte ? B : 0) |
-         (static_cast<int32_t>(rd) << kRdShift) |
-         0xf << 16 |
-         (offset & 0xfff);
-
-  } else {
-    encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-        B26 |
-        (load ? L : 0) |
-        (byte ? B : 0) |
-        (static_cast<int32_t>(rd) << kRdShift) |
-        addr.encodingArm();
-  }
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitMemOpAddressMode3(Condition cond,
-                                           int32_t mode,
-                                           Register rd,
-                                           const Address& ad) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  const Address& addr = static_cast<const Address&>(ad);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B22  |
-                     mode |
-                     (static_cast<int32_t>(rd) << kRdShift) |
-                     addr.encoding3();
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitMultiMemOp(Condition cond,
-                                    BlockAddressMode am,
-                                    bool load,
-                                    Register base,
-                                    RegList regs) {
-  CHECK_NE(base, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 |
-                     am |
-                     (load ? L : 0) |
-                     (static_cast<int32_t>(base) << kRnShift) |
-                     regs;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitShiftImmediate(Condition cond,
-                                        Shift opcode,
-                                        Register rd,
-                                        Register rm,
-                                        const ShifterOperand& so) {
-  CHECK_NE(cond, kNoCondition);
-  CHECK(so.IsImmediate());
-  int32_t encoding = static_cast<int32_t>(cond) << kConditionShift |
-                     static_cast<int32_t>(MOV) << kOpcodeShift |
-                     static_cast<int32_t>(rd) << kRdShift |
-                     so.encodingArm() << kShiftImmShift |
-                     static_cast<int32_t>(opcode) << kShiftShift |
-                     static_cast<int32_t>(rm);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitShiftRegister(Condition cond,
-                                       Shift opcode,
-                                       Register rd,
-                                       Register rm,
-                                       const ShifterOperand& so) {
-  CHECK_NE(cond, kNoCondition);
-  CHECK(so.IsRegister());
-  int32_t encoding = static_cast<int32_t>(cond) << kConditionShift |
-                     static_cast<int32_t>(MOV) << kOpcodeShift |
-                     static_cast<int32_t>(rd) << kRdShift |
-                     so.encodingArm() << kShiftRegisterShift |
-                     static_cast<int32_t>(opcode) << kShiftShift |
-                     B4 |
-                     static_cast<int32_t>(rm);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitBranch(Condition cond, Label* label, bool link) {
-  if (label->IsBound()) {
-    EmitType5(cond, label->Position() - buffer_.Size(), link);
-  } else {
-    int position = buffer_.Size();
-    // Use the offset field of the branch instruction for linking the sites.
-    EmitType5(cond, label->position_, link);
-    label->LinkTo(position);
-  }
-}
-
-
-void Arm32Assembler::clz(Register rd, Register rm, Condition cond) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rm, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  CHECK_NE(rd, PC);
-  CHECK_NE(rm, PC);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B24 | B22 | B21 | (0xf << 16) |
-                     (static_cast<int32_t>(rd) << kRdShift) |
-                     (0xf << 8) | B4 | static_cast<int32_t>(rm);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::movw(Register rd, uint16_t imm16, Condition cond) {
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = static_cast<int32_t>(cond) << kConditionShift |
-                     B25 | B24 | ((imm16 >> 12) << 16) |
-                     static_cast<int32_t>(rd) << kRdShift | (imm16 & 0xfff);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::movt(Register rd, uint16_t imm16, Condition cond) {
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = static_cast<int32_t>(cond) << kConditionShift |
-                     B25 | B24 | B22 | ((imm16 >> 12) << 16) |
-                     static_cast<int32_t>(rd) << kRdShift | (imm16 & 0xfff);
-  Emit(encoding);
-}
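// Sketch (illustrative only): movw/movt place their 16-bit immediate as imm4:imm12,
// with the top four bits at instruction bits 19:16 and the low twelve bits at bits
// 11:0, matching the "(imm16 >> 12) << 16" and "imm16 & 0xfff" terms above.
#include <cstdint>

inline uint32_t PositionImm16Fields(uint16_t imm16) {
  uint32_t imm4 = imm16 >> 12;      // Immediate bits 15:12.
  uint32_t imm12 = imm16 & 0xfffu;  // Immediate bits 11:0.
  return (imm4 << 16) | imm12;      // Placed as in the encodings above.
}
// Example: 0xabcd -> imm4 = 0xa at bits 19:16 and imm12 = 0xbcd at bits 11:0.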
-
-
-void Arm32Assembler::EmitMiscellaneous(Condition cond, uint8_t op1,
-                                       uint8_t op2, uint32_t a_part,
-                                       uint32_t rest) {
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                      B26 | B25 | B23 |
-                      (op1 << 20) |
-                      (a_part << 16) |
-                      (op2 << 5) |
-                      B4 |
-                      rest;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitReverseBytes(Register rd, Register rm, Condition cond,
-                                      uint8_t op1, uint8_t op2) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rm, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  CHECK_NE(rd, PC);
-  CHECK_NE(rm, PC);
-
-  int32_t encoding = (static_cast<int32_t>(rd) << kRdShift) |
-                     (0b1111 << 8) |
-                     static_cast<int32_t>(rm);
-  EmitMiscellaneous(cond, op1, op2, 0b1111, encoding);
-}
-
-
-void Arm32Assembler::rbit(Register rd, Register rm, Condition cond) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rm, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  CHECK_NE(rd, PC);
-  CHECK_NE(rm, PC);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B26 | B25 | B23 | B22 | B21 | B20 | (0xf << 16) |
-                     (static_cast<int32_t>(rd) << kRdShift) |
-                     (0xf << 8) | B5 | B4 | static_cast<int32_t>(rm);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::rev(Register rd, Register rm, Condition cond) {
-  EmitReverseBytes(rd, rm, cond, 0b011, 0b001);
-}
-
-
-void Arm32Assembler::rev16(Register rd, Register rm, Condition cond) {
-  EmitReverseBytes(rd, rm, cond, 0b011, 0b101);
-}
-
-
-void Arm32Assembler::revsh(Register rd, Register rm, Condition cond) {
-  EmitReverseBytes(rd, rm, cond, 0b111, 0b101);
-}
-
-
-void Arm32Assembler::EmitMulOp(Condition cond, int32_t opcode,
-                               Register rd, Register rn,
-                               Register rm, Register rs) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(rm, kNoRegister);
-  CHECK_NE(rs, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = opcode |
-      (static_cast<int32_t>(cond) << kConditionShift) |
-      (static_cast<int32_t>(rn) << kRnShift) |
-      (static_cast<int32_t>(rd) << kRdShift) |
-      (static_cast<int32_t>(rs) << kRsShift) |
-      B7 | B4 |
-      (static_cast<int32_t>(rm) << kRmShift);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::ldrex(Register rt, Register rn, Condition cond) {
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B24 |
-                     B23 |
-                     L   |
-                     (static_cast<int32_t>(rn) << kLdExRnShift) |
-                     (static_cast<int32_t>(rt) << kLdExRtShift) |
-                     B11 | B10 | B9 | B8 | B7 | B4 | B3 | B2 | B1 | B0;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::ldrexd(Register rt, Register rt2, Register rn, Condition cond) {
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(rt2, kNoRegister);
-  CHECK_NE(rt, R14);
-  CHECK_EQ(0u, static_cast<uint32_t>(rt) % 2);
-  CHECK_EQ(static_cast<uint32_t>(rt) + 1, static_cast<uint32_t>(rt2));
-  CHECK_NE(cond, kNoCondition);
-
-  int32_t encoding =
-      (static_cast<uint32_t>(cond) << kConditionShift) |
-      B24 | B23 | B21 | B20 |
-      static_cast<uint32_t>(rn) << 16 |
-      static_cast<uint32_t>(rt) << 12 |
-      B11 | B10 | B9 | B8 | B7 | B4 | B3 | B2 | B1 | B0;
-  Emit(encoding);
-}
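// Sketch (standalone check, not ART API): ldrexd/strexd above require an even,
// consecutive register pair with rt != lr (so that rt2 is never pc). The CHECKs
// above reduce to this predicate:
#include <cstdint>

inline bool IsValidExclusivePair(uint32_t rt, uint32_t rt2) {
  constexpr uint32_t kLr = 14;
  return (rt % 2 == 0) && (rt2 == rt + 1) && (rt != kLr);
}
// Example: r2/r3 is a valid pair; r3/r4 (odd base) and lr/pc are rejected.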
-
-
-void Arm32Assembler::strex(Register rd,
-                           Register rt,
-                           Register rn,
-                           Condition cond) {
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B24 |
-                     B23 |
-                     (static_cast<int32_t>(rn) << kStrExRnShift) |
-                     (static_cast<int32_t>(rd) << kStrExRdShift) |
-                     B11 | B10 | B9 | B8 | B7 | B4 |
-                     (static_cast<int32_t>(rt) << kStrExRtShift);
-  Emit(encoding);
-}
-
-void Arm32Assembler::strexd(Register rd, Register rt, Register rt2, Register rn, Condition cond) {
-  CHECK_NE(rd, kNoRegister);
-  CHECK_NE(rn, kNoRegister);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(rt2, kNoRegister);
-  CHECK_NE(rt, R14);
-  CHECK_NE(rd, rt);
-  CHECK_NE(rd, rt2);
-  CHECK_EQ(0u, static_cast<uint32_t>(rt) % 2);
-  CHECK_EQ(static_cast<uint32_t>(rt) + 1, static_cast<uint32_t>(rt2));
-  CHECK_NE(cond, kNoCondition);
-
-  int32_t encoding =
-      (static_cast<uint32_t>(cond) << kConditionShift) |
-      B24 | B23 | B21 |
-      static_cast<uint32_t>(rn) << 16 |
-      static_cast<uint32_t>(rd) << 12 |
-      B11 | B10 | B9 | B8 | B7 | B4 |
-      static_cast<uint32_t>(rt);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::clrex(Condition cond) {
-  CHECK_EQ(cond, AL);   // This cannot be conditional on ARM.
-  int32_t encoding = (kSpecialCondition << kConditionShift) |
-                     B26 | B24 | B22 | B21 | B20 | (0xff << 12) | B4 | 0xf;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::nop(Condition cond) {
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B25 | B24 | B21 | (0xf << 12);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vmovsr(SRegister sn, Register rt, Condition cond) {
-  CHECK_NE(sn, kNoSRegister);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(rt, SP);
-  CHECK_NE(rt, PC);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B25 |
-                     ((static_cast<int32_t>(sn) >> 1)*B16) |
-                     (static_cast<int32_t>(rt)*B12) | B11 | B9 |
-                     ((static_cast<int32_t>(sn) & 1)*B7) | B4;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vmovrs(Register rt, SRegister sn, Condition cond) {
-  CHECK_NE(sn, kNoSRegister);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(rt, SP);
-  CHECK_NE(rt, PC);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B25 | B20 |
-                     ((static_cast<int32_t>(sn) >> 1)*B16) |
-                     (static_cast<int32_t>(rt)*B12) | B11 | B9 |
-                     ((static_cast<int32_t>(sn) & 1)*B7) | B4;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vmovsrr(SRegister sm, Register rt, Register rt2,
-                             Condition cond) {
-  CHECK_NE(sm, kNoSRegister);
-  CHECK_NE(sm, S31);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(rt, SP);
-  CHECK_NE(rt, PC);
-  CHECK_NE(rt2, kNoRegister);
-  CHECK_NE(rt2, SP);
-  CHECK_NE(rt2, PC);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B22 |
-                     (static_cast<int32_t>(rt2)*B16) |
-                     (static_cast<int32_t>(rt)*B12) | B11 | B9 |
-                     ((static_cast<int32_t>(sm) & 1)*B5) | B4 |
-                     (static_cast<int32_t>(sm) >> 1);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vmovrrs(Register rt, Register rt2, SRegister sm,
-                             Condition cond) {
-  CHECK_NE(sm, kNoSRegister);
-  CHECK_NE(sm, S31);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(rt, SP);
-  CHECK_NE(rt, PC);
-  CHECK_NE(rt2, kNoRegister);
-  CHECK_NE(rt2, SP);
-  CHECK_NE(rt2, PC);
-  CHECK_NE(rt, rt2);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B22 | B20 |
-                     (static_cast<int32_t>(rt2)*B16) |
-                     (static_cast<int32_t>(rt)*B12) | B11 | B9 |
-                     ((static_cast<int32_t>(sm) & 1)*B5) | B4 |
-                     (static_cast<int32_t>(sm) >> 1);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vmovdrr(DRegister dm, Register rt, Register rt2,
-                             Condition cond) {
-  CHECK_NE(dm, kNoDRegister);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(rt, SP);
-  CHECK_NE(rt, PC);
-  CHECK_NE(rt2, kNoRegister);
-  CHECK_NE(rt2, SP);
-  CHECK_NE(rt2, PC);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B22 |
-                     (static_cast<int32_t>(rt2)*B16) |
-                     (static_cast<int32_t>(rt)*B12) | B11 | B9 | B8 |
-                     ((static_cast<int32_t>(dm) >> 4)*B5) | B4 |
-                     (static_cast<int32_t>(dm) & 0xf);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vmovrrd(Register rt, Register rt2, DRegister dm,
-                             Condition cond) {
-  CHECK_NE(dm, kNoDRegister);
-  CHECK_NE(rt, kNoRegister);
-  CHECK_NE(rt, SP);
-  CHECK_NE(rt, PC);
-  CHECK_NE(rt2, kNoRegister);
-  CHECK_NE(rt2, SP);
-  CHECK_NE(rt2, PC);
-  CHECK_NE(rt, rt2);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B22 | B20 |
-                     (static_cast<int32_t>(rt2)*B16) |
-                     (static_cast<int32_t>(rt)*B12) | B11 | B9 | B8 |
-                     ((static_cast<int32_t>(dm) >> 4)*B5) | B4 |
-                     (static_cast<int32_t>(dm) & 0xf);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vldrs(SRegister sd, const Address& ad, Condition cond) {
-  const Address& addr = static_cast<const Address&>(ad);
-  CHECK_NE(sd, kNoSRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B24 | B20 |
-                     ((static_cast<int32_t>(sd) & 1)*B22) |
-                     ((static_cast<int32_t>(sd) >> 1)*B12) |
-                     B11 | B9 | addr.vencoding();
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vstrs(SRegister sd, const Address& ad, Condition cond) {
-  const Address& addr = static_cast<const Address&>(ad);
-  CHECK_NE(static_cast<Register>(addr.encodingArm() & (0xf << kRnShift)), PC);
-  CHECK_NE(sd, kNoSRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B24 |
-                     ((static_cast<int32_t>(sd) & 1)*B22) |
-                     ((static_cast<int32_t>(sd) >> 1)*B12) |
-                     B11 | B9 | addr.vencoding();
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vldrd(DRegister dd, const Address& ad, Condition cond) {
-  const Address& addr = static_cast<const Address&>(ad);
-  CHECK_NE(dd, kNoDRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B24 | B20 |
-                     ((static_cast<int32_t>(dd) >> 4)*B22) |
-                     ((static_cast<int32_t>(dd) & 0xf)*B12) |
-                     B11 | B9 | B8 | addr.vencoding();
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vstrd(DRegister dd, const Address& ad, Condition cond) {
-  const Address& addr = static_cast<const Address&>(ad);
-  CHECK_NE(static_cast<Register>(addr.encodingArm() & (0xf << kRnShift)), PC);
-  CHECK_NE(dd, kNoDRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B24 |
-                     ((static_cast<int32_t>(dd) >> 4)*B22) |
-                     ((static_cast<int32_t>(dd) & 0xf)*B12) |
-                     B11 | B9 | B8 | addr.vencoding();
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::vpushs(SRegister reg, int nregs, Condition cond) {
-  EmitVPushPop(static_cast<uint32_t>(reg), nregs, true, false, cond);
-}
-
-
-void Arm32Assembler::vpushd(DRegister reg, int nregs, Condition cond) {
-  EmitVPushPop(static_cast<uint32_t>(reg), nregs, true, true, cond);
-}
-
-
-void Arm32Assembler::vpops(SRegister reg, int nregs, Condition cond) {
-  EmitVPushPop(static_cast<uint32_t>(reg), nregs, false, false, cond);
-}
-
-
-void Arm32Assembler::vpopd(DRegister reg, int nregs, Condition cond) {
-  EmitVPushPop(static_cast<uint32_t>(reg), nregs, false, true, cond);
-}
-
-
-void Arm32Assembler::vldmiad(Register, DRegister, int, Condition) {
-  LOG(FATAL) << "Unimplemented.";
-  UNREACHABLE();
-}
-
-
-void Arm32Assembler::vstmiad(Register, DRegister, int, Condition) {
-  LOG(FATAL) << "Unimplemented.";
-  UNREACHABLE();
-}
-
-
-void Arm32Assembler::EmitVPushPop(uint32_t reg, int nregs, bool push, bool dbl, Condition cond) {
-  CHECK_NE(cond, kNoCondition);
-  CHECK_GT(nregs, 0);
-  uint32_t D;
-  uint32_t Vd;
-  if (dbl) {
-    // Encoded as D:Vd.
-    D = (reg >> 4) & 1;
-    Vd = reg & 15U /* 0b1111 */;
-  } else {
-    // Encoded as Vd:D.
-    D = reg & 1;
-    Vd = (reg >> 1) & 15U /* 0b1111 */;
-  }
-  int32_t encoding = B27 | B26 | B21 | B19 | B18 | B16 |
-                    B11 | B9 |
-        (dbl ? B8 : 0) |
-        (push ? B24 : (B23 | B20)) |
-        static_cast<int32_t>(cond) << kConditionShift |
-        nregs << (dbl ? 1 : 0) |
-        D << 22 |
-        Vd << 12;
-  Emit(encoding);
-}
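// Sketch (illustrative split only): VFP register numbers are encoded as a 4-bit
// field plus one extension bit. For D registers the extension bit is the top bit
// (D:Vd); for S registers it is the bottom bit (Vd:D), which is the branch on
// `dbl` above.
#include <cstdint>
#include <utility>

// Returns {D, Vd} for an architectural VFP register number.
inline std::pair<uint32_t, uint32_t> SplitVfpRegister(uint32_t reg, bool dbl) {
  if (dbl) {
    return {(reg >> 4) & 1u, reg & 0xfu};  // e.g. d17 -> D = 1, Vd = 1.
  }
  return {reg & 1u, (reg >> 1) & 0xfu};    // e.g. s17 -> D = 1, Vd = 8.
}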
-
-
-void Arm32Assembler::EmitVFPsss(Condition cond, int32_t opcode,
-                                SRegister sd, SRegister sn, SRegister sm) {
-  CHECK_NE(sd, kNoSRegister);
-  CHECK_NE(sn, kNoSRegister);
-  CHECK_NE(sm, kNoSRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B25 | B11 | B9 | opcode |
-                     ((static_cast<int32_t>(sd) & 1)*B22) |
-                     ((static_cast<int32_t>(sn) >> 1)*B16) |
-                     ((static_cast<int32_t>(sd) >> 1)*B12) |
-                     ((static_cast<int32_t>(sn) & 1)*B7) |
-                     ((static_cast<int32_t>(sm) & 1)*B5) |
-                     (static_cast<int32_t>(sm) >> 1);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitVFPddd(Condition cond, int32_t opcode,
-                                DRegister dd, DRegister dn, DRegister dm) {
-  CHECK_NE(dd, kNoDRegister);
-  CHECK_NE(dn, kNoDRegister);
-  CHECK_NE(dm, kNoDRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B25 | B11 | B9 | B8 | opcode |
-                     ((static_cast<int32_t>(dd) >> 4)*B22) |
-                     ((static_cast<int32_t>(dn) & 0xf)*B16) |
-                     ((static_cast<int32_t>(dd) & 0xf)*B12) |
-                     ((static_cast<int32_t>(dn) >> 4)*B7) |
-                     ((static_cast<int32_t>(dm) >> 4)*B5) |
-                     (static_cast<int32_t>(dm) & 0xf);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitVFPsd(Condition cond, int32_t opcode,
-                               SRegister sd, DRegister dm) {
-  CHECK_NE(sd, kNoSRegister);
-  CHECK_NE(dm, kNoDRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B25 | B11 | B9 | opcode |
-                     ((static_cast<int32_t>(sd) & 1)*B22) |
-                     ((static_cast<int32_t>(sd) >> 1)*B12) |
-                     ((static_cast<int32_t>(dm) >> 4)*B5) |
-                     (static_cast<int32_t>(dm) & 0xf);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::EmitVFPds(Condition cond, int32_t opcode,
-                             DRegister dd, SRegister sm) {
-  CHECK_NE(dd, kNoDRegister);
-  CHECK_NE(sm, kNoSRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B27 | B26 | B25 | B11 | B9 | opcode |
-                     ((static_cast<int32_t>(dd) >> 4)*B22) |
-                     ((static_cast<int32_t>(dd) & 0xf)*B12) |
-                     ((static_cast<int32_t>(sm) & 1)*B5) |
-                     (static_cast<int32_t>(sm) >> 1);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::Lsl(Register rd, Register rm, uint32_t shift_imm,
-                         Condition cond, SetCc set_cc) {
-  CHECK_LE(shift_imm, 31u);
-  mov(rd, ShifterOperand(rm, LSL, shift_imm), cond, set_cc);
-}
-
-
-void Arm32Assembler::Lsr(Register rd, Register rm, uint32_t shift_imm,
-                         Condition cond, SetCc set_cc) {
-  CHECK(1u <= shift_imm && shift_imm <= 32u);
-  if (shift_imm == 32) shift_imm = 0;  // Comply with UAL syntax.
-  mov(rd, ShifterOperand(rm, LSR, shift_imm), cond, set_cc);
-}
-
-
-void Arm32Assembler::Asr(Register rd, Register rm, uint32_t shift_imm,
-                         Condition cond, SetCc set_cc) {
-  CHECK(1u <= shift_imm && shift_imm <= 32u);
-  if (shift_imm == 32) shift_imm = 0;  // Comply with UAL syntax.
-  mov(rd, ShifterOperand(rm, ASR, shift_imm), cond, set_cc);
-}
-
-
-void Arm32Assembler::Ror(Register rd, Register rm, uint32_t shift_imm,
-                         Condition cond, SetCc set_cc) {
-  CHECK(1u <= shift_imm && shift_imm <= 31u);
-  mov(rd, ShifterOperand(rm, ROR, shift_imm), cond, set_cc);
-}
-
-void Arm32Assembler::Rrx(Register rd, Register rm, Condition cond, SetCc set_cc) {
-  mov(rd, ShifterOperand(rm, ROR, 0), cond, set_cc);
-}
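// Sketch (background note as code): in the ARM shift encodings an immediate of 0
// means "shift by 32" for LSR/ASR and means RRX for ROR, which is why Lsr/Asr
// above rewrite shift_imm == 32 to 0 and Rrx is emitted as ROR #0.
#include <cstdint>

inline uint32_t EncodeLsrAsrAmount(uint32_t shift_imm /* 1..32 */) {
  return shift_imm == 32 ? 0 : shift_imm;  // UAL: "#32" is encoded as 0.
}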
-
-
-void Arm32Assembler::Lsl(Register rd, Register rm, Register rn,
-                         Condition cond, SetCc set_cc) {
-  mov(rd, ShifterOperand(rm, LSL, rn), cond, set_cc);
-}
-
-
-void Arm32Assembler::Lsr(Register rd, Register rm, Register rn,
-                         Condition cond, SetCc set_cc) {
-  mov(rd, ShifterOperand(rm, LSR, rn), cond, set_cc);
-}
-
-
-void Arm32Assembler::Asr(Register rd, Register rm, Register rn,
-                         Condition cond, SetCc set_cc) {
-  mov(rd, ShifterOperand(rm, ASR, rn), cond, set_cc);
-}
-
-
-void Arm32Assembler::Ror(Register rd, Register rm, Register rn,
-                         Condition cond, SetCc set_cc) {
-  mov(rd, ShifterOperand(rm, ROR, rn), cond, set_cc);
-}
-
-void Arm32Assembler::vmstat(Condition cond) {  // VMRS APSR_nzcv, FPSCR
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-      B27 | B26 | B25 | B23 | B22 | B21 | B20 | B16 |
-      (static_cast<int32_t>(PC)*B12) |
-      B11 | B9 | B4;
-  Emit(encoding);
-}
-
-void Arm32Assembler::vcntd(DRegister dd, DRegister dm) {
-  uint32_t encoding = (B31 | B30 | B29 | B28 | B25 | B24 | B23 | B21 | B20) |
-    ((static_cast<int32_t>(dd) >> 4) * B22) |
-    ((static_cast<uint32_t>(dd) & 0xf) * B12) |
-    (B10 | B8) |
-    ((static_cast<int32_t>(dm) >> 4) * B5) |
-    (static_cast<uint32_t>(dm) & 0xf);
-
-  Emit(encoding);
-}
-
-void Arm32Assembler::vpaddld(DRegister dd, DRegister dm, int32_t size, bool is_unsigned) {
-  CHECK(size == 8 || size == 16 || size == 32) << size;
-  uint32_t encoding = (B31 | B30 | B29 | B28 | B25 | B24 | B23 | B21 | B20) |
-    ((static_cast<uint32_t>(size >> 4) & 0x3) * B18) |
-    ((static_cast<int32_t>(dd) >> 4) * B22) |
-    ((static_cast<uint32_t>(dd) & 0xf) * B12) |
-    (B9) |
-    (is_unsigned ? B7 : 0) |
-    ((static_cast<int32_t>(dm) >> 4) * B5) |
-    (static_cast<uint32_t>(dm) & 0xf);
-
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::svc(uint32_t imm24) {
-  CHECK(IsUint<24>(imm24)) << imm24;
-  int32_t encoding = (AL << kConditionShift) | B27 | B26 | B25 | B24 | imm24;
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::bkpt(uint16_t imm16) {
-  int32_t encoding = (AL << kConditionShift) | B24 | B21 |
-                     ((imm16 >> 4) << 8) | B6 | B5 | B4 | (imm16 & 0xf);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::blx(Register rm, Condition cond) {
-  CHECK_NE(rm, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B24 | B21 | (0xfff << 8) | B5 | B4 |
-                     (static_cast<int32_t>(rm) << kRmShift);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::bx(Register rm, Condition cond) {
-  CHECK_NE(rm, kNoRegister);
-  CHECK_NE(cond, kNoCondition);
-  int32_t encoding = (static_cast<int32_t>(cond) << kConditionShift) |
-                     B24 | B21 | (0xfff << 8) | B4 |
-                     (static_cast<int32_t>(rm) << kRmShift);
-  Emit(encoding);
-}
-
-
-void Arm32Assembler::Push(Register rd, Condition cond) {
-  str(rd, Address(SP, -kRegisterSize, Address::PreIndex), cond);
-}
-
-
-void Arm32Assembler::Pop(Register rd, Condition cond) {
-  ldr(rd, Address(SP, kRegisterSize, Address::PostIndex), cond);
-}
-
-
-void Arm32Assembler::PushList(RegList regs, Condition cond) {
-  stm(DB_W, SP, regs, cond);
-}
-
-
-void Arm32Assembler::PopList(RegList regs, Condition cond) {
-  ldm(IA_W, SP, regs, cond);
-}
-
-
-void Arm32Assembler::Mov(Register rd, Register rm, Condition cond) {
-  if (rd != rm) {
-    mov(rd, ShifterOperand(rm), cond);
-  }
-}
-
-
-void Arm32Assembler::Bind(Label* label) {
-  CHECK(!label->IsBound());
-  int bound_pc = buffer_.Size();
-  while (label->IsLinked()) {
-    int32_t position = label->Position();
-    int32_t next = buffer_.Load<int32_t>(position);
-    int32_t encoded = Arm32Assembler::EncodeBranchOffset(bound_pc - position, next);
-    buffer_.Store<int32_t>(position, encoded);
-    label->position_ = Arm32Assembler::DecodeBranchOffset(next);
-  }
-  label->BindTo(bound_pc);
-}
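// Sketch (simplified model, not the ART buffer API): while a label is unbound, each
// branch that targets it stores, in its own offset field, the position of the
// previous unresolved branch, forming a singly linked list through the code buffer.
// Bind() above walks that list and patches every site with the real offset.
#include <cstdint>
#include <vector>

struct PendingLabel {
  int32_t head = -1;  // Position of the most recent unresolved branch, or -1.
};

// Record an unresolved branch at `position`, chaining it into the label's list.
inline void LinkBranchAt(std::vector<int32_t>& slots, PendingLabel* label, int32_t position) {
  slots[position] = label->head;  // The slot remembers the previous link.
  label->head = position;
}

// Resolve all pending branches once the target position is known.
inline void BindAt(std::vector<int32_t>& slots, PendingLabel* label, int32_t target) {
  for (int32_t pos = label->head; pos != -1;) {
    int32_t next = slots[pos];
    slots[pos] = target - pos;  // Patch in the now-known relative offset.
    pos = next;
  }
  label->head = -1;
}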
-
-
-int32_t Arm32Assembler::EncodeBranchOffset(int offset, int32_t inst) {
-  // The offset is off by 8 due to the way the ARM CPUs read PC.
-  offset -= 8;
-  CHECK_ALIGNED(offset, 4);
-  CHECK(IsInt(POPCOUNT(kBranchOffsetMask), offset)) << offset;
-
-  // Properly preserve only the bits supported in the instruction.
-  offset >>= 2;
-  offset &= kBranchOffsetMask;
-  return (inst & ~kBranchOffsetMask) | offset;
-}
-
-
-int Arm32Assembler::DecodeBranchOffset(int32_t inst) {
-  // Sign-extend, left-shift by 2, then add 8.
-  return ((((inst & kBranchOffsetMask) << 8) >> 6) + 8);
-}
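// Sketch (standalone, assuming a standard 24-bit ARM branch offset field for
// kBranchOffsetMask): the encoder subtracts the 8-byte PC read-ahead and stores
// offset >> 2; the decoder shifts the field left by 8 and arithmetically right by
// 6 (net effect: sign-extend and multiply by 4), then adds the 8 back. Like the
// original, this relies on an arithmetic right shift of negative values.
#include <cstdint>

constexpr int32_t kMask24 = 0x00ffffff;  // Assumed offset-field width.

inline int32_t EncodeOffset24(int32_t byte_offset) {
  return ((byte_offset - 8) >> 2) & kMask24;
}

inline int32_t DecodeOffset24(int32_t field) {
  return (((field & kMask24) << 8) >> 6) + 8;
}
// Round trip: DecodeOffset24(EncodeOffset24(-16)) == -16,
//             DecodeOffset24(EncodeOffset24(1024)) == 1024.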
-
-
-uint32_t Arm32Assembler::GetAdjustedPosition(uint32_t old_position ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "Unimplemented.";
-  UNREACHABLE();
-}
-
-Literal* Arm32Assembler::NewLiteral(size_t size ATTRIBUTE_UNUSED,
-                                    const uint8_t* data ATTRIBUTE_UNUSED)  {
-  LOG(FATAL) << "Unimplemented.";
-  UNREACHABLE();
-}
-
-void Arm32Assembler::LoadLiteral(Register rt ATTRIBUTE_UNUSED,
-                                 Literal* literal ATTRIBUTE_UNUSED)  {
-  LOG(FATAL) << "Unimplemented.";
-  UNREACHABLE();
-}
-
-void Arm32Assembler::LoadLiteral(Register rt ATTRIBUTE_UNUSED, Register rt2 ATTRIBUTE_UNUSED,
-                                 Literal* literal ATTRIBUTE_UNUSED)  {
-  LOG(FATAL) << "Unimplemented.";
-  UNREACHABLE();
-}
-
-void Arm32Assembler::LoadLiteral(SRegister sd ATTRIBUTE_UNUSED,
-                                 Literal* literal ATTRIBUTE_UNUSED)  {
-  LOG(FATAL) << "Unimplemented.";
-  UNREACHABLE();
-}
-
-void Arm32Assembler::LoadLiteral(DRegister dd ATTRIBUTE_UNUSED,
-                                 Literal* literal ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "Unimplemented.";
-  UNREACHABLE();
-}
-
-
-void Arm32Assembler::AddConstant(Register rd, Register rn, int32_t value,
-                                 Condition cond, SetCc set_cc) {
-  if (value == 0 && set_cc != kCcSet) {
-    if (rd != rn) {
-      mov(rd, ShifterOperand(rn), cond, set_cc);
-    }
-    return;
-  }
-  // We prefer the shorter code sequence over selecting add for positive values
-  // and sub for negative ones, even though the latter would slightly improve the
-  // readability of the generated code for some constants.
-  ShifterOperand shifter_op;
-  if (ShifterOperandCanHoldArm32(value, &shifter_op)) {
-    add(rd, rn, shifter_op, cond, set_cc);
-  } else if (ShifterOperandCanHoldArm32(-value, &shifter_op)) {
-    sub(rd, rn, shifter_op, cond, set_cc);
-  } else {
-    CHECK(rn != IP);
-    if (ShifterOperandCanHoldArm32(~value, &shifter_op)) {
-      mvn(IP, shifter_op, cond, kCcKeep);
-      add(rd, rn, ShifterOperand(IP), cond, set_cc);
-    } else if (ShifterOperandCanHoldArm32(~(-value), &shifter_op)) {
-      mvn(IP, shifter_op, cond, kCcKeep);
-      sub(rd, rn, ShifterOperand(IP), cond, set_cc);
-    } else {
-      movw(IP, Low16Bits(value), cond);
-      uint16_t value_high = High16Bits(value);
-      if (value_high != 0) {
-        movt(IP, value_high, cond);
-      }
-      add(rd, rn, ShifterOperand(IP), cond, set_cc);
-    }
-  }
-}
-
-void Arm32Assembler::CmpConstant(Register rn, int32_t value, Condition cond) {
-  ShifterOperand shifter_op;
-  if (ShifterOperandCanHoldArm32(value, &shifter_op)) {
-    cmp(rn, shifter_op, cond);
-  } else if (ShifterOperandCanHoldArm32(~value, &shifter_op)) {
-    cmn(rn, shifter_op, cond);
-  } else {
-    movw(IP, Low16Bits(value), cond);
-    uint16_t value_high = High16Bits(value);
-    if (value_high != 0) {
-      movt(IP, value_high, cond);
-    }
-    cmp(rn, ShifterOperand(IP), cond);
-  }
-}
-
-void Arm32Assembler::LoadImmediate(Register rd, int32_t value, Condition cond) {
-  ShifterOperand shifter_op;
-  if (ShifterOperandCanHoldArm32(value, &shifter_op)) {
-    mov(rd, shifter_op, cond);
-  } else if (ShifterOperandCanHoldArm32(~value, &shifter_op)) {
-    mvn(rd, shifter_op, cond);
-  } else {
-    movw(rd, Low16Bits(value), cond);
-    uint16_t value_high = High16Bits(value);
-    if (value_high != 0) {
-      movt(rd, value_high, cond);
-    }
-  }
-}
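// Sketch (standalone, illustrating the fall-through above): an ARM "modified
// immediate" is an 8-bit value rotated right by an even amount, which is what
// ShifterOperandCanHoldArm32() tests. If neither the value nor its bitwise NOT
// fits, the constant is materialized with movw (low 16 bits) plus movt when the
// high half is non-zero.
#include <cstdint>

inline bool IsArmModifiedImmediate(uint32_t value) {
  for (uint32_t rot = 0; rot < 32; rot += 2) {
    // Rotating left by `rot` undoes a rotate-right-by-`rot` encoding.
    uint32_t undone = (value << rot) | (value >> ((32u - rot) & 31u));
    if (undone <= 0xffu) {
      return true;
    }
  }
  return false;
}
// Examples: 0xff000000 and 0x00000104 are encodable; 0x12345678 is not, so it
// would take the movw/movt path above.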
-
-void Arm32Assembler::LoadDImmediate(DRegister dd, double value, Condition cond) {
-  if (!vmovd(dd, value, cond)) {
-    uint64_t int_value = bit_cast<uint64_t, double>(value);
-    if (int_value == bit_cast<uint64_t, double>(0.0)) {
-      // 0.0 is quite common, so we special-case it by loading
-      // 2.0 in `dd` and then subtracting it from itself.
-      bool success = vmovd(dd, 2.0, cond);
-      CHECK(success);
-      vsubd(dd, dd, dd, cond);
-    } else {
-      if (dd < 16) {
-        // Note: Depending on the particular CPU, this may cause a register
-        // forwarding hazard, negatively impacting performance.
-        SRegister low = static_cast<SRegister>(dd << 1);
-        SRegister high = static_cast<SRegister>(low + 1);
-        LoadSImmediate(low, bit_cast<float, uint32_t>(Low32Bits(int_value)), cond);
-        if (High32Bits(int_value) == Low32Bits(int_value)) {
-          vmovs(high, low);
-        } else {
-          LoadSImmediate(high, bit_cast<float, uint32_t>(High32Bits(int_value)), cond);
-        }
-      } else {
-        LOG(FATAL) << "Unimplemented loading of double into a D register "
-                   << "that cannot be split into two S registers";
-      }
-    }
-  }
-}
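// Sketch (explanatory note): the VFPv3 vmov immediate form can only encode values
// of the shape +/-(1.m) * 2^e for a small mantissa/exponent range, so exactly 0.0
// is not encodable and vmovd(dd, 0.0, cond) fails. Loading an encodable constant
// (2.0) and subtracting it from itself produces +0.0 without going through core
// registers:
#include <cassert>
#include <cstdint>
#include <cstring>

inline uint64_t PositiveZeroBits() {
  double two = 2.0;
  double zero = two - two;  // Mirrors "vsub dd, dd, dd" after loading 2.0.
  uint64_t bits;
  std::memcpy(&bits, &zero, sizeof(bits));
  assert(bits == 0u);       // Positive zero: all 64 bits clear.
  return bits;
}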
-
-// Implementation note: this method must emit at most one instruction when
-// Address::CanHoldLoadOffsetArm.
-void Arm32Assembler::LoadFromOffset(LoadOperandType type,
-                                    Register reg,
-                                    Register base,
-                                    int32_t offset,
-                                    Condition cond) {
-  if (!Address::CanHoldLoadOffsetArm(type, offset)) {
-    CHECK(base != IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
-    base = IP;
-    offset = 0;
-  }
-  CHECK(Address::CanHoldLoadOffsetArm(type, offset));
-  switch (type) {
-    case kLoadSignedByte:
-      ldrsb(reg, Address(base, offset), cond);
-      break;
-    case kLoadUnsignedByte:
-      ldrb(reg, Address(base, offset), cond);
-      break;
-    case kLoadSignedHalfword:
-      ldrsh(reg, Address(base, offset), cond);
-      break;
-    case kLoadUnsignedHalfword:
-      ldrh(reg, Address(base, offset), cond);
-      break;
-    case kLoadWord:
-      ldr(reg, Address(base, offset), cond);
-      break;
-    case kLoadWordPair:
-      ldrd(reg, Address(base, offset), cond);
-      break;
-    default:
-      LOG(FATAL) << "UNREACHABLE";
-      UNREACHABLE();
-  }
-}
-
-
-// Implementation note: this method must emit at most one instruction when
-// Address::CanHoldLoadOffsetArm, as expected by JIT::GuardedLoadFromOffset.
-void Arm32Assembler::LoadSFromOffset(SRegister reg,
-                                     Register base,
-                                     int32_t offset,
-                                     Condition cond) {
-  if (!Address::CanHoldLoadOffsetArm(kLoadSWord, offset)) {
-    CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
-    base = IP;
-    offset = 0;
-  }
-  CHECK(Address::CanHoldLoadOffsetArm(kLoadSWord, offset));
-  vldrs(reg, Address(base, offset), cond);
-}
-
-
-// Implementation note: this method must emit at most one instruction when
-// Address::CanHoldLoadOffsetArm, as expected by JIT::GuardedLoadFromOffset.
-void Arm32Assembler::LoadDFromOffset(DRegister reg,
-                                     Register base,
-                                     int32_t offset,
-                                     Condition cond) {
-  if (!Address::CanHoldLoadOffsetArm(kLoadDWord, offset)) {
-    CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
-    base = IP;
-    offset = 0;
-  }
-  CHECK(Address::CanHoldLoadOffsetArm(kLoadDWord, offset));
-  vldrd(reg, Address(base, offset), cond);
-}
-
-
-// Implementation note: this method must emit at most one instruction when
-// Address::CanHoldStoreOffsetArm.
-void Arm32Assembler::StoreToOffset(StoreOperandType type,
-                                   Register reg,
-                                   Register base,
-                                   int32_t offset,
-                                   Condition cond) {
-  if (!Address::CanHoldStoreOffsetArm(type, offset)) {
-    CHECK(reg != IP);
-    CHECK(base != IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
-    base = IP;
-    offset = 0;
-  }
-  CHECK(Address::CanHoldStoreOffsetArm(type, offset));
-  switch (type) {
-    case kStoreByte:
-      strb(reg, Address(base, offset), cond);
-      break;
-    case kStoreHalfword:
-      strh(reg, Address(base, offset), cond);
-      break;
-    case kStoreWord:
-      str(reg, Address(base, offset), cond);
-      break;
-    case kStoreWordPair:
-      strd(reg, Address(base, offset), cond);
-      break;
-    default:
-      LOG(FATAL) << "UNREACHABLE";
-      UNREACHABLE();
-  }
-}
-
-
-// Implementation note: this method must emit at most one instruction when
-// Address::CanHoldStoreOffsetArm, as expected by JIT::GuardedStoreToOffset.
-void Arm32Assembler::StoreSToOffset(SRegister reg,
-                                    Register base,
-                                    int32_t offset,
-                                    Condition cond) {
-  if (!Address::CanHoldStoreOffsetArm(kStoreSWord, offset)) {
-    CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
-    base = IP;
-    offset = 0;
-  }
-  CHECK(Address::CanHoldStoreOffsetArm(kStoreSWord, offset));
-  vstrs(reg, Address(base, offset), cond);
-}
-
-
-// Implementation note: this method must emit at most one instruction when
-// Address::CanHoldStoreOffsetArm, as expected by JIT::GuardedStoreSToOffset.
-void Arm32Assembler::StoreDToOffset(DRegister reg,
-                                    Register base,
-                                    int32_t offset,
-                                    Condition cond) {
-  if (!Address::CanHoldStoreOffsetArm(kStoreDWord, offset)) {
-    CHECK_NE(base, IP);
-    LoadImmediate(IP, offset, cond);
-    add(IP, IP, ShifterOperand(base), cond);
-    base = IP;
-    offset = 0;
-  }
-  CHECK(Address::CanHoldStoreOffsetArm(kStoreDWord, offset));
-  vstrd(reg, Address(base, offset), cond);
-}
-
-
-void Arm32Assembler::dmb(DmbOptions flavor) {
-  int32_t encoding = 0xf57ff05f;  // dmb
-  Emit(encoding | flavor);
-}
-
-
-void Arm32Assembler::cbz(Register rn ATTRIBUTE_UNUSED, Label* target ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "cbz is not supported on ARM32";
-}
-
-
-void Arm32Assembler::cbnz(Register rn ATTRIBUTE_UNUSED, Label* target ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "cbnz is not supported on ARM32";
-}
-
-
-void Arm32Assembler::CompareAndBranchIfZero(Register r, Label* label) {
-  cmp(r, ShifterOperand(0));
-  b(label, EQ);
-}
-
-
-void Arm32Assembler::CompareAndBranchIfNonZero(Register r, Label* label) {
-  cmp(r, ShifterOperand(0));
-  b(label, NE);
-}
-
-JumpTable* Arm32Assembler::CreateJumpTable(std::vector<Label*>&& labels ATTRIBUTE_UNUSED,
-                                           Register base_reg ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "CreateJumpTable is not supported on ARM32";
-  UNREACHABLE();
-}
-
-void Arm32Assembler::EmitJumpTableDispatch(JumpTable* jump_table ATTRIBUTE_UNUSED,
-                                           Register displacement_reg ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "EmitJumpTableDispatch is not supported on ARM32";
-  UNREACHABLE();
-}
-
-void Arm32Assembler::FinalizeCode() {
-  ArmAssembler::FinalizeCode();
-  // Currently the arm32 assembler does not support fixups, and thus does no label tracking. We
-  // must not call FinalizeTrackedLabels(), which would lead to an abort.
-}
-
-}  // namespace arm
-}  // namespace art
diff --git a/compiler/utils/arm/assembler_arm32.h b/compiler/utils/arm/assembler_arm32.h
deleted file mode 100644
index 0cb6b17..0000000
--- a/compiler/utils/arm/assembler_arm32.h
+++ /dev/null
@@ -1,411 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM32_H_
-#define ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM32_H_
-
-#include <vector>
-
-#include "base/logging.h"
-#include "constants_arm.h"
-#include "utils/arm/managed_register_arm.h"
-#include "utils/arm/assembler_arm.h"
-#include "offsets.h"
-
-namespace art {
-namespace arm {
-
-class Arm32Assembler FINAL : public ArmAssembler {
- public:
-  explicit Arm32Assembler(ArenaAllocator* arena) : ArmAssembler(arena) {}
-  virtual ~Arm32Assembler() {}
-
-  bool IsThumb() const OVERRIDE {
-    return false;
-  }
-
-  // Data-processing instructions.
-  virtual void and_(Register rd, Register rn, const ShifterOperand& so,
-                    Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void eor(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void sub(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void rsb(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void add(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void adc(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void sbc(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void rsc(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  void tst(Register rn, const ShifterOperand& so, Condition cond = AL) OVERRIDE;
-
-  void teq(Register rn, const ShifterOperand& so, Condition cond = AL) OVERRIDE;
-
-  void cmp(Register rn, const ShifterOperand& so, Condition cond = AL) OVERRIDE;
-
-  void cmn(Register rn, const ShifterOperand& so, Condition cond = AL) OVERRIDE;
-
-  virtual void orr(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void orn(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void mov(Register rd, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void bic(Register rd, Register rn, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void mvn(Register rd, const ShifterOperand& so,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  // Miscellaneous data-processing instructions.
-  void clz(Register rd, Register rm, Condition cond = AL) OVERRIDE;
-  void movw(Register rd, uint16_t imm16, Condition cond = AL) OVERRIDE;
-  void movt(Register rd, uint16_t imm16, Condition cond = AL) OVERRIDE;
-  void rbit(Register rd, Register rm, Condition cond = AL) OVERRIDE;
-  void rev(Register rd, Register rm, Condition cond = AL) OVERRIDE;
-  void rev16(Register rd, Register rm, Condition cond = AL) OVERRIDE;
-  void revsh(Register rd, Register rm, Condition cond = AL) OVERRIDE;
-
-  // Multiply instructions.
-  void mul(Register rd, Register rn, Register rm, Condition cond = AL) OVERRIDE;
-  void mla(Register rd, Register rn, Register rm, Register ra,
-           Condition cond = AL) OVERRIDE;
-  void mls(Register rd, Register rn, Register rm, Register ra,
-           Condition cond = AL) OVERRIDE;
-  void smull(Register rd_lo, Register rd_hi, Register rn, Register rm,
-             Condition cond = AL) OVERRIDE;
-  void umull(Register rd_lo, Register rd_hi, Register rn, Register rm,
-             Condition cond = AL) OVERRIDE;
-
-  void sdiv(Register rd, Register rn, Register rm, Condition cond = AL) OVERRIDE;
-  void udiv(Register rd, Register rn, Register rm, Condition cond = AL) OVERRIDE;
-
-  // Bit field extract instructions.
-  void sbfx(Register rd, Register rn, uint32_t lsb, uint32_t width, Condition cond = AL) OVERRIDE;
-  void ubfx(Register rd, Register rn, uint32_t lsb, uint32_t width, Condition cond = AL) OVERRIDE;
-
-  // Load/store instructions.
-  void ldr(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-  void str(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-
-  void ldrb(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-  void strb(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-
-  void ldrh(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-  void strh(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-
-  void ldrsb(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-  void ldrsh(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-
-  void ldrd(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-  void strd(Register rd, const Address& ad, Condition cond = AL) OVERRIDE;
-
-  void ldm(BlockAddressMode am, Register base,
-           RegList regs, Condition cond = AL) OVERRIDE;
-  void stm(BlockAddressMode am, Register base,
-           RegList regs, Condition cond = AL) OVERRIDE;
-
-  void ldrex(Register rd, Register rn, Condition cond = AL) OVERRIDE;
-  void strex(Register rd, Register rt, Register rn, Condition cond = AL) OVERRIDE;
-  void ldrexd(Register rt, Register rt2, Register rn, Condition cond = AL) OVERRIDE;
-  void strexd(Register rd, Register rt, Register rt2, Register rn, Condition cond = AL) OVERRIDE;
-
-  // Miscellaneous instructions.
-  void clrex(Condition cond = AL) OVERRIDE;
-  void nop(Condition cond = AL) OVERRIDE;
-
-  // Note that gdb sets breakpoints using the undefined instruction 0xe7f001f0.
-  void bkpt(uint16_t imm16) OVERRIDE;
-  void svc(uint32_t imm24) OVERRIDE;
-
-  void cbz(Register rn, Label* target) OVERRIDE;
-  void cbnz(Register rn, Label* target) OVERRIDE;
-
-  // Floating point instructions (VFPv3-D16 and VFPv3-D32 profiles).
-  void vmovsr(SRegister sn, Register rt, Condition cond = AL) OVERRIDE;
-  void vmovrs(Register rt, SRegister sn, Condition cond = AL) OVERRIDE;
-  void vmovsrr(SRegister sm, Register rt, Register rt2, Condition cond = AL) OVERRIDE;
-  void vmovrrs(Register rt, Register rt2, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vmovdrr(DRegister dm, Register rt, Register rt2, Condition cond = AL) OVERRIDE;
-  void vmovrrd(Register rt, Register rt2, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vmovs(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vmovd(DRegister dd, DRegister dm, Condition cond = AL) OVERRIDE;
-
-  // Returns false if the immediate cannot be encoded.
-  bool vmovs(SRegister sd, float s_imm, Condition cond = AL) OVERRIDE;
-  bool vmovd(DRegister dd, double d_imm, Condition cond = AL) OVERRIDE;
-
-  void vldrs(SRegister sd, const Address& ad, Condition cond = AL) OVERRIDE;
-  void vstrs(SRegister sd, const Address& ad, Condition cond = AL) OVERRIDE;
-  void vldrd(DRegister dd, const Address& ad, Condition cond = AL) OVERRIDE;
-  void vstrd(DRegister dd, const Address& ad, Condition cond = AL) OVERRIDE;
-
-  void vadds(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vaddd(DRegister dd, DRegister dn, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vsubs(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vsubd(DRegister dd, DRegister dn, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vmuls(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vmuld(DRegister dd, DRegister dn, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vmlas(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vmlad(DRegister dd, DRegister dn, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vmlss(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vmlsd(DRegister dd, DRegister dn, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vdivs(SRegister sd, SRegister sn, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vdivd(DRegister dd, DRegister dn, DRegister dm, Condition cond = AL) OVERRIDE;
-
-  void vabss(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vabsd(DRegister dd, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vnegs(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vnegd(DRegister dd, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vsqrts(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vsqrtd(DRegister dd, DRegister dm, Condition cond = AL) OVERRIDE;
-
-  void vcvtsd(SRegister sd, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vcvtds(DRegister dd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vcvtis(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vcvtid(SRegister sd, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vcvtsi(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vcvtdi(DRegister dd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vcvtus(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vcvtud(SRegister sd, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vcvtsu(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vcvtdu(DRegister dd, SRegister sm, Condition cond = AL) OVERRIDE;
-
-  void vcmps(SRegister sd, SRegister sm, Condition cond = AL) OVERRIDE;
-  void vcmpd(DRegister dd, DRegister dm, Condition cond = AL) OVERRIDE;
-  void vcmpsz(SRegister sd, Condition cond = AL) OVERRIDE;
-  void vcmpdz(DRegister dd, Condition cond = AL) OVERRIDE;
-  void vmstat(Condition cond = AL) OVERRIDE;  // VMRS APSR_nzcv, FPSCR
-
-  void vcntd(DRegister dd, DRegister dm) OVERRIDE;
-  void vpaddld(DRegister dd, DRegister dm, int32_t size, bool is_unsigned) OVERRIDE;
-
-  void vpushs(SRegister reg, int nregs, Condition cond = AL) OVERRIDE;
-  void vpushd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
-  void vpops(SRegister reg, int nregs, Condition cond = AL) OVERRIDE;
-  void vpopd(DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
-  void vldmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
-  void vstmiad(Register base_reg, DRegister reg, int nregs, Condition cond = AL) OVERRIDE;
-
-  // Branch instructions.
-  void b(Label* label, Condition cond = AL) OVERRIDE;
-  void bl(Label* label, Condition cond = AL) OVERRIDE;
-  void blx(Register rm, Condition cond = AL) OVERRIDE;
-  void bx(Register rm, Condition cond = AL) OVERRIDE;
-  virtual void Lsl(Register rd, Register rm, uint32_t shift_imm,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-  virtual void Lsr(Register rd, Register rm, uint32_t shift_imm,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-  virtual void Asr(Register rd, Register rm, uint32_t shift_imm,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-  virtual void Ror(Register rd, Register rm, uint32_t shift_imm,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-  virtual void Rrx(Register rd, Register rm,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  virtual void Lsl(Register rd, Register rm, Register rn,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-  virtual void Lsr(Register rd, Register rm, Register rn,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-  virtual void Asr(Register rd, Register rm, Register rn,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-  virtual void Ror(Register rd, Register rm, Register rn,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  void Push(Register rd, Condition cond = AL) OVERRIDE;
-  void Pop(Register rd, Condition cond = AL) OVERRIDE;
-
-  void PushList(RegList regs, Condition cond = AL) OVERRIDE;
-  void PopList(RegList regs, Condition cond = AL) OVERRIDE;
-
-  void Mov(Register rd, Register rm, Condition cond = AL) OVERRIDE;
-
-  void CompareAndBranchIfZero(Register r, Label* label) OVERRIDE;
-  void CompareAndBranchIfNonZero(Register r, Label* label) OVERRIDE;
-
-  // Memory barriers.
-  void dmb(DmbOptions flavor) OVERRIDE;
-
-  // Get the final position of a label after local fixup based on the old position
-  // recorded before FinalizeCode().
-  uint32_t GetAdjustedPosition(uint32_t old_position) OVERRIDE;
-
-  Literal* NewLiteral(size_t size, const uint8_t* data) OVERRIDE;
-  void LoadLiteral(Register rt, Literal* literal) OVERRIDE;
-  void LoadLiteral(Register rt, Register rt2, Literal* literal) OVERRIDE;
-  void LoadLiteral(SRegister sd, Literal* literal) OVERRIDE;
-  void LoadLiteral(DRegister dd, Literal* literal) OVERRIDE;
-
-  // Add signed constant value to rd. May clobber IP.
-  void AddConstant(Register rd, Register rn, int32_t value,
-                   Condition cond = AL, SetCc set_cc = kCcDontCare) OVERRIDE;
-
-  void CmpConstant(Register rn, int32_t value, Condition cond = AL) OVERRIDE;
-
-  // Load and Store. May clobber IP.
-  void LoadImmediate(Register rd, int32_t value, Condition cond = AL) OVERRIDE;
-  void LoadDImmediate(DRegister dd, double value, Condition cond = AL) OVERRIDE;
-  void MarkExceptionHandler(Label* label) OVERRIDE;
-  void LoadFromOffset(LoadOperandType type,
-                      Register reg,
-                      Register base,
-                      int32_t offset,
-                      Condition cond = AL) OVERRIDE;
-  void StoreToOffset(StoreOperandType type,
-                     Register reg,
-                     Register base,
-                     int32_t offset,
-                     Condition cond = AL) OVERRIDE;
-  void LoadSFromOffset(SRegister reg,
-                       Register base,
-                       int32_t offset,
-                       Condition cond = AL) OVERRIDE;
-  void StoreSToOffset(SRegister reg,
-                      Register base,
-                      int32_t offset,
-                      Condition cond = AL) OVERRIDE;
-  void LoadDFromOffset(DRegister reg,
-                       Register base,
-                       int32_t offset,
-                       Condition cond = AL) OVERRIDE;
-  void StoreDToOffset(DRegister reg,
-                      Register base,
-                      int32_t offset,
-                      Condition cond = AL) OVERRIDE;
-
-  bool ShifterOperandCanHold(Register rd,
-                             Register rn,
-                             Opcode opcode,
-                             uint32_t immediate,
-                             SetCc set_cc,
-                             ShifterOperand* shifter_op) OVERRIDE;
-  using ArmAssembler::ShifterOperandCanHold;  // Don't hide the non-virtual override.
-
-  bool ShifterOperandCanAlwaysHold(uint32_t immediate) OVERRIDE;
-
-  static bool IsInstructionForExceptionHandling(uintptr_t pc);
-
-  // Emit data (e.g. encoded instruction or immediate) to the
-  // instruction stream.
-  void Emit(int32_t value);
-  void Bind(Label* label) OVERRIDE;
-
-  JumpTable* CreateJumpTable(std::vector<Label*>&& labels, Register base_reg) OVERRIDE;
-  void EmitJumpTableDispatch(JumpTable* jump_table, Register displacement_reg) OVERRIDE;
-
-  void FinalizeCode() OVERRIDE;
-
- private:
-  void EmitType01(Condition cond,
-                  int type,
-                  Opcode opcode,
-                  SetCc set_cc,
-                  Register rn,
-                  Register rd,
-                  const ShifterOperand& so);
-
-  void EmitType5(Condition cond, int offset, bool link);
-
-  void EmitMemOp(Condition cond,
-                 bool load,
-                 bool byte,
-                 Register rd,
-                 const Address& ad);
-
-  void EmitMemOpAddressMode3(Condition cond,
-                             int32_t mode,
-                             Register rd,
-                             const Address& ad);
-
-  void EmitMultiMemOp(Condition cond,
-                      BlockAddressMode am,
-                      bool load,
-                      Register base,
-                      RegList regs);
-
-  void EmitShiftImmediate(Condition cond,
-                          Shift opcode,
-                          Register rd,
-                          Register rm,
-                          const ShifterOperand& so);
-
-  void EmitShiftRegister(Condition cond,
-                         Shift opcode,
-                         Register rd,
-                         Register rm,
-                         const ShifterOperand& so);
-
-  void EmitMulOp(Condition cond,
-                 int32_t opcode,
-                 Register rd,
-                 Register rn,
-                 Register rm,
-                 Register rs);
-
-  void EmitVFPsss(Condition cond,
-                  int32_t opcode,
-                  SRegister sd,
-                  SRegister sn,
-                  SRegister sm);
-
-  void EmitVFPddd(Condition cond,
-                  int32_t opcode,
-                  DRegister dd,
-                  DRegister dn,
-                  DRegister dm);
-
-  void EmitVFPsd(Condition cond,
-                 int32_t opcode,
-                 SRegister sd,
-                 DRegister dm);
-
-  void EmitVFPds(Condition cond,
-                 int32_t opcode,
-                 DRegister dd,
-                 SRegister sm);
-
-  void EmitVPushPop(uint32_t reg, int nregs, bool push, bool dbl, Condition cond);
-
-  void EmitMiscellaneous(Condition cond, uint8_t op1, uint8_t op2,
-                         uint32_t a_part, uint32_t rest);
-  void EmitReverseBytes(Register rd, Register rm, Condition cond,
-                        uint8_t op1, uint8_t op2);
-
-  void EmitBranch(Condition cond, Label* label, bool link);
-  static int32_t EncodeBranchOffset(int offset, int32_t inst);
-  static int DecodeBranchOffset(int32_t inst);
-  bool ShifterOperandCanHoldArm32(uint32_t immediate, ShifterOperand* shifter_op);
-};
-
-}  // namespace arm
-}  // namespace art
-
-#endif  // ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM32_H_
diff --git a/compiler/utils/arm/assembler_arm32_test.cc b/compiler/utils/arm/assembler_arm32_test.cc
deleted file mode 100644
index b214062..0000000
--- a/compiler/utils/arm/assembler_arm32_test.cc
+++ /dev/null
@@ -1,941 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "assembler_arm32.h"
-
-#include <functional>
-#include <type_traits>
-
-#include "base/macros.h"
-#include "base/stl_util.h"
-#include "utils/arm/assembler_arm_test.h"
-
-namespace art {
-
-using std::placeholders::_1;
-using std::placeholders::_2;
-using std::placeholders::_3;
-using std::placeholders::_4;
-using std::placeholders::_5;
-
-// To speed up tests, don't use all register combinations.
-static constexpr bool kUseSparseRegisterList = true;
-
-// To speed up tests, don't use all condition codes.
-static constexpr bool kUseSparseConditionList = true;
-
-// To speed up tests, don't use all shift immediates.
-static constexpr bool kUseSparseShiftImmediates = true;
-
-class AssemblerArm32Test : public AssemblerArmTest<arm::Arm32Assembler,
-                                                   arm::Register, arm::SRegister,
-                                                   uint32_t, arm::ShifterOperand, arm::Condition,
-                                                   arm::SetCc> {
- protected:
-  std::string GetArchitectureString() OVERRIDE {
-    return "arm";
-  }
-
-  std::string GetAssemblerParameters() OVERRIDE {
-    // Arm-v7a, cortex-a15 (means we have sdiv).
-    return " -march=armv7-a -mcpu=cortex-a15 -mfpu=neon";
-  }
-
-  const char* GetAssemblyHeader() OVERRIDE {
-    return kArm32AssemblyHeader;
-  }
-
-  std::string GetDisassembleParameters() OVERRIDE {
-    return " -D -bbinary -marm --no-show-raw-insn";
-  }
-
-  void SetUpHelpers() OVERRIDE {
-    if (registers_.size() == 0) {
-      if (kUseSparseRegisterList) {
-        registers_.insert(end(registers_),
-                          {  // NOLINT(whitespace/braces)
-                              new arm::Register(arm::R0),
-                              new arm::Register(arm::R1),
-                              new arm::Register(arm::R4),
-                              new arm::Register(arm::R8),
-                              new arm::Register(arm::R11),
-                              new arm::Register(arm::R12),
-                              new arm::Register(arm::R13),
-                              new arm::Register(arm::R14),
-                              new arm::Register(arm::R15)
-                          });
-      } else {
-        registers_.insert(end(registers_),
-                          {  // NOLINT(whitespace/braces)
-                              new arm::Register(arm::R0),
-                              new arm::Register(arm::R1),
-                              new arm::Register(arm::R2),
-                              new arm::Register(arm::R3),
-                              new arm::Register(arm::R4),
-                              new arm::Register(arm::R5),
-                              new arm::Register(arm::R6),
-                              new arm::Register(arm::R7),
-                              new arm::Register(arm::R8),
-                              new arm::Register(arm::R9),
-                              new arm::Register(arm::R10),
-                              new arm::Register(arm::R11),
-                              new arm::Register(arm::R12),
-                              new arm::Register(arm::R13),
-                              new arm::Register(arm::R14),
-                              new arm::Register(arm::R15)
-                          });
-      }
-    }
-
-    if (!kUseSparseConditionList) {
-      conditions_.push_back(arm::Condition::EQ);
-      conditions_.push_back(arm::Condition::NE);
-      conditions_.push_back(arm::Condition::CS);
-      conditions_.push_back(arm::Condition::CC);
-      conditions_.push_back(arm::Condition::MI);
-      conditions_.push_back(arm::Condition::PL);
-      conditions_.push_back(arm::Condition::VS);
-      conditions_.push_back(arm::Condition::VC);
-      conditions_.push_back(arm::Condition::HI);
-      conditions_.push_back(arm::Condition::LS);
-      conditions_.push_back(arm::Condition::GE);
-      conditions_.push_back(arm::Condition::LT);
-      conditions_.push_back(arm::Condition::GT);
-      conditions_.push_back(arm::Condition::LE);
-      conditions_.push_back(arm::Condition::AL);
-    } else {
-      conditions_.push_back(arm::Condition::EQ);
-      conditions_.push_back(arm::Condition::NE);
-      conditions_.push_back(arm::Condition::CC);
-      conditions_.push_back(arm::Condition::VC);
-      conditions_.push_back(arm::Condition::HI);
-      conditions_.push_back(arm::Condition::LT);
-      conditions_.push_back(arm::Condition::AL);
-    }
-
-    set_ccs_.push_back(arm::kCcDontCare);
-    set_ccs_.push_back(arm::kCcSet);
-    set_ccs_.push_back(arm::kCcKeep);
-
-    shifter_operands_.push_back(arm::ShifterOperand(0));
-    shifter_operands_.push_back(arm::ShifterOperand(1));
-    shifter_operands_.push_back(arm::ShifterOperand(2));
-    shifter_operands_.push_back(arm::ShifterOperand(3));
-    shifter_operands_.push_back(arm::ShifterOperand(4));
-    shifter_operands_.push_back(arm::ShifterOperand(5));
-    shifter_operands_.push_back(arm::ShifterOperand(127));
-    shifter_operands_.push_back(arm::ShifterOperand(128));
-    shifter_operands_.push_back(arm::ShifterOperand(254));
-    shifter_operands_.push_back(arm::ShifterOperand(255));
-
-    if (!kUseSparseRegisterList) {
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R0));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R1));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R2));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R3));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R4));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R5));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R6));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R7));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R8));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R9));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R10));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R11));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R12));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R13));
-    } else {
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R0));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R1));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R4));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R8));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R11));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R12));
-      shifter_operands_.push_back(arm::ShifterOperand(arm::R13));
-    }
-
-    std::vector<arm::Shift> shifts {
-      arm::Shift::LSL, arm::Shift::LSR, arm::Shift::ASR, arm::Shift::ROR, arm::Shift::RRX
-    };
-
-    // ShifterOperands of form "reg shift-type imm."
-    for (arm::Shift shift : shifts) {
-      for (arm::Register* reg : registers_) {  // Note: this will pick up the sparse set.
-        if (*reg == arm::R15) {  // Skip PC.
-          continue;
-        }
-        if (shift != arm::Shift::RRX) {
-          if (!kUseSparseShiftImmediates) {
-            for (uint32_t imm = 1; imm < 32; ++imm) {
-              shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, imm));
-            }
-          } else {
-            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 1));
-            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 2));
-            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 3));
-            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 7));
-            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 15));
-            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 16));
-            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 30));
-            shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 31));
-          }
-        } else {
-          // RRX doesn't have an immediate.
-          shifter_operands_.push_back(arm::ShifterOperand(*reg, shift, 0));
-        }
-      }
-    }
-  }
-
-  std::vector<arm::ShifterOperand> CreateRegisterShifts(std::vector<arm::Register*>& base_regs,
-                                                        int32_t shift_min, int32_t shift_max) {
-    std::vector<arm::ShifterOperand> res;
-    static constexpr arm::Shift kShifts[] = { arm::Shift::LSL, arm::Shift::LSR, arm::Shift::ASR,
-                                              arm::Shift::ROR };
-
-    for (arm::Shift shift : kShifts) {
-      for (arm::Register* reg : base_regs) {
-        // Take the min, the max, and three values in between.
-        res.push_back(arm::ShifterOperand(*reg, shift, shift_min));
-        if (shift_min != shift_max) {
-          res.push_back(arm::ShifterOperand(*reg, shift, shift_max));
-          int32_t middle = (shift_min + shift_max) / 2;
-          res.push_back(arm::ShifterOperand(*reg, shift, middle));
-          res.push_back(arm::ShifterOperand(*reg, shift, middle - 1));
-          res.push_back(arm::ShifterOperand(*reg, shift, middle + 1));
-        }
-      }
-    }
-
-    return res;
-  }
-
-  void TearDown() OVERRIDE {
-    AssemblerArmTest::TearDown();
-    STLDeleteElements(&registers_);
-  }
-
-  std::vector<arm::Register*> GetRegisters() OVERRIDE {
-    return registers_;
-  }
-
-  uint32_t CreateImmediate(int64_t imm_value) OVERRIDE {
-    return imm_value;
-  }
-
-  std::vector<arm::Condition>& GetConditions() OVERRIDE {
-    return conditions_;
-  }
-
-  std::string GetConditionString(arm::Condition c) OVERRIDE {
-    std::ostringstream oss;
-    oss << c;
-    return oss.str();
-  }
-
-  std::vector<arm::SetCc>& GetSetCcs() OVERRIDE {
-    return set_ccs_;
-  }
-
-  std::string GetSetCcString(arm::SetCc s) OVERRIDE {
-    // For arm32, kCcDontCare defaults to not setting condition codes.
-    return s == arm::kCcSet ? "s" : "";
-  }
-
-  arm::Register GetPCRegister() OVERRIDE {
-    return arm::R15;
-  }
-
-  std::vector<arm::ShifterOperand>& GetShiftOperands() OVERRIDE {
-    return shifter_operands_;
-  }
-
-  std::string GetShiftString(arm::ShifterOperand sop) OVERRIDE {
-    std::ostringstream oss;
-    if (sop.IsShift()) {
-      // Not a rotate...
-      if (sop.GetShift() == arm::Shift::RRX) {
-        oss << sop.GetRegister() << ", " << sop.GetShift();
-      } else {
-        oss << sop.GetRegister() << ", " << sop.GetShift() << " #" << sop.GetImmediate();
-      }
-    } else if (sop.IsRegister()) {
-      oss << sop.GetRegister();
-    } else {
-      CHECK(sop.IsImmediate());
-      oss << "#" << sop.GetImmediate();
-    }
-    return oss.str();
-  }
-
-  static const char* GetRegTokenFromDepth(int depth) {
-    switch (depth) {
-      case 0:
-        return Base::REG1_TOKEN;
-      case 1:
-        return Base::REG2_TOKEN;
-      case 2:
-        return Base::REG3_TOKEN;
-      case 3:
-        return REG4_TOKEN;
-      default:
-        LOG(FATAL) << "Depth problem.";
-        UNREACHABLE();
-    }
-  }
-
-  void ExecuteAndPrint(std::function<void()> f, std::string fmt, std::ostringstream& oss) {
-    if (first_) {
-      first_ = false;
-    } else {
-      oss << "\n";
-    }
-    oss << fmt;
-
-    f();
-  }
-
-  // NOTE: Only support simple test like "aaa=bbb"
-  bool EvalFilterString(std::string filter) {
-    if (filter.compare("") == 0) {
-      return false;
-    }
-
-    size_t equal_sign_index = filter.find('=');
-    if (equal_sign_index == std::string::npos) {
-      EXPECT_TRUE(false) << "Unsupported filter string.";
-    }
-
-    std::string lhs = filter.substr(0, equal_sign_index);
-    std::string rhs = filter.substr(equal_sign_index + 1, std::string::npos);
-    return lhs.compare(rhs) == 0;
-  }
-
-  void TemplateHelper(std::function<void(arm::Register)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc, std::string fmt, std::string filter,
-                      std::ostringstream& oss) {
-    std::vector<arm::Register*> registers = without_pc ? GetRegistersWithoutPC() : GetRegisters();
-    for (auto reg : registers) {
-      std::string after_reg = fmt;
-      std::string after_reg_filter = filter;
-
-      std::string reg_string = GetRegName<RegisterView::kUsePrimaryName>(*reg);
-      size_t reg_index;
-      const char* reg_token = GetRegTokenFromDepth(depth);
-
-      while ((reg_index = after_reg.find(reg_token)) != std::string::npos) {
-        after_reg.replace(reg_index, strlen(reg_token), reg_string);
-      }
-
-      while ((reg_index = after_reg_filter.find(reg_token)) != std::string::npos) {
-        after_reg_filter.replace(reg_index, strlen(reg_token), reg_string);
-      }
-      if (EvalFilterString(after_reg_filter)) {
-        continue;
-      }
-
-      ExecuteAndPrint([&] () { f(*reg); }, after_reg, oss);
-    }
-  }
-
-  void TemplateHelper(std::function<void(const arm::ShifterOperand&)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::string filter,
-                      std::ostringstream& oss) {
-    for (const arm::ShifterOperand& shift : GetShiftOperands()) {
-      std::string after_shift = fmt;
-      std::string after_shift_filter = filter;
-
-      std::string shift_string = GetShiftString(shift);
-      size_t shift_index;
-      while ((shift_index = after_shift.find(SHIFT_TOKEN)) != std::string::npos) {
-        after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
-      }
-
-      while ((shift_index = after_shift_filter.find(SHIFT_TOKEN)) != std::string::npos) {
-        after_shift_filter.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
-      }
-      if (EvalFilterString(after_shift_filter)) {
-        continue;
-      }
-
-      ExecuteAndPrint([&] () { f(shift); }, after_shift, oss);
-    }
-  }
-
-  void TemplateHelper(std::function<void(arm::Condition)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::string filter,
-                      std::ostringstream& oss) {
-    for (arm::Condition c : GetConditions()) {
-      std::string after_cond = fmt;
-      std::string after_cond_filter = filter;
-
-      size_t cond_index = after_cond.find(COND_TOKEN);
-      if (cond_index != std::string::npos) {
-        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
-      }
-
-      cond_index = after_cond_filter.find(COND_TOKEN);
-      if (cond_index != std::string::npos) {
-        after_cond_filter.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
-      }
-      if (EvalFilterString(after_cond_filter)) {
-        continue;
-      }
-
-      ExecuteAndPrint([&] () { f(c); }, after_cond, oss);
-    }
-  }
-
-  void TemplateHelper(std::function<void(arm::SetCc)> f, int depth ATTRIBUTE_UNUSED,
-                      bool without_pc ATTRIBUTE_UNUSED, std::string fmt, std::string filter,
-                      std::ostringstream& oss) {
-    for (arm::SetCc s : GetSetCcs()) {
-      std::string after_cond = fmt;
-      std::string after_cond_filter = filter;
-
-      size_t cond_index = after_cond.find(SET_CC_TOKEN);
-      if (cond_index != std::string::npos) {
-        after_cond.replace(cond_index, ConstexprStrLen(SET_CC_TOKEN), GetSetCcString(s));
-      }
-
-      cond_index = after_cond_filter.find(SET_CC_TOKEN);
-      if (cond_index != std::string::npos) {
-        after_cond_filter.replace(cond_index, ConstexprStrLen(SET_CC_TOKEN), GetSetCcString(s));
-      }
-      if (EvalFilterString(after_cond_filter)) {
-        continue;
-      }
-
-      ExecuteAndPrint([&] () { f(s); }, after_cond, oss);
-    }
-  }
-
-  template <typename... Args>
-  void TemplateHelper(std::function<void(arm::Register, Args...)> f, int depth, bool without_pc,
-                      std::string fmt, std::string filter, std::ostringstream& oss) {
-    std::vector<arm::Register*> registers = without_pc ? GetRegistersWithoutPC() : GetRegisters();
-    for (auto reg : registers) {
-      std::string after_reg = fmt;
-      std::string after_reg_filter = filter;
-
-      std::string reg_string = GetRegName<RegisterView::kUsePrimaryName>(*reg);
-      size_t reg_index;
-      const char* reg_token = GetRegTokenFromDepth(depth);
-
-      while ((reg_index = after_reg.find(reg_token)) != std::string::npos) {
-        after_reg.replace(reg_index, strlen(reg_token), reg_string);
-      }
-
-      while ((reg_index = after_reg_filter.find(reg_token)) != std::string::npos) {
-        after_reg_filter.replace(reg_index, strlen(reg_token), reg_string);
-      }
-      if (EvalFilterString(after_reg_filter)) {
-        continue;
-      }
-
-      auto lambda = [&] (Args... args) { f(*reg, args...); };  // NOLINT [readability/braces] [4]
-      TemplateHelper(std::function<void(Args...)>(lambda), depth + 1, without_pc,
-          after_reg, after_reg_filter, oss);
-    }
-  }
-
-  template <typename... Args>
-  void TemplateHelper(std::function<void(const arm::ShifterOperand&, Args...)> f, int depth,
-                      bool without_pc, std::string fmt, std::string filter,
-                      std::ostringstream& oss) {
-    for (const arm::ShifterOperand& shift : GetShiftOperands()) {
-      std::string after_shift = fmt;
-      std::string after_shift_filter = filter;
-
-      std::string shift_string = GetShiftString(shift);
-      size_t shift_index;
-      while ((shift_index = after_shift.find(SHIFT_TOKEN)) != std::string::npos) {
-        after_shift.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
-      }
-
-      while ((shift_index = after_shift_filter.find(SHIFT_TOKEN)) != std::string::npos) {
-        after_shift_filter.replace(shift_index, ConstexprStrLen(SHIFT_TOKEN), shift_string);
-      }
-      if (EvalFilterString(after_shift_filter)) {
-        continue;
-      }
-
-      auto lambda = [&] (Args... args) { f(shift, args...); };  // NOLINT [readability/braces] [4]
-      TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
-          after_shift, after_shift_filter, oss);
-    }
-  }
-
-  template <typename... Args>
-  void TemplateHelper(std::function<void(arm::Condition, Args...)> f, int depth, bool without_pc,
-                      std::string fmt, std::string filter, std::ostringstream& oss) {
-    for (arm::Condition c : GetConditions()) {
-      std::string after_cond = fmt;
-      std::string after_cond_filter = filter;
-
-      size_t cond_index = after_cond.find(COND_TOKEN);
-      if (cond_index != std::string::npos) {
-        after_cond.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
-      }
-
-      cond_index = after_cond_filter.find(COND_TOKEN);
-      if (cond_index != std::string::npos) {
-        after_cond_filter.replace(cond_index, ConstexprStrLen(COND_TOKEN), GetConditionString(c));
-      }
-      if (EvalFilterString(after_cond_filter)) {
-        continue;
-      }
-
-      auto lambda = [&] (Args... args) { f(c, args...); };  // NOLINT [readability/braces] [4]
-      TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
-          after_cond, after_cond_filter, oss);
-    }
-  }
-
-  template <typename... Args>
-  void TemplateHelper(std::function<void(arm::SetCc, Args...)> f, int depth, bool without_pc,
-                      std::string fmt, std::string filter, std::ostringstream& oss) {
-    for (arm::SetCc s : GetSetCcs()) {
-      std::string after_cond = fmt;
-      std::string after_cond_filter = filter;
-
-      size_t cond_index = after_cond.find(SET_CC_TOKEN);
-      if (cond_index != std::string::npos) {
-        after_cond.replace(cond_index, ConstexprStrLen(SET_CC_TOKEN), GetSetCcString(s));
-      }
-
-      cond_index = after_cond_filter.find(SET_CC_TOKEN);
-      if (cond_index != std::string::npos) {
-        after_cond_filter.replace(cond_index, ConstexprStrLen(SET_CC_TOKEN), GetSetCcString(s));
-      }
-      if (EvalFilterString(after_cond_filter)) {
-        continue;
-      }
-
-      auto lambda = [&] (Args... args) { f(s, args...); };  // NOLINT [readability/braces] [4]
-      TemplateHelper(std::function<void(Args...)>(lambda), depth, without_pc,
-          after_cond, after_cond_filter, oss);
-    }
-  }
-
-  template <typename Assembler, typename T1, typename T2>
-  std::function<void(T1, T2)> GetBoundFunction2(void (Assembler::*f)(T1, T2)) {
-    return std::bind(f, GetAssembler(), _1, _2);
-  }
-
-  template <typename Assembler, typename T1, typename T2, typename T3>
-  std::function<void(T1, T2, T3)> GetBoundFunction3(void (Assembler::*f)(T1, T2, T3)) {
-    return std::bind(f, GetAssembler(), _1, _2, _3);
-  }
-
-  template <typename Assembler, typename T1, typename T2, typename T3, typename T4>
-  std::function<void(T1, T2, T3, T4)> GetBoundFunction4(
-      void (Assembler::*f)(T1, T2, T3, T4)) {
-    return std::bind(f, GetAssembler(), _1, _2, _3, _4);
-  }
-
-  template <typename Assembler, typename T1, typename T2, typename T3, typename T4, typename T5>
-  std::function<void(T1, T2, T3, T4, T5)> GetBoundFunction5(
-      void (Assembler::*f)(T1, T2, T3, T4, T5)) {
-    return std::bind(f, GetAssembler(), _1, _2, _3, _4, _5);
-  }
-
-  template <typename... Args>
-  void GenericTemplateHelper(std::function<void(Args...)> f, bool without_pc,
-                             std::string fmt, std::string test_name, std::string filter) {
-    first_ = false;
-    WarnOnCombinations(CountHelper<Args...>(without_pc));
-
-    std::ostringstream oss;
-
-    TemplateHelper(f, 0, without_pc, fmt, filter, oss);
-
-    oss << "\n";  // Trailing newline.
-
-    DriverStr(oss.str(), test_name);
-  }
-
-  template <typename Assembler, typename... Args>
-  void T2Helper(void (Assembler::*f)(Args...), bool without_pc, std::string fmt,
-                std::string test_name, std::string filter = "") {
-    GenericTemplateHelper(GetBoundFunction2(f), without_pc, fmt, test_name, filter);
-  }
-
-  template <typename Assembler, typename... Args>
-  void T3Helper(void (Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name, std::string filter = "") {
-    GenericTemplateHelper(GetBoundFunction3(f), without_pc, fmt, test_name, filter);
-  }
-
-  template <typename Assembler, typename... Args>
-  void T4Helper(void (Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name, std::string filter = "") {
-    GenericTemplateHelper(GetBoundFunction4(f), without_pc, fmt, test_name, filter);
-  }
-
-  template <typename Assembler, typename... Args>
-  void T5Helper(void (Assembler::*f)(Args...), bool without_pc, std::string fmt,
-      std::string test_name, std::string filter = "") {
-    GenericTemplateHelper(GetBoundFunction5(f), without_pc, fmt, test_name, filter);
-  }
-
- private:
-  template <typename T>
-  size_t CountHelper(bool without_pc) {
-    size_t tmp;
-    if (std::is_same<T, arm::Register>::value) {
-      tmp = GetRegisters().size();
-      if (without_pc) {
-        tmp--;;  // Approximation...
-      }
-      return tmp;
-    } else if (std::is_same<T, const arm::ShifterOperand&>::value) {
-      return GetShiftOperands().size();
-    } else if (std::is_same<T, arm::Condition>::value) {
-      return GetConditions().size();
-    } else {
-      LOG(WARNING) << "Unknown type while counting.";
-      return 1;
-    }
-  }
-
-  template <typename T1, typename T2, typename... Args>
-  size_t CountHelper(bool without_pc) {
-    size_t tmp;
-    if (std::is_same<T1, arm::Register>::value) {
-      tmp = GetRegisters().size();
-      if (without_pc) {
-        tmp--;;  // Approximation...
-      }
-    } else if (std::is_same<T1, const arm::ShifterOperand&>::value) {
-      tmp =  GetShiftOperands().size();
-    } else if (std::is_same<T1, arm::Condition>::value) {
-      tmp = GetConditions().size();
-    } else {
-      LOG(WARNING) << "Unknown type while counting.";
-      tmp = 1;
-    }
-    size_t rec = CountHelper<T2, Args...>(without_pc);
-    return rec * tmp;
-  }
-
-  bool first_;
-
-  static constexpr const char* kArm32AssemblyHeader = ".arm\n";
-
-  std::vector<arm::Register*> registers_;
-  std::vector<arm::Condition> conditions_;
-  std::vector<arm::SetCc> set_ccs_;
-  std::vector<arm::ShifterOperand> shifter_operands_;
-};
-
-
-TEST_F(AssemblerArm32Test, Toolchain) {
-  EXPECT_TRUE(CheckTools());
-}
-
-TEST_F(AssemblerArm32Test, Sbfx) {
-  std::vector<std::pair<uint32_t, uint32_t>> immediates;
-  immediates.push_back({0, 1});
-  immediates.push_back({0, 8});
-  immediates.push_back({0, 15});
-  immediates.push_back({0, 16});
-  immediates.push_back({0, 31});
-  immediates.push_back({0, 32});
-
-  immediates.push_back({1, 1});
-  immediates.push_back({1, 15});
-  immediates.push_back({1, 31});
-
-  immediates.push_back({8, 1});
-  immediates.push_back({8, 15});
-  immediates.push_back({8, 16});
-  immediates.push_back({8, 24});
-
-  immediates.push_back({31, 1});
-
-  DriverStr(RepeatRRiiC(&arm::Arm32Assembler::sbfx, immediates,
-                        "sbfx{cond} {reg1}, {reg2}, #{imm1}, #{imm2}"), "sbfx");
-}
-
-TEST_F(AssemblerArm32Test, Ubfx) {
-  std::vector<std::pair<uint32_t, uint32_t>> immediates;
-  immediates.push_back({0, 1});
-  immediates.push_back({0, 8});
-  immediates.push_back({0, 15});
-  immediates.push_back({0, 16});
-  immediates.push_back({0, 31});
-  immediates.push_back({0, 32});
-
-  immediates.push_back({1, 1});
-  immediates.push_back({1, 15});
-  immediates.push_back({1, 31});
-
-  immediates.push_back({8, 1});
-  immediates.push_back({8, 15});
-  immediates.push_back({8, 16});
-  immediates.push_back({8, 24});
-
-  immediates.push_back({31, 1});
-
-  DriverStr(RepeatRRiiC(&arm::Arm32Assembler::ubfx, immediates,
-                        "ubfx{cond} {reg1}, {reg2}, #{imm1}, #{imm2}"), "ubfx");
-}
-
-TEST_F(AssemblerArm32Test, Mul) {
-  T4Helper(&arm::Arm32Assembler::mul, true, "mul{cond} {reg1}, {reg2}, {reg3}", "mul");
-}
-
-TEST_F(AssemblerArm32Test, Mla) {
-  T5Helper(&arm::Arm32Assembler::mla, true, "mla{cond} {reg1}, {reg2}, {reg3}, {reg4}", "mla");
-}
-
-TEST_F(AssemblerArm32Test, Umull) {
-  T5Helper(&arm::Arm32Assembler::umull, true, "umull{cond} {reg1}, {reg2}, {reg3}, {reg4}",
-           "umull", "{reg1}={reg2}");  // Skip the cases where reg1 == reg2.
-}
-
-TEST_F(AssemblerArm32Test, Smull) {
-  T5Helper(&arm::Arm32Assembler::smull, true, "smull{cond} {reg1}, {reg2}, {reg3}, {reg4}",
-           "smull", "{reg1}={reg2}");  // Skip the cases where reg1 == reg2.
-}
-
-TEST_F(AssemblerArm32Test, Sdiv) {
-  T4Helper(&arm::Arm32Assembler::sdiv, true, "sdiv{cond} {reg1}, {reg2}, {reg3}", "sdiv");
-}
-
-TEST_F(AssemblerArm32Test, Udiv) {
-  T4Helper(&arm::Arm32Assembler::udiv, true, "udiv{cond} {reg1}, {reg2}, {reg3}", "udiv");
-}
-
-TEST_F(AssemblerArm32Test, And) {
-  T5Helper(&arm::Arm32Assembler::and_, true, "and{cond}{s} {reg1}, {reg2}, {shift}", "and");
-}
-
-TEST_F(AssemblerArm32Test, Ands) {
-  T4Helper(&arm::Arm32Assembler::ands, true, "and{cond}s {reg1}, {reg2}, {shift}", "ands");
-}
-
-TEST_F(AssemblerArm32Test, Eor) {
-  T5Helper(&arm::Arm32Assembler::eor, true, "eor{cond}{s} {reg1}, {reg2}, {shift}", "eor");
-}
-
-TEST_F(AssemblerArm32Test, Eors) {
-  T4Helper(&arm::Arm32Assembler::eors, true, "eor{cond}s {reg1}, {reg2}, {shift}", "eors");
-}
-
-TEST_F(AssemblerArm32Test, Orr) {
-  T5Helper(&arm::Arm32Assembler::orr, true, "orr{cond}{s} {reg1}, {reg2}, {shift}", "orr");
-}
-
-TEST_F(AssemblerArm32Test, Orrs) {
-  T4Helper(&arm::Arm32Assembler::orrs, true, "orr{cond}s {reg1}, {reg2}, {shift}", "orrs");
-}
-
-TEST_F(AssemblerArm32Test, Bic) {
-  T5Helper(&arm::Arm32Assembler::bic, true, "bic{cond}{s} {reg1}, {reg2}, {shift}", "bic");
-}
-
-TEST_F(AssemblerArm32Test, Bics) {
-  T4Helper(&arm::Arm32Assembler::bics, true, "bic{cond}s {reg1}, {reg2}, {shift}", "bics");
-}
-
-TEST_F(AssemblerArm32Test, Mov) {
-  T4Helper(&arm::Arm32Assembler::mov, true, "mov{cond}{s} {reg1}, {shift}", "mov");
-}
-
-TEST_F(AssemblerArm32Test, Movs) {
-  T3Helper(&arm::Arm32Assembler::movs, true, "mov{cond}s {reg1}, {shift}", "movs");
-}
-
-TEST_F(AssemblerArm32Test, Mvn) {
-  T4Helper(&arm::Arm32Assembler::mvn, true, "mvn{cond}{s} {reg1}, {shift}", "mvn");
-}
-
-TEST_F(AssemblerArm32Test, Mvns) {
-  T3Helper(&arm::Arm32Assembler::mvns, true, "mvn{cond}s {reg1}, {shift}", "mvns");
-}
-
-TEST_F(AssemblerArm32Test, Add) {
-  T5Helper(&arm::Arm32Assembler::add, false, "add{cond}{s} {reg1}, {reg2}, {shift}", "add");
-}
-
-TEST_F(AssemblerArm32Test, Adds) {
-  T4Helper(&arm::Arm32Assembler::adds, false, "add{cond}s {reg1}, {reg2}, {shift}", "adds");
-}
-
-TEST_F(AssemblerArm32Test, Adc) {
-  T5Helper(&arm::Arm32Assembler::adc, false, "adc{cond}{s} {reg1}, {reg2}, {shift}", "adc");
-}
-
-TEST_F(AssemblerArm32Test, Adcs) {
-  T4Helper(&arm::Arm32Assembler::adcs, false, "adc{cond}s {reg1}, {reg2}, {shift}", "adcs");
-}
-
-TEST_F(AssemblerArm32Test, Sub) {
-  T5Helper(&arm::Arm32Assembler::sub, false, "sub{cond}{s} {reg1}, {reg2}, {shift}", "sub");
-}
-
-TEST_F(AssemblerArm32Test, Subs) {
-  T4Helper(&arm::Arm32Assembler::subs, false, "sub{cond}s {reg1}, {reg2}, {shift}", "subs");
-}
-
-TEST_F(AssemblerArm32Test, Sbc) {
-  T5Helper(&arm::Arm32Assembler::sbc, false, "sbc{cond}{s} {reg1}, {reg2}, {shift}", "sbc");
-}
-
-TEST_F(AssemblerArm32Test, Sbcs) {
-  T4Helper(&arm::Arm32Assembler::sbcs, false, "sbc{cond}s {reg1}, {reg2}, {shift}", "sbcs");
-}
-
-TEST_F(AssemblerArm32Test, Rsb) {
-  T5Helper(&arm::Arm32Assembler::rsb, true, "rsb{cond}{s} {reg1}, {reg2}, {shift}", "rsb");
-}
-
-TEST_F(AssemblerArm32Test, Rsbs) {
-  T4Helper(&arm::Arm32Assembler::rsbs, true, "rsb{cond}s {reg1}, {reg2}, {shift}", "rsbs");
-}
-
-TEST_F(AssemblerArm32Test, Rsc) {
-  T5Helper(&arm::Arm32Assembler::rsc, true, "rsc{cond}{s} {reg1}, {reg2}, {shift}", "rsc");
-}
-
-TEST_F(AssemblerArm32Test, Rscs) {
-  T4Helper(&arm::Arm32Assembler::rscs, false, "rsc{cond}s {reg1}, {reg2}, {shift}", "rscs");
-}
-
-/* TODO: Need better filter support.
-TEST_F(AssemblerArm32Test, Strex) {
-  T4Helper(&arm::Arm32Assembler::strex, "strex{cond} {reg1}, {reg2}, [{reg3}]", "strex",
-           "{reg1}={reg2}||{reg1}={reg3}");  // Skip the cases where reg1 == reg2 || reg1 == reg3.
-}
-*/
-
-TEST_F(AssemblerArm32Test, Clz) {
-  T3Helper(&arm::Arm32Assembler::clz, true, "clz{cond} {reg1}, {reg2}", "clz");
-}
-
-TEST_F(AssemblerArm32Test, Tst) {
-  T3Helper(&arm::Arm32Assembler::tst, true, "tst{cond} {reg1}, {shift}", "tst");
-}
-
-TEST_F(AssemblerArm32Test, Teq) {
-  T3Helper(&arm::Arm32Assembler::teq, true, "teq{cond} {reg1}, {shift}", "teq");
-}
-
-TEST_F(AssemblerArm32Test, Cmp) {
-  T3Helper(&arm::Arm32Assembler::cmp, true, "cmp{cond} {reg1}, {shift}", "cmp");
-}
-
-TEST_F(AssemblerArm32Test, Cmn) {
-  T3Helper(&arm::Arm32Assembler::cmn, true, "cmn{cond} {reg1}, {shift}", "cmn");
-}
-
-TEST_F(AssemblerArm32Test, Blx) {
-  T2Helper(&arm::Arm32Assembler::blx, true, "blx{cond} {reg1}", "blx");
-}
-
-TEST_F(AssemblerArm32Test, Bx) {
-  T2Helper(&arm::Arm32Assembler::bx, true, "bx{cond} {reg1}", "bx");
-}
-
-TEST_F(AssemblerArm32Test, Vmstat) {
-  GetAssembler()->vmstat();
-
-  const char* expected = "vmrs APSR_nzcv, FPSCR\n";
-
-  DriverStr(expected, "vmrs");
-}
-
-TEST_F(AssemblerArm32Test, ldrexd) {
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R0);
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R1);
-  GetAssembler()->ldrexd(arm::R0, arm::R1, arm::R2);
-
-  const char* expected =
-      "ldrexd r0, r1, [r0]\n"
-      "ldrexd r0, r1, [r1]\n"
-      "ldrexd r0, r1, [r2]\n";
-  DriverStr(expected, "ldrexd");
-}
-
-TEST_F(AssemblerArm32Test, strexd) {
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R0);
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R1);
-  GetAssembler()->strexd(arm::R9, arm::R0, arm::R1, arm::R2);
-
-  const char* expected =
-      "strexd r9, r0, r1, [r0]\n"
-      "strexd r9, r0, r1, [r1]\n"
-      "strexd r9, r0, r1, [r2]\n";
-  DriverStr(expected, "strexd");
-}
-
-TEST_F(AssemblerArm32Test, rbit) {
-  T3Helper(&arm::Arm32Assembler::rbit, true, "rbit{cond} {reg1}, {reg2}", "rbit");
-}
-
-TEST_F(AssemblerArm32Test, rev) {
-  T3Helper(&arm::Arm32Assembler::rev, true, "rev{cond} {reg1}, {reg2}", "rev");
-}
-
-TEST_F(AssemblerArm32Test, rev16) {
-  T3Helper(&arm::Arm32Assembler::rev16, true, "rev16{cond} {reg1}, {reg2}", "rev16");
-}
-
-TEST_F(AssemblerArm32Test, revsh) {
-  T3Helper(&arm::Arm32Assembler::revsh, true, "revsh{cond} {reg1}, {reg2}", "revsh");
-}
-
-TEST_F(AssemblerArm32Test, vcnt) {
-  // Different D register numbers are used here, to test register encoding.
-  // Source register number is encoded as M:Vm, destination register number is encoded as D:Vd,
-  // For source and destination registers which use D0..D15, the M bit and D bit should be 0.
-  // For source and destination registers which use D16..D32, the M bit and D bit should be 1.
-  GetAssembler()->vcntd(arm::D0, arm::D1);
-  GetAssembler()->vcntd(arm::D19, arm::D20);
-  GetAssembler()->vcntd(arm::D0, arm::D9);
-  GetAssembler()->vcntd(arm::D16, arm::D20);
-
-  std::string expected =
-      "vcnt.8 d0, d1\n"
-      "vcnt.8 d19, d20\n"
-      "vcnt.8 d0, d9\n"
-      "vcnt.8 d16, d20\n";
-
-  DriverStr(expected, "vcnt");
-}
-
-TEST_F(AssemblerArm32Test, vpaddl) {
-  // Different D register numbers are used here, to test register encoding.
-  // Source register number is encoded as M:Vm, destination register number is encoded as D:Vd,
-  // For source and destination registers which use D0..D15, the M bit and D bit should be 0.
-  // For source and destination registers which use D16..D32, the M bit and D bit should be 1.
-  // Different data types (signed and unsigned) are also tested.
-  GetAssembler()->vpaddld(arm::D0, arm::D0, 8, true);
-  GetAssembler()->vpaddld(arm::D20, arm::D20, 8, false);
-  GetAssembler()->vpaddld(arm::D0, arm::D20, 16, false);
-  GetAssembler()->vpaddld(arm::D20, arm::D0, 32, true);
-
-  std::string expected =
-      "vpaddl.u8 d0, d0\n"
-      "vpaddl.s8 d20, d20\n"
-      "vpaddl.s16 d0, d20\n"
-      "vpaddl.u32 d20, d0\n";
-
-  DriverStr(expected, "vpaddl");
-}
-
-}  // namespace art
diff --git a/compiler/utils/arm/assembler_arm_shared.h b/compiler/utils/arm/assembler_arm_shared.h
new file mode 100644
index 0000000..21f13ee
--- /dev/null
+++ b/compiler/utils/arm/assembler_arm_shared.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_SHARED_H_
+#define ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_SHARED_H_
+
+namespace art {
+namespace arm {
+
+enum LoadOperandType {
+  kLoadSignedByte,
+  kLoadUnsignedByte,
+  kLoadSignedHalfword,
+  kLoadUnsignedHalfword,
+  kLoadWord,
+  kLoadWordPair,
+  kLoadSWord,
+  kLoadDWord
+};
+
+enum StoreOperandType {
+  kStoreByte,
+  kStoreHalfword,
+  kStoreWord,
+  kStoreWordPair,
+  kStoreSWord,
+  kStoreDWord
+};
+
+// Set condition codes request.
+enum SetCc {
+  kCcDontCare,  // Allows prioritizing 16-bit instructions on Thumb2 whether they set CCs or not.
+  kCcSet,
+  kCcKeep,
+};
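+// For example, on Thumb2 the flag-setting "adds r0, r0, r1" has a 16-bit encoding while the
+// flag-preserving "add.w r0, r0, r1" needs 32 bits; kCcDontCare lets the assembler pick the
+// shorter form, whereas kCcSet and kCcKeep force the requested flag behavior.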
+
+}  // namespace arm
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_SHARED_H_
diff --git a/compiler/utils/arm/assembler_arm_vixl.cc b/compiler/utils/arm/assembler_arm_vixl.cc
new file mode 100644
index 0000000..3c5973e
--- /dev/null
+++ b/compiler/utils/arm/assembler_arm_vixl.cc
@@ -0,0 +1,382 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <type_traits>
+
+#include "assembler_arm_vixl.h"
+#include "entrypoints/quick/quick_entrypoints.h"
+#include "thread.h"
+
+using namespace vixl::aarch32;  // NOLINT(build/namespaces)
+
+namespace art {
+namespace arm {
+
+#ifdef ___
+#error "ARM Assembler macro already defined."
+#else
+#define ___   vixl_masm_.
+#endif
+
+extern const vixl32::Register tr(TR);
+
+void ArmVIXLAssembler::FinalizeCode() {
+  vixl_masm_.FinalizeCode();
+}
+
+size_t ArmVIXLAssembler::CodeSize() const {
+  return vixl_masm_.GetSizeOfCodeGenerated();
+}
+
+const uint8_t* ArmVIXLAssembler::CodeBufferBaseAddress() const {
+  return vixl_masm_.GetStartAddress<uint8_t*>();
+}
+
+void ArmVIXLAssembler::FinalizeInstructions(const MemoryRegion& region) {
+  // Copy the instructions from the buffer.
+  MemoryRegion from(vixl_masm_.GetStartAddress<void*>(), CodeSize());
+  region.CopyFrom(0, from);
+}
+
+void ArmVIXLAssembler::PoisonHeapReference(vixl::aarch32::Register reg) {
+  // reg = -reg.
+  ___ Rsb(reg, reg, 0);
+}
+
+void ArmVIXLAssembler::UnpoisonHeapReference(vixl::aarch32::Register reg) {
+  // reg = -reg.
+  ___ Rsb(reg, reg, 0);
+}
+
+void ArmVIXLAssembler::MaybeUnpoisonHeapReference(vixl32::Register reg) {
+  if (kPoisonHeapReferences) {
+    UnpoisonHeapReference(reg);
+  }
+}
+
+void ArmVIXLAssembler::LoadImmediate(vixl32::Register rd, int32_t value) {
+  // TODO(VIXL): Implement this optimization in VIXL.
+  if (!ShifterOperandCanAlwaysHold(value) && ShifterOperandCanAlwaysHold(~value)) {
+    ___ Mvn(rd, ~value);
+  } else {
+    ___ Mov(rd, value);
+  }
+}
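+// Illustrative case for the Mvn path above: 0xfffffffb (-5) is not a valid modified immediate,
+// but its complement 0x00000004 is, so a single "mvn rd, #4" is emitted instead of a longer
+// literal-load sequence.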
+
+bool ArmVIXLAssembler::ShifterOperandCanAlwaysHold(uint32_t immediate) {
+  return vixl_masm_.IsModifiedImmediate(immediate);
+}
+
+bool ArmVIXLAssembler::ShifterOperandCanHold(Opcode opcode, uint32_t immediate, SetCc set_cc) {
+  switch (opcode) {
+    case ADD:
+    case SUB:
+      // Less than (or equal to) 12 bits can be done if we don't need to set condition codes.
+      if (IsUint<12>(immediate) && set_cc != kCcSet) {
+        return true;
+      }
+      return ShifterOperandCanAlwaysHold(immediate);
+
+    case MOV:
+      // TODO: Support immediates of up to 12 bits, as for ADD/SUB above.
+      return ShifterOperandCanAlwaysHold(immediate);
+
+    case MVN:
+    default:
+      return ShifterOperandCanAlwaysHold(immediate);
+  }
+}
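+// Example for the ADD/SUB case above: #4095 is not a valid modified immediate, but it fits the
+// Thumb2 ADDW/SUBW imm12 encoding, so "add rd, rn, #4095" is accepted as long as the caller
+// does not require the flags to be set (set_cc != kCcSet).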
+
+bool ArmVIXLAssembler::CanSplitLoadStoreOffset(int32_t allowed_offset_bits,
+                                               int32_t offset,
+                                               /*out*/ int32_t* add_to_base,
+                                               /*out*/ int32_t* offset_for_load_store) {
+  int32_t other_bits = offset & ~allowed_offset_bits;
+  if (ShifterOperandCanAlwaysHold(other_bits) || ShifterOperandCanAlwaysHold(-other_bits)) {
+    *add_to_base = other_bits;
+    *offset_for_load_store = offset & allowed_offset_bits;
+    return true;
+  }
+  return false;
+}
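+// Worked example: with allowed_offset_bits == 0xfff (plain LDR/STR) and offset == 0x1004,
+// other_bits is 0x1000, which is a valid modified immediate, so the caller can emit
+// "add temp, base, #0x1000" and use #0x4 as the remaining load/store offset.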
+
+int32_t ArmVIXLAssembler::AdjustLoadStoreOffset(int32_t allowed_offset_bits,
+                                                vixl32::Register temp,
+                                                vixl32::Register base,
+                                                int32_t offset) {
+  DCHECK_NE(offset & ~allowed_offset_bits, 0);
+  int32_t add_to_base, offset_for_load;
+  if (CanSplitLoadStoreOffset(allowed_offset_bits, offset, &add_to_base, &offset_for_load)) {
+    ___ Add(temp, base, add_to_base);
+    return offset_for_load;
+  } else {
+    ___ Mov(temp, offset);
+    ___ Add(temp, temp, base);
+    return 0;
+  }
+}
+
+// TODO(VIXL): Implement this in VIXL.
+int32_t ArmVIXLAssembler::GetAllowedLoadOffsetBits(LoadOperandType type) {
+  switch (type) {
+    case kLoadSignedByte:
+    case kLoadSignedHalfword:
+    case kLoadUnsignedHalfword:
+    case kLoadUnsignedByte:
+    case kLoadWord:
+      // We can encode imm12 offset.
+      return 0xfff;
+    case kLoadSWord:
+    case kLoadDWord:
+    case kLoadWordPair:
+      // We can encode imm8:'00' offset.
+      return 0xff << 2;
+    default:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
+  }
+}
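+// The 0xff << 2 mask reflects the VFP/LDRD addressing mode: the offset is an 8-bit immediate
+// scaled by 4, so only multiples of 4 up to 1020 can be encoded directly.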
+
+// TODO(VIXL): Implement this in VIXL.
+int32_t ArmVIXLAssembler::GetAllowedStoreOffsetBits(StoreOperandType type) {
+  switch (type) {
+    case kStoreHalfword:
+    case kStoreByte:
+    case kStoreWord:
+      // We can encode imm12 offset.
+      return 0xfff;
+    case kStoreSWord:
+    case kStoreDWord:
+    case kStoreWordPair:
+      // We can encode imm8:'00' offset.
+      return 0xff << 2;
+    default:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
+  }
+}
+
+// TODO(VIXL): Implement this in VIXL.
+static bool CanHoldLoadOffsetThumb(LoadOperandType type, int offset) {
+  switch (type) {
+    case kLoadSignedByte:
+    case kLoadSignedHalfword:
+    case kLoadUnsignedHalfword:
+    case kLoadUnsignedByte:
+    case kLoadWord:
+      return IsAbsoluteUint<12>(offset);
+    case kLoadSWord:
+    case kLoadDWord:
+      return IsAbsoluteUint<10>(offset) && IsAligned<4>(offset);  // VFP addressing mode.
+    case kLoadWordPair:
+      return IsAbsoluteUint<10>(offset) && IsAligned<4>(offset);
+    default:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
+  }
+}
+
+// TODO(VIXL): Implement this in VIXL.
+static bool CanHoldStoreOffsetThumb(StoreOperandType type, int offset) {
+  switch (type) {
+    case kStoreHalfword:
+    case kStoreByte:
+    case kStoreWord:
+      return IsAbsoluteUint<12>(offset);
+    case kStoreSWord:
+    case kStoreDWord:
+      return IsAbsoluteUint<10>(offset) && IsAligned<4>(offset);  // VFP addressing mode.
+    case kStoreWordPair:
+      return IsAbsoluteUint<10>(offset) && IsAligned<4>(offset);
+    default:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
+  }
+}
+
+// Implementation note: this method must emit at most one instruction when the offset can be
+// encoded directly, i.e. when CanHoldStoreOffsetThumb(type, offset) holds.
+// TODO(VIXL): Implement AdjustLoadStoreOffset logic in VIXL.
+void ArmVIXLAssembler::StoreToOffset(StoreOperandType type,
+                                     vixl32::Register reg,
+                                     vixl32::Register base,
+                                     int32_t offset) {
+  vixl32::Register tmp_reg;
+  UseScratchRegisterScope temps(&vixl_masm_);
+
+  if (!CanHoldStoreOffsetThumb(type, offset)) {
+    CHECK_NE(base.GetCode(), kIpCode);
+    if ((reg.GetCode() != kIpCode) &&
+        ((type != kStoreWordPair) || (reg.GetCode() + 1 != kIpCode))) {
+      tmp_reg = temps.Acquire();
+    } else {
+      // Be careful not to use ip twice (for `reg` (or `reg` + 1 in
+      // the case of a word-pair store) and `base`) to build the
+      // Address object used by the store instruction(s) below.
+      // Instead, save R5 on the stack (or R6 if R5 is already used by
+      // `base`), use it as secondary temporary register, and restore
+      // it after the store instruction has been emitted.
+      tmp_reg = (base.GetCode() != 5) ? r5 : r6;
+      ___ Push(tmp_reg);
+      if (base.GetCode() == kSpCode) {
+        offset += kRegisterSize;
+      }
+    }
+    // TODO: Implement indexed store (not available for STRD), inline AdjustLoadStoreOffset()
+    // and in the "unsplittable" path get rid of the "add" by using the store indexed instead.
+    offset = AdjustLoadStoreOffset(GetAllowedStoreOffsetBits(type), tmp_reg, base, offset);
+    base = tmp_reg;
+  }
+  DCHECK(CanHoldStoreOffsetThumb(type, offset));
+  switch (type) {
+    case kStoreByte:
+      ___ Strb(reg, MemOperand(base, offset));
+      break;
+    case kStoreHalfword:
+      ___ Strh(reg, MemOperand(base, offset));
+      break;
+    case kStoreWord:
+      ___ Str(reg, MemOperand(base, offset));
+      break;
+    case kStoreWordPair:
+      ___ Strd(reg, vixl32::Register(reg.GetCode() + 1), MemOperand(base, offset));
+      break;
+    default:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
+  }
+  if ((tmp_reg.IsValid()) && (tmp_reg.GetCode() != kIpCode)) {
+    CHECK(tmp_reg.Is(r5) || tmp_reg.Is(r6)) << tmp_reg;
+    ___ Pop(tmp_reg);
+  }
+}
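+// For instance, StoreToOffset(kStoreWord, r0, sp, 0x2000) cannot encode the offset directly
+// (imm12 tops out at 4095), so the code above first materializes sp + 0x2000 in a scratch
+// register and then emits a single "str r0, [scratch]".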
+
+// Implementation note: this method must emit at most one instruction when the offset can be
+// encoded directly, i.e. when CanHoldLoadOffsetThumb(type, offset) holds.
+// TODO(VIXL): Implement AdjustLoadStoreOffset logic in VIXL.
+void ArmVIXLAssembler::LoadFromOffset(LoadOperandType type,
+                                      vixl32::Register dest,
+                                      vixl32::Register base,
+                                      int32_t offset) {
+  if (!CanHoldLoadOffsetThumb(type, offset)) {
+    CHECK(!base.Is(ip));
+    // Inlined AdjustLoadStoreOffset() allows us to pull a few more tricks.
+    int32_t allowed_offset_bits = GetAllowedLoadOffsetBits(type);
+    DCHECK_NE(offset & ~allowed_offset_bits, 0);
+    int32_t add_to_base, offset_for_load;
+    if (CanSplitLoadStoreOffset(allowed_offset_bits, offset, &add_to_base, &offset_for_load)) {
+      // Use reg for the adjusted base. If it's low reg, we may end up using 16-bit load.
+      AddConstant(dest, base, add_to_base);
+      base = dest;
+      offset = offset_for_load;
+    } else {
+      UseScratchRegisterScope temps(&vixl_masm_);
+      vixl32::Register temp = (dest.Is(base)) ? temps.Acquire() : dest;
+      LoadImmediate(temp, offset);
+      // TODO: Implement indexed load (not available for LDRD) and use it here to avoid the ADD.
+      // Use reg for the adjusted base. If it's low reg, we may end up using 16-bit load.
+      ___ Add(dest, dest, (dest.Is(base)) ? temp : base);
+      base = dest;
+      offset = 0;
+    }
+  }
+
+  DCHECK(CanHoldLoadOffsetThumb(type, offset));
+  switch (type) {
+    case kLoadSignedByte:
+      ___ Ldrsb(dest, MemOperand(base, offset));
+      break;
+    case kLoadUnsignedByte:
+      ___ Ldrb(dest, MemOperand(base, offset));
+      break;
+    case kLoadSignedHalfword:
+      ___ Ldrsh(dest, MemOperand(base, offset));
+      break;
+    case kLoadUnsignedHalfword:
+      ___ Ldrh(dest, MemOperand(base, offset));
+      break;
+    case kLoadWord:
+      CHECK(!dest.IsSP());
+      ___ Ldr(dest, MemOperand(base, offset));
+      break;
+    case kLoadWordPair:
+      ___ Ldrd(dest, vixl32::Register(dest.GetCode() + 1), MemOperand(base, offset));
+      break;
+    default:
+      LOG(FATAL) << "UNREACHABLE";
+      UNREACHABLE();
+  }
+}
+
+void ArmVIXLAssembler::StoreSToOffset(vixl32::SRegister source,
+                                      vixl32::Register base,
+                                      int32_t offset) {
+  ___ Vstr(source, MemOperand(base, offset));
+}
+
+void ArmVIXLAssembler::StoreDToOffset(vixl32::DRegister source,
+                                      vixl32::Register base,
+                                      int32_t offset) {
+  ___ Vstr(source, MemOperand(base, offset));
+}
+
+void ArmVIXLAssembler::LoadSFromOffset(vixl32::SRegister reg,
+                                       vixl32::Register base,
+                                       int32_t offset) {
+  ___ Vldr(reg, MemOperand(base, offset));
+}
+
+void ArmVIXLAssembler::LoadDFromOffset(vixl32::DRegister reg,
+                                       vixl32::Register base,
+                                       int32_t offset) {
+  ___ Vldr(reg, MemOperand(base, offset));
+}
+
+void ArmVIXLAssembler::AddConstant(vixl32::Register rd, int32_t value) {
+  AddConstant(rd, rd, value);
+}
+
+// TODO(VIXL): think about using ADDS (which updates the flags) where possible.
+void ArmVIXLAssembler::AddConstant(vixl32::Register rd,
+                                   vixl32::Register rn,
+                                   int32_t value) {
+  DCHECK(vixl_masm_.OutsideITBlock());
+  // TODO(VIXL): implement this optimization in VIXL.
+  if (value == 0) {
+    if (!rd.Is(rn)) {
+      ___ Mov(rd, rn);
+    }
+    return;
+  }
+  ___ Add(rd, rn, value);
+}
+
+// Inside IT block we must use assembler, macroassembler instructions are not permitted.
+void ArmVIXLAssembler::AddConstantInIt(vixl32::Register rd,
+                                       vixl32::Register rn,
+                                       int32_t value,
+                                       vixl32::Condition cond) {
+  DCHECK(vixl_masm_.InITBlock());
+  if (value == 0) {
+    ___ mov(cond, rd, rn);
+  } else {
+    ___ add(cond, rd, rn, value);
+  }
+}
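+// Typical use: inside an IT block the caller must emit exactly the predicated instructions the
+// block declares, and the macro assembler may not be used there, so AddConstantInIt(rd, rn, imm,
+// eq) maps directly to a raw "addeq"/"moveq" rather than a possibly multi-instruction expansion.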
+
+}  // namespace arm
+}  // namespace art
diff --git a/compiler/utils/arm/assembler_arm_vixl.h b/compiler/utils/arm/assembler_arm_vixl.h
new file mode 100644
index 0000000..c8f3a9b
--- /dev/null
+++ b/compiler/utils/arm/assembler_arm_vixl.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_VIXL_H_
+#define ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_VIXL_H_
+
+#include "base/arena_containers.h"
+#include "base/logging.h"
+#include "constants_arm.h"
+#include "offsets.h"
+#include "utils/arm/assembler_arm_shared.h"
+#include "utils/arm/managed_register_arm.h"
+#include "utils/assembler.h"
+#include "utils/jni_macro_assembler.h"
+
+// TODO(VIXL): Make VIXL compile with -Wshadow and remove pragmas.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wshadow"
+#include "aarch32/macro-assembler-aarch32.h"
+#pragma GCC diagnostic pop
+
+namespace vixl32 = vixl::aarch32;
+
+namespace art {
+namespace arm {
+
+class ArmVIXLAssembler FINAL : public Assembler {
+ private:
+  class ArmException;
+ public:
+  explicit ArmVIXLAssembler(ArenaAllocator* arena)
+      : Assembler(arena) {
+    // Use Thumb2 instruction set.
+    vixl_masm_.UseT32();
+  }
+
+  virtual ~ArmVIXLAssembler() {}
+  vixl32::MacroAssembler* GetVIXLAssembler() { return &vixl_masm_; }
+  void FinalizeCode() OVERRIDE;
+
+  // Size of generated code.
+  size_t CodeSize() const OVERRIDE;
+  const uint8_t* CodeBufferBaseAddress() const OVERRIDE;
+
+  // Copy instructions out of assembly buffer into the given region of memory.
+  void FinalizeInstructions(const MemoryRegion& region) OVERRIDE;
+
+  void Bind(Label* label ATTRIBUTE_UNUSED) OVERRIDE {
+    UNIMPLEMENTED(FATAL) << "Do not use Bind for ARM";
+  }
+  void Jump(Label* label ATTRIBUTE_UNUSED) OVERRIDE {
+    UNIMPLEMENTED(FATAL) << "Do not use Jump for ARM";
+  }
+
+  //
+  // Heap poisoning.
+  //
+  // Poison a heap reference contained in `reg`.
+  void PoisonHeapReference(vixl32::Register reg);
+  // Unpoison a heap reference contained in `reg`.
+  void UnpoisonHeapReference(vixl32::Register reg);
+  // Unpoison a heap reference contained in `reg` if heap poisoning is enabled.
+  void MaybeUnpoisonHeapReference(vixl32::Register reg);
+
+  void StoreToOffset(StoreOperandType type,
+                     vixl32::Register reg,
+                     vixl32::Register base,
+                     int32_t offset);
+  void StoreSToOffset(vixl32::SRegister source, vixl32::Register base, int32_t offset);
+  void StoreDToOffset(vixl32::DRegister source, vixl32::Register base, int32_t offset);
+
+  void LoadImmediate(vixl32::Register dest, int32_t value);
+  void LoadFromOffset(LoadOperandType type,
+                      vixl32::Register reg,
+                      vixl32::Register base,
+                      int32_t offset);
+  void LoadSFromOffset(vixl32::SRegister reg, vixl32::Register base, int32_t offset);
+  void LoadDFromOffset(vixl32::DRegister reg, vixl32::Register base, int32_t offset);
+
+  bool ShifterOperandCanAlwaysHold(uint32_t immediate);
+  bool ShifterOperandCanHold(Opcode opcode, uint32_t immediate, SetCc set_cc);
+  bool CanSplitLoadStoreOffset(int32_t allowed_offset_bits,
+                               int32_t offset,
+                               /*out*/ int32_t* add_to_base,
+                               /*out*/ int32_t* offset_for_load_store);
+  int32_t AdjustLoadStoreOffset(int32_t allowed_offset_bits,
+                                vixl32::Register temp,
+                                vixl32::Register base,
+                                int32_t offset);
+  int32_t GetAllowedLoadOffsetBits(LoadOperandType type);
+  int32_t GetAllowedStoreOffsetBits(StoreOperandType type);
+
+  void AddConstant(vixl32::Register rd, int32_t value);
+  void AddConstant(vixl32::Register rd, vixl32::Register rn, int32_t value);
+  void AddConstantInIt(vixl32::Register rd,
+                       vixl32::Register rn,
+                       int32_t value,
+                       vixl32::Condition cond = vixl32::al);
+
+ private:
+  // VIXL assembler.
+  vixl32::MacroAssembler vixl_masm_;
+};
+
+// Thread register declaration.
+extern const vixl32::Register tr;
+
+}  // namespace arm
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_ARM_ASSEMBLER_ARM_VIXL_H_
diff --git a/compiler/utils/arm/jni_macro_assembler_arm.cc b/compiler/utils/arm/jni_macro_assembler_arm.cc
index af5ebb4..e0bfa12 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm.cc
@@ -18,7 +18,6 @@
 
 #include <algorithm>
 
-#include "assembler_arm32.h"
 #include "assembler_thumb2.h"
 #include "base/arena_allocator.h"
 #include "base/bit_utils.h"
@@ -47,9 +46,6 @@
 ArmJNIMacroAssembler::ArmJNIMacroAssembler(ArenaAllocator* arena, InstructionSet isa) {
   switch (isa) {
     case kArm:
-      asm_.reset(new (arena) Arm32Assembler(arena));
-      break;
-
     case kThumb2:
       asm_.reset(new (arena) Thumb2Assembler(arena));
       break;
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
new file mode 100644
index 0000000..719fe7f
--- /dev/null
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -0,0 +1,599 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+#include <type_traits>
+
+#include "jni_macro_assembler_arm_vixl.h"
+#include "entrypoints/quick/quick_entrypoints.h"
+#include "thread.h"
+
+using namespace vixl::aarch32;  // NOLINT(build/namespaces)
+namespace vixl32 = vixl::aarch32;
+
+namespace art {
+namespace arm {
+
+#ifdef ___
+#error "ARM Assembler macro already defined."
+#else
+#define ___   asm_.GetVIXLAssembler()->
+#endif
+
+void ArmVIXLJNIMacroAssembler::FinalizeCode() {
+  for (const std::unique_ptr<
+      ArmVIXLJNIMacroAssembler::ArmException>& exception : exception_blocks_) {
+    EmitExceptionPoll(exception.get());
+  }
+  asm_.FinalizeCode();
+}
+
+static dwarf::Reg DWARFReg(vixl32::Register reg) {
+  return dwarf::Reg::ArmCore(static_cast<int>(reg.GetCode()));
+}
+
+static dwarf::Reg DWARFReg(vixl32::SRegister reg) {
+  return dwarf::Reg::ArmFp(static_cast<int>(reg.GetCode()));
+}
+
+static constexpr size_t kFramePointerSize = static_cast<size_t>(kArmPointerSize);
+
+void ArmVIXLJNIMacroAssembler::BuildFrame(size_t frame_size,
+                                          ManagedRegister method_reg,
+                                          ArrayRef<const ManagedRegister> callee_save_regs,
+                                          const ManagedRegisterEntrySpills& entry_spills) {
+  CHECK_ALIGNED(frame_size, kStackAlignment);
+  CHECK(r0.Is(method_reg.AsArm().AsVIXLRegister()));
+
+  // Push callee saves and link register.
+  RegList core_spill_mask = 1 << LR;
+  uint32_t fp_spill_mask = 0;
+  for (const ManagedRegister& reg : callee_save_regs) {
+    if (reg.AsArm().IsCoreRegister()) {
+      core_spill_mask |= 1 << reg.AsArm().AsCoreRegister();
+    } else {
+      fp_spill_mask |= 1 << reg.AsArm().AsSRegister();
+    }
+  }
+  ___ Push(RegisterList(core_spill_mask));
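+  // Record the CFA adjustment and the saved core registers' stack slots for unwinding.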
+  cfi().AdjustCFAOffset(POPCOUNT(core_spill_mask) * kFramePointerSize);
+  cfi().RelOffsetForMany(DWARFReg(r0), 0, core_spill_mask, kFramePointerSize);
+  if (fp_spill_mask != 0) {
+    uint32_t first = CTZ(fp_spill_mask);
+    uint32_t last  = first + POPCOUNT(fp_spill_mask) - 1;
+
+    // Check that list is contiguous.
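+    // I.e. shifting out the trailing zeros must leave a block of POPCOUNT(fp_spill_mask) ones.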
+    DCHECK_EQ(fp_spill_mask >> CTZ(fp_spill_mask), ~0u >> (32 - POPCOUNT(fp_spill_mask)));
+
+    ___ Vpush(SRegisterList(vixl32::SRegister(first), vixl32::SRegister(last)));
+    cfi().AdjustCFAOffset(POPCOUNT(fp_spill_mask) * kFramePointerSize);
+    cfi().RelOffsetForMany(DWARFReg(s0), 0, fp_spill_mask, kFramePointerSize);
+  }
+
+  // Increase frame to required size.
+  int pushed_values = POPCOUNT(core_spill_mask) + POPCOUNT(fp_spill_mask);
+  // Must at least have space for Method*.
+  CHECK_GT(frame_size, pushed_values * kFramePointerSize);
+  IncreaseFrameSize(frame_size - pushed_values * kFramePointerSize);  // handles CFI as well.
+
+  // Write out Method*.
+  asm_.StoreToOffset(kStoreWord, r0, sp, 0);
+
+  // Write out entry spills.
+  int32_t offset = frame_size + kFramePointerSize;
+  for (size_t i = 0; i < entry_spills.size(); ++i) {
+    ArmManagedRegister reg = entry_spills.at(i).AsArm();
+    if (reg.IsNoRegister()) {
+      // Only increment the stack offset.
+      ManagedRegisterSpill spill = entry_spills.at(i);
+      offset += spill.getSize();
+    } else if (reg.IsCoreRegister()) {
+      asm_.StoreToOffset(kStoreWord, reg.AsVIXLRegister(), sp, offset);
+      offset += 4;
+    } else if (reg.IsSRegister()) {
+      asm_.StoreSToOffset(reg.AsVIXLSRegister(), sp, offset);
+      offset += 4;
+    } else if (reg.IsDRegister()) {
+      asm_.StoreDToOffset(reg.AsVIXLDRegister(), sp, offset);
+      offset += 8;
+    }
+  }
+}
+
+void ArmVIXLJNIMacroAssembler::RemoveFrame(size_t frame_size,
+                                           ArrayRef<const ManagedRegister> callee_save_regs) {
+  CHECK_ALIGNED(frame_size, kStackAlignment);
+  cfi().RememberState();
+
+  // Compute callee saves to pop and PC.
+  RegList core_spill_mask = 1 << PC;
+  uint32_t fp_spill_mask = 0;
+  for (const ManagedRegister& reg : callee_save_regs) {
+    if (reg.AsArm().IsCoreRegister()) {
+      core_spill_mask |= 1 << reg.AsArm().AsCoreRegister();
+    } else {
+      fp_spill_mask |= 1 << reg.AsArm().AsSRegister();
+    }
+  }
+
+  // Decrease frame to start of callee saves.
+  int pop_values = POPCOUNT(core_spill_mask) + POPCOUNT(fp_spill_mask);
+  CHECK_GT(frame_size, pop_values * kFramePointerSize);
+  DecreaseFrameSize(frame_size - (pop_values * kFramePointerSize));  // handles CFI as well.
+
+  if (fp_spill_mask != 0) {
+    uint32_t first = CTZ(fp_spill_mask);
+    uint32_t last  = first + POPCOUNT(fp_spill_mask) - 1;
+    // Check that list is contiguous.
+    DCHECK_EQ(fp_spill_mask >> CTZ(fp_spill_mask), ~0u >> (32 - POPCOUNT(fp_spill_mask)));
+
+    ___ Vpop(SRegisterList(vixl32::SRegister(first), vixl32::SRegister(last)));
+    cfi().AdjustCFAOffset(-kFramePointerSize * POPCOUNT(fp_spill_mask));
+    cfi().RestoreMany(DWARFReg(s0), fp_spill_mask);
+  }
+
+  // Pop callee saves and PC.
+  ___ Pop(RegisterList(core_spill_mask));
+
+  // The CFI should be restored for any code that follows the exit block.
+  cfi().RestoreState();
+  cfi().DefCFAOffset(frame_size);
+}
+
+void ArmVIXLJNIMacroAssembler::IncreaseFrameSize(size_t adjust) {
+  asm_.AddConstant(sp, -adjust);
+  cfi().AdjustCFAOffset(adjust);
+}
+
+void ArmVIXLJNIMacroAssembler::DecreaseFrameSize(size_t adjust) {
+  asm_.AddConstant(sp, adjust);
+  cfi().AdjustCFAOffset(-adjust);
+}
+
+void ArmVIXLJNIMacroAssembler::Store(FrameOffset dest, ManagedRegister m_src, size_t size) {
+  ArmManagedRegister src = m_src.AsArm();
+  if (src.IsNoRegister()) {
+    CHECK_EQ(0u, size);
+  } else if (src.IsCoreRegister()) {
+    CHECK_EQ(4u, size);
+    asm_.StoreToOffset(kStoreWord, src.AsVIXLRegister(), sp, dest.Int32Value());
+  } else if (src.IsRegisterPair()) {
+    CHECK_EQ(8u, size);
+    asm_.StoreToOffset(kStoreWord, src.AsVIXLRegisterPairLow(),  sp, dest.Int32Value());
+    asm_.StoreToOffset(kStoreWord, src.AsVIXLRegisterPairHigh(), sp, dest.Int32Value() + 4);
+  } else if (src.IsSRegister()) {
+    CHECK_EQ(4u, size);
+    asm_.StoreSToOffset(src.AsVIXLSRegister(), sp, dest.Int32Value());
+  } else {
+    CHECK_EQ(8u, size);
+    CHECK(src.IsDRegister()) << src;
+    asm_.StoreDToOffset(src.AsVIXLDRegister(), sp, dest.Int32Value());
+  }
+}
+
+void ArmVIXLJNIMacroAssembler::StoreRef(FrameOffset dest, ManagedRegister msrc) {
+  ArmManagedRegister src = msrc.AsArm();
+  CHECK(src.IsCoreRegister()) << src;
+  asm_.StoreToOffset(kStoreWord, src.AsVIXLRegister(), sp, dest.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::StoreRawPtr(FrameOffset dest, ManagedRegister msrc) {
+  ArmManagedRegister src = msrc.AsArm();
+  CHECK(src.IsCoreRegister()) << src;
+  asm_.StoreToOffset(kStoreWord, src.AsVIXLRegister(), sp, dest.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::StoreSpanning(FrameOffset dest,
+                                             ManagedRegister msrc,
+                                             FrameOffset in_off,
+                                             ManagedRegister mscratch) {
+  ArmManagedRegister src = msrc.AsArm();
+  ArmManagedRegister scratch = mscratch.AsArm();
+  asm_.StoreToOffset(kStoreWord, src.AsVIXLRegister(), sp, dest.Int32Value());
+  asm_.LoadFromOffset(kLoadWord, scratch.AsVIXLRegister(), sp, in_off.Int32Value());
+  asm_.StoreToOffset(kStoreWord, scratch.AsVIXLRegister(), sp, dest.Int32Value() + 4);
+}
+
+void ArmVIXLJNIMacroAssembler::CopyRef(FrameOffset dest,
+                                       FrameOffset src,
+                                       ManagedRegister mscratch) {
+  ArmManagedRegister scratch = mscratch.AsArm();
+  asm_.LoadFromOffset(kLoadWord, scratch.AsVIXLRegister(), sp, src.Int32Value());
+  asm_.StoreToOffset(kStoreWord, scratch.AsVIXLRegister(), sp, dest.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::LoadRef(ManagedRegister dest,
+                                       ManagedRegister base,
+                                       MemberOffset offs,
+                                       bool unpoison_reference) {
+  ArmManagedRegister dst = dest.AsArm();
+  CHECK(dst.IsCoreRegister() && base.AsArm().IsCoreRegister()) << dst;
+  asm_.LoadFromOffset(kLoadWord,
+                      dst.AsVIXLRegister(),
+                      base.AsArm().AsVIXLRegister(),
+                      offs.Int32Value());
+
+  if (unpoison_reference) {
+    asm_.MaybeUnpoisonHeapReference(dst.AsVIXLRegister());
+  }
+}
+
+void ArmVIXLJNIMacroAssembler::LoadRef(ManagedRegister dest ATTRIBUTE_UNUSED,
+                                       FrameOffset src ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::LoadRawPtr(ManagedRegister dest ATTRIBUTE_UNUSED,
+                                          ManagedRegister base ATTRIBUTE_UNUSED,
+                                          Offset offs ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::StoreImmediateToFrame(FrameOffset dest,
+                                                     uint32_t imm,
+                                                     ManagedRegister scratch) {
+  ArmManagedRegister mscratch = scratch.AsArm();
+  CHECK(mscratch.IsCoreRegister()) << mscratch;
+  asm_.LoadImmediate(mscratch.AsVIXLRegister(), imm);
+  asm_.StoreToOffset(kStoreWord, mscratch.AsVIXLRegister(), sp, dest.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::Load(ManagedRegister m_dst, FrameOffset src, size_t size) {
+  return Load(m_dst.AsArm(), sp, src.Int32Value(), size);
+}
+
+void ArmVIXLJNIMacroAssembler::LoadFromThread(ManagedRegister m_dst ATTRIBUTE_UNUSED,
+                                              ThreadOffset32 src ATTRIBUTE_UNUSED,
+                                              size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::LoadRawPtrFromThread(ManagedRegister m_dst, ThreadOffset32 offs) {
+  ArmManagedRegister dst = m_dst.AsArm();
+  CHECK(dst.IsCoreRegister()) << dst;
+  asm_.LoadFromOffset(kLoadWord, dst.AsVIXLRegister(), tr, offs.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::CopyRawPtrFromThread(FrameOffset fr_offs,
+                                                    ThreadOffset32 thr_offs,
+                                                    ManagedRegister mscratch) {
+  ArmManagedRegister scratch = mscratch.AsArm();
+  CHECK(scratch.IsCoreRegister()) << scratch;
+  asm_.LoadFromOffset(kLoadWord, scratch.AsVIXLRegister(), tr, thr_offs.Int32Value());
+  asm_.StoreToOffset(kStoreWord, scratch.AsVIXLRegister(), sp, fr_offs.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::CopyRawPtrToThread(ThreadOffset32 thr_offs ATTRIBUTE_UNUSED,
+                                                  FrameOffset fr_offs ATTRIBUTE_UNUSED,
+                                                  ManagedRegister mscratch ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::StoreStackOffsetToThread(ThreadOffset32 thr_offs,
+                                                        FrameOffset fr_offs,
+                                                        ManagedRegister mscratch) {
+  ArmManagedRegister scratch = mscratch.AsArm();
+  CHECK(scratch.IsCoreRegister()) << scratch;
+  asm_.AddConstant(scratch.AsVIXLRegister(), sp, fr_offs.Int32Value());
+  asm_.StoreToOffset(kStoreWord, scratch.AsVIXLRegister(), tr, thr_offs.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::StoreStackPointerToThread(ThreadOffset32 thr_offs) {
+  asm_.StoreToOffset(kStoreWord, sp, tr, thr_offs.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::SignExtend(ManagedRegister mreg ATTRIBUTE_UNUSED,
+                                          size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "no sign extension necessary for arm";
+}
+
+void ArmVIXLJNIMacroAssembler::ZeroExtend(ManagedRegister mreg ATTRIBUTE_UNUSED,
+                                          size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "no zero extension necessary for arm";
+}
+
+void ArmVIXLJNIMacroAssembler::Move(ManagedRegister m_dst,
+                                    ManagedRegister m_src,
+                                    size_t size ATTRIBUTE_UNUSED) {
+  ArmManagedRegister dst = m_dst.AsArm();
+  ArmManagedRegister src = m_src.AsArm();
+  if (!dst.Equals(src)) {
+    if (dst.IsCoreRegister()) {
+      CHECK(src.IsCoreRegister()) << src;
+      ___ Mov(dst.AsVIXLRegister(), src.AsVIXLRegister());
+    } else if (dst.IsDRegister()) {
+      CHECK(src.IsDRegister()) << src;
+      ___ Vmov(F64, dst.AsVIXLDRegister(), src.AsVIXLDRegister());
+    } else if (dst.IsSRegister()) {
+      CHECK(src.IsSRegister()) << src;
+      ___ Vmov(F32, dst.AsVIXLSRegister(), src.AsVIXLSRegister());
+    } else {
+      CHECK(dst.IsRegisterPair()) << dst;
+      CHECK(src.IsRegisterPair()) << src;
+      // Ensure that the first move doesn't clobber the input of the second.
+      if (src.AsRegisterPairHigh() != dst.AsRegisterPairLow()) {
+        ___ Mov(dst.AsVIXLRegisterPairLow(),  src.AsVIXLRegisterPairLow());
+        ___ Mov(dst.AsVIXLRegisterPairHigh(), src.AsVIXLRegisterPairHigh());
+      } else {
+        ___ Mov(dst.AsVIXLRegisterPairHigh(), src.AsVIXLRegisterPairHigh());
+        ___ Mov(dst.AsVIXLRegisterPairLow(),  src.AsVIXLRegisterPairLow());
+      }
+    }
+  }
+}
+
+void ArmVIXLJNIMacroAssembler::Copy(FrameOffset dest,
+                                    FrameOffset src,
+                                    ManagedRegister scratch,
+                                    size_t size) {
+  ArmManagedRegister temp = scratch.AsArm();
+  CHECK(temp.IsCoreRegister()) << temp;
+  CHECK(size == 4 || size == 8) << size;
+  if (size == 4) {
+    asm_.LoadFromOffset(kLoadWord, temp.AsVIXLRegister(), sp, src.Int32Value());
+    asm_.StoreToOffset(kStoreWord, temp.AsVIXLRegister(), sp, dest.Int32Value());
+  } else if (size == 8) {
+    asm_.LoadFromOffset(kLoadWord, temp.AsVIXLRegister(), sp, src.Int32Value());
+    asm_.StoreToOffset(kStoreWord, temp.AsVIXLRegister(), sp, dest.Int32Value());
+    asm_.LoadFromOffset(kLoadWord, temp.AsVIXLRegister(), sp, src.Int32Value() + 4);
+    asm_.StoreToOffset(kStoreWord, temp.AsVIXLRegister(), sp, dest.Int32Value() + 4);
+  }
+}
+
+void ArmVIXLJNIMacroAssembler::Copy(FrameOffset dest ATTRIBUTE_UNUSED,
+                                    ManagedRegister src_base ATTRIBUTE_UNUSED,
+                                    Offset src_offset ATTRIBUTE_UNUSED,
+                                    ManagedRegister mscratch ATTRIBUTE_UNUSED,
+                                    size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::Copy(ManagedRegister dest_base ATTRIBUTE_UNUSED,
+                                    Offset dest_offset ATTRIBUTE_UNUSED,
+                                    FrameOffset src ATTRIBUTE_UNUSED,
+                                    ManagedRegister mscratch ATTRIBUTE_UNUSED,
+                                    size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::Copy(FrameOffset dst ATTRIBUTE_UNUSED,
+                                    FrameOffset src_base ATTRIBUTE_UNUSED,
+                                    Offset src_offset ATTRIBUTE_UNUSED,
+                                    ManagedRegister mscratch ATTRIBUTE_UNUSED,
+                                    size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::Copy(ManagedRegister dest ATTRIBUTE_UNUSED,
+                                    Offset dest_offset ATTRIBUTE_UNUSED,
+                                    ManagedRegister src ATTRIBUTE_UNUSED,
+                                    Offset src_offset ATTRIBUTE_UNUSED,
+                                    ManagedRegister mscratch ATTRIBUTE_UNUSED,
+                                    size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::Copy(FrameOffset dst ATTRIBUTE_UNUSED,
+                                    Offset dest_offset ATTRIBUTE_UNUSED,
+                                    FrameOffset src ATTRIBUTE_UNUSED,
+                                    Offset src_offset ATTRIBUTE_UNUSED,
+                                    ManagedRegister scratch ATTRIBUTE_UNUSED,
+                                    size_t size ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
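+// Conservative upper bound on the size of a single emitted instruction, used to size the
+// AssemblerAccurateScope regions below.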
+static constexpr uint32_t kArmInstrMaxSizeInBytes = 4;
+
+void ArmVIXLJNIMacroAssembler::CreateHandleScopeEntry(ManagedRegister mout_reg,
+                                                      FrameOffset handle_scope_offset,
+                                                      ManagedRegister min_reg,
+                                                      bool null_allowed) {
+  ArmManagedRegister out_reg = mout_reg.AsArm();
+  ArmManagedRegister in_reg = min_reg.AsArm();
+  CHECK(in_reg.IsNoRegister() || in_reg.IsCoreRegister()) << in_reg;
+  CHECK(out_reg.IsCoreRegister()) << out_reg;
+  if (null_allowed) {
+    // Null values get a handle scope entry value of 0.  Otherwise, the handle scope entry is
+    // the address in the handle scope holding the reference.
+    // e.g. out_reg = (handle == 0) ? 0 : (SP+handle_offset)
+    if (in_reg.IsNoRegister()) {
+      asm_.LoadFromOffset(kLoadWord,
+                          out_reg.AsVIXLRegister(),
+                          sp,
+                          handle_scope_offset.Int32Value());
+      in_reg = out_reg;
+    }
+    ___ Cmp(in_reg.AsVIXLRegister(), 0);
+
+    if (asm_.ShifterOperandCanHold(ADD, handle_scope_offset.Int32Value(), kCcDontCare)) {
+      if (!out_reg.Equals(in_reg)) {
+        AssemblerAccurateScope guard(asm_.GetVIXLAssembler(),
+                                     3 * kArmInstrMaxSizeInBytes,
+                                     CodeBufferCheckScope::kMaximumSize);
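+        // ITE EQ (mask 0xc): the MOV below executes on EQ, the AddConstantInIt on NE.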
+        ___ it(eq, 0xc);
+        ___ mov(eq, out_reg.AsVIXLRegister(), 0);
+        asm_.AddConstantInIt(out_reg.AsVIXLRegister(), sp, handle_scope_offset.Int32Value(), ne);
+      } else {
+        AssemblerAccurateScope guard(asm_.GetVIXLAssembler(),
+                                     2 * kArmInstrMaxSizeInBytes,
+                                     CodeBufferCheckScope::kMaximumSize);
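+        // IT NE (mask 0x8): only the AddConstantInIt below is conditional.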
+        ___ it(ne, 0x8);
+        asm_.AddConstantInIt(out_reg.AsVIXLRegister(), sp, handle_scope_offset.Int32Value(), ne);
+      }
+    } else {
+      // TODO: Implement this (old arm assembler would have crashed here).
+      UNIMPLEMENTED(FATAL);
+    }
+  } else {
+    asm_.AddConstant(out_reg.AsVIXLRegister(), sp, handle_scope_offset.Int32Value());
+  }
+}
+
+void ArmVIXLJNIMacroAssembler::CreateHandleScopeEntry(FrameOffset out_off,
+                                                      FrameOffset handle_scope_offset,
+                                                      ManagedRegister mscratch,
+                                                      bool null_allowed) {
+  ArmManagedRegister scratch = mscratch.AsArm();
+  CHECK(scratch.IsCoreRegister()) << scratch;
+  if (null_allowed) {
+    asm_.LoadFromOffset(kLoadWord, scratch.AsVIXLRegister(), sp, handle_scope_offset.Int32Value());
+    // Null values get a handle scope entry value of 0.  Otherwise, the handle scope entry is
+    // the address in the handle scope holding the reference.
+    // e.g. scratch = (scratch == 0) ? 0 : (SP+handle_scope_offset)
+    ___ Cmp(scratch.AsVIXLRegister(), 0);
+
+    if (asm_.ShifterOperandCanHold(ADD, handle_scope_offset.Int32Value(), kCcDontCare)) {
+      AssemblerAccurateScope guard(asm_.GetVIXLAssembler(),
+                                   2 * kArmInstrMaxSizeInBytes,
+                                   CodeBufferCheckScope::kMaximumSize);
+      ___ it(ne, 0x8);
+      asm_.AddConstantInIt(scratch.AsVIXLRegister(), sp, handle_scope_offset.Int32Value(), ne);
+    } else {
+      // TODO: Implement this (old arm assembler would have crashed here).
+      UNIMPLEMENTED(FATAL);
+    }
+  } else {
+    asm_.AddConstant(scratch.AsVIXLRegister(), sp, handle_scope_offset.Int32Value());
+  }
+  asm_.StoreToOffset(kStoreWord, scratch.AsVIXLRegister(), sp, out_off.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::LoadReferenceFromHandleScope(
+    ManagedRegister mout_reg ATTRIBUTE_UNUSED,
+    ManagedRegister min_reg ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::VerifyObject(ManagedRegister src ATTRIBUTE_UNUSED,
+                                            bool could_be_null ATTRIBUTE_UNUSED) {
+  // TODO: not validating references.
+}
+
+void ArmVIXLJNIMacroAssembler::VerifyObject(FrameOffset src ATTRIBUTE_UNUSED,
+                                            bool could_be_null ATTRIBUTE_UNUSED) {
+  // TODO: not validating references.
+}
+
+void ArmVIXLJNIMacroAssembler::Call(ManagedRegister mbase,
+                                    Offset offset,
+                                    ManagedRegister mscratch) {
+  ArmManagedRegister base = mbase.AsArm();
+  ArmManagedRegister scratch = mscratch.AsArm();
+  CHECK(base.IsCoreRegister()) << base;
+  CHECK(scratch.IsCoreRegister()) << scratch;
+  asm_.LoadFromOffset(kLoadWord,
+                      scratch.AsVIXLRegister(),
+                      base.AsVIXLRegister(),
+                      offset.Int32Value());
+  ___ Blx(scratch.AsVIXLRegister());
+  // TODO: place reference map on call.
+}
+
+void ArmVIXLJNIMacroAssembler::Call(FrameOffset base, Offset offset, ManagedRegister mscratch) {
+  ArmManagedRegister scratch = mscratch.AsArm();
+  CHECK(scratch.IsCoreRegister()) << scratch;
+  // Call *(*(SP + base) + offset)
+  asm_.LoadFromOffset(kLoadWord, scratch.AsVIXLRegister(), sp, base.Int32Value());
+  asm_.LoadFromOffset(kLoadWord,
+                      scratch.AsVIXLRegister(),
+                      scratch.AsVIXLRegister(),
+                      offset.Int32Value());
+  ___ Blx(scratch.AsVIXLRegister());
+  // TODO: place reference map on call.
+}
+
+void ArmVIXLJNIMacroAssembler::CallFromThread(ThreadOffset32 offset ATTRIBUTE_UNUSED,
+                                              ManagedRegister scratch ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::GetCurrentThread(ManagedRegister mtr) {
+  ___ Mov(mtr.AsArm().AsVIXLRegister(), tr);
+}
+
+void ArmVIXLJNIMacroAssembler::GetCurrentThread(FrameOffset dest_offset,
+                                                ManagedRegister scratch ATTRIBUTE_UNUSED) {
+  asm_.StoreToOffset(kStoreWord, tr, sp, dest_offset.Int32Value());
+}
+
+void ArmVIXLJNIMacroAssembler::ExceptionPoll(ManagedRegister m_scratch, size_t stack_adjust) {
+  CHECK_ALIGNED(stack_adjust, kStackAlignment);
+  ArmManagedRegister scratch = m_scratch.AsArm();
+  exception_blocks_.emplace_back(
+      new ArmVIXLJNIMacroAssembler::ArmException(scratch, stack_adjust));
+  asm_.LoadFromOffset(kLoadWord,
+                      scratch.AsVIXLRegister(),
+                      tr,
+                      Thread::ExceptionOffset<kArmPointerSize>().Int32Value());
+
+  ___ Cmp(scratch.AsVIXLRegister(), 0);
+  {
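+    // The scope guarantees the conditional branch below fits in the reserved buffer space.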
+    AssemblerAccurateScope guard(asm_.GetVIXLAssembler(),
+                                 kArmInstrMaxSizeInBytes,
+                                 CodeBufferCheckScope::kMaximumSize);
+    ___ b(ne, Narrow, exception_blocks_.back()->Entry());
+  }
+  // TODO: think about using CBNZ here.
+}
+
+void ArmVIXLJNIMacroAssembler::EmitExceptionPoll(
+    ArmVIXLJNIMacroAssembler::ArmException* exception) {
+  ___ Bind(exception->Entry());
+  if (exception->stack_adjust_ != 0) {  // Fix up the frame.
+    DecreaseFrameSize(exception->stack_adjust_);
+  }
+  // Pass exception object as argument.
+  // Don't care about preserving r0 as this won't return.
+  ___ Mov(r0, exception->scratch_.AsVIXLRegister());
+  // TODO: check that exception->scratch_ is dead by this point.
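+  // Borrow a scratch register from VIXL's pool (typically ip) to load the entrypoint address.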
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  vixl32::Register temp = temps.Acquire();
+  ___ Ldr(temp,
+          MemOperand(tr,
+              QUICK_ENTRYPOINT_OFFSET(kArmPointerSize, pDeliverException).Int32Value()));
+  ___ Blx(temp);
+}
+
+void ArmVIXLJNIMacroAssembler::MemoryBarrier(ManagedRegister scratch ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL);
+}
+
+void ArmVIXLJNIMacroAssembler::Load(ArmManagedRegister dest,
+                                    vixl32::Register base,
+                                    int32_t offset,
+                                    size_t size) {
+  if (dest.IsNoRegister()) {
+    CHECK_EQ(0u, size) << dest;
+  } else if (dest.IsCoreRegister()) {
+    CHECK_EQ(4u, size) << dest;
+    CHECK(!dest.AsVIXLRegister().Is(sp)) << dest;
+    ___ Ldr(dest.AsVIXLRegister(), MemOperand(base, offset));
+  } else if (dest.IsRegisterPair()) {
+    CHECK_EQ(8u, size) << dest;
+    ___ Ldr(dest.AsVIXLRegisterPairLow(),  MemOperand(base, offset));
+    ___ Ldr(dest.AsVIXLRegisterPairHigh(), MemOperand(base, offset + 4));
+  } else if (dest.IsSRegister()) {
+    ___ Vldr(dest.AsVIXLSRegister(), MemOperand(base, offset));
+  } else {
+    CHECK(dest.IsDRegister()) << dest;
+    ___ Vldr(dest.AsVIXLDRegister(), MemOperand(base, offset));
+  }
+}
+
+}  // namespace arm
+}  // namespace art
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
new file mode 100644
index 0000000..dfc35b7
--- /dev/null
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_UTILS_ARM_JNI_MACRO_ASSEMBLER_ARM_VIXL_H_
+#define ART_COMPILER_UTILS_ARM_JNI_MACRO_ASSEMBLER_ARM_VIXL_H_
+
+#include "base/arena_containers.h"
+#include "base/logging.h"
+#include "constants_arm.h"
+#include "offsets.h"
+#include "utils/arm/assembler_arm_shared.h"
+#include "utils/arm/assembler_arm_vixl.h"
+#include "utils/arm/managed_register_arm.h"
+#include "utils/assembler.h"
+#include "utils/jni_macro_assembler.h"
+
+namespace art {
+namespace arm {
+
+class ArmVIXLJNIMacroAssembler FINAL
+    : public JNIMacroAssemblerFwd<ArmVIXLAssembler, PointerSize::k32> {
+ private:
+  class ArmException;
+ public:
+  explicit ArmVIXLJNIMacroAssembler(ArenaAllocator* arena)
+      : JNIMacroAssemblerFwd(arena),
+        exception_blocks_(arena->Adapter(kArenaAllocAssembler)) {}
+
+  virtual ~ArmVIXLJNIMacroAssembler() {}
+  void FinalizeCode() OVERRIDE;
+
+  //
+  // Overridden common assembler high-level functionality
+  //
+
+  // Emit code that will create an activation on the stack.
+  void BuildFrame(size_t frame_size,
+                  ManagedRegister method_reg,
+                  ArrayRef<const ManagedRegister> callee_save_regs,
+                  const ManagedRegisterEntrySpills& entry_spills) OVERRIDE;
+
+  // Emit code that will remove an activation from the stack.
+  void RemoveFrame(size_t frame_size,
+                   ArrayRef<const ManagedRegister> callee_save_regs) OVERRIDE;
+
+  void IncreaseFrameSize(size_t adjust) OVERRIDE;
+  void DecreaseFrameSize(size_t adjust) OVERRIDE;
+
+  // Store routines.
+  void Store(FrameOffset offs, ManagedRegister src, size_t size) OVERRIDE;
+  void StoreRef(FrameOffset dest, ManagedRegister src) OVERRIDE;
+  void StoreRawPtr(FrameOffset dest, ManagedRegister src) OVERRIDE;
+
+  void StoreImmediateToFrame(FrameOffset dest, uint32_t imm, ManagedRegister scratch) OVERRIDE;
+
+  void StoreStackOffsetToThread(ThreadOffset32 thr_offs,
+                                FrameOffset fr_offs,
+                                ManagedRegister scratch) OVERRIDE;
+
+  void StoreStackPointerToThread(ThreadOffset32 thr_offs) OVERRIDE;
+
+  void StoreSpanning(FrameOffset dest,
+                     ManagedRegister src,
+                     FrameOffset in_off,
+                     ManagedRegister scratch) OVERRIDE;
+
+  // Load routines.
+  void Load(ManagedRegister dest, FrameOffset src, size_t size) OVERRIDE;
+
+  void LoadFromThread(ManagedRegister dest,
+                      ThreadOffset32 src,
+                      size_t size) OVERRIDE;
+
+  void LoadRef(ManagedRegister dest, FrameOffset src) OVERRIDE;
+
+  void LoadRef(ManagedRegister dest,
+               ManagedRegister base,
+               MemberOffset offs,
+               bool unpoison_reference) OVERRIDE;
+
+  void LoadRawPtr(ManagedRegister dest, ManagedRegister base, Offset offs) OVERRIDE;
+
+  void LoadRawPtrFromThread(ManagedRegister dest, ThreadOffset32 offs) OVERRIDE;
+
+  // Copying routines.
+  void Move(ManagedRegister dest, ManagedRegister src, size_t size) OVERRIDE;
+
+  void CopyRawPtrFromThread(FrameOffset fr_offs,
+                            ThreadOffset32 thr_offs,
+                            ManagedRegister scratch) OVERRIDE;
+
+  void CopyRawPtrToThread(ThreadOffset32 thr_offs,
+                          FrameOffset fr_offs,
+                          ManagedRegister scratch) OVERRIDE;
+
+  void CopyRef(FrameOffset dest, FrameOffset src, ManagedRegister scratch) OVERRIDE;
+
+  void Copy(FrameOffset dest, FrameOffset src, ManagedRegister scratch, size_t size) OVERRIDE;
+
+  void Copy(FrameOffset dest,
+            ManagedRegister src_base,
+            Offset src_offset,
+            ManagedRegister scratch,
+            size_t size) OVERRIDE;
+
+  void Copy(ManagedRegister dest_base,
+            Offset dest_offset,
+            FrameOffset src,
+            ManagedRegister scratch,
+            size_t size) OVERRIDE;
+
+  void Copy(FrameOffset dest,
+            FrameOffset src_base,
+            Offset src_offset,
+            ManagedRegister scratch,
+            size_t size) OVERRIDE;
+
+  void Copy(ManagedRegister dest,
+            Offset dest_offset,
+            ManagedRegister src,
+            Offset src_offset,
+            ManagedRegister scratch,
+            size_t size) OVERRIDE;
+
+  void Copy(FrameOffset dest,
+            Offset dest_offset,
+            FrameOffset src,
+            Offset src_offset,
+            ManagedRegister scratch,
+            size_t size) OVERRIDE;
+
+  // Sign extension.
+  void SignExtend(ManagedRegister mreg, size_t size) OVERRIDE;
+
+  // Zero extension.
+  void ZeroExtend(ManagedRegister mreg, size_t size) OVERRIDE;
+
+  // Exploit fast access in managed code to Thread::Current().
+  void GetCurrentThread(ManagedRegister mtr) OVERRIDE;
+  void GetCurrentThread(FrameOffset dest_offset,
+                        ManagedRegister scratch) OVERRIDE;
+
+  // Set up out_reg to hold an Object** into the handle scope, or to be null if the
+  // value is null and null_allowed. in_reg holds a possibly stale reference
+  // that can be used to avoid loading the handle scope entry to see if the value is
+  // null.
+  void CreateHandleScopeEntry(ManagedRegister out_reg,
+                              FrameOffset handlescope_offset,
+                              ManagedRegister in_reg,
+                              bool null_allowed) OVERRIDE;
+
+  // Set up out_off to hold an Object** into the handle scope, or to be null if the
+  // value is null and null_allowed.
+  void CreateHandleScopeEntry(FrameOffset out_off,
+                              FrameOffset handlescope_offset,
+                              ManagedRegister scratch,
+                              bool null_allowed) OVERRIDE;
+
+  // src holds a handle scope entry (Object**); load this into dst.
+  void LoadReferenceFromHandleScope(ManagedRegister dst,
+                                    ManagedRegister src) OVERRIDE;
+
+  // Heap::VerifyObject on src. In some cases (such as a reference to this) we
+  // know that src may not be null.
+  void VerifyObject(ManagedRegister src, bool could_be_null) OVERRIDE;
+  void VerifyObject(FrameOffset src, bool could_be_null) OVERRIDE;
+
+  // Call to address held at [base+offset].
+  void Call(ManagedRegister base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void Call(FrameOffset base, Offset offset, ManagedRegister scratch) OVERRIDE;
+  void CallFromThread(ThreadOffset32 offset, ManagedRegister scratch) OVERRIDE;
+
+  // Generate code to check if Thread::Current()->exception_ is non-null
+  // and branch to an ExceptionSlowPath if it is.
+  void ExceptionPoll(ManagedRegister scratch, size_t stack_adjust) OVERRIDE;
+
+  void MemoryBarrier(ManagedRegister scratch) OVERRIDE;
+
+  void EmitExceptionPoll(ArmVIXLJNIMacroAssembler::ArmException* exception);
+  void Load(ArmManagedRegister dest, vixl32::Register base, int32_t offset, size_t size);
+
+ private:
+  class ArmException {
+   private:
+    ArmException(ArmManagedRegister scratch, size_t stack_adjust)
+        : scratch_(scratch), stack_adjust_(stack_adjust) {}
+
+    vixl32::Label* Entry() { return &exception_entry_; }
+
+    // Register used for passing Thread::Current()->exception_.
+    const ArmManagedRegister scratch_;
+
+    // Stack adjust for ExceptionPoll.
+    const size_t stack_adjust_;
+
+    vixl32::Label exception_entry_;
+
+    friend class ArmVIXLJNIMacroAssembler;
+    DISALLOW_COPY_AND_ASSIGN(ArmException);
+  };
+
+  // List of exception blocks to emit at the end of the generated code.
+  ArenaVector<std::unique_ptr<ArmVIXLJNIMacroAssembler::ArmException>> exception_blocks_;
+  // Used for testing.
+  friend class ArmVIXAssemblerTest_VixlLoadFromOffset_Test;
+  friend class ArmVIXAssemblerTest_VixlStoreToOffset_Test;
+};
+
+}  // namespace arm
+}  // namespace art
+
+#endif  // ART_COMPILER_UTILS_ARM_JNI_MACRO_ASSEMBLER_ARM_VIXL_H_
diff --git a/compiler/utils/arm/managed_register_arm.h b/compiler/utils/arm/managed_register_arm.h
index 276db44..2be2d56 100644
--- a/compiler/utils/arm/managed_register_arm.h
+++ b/compiler/utils/arm/managed_register_arm.h
@@ -22,6 +22,12 @@
 #include "debug/dwarf/register.h"
 #include "utils/managed_register.h"
 
+// TODO(VIXL): Make VIXL compile with -Wshadow.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wshadow"
+#include "aarch32/macro-assembler-aarch32.h"
+#pragma GCC diagnostic pop
+
 namespace art {
 namespace arm {
 
@@ -90,16 +96,31 @@
     return static_cast<Register>(id_);
   }
 
+  vixl::aarch32::Register AsVIXLRegister() const {
+    CHECK(IsCoreRegister());
+    return vixl::aarch32::Register(id_);
+  }
+
   constexpr SRegister AsSRegister() const {
     CHECK(IsSRegister());
     return static_cast<SRegister>(id_ - kNumberOfCoreRegIds);
   }
 
+  vixl::aarch32::SRegister AsVIXLSRegister() const {
+    CHECK(IsSRegister());
+    return vixl::aarch32::SRegister(id_ - kNumberOfCoreRegIds);
+  }
+
   constexpr DRegister AsDRegister() const {
     CHECK(IsDRegister());
     return static_cast<DRegister>(id_ - kNumberOfCoreRegIds - kNumberOfSRegIds);
   }
 
+  vixl::aarch32::DRegister AsVIXLDRegister() const {
+    CHECK(IsDRegister());
+    return vixl::aarch32::DRegister(id_ - kNumberOfCoreRegIds - kNumberOfSRegIds);
+  }
+
   constexpr SRegister AsOverlappingDRegisterLow() const {
     CHECK(IsOverlappingDRegister());
     DRegister d_reg = AsDRegister();
@@ -128,12 +149,20 @@
     return FromRegId(AllocIdLow()).AsCoreRegister();
   }
 
+  vixl::aarch32::Register AsVIXLRegisterPairLow() const {
+    return vixl::aarch32::Register(AsRegisterPairLow());
+  }
+
   constexpr Register AsRegisterPairHigh() const {
     CHECK(IsRegisterPair());
     // Appropriate mapping of register ids allows to use AllocIdHigh().
     return FromRegId(AllocIdHigh()).AsCoreRegister();
   }
 
+  vixl::aarch32::Register AsVIXLRegisterPairHigh() const {
+    return vixl::aarch32::Register(AsRegisterPairHigh());
+  }
+
   constexpr bool IsCoreRegister() const {
     CHECK(IsValidManagedRegister());
     return (0 <= id_) && (id_ < kNumberOfCoreRegIds);
diff --git a/compiler/utils/assembler.cc b/compiler/utils/assembler.cc
index 81159e6..57f3b15 100644
--- a/compiler/utils/assembler.cc
+++ b/compiler/utils/assembler.cc
@@ -20,7 +20,6 @@
 #include <vector>
 
 #ifdef ART_ENABLE_CODEGEN_arm
-#include "arm/assembler_arm32.h"
 #include "arm/assembler_thumb2.h"
 #endif
 #ifdef ART_ENABLE_CODEGEN_arm64
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index 9c9271d..41cb04b 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -23,6 +23,10 @@
 
 #include "gtest/gtest.h"
 #include "utils/arm/assembler_thumb2.h"
+
+#include "jni/quick/calling_convention.h"
+#include "utils/arm/jni_macro_assembler_arm_vixl.h"
+
 #include "base/hex_dump.h"
 #include "common_runtime_test.h"
 
@@ -1608,6 +1612,196 @@
   EmitAndCheck(&assembler, "CmpConstant");
 }
 
+#define ENABLE_VIXL_TEST
+
+#ifdef ENABLE_VIXL_TEST
+
+#define ARM_VIXL
+
+#ifdef ARM_VIXL
+typedef arm::ArmVIXLJNIMacroAssembler JniAssemblerType;
+#else
+typedef arm::Thumb2Assembler AssemblerType;
+#endif
+
+class ArmVIXAssemblerTest : public ::testing::Test {
+ public:
+  ArmVIXAssemblerTest() : pool(), arena(&pool), assembler(&arena) { }
+
+  ArenaPool pool;
+  ArenaAllocator arena;
+  JniAssemblerType assembler;
+};
+
 #undef __
+#define __ assembler->
+
+void EmitAndCheck(JniAssemblerType* assembler, const char* testname,
+                  const char* const* results) {
+  __ FinalizeCode();
+  size_t cs = __ CodeSize();
+  std::vector<uint8_t> managed_code(cs);
+  MemoryRegion code(&managed_code[0], managed_code.size());
+  __ FinalizeInstructions(code);
+
+  DumpAndCheck(managed_code, testname, results);
+}
+
+void EmitAndCheck(JniAssemblerType* assembler, const char* testname) {
+  InitResults();
+  std::map<std::string, const char* const*>::iterator results = test_results.find(testname);
+  ASSERT_NE(results, test_results.end());
+
+  EmitAndCheck(assembler, testname, results->second);
+}
+
+#undef __
+#define __ assembler.
+
+TEST_F(ArmVIXAssemblerTest, VixlJniHelpers) {
+  const bool is_static = true;
+  const bool is_synchronized = false;
+  const char* shorty = "IIFII";
+
+  ArenaPool pool;
+  ArenaAllocator arena(&pool);
+
+  std::unique_ptr<JniCallingConvention> jni_conv(
+      JniCallingConvention::Create(&arena, is_static, is_synchronized, shorty, kThumb2));
+  std::unique_ptr<ManagedRuntimeCallingConvention> mr_conv(
+      ManagedRuntimeCallingConvention::Create(&arena, is_static, is_synchronized, shorty, kThumb2));
+  const int frame_size(jni_conv->FrameSize());
+  ArrayRef<const ManagedRegister> callee_save_regs = jni_conv->CalleeSaveRegisters();
+
+  const ManagedRegister method_register = ArmManagedRegister::FromCoreRegister(R0);
+  const ManagedRegister scratch_register = ArmManagedRegister::FromCoreRegister(R12);
+
+  __ BuildFrame(frame_size, mr_conv->MethodRegister(), callee_save_regs, mr_conv->EntrySpills());
+  __ IncreaseFrameSize(32);
+
+  // Loads
+  __ IncreaseFrameSize(4096);
+  __ Load(method_register, FrameOffset(32), 4);
+  __ Load(method_register, FrameOffset(124), 4);
+  __ Load(method_register, FrameOffset(132), 4);
+  __ Load(method_register, FrameOffset(1020), 4);
+  __ Load(method_register, FrameOffset(1024), 4);
+  __ Load(scratch_register, FrameOffset(4092), 4);
+  __ Load(scratch_register, FrameOffset(4096), 4);
+  __ LoadRawPtrFromThread(scratch_register, ThreadOffset32(512));
+  __ LoadRef(method_register, scratch_register, MemberOffset(128), true);
+
+  // Stores
+  __ Store(FrameOffset(32), method_register, 4);
+  __ Store(FrameOffset(124), method_register, 4);
+  __ Store(FrameOffset(132), method_register, 4);
+  __ Store(FrameOffset(1020), method_register, 4);
+  __ Store(FrameOffset(1024), method_register, 4);
+  __ Store(FrameOffset(4092), scratch_register, 4);
+  __ Store(FrameOffset(4096), scratch_register, 4);
+  __ StoreImmediateToFrame(FrameOffset(48), 0xFF, scratch_register);
+  __ StoreImmediateToFrame(FrameOffset(48), 0xFFFFFF, scratch_register);
+  __ StoreRawPtr(FrameOffset(48), scratch_register);
+  __ StoreRef(FrameOffset(48), scratch_register);
+  __ StoreSpanning(FrameOffset(48), method_register, FrameOffset(48), scratch_register);
+  __ StoreStackOffsetToThread(ThreadOffset32(512), FrameOffset(4096), scratch_register);
+  __ StoreStackPointerToThread(ThreadOffset32(512));
+
+  // Other
+  __ Call(method_register, FrameOffset(48), scratch_register);
+  __ Copy(FrameOffset(48), FrameOffset(44), scratch_register, 4);
+  __ CopyRawPtrFromThread(FrameOffset(44), ThreadOffset32(512), scratch_register);
+  __ CopyRef(FrameOffset(48), FrameOffset(44), scratch_register);
+  __ GetCurrentThread(method_register);
+  __ GetCurrentThread(FrameOffset(48), scratch_register);
+  __ Move(scratch_register, method_register, 4);
+  __ VerifyObject(scratch_register, false);
+
+  __ CreateHandleScopeEntry(scratch_register, FrameOffset(48), scratch_register, true);
+  __ CreateHandleScopeEntry(scratch_register, FrameOffset(48), scratch_register, false);
+  __ CreateHandleScopeEntry(method_register, FrameOffset(48), scratch_register, true);
+  __ CreateHandleScopeEntry(FrameOffset(48), FrameOffset(64), scratch_register, true);
+  __ CreateHandleScopeEntry(method_register, FrameOffset(0), scratch_register, true);
+  __ CreateHandleScopeEntry(method_register, FrameOffset(1025), scratch_register, true);
+  __ CreateHandleScopeEntry(scratch_register, FrameOffset(1025), scratch_register, true);
+
+  __ ExceptionPoll(scratch_register, 0);
+
+  __ DecreaseFrameSize(4096);
+  __ DecreaseFrameSize(32);
+  __ RemoveFrame(frame_size, callee_save_regs);
+
+  EmitAndCheck(&assembler, "VixlJniHelpers");
+}
+
+#ifdef ARM_VIXL
+#define R0 vixl::aarch32::r0
+#define R2 vixl::aarch32::r2
+#define R4 vixl::aarch32::r4
+#define R12 vixl::aarch32::r12
+#undef __
+#define __ assembler.asm_.
+#endif
+
+TEST_F(ArmVIXAssemblerTest, VixlLoadFromOffset) {
+  __ LoadFromOffset(kLoadWord, R2, R4, 12);
+  __ LoadFromOffset(kLoadWord, R2, R4, 0xfff);
+  __ LoadFromOffset(kLoadWord, R2, R4, 0x1000);
+  __ LoadFromOffset(kLoadWord, R2, R4, 0x1000a4);
+  __ LoadFromOffset(kLoadWord, R2, R4, 0x101000);
+  __ LoadFromOffset(kLoadWord, R4, R4, 0x101000);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 12);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 0xfff);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 0x1000);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 0x1000a4);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R2, R4, 0x101000);
+  __ LoadFromOffset(kLoadUnsignedHalfword, R4, R4, 0x101000);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 12);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 0x3fc);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 0x400);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 0x400a4);
+  __ LoadFromOffset(kLoadWordPair, R2, R4, 0x40400);
+  __ LoadFromOffset(kLoadWordPair, R4, R4, 0x40400);
+
+  __ LoadFromOffset(kLoadWord, R0, R12, 12);  // 32-bit because of R12.
+  __ LoadFromOffset(kLoadWord, R2, R4, 0xa4 - 0x100000);
+
+  __ LoadFromOffset(kLoadSignedByte, R2, R4, 12);
+  __ LoadFromOffset(kLoadUnsignedByte, R2, R4, 12);
+  __ LoadFromOffset(kLoadSignedHalfword, R2, R4, 12);
+
+  EmitAndCheck(&assembler, "VixlLoadFromOffset");
+}
+
+TEST_F(ArmVIXAssemblerTest, VixlStoreToOffset) {
+  __ StoreToOffset(kStoreWord, R2, R4, 12);
+  __ StoreToOffset(kStoreWord, R2, R4, 0xfff);
+  __ StoreToOffset(kStoreWord, R2, R4, 0x1000);
+  __ StoreToOffset(kStoreWord, R2, R4, 0x1000a4);
+  __ StoreToOffset(kStoreWord, R2, R4, 0x101000);
+  __ StoreToOffset(kStoreWord, R4, R4, 0x101000);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 12);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 0xfff);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 0x1000);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 0x1000a4);
+  __ StoreToOffset(kStoreHalfword, R2, R4, 0x101000);
+  __ StoreToOffset(kStoreHalfword, R4, R4, 0x101000);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 12);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 0x3fc);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 0x400);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 0x400a4);
+  __ StoreToOffset(kStoreWordPair, R2, R4, 0x40400);
+  __ StoreToOffset(kStoreWordPair, R4, R4, 0x40400);
+
+  __ StoreToOffset(kStoreWord, R0, R12, 12);  // 32-bit because of R12.
+  __ StoreToOffset(kStoreWord, R2, R4, 0xa4 - 0x100000);
+
+  __ StoreToOffset(kStoreByte, R2, R4, 12);
+
+  EmitAndCheck(&assembler, "VixlStoreToOffset");
+}
+
+#undef __
+#endif  // ENABLE_VIXL_TEST
 }  // namespace arm
 }  // namespace art
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index 6736015..81c6ec5 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -5468,6 +5468,199 @@
   nullptr
 };
 
+const char* const VixlJniHelpersResults[] = {
+  "   0:  e92d 4de0   stmdb sp!, {r5, r6, r7, r8, sl, fp, lr}\n",
+  "   4:  ed2d 8a10   vpush {s16-s31}\n",
+  "   8:  b089        sub sp, #36 ; 0x24\n",
+  "   a:  9000        str r0, [sp, #0]\n",
+  "   c:  9121        str r1, [sp, #132]  ; 0x84\n",
+  "   e:  ed8d 0a22   vstr  s0, [sp, #136]  ; 0x88\n",
+  "  12:  9223        str r2, [sp, #140]  ; 0x8c\n",
+  "  14:  9324        str r3, [sp, #144]  ; 0x90\n",
+  "  16:  b088        sub sp, #32\n",
+  "  18:  f5ad 5d80   sub.w sp, sp, #4096 ; 0x1000\n",
+  "  1c:  9808        ldr r0, [sp, #32]\n",
+  "  1e:  981f        ldr r0, [sp, #124]  ; 0x7c\n",
+  "  20:  9821        ldr r0, [sp, #132]  ; 0x84\n",
+  "  22:  98ff        ldr r0, [sp, #1020] ; 0x3fc\n",
+  "  24:  f8dd 0400   ldr.w r0, [sp, #1024] ; 0x400\n",
+  "  28:  f8dd cffc   ldr.w ip, [sp, #4092] ; 0xffc\n",
+  "  2c:  f50d 5c80   add.w ip, sp, #4096 ; 0x1000\n",
+  "  30:  f8dc c000   ldr.w ip, [ip]\n",
+  "  34:  f8d9 c200   ldr.w ip, [r9, #512]  ; 0x200\n",
+  "  38:  f8dc 0080   ldr.w r0, [ip, #128]  ; 0x80\n",
+  "  3c:  9008        str r0, [sp, #32]\n",
+  "  3e:  901f        str r0, [sp, #124]  ; 0x7c\n",
+  "  40:  9021        str r0, [sp, #132]  ; 0x84\n",
+  "  42:  90ff        str r0, [sp, #1020] ; 0x3fc\n",
+  "  44:  f8cd 0400   str.w r0, [sp, #1024] ; 0x400\n",
+  "  48:  f8cd cffc   str.w ip, [sp, #4092] ; 0xffc\n",
+  "  4c:  f84d 5d04   str.w r5, [sp, #-4]!\n",
+  "  50:  f50d 5580   add.w r5, sp, #4096 ; 0x1000\n",
+  "  54:  f8c5 c004   str.w ip, [r5, #4]\n",
+  "  58:  f85d 5b04   ldr.w r5, [sp], #4\n",
+  "  5c:  f04f 0cff   mov.w ip, #255  ; 0xff\n",
+  "  60:  f8cd c030   str.w ip, [sp, #48] ; 0x30\n",
+  "  64:  f06f 4c7f   mvn.w ip, #4278190080 ; 0xff000000\n",
+  "  68:  f8cd c030   str.w ip, [sp, #48] ; 0x30\n",
+  "  6c:  f8cd c030   str.w ip, [sp, #48] ; 0x30\n",
+  "  70:  f8cd c030   str.w ip, [sp, #48] ; 0x30\n",
+  "  74:  900c        str r0, [sp, #48] ; 0x30\n",
+  "  76:  f8dd c030   ldr.w ip, [sp, #48] ; 0x30\n",
+  "  7a:  f8cd c034   str.w ip, [sp, #52] ; 0x34\n",
+  "  7e:  f50d 5c80   add.w ip, sp, #4096 ; 0x1000\n",
+  "  82:  f8c9 c200   str.w ip, [r9, #512]  ; 0x200\n",
+  "  86:  f8c9 d200   str.w sp, [r9, #512]  ; 0x200\n",
+  "  8a:  f8d0 c030   ldr.w ip, [r0, #48] ; 0x30\n",
+  "  8e:  47e0        blx ip\n",
+  "  90:  f8dd c02c   ldr.w ip, [sp, #44] ; 0x2c\n",
+  "  94:  f8cd c030   str.w ip, [sp, #48] ; 0x30\n",
+  "  98:  f8d9 c200   ldr.w ip, [r9, #512]  ; 0x200\n",
+  "  9c:  f8cd c02c   str.w ip, [sp, #44] ; 0x2c\n",
+  "  a0:  f8dd c02c   ldr.w ip, [sp, #44] ; 0x2c\n",
+  "  a4:  f8cd c030   str.w ip, [sp, #48] ; 0x30\n",
+  "  a8:  4648        mov r0, r9\n",
+  "  aa:  f8cd 9030   str.w r9, [sp, #48] ; 0x30\n",
+  "  ae:  4684        mov ip, r0\n",
+  "  b0:  f1bc 0f00   cmp.w ip, #0\n",
+  "  b4:  bf18        it  ne\n",
+  "  b6:  f10d 0c30   addne.w ip, sp, #48 ; 0x30\n",
+  "  ba:  f10d 0c30   add.w ip, sp, #48 ; 0x30\n",
+  "  be:  f1bc 0f00   cmp.w ip, #0\n",
+  "  c2:  bf0c        ite eq\n",
+  "  c4:  2000        moveq r0, #0\n",
+  "  c6:  a80c        addne r0, sp, #48 ; 0x30\n",
+  "  c8:  f8dd c040   ldr.w ip, [sp, #64] ; 0x40\n",
+  "  cc:  f1bc 0f00   cmp.w ip, #0\n",
+  "  d0:  bf18        it  ne\n",
+  "  d2:  f10d 0c40   addne.w ip, sp, #64 ; 0x40\n",
+  "  d6:  f8cd c030   str.w ip, [sp, #48] ; 0x30\n",
+  "  da:  f1bc 0f00   cmp.w ip, #0\n",
+  "  de:  bf0c        ite eq\n",
+  "  e0:  2000        moveq r0, #0\n",
+  "  e2:  4668        movne r0, sp\n",
+  "  e4:  f1bc 0f00   cmp.w ip, #0\n",
+  "  e8:  bf0c        ite eq\n",
+  "  ea:  2000        moveq r0, #0\n",
+  "  ec:  f20d 4001   addwne  r0, sp, #1025 ; 0x401\n",
+  "  f0:  f1bc 0f00   cmp.w ip, #0\n",
+  "  f4:  bf18        it  ne\n",
+  "  f6:  f20d 4c01   addwne  ip, sp, #1025 ; 0x401\n",
+  "  fa:  f8d9 c084   ldr.w ip, [r9, #132]  ; 0x84\n",
+  "  fe:  f1bc 0f00   cmp.w ip, #0\n",
+  " 102:  d107        bne.n 114 <VixlJniHelpers+0x114>\n",
+  " 104:  f50d 5d80   add.w sp, sp, #4096 ; 0x1000\n",
+  " 108:  b008        add sp, #32\n",
+  " 10a:  b009        add sp, #36 ; 0x24\n",
+  " 10c:  ecbd 8a10   vpop  {s16-s31}\n",
+  " 110:  e8bd 8de0   ldmia.w sp!, {r5, r6, r7, r8, sl, fp, pc}\n",
+  " 114:  4660        mov r0, ip\n",
+  " 116:  f8d9 c2ac   ldr.w ip, [r9, #684]  ; 0x2ac\n",
+  " 11a:  47e0        blx ip\n",
+  nullptr
+};
+
+const char* const VixlLoadFromOffsetResults[] = {
+  "   0:  68e2        ldr r2, [r4, #12]\n",
+  "   2:  f8d4 2fff   ldr.w r2, [r4, #4095] ; 0xfff\n",
+  "   6:  f504 5280   add.w r2, r4, #4096 ; 0x1000\n",
+  "   a:  6812        ldr r2, [r2, #0]\n",
+  "   c:  f504 1280   add.w r2, r4, #1048576  ; 0x100000\n",
+  "  10:  f8d2 20a4   ldr.w r2, [r2, #164]  ; 0xa4\n",
+  "  14:  f44f 5280   mov.w r2, #4096 ; 0x1000\n",
+  "  18:  f2c0 0210   movt  r2, #16\n",
+  "  1c:  4422        add r2, r4\n",
+  "  1e:  6812        ldr r2, [r2, #0]\n",
+  "  20:  f44f 5c80   mov.w ip, #4096 ; 0x1000\n",
+  "  24:  f2c0 0c10   movt  ip, #16\n",
+  "  28:  4464        add r4, ip\n",
+  "  2a:  6824        ldr r4, [r4, #0]\n",
+  "  2c:  89a2        ldrh  r2, [r4, #12]\n",
+  "  2e:  f8b4 2fff   ldrh.w  r2, [r4, #4095] ; 0xfff\n",
+  "  32:  f504 5280   add.w r2, r4, #4096 ; 0x1000\n",
+  "  36:  8812        ldrh  r2, [r2, #0]\n",
+  "  38:  f504 1280   add.w r2, r4, #1048576  ; 0x100000\n",
+  "  3c:  f8b2 20a4   ldrh.w  r2, [r2, #164]  ; 0xa4\n",
+  "  40:  f44f 5280   mov.w r2, #4096 ; 0x1000\n",
+  "  44:  f2c0 0210   movt  r2, #16\n",
+  "  48:  4422        add r2, r4\n",
+  "  4a:  8812        ldrh  r2, [r2, #0]\n",
+  "  4c:  f44f 5c80   mov.w ip, #4096 ; 0x1000\n",
+  "  50:  f2c0 0c10   movt  ip, #16\n",
+  "  54:  4464        add r4, ip\n",
+  "  56:  8824        ldrh  r4, [r4, #0]\n",
+  "  58:  e9d4 2303   ldrd  r2, r3, [r4, #12]\n",
+  "  5c:  e9d4 23ff   ldrd  r2, r3, [r4, #1020] ; 0x3fc\n",
+  "  60:  f504 6280   add.w r2, r4, #1024 ; 0x400\n",
+  "  64:  e9d2 2300   ldrd  r2, r3, [r2]\n",
+  "  68:  f504 2280   add.w r2, r4, #262144 ; 0x40000\n",
+  "  6c:  e9d2 2329   ldrd  r2, r3, [r2, #164]  ; 0xa4\n",
+  "  70:  f44f 6280   mov.w r2, #1024 ; 0x400\n",
+  "  74:  f2c0 0204   movt  r2, #4\n",
+  "  78:  4422        add r2, r4\n",
+  "  7a:  e9d2 2300   ldrd  r2, r3, [r2]\n",
+  "  7e:  f44f 6c80   mov.w ip, #1024 ; 0x400\n",
+  "  82:  f2c0 0c04   movt  ip, #4\n",
+  "  86:  4464        add r4, ip\n",
+  "  88:  e9d4 4500   ldrd  r4, r5, [r4]\n",
+  "  8c:  f8dc 000c   ldr.w r0, [ip, #12]\n",
+  "  90:  f5a4 1280   sub.w r2, r4, #1048576  ; 0x100000\n",
+  "  94:  f8d2 20a4   ldr.w r2, [r2, #164]  ; 0xa4\n",
+  "  98:  f994 200c   ldrsb.w r2, [r4, #12]\n",
+  "  9c:  7b22        ldrb  r2, [r4, #12]\n",
+  "  9e:  f9b4 200c   ldrsh.w r2, [r4, #12]\n",
+  nullptr
+};
+
+const char* const VixlStoreToOffsetResults[] = {
+  "   0:  60e2        str r2, [r4, #12]\n",
+  "   2:  f8c4 2fff   str.w r2, [r4, #4095] ; 0xfff\n",
+  "   6:  f504 5c80   add.w ip, r4, #4096 ; 0x1000\n",
+  "   a:  f8cc 2000   str.w r2, [ip]\n",
+  "   e:  f504 1c80   add.w ip, r4, #1048576  ; 0x100000\n",
+  "  12:  f8cc 20a4   str.w r2, [ip, #164]  ; 0xa4\n",
+  "  16:  f44f 5c80   mov.w ip, #4096 ; 0x1000\n",
+  "  1a:  f2c0 0c10   movt  ip, #16\n",
+  "  1e:  44a4        add ip, r4\n",
+  "  20:  f8cc 2000   str.w r2, [ip]\n",
+  "  24:  f44f 5c80   mov.w ip, #4096 ; 0x1000\n",
+  "  28:  f2c0 0c10   movt  ip, #16\n",
+  "  2c:  44a4        add ip, r4\n",
+  "  2e:  f8cc 4000   str.w r4, [ip]\n",
+  "  32:  81a2        strh  r2, [r4, #12]\n",
+  "  34:  f8a4 2fff   strh.w  r2, [r4, #4095] ; 0xfff\n",
+  "  38:  f504 5c80   add.w ip, r4, #4096 ; 0x1000\n",
+  "  3c:  f8ac 2000   strh.w  r2, [ip]\n",
+  "  40:  f504 1c80   add.w ip, r4, #1048576  ; 0x100000\n",
+  "  44:  f8ac 20a4   strh.w  r2, [ip, #164]  ; 0xa4\n",
+  "  48:  f44f 5c80   mov.w ip, #4096 ; 0x1000\n",
+  "  4c:  f2c0 0c10   movt  ip, #16\n",
+  "  50:  44a4        add ip, r4\n",
+  "  52:  f8ac 2000   strh.w  r2, [ip]\n",
+  "  56:  f44f 5c80   mov.w ip, #4096 ; 0x1000\n",
+  "  5a:  f2c0 0c10   movt  ip, #16\n",
+  "  5e:  44a4        add ip, r4\n",
+  "  60:  f8ac 4000   strh.w  r4, [ip]\n",
+  "  64:  e9c4 2303   strd  r2, r3, [r4, #12]\n",
+  "  68:  e9c4 23ff   strd  r2, r3, [r4, #1020] ; 0x3fc\n",
+  "  6c:  f504 6c80   add.w ip, r4, #1024 ; 0x400\n",
+  "  70:  e9cc 2300   strd  r2, r3, [ip]\n",
+  "  74:  f504 2c80   add.w ip, r4, #262144 ; 0x40000\n",
+  "  78:  e9cc 2329   strd  r2, r3, [ip, #164]  ; 0xa4\n",
+  "  7c:  f44f 6c80   mov.w ip, #1024 ; 0x400\n",
+  "  80:  f2c0 0c04   movt  ip, #4\n",
+  "  84:  44a4        add ip, r4\n",
+  "  86:  e9cc 2300   strd  r2, r3, [ip]\n",
+  "  8a:  f44f 6c80   mov.w ip, #1024 ; 0x400\n",
+  "  8e:  f2c0 0c04   movt  ip, #4\n",
+  "  92:  44a4        add ip, r4\n",
+  "  94:  e9cc 4500   strd  r4, r5, [ip]\n",
+  "  98:  f8cc 000c   str.w r0, [ip, #12]\n",
+  "  9c:  f5a4 1c80   sub.w ip, r4, #1048576  ; 0x100000\n",
+  "  a0:  f8cc 20a4   str.w r2, [ip, #164]  ; 0xa4\n",
+  "  a4:  7322        strb  r2, [r4, #12]\n",
+  nullptr
+};
+
 std::map<std::string, const char* const*> test_results;
 void setup_results() {
     test_results["SimpleMov"] = SimpleMovResults;
@@ -5520,4 +5713,7 @@
     test_results["CompareAndBranch"] = CompareAndBranchResults;
     test_results["AddConstant"] = AddConstantResults;
     test_results["CmpConstant"] = CmpConstantResults;
+    test_results["VixlJniHelpers"] = VixlJniHelpersResults;
+    test_results["VixlStoreToOffset"] = VixlStoreToOffsetResults;
+    test_results["VixlLoadFromOffset"] = VixlLoadFromOffsetResults;
 }
diff --git a/compiler/utils/jni_macro_assembler.cc b/compiler/utils/jni_macro_assembler.cc
index 1b74313..2f154fb 100644
--- a/compiler/utils/jni_macro_assembler.cc
+++ b/compiler/utils/jni_macro_assembler.cc
@@ -20,7 +20,7 @@
 #include <vector>
 
 #ifdef ART_ENABLE_CODEGEN_arm
-#include "arm/jni_macro_assembler_arm.h"
+#include "arm/jni_macro_assembler_arm_vixl.h"
 #endif
 #ifdef ART_ENABLE_CODEGEN_arm64
 #include "arm64/jni_macro_assembler_arm64.h"
@@ -58,7 +58,7 @@
 #ifdef ART_ENABLE_CODEGEN_arm
     case kArm:
     case kThumb2:
-      return MacroAsm32UniquePtr(new (arena) arm::ArmJNIMacroAssembler(arena, instruction_set));
+      return MacroAsm32UniquePtr(new (arena) arm::ArmVIXLJNIMacroAssembler(arena));
 #endif
 #ifdef ART_ENABLE_CODEGEN_mips
     case kMips:
diff --git a/compiler/utils/label.h b/compiler/utils/label.h
index 1038f44..0f82ad5 100644
--- a/compiler/utils/label.h
+++ b/compiler/utils/label.h
@@ -28,7 +28,6 @@
 
 namespace arm {
   class ArmAssembler;
-  class Arm32Assembler;
   class Thumb2Assembler;
 }
 namespace arm64 {
@@ -118,7 +117,6 @@
   }
 
   friend class arm::ArmAssembler;
-  friend class arm::Arm32Assembler;
   friend class arm::Thumb2Assembler;
   friend class arm64::Arm64Assembler;
   friend class mips::MipsAssembler;
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index 8b7da3f..bfc63d1 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -1407,44 +1407,6 @@
   }
 }
 
-void MipsAssembler::StoreConst32ToOffset(int32_t value,
-                                         Register base,
-                                         int32_t offset,
-                                         Register temp) {
-  CHECK_NE(temp, AT);  // Must not use AT as temp, so as not to overwrite the adjusted base.
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ false);
-  if (value == 0) {
-    temp = ZERO;
-  } else {
-    LoadConst32(temp, value);
-  }
-  Sw(temp, base, offset);
-}
-
-void MipsAssembler::StoreConst64ToOffset(int64_t value,
-                                         Register base,
-                                         int32_t offset,
-                                         Register temp) {
-  CHECK_NE(temp, AT);  // Must not use AT as temp, so as not to overwrite the adjusted base.
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ true);
-  uint32_t low = Low32Bits(value);
-  uint32_t high = High32Bits(value);
-  if (low == 0) {
-    Sw(ZERO, base, offset);
-  } else {
-    LoadConst32(temp, low);
-    Sw(temp, base, offset);
-  }
-  if (high == 0) {
-    Sw(ZERO, base, offset + kMipsWordSize);
-  } else {
-    if (high != low) {
-      LoadConst32(temp, high);
-    }
-    Sw(temp, base, offset + kMipsWordSize);
-  }
-}
-
 void MipsAssembler::LoadSConst32(FRegister r, int32_t value, Register temp) {
   if (value == 0) {
     temp = ZERO;
@@ -2533,61 +2495,19 @@
   CHECK_EQ(misalignment, offset & (kMipsDoublewordSize - 1));
 }
 
-void MipsAssembler::LoadFromOffset(LoadOperandType type, Register reg, Register base,
+void MipsAssembler::LoadFromOffset(LoadOperandType type,
+                                   Register reg,
+                                   Register base,
                                    int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword));
-  switch (type) {
-    case kLoadSignedByte:
-      Lb(reg, base, offset);
-      break;
-    case kLoadUnsignedByte:
-      Lbu(reg, base, offset);
-      break;
-    case kLoadSignedHalfword:
-      Lh(reg, base, offset);
-      break;
-    case kLoadUnsignedHalfword:
-      Lhu(reg, base, offset);
-      break;
-    case kLoadWord:
-      Lw(reg, base, offset);
-      break;
-    case kLoadDoubleword:
-      if (reg == base) {
-        // This will clobber the base when loading the lower register. Since we have to load the
-        // higher register as well, this will fail. Solution: reverse the order.
-        Lw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
-        Lw(reg, base, offset);
-      } else {
-        Lw(reg, base, offset);
-        Lw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
-      }
-      break;
-    default:
-      LOG(FATAL) << "UNREACHABLE";
-  }
+  LoadFromOffset<>(type, reg, base, offset);
 }
 
 void MipsAssembler::LoadSFromOffset(FRegister reg, Register base, int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ false, /* is_float */ true);
-  Lwc1(reg, base, offset);
+  LoadSFromOffset<>(reg, base, offset);
 }
 
 void MipsAssembler::LoadDFromOffset(FRegister reg, Register base, int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ true, /* is_float */ true);
-  if (offset & 0x7) {
-    if (Is32BitFPU()) {
-      Lwc1(reg, base, offset);
-      Lwc1(static_cast<FRegister>(reg + 1), base, offset + kMipsWordSize);
-    } else {
-      // 64-bit FPU.
-      Lwc1(reg, base, offset);
-      Lw(T8, base, offset + kMipsWordSize);
-      Mthc1(T8, reg);
-    }
-  } else {
-    Ldc1(reg, base, offset);
-  }
+  LoadDFromOffset<>(reg, base, offset);
 }
 
 void MipsAssembler::EmitLoad(ManagedRegister m_dst, Register src_register, int32_t src_offset,
@@ -2611,53 +2531,19 @@
   }
 }
 
-void MipsAssembler::StoreToOffset(StoreOperandType type, Register reg, Register base,
+void MipsAssembler::StoreToOffset(StoreOperandType type,
+                                  Register reg,
+                                  Register base,
                                   int32_t offset) {
-  // Must not use AT as `reg`, so as not to overwrite the value being stored
-  // with the adjusted `base`.
-  CHECK_NE(reg, AT);
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword));
-  switch (type) {
-    case kStoreByte:
-      Sb(reg, base, offset);
-      break;
-    case kStoreHalfword:
-      Sh(reg, base, offset);
-      break;
-    case kStoreWord:
-      Sw(reg, base, offset);
-      break;
-    case kStoreDoubleword:
-      CHECK_NE(reg, base);
-      CHECK_NE(static_cast<Register>(reg + 1), base);
-      Sw(reg, base, offset);
-      Sw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
-      break;
-    default:
-      LOG(FATAL) << "UNREACHABLE";
-  }
+  StoreToOffset<>(type, reg, base, offset);
 }
 
 void MipsAssembler::StoreSToOffset(FRegister reg, Register base, int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ false, /* is_float */ true);
-  Swc1(reg, base, offset);
+  StoreSToOffset<>(reg, base, offset);
 }
 
 void MipsAssembler::StoreDToOffset(FRegister reg, Register base, int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ true, /* is_float */ true);
-  if (offset & 0x7) {
-    if (Is32BitFPU()) {
-      Swc1(reg, base, offset);
-      Swc1(static_cast<FRegister>(reg + 1), base, offset + kMipsWordSize);
-    } else {
-      // 64-bit FPU.
-      Mfhc1(T8, reg);
-      Swc1(reg, base, offset);
-      Sw(T8, base, offset + kMipsWordSize);
-    }
-  } else {
-    Sdc1(reg, base, offset);
-  }
+  StoreDToOffset<>(reg, base, offset);
 }
 
 static dwarf::Reg DWARFReg(Register reg) {
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 41b6c6b..434ca67 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -412,8 +412,6 @@
   void LoadConst64(Register reg_hi, Register reg_lo, int64_t value);
   void LoadDConst64(FRegister rd, int64_t value, Register temp);
   void LoadSConst32(FRegister r, int32_t value, Register temp);
-  void StoreConst32ToOffset(int32_t value, Register base, int32_t offset, Register temp);
-  void StoreConst64ToOffset(int64_t value, Register base, int32_t offset, Register temp);
   void Addiu32(Register rt, Register rs, int32_t value, Register rtmp = AT);
 
   // These will generate R2 branches or R6 branches as appropriate.
@@ -444,6 +442,204 @@
                            int32_t& offset,
                            bool is_doubleword,
                            bool is_float = false);
+
+ private:
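+  // Default no-op functor for the ImplicitNullChecker parameter of the templated load/store
+  // helpers below. Code generators may instead pass a functor that is invoked right after the
+  // first memory access, so that instruction can be recorded as an implicit null check.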
+  struct NoImplicitNullChecker {
+    void operator()() {}
+  };
+
+ public:
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreConst32ToOffset(int32_t value,
+                            Register base,
+                            int32_t offset,
+                            Register temp,
+                            ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    CHECK_NE(temp, AT);  // Must not use AT as temp, so as not to overwrite the adjusted base.
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ false);
+    if (value == 0) {
+      temp = ZERO;
+    } else {
+      LoadConst32(temp, value);
+    }
+    Sw(temp, base, offset);
+    null_checker();
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreConst64ToOffset(int64_t value,
+                            Register base,
+                            int32_t offset,
+                            Register temp,
+                            ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    CHECK_NE(temp, AT);  // Must not use AT as temp, so as not to overwrite the adjusted base.
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ true);
+    uint32_t low = Low32Bits(value);
+    uint32_t high = High32Bits(value);
+    if (low == 0) {
+      Sw(ZERO, base, offset);
+    } else {
+      LoadConst32(temp, low);
+      Sw(temp, base, offset);
+    }
+    null_checker();
+    if (high == 0) {
+      Sw(ZERO, base, offset + kMipsWordSize);
+    } else {
+      if (high != low) {
+        LoadConst32(temp, high);
+      }
+      Sw(temp, base, offset + kMipsWordSize);
+    }
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void LoadFromOffset(LoadOperandType type,
+                      Register reg,
+                      Register base,
+                      int32_t offset,
+                      ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword));
+    switch (type) {
+      case kLoadSignedByte:
+        Lb(reg, base, offset);
+        break;
+      case kLoadUnsignedByte:
+        Lbu(reg, base, offset);
+        break;
+      case kLoadSignedHalfword:
+        Lh(reg, base, offset);
+        break;
+      case kLoadUnsignedHalfword:
+        Lhu(reg, base, offset);
+        break;
+      case kLoadWord:
+        Lw(reg, base, offset);
+        break;
+      case kLoadDoubleword:
+        if (reg == base) {
+          // This will clobber the base when loading the lower register. Since we have to load the
+          // higher register as well, this will fail. Solution: reverse the order.
+          Lw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
+          null_checker();
+          Lw(reg, base, offset);
+        } else {
+          Lw(reg, base, offset);
+          null_checker();
+          Lw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
+        }
+        break;
+      default:
+        LOG(FATAL) << "UNREACHABLE";
+    }
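+    // For doubleword loads the null checker has already been invoked after the first word above.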
+    if (type != kLoadDoubleword) {
+      null_checker();
+    }
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void LoadSFromOffset(FRegister reg,
+                       Register base,
+                       int32_t offset,
+                       ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ false, /* is_float */ true);
+    Lwc1(reg, base, offset);
+    null_checker();
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void LoadDFromOffset(FRegister reg,
+                       Register base,
+                       int32_t offset,
+                       ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ true, /* is_float */ true);
+    if (IsAligned<kMipsDoublewordSize>(offset)) {
+      Ldc1(reg, base, offset);
+      null_checker();
+    } else {
+      if (Is32BitFPU()) {
+        Lwc1(reg, base, offset);
+        null_checker();
+        Lwc1(static_cast<FRegister>(reg + 1), base, offset + kMipsWordSize);
+      } else {
+        // 64-bit FPU.
+        Lwc1(reg, base, offset);
+        null_checker();
+        Lw(T8, base, offset + kMipsWordSize);
+        Mthc1(T8, reg);
+      }
+    }
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreToOffset(StoreOperandType type,
+                     Register reg,
+                     Register base,
+                     int32_t offset,
+                     ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    // Must not use AT as `reg`, so as not to overwrite the value being stored
+    // with the adjusted `base`.
+    CHECK_NE(reg, AT);
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword));
+    switch (type) {
+      case kStoreByte:
+        Sb(reg, base, offset);
+        break;
+      case kStoreHalfword:
+        Sh(reg, base, offset);
+        break;
+      case kStoreWord:
+        Sw(reg, base, offset);
+        break;
+      case kStoreDoubleword:
+        CHECK_NE(reg, base);
+        CHECK_NE(static_cast<Register>(reg + 1), base);
+        Sw(reg, base, offset);
+        null_checker();
+        Sw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
+        break;
+      default:
+        LOG(FATAL) << "UNREACHABLE";
+    }
+    if (type != kStoreDoubleword) {
+      null_checker();
+    }
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreSToOffset(FRegister reg,
+                      Register base,
+                      int32_t offset,
+                      ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ false, /* is_float */ true);
+    Swc1(reg, base, offset);
+    null_checker();
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreDToOffset(FRegister reg,
+                      Register base,
+                      int32_t offset,
+                      ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ true, /* is_float */ true);
+    if (IsAligned<kMipsDoublewordSize>(offset)) {
+      Sdc1(reg, base, offset);
+      null_checker();
+    } else {
+      if (Is32BitFPU()) {
+        Swc1(reg, base, offset);
+        null_checker();
+        Swc1(static_cast<FRegister>(reg + 1), base, offset + kMipsWordSize);
+      } else {
+        // 64-bit FPU.
+        Mfhc1(T8, reg);
+        Swc1(reg, base, offset);
+        null_checker();
+        Sw(T8, base, offset + kMipsWordSize);
+      }
+    }
+  }
+
   void LoadFromOffset(LoadOperandType type, Register reg, Register base, int32_t offset);
   void LoadSFromOffset(FRegister reg, Register base, int32_t offset);
   void LoadDFromOffset(FRegister reg, Register base, int32_t offset);
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index f1a9915..f2ef41f 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1148,6 +1148,23 @@
 }
 
 
+void X86Assembler::testb(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
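+  // TEST r/m8, imm8 is encoded as 0xF6 /0; EAX (register number 0) supplies the ModRM
+  // opcode-extension field.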
+  EmitUint8(0xF6);
+  EmitOperand(EAX, dst);
+  CHECK(imm.is_int8());
+  EmitUint8(imm.value() & 0xFF);
+}
+
+
+void X86Assembler::testl(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
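+  // TEST r/m32, imm32 is encoded as 0xF7 /0; the 0 passed to EmitOperand supplies the ModRM
+  // opcode-extension field.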
+  EmitUint8(0xF7);
+  EmitOperand(0, dst);
+  EmitImmediate(imm);
+}
+
+
 void X86Assembler::andl(Register dst, Register src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x23);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 63aa4a4..2ddcd76 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -496,6 +496,9 @@
   void testl(Register reg, const Immediate& imm);
   void testl(Register reg1, const Address& address);
 
+  void testb(const Address& dst, const Immediate& imm);
+  void testl(const Address& dst, const Immediate& imm);
+
   void andl(Register dst, const Immediate& imm);
   void andl(Register dst, Register src);
   void andl(Register dst, const Address& address);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 307e034..61d70d7 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -375,6 +375,42 @@
   DriverStr(expected, "cmovl_address");
 }
 
+TEST_F(AssemblerX86Test, TestbAddressImmediate) {
+  GetAssembler()->testb(
+      x86::Address(x86::Register(x86::EDI), x86::Register(x86::EBX), x86::TIMES_4, 12),
+      x86::Immediate(1));
+  GetAssembler()->testb(
+      x86::Address(x86::Register(x86::ESP), FrameOffset(7)),
+      x86::Immediate(-128));
+  GetAssembler()->testb(
+      x86::Address(x86::Register(x86::EBX), MemberOffset(130)),
+      x86::Immediate(127));
+  const char* expected =
+      "testb $1, 0xc(%EDI,%EBX,4)\n"
+      "testb $-128, 0x7(%ESP)\n"
+      "testb $127, 0x82(%EBX)\n";
+
+  DriverStr(expected, "TestbAddressImmediate");
+}
+
+TEST_F(AssemblerX86Test, TestlAddressImmediate) {
+  GetAssembler()->testl(
+      x86::Address(x86::Register(x86::EDI), x86::Register(x86::EBX), x86::TIMES_4, 12),
+      x86::Immediate(1));
+  GetAssembler()->testl(
+      x86::Address(x86::Register(x86::ESP), FrameOffset(7)),
+      x86::Immediate(-100000));
+  GetAssembler()->testl(
+      x86::Address(x86::Register(x86::EBX), MemberOffset(130)),
+      x86::Immediate(77777777));
+  const char* expected =
+      "testl $1, 0xc(%EDI,%EBX,4)\n"
+      "testl $-100000, 0x7(%ESP)\n"
+      "testl $77777777, 0x82(%EBX)\n";
+
+  DriverStr(expected, "TestlAddressImmediate");
+}
+
 /////////////////
 // Near labels //
 /////////////////
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index ddc8244..1f73aa7 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1389,6 +1389,25 @@
 }
 
 
+void X86_64Assembler::testb(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
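+  // Same 0xF6 /0 (TEST r/m8, imm8) encoding as on x86, preceded by an optional REX prefix
+  // when the address uses extended registers.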
+  EmitOptionalRex32(dst);
+  EmitUint8(0xF6);
+  EmitOperand(Register::RAX, dst);
+  CHECK(imm.is_int8());
+  EmitUint8(imm.value() & 0xFF);
+}
+
+
+void X86_64Assembler::testl(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
+  EmitUint8(0xF7);
+  EmitOperand(0, dst);
+  EmitImmediate(imm);
+}
+
+
 void X86_64Assembler::andl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(dst, src);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index a4166f9..3a4bfca 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -528,6 +528,9 @@
   void testq(CpuRegister reg1, CpuRegister reg2);
   void testq(CpuRegister reg, const Address& address);
 
+  void testb(const Address& address, const Immediate& imm);
+  void testl(const Address& address, const Immediate& imm);
+
   void andl(CpuRegister dst, const Immediate& imm);
   void andl(CpuRegister dst, CpuRegister src);
   void andl(CpuRegister reg, const Address& address);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 36c966b..48a1876 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1526,6 +1526,48 @@
   DriverStr(expected, "cmpb");
 }
 
+TEST_F(AssemblerX86_64Test, TestbAddressImmediate) {
+  GetAssembler()->testb(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
+                      x86_64::CpuRegister(x86_64::RBX),
+                      x86_64::TIMES_4,
+                      12),
+      x86_64::Immediate(1));
+  GetAssembler()->testb(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RSP), FrameOffset(7)),
+      x86_64::Immediate(-128));
+  GetAssembler()->testb(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RBX), MemberOffset(130)),
+      x86_64::Immediate(127));
+  const char* expected =
+      "testb $1, 0xc(%RDI,%RBX,4)\n"
+      "testb $-128, 0x7(%RSP)\n"
+      "testb $127, 0x82(%RBX)\n";
+
+  DriverStr(expected, "TestbAddressImmediate");
+}
+
+TEST_F(AssemblerX86_64Test, TestlAddressImmediate) {
+  GetAssembler()->testl(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
+                      x86_64::CpuRegister(x86_64::RBX),
+                      x86_64::TIMES_4,
+                      12),
+      x86_64::Immediate(1));
+  GetAssembler()->testl(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RSP), FrameOffset(7)),
+      x86_64::Immediate(-100000));
+  GetAssembler()->testl(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RBX), MemberOffset(130)),
+      x86_64::Immediate(77777777));
+  const char* expected =
+      "testl $1, 0xc(%RDI,%RBX,4)\n"
+      "testl $-100000, 0x7(%RSP)\n"
+      "testl $77777777, 0x82(%RBX)\n";
+
+  DriverStr(expected, "TestlAddressImmediate");
+}
+
 class JNIMacroAssemblerX86_64Test : public JNIMacroAssemblerTest<x86_64::X86_64JNIMacroAssembler> {
  public:
   using Base = JNIMacroAssemblerTest<x86_64::X86_64JNIMacroAssembler>;
diff --git a/dalvikvm/Android.mk b/dalvikvm/Android.mk
index 71e9a28..6c0bcb1 100644
--- a/dalvikvm/Android.mk
+++ b/dalvikvm/Android.mk
@@ -18,7 +18,7 @@
 
 include art/build/Android.common.mk
 
-dalvikvm_cflags := -Wall -Werror -Wextra -std=gnu++11
+dalvikvm_cflags := -Wall -Werror -Wextra
 
 include $(CLEAR_VARS)
 LOCAL_MODULE := dalvikvm
diff --git a/dex2oat/Android.mk b/dex2oat/Android.mk
index 37acef6..d38cc91 100644
--- a/dex2oat/Android.mk
+++ b/dex2oat/Android.mk
@@ -82,14 +82,14 @@
 ifeq ($(ART_BUILD_HOST_NDEBUG),true)
   $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libart-compiler libsigchain libziparchive-host liblz4,art/compiler,host,ndebug,$(dex2oat_host_arch)))
   ifeq ($(ART_BUILD_HOST_STATIC),true)
-    $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libart libart-compiler libart libvixl-arm64 $(DEX2OAT_STATIC_DEPENDENCIES),art/compiler,host,ndebug,$(dex2oat_host_arch),static))
+    $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libart libart-compiler libart libvixl-arm libvixl-arm64 $(DEX2OAT_STATIC_DEPENDENCIES),art/compiler,host,ndebug,$(dex2oat_host_arch),static))
   endif
 endif
 
 ifeq ($(ART_BUILD_HOST_DEBUG),true)
   $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libcutils libartd-compiler libsigchain libziparchive-host liblz4,art/compiler,host,debug,$(dex2oat_host_arch)))
   ifeq ($(ART_BUILD_HOST_STATIC),true)
-    $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libartd libartd-compiler libartd libvixld-arm64 $(DEX2OAT_STATIC_DEPENDENCIES),art/compiler,host,debug,$(dex2oat_host_arch),static))
+    $(eval $(call build-art-executable,dex2oat,$(DEX2OAT_SRC_FILES),libartd libartd-compiler libartd libvixld-arm libvixld-arm64 $(DEX2OAT_STATIC_DEPENDENCIES),art/compiler,host,debug,$(dex2oat_host_arch),static))
   endif
 endif
 
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index cfcfe1c..febfb63 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -495,7 +495,7 @@
  public:
   explicit Dex2Oat(TimingLogger* timings) :
       compiler_kind_(Compiler::kOptimizing),
-      instruction_set_(kRuntimeISA),
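+      // The compiler generates Thumb2, not ARM32, code, so map kArm to kThumb2 here.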
+      instruction_set_(kRuntimeISA == kArm ? kThumb2 : kRuntimeISA),
       // Take the default set of instruction features from the build.
       image_file_location_oat_checksum_(0),
       image_file_location_oat_data_begin_(0),
diff --git a/disassembler/disassembler.cc b/disassembler/disassembler.cc
index e604c1f..bcd0d16 100644
--- a/disassembler/disassembler.cc
+++ b/disassembler/disassembler.cc
@@ -32,10 +32,8 @@
     return new arm::DisassemblerArm(options);
   } else if (instruction_set == kArm64) {
     return new arm64::DisassemblerArm64(options);
-  } else if (instruction_set == kMips) {
-    return new mips::DisassemblerMips(options, false);
-  } else if (instruction_set == kMips64) {
-    return new mips::DisassemblerMips(options, true);
+  } else if (instruction_set == kMips || instruction_set == kMips64) {
+    return new mips::DisassemblerMips(options);
   } else if (instruction_set == kX86) {
     return new x86::DisassemblerX86(options, false);
   } else if (instruction_set == kX86_64) {
diff --git a/disassembler/disassembler.h b/disassembler/disassembler.h
index b080315..86793cc 100644
--- a/disassembler/disassembler.h
+++ b/disassembler/disassembler.h
@@ -28,8 +28,9 @@
 
 class DisassemblerOptions {
  public:
-  // Should the disassembler print absolute or relative addresses.
-  const bool absolute_addresses_;
+  using ThreadOffsetNameFunction = void (*)(std::ostream& os, uint32_t offset);
+
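+  // Prints the name of the Thread field or entrypoint stored at the given Thread offset;
+  // used to annotate thread-register-relative memory accesses in the disassembly.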
+  ThreadOffsetNameFunction thread_offset_name_function_;
 
   // Base address for calculating relative code offsets when absolute_addresses_ is false.
   const uint8_t* const base_address_;
@@ -37,6 +38,9 @@
   // End address (exclusive);
   const uint8_t* const end_address_;
 
+  // Should the disassembler print absolute or relative addresses.
+  const bool absolute_addresses_;
+
   // If set, the disassembler is allowed to look at load targets in literal
   // pools.
   const bool can_read_literals_;
@@ -44,10 +48,12 @@
   DisassemblerOptions(bool absolute_addresses,
                       const uint8_t* base_address,
                       const uint8_t* end_address,
-                      bool can_read_literals)
-      : absolute_addresses_(absolute_addresses),
+                      bool can_read_literals,
+                      ThreadOffsetNameFunction fn)
+      : thread_offset_name_function_(fn),
         base_address_(base_address),
         end_address_(end_address),
+        absolute_addresses_(absolute_addresses),
         can_read_literals_(can_read_literals) {}
 
  private:
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 4f0e144..a47b6ad 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -25,7 +25,6 @@
 #include "base/bit_utils.h"
 #include "base/logging.h"
 #include "base/stringprintf.h"
-#include "thread.h"
 
 namespace art {
 namespace arm {
@@ -329,7 +328,7 @@
           }
           if (rn.r == 9) {
             args << "  ; ";
-            Thread::DumpThreadOffset<kArmPointerSize>(args, offset);
+            GetDisassemblerOptions()->thread_offset_name_function_(args, offset);
           }
         }
       }
@@ -1401,7 +1400,7 @@
             args << Rt << ", [" << Rn << ", #" << (U != 0u ? "" : "-") << imm12 << "]";
             if (Rn.r == TR && is_load) {
               args << "  ; ";
-              Thread::DumpThreadOffset<kArmPointerSize>(args, imm12);
+              GetDisassemblerOptions()->thread_offset_name_function_(args, imm12);
             } else if (Rn.r == PC) {
               T2LitType lit_type[] = {
                   kT2LitUByte, kT2LitUHalf, kT2LitHexWord, kT2LitInvalid,
diff --git a/disassembler/disassembler_arm64.cc b/disassembler/disassembler_arm64.cc
index 0ef9025..80bacb2 100644
--- a/disassembler/disassembler_arm64.cc
+++ b/disassembler/disassembler_arm64.cc
@@ -22,7 +22,6 @@
 
 #include "base/logging.h"
 #include "base/stringprintf.h"
-#include "thread.h"
 
 using namespace vixl::aarch64;  // NOLINT(build/namespaces)
 
@@ -102,7 +101,7 @@
   if (instr->GetRn() == TR) {
     int64_t offset = instr->GetImmLSUnsigned() << instr->GetSizeLS();
     std::ostringstream tmp_stream;
-    Thread::DumpThreadOffset<kArm64PointerSize>(tmp_stream, static_cast<uint32_t>(offset));
+    options_->thread_offset_name_function_(tmp_stream, static_cast<uint32_t>(offset));
     AppendToOutput(" ; %s", tmp_stream.str().c_str());
   }
 }
diff --git a/disassembler/disassembler_arm64.h b/disassembler/disassembler_arm64.h
index 7c64792..19e4dfb 100644
--- a/disassembler/disassembler_arm64.h
+++ b/disassembler/disassembler_arm64.h
@@ -35,7 +35,8 @@
       : vixl::aarch64::Disassembler(),
         read_literals_(options->can_read_literals_),
         base_address_(options->base_address_),
-        end_address_(options->end_address_) {
+        end_address_(options->end_address_),
+        options_(options) {
     if (!options->absolute_addresses_) {
       MapCodeAddress(0,
                      reinterpret_cast<const vixl::aarch64::Instruction*>(options->base_address_));
@@ -64,6 +65,8 @@
   // Valid address range: [base_address_, end_address_)
   const void* const base_address_;
   const void* const end_address_;
+
+  DisassemblerOptions* options_;
 };
 
 class DisassemblerArm64 FINAL : public Disassembler {
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index 3448878..02c6d71 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -21,7 +21,6 @@
 
 #include "base/logging.h"
 #include "base/stringprintf.h"
-#include "thread.h"
 
 namespace art {
 namespace mips {
@@ -503,11 +502,7 @@
               args << StringPrintf("%+d(r%d)", offset, rs);
               if (rs == 17) {
                 args << "  ; ";
-                if (is64bit_) {
-                  Thread::DumpThreadOffset<kMips64PointerSize>(args, offset);
-                } else {
-                  Thread::DumpThreadOffset<kMipsPointerSize>(args, offset);
-                }
+                GetDisassemblerOptions()->thread_offset_name_function_(args, offset);
               }
             }
             break;
diff --git a/disassembler/disassembler_mips.h b/disassembler/disassembler_mips.h
index b0e49b3..6342f22 100644
--- a/disassembler/disassembler_mips.h
+++ b/disassembler/disassembler_mips.h
@@ -26,9 +26,8 @@
 
 class DisassemblerMips FINAL : public Disassembler {
  public:
-  DisassemblerMips(DisassemblerOptions* options, bool is64bit)
+  explicit DisassemblerMips(DisassemblerOptions* options)
       : Disassembler(options),
-        is64bit_(is64bit),
         last_ptr_(nullptr),
         last_instr_(0) {}
 
@@ -36,8 +35,6 @@
   void Dump(std::ostream& os, const uint8_t* begin, const uint8_t* end) OVERRIDE;
 
  private:
-  const bool is64bit_;
-
   // Address and encoding of the last disassembled instruction.
   // Needed to produce more readable disassembly of certain 2-instruction sequences.
   const uint8_t* last_ptr_;
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 147e0b1..2ca84e5 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -23,7 +23,6 @@
 
 #include "base/logging.h"
 #include "base/stringprintf.h"
-#include "thread.h"
 
 namespace art {
 namespace x86 {
@@ -1409,11 +1408,11 @@
   }
   if (prefix[1] == kFs && !supports_rex_) {
     args << "  ; ";
-    Thread::DumpThreadOffset<kX86PointerSize>(args, address_bits);
+    GetDisassemblerOptions()->thread_offset_name_function_(args, address_bits);
   }
   if (prefix[1] == kGs && supports_rex_) {
     args << "  ; ";
-    Thread::DumpThreadOffset<kX86_64PointerSize>(args, address_bits);
+    GetDisassemblerOptions()->thread_offset_name_function_(args, address_bits);
   }
   const char* prefix_str;
   switch (prefix[0]) {
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 77730b9..96c8e94 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -335,10 +335,14 @@
       resolved_addr2instr_(0),
       instruction_set_(oat_file_.GetOatHeader().GetInstructionSet()),
       disassembler_(Disassembler::Create(instruction_set_,
-                                         new DisassemblerOptions(options_.absolute_addresses_,
-                                                                 oat_file.Begin(),
-                                                                 oat_file.End(),
-                                                                 true /* can_read_literals_ */))) {
+                                         new DisassemblerOptions(
+                                             options_.absolute_addresses_,
+                                             oat_file.Begin(),
+                                             oat_file.End(),
+                                             true /* can_read_literals_ */,
+                                             Is64BitInstructionSet(instruction_set_)
+                                                 ? &Thread::DumpThreadOffset<PointerSize::k64>
+                                                 : &Thread::DumpThreadOffset<PointerSize::k32>))) {
     CHECK(options_.class_loader_ != nullptr);
     CHECK(options_.class_filter_ != nullptr);
     CHECK(options_.method_filter_ != nullptr);
@@ -1402,7 +1406,7 @@
   const std::vector<const OatFile::OatDexFile*> oat_dex_files_;
   const OatDumperOptions& options_;
   uint32_t resolved_addr2instr_;
-  InstructionSet instruction_set_;
+  const InstructionSet instruction_set_;
   std::set<uintptr_t> offsets_;
   Disassembler* disassembler_;
 };
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index c4ec726..e25e93f 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -191,7 +191,7 @@
     .cfi_rel_offset r11, 44
     .cfi_rel_offset ip, 48
     .cfi_rel_offset lr, 52
-    vpush {d0-d15}                      @ 32 words of float args.
+    vpush {d0-d15}                      @ 32 words, 2 for each of the 16 saved doubles.
     .cfi_adjust_cfa_offset 128
     sub sp, #8                          @ 2 words of space, alignment padding and Method*
     .cfi_adjust_cfa_offset 8
@@ -1030,11 +1030,49 @@
 END art_quick_set64_instance
 
     /*
-     * Entry from managed code to resolve a string, this stub will allocate a String and deliver an
-     * exception on error. On success the String is returned. R0 holds the string index. The fast
-     * path check for hit in strings cache has already been performed.
+     * Entry from managed code to resolve a string. This stub checks the dex cache for a
+     * matching string (the fast path) and, if not found, allocates the String, delivering
+     * an exception on error.
+     * On success the String is returned. R0 holds the string index.
      */
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+
+ENTRY art_quick_resolve_string
+    ldr    r1, [sp]                                              @ load referrer
+    ldr    r1, [r1, #ART_METHOD_DECLARING_CLASS_OFFSET]          @ load declaring class
+    ldr    r1, [r1, #DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET]   @ load string dex cache
+    ubfx   r2, r0, #0, #STRING_DEX_CACHE_HASH_BITS               @ get masked string index into r2
+    add    r1, r1, r2, LSL #STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT  @ get address of dex cache entry
+    ldrd   r2, r3, [r1]                                    @ load index into r3 and pointer into r2
+    cmp    r0, r3
+    bne    .Lart_quick_resolve_string_slow_path
+#ifdef USE_READ_BARRIER
+    ldr    r3, [rSELF, #THREAD_IS_GC_MARKING_OFFSET]
+    cbnz   r3, .Lart_quick_resolve_string_marking
+#endif
+    mov    r0, r2
+    bx     lr
+// Slow path case, the index did not match
+.Lart_quick_resolve_string_slow_path:
+    SETUP_SAVE_REFS_ONLY_FRAME r2                    @ save callee saves in case of GC
+    mov    r1, r9                                    @ pass Thread::Current
+    mov    r3, sp
+    bl     artResolveStringFromCode                  @ (int32_t string_idx, Thread* self)
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+// GC is marking case, need to check the mark bit.
+.Lart_quick_resolve_string_marking:
+    ldr    r3, [r2, MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tst    r3, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    mov    r0, r2
+    bne    .Lart_quick_resolve_string_no_rb
+    push   {r1, r2, r3, lr}                          @ Save r1-r3 and LR across the runtime call.
+    .cfi_adjust_cfa_offset 16
+    bl     artReadBarrierMark                        @ Get the marked string back.
+    pop    {r1, r2, r3, lr}                          @ Restore registers.
+    .cfi_adjust_cfa_offset -16
+.Lart_quick_resolve_string_no_rb:
+    bx     lr
+END art_quick_resolve_string
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 4289cab..202846a 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -331,6 +331,7 @@
 #endif
 
     // Save FP registers.
+    // For better performance, store d0 and d31 separately, so that all STPs are 16-byte aligned.
     str d0,       [sp, #8]
     stp d1, d2,   [sp, #16]
     stp d3, d4,   [sp, #32]
@@ -431,6 +432,7 @@
 
 .macro RESTORE_SAVE_EVERYTHING_FRAME
     // Restore FP registers.
+    // For better performance, load d0 and d31 separately, so that all LDPs are 16-byte aligned.
     ldr d0,       [sp, #8]
     ldp d1, d2,   [sp, #16]
     ldp d3, d4,   [sp, #32]
@@ -1784,11 +1786,48 @@
 END art_quick_set64_static
 
     /*
-     * Entry from managed code to resolve a string, this stub will allocate a String and deliver an
-     * exception on error. On success the String is returned. w0 holds the string index. The fast
-     * path check for hit in strings cache has already been performed.
+     * Entry from managed code to resolve a string. This stub checks the dex cache for a
+     * matching string (the fast path) and, if not found, allocates the String, delivering
+     * an exception on error.
+     * On success the String is returned. w0 holds the string index.
      */
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+
+ENTRY art_quick_resolve_string
+    ldr   x1, [sp]                                               // load referrer
+    ldr   w2, [x1, #ART_METHOD_DECLARING_CLASS_OFFSET]           // load declaring class
+    ldr   x1, [x2, #DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET]    // load string dex cache
+    and   x2, x0, #STRING_DEX_CACHE_SIZE_MINUS_ONE               // get masked string index into x2
+    ldr   x2, [x1, x2, lsl #STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT]  // load dex cache pair into x2
+    cmp   x0, x2, lsr #32                                         // compare against upper 32 bits
+    bne   .Lart_quick_resolve_string_slow_path
+    ubfx  x0, x2, #0, #32                                        // extract lower 32 bits into x0
+#ifdef USE_READ_BARRIER
+    // Most common case: GC is not marking.
+    ldr    w3, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
+    cbnz   x3, .Lart_quick_resolve_string_marking
+#endif
+    ret
+
+// Slow path case, the index did not match.
+.Lart_quick_resolve_string_slow_path:
+    SETUP_SAVE_REFS_ONLY_FRAME                      // save callee saves in case of GC
+    mov   x1, xSELF                                 // pass Thread::Current
+    bl    artResolveStringFromCode                  // (int32_t string_idx, Thread* self)
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+
+// GC is marking case, need to check the mark bit.
+.Lart_quick_resolve_string_marking:
+    ldr   x3, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tbnz  x3, #LOCK_WORD_MARK_BIT_SHIFT, .Lart_quick_resolve_string_no_rb
+    // Save LR so that we can return, also x1 for alignment purposes.
+    stp    x1, xLR, [sp, #-16]!                     // Save x1, LR.
+    bl     artReadBarrierMark                       // Get the marked string back.
+    ldp    x1, xLR, [sp], #16                       // Restore registers.
+.Lart_quick_resolve_string_no_rb:
+    ret
+
+END art_quick_resolve_string
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 80bb51d..10adb3a 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -1203,6 +1203,7 @@
 
 
 TEST_F(StubTest, StringCompareTo) {
+  TEST_DISABLED_FOR_STRING_COMPRESSION();
   // There is no StringCompareTo runtime entrypoint for __arm__ or __aarch64__.
 #if defined(__i386__) || defined(__mips__) || \
     (defined(__x86_64__) && !defined(__APPLE__))
diff --git a/runtime/arch/x86/fault_handler_x86.cc b/runtime/arch/x86/fault_handler_x86.cc
index 3efeb40..c7af249 100644
--- a/runtime/arch/x86/fault_handler_x86.cc
+++ b/runtime/arch/x86/fault_handler_x86.cc
@@ -191,6 +191,27 @@
         immediate_size = operand_size_prefix ? 2 : 4;
         break;
 
+      case 0xf6:
+      case 0xf7:
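+        // 0xF6 and 0xF7 are "group 3" opcodes: the reg field of the ModRM byte selects the
+        // operation, and of the forms handled here only TEST (/0) carries an immediate operand.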
+        modrm = *pc++;
+        has_modrm = true;
+        switch ((modrm >> 3) & 7) {  // Extract "reg/opcode" from "modr/m".
+          case 0:  // test
+            immediate_size = (opcode == 0xf6) ? 1 : (operand_size_prefix ? 2 : 4);
+            break;
+          case 2:  // not
+          case 3:  // neg
+          case 4:  // mul
+          case 5:  // imul
+          case 6:  // div
+          case 7:  // idiv
+            break;
+          default:
+            unhandled_instruction = true;
+            break;
+        }
+        break;
+
       default:
         unhandled_instruction = true;
         break;
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 2e9682e..282f10d 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1108,7 +1108,54 @@
     ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeRegionTLAB
 END_FUNCTION art_quick_alloc_object_region_tlab
 
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+DEFINE_FUNCTION art_quick_resolve_string
+    SETUP_SAVE_REFS_ONLY_FRAME  ebx, ebx
+    movl FRAME_SIZE_SAVE_REFS_ONLY(%esp), %ecx                   // get referrer
+    movl ART_METHOD_DECLARING_CLASS_OFFSET(%ecx), %ecx           // get declaring class
+    movl DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET(%ecx), %ecx    // get string dex cache
+    movl LITERAL(STRING_DEX_CACHE_SIZE_MINUS_ONE), %edx
+    andl %eax, %edx
+    shl LITERAL(STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT), %edx
+    addl %ecx, %edx
+    movlps (%edx), %xmm0                                     // load string idx and pointer to xmm0
+    movd %xmm0, %ecx                                         // extract pointer
+    pshufd LITERAL(0x55), %xmm0, %xmm0                       // shuffle index into lowest bits
+    movd %xmm0, %edx                                         // extract index
+    cmp %edx, %eax
+    jne .Lart_quick_resolve_string_slow_path
+    movl %ecx, %eax
+#ifdef USE_READ_BARRIER
+    cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET
+    jne .Lart_quick_resolve_string_marking
+#endif
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    ret
+.Lart_quick_resolve_string_slow_path:
+    // Outgoing argument set up
+    subl LITERAL(8), %esp                                        // push padding
+    CFI_ADJUST_CFA_OFFSET(8)
+    pushl %fs:THREAD_SELF_OFFSET                                 // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
+    PUSH eax                                                     // pass arg1
+    call SYMBOL(artResolveStringFromCode)
+    addl LITERAL(16), %esp                                       // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+.Lart_quick_resolve_string_marking:
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
+    jnz .Lart_quick_resolve_string_no_rb
+    subl LITERAL(12), %esp                                   // alignment padding
+    CFI_ADJUST_CFA_OFFSET(12)
+    PUSH eax                                                 // Pass the string as the first param.
+    call SYMBOL(artReadBarrierMark)
+    addl LITERAL(16), %esp
+    CFI_ADJUST_CFA_OFFSET(-16)
+.Lart_quick_resolve_string_no_rb:
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    ret
+END_FUNCTION art_quick_resolve_string
+
 ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 ONE_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 ONE_ARG_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 32768b0..62808ab 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1330,7 +1330,52 @@
     ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeInitializedRegionTLAB
 END_FUNCTION art_quick_alloc_object_initialized_region_tlab
 
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+DEFINE_FUNCTION art_quick_resolve_string
+    movq 8(%rsp), %rcx                                         // get referrer
+    movl ART_METHOD_DECLARING_CLASS_OFFSET(%rcx), %ecx         // get declaring class
+    movq DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET(%ecx), %rcx  // get string dex cache
+    movq LITERAL(STRING_DEX_CACHE_SIZE_MINUS_ONE), %rdx
+    andq %rdi, %rdx
+    shlq LITERAL(STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT), %rdx
+    addq %rcx, %rdx
+    movq %rax, %rcx
+    movq (%rdx), %rdx
+    movq %rdx, %rax
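+    // Each dex cache entry packs the String pointer into the low 32 bits and the string index
+    // into the high 32 bits.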
+    movl %eax, %eax
+    shrq LITERAL(32), %rdx
+    cmp %rdx, %rdi
+    jne .Lart_quick_resolve_string_slow_path
+#ifdef USE_READ_BARRIER
+    cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
+    jne .Lart_quick_resolve_string_marking
+#endif
+    ret
+// Slow path, the index did not match
+.Lart_quick_resolve_string_slow_path:
+    SETUP_SAVE_REFS_ONLY_FRAME
+    movq %rcx, %rax
+    // Outgoing argument set up
+    movq %gs:THREAD_SELF_OFFSET, %rsi           // pass Thread::Current()
+    call SYMBOL(artResolveStringFromCode)       // artResolveStringFromCode(string_idx, Thread*)
+    RESTORE_SAVE_REFS_ONLY_FRAME                // restore frame up to return address
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+// GC is marking case, need to check the mark bit.
+.Lart_quick_resolve_string_marking:
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%rax)
+    jnz .Lart_quick_resolve_string_no_rb
+    // Spill RDI and RSI around the call; the extra 8 bytes below keep the stack 16-byte aligned.
+    PUSH rdi
+    PUSH rsi
+    subq LITERAL(8), %rsp                         // 16 byte alignment
+    movq %rax, %rdi
+    call SYMBOL(artReadBarrierMark)
+    addq LITERAL(8), %rsp
+    POP  rsi
+    POP  rdi
+.Lart_quick_resolve_string_no_rb:
+    ret
+END_FUNCTION art_quick_resolve_string
+
 ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 ONE_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 ONE_ARG_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 848f8e5..102b993 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -19,12 +19,15 @@
 
 #if defined(__cplusplus)
 #include "art_method.h"
+#include "base/bit_utils.h"
 #include "gc/allocator/rosalloc.h"
 #include "gc/heap.h"
 #include "jit/jit.h"
 #include "lock_word.h"
 #include "mirror/class.h"
+#include "mirror/dex_cache.h"
 #include "mirror/string.h"
+#include "utils/dex_cache_arrays_layout.h"
 #include "runtime.h"
 #include "thread.h"
 #endif
diff --git a/runtime/base/arena_allocator_test.cc b/runtime/base/arena_allocator_test.cc
index 9de3cc4..fd48a3f 100644
--- a/runtime/base/arena_allocator_test.cc
+++ b/runtime/base/arena_allocator_test.cc
@@ -16,6 +16,7 @@
 
 #include "base/arena_allocator.h"
 #include "base/arena_bit_vector.h"
+#include "base/memory_tool.h"
 #include "gtest/gtest.h"
 
 namespace art {
@@ -124,4 +125,221 @@
   }
 }
 
+TEST_F(ArenaAllocatorTest, AllocAlignment) {
+  ArenaPool pool;
+  ArenaAllocator arena(&pool);
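+  // Alloc() is expected to round each request up to kAlignment, so every returned pointer
+  // stays kAlignment-aligned regardless of the requested size.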
+  for (size_t iterations = 0; iterations <= 10; ++iterations) {
+    for (size_t size = 1; size <= ArenaAllocator::kAlignment + 1; ++size) {
+      void* allocation = arena.Alloc(size);
+      EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(allocation))
+          << reinterpret_cast<uintptr_t>(allocation);
+    }
+  }
+}
+
+TEST_F(ArenaAllocatorTest, ReallocReuse) {
+  // Realloc does not reuse arenas when running under sanitization, so we cannot do these checks.
+  if (RUNNING_ON_MEMORY_TOOL != 0) {
+    printf("WARNING: TEST DISABLED FOR MEMORY_TOOL\n");
+    return;
+  }
+
+  {
+    // Case 1: small aligned allocation, aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2;
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = ArenaAllocator::kAlignment * 3;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_EQ(original_allocation, realloc_allocation);
+  }
+
+  {
+    // Case 2: small aligned allocation, non-aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2;
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_EQ(original_allocation, realloc_allocation);
+  }
+
+  {
+    // Case 3: small non-aligned allocation, aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = ArenaAllocator::kAlignment * 4;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_EQ(original_allocation, realloc_allocation);
+  }
+
+  {
+    // Case 4: small non-aligned allocation, aligned non-extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = ArenaAllocator::kAlignment * 3;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_EQ(original_allocation, realloc_allocation);
+  }
+
+  // The next part is brittle, as the default size for an arena is variable, and we don't know
+  // whether sanitization (which changes allocation sizes) is enabled.
+
+  {
+    // Case 5: large allocation, aligned extend into next arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = Arena::kDefaultSize - ArenaAllocator::kAlignment * 5;
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = Arena::kDefaultSize + ArenaAllocator::kAlignment * 2;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_NE(original_allocation, realloc_allocation);
+  }
+
+  {
+    // Case 6: large allocation, non-aligned extend into next arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = Arena::kDefaultSize -
+        ArenaAllocator::kAlignment * 4 -
+        ArenaAllocator::kAlignment / 2;
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = Arena::kDefaultSize +
+        ArenaAllocator::kAlignment * 2 +
+        ArenaAllocator::kAlignment / 2;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_NE(original_allocation, realloc_allocation);
+  }
+}
+
+TEST_F(ArenaAllocatorTest, ReallocAlignment) {
+  {
+    // Case 1: small aligned allocation, aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2;
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = ArenaAllocator::kAlignment * 3;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  {
+    // Case 2: small aligned allocation, non-aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2;
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  {
+    // Case 3: small non-aligned allocation, aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = ArenaAllocator::kAlignment * 4;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  {
+    // Case 4: small non-aligned allocation, aligned non-extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = ArenaAllocator::kAlignment * 3;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  // The next part is brittle, as the default size for an arena is variable, and we don't know
+  // whether sanitization (which changes allocation sizes) is enabled.
+
+  {
+    // Case 5: large allocation, aligned extend into next arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = Arena::kDefaultSize - ArenaAllocator::kAlignment * 5;
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = Arena::kDefaultSize + ArenaAllocator::kAlignment * 2;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  {
+    // Case 6: large allocation, non-aligned extend into next arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = Arena::kDefaultSize -
+        ArenaAllocator::kAlignment * 4 -
+        ArenaAllocator::kAlignment / 2;
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = Arena::kDefaultSize +
+        ArenaAllocator::kAlignment * 2 +
+        ArenaAllocator::kAlignment / 2;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+}
+
+
 }  // namespace art
diff --git a/runtime/base/dchecked_vector.h b/runtime/base/dchecked_vector.h
index 51dfba8..77f0ea2 100644
--- a/runtime/base/dchecked_vector.h
+++ b/runtime/base/dchecked_vector.h
@@ -59,10 +59,8 @@
       : Base() { }
   explicit dchecked_vector(const allocator_type& alloc)
       : Base(alloc) { }
-  // Note that we cannot forward to std::vector(size_type, const allocator_type&) because it is not
-  // available in C++11, which is the latest GCC can support. http://b/25022512
   explicit dchecked_vector(size_type n, const allocator_type& alloc = allocator_type())
-      : Base(alloc) { resize(n); }
+      : Base(n, alloc) { }
   dchecked_vector(size_type n,
                   const value_type& value,
                   const allocator_type& alloc = allocator_type())
diff --git a/runtime/base/histogram-inl.h b/runtime/base/histogram-inl.h
index 4af47d1..ca9a694 100644
--- a/runtime/base/histogram-inl.h
+++ b/runtime/base/histogram-inl.h
@@ -228,10 +228,8 @@
   DCHECK_LE(std::abs(out_data->perc_.back() - 1.0), 0.001);
 }
 
-#if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wfloat-equal"
-#endif
 
 template <class Value>
 inline double Histogram<Value>::Percentile(double per, const CumulativeData& data) const {
@@ -273,9 +271,7 @@
   return value;
 }
 
-#if defined(__clang__)
 #pragma clang diagnostic pop
-#endif
 
 }  // namespace art
 #endif  // ART_RUNTIME_BASE_HISTOGRAM_INL_H_
diff --git a/runtime/base/macros.h b/runtime/base/macros.h
index 5a50247..0ec6e6d 100644
--- a/runtime/base/macros.h
+++ b/runtime/base/macros.h
@@ -30,16 +30,8 @@
   _rc; })
 #endif
 
-#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-
-// C++11 final and override keywords that were introduced in GCC version 4.7.
-#if defined(__clang__) || GCC_VERSION >= 40700
 #define OVERRIDE override
 #define FINAL final
-#else
-#define OVERRIDE
-#define FINAL
-#endif
 
 // Declare a friend relationship in a class with a test. Used rather that FRIEND_TEST to avoid
 // globally importing gtest/gtest.h into the main ART header files.
@@ -158,12 +150,9 @@
 #define ALWAYS_INLINE  __attribute__ ((always_inline))
 #endif
 
-#ifdef __clang__
-/* clang doesn't like attributes on lambda functions */
+// clang doesn't like attributes on lambda functions. It would be nice to say:
+//   #define ALWAYS_INLINE_LAMBDA ALWAYS_INLINE
 #define ALWAYS_INLINE_LAMBDA
-#else
-#define ALWAYS_INLINE_LAMBDA ALWAYS_INLINE
-#endif
 
 #define NO_INLINE __attribute__ ((noinline))
 
@@ -228,75 +217,46 @@
 //
 //  In either case this macro has no effect on runtime behavior and performance
 //  of code.
-#if defined(__clang__) && __cplusplus >= 201103L && defined(__has_warning)
 #if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
 #define FALLTHROUGH_INTENDED [[clang::fallthrough]]  // NOLINT
 #endif
-#endif
 
 #ifndef FALLTHROUGH_INTENDED
 #define FALLTHROUGH_INTENDED do { } while (0)
 #endif
 
 // Annotalysis thread-safety analysis support.
-#if defined(__SUPPORT_TS_ANNOTATION__) || defined(__clang__)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)   __attribute__((x))
-#else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)   // no-op
-#endif
 
-#define ACQUIRED_AFTER(...) THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
-#define ACQUIRED_BEFORE(...) THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
-#define GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(guarded)
-#define LOCK_RETURNED(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
-#define NO_THREAD_SAFETY_ANALYSIS THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+#define ACQUIRED_AFTER(...) __attribute__((acquired_after(__VA_ARGS__)))
+#define ACQUIRED_BEFORE(...) __attribute__((acquired_before(__VA_ARGS__)))
+#define GUARDED_BY(x) __attribute__((guarded_by(x)))
+#define GUARDED_VAR __attribute__((guarded))
+#define LOCK_RETURNED(x) __attribute__((lock_returned(x)))
+#define NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis))
 #define PT_GUARDED_BY(x)
 // THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded_by(x))
-#define PT_GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded)
-#define SCOPED_LOCKABLE THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define PT_GUARDED_VAR __attribute__((point_to_guarded))
+#define SCOPED_LOCKABLE __attribute__((scoped_lockable))
 
-#if defined(__clang__)
-#define EXCLUSIVE_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock_function(__VA_ARGS__))
-#define EXCLUSIVE_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock_function(__VA_ARGS__))
-#define SHARED_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_lock_function(__VA_ARGS__))
-#define SHARED_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock_function(__VA_ARGS__))
-#define UNLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(unlock_function(__VA_ARGS__))
-#define REQUIRES(...) THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
-#define SHARED_REQUIRES(...) THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
-#define CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(capability(__VA_ARGS__))
-#define SHARED_CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_capability(__VA_ARGS__))
-#define ASSERT_CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(__VA_ARGS__))
-#define ASSERT_SHARED_CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(__VA_ARGS__))
-#define RETURN_CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(__VA_ARGS__))
-#define TRY_ACQUIRE(...) THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
-#define TRY_ACQUIRE_SHARED(...) THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
-#define ACQUIRE(...) THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
-#define ACQUIRE_SHARED(...) THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
-#define RELEASE(...) THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
-#define RELEASE_SHARED(...) THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
-#else
-#define EXCLUSIVE_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock(__VA_ARGS__))
-#define EXCLUSIVE_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock(__VA_ARGS__))
-#define SHARED_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_lock(__VA_ARGS__))
-#define SHARED_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock(__VA_ARGS__))
-#define UNLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(unlock(__VA_ARGS__))
-#define REQUIRES(...)
-#define SHARED_REQUIRES(...)
-#define CAPABILITY(...)
-#define SHARED_CAPABILITY(...)
-#define ASSERT_CAPABILITY(...)
-#define ASSERT_SHARED_CAPABILITY(...)
-#define RETURN_CAPABILITY(...)
-#define TRY_ACQUIRE(...)
-#define TRY_ACQUIRE_SHARED(...)
-#define ACQUIRE(...)
-#define ACQUIRE_SHARED(...)
-#define RELEASE(...)
-#define RELEASE_SHARED(...)
-#define SCOPED_CAPABILITY
-#endif
+#define EXCLUSIVE_LOCK_FUNCTION(...) __attribute__((exclusive_lock_function(__VA_ARGS__)))
+#define EXCLUSIVE_TRYLOCK_FUNCTION(...) __attribute__((exclusive_trylock_function(__VA_ARGS__)))
+#define SHARED_LOCK_FUNCTION(...) __attribute__((shared_lock_function(__VA_ARGS__)))
+#define SHARED_TRYLOCK_FUNCTION(...) __attribute__((shared_trylock_function(__VA_ARGS__)))
+#define UNLOCK_FUNCTION(...) __attribute__((unlock_function(__VA_ARGS__)))
+#define REQUIRES(...) __attribute__((requires_capability(__VA_ARGS__)))
+#define SHARED_REQUIRES(...) __attribute__((requires_shared_capability(__VA_ARGS__)))
+#define CAPABILITY(...) __attribute__((capability(__VA_ARGS__)))
+#define SHARED_CAPABILITY(...) __attribute__((shared_capability(__VA_ARGS__)))
+#define ASSERT_CAPABILITY(...) __attribute__((assert_capability(__VA_ARGS__)))
+#define ASSERT_SHARED_CAPABILITY(...) __attribute__((assert_shared_capability(__VA_ARGS__)))
+#define RETURN_CAPABILITY(...) __attribute__((lock_returned(__VA_ARGS__)))
+#define TRY_ACQUIRE(...) __attribute__((try_acquire_capability(__VA_ARGS__)))
+#define TRY_ACQUIRE_SHARED(...) __attribute__((try_acquire_shared_capability(__VA_ARGS__)))
+#define ACQUIRE(...) __attribute__((acquire_capability(__VA_ARGS__)))
+#define ACQUIRE_SHARED(...) __attribute__((acquire_shared_capability(__VA_ARGS__)))
+#define RELEASE(...) __attribute__((release_capability(__VA_ARGS__)))
+#define RELEASE_SHARED(...) __attribute__((release_shared_capability(__VA_ARGS__)))
+#define SCOPED_CAPABILITY __attribute__((scoped_lockable))
 
 #define LOCKABLE CAPABILITY("mutex")
 #define SHARED_LOCKABLE SHARED_CAPABILITY("mutex")
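
The annotation macros above now always expand to clang's thread-safety attributes. A minimal standalone sketch (not ART code; names are illustrative) of how such annotations are consumed, assuming the code is built with clang and -Wthread-safety:

#include <mutex>

#define CAPABILITY(x) __attribute__((capability(x)))
#define GUARDED_BY(x) __attribute__((guarded_by(x)))
#define REQUIRES(...) __attribute__((requires_capability(__VA_ARGS__)))
#define ACQUIRE(...) __attribute__((acquire_capability(__VA_ARGS__)))
#define RELEASE(...) __attribute__((release_capability(__VA_ARGS__)))

class CAPABILITY("mutex") Mutex {
 public:
  void Lock() ACQUIRE() { mu_.lock(); }
  void Unlock() RELEASE() { mu_.unlock(); }

 private:
  std::mutex mu_;
};

class Counter {
 public:
  Mutex lock_;

  // -Wthread-safety warns at compile time if a caller does not hold lock_ here.
  void Increment() REQUIRES(lock_) { value_++; }

 private:
  int value_ GUARDED_BY(lock_) = 0;
};

int main() {
  Counter c;
  c.lock_.Lock();
  c.Increment();
  c.lock_.Unlock();
  return 0;
}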
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 264a530..fec918b 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -98,12 +98,7 @@
   }
 
   ~ScopedAllMutexesLock() {
-#if !defined(__clang__)
-    // TODO: remove this workaround target GCC/libc++/bionic bug "invalid failure memory model".
-    while (!gAllMutexData->all_mutexes_guard.CompareExchangeWeakSequentiallyConsistent(mutex_, 0)) {
-#else
     while (!gAllMutexData->all_mutexes_guard.CompareExchangeWeakRelease(mutex_, 0)) {
-#endif
       NanoSleep(100);
     }
   }
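
The surviving branch above spins on a release-ordered weak compare-exchange. A minimal standalone sketch (not ART code) of the same unlock pattern expressed with std::atomic:

#include <atomic>
#include <cstdint>

struct SpinGuard {
  std::atomic<uintptr_t> word{0};

  // Release the guard by swinging the word from our owner value back to 0.
  void Unlock(uintptr_t owner) {
    uintptr_t expected = owner;
    // The weak CAS may fail spuriously, so retry; release ordering publishes
    // the critical section to the next acquirer.
    while (!word.compare_exchange_weak(expected, 0,
                                       std::memory_order_release,
                                       std::memory_order_relaxed)) {
      expected = owner;  // compare_exchange_weak rewrote expected on failure
    }
  }
};

int main() {
  SpinGuard g;
  g.word.store(42);
  g.Unlock(42);
  return g.word.load() == 0 ? 0 : 1;
}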
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 1a3bba5..f4400c3 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -4546,7 +4546,8 @@
     }
     self->AllowThreadSuspension();
 
-    CHECK_EQ(klass->GetStatus(), mirror::Class::kStatusVerified) << PrettyClass(klass.Get());
+    CHECK_EQ(klass->GetStatus(), mirror::Class::kStatusVerified) << PrettyClass(klass.Get())
+        << " self.tid=" << self->GetTid() << " clinit.tid=" << klass->GetClinitThreadId();
 
     // From here out other threads may observe that we're initializing and so changes of state
     // require a notification.
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index f445e52..2d16a49 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -207,6 +207,12 @@
     return; \
   }
 
+#define TEST_DISABLED_FOR_STRING_COMPRESSION() \
+  if (mirror::kUseStringCompression) { \
+    printf("WARNING: TEST DISABLED FOR STRING COMPRESSION\n"); \
+    return; \
+  }
+
 }  // namespace art
 
 namespace std {
diff --git a/runtime/common_throws.cc b/runtime/common_throws.cc
index 99732c6..e1da23c 100644
--- a/runtime/common_throws.cc
+++ b/runtime/common_throws.cc
@@ -402,6 +402,16 @@
                                                dex_file, type);
 }
 
+static bool IsValidReadBarrierImplicitCheck(uintptr_t addr) {
+  DCHECK(kEmitCompilerReadBarrier);
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Uint32Value();
+  if (kUseBakerReadBarrier && (kRuntimeISA == kX86 || kRuntimeISA == kX86_64)) {
+    constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+    monitor_offset += gray_byte_position;
+  }
+  return addr == monitor_offset;
+}
+
 static bool IsValidImplicitCheck(uintptr_t addr, ArtMethod* method, const Instruction& instr)
     SHARED_REQUIRES(Locks::mutator_lock_) {
   if (!CanDoImplicitNullCheckOn(addr)) {
@@ -424,9 +434,13 @@
       return true;
     }
 
+    case Instruction::IGET_OBJECT:
+      if (kEmitCompilerReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
+        return true;
+      }
+      FALLTHROUGH_INTENDED;
     case Instruction::IGET:
     case Instruction::IGET_WIDE:
-    case Instruction::IGET_OBJECT:
     case Instruction::IGET_BOOLEAN:
     case Instruction::IGET_BYTE:
     case Instruction::IGET_CHAR:
@@ -440,18 +454,20 @@
     case Instruction::IPUT_SHORT: {
       ArtField* field =
           Runtime::Current()->GetClassLinker()->ResolveField(instr.VRegC_22c(), method, false);
-      return (addr == 0) ||
-          (addr == field->GetOffset().Uint32Value()) ||
-          (kEmitCompilerReadBarrier && (addr == mirror::Object::MonitorOffset().Uint32Value()));
+      return (addr == 0) || (addr == field->GetOffset().Uint32Value());
     }
 
+    case Instruction::IGET_OBJECT_QUICK:
+      if (kEmitCompilerReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
+        return true;
+      }
+      FALLTHROUGH_INTENDED;
     case Instruction::IGET_QUICK:
     case Instruction::IGET_BOOLEAN_QUICK:
     case Instruction::IGET_BYTE_QUICK:
     case Instruction::IGET_CHAR_QUICK:
     case Instruction::IGET_SHORT_QUICK:
     case Instruction::IGET_WIDE_QUICK:
-    case Instruction::IGET_OBJECT_QUICK:
     case Instruction::IPUT_QUICK:
     case Instruction::IPUT_BOOLEAN_QUICK:
     case Instruction::IPUT_BYTE_QUICK:
@@ -459,14 +475,16 @@
     case Instruction::IPUT_SHORT_QUICK:
     case Instruction::IPUT_WIDE_QUICK:
     case Instruction::IPUT_OBJECT_QUICK: {
-      return (addr == 0u) ||
-          (addr == instr.VRegC_22c()) ||
-          (kEmitCompilerReadBarrier && (addr == mirror::Object::MonitorOffset().Uint32Value()));
+      return (addr == 0u) || (addr == instr.VRegC_22c());
     }
 
+    case Instruction::AGET_OBJECT:
+      if (kEmitCompilerReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
+        return true;
+      }
+      FALLTHROUGH_INTENDED;
     case Instruction::AGET:
     case Instruction::AGET_WIDE:
-    case Instruction::AGET_OBJECT:
     case Instruction::AGET_BOOLEAN:
     case Instruction::AGET_BYTE:
     case Instruction::AGET_CHAR:
@@ -482,9 +500,7 @@
     case Instruction::ARRAY_LENGTH: {
       // The length access should crash. We currently do not do implicit checks on
       // the array access itself.
-      return (addr == 0u) ||
-          (addr == mirror::Array::LengthOffset().Uint32Value()) ||
-          (kEmitCompilerReadBarrier && (addr == mirror::Object::MonitorOffset().Uint32Value()));
+      return (addr == 0u) || (addr == mirror::Array::LengthOffset().Uint32Value());
     }
 
     default: {
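
A minimal standalone sketch (not ART code; the offsets are assumptions for illustration) of the address check introduced by IsValidReadBarrierImplicitCheck above: with a Baker read barrier on x86/x86-64, the barrier reads the byte of the lock word that holds the gray bit, so the implicit-null-check fault address is the monitor offset plus that byte's position.

#include <cassert>
#include <cstdint>

constexpr uint32_t kMonitorOffset = 4;           // assumed mirror::Object layout
constexpr uint32_t kReadBarrierStateShift = 28;  // assumed lock word layout
constexpr uint32_t kBitsPerByte = 8;

bool IsReadBarrierFaultAddress(uintptr_t addr, bool baker_on_x86) {
  uint32_t expected = kMonitorOffset;
  if (baker_on_x86) {
    expected += kReadBarrierStateShift / kBitsPerByte;  // byte holding the gray bit
  }
  return addr == expected;
}

int main() {
  assert(IsReadBarrierFaultAddress(4, /* baker_on_x86 */ false));
  assert(IsReadBarrierFaultAddress(4 + 28 / 8, /* baker_on_x86 */ true));
  return 0;
}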
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 2a5198b..a5b0689 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -1286,8 +1286,7 @@
   if (c->IsStringClass()) {
     // Special case for java.lang.String.
     gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-    mirror::SetStringCountVisitor visitor(0);
-    new_object = mirror::String::Alloc<true>(self, 0, allocator_type, visitor);
+    new_object = mirror::String::AllocEmptyString<true>(self, allocator_type);
   } else {
     new_object = c->AllocObject(self);
   }
@@ -4059,7 +4058,7 @@
   // Prepare JDWP ids for the reply.
   JDWP::JdwpTag result_tag = BasicTagFromDescriptor(m->GetShorty());
   const bool is_object_result = (result_tag == JDWP::JT_OBJECT);
-  StackHandleScope<2> hs(soa.Self());
+  StackHandleScope<3> hs(soa.Self());
   Handle<mirror::Object> object_result = hs.NewHandle(is_object_result ? result.GetL() : nullptr);
   Handle<mirror::Throwable> exception = hs.NewHandle(soa.Self()->GetException());
   soa.Self()->ClearException();
@@ -4098,10 +4097,17 @@
     // unless we threw, in which case we return null.
     DCHECK_EQ(JDWP::JT_VOID, result_tag);
     if (exceptionObjectId == 0) {
-      // TODO we could keep the receiver ObjectId in the DebugInvokeReq to avoid looking into the
-      // object registry.
-      result_value = GetObjectRegistry()->Add(pReq->receiver.Read());
-      result_tag = TagFromObject(soa, pReq->receiver.Read());
+      if (m->GetDeclaringClass()->IsStringClass()) {
+        // For string constructors, the new string is remapped to the receiver (stored in ref).
+        Handle<mirror::Object> decoded_ref = hs.NewHandle(soa.Self()->DecodeJObject(ref.get()));
+        result_value = gRegistry->Add(decoded_ref);
+        result_tag = TagFromObject(soa, decoded_ref.Get());
+      } else {
+        // TODO we could keep the receiver ObjectId in the DebugInvokeReq to avoid looking into the
+        // object registry.
+        result_value = GetObjectRegistry()->Add(pReq->receiver.Read());
+        result_tag = TagFromObject(soa, pReq->receiver.Read());
+      }
     } else {
       result_value = 0;
       result_tag = JDWP::JT_OBJECT;
@@ -4327,10 +4333,16 @@
     Handle<mirror::String> name(hs.NewHandle(t->GetThreadName(soa)));
     size_t char_count = (name.Get() != nullptr) ? name->GetLength() : 0;
     const jchar* chars = (name.Get() != nullptr) ? name->GetValue() : nullptr;
+    bool is_compressed = (name.Get() != nullptr) ? name->IsCompressed() : false;
 
     std::vector<uint8_t> bytes;
     JDWP::Append4BE(bytes, t->GetThreadId());
-    JDWP::AppendUtf16BE(bytes, chars, char_count);
+    if (is_compressed) {
+      const uint8_t* chars_compressed = name->GetValueCompressed();
+      JDWP::AppendUtf16CompressedBE(bytes, chars_compressed, char_count);
+    } else {
+      JDWP::AppendUtf16BE(bytes, chars, char_count);
+    }
     CHECK_EQ(bytes.size(), char_count*2 + sizeof(uint32_t)*2);
     Dbg::DdmSendChunk(type, bytes);
   }
diff --git a/runtime/entrypoints/quick/quick_math_entrypoints.cc b/runtime/entrypoints/quick/quick_math_entrypoints.cc
index 1c658b7..51d2784 100644
--- a/runtime/entrypoints/quick/quick_math_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_math_entrypoints.cc
@@ -18,10 +18,8 @@
 
 namespace art {
 
-#if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wfloat-equal"
-#endif
 
 int CmplFloat(float a, float b) {
   if (a == b) {
@@ -67,9 +65,7 @@
   return -1;
 }
 
-#if defined(__clang__)
 #pragma clang diagnostic pop
-#endif
 
 extern "C" int64_t artLmul(int64_t a, int64_t b) {
   return a * b;
diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h
index 4cf5b4f..9feaf41 100644
--- a/runtime/gc/accounting/space_bitmap-inl.h
+++ b/runtime/gc/accounting/space_bitmap-inl.h
@@ -36,7 +36,7 @@
   const uintptr_t offset = addr - heap_begin_;
   const size_t index = OffsetToIndex(offset);
   const uintptr_t mask = OffsetToMask(offset);
-  Atomic<uintptr_t>* atomic_entry = reinterpret_cast<Atomic<uintptr_t>*>(&bitmap_begin_[index]);
+  Atomic<uintptr_t>* atomic_entry = &bitmap_begin_[index];
   DCHECK_LT(index, bitmap_size_ / sizeof(intptr_t)) << " bitmap_size_ = " << bitmap_size_;
   uintptr_t old_word;
   do {
@@ -58,7 +58,7 @@
   DCHECK(bitmap_begin_ != nullptr);
   DCHECK_GE(addr, heap_begin_);
   const uintptr_t offset = addr - heap_begin_;
-  return (bitmap_begin_[OffsetToIndex(offset)] & OffsetToMask(offset)) != 0;
+  return (bitmap_begin_[OffsetToIndex(offset)].LoadRelaxed() & OffsetToMask(offset)) != 0;
 }
 
 template<size_t kAlignment> template<typename Visitor>
@@ -116,7 +116,7 @@
 
     // Traverse the middle, full part.
     for (size_t i = index_start + 1; i < index_end; ++i) {
-      uintptr_t w = bitmap_begin_[i];
+      uintptr_t w = bitmap_begin_[i].LoadRelaxed();
       if (w != 0) {
         const uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
         do {
@@ -164,8 +164,8 @@
   const size_t index = OffsetToIndex(offset);
   const uintptr_t mask = OffsetToMask(offset);
   DCHECK_LT(index, bitmap_size_ / sizeof(intptr_t)) << " bitmap_size_ = " << bitmap_size_;
-  uintptr_t* address = &bitmap_begin_[index];
-  uintptr_t old_word = *address;
+  Atomic<uintptr_t>* atomic_entry = &bitmap_begin_[index];
+  uintptr_t old_word = atomic_entry->LoadRelaxed();
   if (kSetBit) {
     // Check the bit before setting the word in case we are trying to mark a read only bitmap
     // like an image space bitmap. This bitmap is mapped as read only and will fault if we
@@ -173,10 +173,10 @@
     // occur if we check before setting the bit. This also prevents dirty pages that would
     // occur if the bitmap was read write and we did not check the bit.
     if ((old_word & mask) == 0) {
-      *address = old_word | mask;
+      atomic_entry->StoreRelaxed(old_word | mask);
     }
   } else {
-    *address = old_word & ~mask;
+    atomic_entry->StoreRelaxed(old_word & ~mask);
   }
   DCHECK_EQ(Test(obj), kSetBit);
   return (old_word & mask) != 0;
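
A minimal standalone sketch (not ART code) of the pattern the hunks above switch to: the bitmap words become atomics, the GC-thread-only Set path uses relaxed loads and stores with a pre-check to avoid dirtying read-only pages, and readers use relaxed loads.

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

class WordBitmap {
 public:
  explicit WordBitmap(size_t num_bits) : words_((num_bits + kBits - 1) / kBits) {}

  // Single-writer Set: only valid when one thread (e.g. the GC thread) owns the
  // bitmap, mirroring the LoadRelaxed/StoreRelaxed path above.
  bool Set(size_t bit) {
    const size_t index = bit / kBits;
    const uintptr_t mask = uintptr_t{1} << (bit % kBits);
    uintptr_t old_word = words_[index].load(std::memory_order_relaxed);
    if ((old_word & mask) == 0) {  // skip the store (and a dirty page) if already set
      words_[index].store(old_word | mask, std::memory_order_relaxed);
    }
    return (old_word & mask) != 0;  // true if the bit was already set
  }

  bool Test(size_t bit) const {
    const size_t index = bit / kBits;
    const uintptr_t mask = uintptr_t{1} << (bit % kBits);
    return (words_[index].load(std::memory_order_relaxed) & mask) != 0;
  }

 private:
  static constexpr size_t kBits = sizeof(uintptr_t) * 8;
  std::vector<std::atomic<uintptr_t>> words_;  // zero-initialized by the vector
};

int main() {
  WordBitmap bitmap(256);
  assert(!bitmap.Set(17));  // newly set
  assert(bitmap.Test(17));
  assert(bitmap.Set(17));   // already set
  return 0;
}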
diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc
index b43f77f..3df02ed 100644
--- a/runtime/gc/accounting/space_bitmap.cc
+++ b/runtime/gc/accounting/space_bitmap.cc
@@ -51,7 +51,9 @@
 template<size_t kAlignment>
 SpaceBitmap<kAlignment>::SpaceBitmap(const std::string& name, MemMap* mem_map, uintptr_t* bitmap_begin,
                                      size_t bitmap_size, const void* heap_begin)
-    : mem_map_(mem_map), bitmap_begin_(bitmap_begin), bitmap_size_(bitmap_size),
+    : mem_map_(mem_map),
+      bitmap_begin_(reinterpret_cast<Atomic<uintptr_t>*>(bitmap_begin)),
+      bitmap_size_(bitmap_size),
       heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)),
       name_(name) {
   CHECK(bitmap_begin_ != nullptr);
@@ -104,7 +106,12 @@
 template<size_t kAlignment>
 void SpaceBitmap<kAlignment>::CopyFrom(SpaceBitmap* source_bitmap) {
   DCHECK_EQ(Size(), source_bitmap->Size());
-  std::copy(source_bitmap->Begin(), source_bitmap->Begin() + source_bitmap->Size() / sizeof(intptr_t), Begin());
+  const size_t count = source_bitmap->Size() / sizeof(intptr_t);
+  Atomic<uintptr_t>* const src = source_bitmap->Begin();
+  Atomic<uintptr_t>* const dest = Begin();
+  for (size_t i = 0; i < count; ++i) {
+    dest[i].StoreRelaxed(src[i].LoadRelaxed());
+  }
 }
 
 template<size_t kAlignment>
@@ -113,9 +120,9 @@
   CHECK(callback != nullptr);
 
   uintptr_t end = OffsetToIndex(HeapLimit() - heap_begin_ - 1);
-  uintptr_t* bitmap_begin = bitmap_begin_;
+  Atomic<uintptr_t>* bitmap_begin = bitmap_begin_;
   for (uintptr_t i = 0; i <= end; ++i) {
-    uintptr_t w = bitmap_begin[i];
+    uintptr_t w = bitmap_begin[i].LoadRelaxed();
     if (w != 0) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
       do {
@@ -160,10 +167,10 @@
   size_t start = OffsetToIndex(sweep_begin - live_bitmap.heap_begin_);
   size_t end = OffsetToIndex(sweep_end - live_bitmap.heap_begin_ - 1);
   CHECK_LT(end, live_bitmap.Size() / sizeof(intptr_t));
-  uintptr_t* live = live_bitmap.bitmap_begin_;
-  uintptr_t* mark = mark_bitmap.bitmap_begin_;
+  Atomic<uintptr_t>* live = live_bitmap.bitmap_begin_;
+  Atomic<uintptr_t>* mark = mark_bitmap.bitmap_begin_;
   for (size_t i = start; i <= end; i++) {
-    uintptr_t garbage = live[i] & ~mark[i];
+    uintptr_t garbage = live[i].LoadRelaxed() & ~mark[i].LoadRelaxed();
     if (UNLIKELY(garbage != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + live_bitmap.heap_begin_;
       do {
@@ -251,7 +258,7 @@
   uintptr_t end = Size() / sizeof(intptr_t);
   for (uintptr_t i = 0; i < end; ++i) {
     // Need uint for unsigned shift.
-    uintptr_t w = bitmap_begin_[i];
+    uintptr_t w = bitmap_begin_[i].LoadRelaxed();
     if (UNLIKELY(w != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
       while (w != 0) {
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index b8ff471..829b1b1 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -147,7 +147,7 @@
   void CopyFrom(SpaceBitmap* source_bitmap);
 
   // Starting address of our internal storage.
-  uintptr_t* Begin() {
+  Atomic<uintptr_t>* Begin() {
     return bitmap_begin_;
   }
 
@@ -215,7 +215,7 @@
   std::unique_ptr<MemMap> mem_map_;
 
   // This bitmap itself, word sized for efficiency in scanning.
-  uintptr_t* const bitmap_begin_;
+  Atomic<uintptr_t>* const bitmap_begin_;
 
   // Size of this bitmap.
   size_t bitmap_size_;
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index fb774a4..76f500c 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -34,32 +34,27 @@
   // to gray even though the object has already been marked through. This happens if a mutator
   // thread gets preempted before the AtomicSetReadBarrierPointer below, GC marks through the
   // object (changes it from white to gray and back to white), and the thread runs and
-  // incorrectly changes it from white to gray. We need to detect such "false gray" cases and
-  // change the objects back to white at the end of marking.
+  // incorrectly changes it from white to gray. If this happens, the object will get added to the
+  // mark stack again and get changed back to white after it is processed.
   if (kUseBakerReadBarrier) {
-    // Test the bitmap first to reduce the chance of false gray cases.
+    // Test the bitmap first so that, most of the time, we avoid graying an object that has
+    // already been marked through.
     if (bitmap->Test(ref)) {
       return ref;
     }
   }
   // This may or may not succeed, which is ok because the object may already be gray.
-  bool cas_success = false;
+  bool success = false;
   if (kUseBakerReadBarrier) {
-    cas_success = ref->AtomicSetReadBarrierPointer(ReadBarrier::WhitePtr(),
-                                                   ReadBarrier::GrayPtr());
-  }
-  if (bitmap->AtomicTestAndSet(ref)) {
-    // Already marked.
-    if (kUseBakerReadBarrier &&
-        cas_success &&
-        // The object could be white here if a thread gets preempted after a success at the
-        // above AtomicSetReadBarrierPointer, GC has marked through it, and the thread runs up
-        // to this point.
-        ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr()) {
-      // Register a "false-gray" object to change it from gray to white at the end of marking.
-      PushOntoFalseGrayStack(ref);
-    }
+    // The GC will mark the bitmap when popping from the mark stack. If only the GC touches the
+    // bitmap, we can avoid an expensive CAS.
+    // For the Baker case, an object is marked if either the mark bit is set or the bitmap bit
+    // is set.
+    success = ref->AtomicSetReadBarrierPointer(ReadBarrier::WhitePtr(), ReadBarrier::GrayPtr());
   } else {
+    success = !bitmap->AtomicTestAndSet(ref);
+  }
+  if (success) {
     // Newly marked.
     if (kUseBakerReadBarrier) {
       DCHECK_EQ(ref->GetReadBarrierPointer(), ReadBarrier::GrayPtr());
@@ -99,13 +94,16 @@
   return ref;
 }
 
-template<bool kGrayImmuneObject>
+template<bool kGrayImmuneObject, bool kFromGCThread>
 inline mirror::Object* ConcurrentCopying::Mark(mirror::Object* from_ref) {
   if (from_ref == nullptr) {
     return nullptr;
   }
   DCHECK(heap_->collector_type_ == kCollectorTypeCC);
-  if (UNLIKELY(kUseBakerReadBarrier && !is_active_)) {
+  if (kFromGCThread) {
+    DCHECK(is_active_);
+    DCHECK_EQ(Thread::Current(), thread_running_gc_);
+  } else if (UNLIKELY(kUseBakerReadBarrier && !is_active_)) {
     // In the lock word forward address state, the read barrier bits
     // in the lock word are part of the stored forwarding address and
     // invalid. This is usually OK as the from-space copy of objects
@@ -192,6 +190,16 @@
   }
 }
 
+inline bool ConcurrentCopying::IsMarkedInUnevacFromSpace(mirror::Object* from_ref) {
+  // Use load acquire on the read barrier pointer to ensure that we never see a white read barrier
+  // pointer with an unmarked bit due to reordering.
+  DCHECK(region_space_->IsInUnevacFromSpace(from_ref));
+  if (kUseBakerReadBarrier && from_ref->GetReadBarrierPointerAcquire() == ReadBarrier::GrayPtr()) {
+    return true;
+  }
+  return region_space_bitmap_->Test(from_ref);
+}
+
 }  // namespace collector
 }  // namespace gc
 }  // namespace art
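
A minimal standalone sketch (not ART code; types are illustrative) of the simplified marking decision above: with the Baker barrier, mutators only CAS the per-object read-barrier state and the GC thread marks the bitmap later when it pops the object, so the bitmap CAS is avoided; without the barrier, marking is recorded directly in the bitmap.

#include <atomic>
#include <cstdint>

enum class RbState : uint8_t { kWhite, kGray };

struct Obj {
  std::atomic<RbState> rb_state{RbState::kWhite};
};

struct Bitmap {
  std::atomic<uint64_t> word{0};
  // Returns true if the bit was already set (like AtomicTestAndSet).
  bool TestAndSet(unsigned bit) {
    const uint64_t mask = uint64_t{1} << bit;
    return (word.fetch_or(mask) & mask) != 0;
  }
};

// Returns true if this caller newly marked the object and should push it on
// the mark stack; false if another thread had already marked it.
bool TryMark(Obj* ref, Bitmap* bitmap, unsigned bit, bool use_baker_barrier) {
  if (use_baker_barrier) {
    RbState expected = RbState::kWhite;
    return ref->rb_state.compare_exchange_strong(expected, RbState::kGray);
  }
  return !bitmap->TestAndSet(bit);
}

int main() {
  Obj obj;
  Bitmap bitmap;
  bool first = TryMark(&obj, &bitmap, 3, /* use_baker_barrier */ true);
  bool second = TryMark(&obj, &bitmap, 3, /* use_baker_barrier */ true);
  return (first && !second) ? 0 : 1;
}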
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 7afe6f9..651669e 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -435,10 +435,8 @@
   gc_barrier_->Init(self, 0);
   ThreadFlipVisitor thread_flip_visitor(this, heap_->use_tlab_);
   FlipCallback flip_callback(this);
-  heap_->ThreadFlipBegin(self);  // Sync with JNI critical calls.
   size_t barrier_count = Runtime::Current()->FlipThreadRoots(
       &thread_flip_visitor, &flip_callback, this);
-  heap_->ThreadFlipEnd(self);
   {
     ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
     gc_barrier_->Increment(self, barrier_count);
@@ -1304,8 +1302,19 @@
         << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
         << " is_marked=" << IsMarked(to_ref);
   }
-  // Scan ref fields.
-  Scan(to_ref);
+  bool add_to_live_bytes = false;
+  if (region_space_->IsInUnevacFromSpace(to_ref)) {
+    // Mark the bitmap only in the GC thread here so that we don't need a CAS.
+    if (!kUseBakerReadBarrier || !region_space_bitmap_->Set(to_ref)) {
+      // It may be already marked if we accidentally pushed the same object twice due to the racy
+      // bitmap read in MarkUnevacFromSpaceRegion.
+      Scan(to_ref);
+      // Only add to the live bytes if the object was not already marked.
+      add_to_live_bytes = true;
+    }
+  } else {
+    Scan(to_ref);
+  }
   if (kUseBakerReadBarrier) {
     DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
         << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
@@ -1334,7 +1343,7 @@
   DCHECK(!kUseBakerReadBarrier);
 #endif
 
-  if (region_space_->IsInUnevacFromSpace(to_ref)) {
+  if (add_to_live_bytes) {
     // Add to the live bytes per unevacuated from space. Note this code is always run by the
     // GC-running thread (no synchronization required).
     DCHECK(region_space_bitmap_->Test(to_ref));
@@ -1569,7 +1578,7 @@
       // OK.
       return;
     } else if (region_space_->IsInUnevacFromSpace(ref)) {
-      CHECK(region_space_bitmap_->Test(ref)) << ref;
+      CHECK(IsMarkedInUnevacFromSpace(ref)) << ref;
     } else if (region_space_->IsInFromSpace(ref)) {
       // Not OK. Do extra logging.
       if (obj != nullptr) {
@@ -1616,7 +1625,7 @@
       // OK.
       return;
     } else if (region_space_->IsInUnevacFromSpace(ref)) {
-      CHECK(region_space_bitmap_->Test(ref)) << ref;
+      CHECK(IsMarkedInUnevacFromSpace(ref)) << ref;
     } else if (region_space_->IsInFromSpace(ref)) {
       // Not OK. Do extra logging.
       if (gc_root_source == nullptr) {
@@ -1656,7 +1665,7 @@
     LOG(INFO) << "holder is in the to-space.";
   } else if (region_space_->IsInUnevacFromSpace(obj)) {
     LOG(INFO) << "holder is in the unevac from-space.";
-    if (region_space_bitmap_->Test(obj)) {
+    if (IsMarkedInUnevacFromSpace(obj)) {
       LOG(INFO) << "holder is marked in the region space bitmap.";
     } else {
       LOG(INFO) << "holder is not marked in the region space bitmap.";
@@ -1785,7 +1794,7 @@
   DCHECK_EQ(Thread::Current(), thread_running_gc_);
   mirror::Object* ref = obj->GetFieldObject<
       mirror::Object, kVerifyNone, kWithoutReadBarrier, false>(offset);
-  mirror::Object* to_ref = Mark</*kGrayImmuneObject*/false>(ref);
+  mirror::Object* to_ref = Mark</*kGrayImmuneObject*/false, /*kFromGCThread*/true>(ref);
   if (to_ref == ref) {
     return;
   }
@@ -2128,7 +2137,7 @@
            heap_->non_moving_space_->HasAddress(to_ref))
         << "from_ref=" << from_ref << " to_ref=" << to_ref;
   } else if (rtype == space::RegionSpace::RegionType::kRegionTypeUnevacFromSpace) {
-    if (region_space_bitmap_->Test(from_ref)) {
+    if (IsMarkedInUnevacFromSpace(from_ref)) {
       to_ref = from_ref;
     } else {
       to_ref = nullptr;
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 5b0e2d6..97f4555 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -104,7 +104,7 @@
     DCHECK(ref != nullptr);
     return IsMarked(ref) == ref;
   }
-  template<bool kGrayImmuneObject = true>
+  template<bool kGrayImmuneObject = true, bool kFromGCThread = false>
   ALWAYS_INLINE mirror::Object* Mark(mirror::Object* from_ref)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
@@ -179,6 +179,8 @@
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   virtual mirror::Object* IsMarked(mirror::Object* from_ref) OVERRIDE
       SHARED_REQUIRES(Locks::mutator_lock_);
+  bool IsMarkedInUnevacFromSpace(mirror::Object* from_ref)
+      SHARED_REQUIRES(Locks::mutator_lock_);
   virtual bool IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* field) OVERRIDE
       SHARED_REQUIRES(Locks::mutator_lock_);
   void SweepSystemWeaks(Thread* self)
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 39f26e7..638c1d8 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -878,9 +878,13 @@
   MutexLock mu(self, *thread_flip_lock_);
   bool has_waited = false;
   uint64_t wait_start = NanoTime();
-  while (thread_flip_running_) {
-    has_waited = true;
-    thread_flip_cond_->Wait(self);
+  if (thread_flip_running_) {
+    TimingLogger::ScopedTiming split("IncrementDisableThreadFlip",
+                                     GetCurrentGcIteration()->GetTimings());
+    while (thread_flip_running_) {
+      has_waited = true;
+      thread_flip_cond_->Wait(self);
+    }
   }
   ++disable_thread_flip_count_;
   if (has_waited) {
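
The hunk above opens a named timing split only when the thread will actually wait. A minimal standalone sketch (not ART's TimingLogger) of that RAII pattern:

#include <chrono>
#include <cstdio>

class ScopedTiming {
  using Clock = std::chrono::steady_clock;

 public:
  explicit ScopedTiming(const char* name) : name_(name), start_(Clock::now()) {}
  ~ScopedTiming() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
        Clock::now() - start_).count();
    std::printf("%s: %lld us\n", name_, static_cast<long long>(us));
  }

 private:
  const char* name_;
  Clock::time_point start_;
};

void WaitIfFlipRunning(bool thread_flip_running) {
  if (thread_flip_running) {
    // Only open the timing scope on the slow path, so the fast path does not
    // record a meaningless zero-length split.
    ScopedTiming split("IncrementDisableThreadFlip");
    // ... wait on the condition variable here ...
  }
}

int main() {
  WaitIfFlipRunning(true);
  return 0;
}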
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index ae6c321..c87312b 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -454,8 +454,7 @@
                                           const std::string& image_filename,
                                           bool is_zygote,
                                           bool is_global_cache,
-                                          bool is_system,
-                                          bool relocated_version_used,
+                                          bool validate_oat_file,
                                           std::string* error_msg)
       SHARED_REQUIRES(Locks::mutator_lock_) {
     // Note that we must not use the file descriptor associated with
@@ -483,7 +482,7 @@
     // file name.
     return Init(image_filename.c_str(),
                 image_location,
-                !(is_system || relocated_version_used),
+                validate_oat_file,
                 /* oat_file */nullptr,
                 error_msg);
   }
@@ -1473,8 +1472,7 @@
                                  cache_filename,
                                  is_zygote,
                                  is_global_cache,
-                                 /* is_system */ false,
-                                 /* relocated_version_used */ true,
+                                 /* validate_oat_file */ false,
                                  &local_error_msg);
       if (relocated_space != nullptr) {
         return relocated_space;
@@ -1491,8 +1489,7 @@
                                cache_filename,
                                is_zygote,
                                is_global_cache,
-                               /* is_system */ false,
-                               /* relocated_version_used */ true,
+                               /* validate_oat_file */ true,
                                &local_error_msg);
     if (cache_space != nullptr) {
       return cache_space;
@@ -1512,8 +1509,7 @@
                                system_filename,
                                is_zygote,
                                is_global_cache,
-                               /* is_system */ true,
-                               /* relocated_version_used */ false,
+                               /* validate_oat_file */ false,
                                &local_error_msg);
     if (system_space != nullptr) {
       return system_space;
@@ -1538,8 +1534,7 @@
                                    cache_filename,
                                    is_zygote,
                                    is_global_cache,
-                                   /* is_system */ false,
-                                   /* relocated_version_used */ true,
+                                   /* validate_oat_file */ false,
                                    &local_error_msg);
         if (patched_space != nullptr) {
           return patched_space;
@@ -1568,8 +1563,7 @@
                                    cache_filename,
                                    is_zygote,
                                    is_global_cache,
-                                   /* is_system */ false,
-                                   /* relocated_version_used */ true,
+                                   /* validate_oat_file */ false,
                                    &local_error_msg);
         if (compiled_space != nullptr) {
           return compiled_space;
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index 716c23d..40b71c4 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -70,6 +70,16 @@
 DEFINE_CHECK_EQ(static_cast<int32_t>(ART_METHOD_QUICK_CODE_OFFSET_32), (static_cast<int32_t>(art::ArtMethod:: EntryPointFromQuickCompiledCodeOffset(art::PointerSize::k32).Int32Value())))
 #define ART_METHOD_QUICK_CODE_OFFSET_64 48
 DEFINE_CHECK_EQ(static_cast<int32_t>(ART_METHOD_QUICK_CODE_OFFSET_64), (static_cast<int32_t>(art::ArtMethod:: EntryPointFromQuickCompiledCodeOffset(art::PointerSize::k64).Int32Value())))
+#define ART_METHOD_DECLARING_CLASS_OFFSET 0
+DEFINE_CHECK_EQ(static_cast<int32_t>(ART_METHOD_DECLARING_CLASS_OFFSET), (static_cast<int32_t>(art::ArtMethod:: DeclaringClassOffset().Int32Value())))
+#define DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET 40
+DEFINE_CHECK_EQ(static_cast<int32_t>(DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET), (static_cast<int32_t>(art::mirror::Class:: DexCacheStringsOffset().Int32Value())))
+#define STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT 3
+DEFINE_CHECK_EQ(static_cast<int32_t>(STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT), (static_cast<int32_t>(art::WhichPowerOf2(sizeof(art::mirror::StringDexCachePair)))))
+#define STRING_DEX_CACHE_SIZE_MINUS_ONE 1023
+DEFINE_CHECK_EQ(static_cast<int32_t>(STRING_DEX_CACHE_SIZE_MINUS_ONE), (static_cast<int32_t>(art::mirror::DexCache::kDexCacheStringCacheSize - 1)))
+#define STRING_DEX_CACHE_HASH_BITS 10
+DEFINE_CHECK_EQ(static_cast<int32_t>(STRING_DEX_CACHE_HASH_BITS), (static_cast<int32_t>(art::LeastSignificantBit(art::mirror::DexCache::kDexCacheStringCacheSize))))
 #define MIN_LARGE_OBJECT_THRESHOLD 0x3000
 DEFINE_CHECK_EQ(static_cast<size_t>(MIN_LARGE_OBJECT_THRESHOLD), (static_cast<size_t>(art::gc::Heap::kMinLargeObjectThreshold)))
 #define LOCK_WORD_STATE_SHIFT 30
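
For context, a minimal standalone sketch (not ART code) of how the new string dex-cache constants fit together: the cache has a power-of-two size, so STRING_DEX_CACHE_SIZE_MINUS_ONE (1023) acts as a mask and STRING_DEX_CACHE_HASH_BITS (10) is the number of low index bits kept.

#include <cassert>
#include <cstdint>

constexpr uint32_t kStringCacheHashBits = 10;
constexpr uint32_t kStringCacheSize = 1u << kStringCacheHashBits;    // 1024 entries
constexpr uint32_t kStringCacheSizeMinusOne = kStringCacheSize - 1;  // 1023, used as a mask

uint32_t StringCacheSlot(uint32_t string_index) {
  // Masking with size - 1 is equivalent to modulo for a power-of-two size.
  return string_index & kStringCacheSizeMinusOne;
}

int main() {
  assert(StringCacheSlot(1023) == 1023);
  assert(StringCacheSlot(1024) == 0);
  assert(StringCacheSlot(1025) == 1);
  return 0;
}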
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index 9895395..4005f05 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -223,6 +223,12 @@
     HandleU1List(values, count);
     length_ += count;
   }
+  void AddU1AsU2List(const uint8_t* values, size_t count) {
+    HandleU1AsU2List(values, count);
+    // The char array of a compressed (8-bit) String is added as 16-bit blocks.
+    int ceil_count_to_even = count + ((count & 1) ? 1 : 0);
+    length_ += ceil_count_to_even * sizeof(uint8_t);
+  }
   void AddU2List(const uint16_t* values, size_t count) {
     HandleU2List(values, count);
     length_ += count * sizeof(uint16_t);
@@ -268,6 +274,9 @@
   virtual void HandleU1List(const uint8_t* values ATTRIBUTE_UNUSED,
                             size_t count ATTRIBUTE_UNUSED) {
   }
+  virtual void HandleU1AsU2List(const uint8_t* values ATTRIBUTE_UNUSED,
+                                size_t count ATTRIBUTE_UNUSED) {
+  }
   virtual void HandleU2List(const uint16_t* values ATTRIBUTE_UNUSED,
                             size_t count ATTRIBUTE_UNUSED) {
   }
@@ -308,6 +317,19 @@
     buffer_.insert(buffer_.end(), values, values + count);
   }
 
+  void HandleU1AsU2List(const uint8_t* values, size_t count) OVERRIDE {
+    DCHECK_EQ(length_, buffer_.size());
+    // The 8-bit values are grouped in pairs to form 16-bit blocks, like Java chars.
+    if (count & 1) {
+      buffer_.push_back(0);
+    }
+    for (size_t i = 0; i < count; ++i) {
+      uint8_t value = *values;
+      buffer_.push_back(value);
+      values++;
+    }
+  }
+
   void HandleU2List(const uint16_t* values, size_t count) OVERRIDE {
     DCHECK_EQ(length_, buffer_.size());
     for (size_t i = 0; i < count; ++i) {
@@ -1354,7 +1376,11 @@
         string_value = reinterpret_cast<mirror::Object*>(
             reinterpret_cast<uintptr_t>(s) + kObjectAlignment);
       } else {
-        string_value = reinterpret_cast<mirror::Object*>(s->GetValue());
+        if (s->IsCompressed()) {
+          string_value = reinterpret_cast<mirror::Object*>(s->GetValueCompressed());
+        } else {
+          string_value = reinterpret_cast<mirror::Object*>(s->GetValue());
+        }
       }
       __ AddObjectId(string_value);
     }
@@ -1369,12 +1395,18 @@
   CHECK_EQ(obj->IsString(), string_value != nullptr);
   if (string_value != nullptr) {
     mirror::String* s = obj->AsString();
+    // A compressed (8-bit) string's length is ceil(length/2) when counted in 16-bit blocks.
+    int length_in_16_bit = (s->IsCompressed()) ? ((s->GetLength() + 1) / 2) : s->GetLength();
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
     __ AddObjectId(string_value);
     __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
-    __ AddU4(s->GetLength());
+    __ AddU4(length_in_16_bit);
     __ AddU1(hprof_basic_char);
-    __ AddU2List(s->GetValue(), s->GetLength());
+    if (s->IsCompressed()) {
+      __ AddU1AsU2List(s->GetValueCompressed(), s->GetLength());
+    } else {
+      __ AddU2List(s->GetValue(), s->GetLength());
+    }
   }
 }
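
A minimal standalone sketch (not ART code) of the length accounting used by AddU1AsU2List above: a compressed string of N 8-bit chars is reported to HPROF as ceil(N/2) 16-bit array elements, so an odd N is padded with one zero byte and the record length grows by N rounded up to even.

#include <cassert>
#include <cstdint>
#include <vector>

size_t AppendCompressedChars(std::vector<uint8_t>* out, const uint8_t* chars, size_t count) {
  if (count & 1) {
    out->push_back(0);  // pad so the total byte length is even
  }
  out->insert(out->end(), chars, chars + count);
  return count + (count & 1);  // bytes actually appended (= 2 * element count)
}

int main() {
  std::vector<uint8_t> buf;
  const uint8_t chars[] = {'a', 'b', 'c'};
  size_t appended = AppendCompressedChars(&buf, chars, 3);
  assert(appended == 4 && buf.size() == 4);
  return 0;
}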
 
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index eceb593..1940d67 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -386,8 +386,23 @@
   if (a_length != b.GetUtf16Length()) {
     return false;
   }
-  const uint16_t* a_value = a_string->GetValue();
-  return CompareModifiedUtf8ToUtf16AsCodePointValues(b.GetUtf8Data(), a_value, a_length) == 0;
+  if (a_string->IsCompressed()) {
+    size_t b_byte_count = strlen(b.GetUtf8Data());
+    size_t b_utf8_length = CountModifiedUtf8Chars(b.GetUtf8Data(), b_byte_count);
+    // The Modified UTF-8 single-byte character range is 0x01 .. 0x7f.
+    // String compression applies to regular ASCII with exactly the same range,
+    // not to extended ASCII, which goes up to 0xff.
+    const bool is_b_regular_ascii = (b_byte_count == b_utf8_length);
+    if (is_b_regular_ascii) {
+      return memcmp(b.GetUtf8Data(),
+                    a_string->GetValueCompressed(), a_length * sizeof(uint8_t)) == 0;
+    } else {
+      return false;
+    }
+  } else {
+    const uint16_t* a_value = a_string->GetValue();
+    return CompareModifiedUtf8ToUtf16AsCodePointValues(b.GetUtf8Data(), a_value, a_length) == 0;
+  }
 }
 
 size_t InternTable::Table::AddTableFromMemory(const uint8_t* ptr) {
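
A minimal standalone sketch (not ART code) of the ASCII test the interned-string comparison above depends on: string compression only applies to characters in 0x01 .. 0x7f, and a Modified UTF-8 string uses exactly one byte per character in that case, which is why comparing the byte count with the decoded character count (or, equivalently, checking every byte) identifies it.

#include <cassert>
#include <cstddef>

bool IsAllAscii(const char* utf8, size_t byte_count) {
  for (size_t i = 0; i < byte_count; ++i) {
    const unsigned char c = static_cast<unsigned char>(utf8[i]);
    // Bytes >= 0x80 belong to multi-byte sequences; 0x00 never appears in
    // Modified UTF-8 (NUL is encoded as 0xc0 0x80).
    if (c == 0 || c >= 0x80) {
      return false;
    }
  }
  return true;
}

int main() {
  assert(IsAllAscii("hello", 5));
  assert(!IsAllAscii("h\xc3\xa9", 3));  // 'h' + U+00E9: 2 chars but 3 bytes
  return 0;
}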
diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc
index f1f7f42..101c9a1 100644
--- a/runtime/interpreter/interpreter.cc
+++ b/runtime/interpreter/interpreter.cc
@@ -20,6 +20,9 @@
 
 #include "common_throws.h"
 #include "interpreter_common.h"
+#include "interpreter_goto_table_impl.h"
+#include "interpreter_mterp_impl.h"
+#include "interpreter_switch_impl.h"
 #include "mirror/string-inl.h"
 #include "scoped_thread_state_change.h"
 #include "ScopedLocalRef.h"
@@ -242,28 +245,6 @@
 
 static constexpr InterpreterImplKind kInterpreterImplKind = kMterpImplKind;
 
-#if defined(__clang__)
-// Clang 3.4 fails to build the goto interpreter implementation.
-template<bool do_access_check, bool transaction_active>
-JValue ExecuteGotoImpl(Thread*, const DexFile::CodeItem*, ShadowFrame&, JValue) {
-  LOG(FATAL) << "UNREACHABLE";
-  UNREACHABLE();
-}
-// Explicit definitions of ExecuteGotoImpl.
-template<> SHARED_REQUIRES(Locks::mutator_lock_)
-JValue ExecuteGotoImpl<true, false>(Thread* self, const DexFile::CodeItem* code_item,
-                                    ShadowFrame& shadow_frame, JValue result_register);
-template<> SHARED_REQUIRES(Locks::mutator_lock_)
-JValue ExecuteGotoImpl<false, false>(Thread* self, const DexFile::CodeItem* code_item,
-                                     ShadowFrame& shadow_frame, JValue result_register);
-template<> SHARED_REQUIRES(Locks::mutator_lock_)
-JValue ExecuteGotoImpl<true, true>(Thread* self,  const DexFile::CodeItem* code_item,
-                                   ShadowFrame& shadow_frame, JValue result_register);
-template<> SHARED_REQUIRES(Locks::mutator_lock_)
-JValue ExecuteGotoImpl<false, true>(Thread* self, const DexFile::CodeItem* code_item,
-                                    ShadowFrame& shadow_frame, JValue result_register);
-#endif
-
 static inline JValue Execute(
     Thread* self,
     const DexFile::CodeItem* code_item,
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 90c8227..7b38473 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -65,21 +65,6 @@
 namespace art {
 namespace interpreter {
 
-// External references to all interpreter implementations.
-
-template<bool do_access_check, bool transaction_active>
-extern JValue ExecuteSwitchImpl(Thread* self, const DexFile::CodeItem* code_item,
-                                ShadowFrame& shadow_frame, JValue result_register,
-                                bool interpret_one_instruction);
-
-template<bool do_access_check, bool transaction_active>
-extern JValue ExecuteGotoImpl(Thread* self, const DexFile::CodeItem* code_item,
-                              ShadowFrame& shadow_frame, JValue result_register);
-
-// Mterp does not support transactions or access check, thus no templated versions.
-extern "C" bool ExecuteMterpImpl(Thread* self, const DexFile::CodeItem* code_item,
-                                 ShadowFrame* shadow_frame, JValue* result_register);
-
 void ThrowNullPointerExceptionFromInterpreter()
     SHARED_REQUIRES(Locks::mutator_lock_);
 
@@ -450,7 +435,7 @@
       oss << StringPrintf(" vreg%u=0x%08X", i, raw_value);
       if (ref_value != nullptr) {
         if (ref_value->GetClass()->IsStringClass() &&
-            ref_value->AsString()->GetValue() != nullptr) {
+            !ref_value->AsString()->IsValueNull()) {
           oss << "/java.lang.String \"" << ref_value->AsString()->ToModifiedUtf8() << "\"";
         } else {
           oss << "/" << PrettyTypeOf(ref_value);
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index 43b2778..37dd63b 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -14,18 +14,29 @@
  * limitations under the License.
  */
 
+#include "interpreter_goto_table_impl.h"
+
+// Common includes
+#include "base/logging.h"
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "stack.h"
+#include "thread.h"
+
+// Clang compiles the GOTO interpreter very slowly, so we skip it. The includes below are
+// implementation details that are only needed when it is actually compiled.
 #if !defined(__clang__)
-// Clang 3.4 fails to build the goto interpreter implementation.
-
-
 #include "experimental_flags.h"
 #include "interpreter_common.h"
 #include "jit/jit.h"
 #include "safe_math.h"
+#endif
 
 namespace art {
 namespace interpreter {
 
+#if !defined(__clang__)
+
 // In the following macros, we expect the following local variables exist:
 // - "self": the current Thread*.
 // - "inst" : the current Instruction*.
@@ -530,8 +541,7 @@
     if (LIKELY(c != nullptr)) {
       if (UNLIKELY(c->IsStringClass())) {
         gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-        mirror::SetStringCountVisitor visitor(0);
-        obj = String::Alloc<true>(self, 0, allocator_type, visitor);
+        obj = mirror::String::AllocEmptyString<true>(self, allocator_type);
       } else {
         obj = AllocObjectFromCode<do_access_check, true>(
             inst->VRegB_21c(), shadow_frame.GetMethod(), self,
@@ -2558,20 +2568,40 @@
 }  // NOLINT(readability/fn_size)
 
 // Explicit definitions of ExecuteGotoImpl.
-template SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR
+template HOT_ATTR
 JValue ExecuteGotoImpl<true, false>(Thread* self, const DexFile::CodeItem* code_item,
                                     ShadowFrame& shadow_frame, JValue result_register);
-template SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR
+template HOT_ATTR
 JValue ExecuteGotoImpl<false, false>(Thread* self, const DexFile::CodeItem* code_item,
                                      ShadowFrame& shadow_frame, JValue result_register);
-template SHARED_REQUIRES(Locks::mutator_lock_)
+template
 JValue ExecuteGotoImpl<true, true>(Thread* self, const DexFile::CodeItem* code_item,
                                    ShadowFrame& shadow_frame, JValue result_register);
-template SHARED_REQUIRES(Locks::mutator_lock_)
+template
 JValue ExecuteGotoImpl<false, true>(Thread* self, const DexFile::CodeItem* code_item,
                                     ShadowFrame& shadow_frame, JValue result_register);
 
+#else
+
+template<bool do_access_check, bool transaction_active>
+JValue ExecuteGotoImpl(Thread*, const DexFile::CodeItem*, ShadowFrame&, JValue) {
+  LOG(FATAL) << "UNREACHABLE";
+  UNREACHABLE();
+}
+// Explicit definitions of ExecuteGotoImpl.
+template<>
+JValue ExecuteGotoImpl<true, false>(Thread* self, const DexFile::CodeItem* code_item,
+                                    ShadowFrame& shadow_frame, JValue result_register);
+template<>
+JValue ExecuteGotoImpl<false, false>(Thread* self, const DexFile::CodeItem* code_item,
+                                     ShadowFrame& shadow_frame, JValue result_register);
+template<>
+JValue ExecuteGotoImpl<true, true>(Thread* self,  const DexFile::CodeItem* code_item,
+                                   ShadowFrame& shadow_frame, JValue result_register);
+template<>
+JValue ExecuteGotoImpl<false, true>(Thread* self, const DexFile::CodeItem* code_item,
+                                    ShadowFrame& shadow_frame, JValue result_register);
+#endif
+
 }  // namespace interpreter
 }  // namespace art
-
-#endif
diff --git a/runtime/interpreter/interpreter_goto_table_impl.h b/runtime/interpreter/interpreter_goto_table_impl.h
new file mode 100644
index 0000000..bb9be88
--- /dev/null
+++ b/runtime/interpreter/interpreter_goto_table_impl.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_GOTO_TABLE_IMPL_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_GOTO_TABLE_IMPL_H_
+
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "dex_file.h"
+#include "jvalue.h"
+
+namespace art {
+
+class ShadowFrame;
+class Thread;
+
+namespace interpreter {
+
+template<bool do_access_check, bool transaction_active>
+JValue ExecuteGotoImpl(Thread* self,
+                       const DexFile::CodeItem* code_item,
+                       ShadowFrame& shadow_frame,
+                       JValue result_register) SHARED_REQUIRES(Locks::mutator_lock_);
+
+}  // namespace interpreter
+}  // namespace art
+
+#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_GOTO_TABLE_IMPL_H_
diff --git a/runtime/interpreter/interpreter_mterp_impl.h b/runtime/interpreter/interpreter_mterp_impl.h
new file mode 100644
index 0000000..322df4e
--- /dev/null
+++ b/runtime/interpreter/interpreter_mterp_impl.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_MTERP_IMPL_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_MTERP_IMPL_H_
+
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "dex_file.h"
+#include "jvalue.h"
+
+namespace art {
+
+class ShadowFrame;
+class Thread;
+
+namespace interpreter {
+
+// Mterp does not support transactions or access check, thus no templated versions.
+extern "C" bool ExecuteMterpImpl(Thread* self,
+                                 const DexFile::CodeItem* code_item,
+                                 ShadowFrame* shadow_frame,
+                                 JValue* result_register) SHARED_REQUIRES(Locks::mutator_lock_);
+
+}  // namespace interpreter
+}  // namespace art
+
+#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_MTERP_IMPL_H_
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index a6349fc..227130e 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "interpreter_switch_impl.h"
+
 #include "base/enums.h"
 #include "experimental_flags.h"
 #include "interpreter_common.h"
@@ -477,8 +479,7 @@
         if (LIKELY(c != nullptr)) {
           if (UNLIKELY(c->IsStringClass())) {
             gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-            mirror::SetStringCountVisitor visitor(0);
-            obj = String::Alloc<true>(self, 0, allocator_type, visitor);
+            obj = mirror::String::AllocEmptyString<true>(self, allocator_type);
           } else {
             obj = AllocObjectFromCode<do_access_check, true>(
               inst->VRegB_21c(), shadow_frame.GetMethod(), self,
@@ -622,10 +623,8 @@
         break;
       }
 
-#if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wfloat-equal"
-#endif
 
       case Instruction::CMPL_FLOAT: {
         PREAMBLE();
@@ -693,9 +692,7 @@
         break;
       }
 
-#if defined(__clang__)
 #pragma clang diagnostic pop
-#endif
 
       case Instruction::CMP_LONG: {
         PREAMBLE();
@@ -2337,19 +2334,19 @@
 }  // NOLINT(readability/fn_size)
 
 // Explicit definitions of ExecuteSwitchImpl.
-template SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR
+template HOT_ATTR
 JValue ExecuteSwitchImpl<true, false>(Thread* self, const DexFile::CodeItem* code_item,
                                       ShadowFrame& shadow_frame, JValue result_register,
                                       bool interpret_one_instruction);
-template SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR
+template HOT_ATTR
 JValue ExecuteSwitchImpl<false, false>(Thread* self, const DexFile::CodeItem* code_item,
                                        ShadowFrame& shadow_frame, JValue result_register,
                                        bool interpret_one_instruction);
-template SHARED_REQUIRES(Locks::mutator_lock_)
+template
 JValue ExecuteSwitchImpl<true, true>(Thread* self, const DexFile::CodeItem* code_item,
                                      ShadowFrame& shadow_frame, JValue result_register,
                                      bool interpret_one_instruction);
-template SHARED_REQUIRES(Locks::mutator_lock_)
+template
 JValue ExecuteSwitchImpl<false, true>(Thread* self, const DexFile::CodeItem* code_item,
                                       ShadowFrame& shadow_frame, JValue result_register,
                                       bool interpret_one_instruction);
diff --git a/runtime/interpreter/interpreter_switch_impl.h b/runtime/interpreter/interpreter_switch_impl.h
new file mode 100644
index 0000000..90ec908
--- /dev/null
+++ b/runtime/interpreter/interpreter_switch_impl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_SWITCH_IMPL_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_SWITCH_IMPL_H_
+
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "dex_file.h"
+#include "jvalue.h"
+
+namespace art {
+
+class ShadowFrame;
+class Thread;
+
+namespace interpreter {
+
+template<bool do_access_check, bool transaction_active>
+JValue ExecuteSwitchImpl(Thread* self,
+                         const DexFile::CodeItem* code_item,
+                         ShadowFrame& shadow_frame,
+                         JValue result_register,
+                         bool interpret_one_instruction) SHARED_REQUIRES(Locks::mutator_lock_);
+
+}  // namespace interpreter
+}  // namespace art
+
+#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_SWITCH_IMPL_H_
diff --git a/runtime/interpreter/mterp/mterp.cc b/runtime/interpreter/mterp/mterp.cc
index c25cd78..20a0753 100644
--- a/runtime/interpreter/mterp/mterp.cc
+++ b/runtime/interpreter/mterp/mterp.cc
@@ -358,8 +358,7 @@
   if (LIKELY(c != nullptr)) {
     if (UNLIKELY(c->IsStringClass())) {
       gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-      mirror::SetStringCountVisitor visitor(0);
-      obj = String::Alloc<true>(self, 0, allocator_type, visitor);
+      obj = mirror::String::AllocEmptyString<true>(self, allocator_type);
     } else {
       obj = AllocObjectFromCode<false, true>(
         inst->VRegB_21c(), shadow_frame->GetMethod(), self,
diff --git a/runtime/interpreter/unstarted_runtime_test.cc b/runtime/interpreter/unstarted_runtime_test.cc
index 7e1f795..c324600 100644
--- a/runtime/interpreter/unstarted_runtime_test.cc
+++ b/runtime/interpreter/unstarted_runtime_test.cc
@@ -401,8 +401,23 @@
   interpreter::DoCall<false, false>(method, self, *shadow_frame, inst, inst_data[0], &result);
   mirror::String* string_result = reinterpret_cast<mirror::String*>(result.GetL());
   EXPECT_EQ(string_arg->GetLength(), string_result->GetLength());
-  EXPECT_EQ(memcmp(string_arg->GetValue(), string_result->GetValue(),
-                   string_arg->GetLength() * sizeof(uint16_t)), 0);
+
+  if (string_arg->IsCompressed() && string_result->IsCompressed()) {
+    EXPECT_EQ(memcmp(string_arg->GetValueCompressed(), string_result->GetValueCompressed(),
+                     string_arg->GetLength() * sizeof(uint8_t)), 0);
+  } else if (!string_arg->IsCompressed() && !string_result->IsCompressed()) {
+    EXPECT_EQ(memcmp(string_arg->GetValue(), string_result->GetValue(),
+                     string_arg->GetLength() * sizeof(uint16_t)), 0);
+  } else {
+    bool equal = true;
+    for (int i = 0; i < string_arg->GetLength(); ++i) {
+      if (string_arg->CharAt(i) != string_result->CharAt(i)) {
+        equal = false;
+        break;
+      }
+    }
+    EXPECT_EQ(equal, true);
+  }
 
   ShadowFrame::DeleteDeoptimizedFrame(shadow_frame);
 }
diff --git a/runtime/jdwp/jdwp_bits.h b/runtime/jdwp/jdwp_bits.h
index f9cf9ca..33b98f3 100644
--- a/runtime/jdwp/jdwp_bits.h
+++ b/runtime/jdwp/jdwp_bits.h
@@ -59,13 +59,22 @@
   bytes.push_back(static_cast<uint8_t>(value));
 }
 
-static inline void AppendUtf16BE(std::vector<uint8_t>& bytes, const uint16_t* chars, size_t char_count) {
+static inline void AppendUtf16BE(std::vector<uint8_t>& bytes, const uint16_t* chars,
+                                 size_t char_count) {
   Append4BE(bytes, char_count);
   for (size_t i = 0; i < char_count; ++i) {
     Append2BE(bytes, chars[i]);
   }
 }
 
+static inline void AppendUtf16CompressedBE(std::vector<uint8_t>& bytes,
+                                           const uint8_t* chars, size_t char_count) {
+  Append4BE(bytes, char_count);
+  for (size_t i = 0; i < char_count; ++i) {
+    Append2BE(bytes, static_cast<uint16_t>(chars[i]));
+  }
+}
+
 // @deprecated
 static inline void Set1(uint8_t* buf, uint8_t val) {
   *buf = val;
diff --git a/runtime/jit/profile_saver.cc b/runtime/jit/profile_saver.cc
index b35c958..927681c 100644
--- a/runtime/jit/profile_saver.cc
+++ b/runtime/jit/profile_saver.cc
@@ -63,19 +63,6 @@
       options_(options) {
   DCHECK(options_.IsEnabled());
   AddTrackedLocations(output_filename, app_data_dir, code_paths);
-  if (!app_data_dir.empty()) {
-    // The application directory is used to determine which dex files are owned by app.
-    // Since it could be a symlink (e.g. /data/data instead of /data/user/0), and we
-    // don't have control over how the dex files are actually loaded (symlink or canonical path),
-    // store it's canonical form to be sure we use the same base when comparing.
-    UniqueCPtr<const char[]> app_data_dir_real_path(realpath(app_data_dir.c_str(), nullptr));
-    if (app_data_dir_real_path != nullptr) {
-      app_data_dirs_.emplace(app_data_dir_real_path.get());
-    } else {
-      LOG(WARNING) << "Failed to get the real path for app dir: " << app_data_dir
-          << ". The app dir will not be used to determine which dex files belong to the app";
-    }
-  }
 }
 
 void ProfileSaver::Run() {
@@ -498,12 +485,18 @@
   if (it == tracked_dex_base_locations_.end()) {
     tracked_dex_base_locations_.Put(output_filename,
                                     std::set<std::string>(code_paths.begin(), code_paths.end()));
-    app_data_dirs_.insert(app_data_dir);
+    if (!app_data_dir.empty()) {
+      app_data_dirs_.insert(app_data_dir);
+    }
   } else {
     it->second.insert(code_paths.begin(), code_paths.end());
   }
 }
 
+// TODO(calin): This may lead to several calls to realpath.
+// Consider moving the logic to the saver thread (i.e. when notified,
+// only cache the location, and then wake up the saver thread to do the
+// comparisons with the real file paths and to create the markers).
 void ProfileSaver::NotifyDexUse(const std::string& dex_location) {
   if (!ShouldProfileLocation(dex_location)) {
     return;
@@ -536,63 +529,32 @@
   }
 }
 
-bool ProfileSaver::MaybeRecordDexUseInternal(
-      const std::string& dex_location,
-      const std::set<std::string>& app_code_paths,
-      const std::string& foreign_dex_profile_path,
-      const std::set<std::string>& app_data_dirs) {
-  if (dex_location.empty()) {
-    LOG(WARNING) << "Asked to record foreign dex use with an empty dex location.";
-    return false;
-  }
-  if (foreign_dex_profile_path.empty()) {
-    LOG(WARNING) << "Asked to record foreign dex use without a valid profile path ";
-    return false;
-  }
-
-  UniqueCPtr<const char[]> dex_location_real_path(realpath(dex_location.c_str(), nullptr));
-  if (dex_location_real_path == nullptr) {
-    PLOG(WARNING) << "Could not get realpath for " << dex_location;
-  }
-  std::string dex_location_real_path_str((dex_location_real_path == nullptr)
-    ? dex_location.c_str()
-    : dex_location_real_path.get());
-
-  if (app_data_dirs.find(dex_location_real_path_str) != app_data_dirs.end()) {
-    // The dex location is under the application folder. Nothing to record.
-    return false;
-  }
-
-  if (app_code_paths.find(dex_location) != app_code_paths.end()) {
-    // The dex location belongs to the application code paths. Nothing to record.
-    return false;
-  }
-  // Do another round of checks with the real paths.
-  // Note that we could cache all the real locations in the saver (since it's an expensive
-  // operation). However we expect that app_code_paths is small (usually 1 element), and
-  // NotifyDexUse is called just a few times in the app lifetime. So we make the compromise
-  // to save some bytes of memory usage.
-  for (const auto& app_code_location : app_code_paths) {
-    UniqueCPtr<const char[]> real_app_code_location(realpath(app_code_location.c_str(), nullptr));
-    if (real_app_code_location == nullptr) {
-      PLOG(WARNING) << "Could not get realpath for " << app_code_location;
+static bool CheckContainsWithRealPath(const std::set<std::string>& paths_set,
+                                      const std::string& path_to_check) {
+  for (const auto& path : paths_set) {
+    UniqueCPtr<const char[]> real_path(realpath(path.c_str(), nullptr));
+    if (real_path == nullptr) {
+      PLOG(WARNING) << "Could not get realpath for " << path;
+      continue;
     }
-    std::string real_app_code_location_str((real_app_code_location == nullptr)
-        ? app_code_location.c_str()
-        : real_app_code_location.get());
-    if (real_app_code_location_str == dex_location_real_path_str) {
-      // The dex location belongs to the application code paths. Nothing to record.
-      return false;
+    std::string real_path_str(real_path.get());
+    if (real_path_str == path_to_check) {
+      return true;
     }
   }
+  return false;
+}
 
+// After the call, dex_location_real_path will contain the marker's name.
+static bool CreateForeignDexMarker(const std::string& foreign_dex_profile_path,
+                                   /*in-out*/ std::string* dex_location_real_path) {
   // For foreign dex files we record a flag on disk. PackageManager will (potentially) take this
   // into account when deciding how to optimize the loaded dex file.
   // The expected flag name is the canonical path of the apk where '/' is substituted to '@'.
   // (it needs to be kept in sync with
   // frameworks/base/services/core/java/com/android/server/pm/PackageDexOptimizer.java)
-  std::replace(dex_location_real_path_str.begin(), dex_location_real_path_str.end(), '/', '@');
-  std::string flag_path = foreign_dex_profile_path + "/" + dex_location_real_path_str;
+  std::replace(dex_location_real_path->begin(), dex_location_real_path->end(), '/', '@');
+  std::string flag_path = foreign_dex_profile_path + "/" + *dex_location_real_path;
   // We use O_RDONLY as the access mode because we must supply some access
   // mode, and there is no access mode that means 'create but do not read' the
   // file. We will not actually read from the file.
@@ -614,6 +576,57 @@
   }
 }
 
+bool ProfileSaver::MaybeRecordDexUseInternal(
+      const std::string& dex_location,
+      const std::set<std::string>& app_code_paths,
+      const std::string& foreign_dex_profile_path,
+      const std::set<std::string>& app_data_dirs) {
+  if (dex_location.empty()) {
+    LOG(WARNING) << "Asked to record foreign dex use with an empty dex location.";
+    return false;
+  }
+  if (foreign_dex_profile_path.empty()) {
+    LOG(WARNING) << "Asked to record foreign dex use without a valid profile path ";
+    return false;
+  }
+
+  if (app_code_paths.find(dex_location) != app_code_paths.end()) {
+    // The dex location belongs to the application code paths. Nothing to record.
+    return false;
+  }
+
+  if (app_data_dirs.find(dex_location) != app_data_dirs.end()) {
+    // The dex location is under the application folder. Nothing to record.
+    return false;
+  }
+
+  // Do another round of checks with the real paths.
+  // The application directory could be a symlink (e.g. /data/data instead of /data/user/0), and
+  // we don't control how dex files are loaded (symlink or canonical path), so compare realpaths.
+
+  // Note that we could cache all the real locations in the saver (since it's an expensive
+  // operation). However we expect that app_code_paths is small (usually 1 element), and
+  // NotifyDexUse is called just a few times in the app lifetime. So we make the compromise
+  // to save some bytes of memory usage.
+
+  UniqueCPtr<const char[]> dex_location_real_path(realpath(dex_location.c_str(), nullptr));
+  if (dex_location_real_path == nullptr) {
+    PLOG(WARNING) << "Could not get realpath for " << dex_location;
+    return false;
+  }
+  std::string dex_location_real_path_str(dex_location_real_path.get());
+
+  if (CheckContainsWithRealPath(app_code_paths, dex_location_real_path_str)) {
+    return false;
+  }
+
+  if (CheckContainsWithRealPath(app_data_dirs, dex_location_real_path_str)) {
+    return false;
+  }
+
+  return CreateForeignDexMarker(foreign_dex_profile_path, &dex_location_real_path_str);
+}
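The '/'-to-'@' substitution above is what determines the foreign dex marker's file name. A minimal standalone sketch of that naming scheme, using made-up paths (the real profile directory and apk location come from the framework):

#include <algorithm>
#include <iostream>
#include <string>

int main() {
  // Made-up example inputs; the real values come from PackageManager.
  std::string dex_location_real_path = "/data/app/com.example-1/base.apk";
  const std::string foreign_dex_profile_path = "/data/misc/profiles/foreign-dex";
  // Same transformation as CreateForeignDexMarker: every '/' becomes '@'.
  std::replace(dex_location_real_path.begin(), dex_location_real_path.end(), '/', '@');
  const std::string flag_path = foreign_dex_profile_path + "/" + dex_location_real_path;
  // Prints /data/misc/profiles/foreign-dex/@data@app@com.example-1@base.apk
  std::cout << flag_path << std::endl;
  return 0;
}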
+
 void ProfileSaver::DumpInstanceInfo(std::ostream& os) {
   MutexLock mu(Thread::Current(), *Locks::profiler_lock_);
   if (instance_ != nullptr) {
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index c322475..7bcadd8 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -592,9 +592,8 @@
     }
     if (c->IsStringClass()) {
       gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-      mirror::SetStringCountVisitor visitor(0);
-      return soa.AddLocalReference<jobject>(mirror::String::Alloc<true>(soa.Self(), 0,
-                                                                        allocator_type, visitor));
+      return soa.AddLocalReference<jobject>(mirror::String::AllocEmptyString<true>(soa.Self(),
+                                                                              allocator_type));
     }
     return soa.AddLocalReference<jobject>(c->AllocObject(soa.Self()));
   }
@@ -1673,8 +1672,14 @@
       ThrowSIOOBE(soa, start, length, s->GetLength());
     } else {
       CHECK_NON_NULL_MEMCPY_ARGUMENT(length, buf);
-      const jchar* chars = s->GetValue();
-      memcpy(buf, chars + start, length * sizeof(jchar));
+      if (s->IsCompressed()) {
+        for (int i = 0; i < length; ++i) {
+          buf[i] = static_cast<jchar>(s->CharAt(start + i));
+        }
+      } else {
+        const jchar* chars = static_cast<jchar*>(s->GetValue());
+        memcpy(buf, chars + start, length * sizeof(jchar));
+      }
     }
   }
 
@@ -1687,9 +1692,15 @@
       ThrowSIOOBE(soa, start, length, s->GetLength());
     } else {
       CHECK_NON_NULL_MEMCPY_ARGUMENT(length, buf);
-      const jchar* chars = s->GetValue();
-      size_t bytes = CountUtf8Bytes(chars + start, length);
-      ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
+      if (s->IsCompressed()) {
+        for (int i = 0; i < length; ++i) {
+          buf[i] = s->CharAt(start + i);
+        }
+      } else {
+        const jchar* chars = s->GetValue();
+        size_t bytes = CountUtf8Bytes(chars + start, length);
+        ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
+      }
     }
   }
 
@@ -1698,9 +1709,16 @@
     ScopedObjectAccess soa(env);
     mirror::String* s = soa.Decode<mirror::String*>(java_string);
     gc::Heap* heap = Runtime::Current()->GetHeap();
-    if (heap->IsMovableObject(s)) {
+    if (heap->IsMovableObject(s) || s->IsCompressed()) {
       jchar* chars = new jchar[s->GetLength()];
-      memcpy(chars, s->GetValue(), sizeof(jchar) * s->GetLength());
+      if (s->IsCompressed()) {
+        int32_t length = s->GetLength();
+        for (int i = 0; i < length; ++i) {
+          chars[i] = s->CharAt(i);
+        }
+      } else {
+        memcpy(chars, s->GetValue(), sizeof(jchar) * s->GetLength());
+      }
       if (is_copy != nullptr) {
         *is_copy = JNI_TRUE;
       }
@@ -1716,7 +1734,7 @@
     CHECK_NON_NULL_ARGUMENT_RETURN_VOID(java_string);
     ScopedObjectAccess soa(env);
     mirror::String* s = soa.Decode<mirror::String*>(java_string);
-    if (chars != s->GetValue()) {
+    if (s->IsCompressed() || (!s->IsCompressed() && chars != s->GetValue())) {
       delete[] chars;
     }
   }
@@ -1737,15 +1755,27 @@
         heap->IncrementDisableThreadFlip(soa.Self());
       }
     }
-    if (is_copy != nullptr) {
-      *is_copy = JNI_FALSE;
+    if (s->IsCompressed()) {
+      if (is_copy != nullptr) {
+        *is_copy = JNI_TRUE;
+      }
+      int32_t length = s->GetLength();
+      jchar* chars = new jchar[length];
+      for (int i = 0; i < length; ++i) {
+        chars[i] = s->CharAt(i);
+      }
+      return chars;
+    } else {
+      if (is_copy != nullptr) {
+        *is_copy = JNI_FALSE;
+      }
+      return static_cast<jchar*>(s->GetValue());
     }
-    return static_cast<jchar*>(s->GetValue());
   }
 
   static void ReleaseStringCritical(JNIEnv* env,
                                     jstring java_string,
-                                    const jchar* chars ATTRIBUTE_UNUSED) {
+                                    const jchar* chars) {
     CHECK_NON_NULL_ARGUMENT_RETURN_VOID(java_string);
     ScopedObjectAccess soa(env);
     gc::Heap* heap = Runtime::Current()->GetHeap();
@@ -1757,6 +1787,9 @@
         heap->DecrementDisableThreadFlip(soa.Self());
       }
     }
+    if (s->IsCompressed() || (!s->IsCompressed() && s->GetValue() != chars)) {
+      delete[] chars;
+    }
   }
 
   static const char* GetStringUTFChars(JNIEnv* env, jstring java_string, jboolean* is_copy) {
@@ -1771,8 +1804,14 @@
     size_t byte_count = s->GetUtfLength();
     char* bytes = new char[byte_count + 1];
     CHECK(bytes != nullptr);  // bionic aborts anyway.
-    const uint16_t* chars = s->GetValue();
-    ConvertUtf16ToModifiedUtf8(bytes, byte_count, chars, s->GetLength());
+    if (s->IsCompressed()) {
+      for (size_t i = 0; i < byte_count; ++i) {
+        bytes[i] = s->CharAt(i);
+      }
+    } else {
+      const uint16_t* chars = s->GetValue();
+      ConvertUtf16ToModifiedUtf8(bytes, byte_count, chars, s->GetLength());
+    }
     bytes[byte_count] = '\0';
     return bytes;
   }
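With compressed strings, GetStringCritical can now hand back a freshly inflated copy (is_copy == JNI_TRUE), so callers may only rely on the documented JNI contract. A minimal sketch of a well-behaved caller, using only standard JNI entry points (the function name is made up):

#include <jni.h>

// Sums the UTF-16 code units of a string. Correct whether GetStringCritical
// returns a direct pointer (uncompressed case) or a copy (compressed case):
// the pointer is always paired with ReleaseStringCritical, never freed by hand.
static jint SumChars(JNIEnv* env, jstring s) {
  jboolean is_copy = JNI_FALSE;
  const jchar* chars = env->GetStringCritical(s, &is_copy);
  if (chars == nullptr) {
    return 0;  // An exception (e.g. OOME) is pending.
  }
  jint sum = 0;
  const jsize length = env->GetStringLength(s);
  for (jsize i = 0; i < length; ++i) {
    sum += chars[i];
  }
  env->ReleaseStringCritical(s, chars);
  return sum;
}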
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index 04ba8df..6495474 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc
@@ -880,8 +880,15 @@
   ASSERT_NE(fid2, nullptr);
   // Make sure we can actually use it.
   jstring s = env_->NewStringUTF("poop");
-  ASSERT_EQ(4, env_->GetIntField(s, fid2));
-
+  if (mirror::kUseStringCompression) {
+    // Negative because s is compressed (first bit is 1)
+    ASSERT_EQ(-2147483644, env_->GetIntField(s, fid2));
+    // Create incompressible string
+    jstring s_16 = env_->NewStringUTF("\u0444\u0444");
+    ASSERT_EQ(2, env_->GetIntField(s_16, fid2));
+  } else {
+    ASSERT_EQ(4, env_->GetIntField(s, fid2));
+  }
   // Bad arguments.
   GetFromReflectedField_ToReflectedFieldBadArgumentTest(false);
   GetFromReflectedField_ToReflectedFieldBadArgumentTest(true);
@@ -1632,13 +1639,28 @@
 
   jboolean is_copy = JNI_TRUE;
   chars = env_->GetStringCritical(s, &is_copy);
-  EXPECT_EQ(JNI_FALSE, is_copy);
+  if (mirror::kUseStringCompression) {
+    // is_copy has to be JNI_TRUE because "hello" is all-ASCII
+    EXPECT_EQ(JNI_TRUE, is_copy);
+  } else {
+    EXPECT_EQ(JNI_FALSE, is_copy);
+  }
   EXPECT_EQ(expected[0], chars[0]);
   EXPECT_EQ(expected[1], chars[1]);
   EXPECT_EQ(expected[2], chars[2]);
   EXPECT_EQ(expected[3], chars[3]);
   EXPECT_EQ(expected[4], chars[4]);
   env_->ReleaseStringCritical(s, chars);
+
+  if (mirror::kUseStringCompression) {
+    // is_copy has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible
+    jboolean is_copy_16 = JNI_TRUE;
+    jstring s_16 = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80");
+    chars = env_->GetStringCritical(s_16, &is_copy_16);
+    EXPECT_EQ(JNI_FALSE, is_copy_16);
+    EXPECT_EQ(2, env_->GetStringLength(s_16));
+    EXPECT_EQ(4, env_->GetStringUTFLength(s_16));
+    env_->ReleaseStringCritical(s_16, chars);
+  }
 }
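For reference, the expected value -2147483644 in the compressed branch earlier in this test is just the length 4 with the compression flag in bit 31; a minimal sketch of the arithmetic:

#include <cstdint>
#include <iostream>

int main() {
  // Compression flag | length ("poop" has 4 characters).
  const uint32_t flagged_count = (1u << 31) | 4u;  // 0x80000004
  // Reinterpreted as the signed 32-bit count field this is -2147483644.
  std::cout << static_cast<int32_t>(flagged_count) << std::endl;
  return 0;
}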
 
 TEST_F(JniInternalTest, GetObjectArrayElement_SetObjectArrayElement) {
diff --git a/runtime/mirror/dex_cache.h b/runtime/mirror/dex_cache.h
index 770c45d..4ddfc7b 100644
--- a/runtime/mirror/dex_cache.h
+++ b/runtime/mirror/dex_cache.h
@@ -57,7 +57,6 @@
   // Set the initial state for the 0th entry to be {0,1} which is guaranteed to fail
   // the lookup string id == stored id branch.
   static void Initialize(StringDexCacheType* strings) {
-    DCHECK(StringDexCacheType().is_lock_free());
     mirror::StringDexCachePair first_elem;
     first_elem.string_pointer = GcRoot<String>(nullptr);
     first_elem.string_index = 1;
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 0495c95..27f8bd7 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -147,6 +147,18 @@
 #endif
 }
 
+inline Object* Object::GetReadBarrierPointerAcquire() {
+#ifdef USE_BAKER_READ_BARRIER
+  DCHECK(kUseBakerReadBarrier);
+  LockWord lw(GetFieldAcquire<uint32_t>(OFFSET_OF_OBJECT_MEMBER(Object, monitor_)));
+  return reinterpret_cast<Object*>(lw.ReadBarrierState());
+#else
+  LOG(FATAL) << "Unreachable";
+  UNREACHABLE();
+#endif
+}
+
+
 inline uint32_t Object::GetMarkBit() {
 #ifdef USE_READ_BARRIER
   return GetLockWord(false).MarkBitState();
@@ -814,6 +826,13 @@
   }
 }
 
+template<typename kSize>
+inline kSize Object::GetFieldAcquire(MemberOffset field_offset) {
+  const uint8_t* raw_addr = reinterpret_cast<const uint8_t*>(this) + field_offset.Int32Value();
+  const kSize* addr = reinterpret_cast<const kSize*>(raw_addr);
+  return reinterpret_cast<const Atomic<kSize>*>(addr)->LoadAcquire();
+}
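GetFieldAcquire reinterprets the raw field address as an Atomic<kSize> and performs an acquire load. A minimal standalone sketch of the same pattern using std::atomic directly (illustrative only; ART's Atomic<T> is its own wrapper type):

#include <atomic>
#include <cstddef>
#include <cstdint>

// Acquire-load a field at the given byte offset inside an object. Any writes
// the releasing thread made before its release-store to this word are visible
// to the caller after this load returns.
template <typename T>
T LoadFieldAcquire(const void* object_base, size_t field_offset) {
  const uint8_t* raw_addr = static_cast<const uint8_t*>(object_base) + field_offset;
  return reinterpret_cast<const std::atomic<T>*>(raw_addr)->load(std::memory_order_acquire);
}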
+
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldWeakSequentiallyConsistent64(MemberOffset field_offset,
                                                          int64_t old_value, int64_t new_value) {
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 5b129bf..8649294 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -93,9 +93,12 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   void SetClass(Class* new_klass) SHARED_REQUIRES(Locks::mutator_lock_);
 
-  // TODO: Clean this up and change to return int32_t
+  // TODO: Clean these up and change to return int32_t
   Object* GetReadBarrierPointer() SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Get the read barrier pointer with acquire semantics, only supported for baker.
+  Object* GetReadBarrierPointerAcquire() SHARED_REQUIRES(Locks::mutator_lock_);
+
 #ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
   NO_RETURN
 #endif
@@ -574,6 +577,10 @@
   template<typename kSize, bool kIsVolatile>
   ALWAYS_INLINE kSize GetField(MemberOffset field_offset)
       SHARED_REQUIRES(Locks::mutator_lock_);
+  // Get a field with acquire semantics.
+  template<typename kSize>
+  ALWAYS_INLINE kSize GetFieldAcquire(MemberOffset field_offset)
+      SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Verify the type correctness of stores to fields.
   // TODO: This can cause thread suspension and isn't moving GC safe.
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index 0034220..b35a479 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -62,7 +62,7 @@
     Handle<String> string(
         hs.NewHandle(String::AllocFromModifiedUtf8(self, expected_utf16_length, utf8_in)));
     ASSERT_EQ(expected_utf16_length, string->GetLength());
-    ASSERT_TRUE(string->GetValue() != nullptr);
+    ASSERT_FALSE(string->IsValueNull());
     // strlen is necessary because the 1-character string "\x00\x00" is interpreted as ""
     ASSERT_TRUE(string->Equals(utf8_in) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
     ASSERT_TRUE(string->Equals(StringPiece(utf8_in)) ||
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index d3660e5..bc39ea8 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #ifndef ART_RUNTIME_MIRROR_STRING_INL_H_
 #define ART_RUNTIME_MIRROR_STRING_INL_H_
 
@@ -49,6 +48,7 @@
     // Avoid AsString as object is not yet in live bitmap or allocation stack.
     String* string = down_cast<String*>(obj);
     string->SetCount(count_);
+    DCHECK(!string->IsCompressed() || kUseStringCompression);
   }
 
  private:
@@ -68,10 +68,19 @@
     // Avoid AsString as object is not yet in live bitmap or allocation stack.
     String* string = down_cast<String*>(obj);
     string->SetCount(count_);
-    uint16_t* value = string->GetValue();
+    DCHECK(!string->IsCompressed() || kUseStringCompression);
+    int32_t length = String::GetLengthFromCount(count_);
     const uint8_t* const src = reinterpret_cast<uint8_t*>(src_array_->GetData()) + offset_;
-    for (int i = 0; i < count_; i++) {
-      value[i] = high_byte_ + (src[i] & 0xFF);
+    if (string->IsCompressed()) {
+      uint8_t* valueCompressed = string->GetValueCompressed();
+      for (int i = 0; i < length; i++) {
+        valueCompressed[i] = (src[i] & 0xFF);
+      }
+    } else {
+      uint16_t* value = string->GetValue();
+      for (int i = 0; i < length; i++) {
+        value[i] = high_byte_ + (src[i] & 0xFF);
+      }
     }
   }
 
@@ -96,7 +105,16 @@
     String* string = down_cast<String*>(obj);
     string->SetCount(count_);
     const uint16_t* const src = src_array_->GetData() + offset_;
-    memcpy(string->GetValue(), src, count_ * sizeof(uint16_t));
+    const int32_t length = String::GetLengthFromCount(count_);
+    bool compressible = kUseStringCompression && String::GetCompressionFlagFromCount(count_);
+    DCHECK(!compressible || kUseStringCompression);
+    if (compressible) {
+      for (int i = 0; i < length; ++i) {
+        string->GetValueCompressed()[i] = static_cast<uint8_t>(src[i]);
+      }
+    } else {
+      memcpy(string->GetValue(), src, length * sizeof(uint16_t));
+    }
   }
 
  private:
@@ -118,8 +136,22 @@
     // Avoid AsString as object is not yet in live bitmap or allocation stack.
     String* string = down_cast<String*>(obj);
     string->SetCount(count_);
-    const uint16_t* const src = src_string_->GetValue() + offset_;
-    memcpy(string->GetValue(), src, count_ * sizeof(uint16_t));
+    const int32_t length = String::GetLengthFromCount(count_);
+    bool compressible = kUseStringCompression && String::GetCompressionFlagFromCount(count_);
+    DCHECK(!compressible || kUseStringCompression);
+    if (src_string_->IsCompressed()) {
+      const uint8_t* const src = src_string_->GetValueCompressed() + offset_;
+      memcpy(string->GetValueCompressed(), src, length * sizeof(uint8_t));
+    } else {
+      const uint16_t* const src = src_string_->GetValue() + offset_;
+      if (compressible) {
+        for (int i = 0; i < length; ++i) {
+          string->GetValueCompressed()[i] = static_cast<uint8_t>(src[i]);
+        }
+      } else {
+        memcpy(string->GetValue(), src, length * sizeof(uint16_t));
+      }
+    }
   }
 
  private:
@@ -133,17 +165,38 @@
 }
 
 inline uint16_t String::CharAt(int32_t index) {
-  int32_t count = GetField32(OFFSET_OF_OBJECT_MEMBER(String, count_));
+  int32_t count = GetLength();
   if (UNLIKELY((index < 0) || (index >= count))) {
     ThrowStringIndexOutOfBoundsException(index, count);
     return 0;
   }
-  return GetValue()[index];
+  if (IsCompressed()) {
+    return GetValueCompressed()[index];
+  } else {
+    return GetValue()[index];
+  }
+}
+
+template <typename MemoryType>
+int32_t String::FastIndexOf(MemoryType* chars, int32_t ch, int32_t start) {
+  const MemoryType* p = chars + start;
+  const MemoryType* end = chars + GetLength();
+  while (p < end) {
+    if (*p++ == ch) {
+      return (p - 1) - chars;
+    }
+  }
+  return -1;
 }
 
 template<VerifyObjectFlags kVerifyFlags>
 inline size_t String::SizeOf() {
-  size_t size = sizeof(String) + (sizeof(uint16_t) * GetLength<kVerifyFlags>());
+  size_t size = sizeof(String);
+  if (IsCompressed()) {
+    size += (sizeof(uint8_t) * GetLength<kVerifyFlags>());
+  } else {
+    size += (sizeof(uint16_t) * GetLength<kVerifyFlags>());
+  }
   // String.equals() intrinsics assume zero-padding up to kObjectAlignment,
   // so make sure the zero-padding is actually copied around if GC compaction
   // chooses to copy only SizeOf() bytes.
@@ -152,31 +205,35 @@
 }
 
 template <bool kIsInstrumented, typename PreFenceVisitor>
-inline String* String::Alloc(Thread* self, int32_t utf16_length, gc::AllocatorType allocator_type,
+inline String* String::Alloc(Thread* self, int32_t utf16_length_with_flag,
+                             gc::AllocatorType allocator_type,
                              const PreFenceVisitor& pre_fence_visitor) {
   constexpr size_t header_size = sizeof(String);
-  static_assert(sizeof(utf16_length) <= sizeof(size_t),
+  const bool compressible = kUseStringCompression &&
+                            String::GetCompressionFlagFromCount(utf16_length_with_flag);
+  const size_t block_size = (compressible) ? sizeof(uint8_t) : sizeof(uint16_t);
+  size_t length = String::GetLengthFromCount(utf16_length_with_flag);
+  static_assert(sizeof(int32_t) <= sizeof(size_t),
                 "static_cast<size_t>(utf16_length) must not lose bits.");
-  size_t length = static_cast<size_t>(utf16_length);
-  size_t data_size = sizeof(uint16_t) * length;
+  size_t data_size = block_size * length;
   size_t size = header_size + data_size;
   // String.equals() intrinsics assume zero-padding up to kObjectAlignment,
   // so make sure the allocator clears the padding as well.
   // http://b/23528461
   size_t alloc_size = RoundUp(size, kObjectAlignment);
-  Class* string_class = GetJavaLangString();
 
+  Class* string_class = GetJavaLangString();
   // Check for overflow and throw OutOfMemoryError if this was an unreasonable request.
   // Do this by comparing with the maximum length that will _not_ cause an overflow.
-  constexpr size_t overflow_length = (-header_size) / sizeof(uint16_t);  // Unsigned arithmetic.
-  constexpr size_t max_alloc_length = overflow_length - 1u;
+  const size_t overflow_length = (-header_size) / block_size;   // Unsigned arithmetic.
+  const size_t max_alloc_length = overflow_length - 1u;
   static_assert(IsAligned<sizeof(uint16_t)>(kObjectAlignment),
                 "kObjectAlignment must be at least as big as Java char alignment");
-  constexpr size_t max_length = RoundDown(max_alloc_length, kObjectAlignment / sizeof(uint16_t));
+  const size_t max_length = RoundDown(max_alloc_length, kObjectAlignment / block_size);
   if (UNLIKELY(length > max_length)) {
     self->ThrowOutOfMemoryError(StringPrintf("%s of length %d would overflow",
                                              PrettyDescriptor(string_class).c_str(),
-                                             utf16_length).c_str());
+                                             static_cast<int>(length)).c_str());
     return nullptr;
   }
 
@@ -187,11 +244,22 @@
 }
 
 template <bool kIsInstrumented>
+inline String* String::AllocEmptyString(Thread* self, gc::AllocatorType allocator_type) {
+  SetStringCountVisitor visitor(0);
+  return Alloc<kIsInstrumented>(self, 0, allocator_type, visitor);
+}
+
+template <bool kIsInstrumented>
 inline String* String::AllocFromByteArray(Thread* self, int32_t byte_length,
                                           Handle<ByteArray> array, int32_t offset,
                                           int32_t high_byte, gc::AllocatorType allocator_type) {
-  SetStringCountAndBytesVisitor visitor(byte_length, array, offset, high_byte << 8);
-  String* string = Alloc<kIsInstrumented>(self, byte_length, allocator_type, visitor);
+  const uint8_t* const src = reinterpret_cast<uint8_t*>(array->GetData()) + offset;
+  const bool compressible = kUseStringCompression && String::AllASCII<uint8_t>(src, byte_length)
+                                            && (high_byte == 0);
+  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(byte_length)
+                                                  : byte_length;
+  SetStringCountAndBytesVisitor visitor(length_with_flag, array, offset, high_byte << 8);
+  String* string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return string;
 }
 
@@ -201,16 +269,24 @@
                                           gc::AllocatorType allocator_type) {
   // It is a caller error to have a count less than the actual array's size.
   DCHECK_GE(array->GetLength(), count);
-  SetStringCountAndValueVisitorFromCharArray visitor(count, array, offset);
-  String* new_string = Alloc<kIsInstrumented>(self, count, allocator_type, visitor);
+  const bool compressible = kUseStringCompression &&
+                            String::AllASCII<uint16_t>(array->GetData() + offset, count);
+  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(count) : count;
+  SetStringCountAndValueVisitorFromCharArray visitor(length_with_flag, array, offset);
+  String* new_string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return new_string;
 }
 
 template <bool kIsInstrumented>
 inline String* String::AllocFromString(Thread* self, int32_t string_length, Handle<String> string,
                                        int32_t offset, gc::AllocatorType allocator_type) {
-  SetStringCountAndValueVisitorFromString visitor(string_length, string, offset);
-  String* new_string = Alloc<kIsInstrumented>(self, string_length, allocator_type, visitor);
+  const bool compressible = kUseStringCompression &&
+      ((string->IsCompressed()) ? true : String::AllASCII<uint16_t>(string->GetValue() + offset,
+                                                                    string_length));
+  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(string_length)
+                                                  : string_length;
+  SetStringCountAndValueVisitorFromString visitor(length_with_flag, string, offset);
+  String* new_string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return new_string;
 }
 
@@ -219,11 +295,28 @@
   if (UNLIKELY(result == 0)) {
     result = ComputeHashCode();
   }
-  DCHECK(result != 0 || ComputeUtf16Hash(GetValue(), GetLength()) == 0)
-      << ToModifiedUtf8() << " " << result;
+  if (kIsDebugBuild) {
+    if (IsCompressed()) {
+      DCHECK(result != 0 || ComputeUtf16Hash(GetValueCompressed(), GetLength()) == 0)
+          << ToModifiedUtf8() << " " << result;
+    } else {
+      DCHECK(result != 0 || ComputeUtf16Hash(GetValue(), GetLength()) == 0)
+          << ToModifiedUtf8() << " " << result;
+    }
+  }
   return result;
 }
 
+template<typename MemoryType>
+bool String::AllASCII(const MemoryType* const chars, const int length) {
+  for (int i = 0; i < length; ++i) {
+    if (chars[i] >= 0x80) {  // Only 7-bit ASCII characters (<= 0x7f) compress.
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace mirror
 }  // namespace art
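String::Alloc above sizes the object from a per-character block size of one or two bytes and rounds up to the object alignment. A minimal sketch of that computation, with made-up stand-ins for sizeof(String) and kObjectAlignment:

#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr size_t kHeaderSize = 16;       // stand-in for sizeof(String)
constexpr size_t kObjectAlignment = 8;   // stand-in for ART's kObjectAlignment

size_t StringAllocSize(size_t length, bool compressible) {
  const size_t block_size = compressible ? sizeof(uint8_t) : sizeof(uint16_t);
  const size_t data_size = block_size * length;
  // RoundUp to a power-of-two alignment, as Alloc does with kObjectAlignment.
  return (kHeaderSize + data_size + kObjectAlignment - 1) & ~(kObjectAlignment - 1);
}

int main() {
  std::cout << StringAllocSize(5, /*compressible=*/true) << std::endl;   // 24 bytes for "hello"
  std::cout << StringAllocSize(5, /*compressible=*/false) << std::endl;  // 32 bytes uncompressed
  return 0;
}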
 
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index 33aca03..46caa4d 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -41,15 +41,11 @@
   } else if (start > count) {
     start = count;
   }
-  const uint16_t* chars = GetValue();
-  const uint16_t* p = chars + start;
-  const uint16_t* end = chars + count;
-  while (p < end) {
-    if (*p++ == ch) {
-      return (p - 1) - chars;
-    }
+  if (IsCompressed()) {
+    return FastIndexOf<uint8_t>(GetValueCompressed(), ch, start);
+  } else {
+    return FastIndexOf<uint16_t>(GetValue(), ch, start);
   }
-  return -1;
 }
 
 void String::SetClass(Class* java_lang_String) {
@@ -65,45 +61,91 @@
 }
 
 int String::ComputeHashCode() {
-  const int32_t hash_code = ComputeUtf16Hash(GetValue(), GetLength());
+  int32_t hash_code = 0;
+  if (IsCompressed()) {
+    hash_code = ComputeUtf16Hash(GetValueCompressed(), GetLength());
+  } else {
+    hash_code = ComputeUtf16Hash(GetValue(), GetLength());
+  }
   SetHashCode(hash_code);
   return hash_code;
 }
 
 int32_t String::GetUtfLength() {
-  return CountUtf8Bytes(GetValue(), GetLength());
+  if (IsCompressed()) {
+    return GetLength();
+  } else {
+    return CountUtf8Bytes(GetValue(), GetLength());
+  }
 }
 
 void String::SetCharAt(int32_t index, uint16_t c) {
-  DCHECK((index >= 0) && (index < count_));
-  GetValue()[index] = c;
+  DCHECK((index >= 0) && (index < GetLength()));
+  if (IsCompressed()) {
+    // TODO: Handle the case where String is compressed and c is non-ASCII
+    GetValueCompressed()[index] = static_cast<uint8_t>(c);
+  } else {
+    GetValue()[index] = c;
+  }
 }
 
 String* String::AllocFromStrings(Thread* self, Handle<String> string, Handle<String> string2) {
   int32_t length = string->GetLength();
   int32_t length2 = string2->GetLength();
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-  SetStringCountVisitor visitor(length + length2);
-  String* new_string = Alloc<true>(self, length + length2, allocator_type, visitor);
+  const bool compressible = kUseStringCompression && (string->IsCompressed() && string2->IsCompressed());
+  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(length + length2)
+                                                  : (length + length2);
+
+  SetStringCountVisitor visitor(length_with_flag);
+  String* new_string = Alloc<true>(self, length_with_flag, allocator_type, visitor);
   if (UNLIKELY(new_string == nullptr)) {
     return nullptr;
   }
-  uint16_t* new_value = new_string->GetValue();
-  memcpy(new_value, string->GetValue(), length * sizeof(uint16_t));
-  memcpy(new_value + length, string2->GetValue(), length2 * sizeof(uint16_t));
+  if (compressible) {
+    uint8_t* new_value = new_string->GetValueCompressed();
+    memcpy(new_value, string->GetValueCompressed(), length * sizeof(uint8_t));
+    memcpy(new_value + length, string2->GetValueCompressed(), length2 * sizeof(uint8_t));
+  } else {
+    uint16_t* new_value = new_string->GetValue();
+    if (string->IsCompressed()) {
+      for (int i = 0; i < length; ++i) {
+        new_value[i] = string->CharAt(i);
+      }
+    } else {
+      memcpy(new_value, string->GetValue(), length * sizeof(uint16_t));
+    }
+    if (string2->IsCompressed()) {
+      for (int i = 0; i < length2; ++i) {
+        new_value[i + length] = string2->CharAt(i);
+      }
+    } else {
+      memcpy(new_value + length, string2->GetValue(), length2 * sizeof(uint16_t));
+    }
+  }
   return new_string;
 }
 
 String* String::AllocFromUtf16(Thread* self, int32_t utf16_length, const uint16_t* utf16_data_in) {
   CHECK(utf16_data_in != nullptr || utf16_length == 0);
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-  SetStringCountVisitor visitor(utf16_length);
-  String* string = Alloc<true>(self, utf16_length, allocator_type, visitor);
+  const bool compressible = kUseStringCompression &&
+                            String::AllASCII<uint16_t>(utf16_data_in, utf16_length);
+  int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(utf16_length)
+                                            : utf16_length;
+  SetStringCountVisitor visitor(length_with_flag);
+  String* string = Alloc<true>(self, length_with_flag, allocator_type, visitor);
   if (UNLIKELY(string == nullptr)) {
     return nullptr;
   }
-  uint16_t* array = string->GetValue();
-  memcpy(array, utf16_data_in, utf16_length * sizeof(uint16_t));
+  if (compressible) {
+    for (int i = 0; i < utf16_length; ++i) {
+      string->GetValueCompressed()[i] = static_cast<uint8_t>(utf16_data_in[i]);
+    }
+  } else {
+    uint16_t* array = string->GetValue();
+    memcpy(array, utf16_data_in, utf16_length * sizeof(uint16_t));
+  }
   return string;
 }
 
@@ -121,13 +163,20 @@
 String* String::AllocFromModifiedUtf8(Thread* self, int32_t utf16_length,
                                       const char* utf8_data_in, int32_t utf8_length) {
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-  SetStringCountVisitor visitor(utf16_length);
-  String* string = Alloc<true>(self, utf16_length, allocator_type, visitor);
+  const bool compressible = kUseStringCompression && (utf16_length == utf8_length);
+  const int32_t utf16_length_with_flag = (compressible) ? String::GetFlaggedCount(utf16_length)
+                                                        : utf16_length;
+  SetStringCountVisitor visitor(utf16_length_with_flag);
+  String* string = Alloc<true>(self, utf16_length_with_flag, allocator_type, visitor);
   if (UNLIKELY(string == nullptr)) {
     return nullptr;
   }
-  uint16_t* utf16_data_out = string->GetValue();
-  ConvertModifiedUtf8ToUtf16(utf16_data_out, utf16_length, utf8_data_in, utf8_length);
+  if (compressible) {
+    memcpy(string->GetValueCompressed(), utf8_data_in, utf16_length * sizeof(uint8_t));
+  } else {
+    uint16_t* utf16_data_out = string->GetValue();
+    ConvertModifiedUtf8ToUtf16(utf16_data_out, utf16_length, utf8_data_in, utf8_length);
+  }
   return string;
 }
 
@@ -219,10 +268,16 @@
 
 // Create a modified UTF-8 encoded std::string from a java/lang/String object.
 std::string String::ToModifiedUtf8() {
-  const uint16_t* chars = GetValue();
   size_t byte_count = GetUtfLength();
   std::string result(byte_count, static_cast<char>(0));
-  ConvertUtf16ToModifiedUtf8(&result[0], byte_count, chars, GetLength());
+  if (IsCompressed()) {
+    for (size_t i = 0; i < byte_count; ++i) {
+      result[i] = static_cast<char>(CharAt(i));
+    }
+  } else {
+    const uint16_t* chars = GetValue();
+    ConvertUtf16ToModifiedUtf8(&result[0], byte_count, chars, GetLength());
+  }
   return result;
 }
 
@@ -242,11 +297,24 @@
   int32_t rhsCount = rhs->GetLength();
   int32_t countDiff = lhsCount - rhsCount;
   int32_t minCount = (countDiff < 0) ? lhsCount : rhsCount;
-  const uint16_t* lhsChars = lhs->GetValue();
-  const uint16_t* rhsChars = rhs->GetValue();
-  int32_t otherRes = MemCmp16(lhsChars, rhsChars, minCount);
-  if (otherRes != 0) {
-    return otherRes;
+  if (lhs->IsCompressed() && rhs->IsCompressed()) {
+    int32_t comparison = memcmp(lhs->GetValueCompressed(), rhs->GetValueCompressed(), minCount * sizeof(uint8_t));
+    if (comparison != 0) {
+      return comparison;
+    }
+  } else if (lhs->IsCompressed() || rhs->IsCompressed()) {
+    for (int32_t i = 0; i < minCount; ++i) {
+      if (lhs->CharAt(i) != rhs->CharAt(i)) {
+        return static_cast<int32_t>(lhs->CharAt(i)) - static_cast<int32_t>(rhs->CharAt(i));
+      }
+    }
+  } else {
+    const uint16_t* lhsChars = lhs->GetValue();
+    const uint16_t* rhsChars = rhs->GetValue();
+    int32_t otherRes = MemCmp16(lhsChars, rhsChars, minCount);
+    if (otherRes != 0) {
+      return otherRes;
+    }
   }
   return countDiff;
 }
@@ -260,7 +328,14 @@
   Handle<String> string(hs.NewHandle(this));
   CharArray* result = CharArray::Alloc(self, GetLength());
   if (result != nullptr) {
-    memcpy(result->GetData(), string->GetValue(), string->GetLength() * sizeof(uint16_t));
+    if (string->IsCompressed()) {
+      int32_t length = string->GetLength();
+      for (int i = 0; i < length; ++i) {
+        result->GetData()[i] = string->CharAt(i);
+      }
+    } else {
+      memcpy(result->GetData(), string->GetValue(), string->GetLength() * sizeof(uint16_t));
+    }
   } else {
     self->AssertPendingOOMException();
   }
@@ -269,8 +344,18 @@
 
 void String::GetChars(int32_t start, int32_t end, Handle<CharArray> array, int32_t index) {
   uint16_t* data = array->GetData() + index;
-  uint16_t* value = GetValue() + start;
-  memcpy(data, value, (end - start) * sizeof(uint16_t));
+  if (IsCompressed()) {
+    for (int i = start; i < end; ++i) {
+      data[i - start] = CharAt(i);
+    }
+  } else {
+    uint16_t* value = GetValue() + start;
+    memcpy(data, value, (end - start) * sizeof(uint16_t));
+  }
+}
+
+bool String::IsValueNull() {
+  return (IsCompressed()) ? (GetValueCompressed() == nullptr) : (GetValue() == nullptr);
 }
 
 }  // namespace mirror
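AllocFromModifiedUtf8 above treats a string as compressible exactly when utf16_length == utf8_length; that works because only code units in [0x01, 0x7f] take a single byte in modified UTF-8. A minimal sketch of the byte counting this relies on (a simplified stand-in for ART's CountUtf8Bytes):

#include <cstddef>
#include <cstdint>
#include <iostream>

size_t CountModifiedUtf8Bytes(const uint16_t* chars, size_t count) {
  size_t bytes = 0;
  for (size_t i = 0; i < count; ++i) {
    const uint16_t c = chars[i];
    if (c != 0 && c <= 0x7f) {
      bytes += 1;  // plain ASCII
    } else if (c <= 0x7ff) {
      bytes += 2;  // includes U+0000, encoded as 0xc0 0x80 in modified UTF-8
    } else {
      bytes += 3;  // BMP code units above U+07FF (surrogates included)
    }
  }
  return bytes;
}

int main() {
  const uint16_t ascii[] = { 'f', 'o', 'o' };
  const uint16_t mixed[] = { 'f', 'o', 0x0444 };  // last char is non-ASCII
  std::cout << CountModifiedUtf8Bytes(ascii, 3) << std::endl;  // 3 == UTF-16 length -> compressible
  std::cout << CountModifiedUtf8Bytes(mixed, 3) << std::endl;  // 4 != 3 -> stays uncompressed
  return 0;
}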
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index d492ba3..8695fe8 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -31,6 +31,9 @@
 
 namespace mirror {
 
+// String Compression
+static constexpr bool kUseStringCompression = false;
+
 // C++ mirror of java.lang.String
 class MANAGED String FINAL : public Object {
  public:
@@ -54,18 +57,28 @@
     return &value_[0];
   }
 
+  uint8_t* GetValueCompressed() SHARED_REQUIRES(Locks::mutator_lock_) {
+    return &value_compressed_[0];
+  }
+
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   size_t SizeOf() SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // The uppermost bit of count_ is the compression flag; mask it out to get the actual length.
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   int32_t GetLength() SHARED_REQUIRES(Locks::mutator_lock_) {
+    return GetLengthFromCount(GetCount<kVerifyFlags>());
+  }
+
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  int32_t GetCount() SHARED_REQUIRES(Locks::mutator_lock_) {
     return GetField32<kVerifyFlags>(OFFSET_OF_OBJECT_MEMBER(String, count_));
   }
 
   void SetCount(int32_t new_count) SHARED_REQUIRES(Locks::mutator_lock_) {
     // Count is invariant so use non-transactional mode. Also disable check as we may run inside
     // a transaction.
-    DCHECK_LE(0, new_count);
+    DCHECK_LE(0, (new_count & INT32_MAX));
     SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(String, count_), new_count);
   }
 
@@ -82,12 +95,6 @@
 
   String* Intern() SHARED_REQUIRES(Locks::mutator_lock_);
 
-  template <bool kIsInstrumented, typename PreFenceVisitor>
-  ALWAYS_INLINE static String* Alloc(Thread* self, int32_t utf16_length,
-                                     gc::AllocatorType allocator_type,
-                                     const PreFenceVisitor& pre_fence_visitor)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
-
   template <bool kIsInstrumented>
   ALWAYS_INLINE static String* AllocFromByteArray(Thread* self, int32_t byte_length,
                                                   Handle<ByteArray> array, int32_t offset,
@@ -107,6 +114,11 @@
                                                gc::AllocatorType allocator_type)
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
 
+  template <bool kIsInstrumented>
+  ALWAYS_INLINE static String* AllocEmptyString(Thread* self,
+                                                gc::AllocatorType allocator_type)
+      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
+
   static String* AllocFromStrings(Thread* self, Handle<String> string, Handle<String> string2)
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
 
@@ -149,6 +161,10 @@
 
   int32_t FastIndexOf(int32_t ch, int32_t start) SHARED_REQUIRES(Locks::mutator_lock_);
 
+  template <typename MemoryType>
+  int32_t FastIndexOf(MemoryType* chars, int32_t ch, int32_t start)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   int32_t CompareTo(String* other) SHARED_REQUIRES(Locks::mutator_lock_);
 
   CharArray* ToCharArray(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_)
@@ -157,6 +173,28 @@
   void GetChars(int32_t start, int32_t end, Handle<CharArray> array, int32_t index)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  bool IsCompressed() SHARED_REQUIRES(Locks::mutator_lock_) {
+    return kUseStringCompression && GetCompressionFlagFromCount(GetCount());
+  }
+
+  bool IsValueNull() SHARED_REQUIRES(Locks::mutator_lock_);
+
+  template<typename MemoryType>
+  static bool AllASCII(const MemoryType* const chars, const int length);
+
+  ALWAYS_INLINE static bool GetCompressionFlagFromCount(const int32_t count) {
+    return kUseStringCompression && ((count & (1u << 31)) != 0);
+  }
+
+  ALWAYS_INLINE static int32_t GetLengthFromCount(const int32_t count) {
+    return kUseStringCompression ? (count & INT32_MAX) : count;
+  }
+
+  ALWAYS_INLINE static int32_t GetFlaggedCount(const int32_t count) {
+    return kUseStringCompression ? (count | (1u << 31)) : count;
+  }
+
   static Class* GetJavaLangString() SHARED_REQUIRES(Locks::mutator_lock_) {
     DCHECK(!java_lang_String_.IsNull());
     return java_lang_String_.Read();
@@ -174,12 +212,24 @@
     SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(String, hash_code_), new_hash_code);
   }
 
+  template <bool kIsInstrumented, typename PreFenceVisitor>
+  ALWAYS_INLINE static String* Alloc(Thread* self, int32_t utf16_length_with_flag,
+                                     gc::AllocatorType allocator_type,
+                                     const PreFenceVisitor& pre_fence_visitor)
+      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
+
   // Field order required by test "ValidateFieldOrderOfJavaCppUnionClasses".
+  // The uppermost bit is reserved for the compressed/uncompressed flag:
+  // 0 = uncompressed (16-bit chars), 1 = compressed (8-bit chars).
   int32_t count_;
 
   uint32_t hash_code_;
 
-  uint16_t value_[0];
+  // All-ASCII strings are compressed into 8-bit chars, so only one union member is in use.
+  union {
+    uint16_t value_[0];
+    uint8_t value_compressed_[0];
+  };
 
   static GcRoot<Class> java_lang_String_;
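The helpers above pack the compression flag into bit 31 of count_. A minimal standalone replica showing the round trip, with the feature assumed enabled for illustration (kUseStringCompression is still false in this change):

#include <cstdint>
#include <iostream>

constexpr bool kUseStringCompression = true;  // assumed on for this example only

bool GetCompressionFlagFromCount(int32_t count) {
  return kUseStringCompression && ((count & (1u << 31)) != 0);
}
int32_t GetLengthFromCount(int32_t count) {
  return kUseStringCompression ? (count & INT32_MAX) : count;
}
int32_t GetFlaggedCount(int32_t count) {
  return kUseStringCompression ? (count | (1u << 31)) : count;
}

int main() {
  const int32_t count = GetFlaggedCount(4);
  std::cout << count << " "                                      // -2147483644 (0x80000004)
            << GetLengthFromCount(count) << " "                  // 4
            << GetCompressionFlagFromCount(count) << std::endl;  // 1 (compressed)
  return 0;
}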
 
diff --git a/runtime/native/java_lang_Class.cc b/runtime/native/java_lang_Class.cc
index 6d5e7c7..d4e54cf 100644
--- a/runtime/native/java_lang_Class.cc
+++ b/runtime/native/java_lang_Class.cc
@@ -198,12 +198,25 @@
   }
   size_t low = 0;
   size_t high = fields->size();
-  const uint16_t* const data = name->GetValue();
+  const bool is_name_compressed = name->IsCompressed();
+  const uint16_t* const data = (is_name_compressed) ? nullptr : name->GetValue();
+  const uint8_t* const data_compressed = (is_name_compressed) ? name->GetValueCompressed()
+                                                              : nullptr;
   const size_t length = name->GetLength();
   while (low < high) {
     auto mid = (low + high) / 2;
     ArtField& field = fields->At(mid);
-    int result = CompareModifiedUtf8ToUtf16AsCodePointValues(field.GetName(), data, length);
+    int result = 0;
+    if (is_name_compressed) {
+      size_t field_length = strlen(field.GetName());
+      size_t min_size = (length < field_length) ? length : field_length;
+      result = memcmp(field.GetName(), data_compressed, min_size);
+      if (result == 0) {
+        result = static_cast<int>(field_length) - static_cast<int>(length);
+      }
+    } else {
+      result = CompareModifiedUtf8ToUtf16AsCodePointValues(field.GetName(), data, length);
+    }
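The memcmp-plus-length-tiebreak above is valid because a compressed name is all-ASCII and ASCII bytes are identical in modified UTF-8. A minimal standalone sketch of that comparison (function name made up):

#include <cstdint>
#include <cstring>
#include <iostream>

// Orders a modified-UTF-8 field name against compressed (8-bit, all-ASCII)
// string data the same way a per-code-point comparison would.
int CompareMutf8ToCompressed(const char* field_name, const uint8_t* name, size_t name_length) {
  const size_t field_length = strlen(field_name);
  const size_t min_size = (name_length < field_length) ? name_length : field_length;
  int result = memcmp(field_name, name, min_size);
  if (result == 0) {
    result = static_cast<int>(field_length) - static_cast<int>(name_length);
  }
  return result;
}

int main() {
  const uint8_t name[] = { 'c', 'o', 'u', 'n', 't' };
  std::cout << CompareMutf8ToCompressed("count", name, 5) << " "    // 0
            << CompareMutf8ToCompressed("counter", name, 5) << " "  // > 0
            << CompareMutf8ToCompressed("cost", name, 5) << "\n";   // < 0 ('s' < 'u')
  return 0;
}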
     // Alternate approach, only a few % faster at the cost of more allocations.
     // int result = field->GetStringName(self, true)->CompareTo(name);
     if (result < 0) {
@@ -636,8 +649,7 @@
   // Invoke the string allocator to return an empty string for the string class.
   if (klass->IsStringClass()) {
     gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-    mirror::SetStringCountVisitor visitor(0);
-    mirror::Object* obj = mirror::String::Alloc<true>(soa.Self(), 0, allocator_type, visitor);
+    mirror::Object* obj = mirror::String::AllocEmptyString<true>(soa.Self(), allocator_type);
     if (UNLIKELY(soa.Self()->IsExceptionPending())) {
       return nullptr;
     } else {
diff --git a/runtime/native/libcore_util_CharsetUtils.cc b/runtime/native/libcore_util_CharsetUtils.cc
index 1216824..64d56f6 100644
--- a/runtime/native/libcore_util_CharsetUtils.cc
+++ b/runtime/native/libcore_util_CharsetUtils.cc
@@ -165,10 +165,9 @@
     return nullptr;
   }
 
-  const jchar* src = &(string->GetValue()[offset]);
   jbyte* dst = &bytes[0];
-  for (int i = length - 1; i >= 0; --i) {
-    jchar ch = *src++;
+  for (int i = 0; i < length; ++i) {
+    jchar ch = string->CharAt(offset + i);
     if (ch > maxValidChar) {
       ch = '?';
     }
diff --git a/runtime/simulator/Android.mk b/runtime/simulator/Android.mk
index a34a841..e39af2d 100644
--- a/runtime/simulator/Android.mk
+++ b/runtime/simulator/Android.mk
@@ -22,6 +22,9 @@
   code_simulator.cc \
   code_simulator_arm64.cc
 
+LIBART_SIMULATOR_CFLAGS := \
+  -DVIXL_INCLUDE_SIMULATOR_AARCH64
+
 # $(1): target or host
 # $(2): ndebug or debug
 define build-libart-simulator
@@ -54,6 +57,7 @@
   LOCAL_MODULE_CLASS := SHARED_LIBRARIES
 
   LOCAL_SRC_FILES := $$(LIBART_SIMULATOR_SRC_FILES)
+  LOCAL_CFLAGS := $$(LIBART_SIMULATOR_CFLAGS)
 
   ifeq ($$(art_target_or_host),target)
     $(call set-target-local-clang-vars)
diff --git a/runtime/stack.h b/runtime/stack.h
index cf33ae1..850d2a4 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -224,7 +224,6 @@
   int64_t GetVRegLong(size_t i) const {
     DCHECK_LT(i, NumberOfVRegs());
     const uint32_t* vreg = &vregs_[i];
-    // Alignment attribute required for GCC 4.8
     typedef const int64_t unaligned_int64 __attribute__ ((aligned (4)));
     return *reinterpret_cast<unaligned_int64*>(vreg);
   }
@@ -232,7 +231,6 @@
   double GetVRegDouble(size_t i) const {
     DCHECK_LT(i, NumberOfVRegs());
     const uint32_t* vreg = &vregs_[i];
-    // Alignment attribute required for GCC 4.8
     typedef const double unaligned_double __attribute__ ((aligned (4)));
     return *reinterpret_cast<unaligned_double*>(vreg);
   }
@@ -289,7 +287,6 @@
   void SetVRegLong(size_t i, int64_t val) {
     DCHECK_LT(i, NumberOfVRegs());
     uint32_t* vreg = &vregs_[i];
-    // Alignment attribute required for GCC 4.8
     typedef int64_t unaligned_int64 __attribute__ ((aligned (4)));
     *reinterpret_cast<unaligned_int64*>(vreg) = val;
     // This is needed for moving collectors since these can update the vreg references if they
@@ -303,7 +300,6 @@
   void SetVRegDouble(size_t i, double val) {
     DCHECK_LT(i, NumberOfVRegs());
     uint32_t* vreg = &vregs_[i];
-    // Alignment attribute required for GCC 4.8
     typedef double unaligned_double __attribute__ ((aligned (4)));
     *reinterpret_cast<unaligned_double*>(vreg) = val;
     // This is needed for moving collectors since these can update the vreg references if they
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 3aa1fc2..216d8a7 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -224,6 +224,7 @@
         thread_to_pass = this;
       }
       MutexLock mu(thread_to_pass, *Locks::thread_suspend_count_lock_);
+      ScopedTransitioningToRunnable scoped_transitioning_to_runnable(this);
       old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
       DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
       while ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index b35a614..79b9f02 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1217,10 +1217,8 @@
   ScopedTrace trace(__FUNCTION__);
   VLOG(threads) << this << " self-suspending";
   // Make thread appear suspended to other threads, release mutator_lock_.
-  tls32_.suspended_at_suspend_check = true;
   // Transition to suspended and back to runnable, re-acquire share on mutator_lock_.
   ScopedThreadSuspension(this, kSuspended);
-  tls32_.suspended_at_suspend_check = false;
   VLOG(threads) << this << " self-reviving";
 }
 
@@ -1433,6 +1431,12 @@
     if (o == nullptr) {
       os << "an unknown object";
     } else {
+      if (kUseReadBarrier && Thread::Current()->GetIsGcMarking()) {
+        // We may call Thread::Dump() in the middle of the CC thread flip and this thread's stack
+        // may have not been flipped yet and "o" may be a from-space (stale) ref, in which case the
+        // IdentityHashCode call below will crash. So explicitly mark/forward it here.
+        o = ReadBarrier::Mark(o);
+      }
       if ((o->GetLockWord(false).GetState() == LockWord::kThinLocked) &&
           Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) {
         // Getting the identity hashcode here would result in lock inflation and suspension of the
@@ -1635,7 +1639,7 @@
   }
   tlsPtr_.flip_function = nullptr;
   tlsPtr_.thread_local_mark_stack = nullptr;
-  tls32_.suspended_at_suspend_check = false;
+  tls32_.is_transitioning_to_runnable = false;
 }
 
 bool Thread::IsStillStarting() const {
@@ -1773,7 +1777,7 @@
   CHECK(tlsPtr_.checkpoint_function == nullptr);
   CHECK_EQ(checkpoint_overflow_.size(), 0u);
   CHECK(tlsPtr_.flip_function == nullptr);
-  CHECK_EQ(tls32_.suspended_at_suspend_check, false);
+  CHECK_EQ(tls32_.is_transitioning_to_runnable, false);
 
   // Make sure we processed all deoptimization requests.
   CHECK(tlsPtr_.deoptimization_context_stack == nullptr) << "Missed deoptimization";
diff --git a/runtime/thread.h b/runtime/thread.h
index 840b781..1c2d4ab 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1085,8 +1085,12 @@
     return tlsPtr_.nested_signal_state;
   }
 
-  bool IsSuspendedAtSuspendCheck() const {
-    return tls32_.suspended_at_suspend_check;
+  bool IsTransitioningToRunnable() const {
+    return tls32_.is_transitioning_to_runnable;
+  }
+
+  void SetIsTransitioningToRunnable(bool value) {
+    tls32_.is_transitioning_to_runnable = value;
   }
 
   void PushVerifier(verifier::MethodVerifier* verifier);
@@ -1264,7 +1268,7 @@
       suspend_count(0), debug_suspend_count(0), thin_lock_thread_id(0), tid(0),
       daemon(is_daemon), throwing_OutOfMemoryError(false), no_thread_suspension(0),
       thread_exit_check_count(0), handling_signal_(false),
-      suspended_at_suspend_check(false), ready_for_debug_invoke(false),
+      is_transitioning_to_runnable(false), ready_for_debug_invoke(false),
       debug_method_entry_(false), is_gc_marking(false), weak_ref_access_enabled(true),
       disable_thread_flip_count(0) {
     }
@@ -1306,10 +1310,10 @@
     // True if signal is being handled by this thread.
     bool32_t handling_signal_;
 
-    // True if the thread is suspended in FullSuspendCheck(). This is
-    // used to distinguish runnable threads that are suspended due to
-    // a normal suspend check from other threads.
-    bool32_t suspended_at_suspend_check;
+    // True if the thread is in TransitionFromSuspendedToRunnable(). This is used to distinguish the
+    // non-runnable threads (e.g. kNative, kWaiting) that are about to transition to runnable from
+    // the rest of them.
+    bool32_t is_transitioning_to_runnable;
 
     // True if the thread has been suspended by a debugger event. This is
     // used to invoke method from the debugger which is only allowed when
@@ -1588,6 +1592,26 @@
   Thread* const self_;
 };
 
+class ScopedTransitioningToRunnable : public ValueObject {
+ public:
+  explicit ScopedTransitioningToRunnable(Thread* self)
+      : self_(self) {
+    DCHECK_EQ(self, Thread::Current());
+    if (kUseReadBarrier) {
+      self_->SetIsTransitioningToRunnable(true);
+    }
+  }
+
+  ~ScopedTransitioningToRunnable() {
+    if (kUseReadBarrier) {
+      self_->SetIsTransitioningToRunnable(false);
+    }
+  }
+
+ private:
+  Thread* const self_;
+};
+
 std::ostream& operator<<(std::ostream& os, const Thread& thread);
 std::ostream& operator<<(std::ostream& os, const StackedShadowFrameType& thread);
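ScopedTransitioningToRunnable is the usual RAII flag guard: set on entry, cleared on every exit path, so FlipThreadRoots can always trust the flag. A minimal standalone sketch of the pattern (names made up; the real class also gates on kUseReadBarrier):

#include <iostream>

// Sets a flag for the lifetime of a scope and clears it on destruction,
// including on early returns and exceptions.
class ScopedFlag {
 public:
  explicit ScopedFlag(bool* flag) : flag_(flag) { *flag_ = true; }
  ~ScopedFlag() { *flag_ = false; }
 private:
  bool* const flag_;
};

int main() {
  bool transitioning = false;
  {
    ScopedFlag guard(&transitioning);
    std::cout << transitioning << "\n";  // 1 while inside the scope
  }
  std::cout << transitioning << "\n";    // 0 once the scope is left
  return 0;
}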
 
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 419ecec..ab1f198 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -60,7 +60,8 @@
 
 // Whether we should try to dump the native stack of unattached threads. See commit ed8b723 for
 // some history.
-static constexpr bool kDumpUnattachedThreadNativeStack = true;
+// Turned off again. b/29248079
+static constexpr bool kDumpUnattachedThreadNativeStack = false;
 
 ThreadList::ThreadList()
     : suspend_all_count_(0),
@@ -405,6 +406,8 @@
   Locks::thread_suspend_count_lock_->AssertNotHeld(self);
   CHECK_NE(self->GetState(), kRunnable);
 
+  collector->GetHeap()->ThreadFlipBegin(self);  // Sync with JNI critical calls.
+
   SuspendAllInternal(self, self, nullptr);
 
   // Run the flip callback for the collector.
@@ -414,26 +417,31 @@
   collector->RegisterPause(NanoTime() - start_time);
 
   // Resume runnable threads.
-  std::vector<Thread*> runnable_threads;
+  size_t runnable_thread_count = 0;
   std::vector<Thread*> other_threads;
   {
+    TimingLogger::ScopedTiming split2("ResumeRunnableThreads", collector->GetTimings());
     MutexLock mu(self, *Locks::thread_list_lock_);
     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
     --suspend_all_count_;
     for (const auto& thread : list_) {
+      // Set the flip function for all threads because Thread::DumpState/DumpJavaStack() (invoked by
+      // a checkpoint) may cause the flip function to be run for a runnable/suspended thread before
+      // a runnable thread runs it for itself or we run it for a suspended thread below.
+      thread->SetFlipFunction(thread_flip_visitor);
       if (thread == self) {
         continue;
       }
-      // Set the flip function for both runnable and suspended threads
-      // because Thread::DumpState/DumpJavaStack() (invoked by a
-      // checkpoint) may cause the flip function to be run for a
-      // runnable/suspended thread before a runnable threads runs it
-      // for itself or we run it for a suspended thread below.
-      thread->SetFlipFunction(thread_flip_visitor);
-      if (thread->IsSuspendedAtSuspendCheck()) {
+      // Resume early the threads that were runnable but are suspended just for this thread flip or
+      // about to transition from non-runnable (e.g. kNative at the SOA entry in a JNI function) to
+      // runnable (both cases waiting inside Thread::TransitionFromSuspendedToRunnable), or waiting
+      // for the thread flip to end at the JNI critical section entry (kWaitingForGcThreadFlip).
+      ThreadState state = thread->GetState();
+      if (state == kWaitingForGcThreadFlip ||
+          thread->IsTransitioningToRunnable()) {
         // The thread will resume right after the broadcast.
         thread->ModifySuspendCount(self, -1, nullptr, false);
-        runnable_threads.push_back(thread);
+        ++runnable_thread_count;
       } else {
         other_threads.push_back(thread);
       }
@@ -441,8 +449,11 @@
     Thread::resume_cond_->Broadcast(self);
   }
 
+  collector->GetHeap()->ThreadFlipEnd(self);
+
   // Run the closure on the other threads and let them resume.
   {
+    TimingLogger::ScopedTiming split3("FlipOtherThreads", collector->GetTimings());
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
     for (const auto& thread : other_threads) {
       Closure* flip_func = thread->GetFlipFunction();
@@ -451,11 +462,15 @@
       }
     }
     // Run it for self.
-    thread_flip_visitor->Run(self);
+    Closure* flip_func = self->GetFlipFunction();
+    if (flip_func != nullptr) {
+      flip_func->Run(self);
+    }
   }
 
   // Resume other threads.
   {
+    TimingLogger::ScopedTiming split4("ResumeOtherThreads", collector->GetTimings());
     MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
     for (const auto& thread : other_threads) {
       thread->ModifySuspendCount(self, -1, nullptr, false);
@@ -463,7 +478,7 @@
     Thread::resume_cond_->Broadcast(self);
   }
 
-  return runnable_threads.size() + other_threads.size() + 1;  // +1 for self.
+  return runnable_thread_count + other_threads.size() + 1;  // +1 for self.
 }
 
 void ThreadList::SuspendAll(const char* cause, bool long_suspend) {
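With this hunk the per-thread decision in FlipThreadRoots() keys on the thread state and the new flag instead of IsSuspendedAtSuspendCheck(). Reduced to its shape (a sketch; no such standalone helper exists in the patch):

    // Threads blocked at the JNI critical-section entry (kWaitingForGcThreadFlip)
    // or parked inside Thread::TransitionFromSuspendedToRunnable() have their
    // suspend count dropped right away and are merely counted; every other thread
    // goes to other_threads and has the flip closure run on its behalf.
    bool ResumeEarly(Thread* thread) {
      return thread->GetState() == kWaitingForGcThreadFlip ||
             thread->IsTransitioningToRunnable();
    }
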
diff --git a/runtime/utf.cc b/runtime/utf.cc
index 5e9fdf7..7e06482 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -170,14 +170,6 @@
   }
 }
 
-int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count) {
-  uint32_t hash = 0;
-  while (char_count--) {
-    hash = hash * 31 + *chars++;
-  }
-  return static_cast<int32_t>(hash);
-}
-
 int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
   uint32_t hash = 0;
   while (utf16_length != 0u) {
diff --git a/runtime/utf.h b/runtime/utf.h
index 27d2fd5..7c9c333 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h
@@ -82,7 +82,16 @@
  */
 int32_t ComputeUtf16Hash(mirror::CharArray* chars, int32_t offset, size_t char_count)
     SHARED_REQUIRES(Locks::mutator_lock_);
-int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count);
+
+template<typename MemoryType>
+int32_t ComputeUtf16Hash(const MemoryType* chars, size_t char_count) {
+  uint32_t hash = 0;
+  while (char_count--) {
+    hash = hash * 31 + *chars++;
+  }
+  return static_cast<int32_t>(hash);
+}
+
 int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length);
 
 // Compute a hash code of a modified UTF-8 string. Not the standard java hash since it returns a
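The out-of-line uint16_t overload removed from utf.cc above now lives here as a template over the backing character type, so 8-bit and 16-bit data share one definition. A minimal usage sketch, assuming runtime/utf.h and <cstdint> are included:

    // For ASCII content the two instantiations produce the same 31-based
    // rolling hash, since the code-unit values are equal.
    const uint16_t utf16[]  = { 'h', 'i' };
    const uint8_t  latin1[] = { 'h', 'i' };
    int32_t h1 = ComputeUtf16Hash(utf16, 2);   // MemoryType = uint16_t
    int32_t h2 = ComputeUtf16Hash(latin1, 2);  // MemoryType = uint8_t, h2 == h1
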
diff --git a/runtime/utils.h b/runtime/utils.h
index 84079e2..693e0b8 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -380,21 +380,7 @@
 NO_RETURN void SleepForever();
 
 inline void FlushInstructionCache(char* begin, char* end) {
-  // Only use __builtin___clear_cache with Clang or with GCC >= 4.3.0
-  // (__builtin___clear_cache was introduced in GCC 4.3.0).
-#if defined(__clang__) || GCC_VERSION >= 40300
   __builtin___clear_cache(begin, end);
-#else
-  // Only warn on non-Intel platforms, as x86 and x86-64 do not need
-  // cache flush instructions, as long as the "code uses the same
-  // linear address for modifying and fetching the instruction". See
-  // "Intel(R) 64 and IA-32 Architectures Software Developer's Manual
-  // Volume 3A: System Programming Guide, Part 1", section 11.6
-  // "Self-Modifying Code".
-#if !defined(__i386__) && !defined(__x86_64__)
-  UNIMPLEMENTED(WARNING) << "cache flush";
-#endif
-#endif
 }
 
 }  // namespace art
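Dropping the version guard assumes every supported toolchain (Clang, or GCC 4.3 and later) provides __builtin___clear_cache, which matches the condition the old code was checking. A hedged sketch of a typical call site; the function name is made up and <cstring> is assumed:

    // Flush a freshly written code region so instruction fetch sees the new
    // bytes. Required on ARM/ARM64; effectively a no-op on x86/x86-64, whose
    // instruction caches are coherent for same-linear-address writes.
    void WriteCodeAndFlush(char* code, const char* bytes, size_t size) {
      std::memcpy(code, bytes, size);
      FlushInstructionCache(code, code + size);
    }
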
diff --git a/test/020-string/expected.txt b/test/020-string/expected.txt
index 76b8929..83a0835 100644
--- a/test/020-string/expected.txt
+++ b/test/020-string/expected.txt
@@ -1,6 +1,6 @@
 testStr is 'This is a very nice string'
 This is a very nice string
-Compare result is 32
+Compare result is greater than zero
 Compare unicode: -65302
 Got expected exception
 subStr is 'uick brown fox jumps over the lazy '
diff --git a/test/020-string/src/Main.java b/test/020-string/src/Main.java
index 7108082..ccf94aa 100644
--- a/test/020-string/src/Main.java
+++ b/test/020-string/src/Main.java
@@ -45,7 +45,14 @@
         if (testStr.length() != testStr2.length())
             System.out.println("WARNING: stringTest length mismatch");
 
-        System.out.println("Compare result is " + testStr.compareTo(testStr2));
+        int compareResult = testStr.compareTo(testStr2);
+        if (compareResult > 0) {
+          System.out.println("Compare result is greater than zero");
+        } else if (compareResult == 0) {
+          System.out.println("Compare result is equal to zero");
+        } else {
+          System.out.println("Compare result is less than zero");
+        }
 
         // expected: -65302
         String s1 = "\u0c6d\u0cb6\u0d00\u0000\u0080\u0080\u0080\u0000\u0002\u0002\u0002\u0000\u00e9\u00e9\u00e9";
diff --git a/test/031-class-attributes/expected.txt b/test/031-class-attributes/expected.txt
index de99872..72656ae 100644
--- a/test/031-class-attributes/expected.txt
+++ b/test/031-class-attributes/expected.txt
@@ -84,7 +84,7 @@
   enclosingCon: null
   enclosingMeth: null
   modifiers: 1
-  package: package otherpackage
+  package: package otherpackage, Unknown, version 0.0
   declaredClasses: [0]
   member classes: [0]
   isAnnotation: false
diff --git a/test/439-npe/expected.txt b/test/439-npe/expected.txt
index 271d40d..34855ee 100644
--- a/test/439-npe/expected.txt
+++ b/test/439-npe/expected.txt
@@ -1,18 +1,54 @@
-$opt$setObjectField
-$opt$setIntField
-$opt$setFloatField
-$opt$setLongField
-$opt$setDoubleField
-$opt$setByteField
-$opt$setBooleanField
-$opt$setCharField
-$opt$setShortField
-$opt$getObjectField
-$opt$getIntField
-$opt$getFloatField
-$opt$getLongField
-$opt$getDoubleField
-$opt$getByteField
-$opt$getBooleanField
-$opt$getCharField
-$opt$getShortField
+$opt$noinline$setObjectField
+$opt$noinline$setIntField
+$opt$noinline$setFloatField
+$opt$noinline$setLongField
+$opt$noinline$setDoubleField
+$opt$noinline$setByteField
+$opt$noinline$setBooleanField
+$opt$noinline$setCharField
+$opt$noinline$setShortField
+$opt$noinline$getObjectField
+$opt$noinline$getIntField
+$opt$noinline$getFloatField
+$opt$noinline$getLongField
+$opt$noinline$getDoubleField
+$opt$noinline$getByteField
+$opt$noinline$getBooleanField
+$opt$noinline$getCharField
+$opt$noinline$getShortField
+$opt$noinline$setVolatileObjectField
+$opt$noinline$setVolatileIntField
+$opt$noinline$setVolatileFloatField
+$opt$noinline$setVolatileLongField
+$opt$noinline$setVolatileDoubleField
+$opt$noinline$setVolatileByteField
+$opt$noinline$setVolatileBooleanField
+$opt$noinline$setVolatileCharField
+$opt$noinline$setVolatileShortField
+$opt$noinline$getVolatileObjectField
+$opt$noinline$getVolatileIntField
+$opt$noinline$getVolatileFloatField
+$opt$noinline$getVolatileLongField
+$opt$noinline$getVolatileDoubleField
+$opt$noinline$getVolatileByteField
+$opt$noinline$getVolatileBooleanField
+$opt$noinline$getVolatileCharField
+$opt$noinline$getVolatileShortField
+$opt$noinline$setObjectElement
+$opt$noinline$setIntElement
+$opt$noinline$setFloatElement
+$opt$noinline$setLongElement
+$opt$noinline$setDoubleElement
+$opt$noinline$setByteElement
+$opt$noinline$setBooleanElement
+$opt$noinline$setCharElement
+$opt$noinline$setShortElement
+$opt$noinline$getObjectElement
+$opt$noinline$getIntElement
+$opt$noinline$getFloatElement
+$opt$noinline$getLongElement
+$opt$noinline$getDoubleElement
+$opt$noinline$getByteElement
+$opt$noinline$getBooleanElement
+$opt$noinline$getCharElement
+$opt$noinline$getShortElement
diff --git a/test/439-npe/src/Main.java b/test/439-npe/src/Main.java
index 40c2645..8f66da0 100644
--- a/test/439-npe/src/Main.java
+++ b/test/439-npe/src/Main.java
@@ -15,199 +15,624 @@
  */
 
 public class Main {
+  public static boolean doThrow = false;
 
-  private volatile Object objectField;
-  private volatile int intField;
-  private volatile float floatField;
-  private volatile long longField;
-  private volatile double doubleField;
-  private volatile byte byteField;
-  private volatile boolean booleanField;
-  private volatile char charField;
-  private volatile short shortField;
+  private Object objectField;
+  private int intField;
+  private float floatField;
+  private long longField;
+  private double doubleField;
+  private byte byteField;
+  private boolean booleanField;
+  private char charField;
+  private short shortField;
 
-  public static void $opt$setObjectField(Main m) {
+  private volatile Object volatileObjectField;
+  private volatile int volatileIntField;
+  private volatile float volatileFloatField;
+  private volatile long volatileLongField;
+  private volatile double volatileDoubleField;
+  private volatile byte volatileByteField;
+  private volatile boolean volatileBooleanField;
+  private volatile char volatileCharField;
+  private volatile short volatileShortField;
+
+  public static void $opt$noinline$setObjectField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.objectField = null;
   }
 
-  public static void $opt$setIntField(Main m) {
+  public static void $opt$noinline$setIntField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.intField = 0;
   }
 
-  public static void $opt$setFloatField(Main m) {
+  public static void $opt$noinline$setFloatField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.floatField = 0;
   }
 
-  public static void $opt$setLongField(Main m) {
+  public static void $opt$noinline$setLongField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.longField = 0;
   }
 
-  public static void $opt$setDoubleField(Main m) {
+  public static void $opt$noinline$setDoubleField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.doubleField = 0;
   }
 
-  public static void $opt$setByteField(Main m) {
+  public static void $opt$noinline$setByteField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.byteField = 0;
   }
 
-  public static void $opt$setBooleanField(Main m) {
+  public static void $opt$noinline$setBooleanField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.booleanField = false;
   }
 
-  public static void $opt$setCharField(Main m) {
+  public static void $opt$noinline$setCharField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.charField = 0;
   }
 
-  public static void $opt$setShortField(Main m) {
+  public static void $opt$noinline$setShortField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.shortField = 0;
   }
 
-  public static Object $opt$getObjectField(Main m) {
+  public static Object $opt$noinline$getObjectField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.objectField;
   }
 
-  public static int $opt$getIntField(Main m) {
+  public static int $opt$noinline$getIntField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.intField;
   }
 
-  public static float $opt$getFloatField(Main m) {
+  public static float $opt$noinline$getFloatField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.floatField;
   }
 
-  public static long $opt$getLongField(Main m) {
+  public static long $opt$noinline$getLongField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.longField;
   }
 
-  public static double $opt$getDoubleField(Main m) {
+  public static double $opt$noinline$getDoubleField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.doubleField;
   }
 
-  public static byte $opt$getByteField(Main m) {
+  public static byte $opt$noinline$getByteField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.byteField;
   }
 
-  public static boolean $opt$getBooleanField(Main m) {
+  public static boolean $opt$noinline$getBooleanField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.booleanField;
   }
 
-  public static char $opt$getCharField(Main m) {
+  public static char $opt$noinline$getCharField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.charField;
   }
 
-  public static short $opt$getShortField(Main m) {
+  public static short $opt$noinline$getShortField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.shortField;
   }
 
+  public static void $opt$noinline$setVolatileObjectField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileObjectField = null;
+  }
+
+  public static void $opt$noinline$setVolatileIntField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileIntField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileFloatField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileFloatField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileLongField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileLongField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileDoubleField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileDoubleField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileByteField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileByteField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileBooleanField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileBooleanField = false;
+  }
+
+  public static void $opt$noinline$setVolatileCharField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileCharField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileShortField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileShortField = 0;
+  }
+
+  public static Object $opt$noinline$getVolatileObjectField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileObjectField;
+  }
+
+  public static int $opt$noinline$getVolatileIntField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileIntField;
+  }
+
+  public static float $opt$noinline$getVolatileFloatField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileFloatField;
+  }
+
+  public static long $opt$noinline$getVolatileLongField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileLongField;
+  }
+
+  public static double $opt$noinline$getVolatileDoubleField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileDoubleField;
+  }
+
+  public static byte $opt$noinline$getVolatileByteField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileByteField;
+  }
+
+  public static boolean $opt$noinline$getVolatileBooleanField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileBooleanField;
+  }
+
+  public static char $opt$noinline$getVolatileCharField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileCharField;
+  }
+
+  public static short $opt$noinline$getVolatileShortField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileShortField;
+  }
+
+  public static void $opt$noinline$setObjectElement(Object[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = null;
+  }
+
+  public static void $opt$noinline$setIntElement(int[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setFloatElement(float[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setLongElement(long[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setDoubleElement(double[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setByteElement(byte[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setBooleanElement(boolean[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = false;
+  }
+
+  public static void $opt$noinline$setCharElement(char[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setShortElement(short[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static Object $opt$noinline$getObjectElement(Object[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static int $opt$noinline$getIntElement(int[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static float $opt$noinline$getFloatElement(float[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static long $opt$noinline$getLongElement(long[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static double $opt$noinline$getDoubleElement(double[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static byte $opt$noinline$getByteElement(byte[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static boolean $opt$noinline$getBooleanElement(boolean[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static char $opt$noinline$getCharElement(char[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static short $opt$noinline$getShortElement(short[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
   public static void main(String[] args) {
-    int methodLine = 30;
-    int thisLine = 103;
+    int methodLine = 42;
+    int thisLine = 312;
     try {
-      $opt$setObjectField(null);
+      $opt$noinline$setObjectField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 2, methodLine, "$opt$setObjectField");
+      check(npe, thisLine += 2, methodLine, "$opt$noinline$setObjectField");
     }
     try {
-      $opt$setIntField(null);
+      $opt$noinline$setIntField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setIntField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setIntField");
     }
     try {
-      $opt$setFloatField(null);
+      $opt$noinline$setFloatField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setFloatField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setFloatField");
     }
     try {
-      $opt$setLongField(null);
+      $opt$noinline$setLongField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setLongField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setLongField");
     }
     try {
-      $opt$setDoubleField(null);
+      $opt$noinline$setDoubleField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setDoubleField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setDoubleField");
     }
     try {
-      $opt$setByteField(null);
+      $opt$noinline$setByteField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setByteField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setByteField");
     }
     try {
-      $opt$setBooleanField(null);
+      $opt$noinline$setBooleanField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setBooleanField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setBooleanField");
     }
     try {
-      $opt$setCharField(null);
+      $opt$noinline$setCharField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setCharField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setCharField");
     }
     try {
-      $opt$setShortField(null);
+      $opt$noinline$setShortField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setShortField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setShortField");
     }
     try {
-      $opt$getObjectField(null);
+      $opt$noinline$getObjectField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getObjectField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getObjectField");
     }
     try {
-      $opt$getIntField(null);
+      $opt$noinline$getIntField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getIntField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getIntField");
     }
     try {
-      $opt$getFloatField(null);
+      $opt$noinline$getFloatField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getFloatField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getFloatField");
     }
     try {
-      $opt$getLongField(null);
+      $opt$noinline$getLongField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getLongField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getLongField");
     }
     try {
-      $opt$getDoubleField(null);
+      $opt$noinline$getDoubleField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getDoubleField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getDoubleField");
     }
     try {
-      $opt$getByteField(null);
+      $opt$noinline$getByteField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getByteField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getByteField");
     }
     try {
-      $opt$getBooleanField(null);
+      $opt$noinline$getBooleanField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getBooleanField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getBooleanField");
     }
     try {
-      $opt$getCharField(null);
+      $opt$noinline$getCharField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getCharField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getCharField");
     }
     try {
-      $opt$getShortField(null);
+      $opt$noinline$getShortField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getShortField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getShortField");
+    }
+    try {
+      $opt$noinline$setVolatileObjectField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileObjectField");
+    }
+    try {
+      $opt$noinline$setVolatileIntField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileIntField");
+    }
+    try {
+      $opt$noinline$setVolatileFloatField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileFloatField");
+    }
+    try {
+      $opt$noinline$setVolatileLongField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileLongField");
+    }
+    try {
+      $opt$noinline$setVolatileDoubleField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileDoubleField");
+    }
+    try {
+      $opt$noinline$setVolatileByteField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileByteField");
+    }
+    try {
+      $opt$noinline$setVolatileBooleanField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileBooleanField");
+    }
+    try {
+      $opt$noinline$setVolatileCharField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileCharField");
+    }
+    try {
+      $opt$noinline$setVolatileShortField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileShortField");
+    }
+    try {
+      $opt$noinline$getVolatileObjectField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileObjectField");
+    }
+    try {
+      $opt$noinline$getVolatileIntField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileIntField");
+    }
+    try {
+      $opt$noinline$getVolatileFloatField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileFloatField");
+    }
+    try {
+      $opt$noinline$getVolatileLongField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileLongField");
+    }
+    try {
+      $opt$noinline$getVolatileDoubleField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileDoubleField");
+    }
+    try {
+      $opt$noinline$getVolatileByteField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileByteField");
+    }
+    try {
+      $opt$noinline$getVolatileBooleanField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileBooleanField");
+    }
+    try {
+      $opt$noinline$getVolatileCharField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileCharField");
+    }
+    try {
+      $opt$noinline$getVolatileShortField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileShortField");
+    }
+    try {
+      $opt$noinline$setObjectElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setObjectElement");
+    }
+    try {
+      $opt$noinline$setIntElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setIntElement");
+    }
+    try {
+      $opt$noinline$setFloatElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setFloatElement");
+    }
+    try {
+      $opt$noinline$setLongElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setLongElement");
+    }
+    try {
+      $opt$noinline$setDoubleElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setDoubleElement");
+    }
+    try {
+      $opt$noinline$setByteElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setByteElement");
+    }
+    try {
+      $opt$noinline$setBooleanElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setBooleanElement");
+    }
+    try {
+      $opt$noinline$setCharElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setCharElement");
+    }
+    try {
+      $opt$noinline$setShortElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setShortElement");
+    }
+    try {
+      $opt$noinline$getObjectElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getObjectElement");
+    }
+    try {
+      $opt$noinline$getIntElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getIntElement");
+    }
+    try {
+      $opt$noinline$getFloatElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getFloatElement");
+    }
+    try {
+      $opt$noinline$getLongElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getLongElement");
+    }
+    try {
+      $opt$noinline$getDoubleElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getDoubleElement");
+    }
+    try {
+      $opt$noinline$getByteElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getByteElement");
+    }
+    try {
+      $opt$noinline$getBooleanElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getBooleanElement");
+    }
+    try {
+      $opt$noinline$getCharElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getCharElement");
+    }
+    try {
+      $opt$noinline$getShortElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getShortElement");
     }
   }
 
diff --git a/test/537-checker-arraycopy/src/Main.java b/test/537-checker-arraycopy/src/Main.java
index 7c124ca..95a11ca 100644
--- a/test/537-checker-arraycopy/src/Main.java
+++ b/test/537-checker-arraycopy/src/Main.java
@@ -51,10 +51,10 @@
 
   /// CHECK-START-X86_64: void Main.arraycopy() disassembly (after)
   /// CHECK:          InvokeStaticOrDirect intrinsic:SystemArrayCopy
-  /// CHECK-NOT:      test
+  /// CHECK-NOT:      test {{^[^\[].*}}, {{^[^\[].*}}
   /// CHECK-NOT:      call
   /// CHECK:          ReturnVoid
-  // Checks that the call is intrinsified and that there is no test instruction
+  // Checks that the call is intrinsified and that there is no register test instruction
   // when we know the source and destination are not null.
   public static void arraycopy() {
     Object[] obj = new Object[4];
diff --git a/test/551-implicit-null-checks/expected.txt b/test/551-implicit-null-checks/expected.txt
index e69de29..49b3771 100644
--- a/test/551-implicit-null-checks/expected.txt
+++ b/test/551-implicit-null-checks/expected.txt
@@ -0,0 +1,4 @@
+NPE from GetLong
+NPE from PutLong
+NPE from GetDouble
+NPE from PutDouble
diff --git a/test/551-implicit-null-checks/info.txt b/test/551-implicit-null-checks/info.txt
index bdd066b..bd3ecfd 100644
--- a/test/551-implicit-null-checks/info.txt
+++ b/test/551-implicit-null-checks/info.txt
@@ -1 +1 @@
-Test that implicit null checks are recorded correctly for longs.
\ No newline at end of file
+Test that implicit null checks are recorded correctly for longs and doubles.
diff --git a/test/551-implicit-null-checks/src/Main.java b/test/551-implicit-null-checks/src/Main.java
index 677e8d3..3586a29 100644
--- a/test/551-implicit-null-checks/src/Main.java
+++ b/test/551-implicit-null-checks/src/Main.java
@@ -18,6 +18,7 @@
 
   private class Inner {
     private long i1;
+    private double i2;
   }
   private Inner inst;
 
@@ -26,12 +27,22 @@
     try {
       m.$opt$noinline$testGetLong();
     } catch (NullPointerException ex) {
-      // good
+      System.out.println("NPE from GetLong");
     }
     try {
       m.$opt$noinline$testPutLong(778899112233L);
     } catch (NullPointerException ex) {
-      // good
+      System.out.println("NPE from PutLong");
+    }
+    try {
+      m.$opt$noinline$testGetDouble();
+    } catch (NullPointerException ex) {
+      System.out.println("NPE from GetDouble");
+    }
+    try {
+      m.$opt$noinline$testPutDouble(1.0);
+    } catch (NullPointerException ex) {
+      System.out.println("NPE from PutDouble");
     }
   }
 
@@ -44,4 +55,14 @@
     inst.i1 = a;
     throw new Exception();  // prevent inline
   }
+
+  public void $opt$noinline$testGetDouble() throws Exception {
+    double result = inst.i2;
+    throw new Exception();  // prevent inline
+  }
+
+  public void $opt$noinline$testPutDouble(double a) throws Exception {
+    inst.i2 = a;
+    throw new Exception();  // prevent inline
+  }
 }
diff --git a/test/615-checker-arm64-zr-parallel-move/expected.txt b/test/615-checker-arm64-store-zero/expected.txt
similarity index 100%
rename from test/615-checker-arm64-zr-parallel-move/expected.txt
rename to test/615-checker-arm64-store-zero/expected.txt
diff --git a/test/615-checker-arm64-store-zero/info.txt b/test/615-checker-arm64-store-zero/info.txt
new file mode 100644
index 0000000..ac88eee
--- /dev/null
+++ b/test/615-checker-arm64-store-zero/info.txt
@@ -0,0 +1 @@
+Checker test to verify we correctly use wzr and xzr to store zero constants.
diff --git a/test/615-checker-arm64-store-zero/src/Main.java b/test/615-checker-arm64-store-zero/src/Main.java
new file mode 100644
index 0000000..c8ceb94
--- /dev/null
+++ b/test/615-checker-arm64-store-zero/src/Main.java
@@ -0,0 +1,472 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static boolean doThrow = false;
+
+  public void $noinline$foo(int in_w1,
+                            int in_w2,
+                            int in_w3,
+                            int in_w4,
+                            int in_w5,
+                            int in_w6,
+                            int in_w7,
+                            int on_stack_int,
+                            long on_stack_long,
+                            float in_s0,
+                            float in_s1,
+                            float in_s2,
+                            float in_s3,
+                            float in_s4,
+                            float in_s5,
+                            float in_s6,
+                            float in_s7,
+                            float on_stack_float,
+                            double on_stack_double) {
+    if (doThrow) throw new Error();
+  }
+
+  // We expect a parallel move that moves the zero constant four times to stack locations.
+  /// CHECK-START-ARM64: void Main.bar() register (after)
+  /// CHECK:             ParallelMove {{.*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*}}
+
+  // Those four moves should generate four 'store' instructions that use the zero register directly.
+  /// CHECK-START-ARM64: void Main.bar() disassembly (after)
+  /// CHECK-DAG:         {{(str|stur)}} wzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} xzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} wzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} xzr, [sp, #{{[0-9]+}}]
+
+  public void bar() {
+    $noinline$foo(1, 2, 3, 4, 5, 6, 7,     // Integral values in registers.
+                  0, 0L,                   // Integral values on the stack.
+                  1, 2, 3, 4, 5, 6, 7, 8,  // Floating-point values in registers.
+                  0.0f, 0.0);              // Floating-point values on the stack.
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_static_byte_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        strb wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public static byte static_byte_field;
+
+  public void store_zero_to_static_byte_field() {
+    static_byte_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_static_char_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        strh wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public static char static_char_field;
+
+  public void store_zero_to_static_char_field() {
+    static_char_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_static_short_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        strh wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public static short static_short_field;
+
+  public void store_zero_to_static_short_field() {
+    static_short_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_static_int_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        str wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public static int static_int_field;
+
+  public void store_zero_to_static_int_field() {
+    static_int_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_static_long_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        str xzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public static long static_long_field;
+
+  public void store_zero_to_static_long_field() {
+    static_long_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_static_float_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        str wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public static float static_float_field;
+
+  public void store_zero_to_static_float_field() {
+    static_float_field = 0.0f;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_static_double_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        str xzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public static double static_double_field;
+
+  public void store_zero_to_static_double_field() {
+    static_double_field = 0.0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_static_byte_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlrb wzr, [<<temp>>]
+
+  public static volatile byte volatile_static_byte_field;
+
+  public void store_zero_to_volatile_static_byte_field() {
+    volatile_static_byte_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_static_char_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlrh wzr, [<<temp>>]
+
+  public static volatile char volatile_static_char_field;
+
+  public void store_zero_to_volatile_static_char_field() {
+    volatile_static_char_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_static_short_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlrh wzr, [<<temp>>]
+
+  public static volatile short volatile_static_short_field;
+
+  public void store_zero_to_volatile_static_short_field() {
+    volatile_static_short_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_static_int_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlr wzr, [<<temp>>]
+
+  public static volatile int volatile_static_int_field;
+
+  public void store_zero_to_volatile_static_int_field() {
+    volatile_static_int_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_static_long_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlr xzr, [<<temp>>]
+
+  public static volatile long volatile_static_long_field;
+
+  public void store_zero_to_volatile_static_long_field() {
+    volatile_static_long_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_static_float_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlr wzr, [<<temp>>]
+
+  public static volatile float volatile_static_float_field;
+
+  public void store_zero_to_volatile_static_float_field() {
+    volatile_static_float_field = 0.0f;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_static_double_field() disassembly (after)
+  /// CHECK:             StaticFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlr xzr, [<<temp>>]
+
+  public static volatile double volatile_static_double_field;
+
+  public void store_zero_to_volatile_static_double_field() {
+    volatile_static_double_field = 0.0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_instance_byte_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        strb wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public byte instance_byte_field;
+
+  public void store_zero_to_instance_byte_field() {
+    instance_byte_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_instance_char_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        strh wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public char instance_char_field;
+
+  public void store_zero_to_instance_char_field() {
+    instance_char_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_instance_short_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        strh wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public short instance_short_field;
+
+  public void store_zero_to_instance_short_field() {
+    instance_short_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_instance_int_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        str wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public int instance_int_field;
+
+  public void store_zero_to_instance_int_field() {
+    instance_int_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_instance_long_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        str xzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public long instance_long_field;
+
+  public void store_zero_to_instance_long_field() {
+    instance_long_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_instance_float_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        str wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public float instance_float_field;
+
+  public void store_zero_to_instance_float_field() {
+    instance_float_field = 0.0f;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_instance_double_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        str xzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  public double instance_double_field;
+
+  public void store_zero_to_instance_double_field() {
+    instance_double_field = 0.0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_instance_byte_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlrb wzr, [<<temp>>]
+
+  public volatile byte volatile_instance_byte_field;
+
+  public void store_zero_to_volatile_instance_byte_field() {
+    volatile_instance_byte_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_instance_char_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlrh wzr, [<<temp>>]
+
+  public volatile char volatile_instance_char_field;
+
+  public void store_zero_to_volatile_instance_char_field() {
+    volatile_instance_char_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_instance_short_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlrh wzr, [<<temp>>]
+
+  public volatile short volatile_instance_short_field;
+
+  public void store_zero_to_volatile_instance_short_field() {
+    volatile_instance_short_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_instance_int_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlr wzr, [<<temp>>]
+
+  public volatile int volatile_instance_int_field;
+
+  public void store_zero_to_volatile_instance_int_field() {
+    volatile_instance_int_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_instance_long_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlr xzr, [<<temp>>]
+
+  public volatile long volatile_instance_long_field;
+
+  public void store_zero_to_volatile_instance_long_field() {
+    volatile_instance_long_field = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_instance_float_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlr wzr, [<<temp>>]
+
+  public volatile float volatile_instance_float_field;
+
+  public void store_zero_to_volatile_instance_float_field() {
+    volatile_instance_float_field = 0.0f;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_volatile_instance_double_field() disassembly (after)
+  /// CHECK:             InstanceFieldSet
+  /// CHECK-NEXT:        add <<temp:x[0-9]+>>, x{{[0-9]+}}, #0x{{[0-9a-fA-F]+}}
+  /// CHECK-NEXT:        stlr xzr, [<<temp>>]
+
+  public volatile double volatile_instance_double_field;
+
+  public void store_zero_to_volatile_instance_double_field() {
+    volatile_instance_double_field = 0.0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_array_byte() disassembly (after)
+  /// CHECK:             ArraySet
+  /// CHECK-NEXT:        strb wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  byte array_byte[];
+
+  public void store_zero_to_array_byte() {
+    array_byte[0] = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_array_char() disassembly (after)
+  /// CHECK:             ArraySet
+  /// CHECK-NEXT:        strh wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  char array_char[];
+
+  public void store_zero_to_array_char() {
+    array_char[0] = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_array_short() disassembly (after)
+  /// CHECK:             ArraySet
+  /// CHECK-NEXT:        strh wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  short array_short[];
+
+  public void store_zero_to_array_short() {
+    array_short[0] = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_array_int() disassembly (after)
+  /// CHECK:             ArraySet
+  /// CHECK-NEXT:        str wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  int array_int[];
+
+  public void store_zero_to_array_int() {
+    array_int[0] = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_array_long() disassembly (after)
+  /// CHECK:             ArraySet
+  /// CHECK-NEXT:        str xzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  long array_long[];
+
+  public void store_zero_to_array_long() {
+    array_long[0] = 0;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_array_float() disassembly (after)
+  /// CHECK:             ArraySet
+  /// CHECK-NEXT:        str wzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  float array_float[];
+
+  public void store_zero_to_array_float() {
+    array_float[0] = 0.0f;
+  }
+
+  /// CHECK-START-ARM64: void Main.store_zero_to_array_double() disassembly (after)
+  /// CHECK:             ArraySet
+  /// CHECK-NEXT:        str xzr, [x{{[0-9]+}}, #{{[0-9]+}}]
+
+  double array_double[];
+
+  public void store_zero_to_array_double() {
+    array_double[0] = 0.0;
+  }
+
+  public static void main(String args[]) {
+    Main obj = new Main();
+    obj.array_byte = new byte[1];
+    obj.array_char = new char[1];
+    obj.array_short = new short[1];
+    obj.array_int = new int[1];
+    obj.array_long = new long[1];
+    obj.array_float = new float[1];
+    obj.array_double = new double[1];
+
+    obj.bar();
+    obj.store_zero_to_static_byte_field();
+    obj.store_zero_to_static_char_field();
+    obj.store_zero_to_static_short_field();
+    obj.store_zero_to_static_int_field();
+    obj.store_zero_to_static_long_field();
+    obj.store_zero_to_static_float_field();
+    obj.store_zero_to_static_double_field();
+    obj.store_zero_to_volatile_static_byte_field();
+    obj.store_zero_to_volatile_static_char_field();
+    obj.store_zero_to_volatile_static_short_field();
+    obj.store_zero_to_volatile_static_int_field();
+    obj.store_zero_to_volatile_static_long_field();
+    obj.store_zero_to_volatile_static_float_field();
+    obj.store_zero_to_volatile_static_double_field();
+    obj.store_zero_to_instance_byte_field();
+    obj.store_zero_to_instance_char_field();
+    obj.store_zero_to_instance_short_field();
+    obj.store_zero_to_instance_int_field();
+    obj.store_zero_to_instance_long_field();
+    obj.store_zero_to_instance_float_field();
+    obj.store_zero_to_instance_double_field();
+    obj.store_zero_to_volatile_instance_byte_field();
+    obj.store_zero_to_volatile_instance_char_field();
+    obj.store_zero_to_volatile_instance_short_field();
+    obj.store_zero_to_volatile_instance_int_field();
+    obj.store_zero_to_volatile_instance_long_field();
+    obj.store_zero_to_volatile_instance_float_field();
+    obj.store_zero_to_volatile_instance_double_field();
+    obj.store_zero_to_array_byte();
+    obj.store_zero_to_array_char();
+    obj.store_zero_to_array_short();
+    obj.store_zero_to_array_int();
+    obj.store_zero_to_array_long();
+    obj.store_zero_to_array_float();
+    obj.store_zero_to_array_double();
+  }
+}
diff --git a/test/615-checker-arm64-zr-parallel-move/info.txt b/test/615-checker-arm64-zr-parallel-move/info.txt
deleted file mode 100644
index 199755d..0000000
--- a/test/615-checker-arm64-zr-parallel-move/info.txt
+++ /dev/null
@@ -1 +0,0 @@
-Checker test to verify we correctly use wzr and xzr to synthesize zero constants.
diff --git a/test/615-checker-arm64-zr-parallel-move/src/Main.java b/test/615-checker-arm64-zr-parallel-move/src/Main.java
deleted file mode 100644
index 5024f28..0000000
--- a/test/615-checker-arm64-zr-parallel-move/src/Main.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (C) 2016 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-public class Main {
-
-  public static boolean doThrow = false;
-
-  public void $noinline$foo(int in_w1,
-                            int in_w2,
-                            int in_w3,
-                            int in_w4,
-                            int in_w5,
-                            int in_w6,
-                            int in_w7,
-                            int on_stack_int,
-                            long on_stack_long,
-                            float in_s0,
-                            float in_s1,
-                            float in_s2,
-                            float in_s3,
-                            float in_s4,
-                            float in_s5,
-                            float in_s6,
-                            float in_s7,
-                            float on_stack_float,
-                            double on_stack_double) {
-    if (doThrow) throw new Error();
-  }
-
-  // We expect a parallel move that moves four times the zero constant to stack locations.
-  /// CHECK-START-ARM64: void Main.bar() register (after)
-  /// CHECK:             ParallelMove {{.*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*}}
-
-  // Those four moves should generate four 'store' instructions using directly the zero register.
-  /// CHECK-START-ARM64: void Main.bar() disassembly (after)
-  /// CHECK-DAG:         {{(str|stur)}} wzr, [sp, #{{[0-9]+}}]
-  /// CHECK-DAG:         {{(str|stur)}} xzr, [sp, #{{[0-9]+}}]
-  /// CHECK-DAG:         {{(str|stur)}} wzr, [sp, #{{[0-9]+}}]
-  /// CHECK-DAG:         {{(str|stur)}} xzr, [sp, #{{[0-9]+}}]
-
-  public void bar() {
-    $noinline$foo(1, 2, 3, 4, 5, 6, 7,     // Integral values in registers.
-                  0, 0L,                   // Integral values on the stack.
-                  1, 2, 3, 4, 5, 6, 7, 8,  // Floating-point values in registers.
-                  0.0f, 0.0);              // Floating-point values on the stack.
-  }
-
-  public static void main(String args[]) {}
-}
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 9e18750..75c4f34 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -566,6 +566,18 @@
 # Tests that should fail in the read barrier configuration with JIT (Optimizing compiler).
 TEST_ART_BROKEN_JIT_READ_BARRIER_RUN_TESTS :=
 
+# Tests failing in non-Baker read barrier configurations with the Optimizing compiler (AOT).
+# 537: Expects an array copy to be intrinsified, but calling-on-slowpath intrinsics are not yet
+#      handled in non-Baker read barrier configurations.
+TEST_ART_BROKEN_OPTIMIZING_NON_BAKER_READ_BARRIER_RUN_TESTS := \
+  537-checker-arraycopy
+
+# Tests failing in non-Baker read barrier configurations with JIT (Optimizing compiler).
+# 537: Expects an array copy to be intrinsified, but calling-on-slowpath intrinsics are not yet
+#      handled in non-Baker read barrier configurations.
+TEST_ART_BROKEN_JIT_NON_BAKER_READ_BARRIER_RUN_TESTS := \
+  537-checker-arraycopy
+
 ifeq ($(ART_USE_READ_BARRIER),true)
   ifneq (,$(filter interpreter,$(COMPILER_TYPES)))
     ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES), \
@@ -576,9 +588,15 @@
 
   ifneq (,$(filter $(OPTIMIZING_COMPILER_TYPES),$(COMPILER_TYPES)))
     ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES), \
-        $(PREBUILD_TYPES),$(OPTIMIZING_COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES), \
-        $(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
+        $(PREBUILD_TYPES),$(OPTIMIZING_COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES), \
+        $(GC_TYPES),$(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
         $(TEST_ART_BROKEN_OPTIMIZING_READ_BARRIER_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+    ifneq ($(ART_READ_BARRIER_TYPE),BAKER)
+      ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES), \
+          $(PREBUILD_TYPES),$(OPTIMIZING_COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES), \
+          $(GC_TYPES),$(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
+          $(TEST_ART_BROKEN_OPTIMIZING_NON_BAKER_READ_BARRIER_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+    endif
   endif
 
   ifneq (,$(filter jit,$(COMPILER_TYPES)))
@@ -586,6 +604,12 @@
         $(PREBUILD_TYPES),jit,$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES), \
         $(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
         $(TEST_ART_BROKEN_JIT_READ_BARRIER_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+    ifneq ($(ART_READ_BARRIER_TYPE),BAKER)
+      ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES), \
+          $(PREBUILD_TYPES),jit,$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES), \
+          $(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
+          $(TEST_ART_BROKEN_JIT_NON_BAKER_READ_BARRIER_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+    endif
   endif
 endif
 
diff --git a/tools/cpp-define-generator/constant_dexcache.def b/tools/cpp-define-generator/constant_dexcache.def
new file mode 100644
index 0000000..fd197f2
--- /dev/null
+++ b/tools/cpp-define-generator/constant_dexcache.def
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(DEFINE_INCLUDE_DEPENDENCIES)
+#include "mirror/dex_cache.h"   // art::mirror::DexCache, StringDexCachePair
+#endif
+
+DEFINE_EXPR(STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT,       int32_t, art::WhichPowerOf2(sizeof(art::mirror::StringDexCachePair)))
+DEFINE_EXPR(STRING_DEX_CACHE_SIZE_MINUS_ONE,           int32_t, art::mirror::DexCache::kDexCacheStringCacheSize - 1)
+DEFINE_EXPR(STRING_DEX_CACHE_HASH_BITS,                int32_t,
+    art::LeastSignificantBit(art::mirror::DexCache::kDexCacheStringCacheSize))
\ No newline at end of file
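
These constants describe the string dex-cache layout in a form that generated
assembly can consume without calling back into C++: the size-minus-one value can
act as an index mask (the cache size is a power of two), the element-size shift
scales a slot index into a byte offset, and the hash-bits constant is log2 of the
cache size. Below is a minimal sketch of how such constants could be combined to
locate a slot; the concrete sizes (8-byte StringDexCachePair, 1024 slots) are
assumptions for illustration only, not values taken from this change:

    // Hypothetical illustration; real values come from the generated constants.
    #include <cstdint>
    #include <cstdio>

    static constexpr uint32_t kCacheSize = 1024;               // assumed kDexCacheStringCacheSize
    static constexpr uint32_t kSizeMinusOne = kCacheSize - 1;  // STRING_DEX_CACHE_SIZE_MINUS_ONE
    static constexpr uint32_t kElementSizeShift = 3;           // assumed log2(sizeof(StringDexCachePair))

    // Byte offset of the cache slot that a given string index maps to.
    static constexpr uint32_t SlotOffset(uint32_t string_index) {
      return (string_index & kSizeMinusOne) << kElementSizeShift;
    }

    int main() {
      std::printf("offset for index 2049: %u\n", SlotOffset(2049));  // (2049 & 1023) << 3 == 8
      return 0;
    }
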
diff --git a/tools/cpp-define-generator/offset_dexcache.def b/tools/cpp-define-generator/offset_dexcache.def
index 3b26518..4b9d481 100644
--- a/tools/cpp-define-generator/offset_dexcache.def
+++ b/tools/cpp-define-generator/offset_dexcache.def
@@ -19,16 +19,27 @@
 #if defined(DEFINE_INCLUDE_DEPENDENCIES)
 #include "art_method.h"         // art::ArtMethod
 #include "base/enums.h"         // PointerSize
+#include "mirror/dex_cache.h"   // art::DexCache
 #endif
 
-#define DEFINE_ART_METHOD_OFFSET(field_name, method_name) \
+#define DEFINE_ART_METHOD_OFFSET_SIZED(field_name, method_name) \
   DEFINE_EXPR(ART_METHOD_ ## field_name ## _OFFSET_32, int32_t, art::ArtMethod::method_name##Offset(art::PointerSize::k32).Int32Value()) \
   DEFINE_EXPR(ART_METHOD_ ## field_name ## _OFFSET_64, int32_t, art::ArtMethod::method_name##Offset(art::PointerSize::k64).Int32Value())
 
+#define DEFINE_ART_METHOD_OFFSET(field_name, method_name) \
+  DEFINE_EXPR(ART_METHOD_ ## field_name ## _OFFSET, int32_t, art::ArtMethod::method_name##Offset().Int32Value())
+
+#define DEFINE_DECLARING_CLASS_OFFSET(field_name, method_name) \
+  DEFINE_EXPR(DECLARING_CLASS_ ## field_name ## _OFFSET, int32_t, art::mirror::Class::method_name##Offset().Int32Value())
+
 //                         New macro suffix          Method Name (of the Offset method)
-DEFINE_ART_METHOD_OFFSET(DEX_CACHE_METHODS,          DexCacheResolvedMethods)
-DEFINE_ART_METHOD_OFFSET(DEX_CACHE_TYPES,            DexCacheResolvedTypes)
-DEFINE_ART_METHOD_OFFSET(JNI,                        EntryPointFromJni)
-DEFINE_ART_METHOD_OFFSET(QUICK_CODE,                 EntryPointFromQuickCompiledCode)
+DEFINE_ART_METHOD_OFFSET_SIZED(DEX_CACHE_METHODS,    DexCacheResolvedMethods)
+DEFINE_ART_METHOD_OFFSET_SIZED(DEX_CACHE_TYPES,      DexCacheResolvedTypes)
+DEFINE_ART_METHOD_OFFSET_SIZED(JNI,                  EntryPointFromJni)
+DEFINE_ART_METHOD_OFFSET_SIZED(QUICK_CODE,           EntryPointFromQuickCompiledCode)
+DEFINE_ART_METHOD_OFFSET(DECLARING_CLASS,            DeclaringClass)
+DEFINE_DECLARING_CLASS_OFFSET(DEX_CACHE_STRINGS,     DexCacheStrings)
 
 #undef DEFINE_ART_METHOD_OFFSET
+#undef DEFINE_ART_METHOD_OFFSET_SIZED
+#undef DEFINE_DECLARING_CLASS_OFFSET
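
The split above distinguishes offsets that depend on the runtime pointer size
(emitted twice, with _32 and _64 suffixes, via DEFINE_ART_METHOD_OFFSET_SIZED)
from offsets that do not (emitted once). The following stand-alone sketch of that
pattern uses made-up offset values, only to show the naming scheme the macros
produce:

    #include <cstdio>

    #define DEFINE_EXPR(name, type, expr) constexpr type name = (expr);

    // Pointer-size-dependent offset: one constant per pointer size.
    #define DEFINE_SIZED(field, off32, off64)       \
      DEFINE_EXPR(field ## _OFFSET_32, int, off32)  \
      DEFINE_EXPR(field ## _OFFSET_64, int, off64)

    // Pointer-size-independent offset: a single constant.
    #define DEFINE_UNSIZED(field, off) \
      DEFINE_EXPR(field ## _OFFSET, int, off)

    DEFINE_SIZED(ART_METHOD_JNI, 28, 40)           // offsets are placeholders
    DEFINE_UNSIZED(ART_METHOD_DECLARING_CLASS, 0)  // offset is a placeholder

    int main() {
      std::printf("%d %d %d\n", ART_METHOD_JNI_OFFSET_32,
                  ART_METHOD_JNI_OFFSET_64, ART_METHOD_DECLARING_CLASS_OFFSET);
      return 0;
    }
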
diff --git a/tools/cpp-define-generator/offsets_all.def b/tools/cpp-define-generator/offsets_all.def
index d2d8777..13371a1 100644
--- a/tools/cpp-define-generator/offsets_all.def
+++ b/tools/cpp-define-generator/offsets_all.def
@@ -48,6 +48,7 @@
 // TODO: MIRROR_*_ARRAY offsets (depends on header size)
 // TODO: MIRROR_STRING offsets (depends on header size)
 #include "offset_dexcache.def"
+#include "constant_dexcache.def"
 #include "constant_heap.def"
 #include "constant_lockword.def"
 #include "constant_globals.def"
diff --git a/tools/javafuzz/README.md b/tools/javafuzz/README.md
index 35c057c..68fc171 100644
--- a/tools/javafuzz/README.md
+++ b/tools/javafuzz/README.md
@@ -8,7 +8,7 @@
 or using various target architectures. Any difference between the outputs
 (**divergence**) may indicate a bug in one of the execution modes.
 
-JavaFuzz can be combined with dexfuzz to get multilayered fuzz testing.
+JavaFuzz can be combined with dexfuzz to get multi-layered fuzz testing.
 
 How to run JavaFuzz
 ===================
@@ -40,19 +40,20 @@
 ===============================
 
     run_java_fuzz_test.py [--num_tests]
+                          [--device]
                           [--mode1=mode] [--mode2=mode]
 
 where
 
-    --num_tests: number of tests to run (10000 by default)
-    --mode1:m1
-    --mode2:m2
-    with m1 != m2, and one of
-      ri   : reference implementation on host (default for m1)
-      hint : Art interpreter on host
-      hopt : Art optimizing on host (default for m2)
-      tint : Art interpreter on target
-      topt : Art optimizing on target
+    --num_tests : number of tests to run (10000 by default)
+    --device    : target device serial number (passed to adb -s)
+    --mode1     : m1
+    --mode2     : m2, with m1 != m2, where each mode is one of
+      ri   = reference implementation on host (default for m1)
+      hint = Art interpreter on host
+      hopt = Art optimizing on host (default for m2)
+      tint = Art interpreter on target
+      topt = Art optimizing on target
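
For example, to compare the reference implementation on the host against Art
optimizing on a particular target device, something like the following could be
used (the serial number is a placeholder):

    run_java_fuzz_test.py --num_tests=100 --mode1=ri --mode2=topt --device=SERIAL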
 
 Background
 ==========
@@ -67,14 +68,15 @@
 and flaws still linger in the system.
 
 Over the years, fuzz testing has gained popularity as a testing technique for
-discovering such lingering bugs, including bugs that can bring down a system in
-an unexpected way. Fuzzing refers to feeding a large amount of random data as
-input to a system in an attempt to find bugs or make it crash. Mutation-based
-fuzz testing is a special form of fuzzing that applies small random changes to
-existing inputs in order to detect shortcomings in a system. Profile-guided or
-coverage-guided fuzzing adds a direction to the way these random changes are
-applied. Multilayer approaches generate random inputs that are subsequently
-mutated at various stages of execution.
+discovering such lingering bugs, including bugs that can bring down a system
+in an unexpected way. Fuzzing refers to feeding a large amount of random data
+as input to a system in an attempt to find bugs or make it crash. Generation-
+based fuzz testing constructs random, but properly formatted input data.
+Mutation-based fuzz testing applies small random changes to existing inputs
+in order to detect shortcomings in a system. Profile-guided or coverage-guided
+fuzzing adds a direction to the way these random changes are applied. Multi-
+layered approaches generate random inputs that are subsequently mutated at
+various stages of execution.
 
 The randomness of fuzz testing implies that the size and scope of testing is no
 longer bounded. Every new run can potentially discover bugs and crashes that were
diff --git a/tools/javafuzz/javafuzz.cc b/tools/javafuzz/javafuzz.cc
index 4e6e978..161ae0a 100644
--- a/tools/javafuzz/javafuzz.cc
+++ b/tools/javafuzz/javafuzz.cc
@@ -53,7 +53,9 @@
  * to preserve the property that a given version of JavaFuzz yields the same
  * fuzzed Java program for a deterministic random seed.
  */
-const char* VERSION = "1.0";
+const char* VERSION = "1.1";
+
+static const uint32_t MAX_DIMS[11] = { 0, 1000, 32, 10, 6, 4, 3, 3, 2, 2, 2 };
 
 /**
  * A class that generates a random Java program that compiles correctly. The program
@@ -83,8 +85,8 @@
         fuzz_loop_nest_(loop_nest),
         return_type_(randomType()),
         array_type_(randomType()),
-        array_dim_(random1(3)),
-        array_size_(random1(10)),
+        array_dim_(random1(10)),
+        array_size_(random1(MAX_DIMS[array_dim_])),
         indentation_(0),
         expr_depth_(0),
         stmt_length_(0),
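
The new MAX_DIMS table caps the per-dimension array size as the number of
dimensions grows (array_dim_ is now drawn from [1,10] instead of [1,3]),
presumably so that the worst-case element count of the generated array, roughly
array_size_ ^ array_dim_, stays in the low thousands for every dimensionality.
A small stand-alone check of that bound, under that assumption:

    // Tabulates MAX_DIMS[d]^d, the assumed worst-case element count of a
    // d-dimensional array, to show it stays below ~2200 for d in [1,10].
    #include <cstdint>
    #include <cstdio>

    int main() {
      static const uint32_t kMaxDims[11] = { 0, 1000, 32, 10, 6, 4, 3, 3, 2, 2, 2 };
      for (uint32_t d = 1; d <= 10; d++) {
        uint64_t total = 1;
        for (uint32_t i = 0; i < d; i++) total *= kMaxDims[d];
        std::printf("dim %2u: size <= %4u -> at most %llu elements\n",
                    d, kMaxDims[d], static_cast<unsigned long long>(total));
      }
      return 0;
    }
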
@@ -169,7 +171,7 @@
   // Emit an unary operator (same type in-out).
   void emitUnaryOp(Type tp) {
     if (tp == kBoolean) {
-      fputs("!", out_);
+      fputc('!', out_);
     } else if (isInteger(tp)) {
       EMIT(kIntUnaryOps);
     } else {  // isFP(tp)
@@ -239,16 +241,21 @@
         case 6: fputs("(long)(int)(long)",   out_); return kLong;
       }
     } else if (tp == kFloat) {
-      switch (random1(3)) {
+      switch (random1(4)) {
         case 1: fputs("(float)", out_); return kInt;
         case 2: fputs("(float)", out_); return kLong;
         case 3: fputs("(float)", out_); return kDouble;
+        // Narrowing-widening.
+        case 4: fputs("(float)(int)(float)", out_); return kFloat;
       }
     } else if (tp == kDouble) {
-      switch (random1(3)) {
+      switch (random1(5)) {
         case 1: fputs("(double)", out_); return kInt;
         case 2: fputs("(double)", out_); return kLong;
         case 3: fputs("(double)", out_); return kFloat;
+        // Narrowing-widening.
+        case 4: fputs("(double)(int)(double)",   out_); return kDouble;
+        case 5: fputs("(double)(float)(double)", out_); return kDouble;
       }
     }
     return tp;  // nothing suitable, just keep type
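
The added "narrowing-widening" cast chains, such as (float)(int)(float) and
(double)(float)(double), leave the expression's static type unchanged but force
a lossy round trip through a narrower type, exercising the compiler's conversion
handling. A tiny C++ analogue of the value-level effect (Java's float/double to
int conversion also truncates toward zero for in-range values, which is the
assumption here):

    #include <cstdio>

    int main() {
      double x = 2.75;
      double y = static_cast<double>(static_cast<int>(x));  // drops the fraction
      std::printf("%f -> %f\n", x, y);                       // 2.750000 -> 2.000000
      return 0;
    }
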
@@ -273,15 +280,17 @@
   // Emit an unary intrinsic (out type given, new suitable in type picked).
   Type emitIntrinsic1(Type tp) {
     if (tp == kBoolean) {
-      switch (random1(4)) {
+      switch (random1(6)) {
         case 1: fputs("Float.isNaN",       out_); return kFloat;
-        case 2: fputs("Float.isInfinite",  out_); return kFloat;
-        case 3: fputs("Double.isNaN",      out_); return kDouble;
-        case 4: fputs("Double.isInfinite", out_); return kDouble;
+        case 2: fputs("Float.isFinite",    out_); return kFloat;
+        case 3: fputs("Float.isInfinite",  out_); return kFloat;
+        case 4: fputs("Double.isNaN",      out_); return kDouble;
+        case 5: fputs("Double.isFinite",   out_); return kDouble;
+        case 6: fputs("Double.isInfinite", out_); return kDouble;
       }
     } else if (isInteger(tp)) {
       const char* prefix = tp == kLong ? "Long" : "Integer";
-      switch (random1(9)) {
+      switch (random1(13)) {
         case 1: fprintf(out_, "%s.highestOneBit",         prefix); break;
         case 2: fprintf(out_, "%s.lowestOneBit",          prefix); break;
         case 3: fprintf(out_, "%s.numberOfLeadingZeros",  prefix); break;
@@ -290,15 +299,27 @@
         case 6: fprintf(out_, "%s.signum",                prefix); break;
         case 7: fprintf(out_, "%s.reverse",               prefix); break;
         case 8: fprintf(out_, "%s.reverseBytes",          prefix); break;
-        case 9: fputs("Math.abs", out_);                           break;
+        case 9:  fputs("Math.incrementExact", out_); break;
+        case 10: fputs("Math.decrementExact", out_); break;
+        case 11: fputs("Math.negateExact",    out_); break;
+        case 12: fputs("Math.abs",            out_); break;
+        case 13: fputs("Math.round", out_);
+                 return tp == kLong ? kDouble : kFloat;
       }
     } else {  // isFP(tp)
-      switch (random1(5)) {
+      switch (random1(6)) {
         case 1: fputs("Math.abs",      out_); break;
         case 2: fputs("Math.ulp",      out_); break;
         case 3: fputs("Math.signum",   out_); break;
         case 4: fputs("Math.nextUp",   out_); break;
         case 5: fputs("Math.nextDown", out_); break;
+        case 6: if (tp == kDouble) {
+                  fputs("Double.longBitsToDouble", out_);
+                  return kLong;
+                } else {
+                  fputs("Float.intBitsToFloat", out_);
+                  return kInt;
+                }
       }
     }
     return tp;  // same type in-out
@@ -314,15 +335,27 @@
       }
     } else if (isInteger(tp)) {
       const char* prefix = tp == kLong ? "Long" : "Integer";
-      switch (random1(3)) {
+      switch (random1(11)) {
         case 1: fprintf(out_, "%s.compare", prefix); break;
-        case 2: fputs("Math.min", out_); break;
-        case 3: fputs("Math.max", out_); break;
+        case 2: fprintf(out_, "%s.sum",     prefix); break;
+        case 3: fprintf(out_, "%s.min",     prefix); break;
+        case 4: fprintf(out_, "%s.max",     prefix); break;
+        case 5:  fputs("Math.min",           out_); break;
+        case 6:  fputs("Math.max",           out_); break;
+        case 7:  fputs("Math.floorDiv",      out_); break;
+        case 8:  fputs("Math.floorMod",      out_); break;
+        case 9:  fputs("Math.addExact",      out_); break;
+        case 10: fputs("Math.subtractExact", out_); break;
+        case 11: fputs("Math.multiplyExact", out_); break;
       }
     } else {  // isFP(tp)
-      switch (random1(2)) {
-        case 1: fputs("Math.min", out_); break;
-        case 2: fputs("Math.max", out_); break;
+      const char* prefix = tp == kDouble ? "Double" : "Float";
+      switch (random1(5)) {
+        case 1: fprintf(out_, "%s.sum", prefix); break;
+        case 2: fprintf(out_, "%s.min", prefix); break;
+        case 3: fprintf(out_, "%s.max", prefix); break;
+        case 4: fputs("Math.min", out_); break;
+        case 5: fputs("Math.max", out_); break;
       }
     }
     return tp;  // same type in-out
@@ -358,12 +391,24 @@
 
   // Emit miscellaneous constructs.
   void emitMisc(Type tp) {
-    switch (tp) {
-      case kBoolean: fputs("this instanceof Test", out_); break;
-      case kInt:     fputs("mArray.length",    out_); break;
-      case kLong:    fputs("Long.MAX_VALUE",   out_); break;
-      case kFloat:   fputs("Float.MAX_VALUE",  out_); break;
-      case kDouble:  fputs("Double.MAX_VALUE", out_); break;
+    if (tp == kBoolean) {
+      fputs("this instanceof Test", out_);
+    } else if (isInteger(tp)) {
+      const char* prefix = tp == kLong ? "Long" : "Integer";
+      switch (random1(2)) {
+        case 1: fprintf(out_, "%s.MIN_VALUE", prefix); break;
+        case 2: fprintf(out_, "%s.MAX_VALUE", prefix); break;
+      }
+    } else {  // isFP(tp)
+      const char* prefix = tp == kDouble ? "Double" : "Float";
+      switch (random1(6)) {
+        case 1: fprintf(out_, "%s.MIN_NORMAL", prefix);        break;
+        case 2: fprintf(out_, "%s.MIN_VALUE", prefix);         break;
+        case 3: fprintf(out_, "%s.MAX_VALUE", prefix);         break;
+        case 4: fprintf(out_, "%s.POSITIVE_INFINITY", prefix); break;
+        case 5: fprintf(out_, "%s.NEGATIVE_INFINITY", prefix); break;
+        case 6: fprintf(out_, "%s.NaN", prefix);               break;
+      }
     }
   }
 
@@ -412,10 +457,10 @@
   void emitLiteral(Type tp) {
     switch (tp) {
       case kBoolean: fputs(random1(2) == 1 ? "true" : "false", out_); break;
-      case kInt:     fprintf(out_, "%d",    random0(100)); break;
-      case kLong:    fprintf(out_, "%dL",   random0(100)); break;
-      case kFloat:   fprintf(out_, "%d.0f", random0(100)); break;
-      case kDouble:  fprintf(out_, "%d.0",  random0(100)); break;
+      case kInt:     fprintf(out_, "%d",    random()); break;
+      case kLong:    fprintf(out_, "%dL",   random()); break;
+      case kFloat:   fprintf(out_, "%d.0f", random()); break;
+      case kDouble:  fprintf(out_, "%d.0",  random()); break;
     }
   }
 
@@ -433,17 +478,6 @@
     return false;
   }
 
-  // Emit a loop variable, if available.
-  bool emitLoopVariable(Type tp) {
-    if (tp == kInt) {
-      if (loop_nest_ > 0) {
-        fprintf(out_, "i%u", random0(loop_nest_));
-        return true;
-      }
-    }
-    return false;
-  }
-
   // Emit a local variable, if available.
   bool emitLocalVariable(Type tp) {
     uint32_t locals = adjustLocal(tp, 0);
@@ -483,10 +517,6 @@
         if (emitLocalVariable(tp))
           return;
         // FALL-THROUGH
-      case 3:
-        if (emitLoopVariable(tp))
-          return;
-        // FALL-THROUGH
       default:
         emitFieldVariable(tp);
         break;
@@ -510,8 +540,9 @@
     fputc('(', out_);
     switch (random1(12)) {  // favor binary operations
       case 1:
-        // Unary operator: ~x
+        // Unary operator: ~ x
         emitUnaryOp(tp);
+        fputc(' ', out_);
         emitExpression(tp);
         break;
       case 2:
@@ -761,7 +792,7 @@
 
     bool mayFollow = false;
     fputs("switch (", out_);
-    emitExpression(kInt);
+    emitArrayIndex();  // restrict its range
     fputs(") {\n", out_);
 
     ++if_nest_;
@@ -771,7 +802,7 @@
     for (uint32_t i = 0; i < 2; i++) {
       emitIndentation();
       if (i == 0) {
-        fprintf(out_, "case %d: {\n", random0(100));
+        fprintf(out_, "case %u: {\n", random0(array_size_));
       } else {
         fprintf(out_, "default: {\n");
       }
@@ -977,6 +1008,11 @@
   // Random integers.
   //
 
+  // Return random integer.
+  int32_t random() {
+    return fuzz_random_engine_();
+  }
+
   // Return random integer in range [0,max).
   uint32_t random0(uint32_t max) {
     std::uniform_int_distribution<uint32_t> gen(0, max - 1);
@@ -1025,7 +1061,7 @@
   // Defaults.
   uint32_t seed = time(NULL);
   uint32_t expr_depth = 1;
-  uint32_t stmt_length = 4;
+  uint32_t stmt_length = 8;
   uint32_t if_nest = 2;
   uint32_t loop_nest = 3;
 
diff --git a/tools/javafuzz/run_java_fuzz_test.py b/tools/javafuzz/run_java_fuzz_test.py
index 4f192e7..5f527b8 100755
--- a/tools/javafuzz/run_java_fuzz_test.py
+++ b/tools/javafuzz/run_java_fuzz_test.py
@@ -78,10 +78,11 @@
   return libdir + '/core-libart-hostdex_intermediates/classes.jack:' \
        + libdir + '/core-oj-hostdex_intermediates/classes.jack'
 
-def GetExecutionModeRunner(mode):
+def GetExecutionModeRunner(device, mode):
   """Returns a runner for the given execution mode.
 
   Args:
+    device: string, target device serial number (or None)
     mode: string, execution mode
   Returns:
     TestRunner with given execution mode
@@ -95,9 +96,9 @@
   if mode == 'hopt':
     return TestRunnerArtOnHost(False)
   if mode == 'tint':
-    return TestRunnerArtOnTarget(True)
+    return TestRunnerArtOnTarget(device, True)
   if mode == 'topt':
-    return TestRunnerArtOnTarget(False)
+    return TestRunnerArtOnTarget(device, False)
   raise FatalError('Unknown execution mode')
 
 def GetReturnCode(retc):
@@ -210,13 +211,14 @@
 class TestRunnerArtOnTarget(TestRunner):
   """Concrete test runner of Art on target (interpreter or optimizing)."""
 
-  def  __init__(self, interpreter):
+  def  __init__(self, device, interpreter):
     """Constructor for the Art on target tester.
 
     Args:
+      device: string, target device serial number (or None)
       interpreter: boolean, selects between interpreter or optimizing
     """
-    self._dalvik_args = '-cp /data/local/tmp/classes.dex Test'
+    self._dalvik_args = 'shell dalvikvm -cp /data/local/tmp/classes.dex Test'
     if interpreter:
       self._description = 'Art interpreter on target'
       self._id = 'TInt'
@@ -224,16 +226,19 @@
     else:
       self._description = 'Art optimizing on target'
       self._id = 'TOpt'
+    self._adb = 'adb'
+    if device != None:
+      self._adb = self._adb + ' -s ' + device
     self._jack_args = '-cp ' + GetJackClassPath() + ' --output-dex . Test.java'
 
   def CompileAndRunTest(self):
     if RunCommand('jack', self._jack_args,
                   out=None, err='jackerr.txt', timeout=30) == EXIT_SUCCESS:
-      if RunCommand('adb push', 'classes.dex /data/local/tmp/',
+      if RunCommand(self._adb, 'push classes.dex /data/local/tmp/',
                     'adb.txt', err=None) != EXIT_SUCCESS:
         raise FatalError('Cannot push to target device')
       out = self.GetId() + '_run_out.txt'
-      retc = RunCommand('adb shell dalvikvm', self._dalvik_args, out, err=None)
+      retc = RunCommand(self._adb, self._dalvik_args, out, err=None)
       if retc != EXIT_SUCCESS and retc != EXIT_TIMEOUT:
         retc = EXIT_NOTRUN
     else:
@@ -241,7 +246,7 @@
     # Cleanup and return.
     RunCommand('rm', '-f classes.dex jackerr.txt adb.txt',
                out=None, err=None)
-    RunCommand('adb shell', 'rm -f /data/local/tmp/classes.dex',
+    RunCommand(self._adb, 'shell rm -f /data/local/tmp/classes.dex',
                out=None, err=None)
     return retc
 
@@ -256,17 +261,19 @@
 class JavaFuzzTester(object):
   """Tester that runs JavaFuzz many times and report divergences."""
 
-  def  __init__(self, num_tests, mode1, mode2):
+  def  __init__(self, num_tests, device, mode1, mode2):
     """Constructor for the tester.
 
     Args:
     num_tests: int, number of tests to run
+    device: string, target device serial number (or None)
     mode1: string, execution mode for first runner
     mode2: string, execution mode for second runner
     """
     self._num_tests = num_tests
-    self._runner1 = GetExecutionModeRunner(mode1)
-    self._runner2 = GetExecutionModeRunner(mode2)
+    self._device = device
+    self._runner1 = GetExecutionModeRunner(device, mode1)
+    self._runner2 = GetExecutionModeRunner(device, mode2)
     self._save_dir = None
     self._tmp_dir = None
     # Statistics.
@@ -302,6 +309,7 @@
     print '**\n**** JavaFuzz Testing\n**'
     print
     print '#Tests    :', self._num_tests
+    print 'Device    :', self._device
     print 'Directory :', self._tmp_dir
     print 'Exec-mode1:', self._runner1.GetDescription()
     print 'Exec-mode2:', self._runner2.GetDescription()
@@ -391,6 +399,7 @@
   parser = argparse.ArgumentParser()
   parser.add_argument('--num_tests', default=10000,
                       type=int, help='number of tests to run')
+  parser.add_argument('--device', help='target device serial number')
   parser.add_argument('--mode1', default='ri',
                       help='execution mode 1 (default: ri)')
   parser.add_argument('--mode2', default='hopt',
@@ -399,7 +408,8 @@
   if args.mode1 == args.mode2:
     raise FatalError("Identical execution modes given")
   # Run the JavaFuzz tester.
-  with JavaFuzzTester(args.num_tests, args.mode1, args.mode2) as fuzzer:
+  with JavaFuzzTester(args.num_tests, args.device,
+                      args.mode1, args.mode2) as fuzzer:
     fuzzer.Run()
 
 if __name__ == "__main__":
diff --git a/tools/libcore_failures.txt b/tools/libcore_failures.txt
index cbb6e1d..6472c8d 100644
--- a/tools/libcore_failures.txt
+++ b/tools/libcore_failures.txt
@@ -220,13 +220,6 @@
   names: [ "libcore.java.io.FileTest#testJavaIoTmpdirMutable" ]
 },
 {
-  description: "Made for extending, shouldn't be run",
-  result: EXEC_FAILED,
-  names: ["jsr166.CollectionTest#testEmptyMeansEmpty",
-          "jsr166.Collection8Test#testForEach",
-          "jsr166.Collection8Test#testForEachConcurrentStressTest"]
-},
-{
   description: "Flaky test",
   result: EXEC_FAILED,
   bug: 30107038,