69 files changed, 2867 insertions, 1691 deletions
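The Thumb2RelativePatcher changes below (and their tests) hinge on the 16-bit LDR (immediate), encoding T1, that the new narrow Baker read barrier path has to recognize and rewrite: base opcode 0x6800, imm5 in bits 10-6 scaled by 4, Rn in bits 5-3, Rt in bits 2-0. A minimal standalone sketch of that layout follows; the helper names are made up for illustration and are not ART code.

#include <cstdint>

// Illustration only (not ART code): LDR (immediate), 16-bit Thumb encoding T1,
// i.e. the kLdrInsn pattern used by the narrow thunks and the patcher tests.
constexpr uint32_t kLdrT1Base = 0x6800u;  // 0110 1 imm5 Rn Rt

constexpr uint32_t EncodeLdrT1(uint32_t rt, uint32_t rn, uint32_t byte_offset) {
  // Requires rt < 8, rn < 8 and a word-aligned byte_offset that fits in imm5 * 4.
  return kLdrT1Base | ((byte_offset / 4u) << 6) | (rn << 3) | rt;
}

constexpr uint32_t DecodeLdrT1Offset(uint32_t insn) {
  // imm5 lives in bits 10-6 and is scaled by 4, matching the thunk's
  // UBFX ip, ip, #6, #5 followed by LDR ip, [base_reg, ip, LSL #2].
  return ((insn >> 6) & 0x1fu) * 4u;
}

static_assert(EncodeLdrT1(/* rt */ 0u, /* rn */ 3u, /* byte_offset */ 8u) == 0x6898u, "sanity");
static_assert(DecodeLdrT1Offset(0x6898u) == 8u, "sanity");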
diff --git a/compiler/Android.bp b/compiler/Android.bp index a2b07af810..df896dc73c 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -115,6 +115,7 @@ art_cc_defaults { "optimizing/intrinsics_arm.cc", "optimizing/intrinsics_arm_vixl.cc", "optimizing/nodes_shared.cc", + "optimizing/scheduler_arm.cc", "utils/arm/assembler_arm.cc", "utils/arm/assembler_arm_vixl.cc", "utils/arm/assembler_thumb2.cc", diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc index ced52ff07a..a98aedfc69 100644 --- a/compiler/linker/arm/relative_patcher_thumb2.cc +++ b/compiler/linker/arm/relative_patcher_thumb2.cc @@ -18,6 +18,7 @@ #include "arch/arm/asm_support_arm.h" #include "art_method.h" +#include "base/bit_utils.h" #include "compiled_method.h" #include "entrypoints/quick/quick_entrypoints_enum.h" #include "lock_word.h" @@ -112,12 +113,22 @@ void Thumb2RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* co // Check that the next instruction matches the expected LDR. switch (kind) { case BakerReadBarrierKind::kField: { - DCHECK_GE(code->size() - literal_offset, 8u); - uint32_t next_insn = GetInsn32(code, literal_offset + 4u); - // LDR (immediate) with correct base_reg. - CheckValidReg((next_insn >> 12) & 0xfu); // Check destination register. - const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data); - CHECK_EQ(next_insn & 0xffff0000u, 0xf8d00000u | (base_reg << 16)); + BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data); + if (width == BakerReadBarrierWidth::kWide) { + DCHECK_GE(code->size() - literal_offset, 8u); + uint32_t next_insn = GetInsn32(code, literal_offset + 4u); + // LDR (immediate), encoding T3, with correct base_reg. + CheckValidReg((next_insn >> 12) & 0xfu); // Check destination register. + const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data); + CHECK_EQ(next_insn & 0xffff0000u, 0xf8d00000u | (base_reg << 16)); + } else { + DCHECK_GE(code->size() - literal_offset, 6u); + uint32_t next_insn = GetInsn16(code, literal_offset + 4u); + // LDR (immediate), encoding T1, with correct base_reg. + CheckValidReg(next_insn & 0x7u); // Check destination register. + const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data); + CHECK_EQ(next_insn & 0xf838u, 0x6800u | (base_reg << 3)); + } break; } case BakerReadBarrierKind::kArray: { @@ -131,11 +142,20 @@ void Thumb2RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* co break; } case BakerReadBarrierKind::kGcRoot: { - DCHECK_GE(literal_offset, 4u); - uint32_t prev_insn = GetInsn32(code, literal_offset - 4u); - // LDR (immediate) with correct root_reg. - const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data); - CHECK_EQ(prev_insn & 0xfff0f000u, 0xf8d00000u | (root_reg << 12)); + BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data); + if (width == BakerReadBarrierWidth::kWide) { + DCHECK_GE(literal_offset, 4u); + uint32_t prev_insn = GetInsn32(code, literal_offset - 4u); + // LDR (immediate), encoding T3, with correct root_reg. + const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data); + CHECK_EQ(prev_insn & 0xfff0f000u, 0xf8d00000u | (root_reg << 12)); + } else { + DCHECK_GE(literal_offset, 2u); + uint32_t prev_insn = GetInsn16(code, literal_offset - 2u); + // LDR (immediate), encoding T1, with correct root_reg. 
+ const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data); + CHECK_EQ(prev_insn & 0xf807u, 0x6800u | root_reg); + } break; } default: @@ -160,7 +180,8 @@ void Thumb2RelativePatcher::PatchBakerReadBarrierBranch(std::vector<uint8_t>* co static void EmitGrayCheckAndFastPath(arm::ArmVIXLAssembler& assembler, vixl::aarch32::Register base_reg, vixl::aarch32::MemOperand& lock_word, - vixl::aarch32::Label* slow_path) { + vixl::aarch32::Label* slow_path, + int32_t raw_ldr_offset) { using namespace vixl::aarch32; // NOLINT(build/namespaces) // Load the lock word containing the rb_state. __ Ldr(ip, lock_word); @@ -169,14 +190,7 @@ static void EmitGrayCheckAndFastPath(arm::ArmVIXLAssembler& assembler, static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); __ Tst(ip, Operand(LockWord::kReadBarrierStateMaskShifted)); __ B(ne, slow_path, /* is_far_target */ false); - static_assert( - BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET, - "Field and array LDR offsets must be the same to reuse the same code."); - // Adjust the return address back to the LDR (1 instruction; 2 for heap poisoning). - static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), - "Field LDR must be 1 instruction (4B) before the return address label; " - " 2 instructions (8B) for heap poisoning."); - __ Add(lr, lr, BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET); + __ Add(lr, lr, raw_ldr_offset); // Introduce a dependency on the lock_word including rb_state, // to prevent load-load reordering, and without using // a memory barrier (which would be more expensive). @@ -199,6 +213,7 @@ void Thumb2RelativePatcher::CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& CheckValidReg(base_reg.GetCode()); Register holder_reg(BakerReadBarrierSecondRegField::Decode(encoded_data)); CheckValidReg(holder_reg.GetCode()); + BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data); UseScratchRegisterScope temps(assembler.GetVIXLAssembler()); temps.Exclude(ip); // If base_reg differs from holder_reg, the offset was too large and we must have @@ -210,16 +225,30 @@ void Thumb2RelativePatcher::CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& } vixl::aarch32::Label slow_path; MemOperand lock_word(holder_reg, mirror::Object::MonitorOffset().Int32Value()); - EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path); + const int32_t raw_ldr_offset = (width == BakerReadBarrierWidth::kWide) + ? BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET + : BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET; + EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path, raw_ldr_offset); __ Bind(&slow_path); const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 + - BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET; - MemOperand ldr_half_address(lr, ldr_offset + 2); - __ Ldrh(ip, ldr_half_address); // Load the LDR immediate half-word with "Rt | imm12". - __ Ubfx(ip, ip, 0, 12); // Extract the offset imm12. - __ Ldr(ip, MemOperand(base_reg, ip)); // Load the reference. + raw_ldr_offset; + Register ep_reg(kBakerCcEntrypointRegister); + if (width == BakerReadBarrierWidth::kWide) { + MemOperand ldr_half_address(lr, ldr_offset + 2); + __ Ldrh(ip, ldr_half_address); // Load the LDR immediate half-word with "Rt | imm12". + __ Ubfx(ip, ip, 0, 12); // Extract the offset imm12. + __ Ldr(ip, MemOperand(base_reg, ip)); // Load the reference. 
+ } else { + MemOperand ldr_address(lr, ldr_offset); + __ Ldrh(ip, ldr_address); // Load the LDR immediate, encoding T1. + __ Add(ep_reg, // Adjust the entrypoint address to the entrypoint + ep_reg, // for narrow LDR. + Operand(BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET)); + __ Ubfx(ip, ip, 6, 5); // Extract the imm5, i.e. offset / 4. + __ Ldr(ip, MemOperand(base_reg, ip, LSL, 2)); // Load the reference. + } // Do not unpoison. With heap poisoning enabled, the entrypoint expects a poisoned reference. - __ Bx(Register(kBakerCcEntrypointRegister)); // Jump to the entrypoint. + __ Bx(ep_reg); // Jump to the entrypoint. if (holder_reg.Is(base_reg)) { // Add null check slow path. The stack map is at the address pointed to by LR. __ Bind(&throw_npe); @@ -233,6 +262,7 @@ void Thumb2RelativePatcher::CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& Register base_reg(BakerReadBarrierFirstRegField::Decode(encoded_data)); CheckValidReg(base_reg.GetCode()); DCHECK_EQ(kInvalidEncodedReg, BakerReadBarrierSecondRegField::Decode(encoded_data)); + DCHECK(BakerReadBarrierWidth::kWide == BakerReadBarrierWidthField::Decode(encoded_data)); UseScratchRegisterScope temps(assembler.GetVIXLAssembler()); temps.Exclude(ip); vixl::aarch32::Label slow_path; @@ -240,10 +270,11 @@ void Thumb2RelativePatcher::CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& mirror::Array::DataOffset(Primitive::ComponentSize(Primitive::kPrimNot)).Int32Value(); MemOperand lock_word(base_reg, mirror::Object::MonitorOffset().Int32Value() - data_offset); DCHECK_LT(lock_word.GetOffsetImmediate(), 0); - EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path); + const int32_t raw_ldr_offset = BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET; + EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path, raw_ldr_offset); __ Bind(&slow_path); const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 + - BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET; + raw_ldr_offset; MemOperand ldr_address(lr, ldr_offset + 2); __ Ldrb(ip, ldr_address); // Load the LDR (register) byte with "00 | imm2 | Rm", // i.e. Rm+32 because the scale in imm2 is 2. @@ -261,6 +292,7 @@ void Thumb2RelativePatcher::CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& Register root_reg(BakerReadBarrierFirstRegField::Decode(encoded_data)); CheckValidReg(root_reg.GetCode()); DCHECK_EQ(kInvalidEncodedReg, BakerReadBarrierSecondRegField::Decode(encoded_data)); + BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data); UseScratchRegisterScope temps(assembler.GetVIXLAssembler()); temps.Exclude(ip); vixl::aarch32::Label return_label, not_marked, forwarding_address; @@ -280,7 +312,10 @@ void Thumb2RelativePatcher::CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& // Adjust the art_quick_read_barrier_mark_introspection address in kBakerCcEntrypointRegister // to art_quick_read_barrier_mark_introspection_gc_roots. Register ep_reg(kBakerCcEntrypointRegister); - __ Add(ep_reg, ep_reg, Operand(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET)); + int32_t entrypoint_offset = (width == BakerReadBarrierWidth::kWide) + ? 
BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET + : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET; + __ Add(ep_reg, ep_reg, Operand(entrypoint_offset)); __ Mov(ip, root_reg); __ Bx(ep_reg); __ Bind(&forwarding_address); @@ -344,7 +379,7 @@ uint32_t Thumb2RelativePatcher::MaxNegativeDisplacement(const ThunkKey& key) { void Thumb2RelativePatcher::SetInsn32(std::vector<uint8_t>* code, uint32_t offset, uint32_t value) { DCHECK_LE(offset + 4u, code->size()); - DCHECK_EQ(offset & 1u, 0u); + DCHECK_ALIGNED(offset, 2u); uint8_t* addr = &(*code)[offset]; addr[0] = (value >> 16) & 0xff; addr[1] = (value >> 24) & 0xff; @@ -354,7 +389,7 @@ void Thumb2RelativePatcher::SetInsn32(std::vector<uint8_t>* code, uint32_t offse uint32_t Thumb2RelativePatcher::GetInsn32(ArrayRef<const uint8_t> code, uint32_t offset) { DCHECK_LE(offset + 4u, code.size()); - DCHECK_EQ(offset & 1u, 0u); + DCHECK_ALIGNED(offset, 2u); const uint8_t* addr = &code[offset]; return (static_cast<uint32_t>(addr[0]) << 16) + @@ -369,5 +404,18 @@ uint32_t Thumb2RelativePatcher::GetInsn32(Vector* code, uint32_t offset) { return GetInsn32(ArrayRef<const uint8_t>(*code), offset); } +uint32_t Thumb2RelativePatcher::GetInsn16(ArrayRef<const uint8_t> code, uint32_t offset) { + DCHECK_LE(offset + 2u, code.size()); + DCHECK_ALIGNED(offset, 2u); + const uint8_t* addr = &code[offset]; + return (static_cast<uint32_t>(addr[0]) << 0) + (static_cast<uint32_t>(addr[1]) << 8); +} + +template <typename Vector> +uint32_t Thumb2RelativePatcher::GetInsn16(Vector* code, uint32_t offset) { + static_assert(std::is_same<typename Vector::value_type, uint8_t>::value, "Invalid value type"); + return GetInsn16(ArrayRef<const uint8_t>(*code), offset); +} + } // namespace linker } // namespace art diff --git a/compiler/linker/arm/relative_patcher_thumb2.h b/compiler/linker/arm/relative_patcher_thumb2.h index 7fad245856..7e787d2916 100644 --- a/compiler/linker/arm/relative_patcher_thumb2.h +++ b/compiler/linker/arm/relative_patcher_thumb2.h @@ -35,26 +35,37 @@ class Thumb2RelativePatcher FINAL : public ArmBaseRelativePatcher { public: static constexpr uint32_t kBakerCcEntrypointRegister = 4u; - static uint32_t EncodeBakerReadBarrierFieldData(uint32_t base_reg, uint32_t holder_reg) { + static uint32_t EncodeBakerReadBarrierFieldData(uint32_t base_reg, + uint32_t holder_reg, + bool narrow) { CheckValidReg(base_reg); CheckValidReg(holder_reg); + DCHECK(!narrow || base_reg < 8u) << base_reg; + BakerReadBarrierWidth width = + narrow ? BakerReadBarrierWidth::kNarrow : BakerReadBarrierWidth::kWide; return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kField) | BakerReadBarrierFirstRegField::Encode(base_reg) | - BakerReadBarrierSecondRegField::Encode(holder_reg); + BakerReadBarrierSecondRegField::Encode(holder_reg) | + BakerReadBarrierWidthField::Encode(width); } static uint32_t EncodeBakerReadBarrierArrayData(uint32_t base_reg) { CheckValidReg(base_reg); return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kArray) | BakerReadBarrierFirstRegField::Encode(base_reg) | - BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg); + BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg) | + BakerReadBarrierWidthField::Encode(BakerReadBarrierWidth::kWide); } - static uint32_t EncodeBakerReadBarrierGcRootData(uint32_t root_reg) { + static uint32_t EncodeBakerReadBarrierGcRootData(uint32_t root_reg, bool narrow) { CheckValidReg(root_reg); + DCHECK(!narrow || root_reg < 8u) << root_reg; + BakerReadBarrierWidth width = + narrow ? 
BakerReadBarrierWidth::kNarrow : BakerReadBarrierWidth::kWide; return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kGcRoot) | BakerReadBarrierFirstRegField::Encode(root_reg) | - BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg); + BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg) | + BakerReadBarrierWidthField::Encode(width); } explicit Thumb2RelativePatcher(RelativePatcherTargetProvider* provider); @@ -86,6 +97,12 @@ class Thumb2RelativePatcher FINAL : public ArmBaseRelativePatcher { kLast }; + enum class BakerReadBarrierWidth : uint8_t { + kWide, // 32-bit LDR (and 32-bit NEG if heap poisoning is enabled). + kNarrow, // 16-bit LDR (and 16-bit NEG if heap poisoning is enabled). + kLast + }; + static constexpr size_t kBitsForBakerReadBarrierKind = MinimumBitsToStore(static_cast<size_t>(BakerReadBarrierKind::kLast)); static constexpr size_t kBitsForRegister = 4u; @@ -95,9 +112,14 @@ class Thumb2RelativePatcher FINAL : public ArmBaseRelativePatcher { BitField<uint32_t, kBitsForBakerReadBarrierKind, kBitsForRegister>; using BakerReadBarrierSecondRegField = BitField<uint32_t, kBitsForBakerReadBarrierKind + kBitsForRegister, kBitsForRegister>; + static constexpr size_t kBitsForBakerReadBarrierWidth = + MinimumBitsToStore(static_cast<size_t>(BakerReadBarrierWidth::kLast)); + using BakerReadBarrierWidthField = BitField<BakerReadBarrierWidth, + kBitsForBakerReadBarrierKind + 2 * kBitsForRegister, + kBitsForBakerReadBarrierWidth>; static void CheckValidReg(uint32_t reg) { - DCHECK(reg < 12u && reg != kBakerCcEntrypointRegister); + DCHECK(reg < 12u && reg != kBakerCcEntrypointRegister) << reg; } void CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& assembler, uint32_t encoded_data); @@ -108,6 +130,11 @@ class Thumb2RelativePatcher FINAL : public ArmBaseRelativePatcher { template <typename Vector> static uint32_t GetInsn32(Vector* code, uint32_t offset); + static uint32_t GetInsn16(ArrayRef<const uint8_t> code, uint32_t offset); + + template <typename Vector> + static uint32_t GetInsn16(Vector* code, uint32_t offset); + friend class Thumb2RelativePatcherTest; DISALLOW_COPY_AND_ASSIGN(Thumb2RelativePatcher); diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc index 2e28349231..af5fa40dc1 100644 --- a/compiler/linker/arm/relative_patcher_thumb2_test.cc +++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc @@ -52,6 +52,9 @@ class Thumb2RelativePatcherTest : public RelativePatcherTest { // BNE +0, 32-bit, encoding T3. Bits 0-10, 11, 13, 16-21, 26 are placeholder for target offset. static constexpr uint32_t kBneWPlus0 = 0xf0408000u; + // LDR immediate, 16-bit, encoding T1. Bits 6-10 are imm5, 0-2 are Rt, 3-5 are Rn. + static constexpr uint32_t kLdrInsn = 0x6800u; + // LDR immediate, 32-bit, encoding T3. Bits 0-11 are offset, 12-15 are Rt, 16-20 are Rn. 
static constexpr uint32_t kLdrWInsn = 0xf8d00000u; @@ -223,9 +226,11 @@ class Thumb2RelativePatcherTest : public RelativePatcherTest { void TestStringReference(uint32_t string_offset); void CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches, uint32_t target_offset); - std::vector<uint8_t> CompileBakerOffsetThunk(uint32_t base_reg, uint32_t holder_reg) { + std::vector<uint8_t> CompileBakerOffsetThunk(uint32_t base_reg, + uint32_t holder_reg, + bool narrow) { const LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch( - 0u, Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg)); + 0u, Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg, narrow)); ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch); return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key); } @@ -237,9 +242,9 @@ class Thumb2RelativePatcherTest : public RelativePatcherTest { return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key); } - std::vector<uint8_t> CompileBakerGcRootThunk(uint32_t root_reg) { + std::vector<uint8_t> CompileBakerGcRootThunk(uint32_t root_reg, bool narrow) { LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch( - 0u, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg)); + 0u, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow)); ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch); return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key); } @@ -260,7 +265,8 @@ class Thumb2RelativePatcherTest : public RelativePatcherTest { (static_cast<uint32_t>(output_[offset + 1]) << 8); } - void TestBakerField(uint32_t offset, uint32_t ref_reg); + void TestBakerFieldWide(uint32_t offset, uint32_t ref_reg); + void TestBakerFieldNarrow(uint32_t offset, uint32_t ref_reg); }; const uint8_t Thumb2RelativePatcherTest::kCallRawCode[] = { @@ -568,7 +574,7 @@ TEST_F(Thumb2RelativePatcherTest, StringReference4) { ASSERT_LT(GetMethodOffset(1u), 0xfcu); } -void Thumb2RelativePatcherTest::TestBakerField(uint32_t offset, uint32_t ref_reg) { +void Thumb2RelativePatcherTest::TestBakerFieldWide(uint32_t offset, uint32_t ref_reg) { uint32_t valid_regs[] = { 0, 1, 2, 3, 5, 6, 7, // R4 is reserved for entrypoint address. 8, 9, 10, 11, // IP, SP, LR and PC are reserved. 
@@ -584,8 +590,8 @@ void Thumb2RelativePatcherTest::TestBakerField(uint32_t offset, uint32_t ref_reg const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr}); ASSERT_EQ(kMethodCodeSize, raw_code.size()); ArrayRef<const uint8_t> code(raw_code); - uint32_t encoded_data = - Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg); + uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData( + base_reg, holder_reg, /* narrow */ false); const LinkerPatch patches[] = { LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset, encoded_data), }; @@ -608,7 +614,113 @@ void Thumb2RelativePatcherTest::TestBakerField(uint32_t offset, uint32_t ref_reg ASSERT_TRUE( CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code))); - std::vector<uint8_t> expected_thunk = CompileBakerOffsetThunk(base_reg, holder_reg); + std::vector<uint8_t> expected_thunk = + CompileBakerOffsetThunk(base_reg, holder_reg, /* narrow */ false); + ASSERT_GT(output_.size(), thunk_offset); + ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size()); + ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset, + expected_thunk.size()); + if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) { + DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk); + ASSERT_TRUE(false); + } + + size_t gray_check_offset = thunk_offset; + if (holder_reg == base_reg) { + // Verify that the null-check uses the correct register, i.e. holder_reg. + if (holder_reg < 8) { + ASSERT_GE(output_.size() - gray_check_offset, 2u); + ASSERT_EQ(0xb100 | holder_reg, GetOutputInsn16(thunk_offset) & 0xfd07u); + gray_check_offset +=2u; + } else { + ASSERT_GE(output_.size() - gray_check_offset, 6u); + ASSERT_EQ(0xf1b00f00u | (holder_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u); + ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u); // BEQ + gray_check_offset += 6u; + } + } + // Verify that the lock word for gray bit check is loaded from the holder address. + ASSERT_GE(output_.size() - gray_check_offset, + 4u * /* 32-bit instructions */ 4u + 2u * /* 16-bit instructions */ 2u); + const uint32_t load_lock_word = + kLdrWInsn | + (holder_reg << 16) | + (/* IP */ 12 << 12) | + mirror::Object::MonitorOffset().Uint32Value(); + ASSERT_EQ(load_lock_word, GetOutputInsn32(gray_check_offset)); + // Verify the gray bit check. + DCHECK_GE(LockWord::kReadBarrierStateShift, 8u); // ROR modified immediate. + uint32_t ror_shift = 7 + (32 - LockWord::kReadBarrierStateShift); + const uint32_t tst_gray_bit_without_offset = + 0xf0100f00 | (/* IP */ 12 << 16) + | (((ror_shift >> 4) & 1) << 26) // i + | (((ror_shift >> 1) & 7) << 12) // imm3 + | ((ror_shift & 1) << 7); // imm8, ROR('1':imm8<7:0>, ror_shift). + EXPECT_EQ(tst_gray_bit_without_offset, GetOutputInsn32(gray_check_offset + 4u)); + EXPECT_EQ(0xd100u, GetOutputInsn16(gray_check_offset + 8u) & 0xff00u); // BNE + // Verify the fake dependency (skip "ADD LR, LR, #ldr_offset"). + const uint32_t fake_dependency = + 0xeb000010 | // ADD Rd, Rn, Rm, LSR 32 (type=01, imm3=000, imm2=00) + (/* IP */ 12) | // Rm = IP + (base_reg << 16) | // Rn = base_reg + (base_reg << 8); // Rd = base_reg + EXPECT_EQ(fake_dependency, GetOutputInsn32(gray_check_offset + 14u)); + // Do not check the rest of the implementation. + + // The next thunk follows on the next aligned offset. 
+ thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment); + } + } +} + +void Thumb2RelativePatcherTest::TestBakerFieldNarrow(uint32_t offset, uint32_t ref_reg) { + uint32_t valid_regs[] = { + 0, 1, 2, 3, 5, 6, 7, // R4 is reserved for entrypoint address. + 8, 9, 10, 11, // IP, SP, LR and PC are reserved. + }; + DCHECK_ALIGNED(offset, 4u); + DCHECK_LT(offset, 32u); + constexpr size_t kMethodCodeSize = 6u; + constexpr size_t kLiteralOffset = 0u; + uint32_t method_idx = 0u; + for (uint32_t base_reg : valid_regs) { + if (base_reg >= 8u) { + continue; + } + for (uint32_t holder_reg : valid_regs) { + uint32_t ldr = kLdrInsn | (offset << (6 - 2)) | (base_reg << 3) | ref_reg; + const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr}); + ASSERT_EQ(kMethodCodeSize, raw_code.size()); + ArrayRef<const uint8_t> code(raw_code); + uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData( + base_reg, holder_reg, /* narrow */ true); + const LinkerPatch patches[] = { + LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset, encoded_data), + }; + ++method_idx; + AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches)); + } + } + Link(); + + // All thunks are at the end. + uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment); + method_idx = 0u; + for (uint32_t base_reg : valid_regs) { + if (base_reg >= 8u) { + continue; + } + for (uint32_t holder_reg : valid_regs) { + ++method_idx; + uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset); + uint32_t ldr = kLdrInsn | (offset << (6 - 2)) | (base_reg << 3) | ref_reg; + const std::vector<uint8_t> expected_code = RawCode({bne, ldr}); + ASSERT_EQ(kMethodCodeSize, expected_code.size()) << "bne=0x" << std::hex << bne; + ASSERT_TRUE( + CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code))); + + std::vector<uint8_t> expected_thunk = + CompileBakerOffsetThunk(base_reg, holder_reg, /* narrow */ true); ASSERT_GT(output_.size(), thunk_offset); ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size()); ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset, @@ -666,15 +778,26 @@ void Thumb2RelativePatcherTest::TestBakerField(uint32_t offset, uint32_t ref_reg } } -#define TEST_BAKER_FIELD(offset, ref_reg) \ - TEST_F(Thumb2RelativePatcherTest, \ - BakerOffset##offset##_##ref_reg) { \ - TestBakerField(offset, ref_reg); \ +#define TEST_BAKER_FIELD_WIDE(offset, ref_reg) \ + TEST_F(Thumb2RelativePatcherTest, \ + BakerOffsetWide##offset##_##ref_reg) { \ + TestBakerFieldWide(offset, ref_reg); \ } -TEST_BAKER_FIELD(/* offset */ 0, /* ref_reg */ 0) -TEST_BAKER_FIELD(/* offset */ 8, /* ref_reg */ 7) -TEST_BAKER_FIELD(/* offset */ 0xffc, /* ref_reg */ 11) +TEST_BAKER_FIELD_WIDE(/* offset */ 0, /* ref_reg */ 0) +TEST_BAKER_FIELD_WIDE(/* offset */ 8, /* ref_reg */ 3) +TEST_BAKER_FIELD_WIDE(/* offset */ 28, /* ref_reg */ 7) +TEST_BAKER_FIELD_WIDE(/* offset */ 0xffc, /* ref_reg */ 11) + +#define TEST_BAKER_FIELD_NARROW(offset, ref_reg) \ + TEST_F(Thumb2RelativePatcherTest, \ + BakerOffsetNarrow##offset##_##ref_reg) { \ + TestBakerFieldNarrow(offset, ref_reg); \ + } + +TEST_BAKER_FIELD_NARROW(/* offset */ 0, /* ref_reg */ 0) +TEST_BAKER_FIELD_NARROW(/* offset */ 8, /* ref_reg */ 3) +TEST_BAKER_FIELD_NARROW(/* offset */ 28, /* ref_reg */ 7) TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddle) { // One thunk in the middle with maximum distance branches to it from both sides. 
@@ -682,8 +805,8 @@ TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddle) { constexpr uint32_t kLiteralOffset1 = 6u; const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn}); ArrayRef<const uint8_t> code1(raw_code1); - uint32_t encoded_data = - Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0); + uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData( + /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false); const LinkerPatch patches1[] = { LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data), }; @@ -710,7 +833,8 @@ TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddle) { // - thunk size and method 3 pre-header, rounded up (padding in between if needed) // - method 3 code and method 4 pre-header, rounded up (padding in between if needed) // - method 4 header (let there be no padding between method 4 code and method 5 pre-header). - size_t thunk_size = CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0).size(); + size_t thunk_size = + CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false).size(); size_t filler2_size = 1 * MB - (kLiteralOffset2 + kPcAdjustment) - RoundUp(thunk_size + sizeof(OatQuickMethodHeader), kArmAlignment) @@ -749,8 +873,8 @@ TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkBeforeFiller) { constexpr uint32_t kLiteralOffset1 = 4u; const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kBneWPlus0, kLdrWInsn, kNopInsn}); ArrayRef<const uint8_t> code1(raw_code1); - uint32_t encoded_data = - Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0); + uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData( + /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false); const LinkerPatch patches1[] = { LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data), }; @@ -779,8 +903,8 @@ TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddleUnreachableFromLast constexpr uint32_t kLiteralOffset1 = 6u; const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn}); ArrayRef<const uint8_t> code1(raw_code1); - uint32_t encoded_data = - Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0); + uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData( + /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false); const LinkerPatch patches1[] = { LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data), }; @@ -809,7 +933,8 @@ TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddleUnreachableFromLast // - thunk size and method 3 pre-header, rounded up (padding in between if needed) // - method 3 code and method 4 pre-header, rounded up (padding in between if needed) // - method 4 header (let there be no padding between method 4 code and method 5 pre-header). 
- size_t thunk_size = CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0).size(); + size_t thunk_size = + CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false).size(); size_t filler2_size = 1 * MB - (kReachableFromOffset2 + kPcAdjustment) - RoundUp(thunk_size + sizeof(OatQuickMethodHeader), kArmAlignment) @@ -929,7 +1054,7 @@ TEST_F(Thumb2RelativePatcherTest, BakerArray) { } } -TEST_F(Thumb2RelativePatcherTest, BakerGcRoot) { +TEST_F(Thumb2RelativePatcherTest, BakerGcRootWide) { uint32_t valid_regs[] = { 0, 1, 2, 3, 5, 6, 7, // R4 is reserved for entrypoint address. 8, 9, 10, 11, // IP, SP, LR and PC are reserved. @@ -945,7 +1070,8 @@ TEST_F(Thumb2RelativePatcherTest, BakerGcRoot) { ArrayRef<const uint8_t> code(raw_code); const LinkerPatch patches[] = { LinkerPatch::BakerReadBarrierBranchPatch( - kLiteralOffset, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg)), + kLiteralOffset, + Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, /* narrow */ false)), }; AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches)); } @@ -962,7 +1088,7 @@ TEST_F(Thumb2RelativePatcherTest, BakerGcRoot) { ASSERT_EQ(kMethodCodeSize, expected_code.size()); EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code))); - std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg); + std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg, /* narrow */ false); ASSERT_GT(output_.size(), thunk_offset); ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size()); ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset, @@ -972,7 +1098,7 @@ TEST_F(Thumb2RelativePatcherTest, BakerGcRoot) { ASSERT_TRUE(false); } - // Verify that the fast-path null-check CBZ uses the correct register, i.e. root_reg. + // Verify that the fast-path null-check uses the correct register, i.e. root_reg. if (root_reg < 8) { ASSERT_GE(output_.size() - thunk_offset, 2u); ASSERT_EQ(0xb100 | root_reg, GetOutputInsn16(thunk_offset) & 0xfd07u); @@ -988,6 +1114,60 @@ TEST_F(Thumb2RelativePatcherTest, BakerGcRoot) { } } +TEST_F(Thumb2RelativePatcherTest, BakerGcRootNarrow) { + uint32_t valid_regs[] = { + 0, 1, 2, 3, 5, 6, 7, // R4 is reserved for entrypoint address. + // Not appplicable to high registers. + }; + constexpr size_t kMethodCodeSize = 6u; + constexpr size_t kLiteralOffset = 2u; + uint32_t method_idx = 0u; + for (uint32_t root_reg : valid_regs) { + ++method_idx; + uint32_t ldr = kLdrInsn | (/* offset */ 8 << (6 - 2)) | (/* base_reg */ 0 << 3) | root_reg; + const std::vector<uint8_t> raw_code = RawCode({ldr, kBneWPlus0}); + ASSERT_EQ(kMethodCodeSize, raw_code.size()); + ArrayRef<const uint8_t> code(raw_code); + const LinkerPatch patches[] = { + LinkerPatch::BakerReadBarrierBranchPatch( + kLiteralOffset, + Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, /* narrow */ true)), + }; + AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches)); + } + Link(); + + // All thunks are at the end. 
+ uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment); + method_idx = 0u; + for (uint32_t root_reg : valid_regs) { + ++method_idx; + uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset); + uint32_t ldr = kLdrInsn | (/* offset */ 8 << (6 - 2)) | (/* base_reg */ 0 << 3) | root_reg; + const std::vector<uint8_t> expected_code = RawCode({ldr, bne}); + ASSERT_EQ(kMethodCodeSize, expected_code.size()); + EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code))); + + std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg, /* narrow */ true); + ASSERT_GT(output_.size(), thunk_offset); + ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size()); + ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset, + expected_thunk.size()); + if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) { + DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk); + ASSERT_TRUE(false); + } + + // Verify that the fast-path null-check CBZ uses the correct register, i.e. root_reg. + ASSERT_GE(output_.size() - thunk_offset, 2u); + ASSERT_EQ(0xb100 | root_reg, GetOutputInsn16(thunk_offset) & 0xfd07u); + // Do not check the rest of the implementation. + + // The next thunk follows on the next aligned offset. + thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment); + } +} + TEST_F(Thumb2RelativePatcherTest, BakerGcRootOffsetBits) { // Test 1MiB of patches to the same thunk to stress-test different large offsets. // (The low bits are not that important but the location of the high bits is easy to get wrong.) @@ -998,7 +1178,8 @@ TEST_F(Thumb2RelativePatcherTest, BakerGcRootOffsetBits) { patches.reserve(num_patches); const uint32_t ldr = kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (/* root_reg */ 0 << 12); - uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 0); + uint32_t encoded_data = + Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 0, /* narrow */ false); for (size_t i = 0; i != num_patches; ++i) { PushBackInsn(&code, ldr); PushBackInsn(&code, kBneWPlus0); @@ -1067,7 +1248,7 @@ TEST_F(Thumb2RelativePatcherTest, BakerAndMethodCallInteraction) { // this pushes the first GC root thunk's pending MaxNextOffset() before the method call // thunk's pending MaxNextOffset() which needs to be adjusted. ASSERT_LT(RoundUp(CompileMethodCallThunk().size(), kArmAlignment) + kArmAlignment, - CompileBakerGcRootThunk(/* root_reg */ 0).size()); + CompileBakerGcRootThunk(/* root_reg */ 0, /* narrow */ false).size()); static_assert(kArmAlignment == 8, "Code below assumes kArmAlignment == 8"); constexpr size_t kBakerLiteralOffset1 = kArmAlignment + 2u - kPcAdjustment; constexpr size_t kBakerLiteralOffset2 = kBakerLiteralOffset1 + kArmAlignment; @@ -1080,9 +1261,9 @@ TEST_F(Thumb2RelativePatcherTest, BakerAndMethodCallInteraction) { ldr2, kBneWPlus0, // Second GC root LDR with read barrier. 
}); uint32_t encoded_data1 = - Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 1); + Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 1, /* narrow */ false); uint32_t encoded_data2 = - Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 2); + Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 2, /* narrow */ false); const LinkerPatch last_method_patches[] = { LinkerPatch::BakerReadBarrierBranchPatch(kBakerLiteralOffset1, encoded_data1), LinkerPatch::BakerReadBarrierBranchPatch(kBakerLiteralOffset2, encoded_data2), diff --git a/compiler/linker/arm64/relative_patcher_arm64.h b/compiler/linker/arm64/relative_patcher_arm64.h index d1ab410a7e..02a5b1ef8f 100644 --- a/compiler/linker/arm64/relative_patcher_arm64.h +++ b/compiler/linker/arm64/relative_patcher_arm64.h @@ -100,7 +100,7 @@ class Arm64RelativePatcher FINAL : public ArmBaseRelativePatcher { BitField<uint32_t, kBitsForBakerReadBarrierKind + kBitsForRegister, kBitsForRegister>; static void CheckValidReg(uint32_t reg) { - DCHECK(reg < 30u && reg != 16u && reg != 17u); + DCHECK(reg < 30u && reg != 16u && reg != 17u) << reg; } void CompileBakerReadBarrierThunk(arm64::Arm64Assembler& assembler, uint32_t encoded_data); diff --git a/compiler/optimizing/block_builder.cc b/compiler/optimizing/block_builder.cc index 5e70a8284d..1e75f10ebe 100644 --- a/compiler/optimizing/block_builder.cc +++ b/compiler/optimizing/block_builder.cc @@ -310,16 +310,18 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // least one predecessor is not covered by the same TryItem as the try block. // We do not split each edge separately, but rather create one boundary block // that all predecessors are relinked to. This preserves loop headers (b/23895756). - for (auto entry : try_block_info) { - HBasicBlock* try_block = graph_->GetBlocks()[entry.first]; + for (const auto& entry : try_block_info) { + uint32_t block_id = entry.first; + const DexFile::TryItem* try_item = entry.second; + HBasicBlock* try_block = graph_->GetBlocks()[block_id]; for (HBasicBlock* predecessor : try_block->GetPredecessors()) { - if (GetTryItem(predecessor, try_block_info) != entry.second) { + if (GetTryItem(predecessor, try_block_info) != try_item) { // Found a predecessor not covered by the same TryItem. Insert entering // boundary block. HTryBoundary* try_entry = new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kEntry, try_block->GetDexPc()); try_block->CreateImmediateDominator()->AddInstruction(try_entry); - LinkToCatchBlocks(try_entry, code_item_, entry.second, catch_blocks); + LinkToCatchBlocks(try_entry, code_item_, try_item, catch_blocks); break; } } @@ -327,8 +329,10 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // Do a second pass over the try blocks and insert exit TryBoundaries where // the successor is not in the same TryItem. - for (auto entry : try_block_info) { - HBasicBlock* try_block = graph_->GetBlocks()[entry.first]; + for (const auto& entry : try_block_info) { + uint32_t block_id = entry.first; + const DexFile::TryItem* try_item = entry.second; + HBasicBlock* try_block = graph_->GetBlocks()[block_id]; // NOTE: Do not use iterators because SplitEdge would invalidate them. for (size_t i = 0, e = try_block->GetSuccessors().size(); i < e; ++i) { HBasicBlock* successor = try_block->GetSuccessors()[i]; @@ -337,7 +341,7 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { // covered by the same TryItem. 
Otherwise the previous pass would have // created a non-throwing boundary block. if (GetTryItem(successor, try_block_info) != nullptr) { - DCHECK_EQ(entry.second, GetTryItem(successor, try_block_info)); + DCHECK_EQ(try_item, GetTryItem(successor, try_block_info)); continue; } @@ -345,7 +349,7 @@ void HBasicBlockBuilder::InsertTryBoundaryBlocks() { HTryBoundary* try_exit = new (arena_) HTryBoundary(HTryBoundary::BoundaryKind::kExit, successor->GetDexPc()); graph_->SplitEdge(try_block, successor)->AddInstruction(try_exit); - LinkToCatchBlocks(try_exit, code_item_, entry.second, catch_blocks); + LinkToCatchBlocks(try_exit, code_item_, try_item, catch_blocks); } } } diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc index ed630cda91..f3ecdf036a 100644 --- a/compiler/optimizing/bounds_check_elimination.cc +++ b/compiler/optimizing/bounds_check_elimination.cc @@ -1734,8 +1734,8 @@ class BCEVisitor : public HGraphVisitor { */ void InsertPhiNodes() { // Scan all new deoptimization blocks. - for (auto it1 = taken_test_loop_.begin(); it1 != taken_test_loop_.end(); ++it1) { - HBasicBlock* true_block = it1->second; + for (const auto& entry : taken_test_loop_) { + HBasicBlock* true_block = entry.second; HBasicBlock* new_preheader = true_block->GetSingleSuccessor(); // Scan all instructions in a new deoptimization block. for (HInstructionIterator it(true_block->GetInstructions()); !it.Done(); it.Advance()) { diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc index 5136d7d2b8..65f3c72e99 100644 --- a/compiler/optimizing/code_generator.cc +++ b/compiler/optimizing/code_generator.cc @@ -145,7 +145,7 @@ size_t CodeGenerator::GetCacheOffset(uint32_t index) { } size_t CodeGenerator::GetCachePointerOffset(uint32_t index) { - auto pointer_size = InstructionSetPointerSize(GetInstructionSet()); + PointerSize pointer_size = InstructionSetPointerSize(GetInstructionSet()); return static_cast<size_t>(pointer_size) * index; } diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index ea463eeb62..9ef692aaf0 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -842,7 +842,7 @@ class SlowPathGenerator { const uint32_t dex_pc = instruction->GetDexPc(); auto iter = slow_path_map_.find(dex_pc); if (iter != slow_path_map_.end()) { - auto candidates = iter->second; + const ArenaVector<std::pair<InstructionType*, SlowPathCode*>>& candidates = iter->second; for (const auto& it : candidates) { InstructionType* other_instruction = it.first; SlowPathCodeType* other_slow_path = down_cast<SlowPathCodeType*>(it.second); diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 1990e8f67d..ab3d499235 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -90,13 +90,17 @@ static inline void CheckLastTempIsBakerCcEntrypointRegister(HInstruction* instru } static inline void EmitPlaceholderBne(CodeGeneratorARM* codegen, Label* bne_label) { - DCHECK(down_cast<Thumb2Assembler*>(codegen->GetAssembler())->IsForced32Bit()); + ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(codegen->GetAssembler())); __ BindTrackedLabel(bne_label); Label placeholder_label; __ b(&placeholder_label, NE); // Placeholder, patched at link-time. 
__ Bind(&placeholder_label); } +static inline bool CanEmitNarrowLdr(Register rt, Register rn, uint32_t offset) { + return ArmAssembler::IsLowRegister(rt) && ArmAssembler::IsLowRegister(rn) && offset < 32u; +} + static constexpr int kRegListThreshold = 4; // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers, @@ -1652,34 +1656,6 @@ static void GenerateVcmp(HInstruction* instruction, CodeGeneratorARM* codegen) { } } -static int64_t AdjustConstantForCondition(int64_t value, - IfCondition* condition, - IfCondition* opposite) { - if (value == 1) { - if (*condition == kCondB) { - value = 0; - *condition = kCondEQ; - *opposite = kCondNE; - } else if (*condition == kCondAE) { - value = 0; - *condition = kCondNE; - *opposite = kCondEQ; - } - } else if (value == -1) { - if (*condition == kCondGT) { - value = 0; - *condition = kCondGE; - *opposite = kCondLT; - } else if (*condition == kCondLE) { - value = 0; - *condition = kCondLT; - *opposite = kCondGE; - } - } - - return value; -} - static std::pair<Condition, Condition> GenerateLongTestConstant(HCondition* condition, bool invert, CodeGeneratorARM* codegen) { @@ -1693,7 +1669,7 @@ static std::pair<Condition, Condition> GenerateLongTestConstant(HCondition* cond std::swap(cond, opposite); } - std::pair<Condition, Condition> ret(EQ, NE); + std::pair<Condition, Condition> ret; const Location left = locations->InAt(0); const Location right = locations->InAt(1); @@ -1701,38 +1677,7 @@ static std::pair<Condition, Condition> GenerateLongTestConstant(HCondition* cond const Register left_high = left.AsRegisterPairHigh<Register>(); const Register left_low = left.AsRegisterPairLow<Register>(); - int64_t value = AdjustConstantForCondition(right.GetConstant()->AsLongConstant()->GetValue(), - &cond, - &opposite); - - // Comparisons against 0 are common enough to deserve special attention. - if (value == 0) { - switch (cond) { - case kCondNE: - // x > 0 iff x != 0 when the comparison is unsigned. - case kCondA: - ret = std::make_pair(NE, EQ); - FALLTHROUGH_INTENDED; - case kCondEQ: - // x <= 0 iff x == 0 when the comparison is unsigned. - case kCondBE: - __ orrs(IP, left_low, ShifterOperand(left_high)); - return ret; - case kCondLT: - case kCondGE: - __ cmp(left_high, ShifterOperand(0)); - return std::make_pair(ARMCondition(cond), ARMCondition(opposite)); - // Trivially true or false. 
- case kCondB: - ret = std::make_pair(NE, EQ); - FALLTHROUGH_INTENDED; - case kCondAE: - __ cmp(left_low, ShifterOperand(left_low)); - return ret; - default: - break; - } - } + int64_t value = right.GetConstant()->AsLongConstant()->GetValue(); switch (cond) { case kCondEQ: @@ -1892,14 +1837,10 @@ static std::pair<Condition, Condition> GenerateTest(HCondition* condition, static bool CanGenerateTest(HCondition* condition, ArmAssembler* assembler) { if (condition->GetLeft()->GetType() == Primitive::kPrimLong) { const LocationSummary* const locations = condition->GetLocations(); + const IfCondition c = condition->GetCondition(); if (locations->InAt(1).IsConstant()) { - IfCondition c = condition->GetCondition(); - IfCondition opposite = condition->GetOppositeCondition(); - const int64_t value = AdjustConstantForCondition( - Int64FromConstant(locations->InAt(1).GetConstant()), - &c, - &opposite); + const int64_t value = locations->InAt(1).GetConstant()->AsLongConstant()->GetValue(); ShifterOperand so; if (c < kCondLT || c > kCondGE) { @@ -1907,11 +1848,9 @@ static bool CanGenerateTest(HCondition* condition, ArmAssembler* assembler) { // we check that the least significant half of the first input to be compared // is in a low register (the other half is read outside an IT block), and // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP - // encoding can be used; 0 is always handled, no matter what registers are - // used by the first input. - if (value != 0 && - (!ArmAssembler::IsLowRegister(locations->InAt(0).AsRegisterPairLow<Register>()) || - !IsUint<8>(Low32Bits(value)))) { + // encoding can be used. + if (!ArmAssembler::IsLowRegister(locations->InAt(0).AsRegisterPairLow<Register>()) || + !IsUint<8>(Low32Bits(value))) { return false; } } else if (c == kCondLE || c == kCondGT) { @@ -1938,329 +1877,6 @@ static bool CanGenerateTest(HCondition* condition, ArmAssembler* assembler) { return true; } -static void GenerateConditionGeneric(HCondition* cond, CodeGeneratorARM* codegen) { - DCHECK(CanGenerateTest(cond, codegen->GetAssembler())); - - const Register out = cond->GetLocations()->Out().AsRegister<Register>(); - const auto condition = GenerateTest(cond, false, codegen); - - __ mov(out, ShifterOperand(0), AL, kCcKeep); - - if (ArmAssembler::IsLowRegister(out)) { - __ it(condition.first); - __ mov(out, ShifterOperand(1), condition.first); - } else { - Label done_label; - Label* const final_label = codegen->GetFinalLabel(cond, &done_label); - - __ b(final_label, condition.second); - __ LoadImmediate(out, 1); - - if (done_label.IsLinked()) { - __ Bind(&done_label); - } - } -} - -static void GenerateEqualLong(HCondition* cond, CodeGeneratorARM* codegen) { - DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong); - - const LocationSummary* const locations = cond->GetLocations(); - IfCondition condition = cond->GetCondition(); - const Register out = locations->Out().AsRegister<Register>(); - const Location left = locations->InAt(0); - const Location right = locations->InAt(1); - Register left_high = left.AsRegisterPairHigh<Register>(); - Register left_low = left.AsRegisterPairLow<Register>(); - - if (right.IsConstant()) { - IfCondition opposite = cond->GetOppositeCondition(); - const int64_t value = AdjustConstantForCondition(Int64FromConstant(right.GetConstant()), - &condition, - &opposite); - int32_t value_high = -High32Bits(value); - int32_t value_low = -Low32Bits(value); - - // The output uses Location::kNoOutputOverlap. 
- if (out == left_high) { - std::swap(left_low, left_high); - std::swap(value_low, value_high); - } - - __ AddConstant(out, left_low, value_low); - __ AddConstant(IP, left_high, value_high); - } else { - DCHECK(right.IsRegisterPair()); - __ sub(IP, left_high, ShifterOperand(right.AsRegisterPairHigh<Register>())); - __ sub(out, left_low, ShifterOperand(right.AsRegisterPairLow<Register>())); - } - - // Need to check after calling AdjustConstantForCondition(). - DCHECK(condition == kCondEQ || condition == kCondNE) << condition; - - if (condition == kCondNE && ArmAssembler::IsLowRegister(out)) { - __ orrs(out, out, ShifterOperand(IP)); - __ it(NE); - __ mov(out, ShifterOperand(1), NE); - } else { - __ orr(out, out, ShifterOperand(IP)); - codegen->GenerateConditionWithZero(condition, out, out, IP); - } -} - -static void GenerateLongComparesAndJumps(HCondition* cond, - Label* true_label, - Label* false_label, - CodeGeneratorARM* codegen) { - LocationSummary* locations = cond->GetLocations(); - Location left = locations->InAt(0); - Location right = locations->InAt(1); - IfCondition if_cond = cond->GetCondition(); - - Register left_high = left.AsRegisterPairHigh<Register>(); - Register left_low = left.AsRegisterPairLow<Register>(); - IfCondition true_high_cond = if_cond; - IfCondition false_high_cond = cond->GetOppositeCondition(); - Condition final_condition = ARMUnsignedCondition(if_cond); // unsigned on lower part - - // Set the conditions for the test, remembering that == needs to be - // decided using the low words. - switch (if_cond) { - case kCondEQ: - case kCondNE: - // Nothing to do. - break; - case kCondLT: - false_high_cond = kCondGT; - break; - case kCondLE: - true_high_cond = kCondLT; - break; - case kCondGT: - false_high_cond = kCondLT; - break; - case kCondGE: - true_high_cond = kCondGT; - break; - case kCondB: - false_high_cond = kCondA; - break; - case kCondBE: - true_high_cond = kCondB; - break; - case kCondA: - false_high_cond = kCondB; - break; - case kCondAE: - true_high_cond = kCondA; - break; - } - if (right.IsConstant()) { - int64_t value = right.GetConstant()->AsLongConstant()->GetValue(); - int32_t val_low = Low32Bits(value); - int32_t val_high = High32Bits(value); - - __ CmpConstant(left_high, val_high); - if (if_cond == kCondNE) { - __ b(true_label, ARMCondition(true_high_cond)); - } else if (if_cond == kCondEQ) { - __ b(false_label, ARMCondition(false_high_cond)); - } else { - __ b(true_label, ARMCondition(true_high_cond)); - __ b(false_label, ARMCondition(false_high_cond)); - } - // Must be equal high, so compare the lows. - __ CmpConstant(left_low, val_low); - } else { - Register right_high = right.AsRegisterPairHigh<Register>(); - Register right_low = right.AsRegisterPairLow<Register>(); - - __ cmp(left_high, ShifterOperand(right_high)); - if (if_cond == kCondNE) { - __ b(true_label, ARMCondition(true_high_cond)); - } else if (if_cond == kCondEQ) { - __ b(false_label, ARMCondition(false_high_cond)); - } else { - __ b(true_label, ARMCondition(true_high_cond)); - __ b(false_label, ARMCondition(false_high_cond)); - } - // Must be equal high, so compare the lows. - __ cmp(left_low, ShifterOperand(right_low)); - } - // The last comparison might be unsigned. 
- // TODO: optimize cases where this is always true/false - __ b(true_label, final_condition); -} - -static void GenerateConditionLong(HCondition* cond, CodeGeneratorARM* codegen) { - DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong); - - const LocationSummary* const locations = cond->GetLocations(); - IfCondition condition = cond->GetCondition(); - const Register out = locations->Out().AsRegister<Register>(); - const Location left = locations->InAt(0); - const Location right = locations->InAt(1); - - if (right.IsConstant()) { - IfCondition opposite = cond->GetOppositeCondition(); - - // Comparisons against 0 are common enough to deserve special attention. - if (AdjustConstantForCondition(Int64FromConstant(right.GetConstant()), - &condition, - &opposite) == 0) { - switch (condition) { - case kCondNE: - case kCondA: - if (ArmAssembler::IsLowRegister(out)) { - // We only care if both input registers are 0 or not. - __ orrs(out, - left.AsRegisterPairLow<Register>(), - ShifterOperand(left.AsRegisterPairHigh<Register>())); - __ it(NE); - __ mov(out, ShifterOperand(1), NE); - return; - } - - FALLTHROUGH_INTENDED; - case kCondEQ: - case kCondBE: - // We only care if both input registers are 0 or not. - __ orr(out, - left.AsRegisterPairLow<Register>(), - ShifterOperand(left.AsRegisterPairHigh<Register>())); - codegen->GenerateConditionWithZero(condition, out, out); - return; - case kCondLT: - case kCondGE: - // We only care about the sign bit. - FALLTHROUGH_INTENDED; - case kCondAE: - case kCondB: - codegen->GenerateConditionWithZero(condition, out, left.AsRegisterPairHigh<Register>()); - return; - case kCondLE: - case kCondGT: - default: - break; - } - } - } - - if ((condition == kCondEQ || condition == kCondNE) && - // If `out` is a low register, then the GenerateConditionGeneric() - // function generates a shorter code sequence that is still branchless. - (!ArmAssembler::IsLowRegister(out) || !CanGenerateTest(cond, codegen->GetAssembler()))) { - GenerateEqualLong(cond, codegen); - return; - } - - if (CanGenerateTest(cond, codegen->GetAssembler())) { - GenerateConditionGeneric(cond, codegen); - return; - } - - // Convert the jumps into the result. - Label done_label; - Label* const final_label = codegen->GetFinalLabel(cond, &done_label); - Label true_label, false_label; - - GenerateLongComparesAndJumps(cond, &true_label, &false_label, codegen); - - // False case: result = 0. - __ Bind(&false_label); - __ mov(out, ShifterOperand(0)); - __ b(final_label); - - // True case: result = 1. 
- __ Bind(&true_label); - __ mov(out, ShifterOperand(1)); - - if (done_label.IsLinked()) { - __ Bind(&done_label); - } -} - -static void GenerateConditionIntegralOrNonPrimitive(HCondition* cond, CodeGeneratorARM* codegen) { - const Primitive::Type type = cond->GetLeft()->GetType(); - - DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; - - if (type == Primitive::kPrimLong) { - GenerateConditionLong(cond, codegen); - return; - } - - const LocationSummary* const locations = cond->GetLocations(); - IfCondition condition = cond->GetCondition(); - Register in = locations->InAt(0).AsRegister<Register>(); - const Register out = locations->Out().AsRegister<Register>(); - const Location right = cond->GetLocations()->InAt(1); - int64_t value; - - if (right.IsConstant()) { - IfCondition opposite = cond->GetOppositeCondition(); - - value = AdjustConstantForCondition(Int64FromConstant(right.GetConstant()), - &condition, - &opposite); - - // Comparisons against 0 are common enough to deserve special attention. - if (value == 0) { - switch (condition) { - case kCondNE: - case kCondA: - if (ArmAssembler::IsLowRegister(out) && out == in) { - __ cmp(out, ShifterOperand(0)); - __ it(NE); - __ mov(out, ShifterOperand(1), NE); - return; - } - - FALLTHROUGH_INTENDED; - case kCondEQ: - case kCondBE: - case kCondLT: - case kCondGE: - case kCondAE: - case kCondB: - codegen->GenerateConditionWithZero(condition, out, in); - return; - case kCondLE: - case kCondGT: - default: - break; - } - } - } - - if (condition == kCondEQ || condition == kCondNE) { - ShifterOperand operand; - - if (right.IsConstant()) { - operand = ShifterOperand(value); - } else if (out == right.AsRegister<Register>()) { - // Avoid 32-bit instructions if possible. - operand = ShifterOperand(in); - in = right.AsRegister<Register>(); - } else { - operand = ShifterOperand(right.AsRegister<Register>()); - } - - if (condition == kCondNE && ArmAssembler::IsLowRegister(out)) { - __ subs(out, in, operand); - __ it(NE); - __ mov(out, ShifterOperand(1), NE); - } else { - __ sub(out, in, operand); - codegen->GenerateConditionWithZero(condition, out, out); - } - - return; - } - - GenerateConditionGeneric(cond, codegen); -} - static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) { const Primitive::Type type = constant->GetType(); bool ret = false; @@ -2867,6 +2483,89 @@ void LocationsBuilderARM::VisitExit(HExit* exit) { void InstructionCodeGeneratorARM::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } +void InstructionCodeGeneratorARM::GenerateLongComparesAndJumps(HCondition* cond, + Label* true_label, + Label* false_label) { + LocationSummary* locations = cond->GetLocations(); + Location left = locations->InAt(0); + Location right = locations->InAt(1); + IfCondition if_cond = cond->GetCondition(); + + Register left_high = left.AsRegisterPairHigh<Register>(); + Register left_low = left.AsRegisterPairLow<Register>(); + IfCondition true_high_cond = if_cond; + IfCondition false_high_cond = cond->GetOppositeCondition(); + Condition final_condition = ARMUnsignedCondition(if_cond); // unsigned on lower part + + // Set the conditions for the test, remembering that == needs to be + // decided using the low words. + switch (if_cond) { + case kCondEQ: + case kCondNE: + // Nothing to do. 
+ break; + case kCondLT: + false_high_cond = kCondGT; + break; + case kCondLE: + true_high_cond = kCondLT; + break; + case kCondGT: + false_high_cond = kCondLT; + break; + case kCondGE: + true_high_cond = kCondGT; + break; + case kCondB: + false_high_cond = kCondA; + break; + case kCondBE: + true_high_cond = kCondB; + break; + case kCondA: + false_high_cond = kCondB; + break; + case kCondAE: + true_high_cond = kCondA; + break; + } + if (right.IsConstant()) { + int64_t value = right.GetConstant()->AsLongConstant()->GetValue(); + int32_t val_low = Low32Bits(value); + int32_t val_high = High32Bits(value); + + __ CmpConstant(left_high, val_high); + if (if_cond == kCondNE) { + __ b(true_label, ARMCondition(true_high_cond)); + } else if (if_cond == kCondEQ) { + __ b(false_label, ARMCondition(false_high_cond)); + } else { + __ b(true_label, ARMCondition(true_high_cond)); + __ b(false_label, ARMCondition(false_high_cond)); + } + // Must be equal high, so compare the lows. + __ CmpConstant(left_low, val_low); + } else { + Register right_high = right.AsRegisterPairHigh<Register>(); + Register right_low = right.AsRegisterPairLow<Register>(); + + __ cmp(left_high, ShifterOperand(right_high)); + if (if_cond == kCondNE) { + __ b(true_label, ARMCondition(true_high_cond)); + } else if (if_cond == kCondEQ) { + __ b(false_label, ARMCondition(false_high_cond)); + } else { + __ b(true_label, ARMCondition(true_high_cond)); + __ b(false_label, ARMCondition(false_high_cond)); + } + // Must be equal high, so compare the lows. + __ cmp(left_low, ShifterOperand(right_low)); + } + // The last comparison might be unsigned. + // TODO: optimize cases where this is always true/false + __ b(true_label, final_condition); +} + void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condition, Label* true_target_in, Label* false_target_in) { @@ -2901,7 +2600,7 @@ void InstructionCodeGeneratorARM::GenerateCompareTestAndBranch(HCondition* condi Label* false_target = false_target_in == nullptr ? &fallthrough_target : false_target_in; DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong); - GenerateLongComparesAndJumps(condition, true_target, false_target, codegen_); + GenerateLongComparesAndJumps(condition, true_target, false_target); if (false_target != &fallthrough_target) { __ b(false_target); @@ -3216,80 +2915,6 @@ void CodeGeneratorARM::GenerateNop() { __ nop(); } -// `temp` is an extra temporary register that is used for some conditions; -// callers may not specify it, in which case the method will use a scratch -// register instead. -void CodeGeneratorARM::GenerateConditionWithZero(IfCondition condition, - Register out, - Register in, - Register temp) { - switch (condition) { - case kCondEQ: - // x <= 0 iff x == 0 when the comparison is unsigned. - case kCondBE: - if (temp == kNoRegister || (ArmAssembler::IsLowRegister(out) && out != in)) { - temp = out; - } - - // Avoid 32-bit instructions if possible; note that `in` and `temp` must be - // different as well. - if (ArmAssembler::IsLowRegister(in) && ArmAssembler::IsLowRegister(temp) && in != temp) { - // temp = - in; only 0 sets the carry flag. - __ rsbs(temp, in, ShifterOperand(0)); - - if (out == in) { - std::swap(in, temp); - } - - // out = - in + in + carry = carry - __ adc(out, temp, ShifterOperand(in)); - } else { - // If `in` is 0, then it has 32 leading zeros, and less than that otherwise. - __ clz(out, in); - // Any number less than 32 logically shifted right by 5 bits results in 0; - // the same operation on 32 yields 1. 
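  // A minimal standalone sketch of the same CLZ check, assuming 32-bit unsigned inputs;
  // std::countl_zero and the IsZero helper are illustrative only, not part of this code:
  //
  //   #include <bit>
  //   #include <cstdint>
  //
  //   // Returns 1 iff in == 0: countl_zero(0u) == 32 and 32 >> 5 == 1, while any
  //   // non-zero input has fewer than 32 leading zeros, so the shift yields 0.
  //   inline uint32_t IsZero(uint32_t in) {
  //     return static_cast<uint32_t>(std::countl_zero(in)) >> 5;
  //   }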
- __ Lsr(out, out, 5); - } - - break; - case kCondNE: - // x > 0 iff x != 0 when the comparison is unsigned. - case kCondA: - if (out == in) { - if (temp == kNoRegister || in == temp) { - temp = IP; - } - } else if (temp == kNoRegister || !ArmAssembler::IsLowRegister(temp)) { - temp = out; - } - - // temp = in - 1; only 0 does not set the carry flag. - __ subs(temp, in, ShifterOperand(1)); - // out = in + ~temp + carry = in + (-(in - 1) - 1) + carry = in - in + 1 - 1 + carry = carry - __ sbc(out, in, ShifterOperand(temp)); - break; - case kCondGE: - __ mvn(out, ShifterOperand(in)); - in = out; - FALLTHROUGH_INTENDED; - case kCondLT: - // We only care about the sign bit. - __ Lsr(out, in, 31); - break; - case kCondAE: - // Trivially true. - __ mov(out, ShifterOperand(1)); - break; - case kCondB: - // Trivially false. - __ mov(out, ShifterOperand(0)); - break; - default: - LOG(FATAL) << "Unexpected condition " << condition; - UNREACHABLE(); - } -} - void LocationsBuilderARM::HandleCondition(HCondition* cond) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cond, LocationSummary::kNoCall); @@ -3326,42 +2951,48 @@ void InstructionCodeGeneratorARM::HandleCondition(HCondition* cond) { return; } - const Primitive::Type type = cond->GetLeft()->GetType(); + const Register out = cond->GetLocations()->Out().AsRegister<Register>(); + + if (ArmAssembler::IsLowRegister(out) && CanGenerateTest(cond, codegen_->GetAssembler())) { + const auto condition = GenerateTest(cond, false, codegen_); - if (Primitive::IsFloatingPointType(type)) { - GenerateConditionGeneric(cond, codegen_); + __ it(condition.first); + __ mov(out, ShifterOperand(1), condition.first); + __ it(condition.second); + __ mov(out, ShifterOperand(0), condition.second); return; } - DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; - - if (type == Primitive::kPrimBoolean) { - const LocationSummary* const locations = cond->GetLocations(); - const IfCondition c = cond->GetCondition(); - Register left = locations->InAt(0).AsRegister<Register>(); - const Register out = locations->Out().AsRegister<Register>(); - const Location right_loc = locations->InAt(1); + // Convert the jumps into the result. + Label done_label; + Label* const final_label = codegen_->GetFinalLabel(cond, &done_label); - // All other cases are handled by the instruction simplifier. - DCHECK((c == kCondEQ || c == kCondNE) && !right_loc.IsConstant()); + if (cond->InputAt(0)->GetType() == Primitive::kPrimLong) { + Label true_label, false_label; - Register right = right_loc.AsRegister<Register>(); + GenerateLongComparesAndJumps(cond, &true_label, &false_label); - // Avoid 32-bit instructions if possible. - if (out == right) { - std::swap(left, right); - } + // False case: result = 0. + __ Bind(&false_label); + __ LoadImmediate(out, 0); + __ b(final_label); - __ eor(out, left, ShifterOperand(right)); + // True case: result = 1. 
+ __ Bind(&true_label); + __ LoadImmediate(out, 1); + } else { + DCHECK(CanGenerateTest(cond, codegen_->GetAssembler())); - if (c == kCondEQ) { - __ eor(out, out, ShifterOperand(1)); - } + const auto condition = GenerateTest(cond, false, codegen_); - return; + __ mov(out, ShifterOperand(0), AL, kCcKeep); + __ b(final_label, condition.second); + __ LoadImmediate(out, 1); } - GenerateConditionIntegralOrNonPrimitive(cond, codegen_); + if (done_label.IsLinked()) { + __ Bind(&done_label); + } } void LocationsBuilderARM::VisitEqual(HEqual* comp) { @@ -6743,6 +6374,15 @@ void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* } } +void LocationsBuilderARM::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) { RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConvention calling_convention; @@ -8430,8 +8070,9 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct // return_address: CheckLastTempIsBakerCcEntrypointRegister(instruction); + bool narrow = CanEmitNarrowLdr(root_reg, obj, offset); uint32_t custom_data = - linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg); + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow); Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data); // entrypoint_reg = @@ -8444,16 +8085,18 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct Label return_address; __ AdrCode(LR, &return_address); __ CmpConstant(kBakerCcEntrypointRegister, 0); - static_assert( - BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8, - "GC root LDR must be 2 32-bit instructions (8B) before the return address label."); // Currently the offset is always within range. If that changes, // we shall have to split the load the same way as for fields. DCHECK_LT(offset, kReferenceLoadMinFarOffset); - ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler())); + DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit()); + ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow); + int old_position = GetAssembler()->GetBuffer()->GetPosition(); __ LoadFromOffset(kLoadWord, root_reg, obj, offset); EmitPlaceholderBne(codegen_, bne_label); __ Bind(&return_address); + DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(), + narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET + : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET); } else { // Note that we do not actually check the value of // `GetIsGcMarking()` to decide whether to mark the loaded GC @@ -8553,10 +8196,12 @@ void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instr // not_gray_return_address: // // Original reference load. If the offset is too large to fit // // into LDR, we use an adjusted base register here. 
- // GcRoot<mirror::Object> reference = *(obj+offset); + // HeapReference<mirror::Object> reference = *(obj+offset); // gray_return_address: DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); + Register ref_reg = ref.AsRegister<Register>(); + bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset); Register base = obj; if (offset >= kReferenceLoadMinFarOffset) { base = temp.AsRegister<Register>(); @@ -8564,10 +8209,14 @@ void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instr static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2."); __ AddConstant(base, obj, offset & ~(kReferenceLoadMinFarOffset - 1u)); offset &= (kReferenceLoadMinFarOffset - 1u); + // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large + // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely + // increase the overall code size when taking the generated thunks into account. + DCHECK(!narrow); } CheckLastTempIsBakerCcEntrypointRegister(instruction); uint32_t custom_data = - linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj); + linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj, narrow); Label* bne_label = NewBakerReadBarrierPatch(custom_data); // entrypoint_reg = @@ -8580,19 +8229,20 @@ void CodeGeneratorARM::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instr Label return_address; __ AdrCode(LR, &return_address); __ CmpConstant(kBakerCcEntrypointRegister, 0); - ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler())); EmitPlaceholderBne(this, bne_label); - static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), - "Field LDR must be 1 32-bit instruction (4B) before the return address label; " - " 2 32-bit instructions (8B) for heap poisoning."); - Register ref_reg = ref.AsRegister<Register>(); DCHECK_LT(offset, kReferenceLoadMinFarOffset); + DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit()); + ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow); + int old_position = GetAssembler()->GetBuffer()->GetPosition(); __ LoadFromOffset(kLoadWord, ref_reg, base, offset); if (needs_null_check) { MaybeRecordImplicitNullCheck(instruction); } GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); __ Bind(&return_address); + DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(), + narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET + : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET); return; } @@ -8638,7 +8288,7 @@ void CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instr // not_gray_return_address: // // Original reference load. If the offset is too large to fit // // into LDR, we use an adjusted base register here. - // GcRoot<mirror::Object> reference = data[index]; + // HeapReference<mirror::Object> reference = data[index]; // gray_return_address: DCHECK(index.IsValid()); @@ -8663,15 +8313,15 @@ void CodeGeneratorARM::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instr Label return_address; __ AdrCode(LR, &return_address); __ CmpConstant(kBakerCcEntrypointRegister, 0); - ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler())); EmitPlaceholderBne(this, bne_label); - static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? 
-8 : -4), - "Array LDR must be 1 32-bit instruction (4B) before the return address label; " - " 2 32-bit instructions (8B) for heap poisoning."); + ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler())); + int old_position = GetAssembler()->GetBuffer()->GetPosition(); __ ldr(ref_reg, Address(data_reg, index_reg, LSL, scale_factor)); DCHECK(!needs_null_check); // The thunk cannot handle the null check. GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); __ Bind(&return_address); + DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(), + BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET); return; } @@ -9426,14 +9076,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARM::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index ac9d57aa0a..b94ee20d9d 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -299,6 +299,7 @@ class InstructionCodeGeneratorARM : public InstructionCodeGenerator { void GenerateCompareTestAndBranch(HCondition* condition, Label* true_target, Label* false_target); + void GenerateLongComparesAndJumps(HCondition* cond, Label* true_label, Label* false_label); void DivRemOneOrMinusOne(HBinaryOperation* instruction); void DivRemByPowerOfTwo(HBinaryOperation* instruction); void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); @@ -625,14 +626,6 @@ class CodeGeneratorARM : public CodeGenerator { void GenerateImplicitNullCheck(HNullCheck* instruction) OVERRIDE; void GenerateExplicitNullCheck(HNullCheck* instruction) OVERRIDE; - // `temp` is an extra temporary register that is used for some conditions; - // callers may not specify it, in which case the method will use a scratch - // register instead. 
- void GenerateConditionWithZero(IfCondition condition, - Register out, - Register in, - Register temp = kNoRegister); - private: Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp); diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 7d9778a4e7..fa39b79e39 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -1515,7 +1515,7 @@ Location ParallelMoveResolverARM64::AllocateScratchLocationFor(Location::Kind ki if (kind == Location::kRegister) { scratch = LocationFrom(vixl_temps_.AcquireX()); } else { - DCHECK(kind == Location::kFpuRegister); + DCHECK_EQ(kind, Location::kFpuRegister); scratch = LocationFrom(codegen_->GetGraph()->HasSIMD() ? vixl_temps_.AcquireVRegisterOfSize(kQRegSize) : vixl_temps_.AcquireD()); @@ -1743,9 +1743,9 @@ static bool CoherentConstantAndType(Location constant, Primitive::Type type) { (cst->IsDoubleConstant() && type == Primitive::kPrimDouble); } -// Allocate a scratch register from the VIXL pool, querying first into -// the floating-point register pool, and then the the core register -// pool. This is essentially a reimplementation of +// Allocate a scratch register from the VIXL pool, querying first +// the floating-point register pool, and then the core register +// pool. This is essentially a reimplementation of // vixl::aarch64::UseScratchRegisterScope::AcquireCPURegisterOfSize // using a different allocation strategy. static CPURegister AcquireFPOrCoreCPURegisterOfSize(vixl::aarch64::MacroAssembler* masm, @@ -1893,7 +1893,7 @@ void CodeGeneratorARM64::MoveLocation(Location destination, // ask for a scratch register of any type (core or FP). // // Also, we start by asking for a FP scratch register first, as the - // demand of scratch core registers is higher. This is why we + // demand of scratch core registers is higher. This is why we // use AcquireFPOrCoreCPURegisterOfSize instead of // UseScratchRegisterScope::AcquireCPURegisterOfSize, which // allocates core scratch registers first. @@ -2661,6 +2661,38 @@ void InstructionCodeGeneratorARM64::VisitIntermediateAddress(HIntermediateAddres Operand(InputOperandAt(instruction, 1))); } +void LocationsBuilderARM64::VisitIntermediateAddressIndex(HIntermediateAddressIndex* instruction) { + LocationSummary* locations = + new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall); + + HIntConstant* shift = instruction->GetShift()->AsIntConstant(); + + locations->SetInAt(0, Location::RequiresRegister()); + // For byte case we don't need to shift the index variable so we can encode the data offset into + // ADD instruction. For other cases we prefer the data_offset to be in register; that will hoist + // data offset constant generation out of the loop and reduce the critical path length in the + // loop. + locations->SetInAt(1, shift->GetValue() == 0 + ? 
Location::ConstantLocation(instruction->GetOffset()->AsIntConstant()) + : Location::RequiresRegister()); + locations->SetInAt(2, Location::ConstantLocation(shift)); + locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap); +} + +void InstructionCodeGeneratorARM64::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + Register index_reg = InputRegisterAt(instruction, 0); + uint32_t shift = Int64ConstantFrom(instruction->GetLocations()->InAt(2)); + uint32_t offset = instruction->GetOffset()->AsIntConstant()->GetValue(); + + if (shift == 0) { + __ Add(OutputRegister(instruction), index_reg, offset); + } else { + Register offset_reg = InputRegisterAt(instruction, 1); + __ Add(OutputRegister(instruction), offset_reg, Operand(index_reg, LSL, shift)); + } +} + void LocationsBuilderARM64::VisitMultiplyAccumulate(HMultiplyAccumulate* instr) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr, LocationSummary::kNoCall); @@ -6102,7 +6134,7 @@ void CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier(HInstruction* ins // not_gray_return_address: // // Original reference load. If the offset is too large to fit // // into LDR, we use an adjusted base register here. - // GcRoot<mirror::Object> reference = *(obj+offset); + // HeapReference<mirror::Object> reference = *(obj+offset); // gray_return_address: DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); @@ -6197,7 +6229,7 @@ void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* ins // not_gray_return_address: // // Original reference load. If the offset is too large to fit // // into LDR, we use an adjusted base register here. - // GcRoot<mirror::Object> reference = data[index]; + // HeapReference<mirror::Object> reference = data[index]; // gray_return_address: DCHECK(index.IsValid()); @@ -6571,14 +6603,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARM64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + vixl::aarch64::Literal<uint32_t>* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 502b298163..1759c68125 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -124,6 +124,10 @@ static inline void EmitPlaceholderBne(CodeGeneratorARMVIXL* codegen, vixl32::Lab __ bind(&placeholder_label); } +static inline bool CanEmitNarrowLdr(vixl32::Register rt, vixl32::Register rn, uint32_t offset) { + return rt.IsLow() && 
rn.IsLow() && offset < 32u; +} + class EmitAdrCode { public: EmitAdrCode(ArmVIXLMacroAssembler* assembler, vixl32::Register rd, vixl32::Label* label) @@ -1771,34 +1775,6 @@ static void GenerateVcmp(HInstruction* instruction, CodeGeneratorARMVIXL* codege } } -static int64_t AdjustConstantForCondition(int64_t value, - IfCondition* condition, - IfCondition* opposite) { - if (value == 1) { - if (*condition == kCondB) { - value = 0; - *condition = kCondEQ; - *opposite = kCondNE; - } else if (*condition == kCondAE) { - value = 0; - *condition = kCondNE; - *opposite = kCondEQ; - } - } else if (value == -1) { - if (*condition == kCondGT) { - value = 0; - *condition = kCondGE; - *opposite = kCondLT; - } else if (*condition == kCondLE) { - value = 0; - *condition = kCondLT; - *opposite = kCondGE; - } - } - - return value; -} - static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTestConstant( HCondition* condition, bool invert, @@ -1821,37 +1797,7 @@ static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTestConstant( const vixl32::Register left_high = HighRegisterFrom(left); const vixl32::Register left_low = LowRegisterFrom(left); - int64_t value = AdjustConstantForCondition(Int64ConstantFrom(right), &cond, &opposite); - UseScratchRegisterScope temps(codegen->GetVIXLAssembler()); - - // Comparisons against 0 are common enough to deserve special attention. - if (value == 0) { - switch (cond) { - case kCondNE: - // x > 0 iff x != 0 when the comparison is unsigned. - case kCondA: - ret = std::make_pair(ne, eq); - FALLTHROUGH_INTENDED; - case kCondEQ: - // x <= 0 iff x == 0 when the comparison is unsigned. - case kCondBE: - __ Orrs(temps.Acquire(), left_low, left_high); - return ret; - case kCondLT: - case kCondGE: - __ Cmp(left_high, 0); - return std::make_pair(ARMCondition(cond), ARMCondition(opposite)); - // Trivially true or false. 
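    // Concretely: for an unsigned comparison against zero, x < 0 can never hold and
    // x >= 0 always holds, so the condition pair returned below encodes a fixed outcome;
    // the self-compare of left_low merely sets the Z flag so that the eq/ne pair reads
    // as that constant result.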
- case kCondB: - ret = std::make_pair(ne, eq); - FALLTHROUGH_INTENDED; - case kCondAE: - __ Cmp(left_low, left_low); - return ret; - default: - break; - } - } + int64_t value = Int64ConstantFrom(right); switch (cond) { case kCondEQ: @@ -1896,6 +1842,8 @@ static std::pair<vixl32::Condition, vixl32::Condition> GenerateLongTestConstant( FALLTHROUGH_INTENDED; case kCondGE: case kCondLT: { + UseScratchRegisterScope temps(codegen->GetVIXLAssembler()); + __ Cmp(left_low, Low32Bits(value)); __ Sbcs(temps.Acquire(), left_high, High32Bits(value)); ret = std::make_pair(ARMCondition(cond), ARMCondition(opposite)); @@ -2013,22 +1961,18 @@ static std::pair<vixl32::Condition, vixl32::Condition> GenerateTest(HCondition* static bool CanGenerateTest(HCondition* condition, ArmVIXLAssembler* assembler) { if (condition->GetLeft()->GetType() == Primitive::kPrimLong) { const LocationSummary* const locations = condition->GetLocations(); + const IfCondition c = condition->GetCondition(); if (locations->InAt(1).IsConstant()) { - IfCondition c = condition->GetCondition(); - IfCondition opposite = condition->GetOppositeCondition(); - const int64_t value = - AdjustConstantForCondition(Int64ConstantFrom(locations->InAt(1)), &c, &opposite); + const int64_t value = Int64ConstantFrom(locations->InAt(1)); if (c < kCondLT || c > kCondGE) { // Since IT blocks longer than a 16-bit instruction are deprecated by ARMv8, // we check that the least significant half of the first input to be compared // is in a low register (the other half is read outside an IT block), and // the constant fits in an 8-bit unsigned integer, so that a 16-bit CMP - // encoding can be used; 0 is always handled, no matter what registers are - // used by the first input. - if (value != 0 && - (!LowRegisterFrom(locations->InAt(0)).IsLow() || !IsUint<8>(Low32Bits(value)))) { + // encoding can be used. + if (!LowRegisterFrom(locations->InAt(0)).IsLow() || !IsUint<8>(Low32Bits(value))) { return false; } // TODO(VIXL): The rest of the checks are there to keep the backend in sync with @@ -2047,353 +1991,6 @@ static bool CanGenerateTest(HCondition* condition, ArmVIXLAssembler* assembler) return true; } -static void GenerateConditionGeneric(HCondition* cond, CodeGeneratorARMVIXL* codegen) { - DCHECK(CanGenerateTest(cond, codegen->GetAssembler())); - - const vixl32::Register out = OutputRegister(cond); - const auto condition = GenerateTest(cond, false, codegen); - - __ Mov(LeaveFlags, out, 0); - - if (out.IsLow()) { - // We use the scope because of the IT block that follows. 
- ExactAssemblyScope guard(codegen->GetVIXLAssembler(), - 2 * vixl32::k16BitT32InstructionSizeInBytes, - CodeBufferCheckScope::kExactSize); - - __ it(condition.first); - __ mov(condition.first, out, 1); - } else { - vixl32::Label done_label; - vixl32::Label* const final_label = codegen->GetFinalLabel(cond, &done_label); - - __ B(condition.second, final_label, /* far_target */ false); - __ Mov(out, 1); - - if (done_label.IsReferenced()) { - __ Bind(&done_label); - } - } -} - -static void GenerateEqualLong(HCondition* cond, CodeGeneratorARMVIXL* codegen) { - DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong); - - const LocationSummary* const locations = cond->GetLocations(); - IfCondition condition = cond->GetCondition(); - const vixl32::Register out = OutputRegister(cond); - const Location left = locations->InAt(0); - const Location right = locations->InAt(1); - vixl32::Register left_high = HighRegisterFrom(left); - vixl32::Register left_low = LowRegisterFrom(left); - vixl32::Register temp; - UseScratchRegisterScope temps(codegen->GetVIXLAssembler()); - - if (right.IsConstant()) { - IfCondition opposite = cond->GetOppositeCondition(); - const int64_t value = AdjustConstantForCondition(Int64ConstantFrom(right), - &condition, - &opposite); - Operand right_high = High32Bits(value); - Operand right_low = Low32Bits(value); - - // The output uses Location::kNoOutputOverlap. - if (out.Is(left_high)) { - std::swap(left_low, left_high); - std::swap(right_low, right_high); - } - - __ Sub(out, left_low, right_low); - temp = temps.Acquire(); - __ Sub(temp, left_high, right_high); - } else { - DCHECK(right.IsRegisterPair()); - temp = temps.Acquire(); - __ Sub(temp, left_high, HighRegisterFrom(right)); - __ Sub(out, left_low, LowRegisterFrom(right)); - } - - // Need to check after calling AdjustConstantForCondition(). - DCHECK(condition == kCondEQ || condition == kCondNE) << condition; - - if (condition == kCondNE && out.IsLow()) { - __ Orrs(out, out, temp); - - // We use the scope because of the IT block that follows. - ExactAssemblyScope guard(codegen->GetVIXLAssembler(), - 2 * vixl32::k16BitT32InstructionSizeInBytes, - CodeBufferCheckScope::kExactSize); - - __ it(ne); - __ mov(ne, out, 1); - } else { - __ Orr(out, out, temp); - codegen->GenerateConditionWithZero(condition, out, out, temp); - } -} - -static void GenerateLongComparesAndJumps(HCondition* cond, - vixl32::Label* true_label, - vixl32::Label* false_label, - CodeGeneratorARMVIXL* codegen) { - LocationSummary* locations = cond->GetLocations(); - Location left = locations->InAt(0); - Location right = locations->InAt(1); - IfCondition if_cond = cond->GetCondition(); - - vixl32::Register left_high = HighRegisterFrom(left); - vixl32::Register left_low = LowRegisterFrom(left); - IfCondition true_high_cond = if_cond; - IfCondition false_high_cond = cond->GetOppositeCondition(); - vixl32::Condition final_condition = ARMUnsignedCondition(if_cond); // unsigned on lower part - - // Set the conditions for the test, remembering that == needs to be - // decided using the low words. - switch (if_cond) { - case kCondEQ: - case kCondNE: - // Nothing to do. 
- break; - case kCondLT: - false_high_cond = kCondGT; - break; - case kCondLE: - true_high_cond = kCondLT; - break; - case kCondGT: - false_high_cond = kCondLT; - break; - case kCondGE: - true_high_cond = kCondGT; - break; - case kCondB: - false_high_cond = kCondA; - break; - case kCondBE: - true_high_cond = kCondB; - break; - case kCondA: - false_high_cond = kCondB; - break; - case kCondAE: - true_high_cond = kCondA; - break; - } - if (right.IsConstant()) { - int64_t value = Int64ConstantFrom(right); - int32_t val_low = Low32Bits(value); - int32_t val_high = High32Bits(value); - - __ Cmp(left_high, val_high); - if (if_cond == kCondNE) { - __ B(ARMCondition(true_high_cond), true_label); - } else if (if_cond == kCondEQ) { - __ B(ARMCondition(false_high_cond), false_label); - } else { - __ B(ARMCondition(true_high_cond), true_label); - __ B(ARMCondition(false_high_cond), false_label); - } - // Must be equal high, so compare the lows. - __ Cmp(left_low, val_low); - } else { - vixl32::Register right_high = HighRegisterFrom(right); - vixl32::Register right_low = LowRegisterFrom(right); - - __ Cmp(left_high, right_high); - if (if_cond == kCondNE) { - __ B(ARMCondition(true_high_cond), true_label); - } else if (if_cond == kCondEQ) { - __ B(ARMCondition(false_high_cond), false_label); - } else { - __ B(ARMCondition(true_high_cond), true_label); - __ B(ARMCondition(false_high_cond), false_label); - } - // Must be equal high, so compare the lows. - __ Cmp(left_low, right_low); - } - // The last comparison might be unsigned. - // TODO: optimize cases where this is always true/false - __ B(final_condition, true_label); -} - -static void GenerateConditionLong(HCondition* cond, CodeGeneratorARMVIXL* codegen) { - DCHECK_EQ(cond->GetLeft()->GetType(), Primitive::kPrimLong); - - const LocationSummary* const locations = cond->GetLocations(); - IfCondition condition = cond->GetCondition(); - const vixl32::Register out = OutputRegister(cond); - const Location left = locations->InAt(0); - const Location right = locations->InAt(1); - - if (right.IsConstant()) { - IfCondition opposite = cond->GetOppositeCondition(); - - // Comparisons against 0 are common enough to deserve special attention. - if (AdjustConstantForCondition(Int64ConstantFrom(right), &condition, &opposite) == 0) { - switch (condition) { - case kCondNE: - case kCondA: - if (out.IsLow()) { - // We only care if both input registers are 0 or not. - __ Orrs(out, LowRegisterFrom(left), HighRegisterFrom(left)); - - // We use the scope because of the IT block that follows. - ExactAssemblyScope guard(codegen->GetVIXLAssembler(), - 2 * vixl32::k16BitT32InstructionSizeInBytes, - CodeBufferCheckScope::kExactSize); - - __ it(ne); - __ mov(ne, out, 1); - return; - } - - FALLTHROUGH_INTENDED; - case kCondEQ: - case kCondBE: - // We only care if both input registers are 0 or not. - __ Orr(out, LowRegisterFrom(left), HighRegisterFrom(left)); - codegen->GenerateConditionWithZero(condition, out, out); - return; - case kCondLT: - case kCondGE: - // We only care about the sign bit. - FALLTHROUGH_INTENDED; - case kCondAE: - case kCondB: - codegen->GenerateConditionWithZero(condition, out, HighRegisterFrom(left)); - return; - case kCondLE: - case kCondGT: - default: - break; - } - } - } - - if ((condition == kCondEQ || condition == kCondNE) && - // If `out` is a low register, then the GenerateConditionGeneric() - // function generates a shorter code sequence that is still branchless. 
- (!out.IsLow() || !CanGenerateTest(cond, codegen->GetAssembler()))) { - GenerateEqualLong(cond, codegen); - return; - } - - if (CanGenerateTest(cond, codegen->GetAssembler())) { - GenerateConditionGeneric(cond, codegen); - return; - } - - // Convert the jumps into the result. - vixl32::Label done_label; - vixl32::Label* const final_label = codegen->GetFinalLabel(cond, &done_label); - vixl32::Label true_label, false_label; - - GenerateLongComparesAndJumps(cond, &true_label, &false_label, codegen); - - // False case: result = 0. - __ Bind(&false_label); - __ Mov(out, 0); - __ B(final_label); - - // True case: result = 1. - __ Bind(&true_label); - __ Mov(out, 1); - - if (done_label.IsReferenced()) { - __ Bind(&done_label); - } -} - -static void GenerateConditionIntegralOrNonPrimitive(HCondition* cond, CodeGeneratorARMVIXL* codegen) { - const Primitive::Type type = cond->GetLeft()->GetType(); - - DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; - - if (type == Primitive::kPrimLong) { - GenerateConditionLong(cond, codegen); - return; - } - - IfCondition condition = cond->GetCondition(); - vixl32::Register in = InputRegisterAt(cond, 0); - const vixl32::Register out = OutputRegister(cond); - const Location right = cond->GetLocations()->InAt(1); - int64_t value; - - if (right.IsConstant()) { - IfCondition opposite = cond->GetOppositeCondition(); - - value = AdjustConstantForCondition(Int64ConstantFrom(right), &condition, &opposite); - - // Comparisons against 0 are common enough to deserve special attention. - if (value == 0) { - switch (condition) { - case kCondNE: - case kCondA: - if (out.IsLow() && out.Is(in)) { - __ Cmp(out, 0); - - // We use the scope because of the IT block that follows. - ExactAssemblyScope guard(codegen->GetVIXLAssembler(), - 2 * vixl32::k16BitT32InstructionSizeInBytes, - CodeBufferCheckScope::kExactSize); - - __ it(ne); - __ mov(ne, out, 1); - return; - } - - FALLTHROUGH_INTENDED; - case kCondEQ: - case kCondBE: - case kCondLT: - case kCondGE: - case kCondAE: - case kCondB: - codegen->GenerateConditionWithZero(condition, out, in); - return; - case kCondLE: - case kCondGT: - default: - break; - } - } - } - - if (condition == kCondEQ || condition == kCondNE) { - Operand operand(0); - - if (right.IsConstant()) { - operand = Operand::From(value); - } else if (out.Is(RegisterFrom(right))) { - // Avoid 32-bit instructions if possible. - operand = InputOperandAt(cond, 0); - in = RegisterFrom(right); - } else { - operand = InputOperandAt(cond, 1); - } - - if (condition == kCondNE && out.IsLow()) { - __ Subs(out, in, operand); - - // We use the scope because of the IT block that follows. 
- ExactAssemblyScope guard(codegen->GetVIXLAssembler(), - 2 * vixl32::k16BitT32InstructionSizeInBytes, - CodeBufferCheckScope::kExactSize); - - __ it(ne); - __ mov(ne, out, 1); - } else { - __ Sub(out, in, operand); - codegen->GenerateConditionWithZero(condition, out, out); - } - - return; - } - - GenerateConditionGeneric(cond, codegen); -} - static bool CanEncodeConstantAs8BitImmediate(HConstant* constant) { const Primitive::Type type = constant->GetType(); bool ret = false; @@ -2954,6 +2551,89 @@ void LocationsBuilderARMVIXL::VisitExit(HExit* exit) { void InstructionCodeGeneratorARMVIXL::VisitExit(HExit* exit ATTRIBUTE_UNUSED) { } +void InstructionCodeGeneratorARMVIXL::GenerateLongComparesAndJumps(HCondition* cond, + vixl32::Label* true_label, + vixl32::Label* false_label) { + LocationSummary* locations = cond->GetLocations(); + Location left = locations->InAt(0); + Location right = locations->InAt(1); + IfCondition if_cond = cond->GetCondition(); + + vixl32::Register left_high = HighRegisterFrom(left); + vixl32::Register left_low = LowRegisterFrom(left); + IfCondition true_high_cond = if_cond; + IfCondition false_high_cond = cond->GetOppositeCondition(); + vixl32::Condition final_condition = ARMUnsignedCondition(if_cond); // unsigned on lower part + + // Set the conditions for the test, remembering that == needs to be + // decided using the low words. + switch (if_cond) { + case kCondEQ: + case kCondNE: + // Nothing to do. + break; + case kCondLT: + false_high_cond = kCondGT; + break; + case kCondLE: + true_high_cond = kCondLT; + break; + case kCondGT: + false_high_cond = kCondLT; + break; + case kCondGE: + true_high_cond = kCondGT; + break; + case kCondB: + false_high_cond = kCondA; + break; + case kCondBE: + true_high_cond = kCondB; + break; + case kCondA: + false_high_cond = kCondB; + break; + case kCondAE: + true_high_cond = kCondA; + break; + } + if (right.IsConstant()) { + int64_t value = Int64ConstantFrom(right); + int32_t val_low = Low32Bits(value); + int32_t val_high = High32Bits(value); + + __ Cmp(left_high, val_high); + if (if_cond == kCondNE) { + __ B(ARMCondition(true_high_cond), true_label); + } else if (if_cond == kCondEQ) { + __ B(ARMCondition(false_high_cond), false_label); + } else { + __ B(ARMCondition(true_high_cond), true_label); + __ B(ARMCondition(false_high_cond), false_label); + } + // Must be equal high, so compare the lows. + __ Cmp(left_low, val_low); + } else { + vixl32::Register right_high = HighRegisterFrom(right); + vixl32::Register right_low = LowRegisterFrom(right); + + __ Cmp(left_high, right_high); + if (if_cond == kCondNE) { + __ B(ARMCondition(true_high_cond), true_label); + } else if (if_cond == kCondEQ) { + __ B(ARMCondition(false_high_cond), false_label); + } else { + __ B(ARMCondition(true_high_cond), true_label); + __ B(ARMCondition(false_high_cond), false_label); + } + // Must be equal high, so compare the lows. + __ Cmp(left_low, right_low); + } + // The last comparison might be unsigned. + // TODO: optimize cases where this is always true/false + __ B(final_condition, true_label); +} + void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* condition, vixl32::Label* true_target_in, vixl32::Label* false_target_in) { @@ -2988,7 +2668,7 @@ void InstructionCodeGeneratorARMVIXL::GenerateCompareTestAndBranch(HCondition* c vixl32::Label* false_target = (false_target_in == nullptr) ? 
&fallthrough : false_target_in; DCHECK_EQ(condition->InputAt(0)->GetType(), Primitive::kPrimLong); - GenerateLongComparesAndJumps(condition, true_target, false_target, codegen_); + GenerateLongComparesAndJumps(condition, true_target, false_target); if (false_target != &fallthrough) { __ B(false_target); @@ -3299,83 +2979,6 @@ void CodeGeneratorARMVIXL::GenerateNop() { __ Nop(); } -// `temp` is an extra temporary register that is used for some conditions; -// callers may not specify it, in which case the method will use a scratch -// register instead. -void CodeGeneratorARMVIXL::GenerateConditionWithZero(IfCondition condition, - vixl32::Register out, - vixl32::Register in, - vixl32::Register temp) { - switch (condition) { - case kCondEQ: - // x <= 0 iff x == 0 when the comparison is unsigned. - case kCondBE: - if (!temp.IsValid() || (out.IsLow() && !out.Is(in))) { - temp = out; - } - - // Avoid 32-bit instructions if possible; note that `in` and `temp` must be - // different as well. - if (in.IsLow() && temp.IsLow() && !in.Is(temp)) { - // temp = - in; only 0 sets the carry flag. - __ Rsbs(temp, in, 0); - - if (out.Is(in)) { - std::swap(in, temp); - } - - // out = - in + in + carry = carry - __ Adc(out, temp, in); - } else { - // If `in` is 0, then it has 32 leading zeros, and less than that otherwise. - __ Clz(out, in); - // Any number less than 32 logically shifted right by 5 bits results in 0; - // the same operation on 32 yields 1. - __ Lsr(out, out, 5); - } - - break; - case kCondNE: - // x > 0 iff x != 0 when the comparison is unsigned. - case kCondA: { - UseScratchRegisterScope temps(GetVIXLAssembler()); - - if (out.Is(in)) { - if (!temp.IsValid() || in.Is(temp)) { - temp = temps.Acquire(); - } - } else if (!temp.IsValid() || !temp.IsLow()) { - temp = out; - } - - // temp = in - 1; only 0 does not set the carry flag. - __ Subs(temp, in, 1); - // out = in + ~temp + carry = in + (-(in - 1) - 1) + carry = in - in + 1 - 1 + carry = carry - __ Sbc(out, in, temp); - break; - } - case kCondGE: - __ Mvn(out, in); - in = out; - FALLTHROUGH_INTENDED; - case kCondLT: - // We only care about the sign bit. - __ Lsr(out, in, 31); - break; - case kCondAE: - // Trivially true. - __ Mov(out, 1); - break; - case kCondB: - // Trivially false. - __ Mov(out, 0); - break; - default: - LOG(FATAL) << "Unexpected condition " << condition; - UNREACHABLE(); - } -} - void LocationsBuilderARMVIXL::HandleCondition(HCondition* cond) { LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cond, LocationSummary::kNoCall); @@ -3412,41 +3015,52 @@ void InstructionCodeGeneratorARMVIXL::HandleCondition(HCondition* cond) { return; } - const Primitive::Type type = cond->GetLeft()->GetType(); + const vixl32::Register out = OutputRegister(cond); + + if (out.IsLow() && CanGenerateTest(cond, codegen_->GetAssembler())) { + const auto condition = GenerateTest(cond, false, codegen_); + // We use the scope because of the IT block that follows. 
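    // The scope below reserves exactly four 16-bit T32 instructions (it / mov / it / mov,
    // 8 bytes in total); with CodeBufferCheckScope::kExactSize VIXL checks that exactly
    // that amount of code is emitted, so each conditional mov stays immediately after
    // its it instruction.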
+ ExactAssemblyScope guard(GetVIXLAssembler(), + 4 * vixl32::k16BitT32InstructionSizeInBytes, + CodeBufferCheckScope::kExactSize); - if (Primitive::IsFloatingPointType(type)) { - GenerateConditionGeneric(cond, codegen_); + __ it(condition.first); + __ mov(condition.first, out, 1); + __ it(condition.second); + __ mov(condition.second, out, 0); return; } - DCHECK(Primitive::IsIntegralType(type) || type == Primitive::kPrimNot) << type; - - if (type == Primitive::kPrimBoolean) { - const IfCondition c = cond->GetCondition(); - vixl32::Register left = InputRegisterAt(cond, 0); - const vixl32::Register out = OutputRegister(cond); - const Location right_loc = cond->GetLocations()->InAt(1); + // Convert the jumps into the result. + vixl32::Label done_label; + vixl32::Label* const final_label = codegen_->GetFinalLabel(cond, &done_label); - // All other cases are handled by the instruction simplifier. - DCHECK((c == kCondEQ || c == kCondNE) && !right_loc.IsConstant()); + if (cond->InputAt(0)->GetType() == Primitive::kPrimLong) { + vixl32::Label true_label, false_label; - vixl32::Register right = RegisterFrom(right_loc); + GenerateLongComparesAndJumps(cond, &true_label, &false_label); - // Avoid 32-bit instructions if possible. - if (out.Is(right)) { - std::swap(left, right); - } + // False case: result = 0. + __ Bind(&false_label); + __ Mov(out, 0); + __ B(final_label); - __ Eor(out, left, right); + // True case: result = 1. + __ Bind(&true_label); + __ Mov(out, 1); + } else { + DCHECK(CanGenerateTest(cond, codegen_->GetAssembler())); - if (c == kCondEQ) { - __ Eor(out, out, 1); - } + const auto condition = GenerateTest(cond, false, codegen_); - return; + __ Mov(LeaveFlags, out, 0); + __ B(condition.second, final_label, /* far_target */ false); + __ Mov(out, 1); } - GenerateConditionIntegralOrNonPrimitive(cond, codegen_); + if (done_label.IsReferenced()) { + __ Bind(&done_label); + } } void LocationsBuilderARMVIXL::VisitEqual(HEqual* comp) { @@ -6833,6 +6447,16 @@ void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddress(HIntermediateAddr } } +void LocationsBuilderARMVIXL::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitIntermediateAddressIndex( + HIntermediateAddressIndex* instruction) { + LOG(FATAL) << "Unreachable " << instruction->GetId(); +} + void LocationsBuilderARMVIXL::VisitBoundsCheck(HBoundsCheck* instruction) { RegisterSet caller_saves = RegisterSet::Empty(); InvokeRuntimeCallingConventionARMVIXL calling_convention; @@ -8557,8 +8181,9 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( UseScratchRegisterScope temps(GetVIXLAssembler()); ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction); - uint32_t custom_data = - linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg.GetCode()); + bool narrow = CanEmitNarrowLdr(root_reg, obj, offset); + uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData( + root_reg.GetCode(), narrow); vixl32::Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data); // entrypoint_reg = @@ -8573,15 +8198,16 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( vixl32::Label return_address; EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address); __ cmp(kBakerCcEntrypointRegister, Operand(0)); - static_assert( - BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8, - "GC root LDR must be 2 32-bit instructions (8B) before the return address 
label."); // Currently the offset is always within range. If that changes, // we shall have to split the load the same way as for fields. DCHECK_LT(offset, kReferenceLoadMinFarOffset); - __ ldr(EncodingSize(Wide), root_reg, MemOperand(obj, offset)); + ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset(); + __ ldr(EncodingSize(narrow ? Narrow : Wide), root_reg, MemOperand(obj, offset)); EmitPlaceholderBne(codegen_, bne_label); __ Bind(&return_address); + DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(), + narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET + : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET); } else { // Note that we do not actually check the value of // `GetIsGcMarking()` to decide whether to mark the loaded GC @@ -8682,10 +8308,12 @@ void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* i // not_gray_return_address: // // Original reference load. If the offset is too large to fit // // into LDR, we use an adjusted base register here. - // GcRoot<mirror::Object> reference = *(obj+offset); + // HeapReference<mirror::Object> reference = *(obj+offset); // gray_return_address: DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>)); + vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot); + bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset); vixl32::Register base = obj; if (offset >= kReferenceLoadMinFarOffset) { base = RegisterFrom(temp); @@ -8693,12 +8321,15 @@ void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* i static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2."); __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u))); offset &= (kReferenceLoadMinFarOffset - 1u); + // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large + // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely + // increase the overall code size when taking the generated thunks into account. + DCHECK(!narrow); } UseScratchRegisterScope temps(GetVIXLAssembler()); ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction); uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData( - base.GetCode(), - obj.GetCode()); + base.GetCode(), obj.GetCode(), narrow); vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data); // entrypoint_reg = @@ -8715,19 +8346,24 @@ void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* i EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address); __ cmp(kBakerCcEntrypointRegister, Operand(0)); EmitPlaceholderBne(this, bne_label); - static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), - "Field LDR must be 1 32-bit instruction (4B) before the return address label; " - " 2 32-bit instructions (8B) for heap poisoning."); - vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot); - __ ldr(EncodingSize(Wide), ref_reg, MemOperand(base, offset)); + ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset(); + __ ldr(EncodingSize(narrow ? Narrow : Wide), ref_reg, MemOperand(base, offset)); if (needs_null_check) { MaybeRecordImplicitNullCheck(instruction); } - // Note: We need a Wide NEG for the unpoisoning. + // Note: We need a specific width for the unpoisoning NEG. 
if (kPoisonHeapReferences) { - __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0)); + if (narrow) { + // The only 16-bit encoding is T1 which sets flags outside IT block (i.e. RSBS, not RSB). + __ rsbs(EncodingSize(Narrow), ref_reg, ref_reg, Operand(0)); + } else { + __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0)); + } } __ Bind(&return_address); + DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(), + narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET + : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET); return; } @@ -8773,7 +8409,7 @@ void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(HInstruction* i // not_gray_return_address: // // Original reference load. If the offset is too large to fit // // into LDR, we use an adjusted base register here. - // GcRoot<mirror::Object> reference = data[index]; + // HeapReference<mirror::Object> reference = data[index]; // gray_return_address: DCHECK(index.IsValid()); @@ -8803,9 +8439,7 @@ void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(HInstruction* i EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address); __ cmp(kBakerCcEntrypointRegister, Operand(0)); EmitPlaceholderBne(this, bne_label); - static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4), - "Array LDR must be 1 32-bit instruction (4B) before the return address label; " - " 2 32-bit instructions (8B) for heap poisoning."); + ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset(); __ ldr(ref_reg, MemOperand(data_reg, index_reg, vixl32::LSL, scale_factor)); DCHECK(!needs_null_check); // The thunk cannot handle the null check. // Note: We need a Wide NEG for the unpoisoning. @@ -8813,6 +8447,8 @@ void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(HInstruction* i __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0)); } __ Bind(&return_address); + DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(), + BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET); return; } @@ -9625,14 +9261,20 @@ static void PatchJitRootUse(uint8_t* code, void CodeGeneratorARMVIXL::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + VIXLUInt32Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + VIXLUInt32Literal* table_entry_literal = entry.second; + const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h index afff72fb52..657d3c134f 100644 --- a/compiler/optimizing/code_generator_arm_vixl.h +++ b/compiler/optimizing/code_generator_arm_vixl.h @@ -401,6 +401,9 @@ class InstructionCodeGeneratorARMVIXL : public InstructionCodeGenerator { void 
GenerateCompareTestAndBranch(HCondition* condition, vixl::aarch32::Label* true_target, vixl::aarch32::Label* false_target); + void GenerateLongComparesAndJumps(HCondition* cond, + vixl::aarch32::Label* true_label, + vixl::aarch32::Label* false_label); void DivRemOneOrMinusOne(HBinaryOperation* instruction); void DivRemByPowerOfTwo(HBinaryOperation* instruction); void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction); @@ -717,14 +720,6 @@ class CodeGeneratorARMVIXL : public CodeGenerator { void EmitMovwMovtPlaceholder(CodeGeneratorARMVIXL::PcRelativePatchInfo* labels, vixl::aarch32::Register out); - // `temp` is an extra temporary register that is used for some conditions; - // callers may not specify it, in which case the method will use a scratch - // register instead. - void GenerateConditionWithZero(IfCondition condition, - vixl::aarch32::Register out, - vixl::aarch32::Register in, - vixl::aarch32::Register temp = vixl32::Register()); - private: vixl::aarch32::Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, vixl::aarch32::Register temp); diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc index e9870acff4..fdfa4eedf8 100644 --- a/compiler/optimizing/code_generator_mips.cc +++ b/compiler/optimizing/code_generator_mips.cc @@ -1780,16 +1780,18 @@ void CodeGeneratorMIPS::PatchJitRootUse(uint8_t* code, void CodeGeneratorMIPS::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const JitPatchInfo& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find(StringReference(&info.target_dex_file, - dex::StringIndex(info.index))); + const auto it = jit_string_roots_.find(StringReference(&info.target_dex_file, + dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const JitPatchInfo& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find(TypeReference(&info.target_dex_file, - dex::TypeIndex(info.index))); + const auto it = jit_class_roots_.find(TypeReference(&info.target_dex_file, + dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc index f04e3841f5..d3ae3a729b 100644 --- a/compiler/optimizing/code_generator_mips64.cc +++ b/compiler/optimizing/code_generator_mips64.cc @@ -1586,14 +1586,20 @@ void CodeGeneratorMIPS64::PatchJitRootUse(uint8_t* code, void CodeGeneratorMIPS64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const auto& entry : jit_string_patches_) { - const auto& it = jit_string_roots_.find(entry.first); + const StringReference& string_reference = entry.first; + Literal* table_entry_literal = entry.second; + const auto it = jit_string_roots_.find(string_reference); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } for (const auto& entry : jit_class_patches_) { - const auto& it = jit_class_roots_.find(entry.first); + const TypeReference& type_reference = entry.first; + Literal* table_entry_literal = entry.second; + 
const auto it = jit_class_roots_.find(type_reference); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, entry.second, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, table_entry_literal, index_in_table); } } diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc index 57f7e6b25c..478bd24388 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -783,6 +783,12 @@ MemOperand InstructionCodeGeneratorARM64::VecAddress( /*out*/ Register* scratch) { LocationSummary* locations = instruction->GetLocations(); Register base = InputRegisterAt(instruction, 0); + + if (instruction->InputAt(1)->IsIntermediateAddressIndex()) { + DCHECK(!is_string_char_at); + return MemOperand(base.X(), InputRegisterAt(instruction, 1).X()); + } + Location index = locations->InAt(1); uint32_t offset = is_string_char_at ? mirror::String::ValueOffset().Uint32Value() diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index cf2d5cbee3..bd9a5d2564 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -7703,7 +7703,7 @@ void CodeGeneratorX86::Finalize(CodeAllocator* allocator) { constant_area_start_ = assembler->CodeSize(); // Populate any jump tables. - for (auto jump_table : fixups_to_jump_tables_) { + for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) { jump_table->CreateJumpTable(); } @@ -7842,17 +7842,19 @@ void CodeGeneratorX86::PatchJitRootUse(uint8_t* code, void CodeGeneratorX86::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const PatchInfo<Label>& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find( + const auto it = jit_string_roots_.find( StringReference(&info.dex_file, dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const PatchInfo<Label>& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find( + const auto it = jit_class_roots_.find( TypeReference(&info.dex_file, dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index f2ed52b5a5..6b0e001ad8 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -7055,7 +7055,7 @@ void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) { constant_area_start_ = assembler->CodeSize(); // Populate any jump tables. 
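  // (Presumably the same motivation as the x86 hunk above: spelling out JumpTableRIPFixup*
  // instead of auto makes the element type of fixups_to_jump_tables_ visible right where
  // the tables are materialized, just after constant_area_start_ is recorded.)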
- for (auto jump_table : fixups_to_jump_tables_) { + for (JumpTableRIPFixup* jump_table : fixups_to_jump_tables_) { jump_table->CreateJumpTable(); } @@ -7149,17 +7149,19 @@ void CodeGeneratorX86_64::PatchJitRootUse(uint8_t* code, void CodeGeneratorX86_64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) { for (const PatchInfo<Label>& info : jit_string_patches_) { - const auto& it = jit_string_roots_.find( + const auto it = jit_string_roots_.find( StringReference(&info.dex_file, dex::StringIndex(info.index))); DCHECK(it != jit_string_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } for (const PatchInfo<Label>& info : jit_class_patches_) { - const auto& it = jit_class_roots_.find( + const auto it = jit_class_roots_.find( TypeReference(&info.dex_file, dex::TypeIndex(info.index))); DCHECK(it != jit_class_roots_.end()); - PatchJitRootUse(code, roots_data, info, it->second); + uint64_t index_in_table = it->second; + PatchJitRootUse(code, roots_data, info, index_in_table); } } diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index 4ba5c5580f..fe25b7690d 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -64,7 +64,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { #endif }; - for (auto test_config : test_config_candidates) { + for (const CodegenTargetConfig& test_config : test_config_candidates) { if (CanExecute(test_config.GetInstructionSet())) { v.push_back(test_config); } @@ -76,7 +76,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { static void TestCode(const uint16_t* data, bool has_result = false, int32_t expected = 0) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { + for (const CodegenTargetConfig& target_config : GetTargetConfigs()) { ArenaPool pool; ArenaAllocator arena(&pool); HGraph* graph = CreateCFG(&arena, data); @@ -89,7 +89,7 @@ static void TestCode(const uint16_t* data, static void TestCodeLong(const uint16_t* data, bool has_result, int64_t expected) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { + for (const CodegenTargetConfig& target_config : GetTargetConfigs()) { ArenaPool pool; ArenaAllocator arena(&pool); HGraph* graph = CreateCFG(&arena, data, Primitive::kPrimLong); @@ -754,7 +754,28 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverB34760542) { // // Assertion failed (!available->IsEmpty()) // - // in vixl::aarch64::UseScratchRegisterScope::AcquireNextAvailable. + // in vixl::aarch64::UseScratchRegisterScope::AcquireNextAvailable, + // because of the following situation: + // + // 1. a temp register (IP0) is allocated as a scratch register by + // the parallel move resolver to solve a cycle (swap): + // + // [ source=DS0 destination=DS257 type=PrimDouble instruction=null ] + // [ source=DS257 destination=DS0 type=PrimDouble instruction=null ] + // + // 2. within CodeGeneratorARM64::MoveLocation, another temp + // register (IP1) is allocated to generate the swap between two + // double stack slots; + // + // 3. VIXL requires a third temp register to emit the `Ldr` or + // `Str` operation from CodeGeneratorARM64::MoveLocation (as + // one of the stack slots' offsets cannot be encoded as an + // immediate), but the pool of (core) temp registers is now + // empty. 
+ // + // The solution used so far is to use a floating-point temp register + // (D31) in step #2, so that IP1 is available for step #3. + HParallelMove* move = new (graph->GetArena()) HParallelMove(graph->GetArena()); move->AddMove(Location::DoubleStackSlot(0), Location::DoubleStackSlot(257), @@ -807,7 +828,6 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) { InternalCodeAllocator code_allocator; codegen.Finalize(&code_allocator); } - #endif #ifdef ART_ENABLE_CODEGEN_mips diff --git a/compiler/optimizing/codegen_test_utils.h b/compiler/optimizing/codegen_test_utils.h index 31cd204c9f..00a16fe849 100644 --- a/compiler/optimizing/codegen_test_utils.h +++ b/compiler/optimizing/codegen_test_utils.h @@ -243,7 +243,7 @@ static void ValidateGraph(HGraph* graph) { GraphChecker graph_checker(graph); graph_checker.Run(); if (!graph_checker.IsValid()) { - for (const auto& error : graph_checker.GetErrors()) { + for (const std::string& error : graph_checker.GetErrors()) { std::cout << error << std::endl; } } diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc index c93bc210be..8ea312d0ea 100644 --- a/compiler/optimizing/gvn.cc +++ b/compiler/optimizing/gvn.cc @@ -516,13 +516,13 @@ void GlobalValueNumberer::VisitBasicBlock(HBasicBlock* block) { bool GlobalValueNumberer::WillBeReferencedAgain(HBasicBlock* block) const { DCHECK(visited_blocks_.IsBitSet(block->GetBlockId())); - for (auto dominated_block : block->GetDominatedBlocks()) { + for (const HBasicBlock* dominated_block : block->GetDominatedBlocks()) { if (!visited_blocks_.IsBitSet(dominated_block->GetBlockId())) { return true; } } - for (auto successor : block->GetSuccessors()) { + for (const HBasicBlock* successor : block->GetSuccessors()) { if (!visited_blocks_.IsBitSet(successor->GetBlockId())) { return true; } diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index f16e3727c8..311be1fb49 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -216,5 +216,18 @@ void InstructionSimplifierArm64Visitor::VisitVecMul(HVecMul* instruction) { } } +void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) { + if (!instruction->IsStringCharAt() + && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + RecordSimplification(); + } +} + +void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) { + if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + RecordSimplification(); + } +} + } // namespace arm64 } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h index eec4e49792..8596f6ad40 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.h +++ b/compiler/optimizing/instruction_simplifier_arm64.h @@ -75,6 +75,8 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { void VisitUShr(HUShr* instruction) OVERRIDE; void VisitXor(HXor* instruction) OVERRIDE; void VisitVecMul(HVecMul* instruction) OVERRIDE; + void VisitVecLoad(HVecLoad* instruction) OVERRIDE; + void VisitVecStore(HVecStore* instruction) OVERRIDE; OptimizingCompilerStats* stats_; }; diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc index c39e5f4d3b..e5a8499ff4 100644 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ b/compiler/optimizing/instruction_simplifier_shared.cc @@ 
-16,6 +16,8 @@ #include "instruction_simplifier_shared.h" +#include "mirror/array-inl.h" + namespace art { namespace { @@ -346,4 +348,59 @@ bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa) { return false; } +bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index) { + if (index->IsConstant()) { + // If the index is constant, the whole address calculation can often be done by the LDR/STR instructions themselves. + // TODO: Handle constants that cannot be embedded directly in the addressing mode. + return false; + } + + HGraph* graph = access->GetBlock()->GetGraph(); + ArenaAllocator* arena = graph->GetArena(); + Primitive::Type packed_type = access->GetPackedType(); + uint32_t data_offset = mirror::Array::DataOffset( + Primitive::ComponentSize(packed_type)).Uint32Value(); + size_t component_shift = Primitive::ComponentSizeShift(packed_type); + + bool is_extracting_beneficial = false; + // Extracting the intermediate address is beneficial only if the index has at least 2 users of this kind. + for (const HUseListNode<HInstruction*>& use : index->GetUses()) { + HInstruction* user = use.GetUser(); + if (user->IsVecMemoryOperation() && user != access) { + HVecMemoryOperation* another_access = user->AsVecMemoryOperation(); + Primitive::Type another_packed_type = another_access->GetPackedType(); + uint32_t another_data_offset = mirror::Array::DataOffset( + Primitive::ComponentSize(another_packed_type)).Uint32Value(); + size_t another_component_shift = Primitive::ComponentSizeShift(another_packed_type); + if (another_data_offset == data_offset && another_component_shift == component_shift) { + is_extracting_beneficial = true; + break; + } + } else if (user->IsIntermediateAddressIndex()) { + HIntermediateAddressIndex* another_access = user->AsIntermediateAddressIndex(); + uint32_t another_data_offset = another_access->GetOffset()->AsIntConstant()->GetValue(); + size_t another_component_shift = another_access->GetShift()->AsIntConstant()->GetValue(); + if (another_data_offset == data_offset && another_component_shift == component_shift) { + is_extracting_beneficial = true; + break; + } + } + } + + if (!is_extracting_beneficial) { + return false; + } + + // Proceed to extract the index + data_offset address computation. + HIntConstant* offset = graph->GetIntConstant(data_offset); + HIntConstant* shift = graph->GetIntConstant(component_shift); + HIntermediateAddressIndex* address = + new (arena) HIntermediateAddressIndex(index, offset, shift, kNoDexPc); + + access->GetBlock()->InsertInstructionBefore(address, access); + access->ReplaceInput(address, 1); + + return true; +} + } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h index 2ea103a518..371619fa2e 100644 --- a/compiler/optimizing/instruction_simplifier_shared.h +++ b/compiler/optimizing/instruction_simplifier_shared.h @@ -59,6 +59,7 @@ bool TryExtractArrayAccessAddress(HInstruction* access, size_t data_offset); bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa); +bool TryExtractVecArrayAccessAddress(HVecMemoryOperation* access, HInstruction* index); } // namespace art diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index fc7d20c793..69cf9a126f 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -2598,7 +2598,11 @@ void IntrinsicCodeGeneratorARM::VisitFloatIsInfinite(HInvoke* invoke) { // We don't care about the sign bit, so shift left.
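// Worked example of the bit trick implemented by the Lsl/Eor/Clz/Lsr sequence below,
// assuming `out` starts as the raw float bits and `infinity` holds the positive-infinity
// pattern 0x7f800000 already shifted left by one (0xff000000):
//   +Inf: 0x7f800000 << 1 = 0xff000000, XOR 0xff000000 = 0,          CLZ = 32, LSR #5 = 1
//   -Inf: 0xff800000 << 1 = 0xff000000, XOR 0xff000000 = 0,          CLZ = 32, LSR #5 = 1
//   NaN:  0x7fc00000 << 1 = 0xff800000, XOR 0xff000000 = 0x00800000, CLZ = 8,  LSR #5 = 0
// Only +/-Inf leave exactly 32 leading zeros, so the final logical shift right by 5 yields the boolean result.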
__ Lsl(out, out, 1); __ eor(out, out, ShifterOperand(infinity)); - codegen_->GenerateConditionWithZero(kCondEQ, out, out); + // If the result is 0, then it has 32 leading zeros, and less than that otherwise. + __ clz(out, out); + // Any number less than 32 logically shifted right by 5 bits results in 0; + // the same operation on 32 yields 1. + __ Lsr(out, out, 5); } void IntrinsicLocationsBuilderARM::VisitDoubleIsInfinite(HInvoke* invoke) { @@ -2621,7 +2625,11 @@ void IntrinsicCodeGeneratorARM::VisitDoubleIsInfinite(HInvoke* invoke) { __ eor(out, out, ShifterOperand(infinity_high2)); // We don't care about the sign bit, so shift left. __ orr(out, IP, ShifterOperand(out, LSL, 1)); - codegen_->GenerateConditionWithZero(kCondEQ, out, out); + // If the result is 0, then it has 32 leading zeros, and less than that otherwise. + __ clz(out, out); + // Any number less than 32 logically shifted right by 5 bits results in 0; + // the same operation on 32 yields 1. + __ Lsr(out, out, 5); } void IntrinsicLocationsBuilderARM::VisitReferenceGetReferent(HInvoke* invoke) { diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index 56d06eb666..356d5bcb0c 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -2971,7 +2971,11 @@ void IntrinsicCodeGeneratorARMVIXL::VisitFloatIsInfinite(HInvoke* invoke) { // We don't care about the sign bit, so shift left. __ Lsl(out, out, 1); __ Eor(out, out, infinity); - codegen_->GenerateConditionWithZero(kCondEQ, out, out); + // If the result is 0, then it has 32 leading zeros, and less than that otherwise. + __ Clz(out, out); + // Any number less than 32 logically shifted right by 5 bits results in 0; + // the same operation on 32 yields 1. + __ Lsr(out, out, 5); } void IntrinsicLocationsBuilderARMVIXL::VisitDoubleIsInfinite(HInvoke* invoke) { @@ -2997,7 +3001,11 @@ void IntrinsicCodeGeneratorARMVIXL::VisitDoubleIsInfinite(HInvoke* invoke) { __ Eor(out, out, infinity_high2); // We don't care about the sign bit, so shift left. __ Orr(out, temp, Operand(out, vixl32::LSL, 1)); - codegen_->GenerateConditionWithZero(kCondEQ, out, out); + // If the result is 0, then it has 32 leading zeros, and less than that otherwise. + __ Clz(out, out); + // Any number less than 32 logically shifted right by 5 bits results in 0; + // the same operation on 32 yields 1. + __ Lsr(out, out, 5); } void IntrinsicLocationsBuilderARMVIXL::VisitReferenceGetReferent(HInvoke* invoke) { diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index 8ed2ad86bf..af0b193b03 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -759,7 +759,7 @@ static void CreateFPToFPCallLocations(ArenaAllocator* arena, // We have to ensure that the native code doesn't clobber the XMM registers which are // non-volatile for ART, but volatile for Native calls. This will ensure that they are // saved in the prologue and properly restored. - for (auto fp_reg : non_volatile_xmm_regs) { + for (FloatRegister fp_reg : non_volatile_xmm_regs) { locations->AddTemp(Location::FpuRegisterLocation(fp_reg)); } } @@ -898,7 +898,7 @@ static void CreateFPFPToFPCallLocations(ArenaAllocator* arena, // We have to ensure that the native code doesn't clobber the XMM registers which are // non-volatile for ART, but volatile for Native calls. This will ensure that they are // saved in the prologue and properly restored. 
- for (auto fp_reg : non_volatile_xmm_regs) { + for (FloatRegister fp_reg : non_volatile_xmm_regs) { locations->AddTemp(Location::FpuRegisterLocation(fp_reg)); } } diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index b4da20b558..522962485b 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1406,7 +1406,8 @@ class HLoopInformationOutwardIterator : public ValueObject { M(BitwiseNegatedRight, Instruction) \ M(DataProcWithShifterOp, Instruction) \ M(MultiplyAccumulate, Instruction) \ - M(IntermediateAddress, Instruction) + M(IntermediateAddress, Instruction) \ + M(IntermediateAddressIndex, Instruction) #endif #ifndef ART_ENABLE_CODEGEN_arm diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h index c6bfbcc7fb..075a816f3f 100644 --- a/compiler/optimizing/nodes_shared.h +++ b/compiler/optimizing/nodes_shared.h @@ -150,6 +150,49 @@ class HIntermediateAddress FINAL : public HExpression<2> { DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress); }; +// This instruction computes part of the array access offset (data offset and scaled index). +// +// For array accesses, the element address has the following structure: +// Address = CONST_OFFSET + base_addr + index << ELEM_SHIFT. Taking LDR/STR addressing +// modes into account, the (CONST_OFFSET + index << ELEM_SHIFT) part of the address can be shared +// across array accesses with the same data type and index. For example, in the following loop +// 5 accesses can share the address computation: +// +// void foo(int[] a, int[] b, int[] c) { +// for (i...) { +// a[i] = a[i] + 5; +// b[i] = b[i] + c[i]; +// } +// } +// +// Note: as the instruction does not include the base array address in its computation, it has +// no side effects (in contrast to HIntermediateAddress).
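A minimal sketch of the arithmetic being shared, with assumed values (a data offset of 12 and a component shift of 2 for an int[]); the helper name and constants are illustrative, not taken from this change:

#include <cstdint>

// Shared part: data_offset + (index << component_shift) depends only on the index,
// so a[i], b[i] and c[i] above can all reuse it once it has been computed.
static inline uintptr_t ElementAddress(uintptr_t base_addr, uint32_t index) {
  const uintptr_t data_offset = 12;    // assumed payload offset of an int[] object
  const unsigned component_shift = 2;  // log2(sizeof(int32_t))
  uintptr_t intermediate = data_offset + (static_cast<uintptr_t>(index) << component_shift);
  // Per-array part: a single add that an LDR/STR register-offset addressing mode can absorb.
  return base_addr + intermediate;
}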
+class HIntermediateAddressIndex FINAL : public HExpression<3> { + public: + HIntermediateAddressIndex( + HInstruction* index, HInstruction* offset, HInstruction* shift, uint32_t dex_pc) + : HExpression(Primitive::kPrimInt, SideEffects::None(), dex_pc) { + SetRawInputAt(0, index); + SetRawInputAt(1, offset); + SetRawInputAt(2, shift); + } + + bool CanBeMoved() const OVERRIDE { return true; } + bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE { + return true; + } + bool IsActualObject() const OVERRIDE { return false; } + + HInstruction* GetIndex() const { return InputAt(0); } + HInstruction* GetOffset() const { return InputAt(1); } + HInstruction* GetShift() const { return InputAt(2); } + + DECLARE_INSTRUCTION(IntermediateAddressIndex); + + private: + DISALLOW_COPY_AND_ASSIGN(HIntermediateAddressIndex); +}; + class HDataProcWithShifterOp FINAL : public HExpression<2> { public: enum OpKind { diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 52c247b52f..92fe9bfa7d 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -178,12 +178,17 @@ class HVecMemoryOperation : public HVecOperation { size_t vector_length, uint32_t dex_pc) : HVecOperation(arena, packed_type, side_effects, number_of_inputs, vector_length, dex_pc), - alignment_(Primitive::ComponentSize(packed_type), 0) { } + alignment_(Primitive::ComponentSize(packed_type), 0) { + DCHECK_GE(number_of_inputs, 2u); + } void SetAlignment(Alignment alignment) { alignment_ = alignment; } Alignment GetAlignment() const { return alignment_; } + HInstruction* GetArray() const { return InputAt(0); } + HInstruction* GetIndex() const { return InputAt(1); } + DECLARE_ABSTRACT_INSTRUCTION(VecMemoryOperation); private: diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 065c11eddb..f928f71209 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -638,11 +638,14 @@ void OptimizingCompiler::RunArchOptimizations(InstructionSet instruction_set, new (arena) arm::InstructionSimplifierArm(graph, stats); SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph); GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN$after_arch"); + HInstructionScheduling* scheduling = + new (arena) HInstructionScheduling(graph, instruction_set, codegen); HOptimization* arm_optimizations[] = { simplifier, side_effects, gvn, - fixups + fixups, + scheduling, }; RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer); break; diff --git a/compiler/optimizing/register_allocator_graph_color.cc b/compiler/optimizing/register_allocator_graph_color.cc index 87f709f63d..300f4c6239 100644 --- a/compiler/optimizing/register_allocator_graph_color.cc +++ b/compiler/optimizing/register_allocator_graph_color.cc @@ -1968,8 +1968,7 @@ void RegisterAllocatorGraphColor::ColorSpillSlots(ArenaVector<LiveInterval*>* in ArenaVector<std::tuple<size_t, bool, LiveInterval*>> interval_endpoints( allocator_->Adapter(kArenaAllocRegisterAllocator)); - for (auto it = intervals->begin(), e = intervals->end(); it != e; ++it) { - LiveInterval* parent_interval = *it; + for (LiveInterval* parent_interval : *intervals) { DCHECK(parent_interval->IsParent()); DCHECK(!parent_interval->HasSpillSlot()); size_t start = parent_interval->GetStart(); diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc index 
d65d20cf43..320f01a727 100644 --- a/compiler/optimizing/scheduler.cc +++ b/compiler/optimizing/scheduler.cc @@ -23,6 +23,10 @@ #include "scheduler_arm64.h" #endif +#ifdef ART_ENABLE_CODEGEN_arm +#include "scheduler_arm.h" +#endif + namespace art { void SchedulingGraph::AddDependency(SchedulingNode* node, @@ -264,10 +268,11 @@ void SchedulingGraph::DumpAsDotGraph(const std::string& description, // Start the dot graph. Use an increasing index for easier differentiation. output << "digraph G {\n"; for (const auto& entry : nodes_map_) { - DumpAsDotNode(output, entry.second); + SchedulingNode* node = entry.second; + DumpAsDotNode(output, node); } // Create a fake 'end_of_scheduling' node to help visualization of critical_paths. - for (auto node : initial_candidates) { + for (SchedulingNode* node : initial_candidates) { const HInstruction* instruction = node->GetInstruction(); output << InstructionTypeId(instruction) << ":s -> end_of_scheduling:n " << "[label=\"" << node->GetLatency() << "\",dir=back]\n"; @@ -580,28 +585,39 @@ bool HScheduler::IsSchedulingBarrier(const HInstruction* instr) const { void HInstructionScheduling::Run(bool only_optimize_loop_blocks, bool schedule_randomly) { +#if defined(ART_ENABLE_CODEGEN_arm64) || defined(ART_ENABLE_CODEGEN_arm) + // Phase-local allocator that allocates scheduler internal data structures like + // scheduling nodes, internal nodes map, dependencies, etc. + ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool()); + CriticalPathSchedulingNodeSelector critical_path_selector; + RandomSchedulingNodeSelector random_selector; + SchedulingNodeSelector* selector = schedule_randomly + ? static_cast<SchedulingNodeSelector*>(&random_selector) + : static_cast<SchedulingNodeSelector*>(&critical_path_selector); +#else // Avoid compilation error when compiling for unsupported instruction set. UNUSED(only_optimize_loop_blocks); UNUSED(schedule_randomly); +#endif switch (instruction_set_) { #ifdef ART_ENABLE_CODEGEN_arm64 case kArm64: { - // Phase-local allocator that allocates scheduler internal data structures like - // scheduling nodes, internel nodes map, dependencies, etc. - ArenaAllocator arena_allocator(graph_->GetArena()->GetArenaPool()); - - CriticalPathSchedulingNodeSelector critical_path_selector; - RandomSchedulingNodeSelector random_selector; - SchedulingNodeSelector* selector = schedule_randomly - ?
static_cast<SchedulingNodeSelector*>(&random_selector) - : static_cast<SchedulingNodeSelector*>(&critical_path_selector); - arm64::HSchedulerARM64 scheduler(&arena_allocator, selector); scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); scheduler.Schedule(graph_); break; } #endif +#if defined(ART_ENABLE_CODEGEN_arm) + case kThumb2: + case kArm: { + arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_); + arm::HSchedulerARM scheduler(&arena_allocator, selector, &arm_latency_visitor); + scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks); + scheduler.Schedule(graph_); + break; + } +#endif default: break; } diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h index 9236a0e4fa..73e8087cd0 100644 --- a/compiler/optimizing/scheduler.h +++ b/compiler/optimizing/scheduler.h @@ -23,6 +23,7 @@ #include "driver/compiler_driver.h" #include "nodes.h" #include "optimization.h" +#include "code_generator.h" namespace art { @@ -469,8 +470,9 @@ inline bool SchedulingGraph::IsSchedulingBarrier(const HInstruction* instruction class HInstructionScheduling : public HOptimization { public: - HInstructionScheduling(HGraph* graph, InstructionSet instruction_set) + HInstructionScheduling(HGraph* graph, InstructionSet instruction_set, CodeGenerator* cg = nullptr) : HOptimization(graph, kInstructionScheduling), + codegen_(cg), instruction_set_(instruction_set) {} void Run() { @@ -480,6 +482,7 @@ class HInstructionScheduling : public HOptimization { static constexpr const char* kInstructionScheduling = "scheduler"; + CodeGenerator* const codegen_; const InstructionSet instruction_set_; private: diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc new file mode 100644 index 0000000000..1a89567991 --- /dev/null +++ b/compiler/optimizing/scheduler_arm.cc @@ -0,0 +1,822 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arch/arm/instruction_set_features_arm.h" +#include "code_generator_utils.h" +#include "common_arm.h" +#include "mirror/array-inl.h" +#include "scheduler_arm.h" + +namespace art { +namespace arm { + +using helpers::Int32ConstantFrom; +using helpers::Uint64ConstantFrom; + +void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + // HAdd and HSub long operations translate to ADDS+ADC or SUBS+SBC pairs, + // so a bubble (kArmNopLatency) is added to represent the internal carry flag + // dependency inside these pairs. 
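+      // For illustration, such a pair looks roughly like (register choice is arbitrary):
+      //   adds r0, r0, r2   @ low word, sets the carry flag
+      //   adc  r1, r1, r3   @ high word, consumes the carry flag
+      // The second instruction cannot issue until the flags of the first are available,
+      // which is what the extra kArmNopLatency bubble below accounts for.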
+ last_visited_internal_latency_ = kArmIntegerOpLatency + kArmNopLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitAdd(HAdd* instr) { + HandleBinaryOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitSub(HSub* instr) { + HandleBinaryOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitMul(HMul* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = 3 * kArmMulIntegerLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmMulFloatingPointLatency; + break; + default: + last_visited_latency_ = kArmMulIntegerLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::HandleBitwiseOperationLantencies(HBinaryOperation* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitAnd(HAnd* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitOr(HOr* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitXor(HXor* instr) { + HandleBitwiseOperationLantencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitRor(HRor* instr) { + switch (instr->GetResultType()) { + case Primitive::kPrimInt: + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: { + // HandleLongRotate + HInstruction* rhs = instr->GetRight(); + if (rhs->IsConstant()) { + uint64_t rot = Uint64ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance; + if (rot != 0u) { + last_visited_internal_latency_ = 3 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } + } else { + last_visited_internal_latency_ = 9 * kArmIntegerOpLatency + kArmBranchLatency; + last_visited_latency_ = kArmBranchLatency; + } + break; + } + default: + LOG(FATAL) << "Unexpected operation type " << instr->GetResultType(); + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::HandleShiftLatencies(HBinaryOperation* instr) { + Primitive::Type type = instr->GetResultType(); + HInstruction* rhs = instr->GetRight(); + switch (type) { + case Primitive::kPrimInt: + if (!rhs->IsConstant()) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: + if (!rhs->IsConstant()) { + last_visited_internal_latency_ = 8 * kArmIntegerOpLatency; + } else { + uint32_t shift_value = Int32ConstantFrom(rhs->AsConstant()) & kMaxLongShiftDistance; + if (shift_value == 1 || shift_value >= 32) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + } + } + last_visited_latency_ = kArmIntegerOpLatency; + break; + default: + LOG(FATAL) << "Unexpected operation type " 
<< type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitShl(HShl* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitShr(HShr* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitUShr(HUShr* instr) { + HandleShiftLatencies(instr); +} + +void SchedulingLatencyVisitorARM::VisitCondition(HCondition* instr) { + switch (instr->GetLeft()->GetType()) { + case Primitive::kPrimLong: + last_visited_internal_latency_ = 4 * kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = 2 * kArmFloatingPointOpLatency; + break; + default: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + } + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitCompare(HCompare* instr) { + Primitive::Type type = instr->InputAt(0)->GetType(); + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + case Primitive::kPrimLong: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency + 3 * kArmBranchLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = kArmIntegerOpLatency + 2 * kArmFloatingPointOpLatency; + break; + default: + last_visited_internal_latency_ = 2 * kArmIntegerOpLatency; + break; + } + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitBitwiseNegatedRight(HBitwiseNegatedRight* instruction) { + if (instruction->GetResultType() == Primitive::kPrimInt) { + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateDataProcInstruction(bool internal_latency) { + if (internal_latency) { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmDataProcWithShifterOpLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateDataProc(HDataProcWithShifterOp* instruction) { + const HInstruction::InstructionKind kind = instruction->GetInstrKind(); + if (kind == HInstruction::kAdd) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else if (kind == HInstruction::kSub) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(); + } +} + +void SchedulingLatencyVisitorARM::HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction) { + DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong); + DCHECK(HDataProcWithShifterOp::IsShiftOp(instruction->GetOpKind())); + + const uint32_t shift_value = instruction->GetShiftAmount(); + const HInstruction::InstructionKind kind = instruction->GetInstrKind(); + + if (shift_value >= 32) { + // Different shift types actually generate similar code here, + // no need to differentiate shift types like the codegen pass does, + // which also avoids handling shift types from different ARM backends. 
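+    // For illustration: a long shift left by 40 reduces to roughly "high = low << 8; low = 0",
+    // and logical/arithmetic right shifts have a similarly small shape, so the latency model
+    // simply falls through to HandleGenerateDataProc below instead of distinguishing them.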
+ HandleGenerateDataProc(instruction); + } else { + DCHECK_GT(shift_value, 1U); + DCHECK_LT(shift_value, 32U); + + if (kind == HInstruction::kOr || kind == HInstruction::kXor) { + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(/* internal_latency */ true); + HandleGenerateDataProcInstruction(); + } else { + last_visited_internal_latency_ += 2 * kArmIntegerOpLatency; + HandleGenerateDataProc(instruction); + } + } +} + +void SchedulingLatencyVisitorARM::VisitDataProcWithShifterOp(HDataProcWithShifterOp* instruction) { + const HDataProcWithShifterOp::OpKind op_kind = instruction->GetOpKind(); + + if (instruction->GetType() == Primitive::kPrimInt) { + DCHECK(!HDataProcWithShifterOp::IsExtensionOp(op_kind)); + HandleGenerateDataProcInstruction(); + } else { + DCHECK_EQ(instruction->GetType(), Primitive::kPrimLong); + if (HDataProcWithShifterOp::IsExtensionOp(op_kind)) { + HandleGenerateDataProc(instruction); + } else { + HandleGenerateLongDataProc(instruction); + } + } +} + +void SchedulingLatencyVisitorARM::VisitIntermediateAddress(HIntermediateAddress* ATTRIBUTE_UNUSED) { + // Although the code generated is a simple `add` instruction, we found through empirical results + // that spacing it from its use in memory accesses was beneficial. + last_visited_internal_latency_ = kArmNopLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitMultiplyAccumulate(HMultiplyAccumulate* ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArmMulIntegerLatency; +} + +void SchedulingLatencyVisitorARM::VisitArrayGet(HArrayGet* instruction) { + Primitive::Type type = instruction->GetType(); + const bool maybe_compressed_char_at = + mirror::kUseStringCompression && instruction->IsStringCharAt(); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + HInstruction* index = instruction->InputAt(1); + + switch (type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: { + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += kArmMemoryLoadLatency; + } + if (index->IsConstant()) { + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += + kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmBranchLatency; + } else { + last_visited_latency_ += kArmMemoryLoadLatency; + } + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } + if (maybe_compressed_char_at) { + last_visited_internal_latency_ += + kArmIntegerOpLatency + kArmBranchLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmBranchLatency; + } else { + last_visited_latency_ += kArmMemoryLoadLatency; + } + } + break; + } + + case Primitive::kPrimNot: { + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + last_visited_latency_ = kArmLoadWithBakerReadBarrierLatency; + } else { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + } + last_visited_internal_latency_ = kArmMemoryLoadLatency; + } + } + break; + } + + case Primitive::kPrimLong: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = 
kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimFloat: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimDouble: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ += kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + default: + LOG(FATAL) << "Unreachable type " << type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitArrayLength(HArrayLength* instruction) { + last_visited_latency_ = kArmMemoryLoadLatency; + if (mirror::kUseStringCompression && instruction->IsStringLength()) { + last_visited_internal_latency_ = kArmMemoryLoadLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::VisitArraySet(HArraySet* instruction) { + HInstruction* index = instruction->InputAt(1); + Primitive::Type value_type = instruction->GetComponentType(); + HInstruction* array_instr = instruction->GetArray(); + bool has_intermediate_address = array_instr->IsIntermediateAddress(); + + switch (value_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryStoreLatency; + } else { + if (has_intermediate_address) { + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + } + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + } + + case Primitive::kPrimNot: { + if (instruction->InputAt(2)->IsNullConstant()) { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryStoreLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryStoreLatency; + } + } else { + // Following the exact instructions of runtime type checks is too complicated, + // just giving it a simple slow latency. + last_visited_latency_ = kArmRuntimeTypeCheckLatency; + } + break; + } + + case Primitive::kPrimLong: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimFloat: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + case Primitive::kPrimDouble: { + if (index->IsConstant()) { + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + } + + default: + LOG(FATAL) << "Unreachable type " << value_type; + UNREACHABLE(); + } +} + +void SchedulingLatencyVisitorARM::VisitBoundsCheck(HBoundsCheck* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + // Users do not use any data results. 
+ last_visited_latency_ = 0; +} + +void SchedulingLatencyVisitorARM::HandleDivRemConstantIntegralLatencies(int32_t imm) { + if (imm == 0) { + last_visited_internal_latency_ = 0; + last_visited_latency_ = 0; + } else if (imm == 1 || imm == -1) { + last_visited_latency_ = kArmIntegerOpLatency; + } else if (IsPowerOfTwo(AbsOrMin(imm))) { + last_visited_internal_latency_ = 3 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_internal_latency_ = kArmMulIntegerLatency + 2 * kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } +} + +void SchedulingLatencyVisitorARM::VisitDiv(HDiv* instruction) { + Primitive::Type type = instruction->GetResultType(); + switch (type) { + case Primitive::kPrimInt: { + HInstruction* rhs = instruction->GetRight(); + if (rhs->IsConstant()) { + int32_t imm = Int32ConstantFrom(rhs->AsConstant()); + HandleDivRemConstantIntegralLatencies(imm); + } else { + last_visited_latency_ = kArmDivIntegerLatency; + } + break; + } + case Primitive::kPrimFloat: + last_visited_latency_ = kArmDivFloatLatency; + break; + case Primitive::kPrimDouble: + last_visited_latency_ = kArmDivDoubleLatency; + break; + default: + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::VisitInstanceFieldGet(HInstanceFieldGet* instruction) { + HandleFieldGetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitInstanceFieldSet(HInstanceFieldSet* instruction) { + HandleFieldSetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitInstanceOf(HInstanceOf* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +void SchedulingLatencyVisitorARM::VisitInvoke(HInvoke* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitLoadString(HLoadString* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmLoadStringInternalLatency; + last_visited_latency_ = kArmMemoryLoadLatency; +} + +void SchedulingLatencyVisitorARM::VisitNewArray(HNewArray* ATTRIBUTE_UNUSED) { + last_visited_internal_latency_ = kArmIntegerOpLatency + kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitNewInstance(HNewInstance* instruction) { + if (instruction->IsStringAlloc()) { + last_visited_internal_latency_ = 2 * kArmMemoryLoadLatency + kArmCallInternalLatency; + } else { + last_visited_internal_latency_ = kArmCallInternalLatency; + } + last_visited_latency_ = kArmCallLatency; +} + +void SchedulingLatencyVisitorARM::VisitRem(HRem* instruction) { + Primitive::Type type = instruction->GetResultType(); + switch (type) { + case Primitive::kPrimInt: { + HInstruction* rhs = instruction->GetRight(); + if (rhs->IsConstant()) { + int32_t imm = Int32ConstantFrom(rhs->AsConstant()); + HandleDivRemConstantIntegralLatencies(imm); + } else { + last_visited_internal_latency_ = kArmDivIntegerLatency; + last_visited_latency_ = kArmMulIntegerLatency; + } + break; + } + default: + last_visited_internal_latency_ = kArmCallInternalLatency; + last_visited_latency_ = kArmCallLatency; + break; + } +} + +void SchedulingLatencyVisitorARM::HandleFieldGetLatencies(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldGet() || 
instruction->IsStaticFieldGet()); + DCHECK(codegen_ != nullptr); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd(); + + switch (field_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + case Primitive::kPrimInt: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + + case Primitive::kPrimNot: + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + case Primitive::kPrimLong: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = kArmMemoryLoadLatency + kArmIntegerOpLatency; + last_visited_latency_ = kArmMemoryLoadLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + case Primitive::kPrimFloat: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + + case Primitive::kPrimDouble: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = + kArmMemoryLoadLatency + kArmIntegerOpLatency + kArmMemoryLoadLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryLoadLatency; + } + break; + + default: + last_visited_latency_ = kArmMemoryLoadLatency; + break; + } + + if (is_volatile) { + last_visited_internal_latency_ += kArmMemoryBarrierLatency; + } +} + +void SchedulingLatencyVisitorARM::HandleFieldSetLatencies(HInstruction* instruction, + const FieldInfo& field_info) { + DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet()); + DCHECK(codegen_ != nullptr); + bool is_volatile = field_info.IsVolatile(); + Primitive::Type field_type = field_info.GetFieldType(); + bool needs_write_barrier = + CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1)); + bool atomic_ldrd_strd = codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd(); + + switch (field_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimShort: + case Primitive::kPrimChar: + if (is_volatile) { + last_visited_internal_latency_ = kArmMemoryBarrierLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmMemoryBarrierLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + case Primitive::kPrimInt: + case Primitive::kPrimNot: + if (kPoisonHeapReferences && needs_write_barrier) { + last_visited_internal_latency_ += kArmIntegerOpLatency * 2; + } + last_visited_latency_ = kArmMemoryStoreLatency; + break; + + case Primitive::kPrimLong: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = + kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + case Primitive::kPrimFloat: + last_visited_latency_ = kArmMemoryStoreLatency; + break; + + case Primitive::kPrimDouble: + if (is_volatile && !atomic_ldrd_strd) { + last_visited_internal_latency_ = kArmIntegerOpLatency + + kArmIntegerOpLatency + kArmMemoryLoadLatency + kArmMemoryStoreLatency; + last_visited_latency_ = kArmIntegerOpLatency; + } else { + last_visited_latency_ = kArmMemoryStoreLatency; + } + break; + + default: + last_visited_latency_ = kArmMemoryStoreLatency; + 
break; + } +} + +void SchedulingLatencyVisitorARM::VisitStaticFieldGet(HStaticFieldGet* instruction) { + HandleFieldGetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitStaticFieldSet(HStaticFieldSet* instruction) { + HandleFieldSetLatencies(instruction, instruction->GetFieldInfo()); +} + +void SchedulingLatencyVisitorARM::VisitSuspendCheck(HSuspendCheck* instruction) { + HBasicBlock* block = instruction->GetBlock(); + DCHECK((block->GetLoopInformation() != nullptr) || + (block->IsEntryBlock() && instruction->GetNext()->IsGoto())); + // Users do not use any data results. + last_visited_latency_ = 0; +} + +void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) { + Primitive::Type result_type = instr->GetResultType(); + Primitive::Type input_type = instr->GetInputType(); + + switch (result_type) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + last_visited_latency_ = kArmIntegerOpLatency; // SBFX or UBFX + break; + + case Primitive::kPrimInt: + switch (input_type) { + case Primitive::kPrimLong: + last_visited_latency_ = kArmIntegerOpLatency; // MOV + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmIntegerOpLatency; + break; + } + break; + + case Primitive::kPrimLong: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + // MOV and extension + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + case Primitive::kPrimFloat: + case Primitive::kPrimDouble: + // invokes runtime + last_visited_internal_latency_ = kArmCallInternalLatency; + break; + default: + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; + break; + } + break; + + case Primitive::kPrimFloat: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimLong: + // invokes runtime + last_visited_internal_latency_ = kArmCallInternalLatency; + break; + case Primitive::kPrimDouble: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + } + break; + + case Primitive::kPrimDouble: + switch (input_type) { + case Primitive::kPrimBoolean: + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + last_visited_internal_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimLong: + last_visited_internal_latency_ = 5 * kArmFloatingPointOpLatency; + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + case Primitive::kPrimFloat: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + default: + last_visited_latency_ = kArmFloatingPointOpLatency; + break; + } + break; + + default: + last_visited_latency_ = kArmTypeConversionFloatingPointIntegerLatency; + break; + } +} + +void 
SchedulingLatencyVisitorARM::VisitArmDexCacheArraysBase(art::HArmDexCacheArraysBase*) { + last_visited_internal_latency_ = kArmIntegerOpLatency; + last_visited_latency_ = kArmIntegerOpLatency; +} + +} // namespace arm +} // namespace art diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h new file mode 100644 index 0000000000..8d5e4f375b --- /dev/null +++ b/compiler/optimizing/scheduler_arm.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ +#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ + +#include "code_generator_arm_vixl.h" +#include "scheduler.h" + +namespace art { +namespace arm { +#ifdef ART_USE_OLD_ARM_BACKEND +typedef CodeGeneratorARM CodeGeneratorARMType; +#else +typedef CodeGeneratorARMVIXL CodeGeneratorARMType; +#endif + +// AArch32 instruction latencies. +// We currently assume that all ARM CPUs share the same instruction latency list. +// The following latencies were tuned based on performance experiments and +// automatic tuning using differential evolution approach on various benchmarks. +static constexpr uint32_t kArmIntegerOpLatency = 2; +static constexpr uint32_t kArmFloatingPointOpLatency = 11; +static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4; +static constexpr uint32_t kArmMulIntegerLatency = 6; +static constexpr uint32_t kArmMulFloatingPointLatency = 11; +static constexpr uint32_t kArmDivIntegerLatency = 10; +static constexpr uint32_t kArmDivFloatLatency = 20; +static constexpr uint32_t kArmDivDoubleLatency = 25; +static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11; +static constexpr uint32_t kArmMemoryLoadLatency = 9; +static constexpr uint32_t kArmMemoryStoreLatency = 9; +static constexpr uint32_t kArmMemoryBarrierLatency = 6; +static constexpr uint32_t kArmBranchLatency = 4; +static constexpr uint32_t kArmCallLatency = 5; +static constexpr uint32_t kArmCallInternalLatency = 29; +static constexpr uint32_t kArmLoadStringInternalLatency = 10; +static constexpr uint32_t kArmNopLatency = 2; +static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18; +static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46; + +class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor { + public: + explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen) + : codegen_(down_cast<CodeGeneratorARMType*>(codegen)) {} + + // Default visitor for instructions not handled specifically below. + void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) { + last_visited_latency_ = kArmIntegerOpLatency; + } + +// We add a second unused parameter to be able to use this macro like the others +// defined in `nodes.h`. 
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \ + M(ArrayGet , unused) \ + M(ArrayLength , unused) \ + M(ArraySet , unused) \ + M(Add , unused) \ + M(Sub , unused) \ + M(And , unused) \ + M(Or , unused) \ + M(Ror , unused) \ + M(Xor , unused) \ + M(Shl , unused) \ + M(Shr , unused) \ + M(UShr , unused) \ + M(Mul , unused) \ + M(Div , unused) \ + M(Condition , unused) \ + M(Compare , unused) \ + M(BoundsCheck , unused) \ + M(InstanceFieldGet , unused) \ + M(InstanceFieldSet , unused) \ + M(InstanceOf , unused) \ + M(Invoke , unused) \ + M(LoadString , unused) \ + M(NewArray , unused) \ + M(NewInstance , unused) \ + M(Rem , unused) \ + M(StaticFieldGet , unused) \ + M(StaticFieldSet , unused) \ + M(SuspendCheck , unused) \ + M(TypeConversion , unused) + +#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \ + M(BitwiseNegatedRight, unused) \ + M(MultiplyAccumulate, unused) \ + M(IntermediateAddress, unused) \ + M(DataProcWithShifterOp, unused) + +#define DECLARE_VISIT_INSTRUCTION(type, unused) \ + void Visit##type(H##type* instruction) OVERRIDE; + + FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION + + private: + void HandleBinaryOperationLantencies(HBinaryOperation* instr); + void HandleBitwiseOperationLantencies(HBinaryOperation* instr); + void HandleShiftLatencies(HBinaryOperation* instr); + void HandleDivRemConstantIntegralLatencies(int32_t imm); + void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info); + void HandleGenerateDataProcInstruction(bool internal_latency = false); + void HandleGenerateDataProc(HDataProcWithShifterOp* instruction); + void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction); + + // The latency setting for each HInstruction depends on how CodeGenerator may generate code, + // latency visitors may query CodeGenerator for such information for accurate latency settings. 
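+  // For example, HandleFieldGetLatencies() above queries
+  // codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd() to decide whether a volatile
+  // Primitive::kPrimLong load can be a single load or needs a longer synthesized sequence,
+  // and charges extra internal latency in the latter case.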
+ CodeGeneratorARMType* codegen_; +}; + +class HSchedulerARM : public HScheduler { + public: + HSchedulerARM(ArenaAllocator* arena, + SchedulingNodeSelector* selector, + SchedulingLatencyVisitorARM* arm_latency_visitor) + : HScheduler(arena, arm_latency_visitor, selector) {} + ~HSchedulerARM() OVERRIDE {} + + bool IsSchedulable(const HInstruction* instruction) const OVERRIDE { +#define CASE_INSTRUCTION_KIND(type, unused) case \ + HInstruction::InstructionKind::k##type: + switch (instruction->GetKind()) { + FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND) + return true; + FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND) + return true; + default: + return HScheduler::IsSchedulable(instruction); + } +#undef CASE_INSTRUCTION_KIND + } + + private: + DISALLOW_COPY_AND_ASSIGN(HSchedulerARM); +}; + +} // namespace arm +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_ diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc index 31d13e2a26..d87600aa5e 100644 --- a/compiler/optimizing/scheduler_test.cc +++ b/compiler/optimizing/scheduler_test.cc @@ -28,6 +28,10 @@ #include "scheduler_arm64.h" #endif +#ifdef ART_ENABLE_CODEGEN_arm +#include "scheduler_arm.h" +#endif + namespace art { // Return all combinations of ISA and code generator that are executable on @@ -56,7 +60,7 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { #endif }; - for (auto test_config : test_config_candidates) { + for (const CodegenTargetConfig& test_config : test_config_candidates) { if (CanExecute(test_config.GetInstructionSet())) { v.push_back(test_config); } @@ -65,133 +69,151 @@ static ::std::vector<CodegenTargetConfig> GetTargetConfigs() { return v; } -class SchedulerTest : public CommonCompilerTest {}; - -#ifdef ART_ENABLE_CODEGEN_arm64 -TEST_F(SchedulerTest, DependencyGraph) { - ArenaPool pool; - ArenaAllocator allocator(&pool); - HGraph* graph = CreateGraph(&allocator); - HBasicBlock* entry = new (&allocator) HBasicBlock(graph); - HBasicBlock* block1 = new (&allocator) HBasicBlock(graph); - graph->AddBlock(entry); - graph->AddBlock(block1); - graph->SetEntryBlock(entry); - - // entry: - // array ParameterValue - // c1 IntConstant - // c2 IntConstant - // block1: - // add1 Add [c1, c2] - // add2 Add [add1, c2] - // mul Mul [add1, add2] - // div_check DivZeroCheck [add2] (env: add2, mul) - // div Div [add1, div_check] - // array_get1 ArrayGet [array, add1] - // array_set1 ArraySet [array, add1, add2] - // array_get2 ArrayGet [array, add1] - // array_set2 ArraySet [array, add1, add2] - - HInstruction* array = new (&allocator) HParameterValue(graph->GetDexFile(), - dex::TypeIndex(0), - 0, - Primitive::kPrimNot); - HInstruction* c1 = graph->GetIntConstant(1); - HInstruction* c2 = graph->GetIntConstant(10); - HInstruction* add1 = new (&allocator) HAdd(Primitive::kPrimInt, c1, c2); - HInstruction* add2 = new (&allocator) HAdd(Primitive::kPrimInt, add1, c2); - HInstruction* mul = new (&allocator) HMul(Primitive::kPrimInt, add1, add2); - HInstruction* div_check = new (&allocator) HDivZeroCheck(add2, 0); - HInstruction* div = new (&allocator) HDiv(Primitive::kPrimInt, add1, div_check, 0); - HInstruction* array_get1 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0); - HInstruction* array_set1 = new (&allocator) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); - HInstruction* array_get2 = new (&allocator) HArrayGet(array, add1, Primitive::kPrimInt, 0); - HInstruction* array_set2 = new (&allocator) HArraySet(array, 
add1, add2, Primitive::kPrimInt, 0); - - DCHECK(div_check->CanThrow()); - - entry->AddInstruction(array); - - HInstruction* block_instructions[] = {add1, - add2, - mul, - div_check, - div, - array_get1, - array_set1, - array_get2, - array_set2}; - for (auto instr : block_instructions) { - block1->AddInstruction(instr); +class SchedulerTest : public CommonCompilerTest { + public: + SchedulerTest() : pool_(), allocator_(&pool_) { + graph_ = CreateGraph(&allocator_); } - HEnvironment* environment = new (&allocator) HEnvironment(&allocator, - 2, - graph->GetArtMethod(), + // Build scheduling graph, and run target specific scheduling on it. + void TestBuildDependencyGraphAndSchedule(HScheduler* scheduler) { + HBasicBlock* entry = new (&allocator_) HBasicBlock(graph_); + HBasicBlock* block1 = new (&allocator_) HBasicBlock(graph_); + graph_->AddBlock(entry); + graph_->AddBlock(block1); + graph_->SetEntryBlock(entry); + + // entry: + // array ParameterValue + // c1 IntConstant + // c2 IntConstant + // block1: + // add1 Add [c1, c2] + // add2 Add [add1, c2] + // mul Mul [add1, add2] + // div_check DivZeroCheck [add2] (env: add2, mul) + // div Div [add1, div_check] + // array_get1 ArrayGet [array, add1] + // array_set1 ArraySet [array, add1, add2] + // array_get2 ArrayGet [array, add1] + // array_set2 ArraySet [array, add1, add2] + + HInstruction* array = new (&allocator_) HParameterValue(graph_->GetDexFile(), + dex::TypeIndex(0), 0, - div_check); - div_check->SetRawEnvironment(environment); - environment->SetRawEnvAt(0, add2); - add2->AddEnvUseAt(div_check->GetEnvironment(), 0); - environment->SetRawEnvAt(1, mul); - mul->AddEnvUseAt(div_check->GetEnvironment(), 1); - - ArenaAllocator* arena = graph->GetArena(); - CriticalPathSchedulingNodeSelector critical_path_selector; - arm64::HSchedulerARM64 scheduler(arena, &critical_path_selector); - SchedulingGraph scheduling_graph(&scheduler, arena); - // Instructions must be inserted in reverse order into the scheduling graph. 
- for (auto instr : ReverseRange(block_instructions)) { - scheduling_graph.AddNode(instr); + Primitive::kPrimNot); + HInstruction* c1 = graph_->GetIntConstant(1); + HInstruction* c2 = graph_->GetIntConstant(10); + HInstruction* add1 = new (&allocator_) HAdd(Primitive::kPrimInt, c1, c2); + HInstruction* add2 = new (&allocator_) HAdd(Primitive::kPrimInt, add1, c2); + HInstruction* mul = new (&allocator_) HMul(Primitive::kPrimInt, add1, add2); + HInstruction* div_check = new (&allocator_) HDivZeroCheck(add2, 0); + HInstruction* div = new (&allocator_) HDiv(Primitive::kPrimInt, add1, div_check, 0); + HInstruction* array_get1 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0); + HInstruction* array_set1 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); + HInstruction* array_get2 = new (&allocator_) HArrayGet(array, add1, Primitive::kPrimInt, 0); + HInstruction* array_set2 = new (&allocator_) HArraySet(array, add1, add2, Primitive::kPrimInt, 0); + + DCHECK(div_check->CanThrow()); + + entry->AddInstruction(array); + + HInstruction* block_instructions[] = {add1, + add2, + mul, + div_check, + div, + array_get1, + array_set1, + array_get2, + array_set2}; + for (HInstruction* instr : block_instructions) { + block1->AddInstruction(instr); + } + + HEnvironment* environment = new (&allocator_) HEnvironment(&allocator_, + 2, + graph_->GetArtMethod(), + 0, + div_check); + div_check->SetRawEnvironment(environment); + environment->SetRawEnvAt(0, add2); + add2->AddEnvUseAt(div_check->GetEnvironment(), 0); + environment->SetRawEnvAt(1, mul); + mul->AddEnvUseAt(div_check->GetEnvironment(), 1); + + SchedulingGraph scheduling_graph(scheduler, graph_->GetArena()); + // Instructions must be inserted in reverse order into the scheduling graph. + for (HInstruction* instr : ReverseRange(block_instructions)) { + scheduling_graph.AddNode(instr); + } + + // Should not have dependencies cross basic blocks. + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2)); + + // Define-use dependency. + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2)); + ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1)); + ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2)); + + // Read and write dependencies + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1)); + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1)); + + // Env dependency. + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul)); + ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check)); + + // CanThrow. + ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check)); + + // Exercise the code path of target specific scheduler and SchedulingLatencyVisitor. + scheduler->Schedule(graph_); } - // Should not have dependencies cross basic blocks. 
- ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, c1)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add2, c2)); - - // Define-use dependency. - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(add2, add1)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(add1, add2)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div_check, add2)); - ASSERT_FALSE(scheduling_graph.HasImmediateDataDependency(div_check, add1)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(div, div_check)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add1)); - ASSERT_TRUE(scheduling_graph.HasImmediateDataDependency(array_set1, add2)); - - // Read and write dependencies - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, array_get1)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_get2)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_get2, array_set1)); - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set2, array_set1)); - - // Env dependency. - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(div_check, mul)); - ASSERT_FALSE(scheduling_graph.HasImmediateOtherDependency(mul, div_check)); - - // CanThrow. - ASSERT_TRUE(scheduling_graph.HasImmediateOtherDependency(array_set1, div_check)); + void CompileWithRandomSchedulerAndRun(const uint16_t* data, bool has_result, int expected) { + for (CodegenTargetConfig target_config : GetTargetConfigs()) { + HGraph* graph = CreateCFG(&allocator_, data); + + // Schedule the graph randomly. + HInstructionScheduling scheduling(graph, target_config.GetInstructionSet()); + scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true); + + RunCode(target_config, + graph, + [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); }, + has_result, expected); + } + } + + ArenaPool pool_; + ArenaAllocator allocator_; + HGraph* graph_; +}; + +#if defined(ART_ENABLE_CODEGEN_arm64) +TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM64) { + CriticalPathSchedulingNodeSelector critical_path_selector; + arm64::HSchedulerARM64 scheduler(&allocator_, &critical_path_selector); + TestBuildDependencyGraphAndSchedule(&scheduler); } #endif -static void CompileWithRandomSchedulerAndRun(const uint16_t* data, - bool has_result, - int expected) { - for (CodegenTargetConfig target_config : GetTargetConfigs()) { - ArenaPool pool; - ArenaAllocator arena(&pool); - HGraph* graph = CreateCFG(&arena, data); - - // Schedule the graph randomly. 
- HInstructionScheduling scheduling(graph, target_config.GetInstructionSet()); - scheduling.Run(/*only_optimize_loop_blocks*/ false, /*schedule_randomly*/ true); - - RunCode(target_config, - graph, - [](HGraph* graph_arg) { RemoveSuspendChecks(graph_arg); }, - has_result, expected); - } +#if defined(ART_ENABLE_CODEGEN_arm) +TEST_F(SchedulerTest, DependencyGrapAndSchedulerARM) { + CriticalPathSchedulingNodeSelector critical_path_selector; + arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr); + arm::HSchedulerARM scheduler(&allocator_, &critical_path_selector, &arm_latency_visitor); + TestBuildDependencyGraphAndSchedule(&scheduler); } +#endif TEST_F(SchedulerTest, RandomScheduling) { // diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc index eedaf6e67e..98ded24257 100644 --- a/compiler/optimizing/sharpening.cc +++ b/compiler/optimizing/sharpening.cc @@ -56,7 +56,7 @@ static bool IsInBootImage(ArtMethod* method) { const std::vector<gc::space::ImageSpace*>& image_spaces = Runtime::Current()->GetHeap()->GetBootImageSpaces(); for (gc::space::ImageSpace* image_space : image_spaces) { - const auto& method_section = image_space->GetImageHeader().GetMethodsSection(); + const ImageSection& method_section = image_space->GetImageHeader().GetMethodsSection(); if (method_section.Contains(reinterpret_cast<uint8_t*>(method) - image_space->Begin())) { return true; } diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h index 5c36110cf6..2ff9018510 100644 --- a/compiler/utils/arm/assembler_thumb2.h +++ b/compiler/utils/arm/assembler_thumb2.h @@ -924,9 +924,11 @@ class Thumb2Assembler FINAL : public ArmAssembler { class ScopedForce32Bit { public: - explicit ScopedForce32Bit(Thumb2Assembler* assembler) + explicit ScopedForce32Bit(Thumb2Assembler* assembler, bool force = true) : assembler_(assembler), old_force_32bit_(assembler->IsForced32Bit()) { - assembler->Force32Bit(); + if (force) { + assembler->Force32Bit(); + } } ~ScopedForce32Bit() { diff --git a/dexlayout/dexdiag.cc b/dexlayout/dexdiag.cc index c577b6e105..78860e3f96 100644 --- a/dexlayout/dexdiag.cc +++ b/dexlayout/dexdiag.cc @@ -200,7 +200,8 @@ static void ProcessPageMap(uint64_t* pagemap, for (size_t page = start; page < end; ++page) { char type_char = '.'; if (PM_PAGEMAP_PRESENT(pagemap[page])) { - uint16_t type = FindSectionTypeForPage(page, sections); + const size_t dex_page_offset = page - start; + uint16_t type = FindSectionTypeForPage(dex_page_offset, sections); page_counts->Increment(type); type_char = PageTypeChar(type); } @@ -231,7 +232,8 @@ static void DisplayDexStatistics(size_t start, return; } for (size_t page = start; page < end; ++page) { - mapped_pages.Increment(FindSectionTypeForPage(page, sections)); + const size_t dex_page_offset = page - start; + mapped_pages.Increment(FindSectionTypeForPage(dex_page_offset, sections)); } size_t total_resident_pages = 0; printer->PrintHeader(); diff --git a/runtime/arch/arch_test.cc b/runtime/arch/arch_test.cc index 1a5e39f0f7..d6056c0ece 100644 --- a/runtime/arch/arch_test.cc +++ b/runtime/arch/arch_test.cc @@ -71,11 +71,15 @@ static constexpr size_t kFrameSizeSaveRefsAndArgs = FRAME_SIZE_SAVE_REFS_AND_ARG #undef FRAME_SIZE_SAVE_REFS_AND_ARGS static constexpr size_t kFrameSizeSaveEverything = FRAME_SIZE_SAVE_EVERYTHING; #undef FRAME_SIZE_SAVE_EVERYTHING +#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET +#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET +#undef 
BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET #undef BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET -#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET -#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET +#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET +#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET #undef BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET -#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET +#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET +#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET } // namespace arm namespace arm64 { diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h index f1f1766ad4..8f2fd6ecc9 100644 --- a/runtime/arch/arm/asm_support_arm.h +++ b/runtime/arch/arm/asm_support_arm.h @@ -24,18 +24,25 @@ #define FRAME_SIZE_SAVE_REFS_AND_ARGS 112 #define FRAME_SIZE_SAVE_EVERYTHING 192 +// The offset from the art_quick_read_barrier_mark_introspection (used for field +// loads with 32-bit LDR) to the entrypoint for field loads with 16-bit LDR, +// i.e. art_quick_read_barrier_mark_introspection_narrow. +#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET 0x20 +// The offsets from art_quick_read_barrier_mark_introspection to the GC root entrypoints, +// i.e. art_quick_read_barrier_mark_introspection_gc_roots_{wide,narrow}. +#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET 0x80 +#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET 0xc0 // The offset from art_quick_read_barrier_mark_introspection to the array switch cases, // i.e. art_quick_read_barrier_mark_introspection_arrays. #define BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET 0x100 -// The offset from art_quick_read_barrier_mark_introspection to the GC root entrypoint, -// i.e. art_quick_read_barrier_mark_introspection_gc_roots. -#define BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET 0xc0 // The offset of the reference load LDR from the return address in LR for field loads. #ifdef USE_HEAP_POISONING -#define BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET -8 +#define BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET -8 +#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET -4 #else -#define BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET -4 +#define BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET -4 +#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET -2 #endif // The offset of the reference load LDR from the return address in LR for array loads. #ifdef USE_HEAP_POISONING @@ -44,7 +51,8 @@ #define BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET -4 #endif // The offset of the reference load LDR from the return address in LR for GC root loads. 
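As a cross-check of the wide/narrow values (an illustrative sketch, not part of the patch): each offset is the negative distance from the return address in LR back to the reference-loading LDR. For field loads, LR points just past the LDR, so the offset is minus the LDR size (4 bytes for encoding T3, 2 bytes for encoding T1), with one extra unpoisoning instruction in front when heap poisoning is enabled. For GC root loads, the LDR additionally precedes the 4-byte branch to the thunk, which is exactly what the GC root defines just below encode.

// Illustrative consistency checks only (not in the patch); they restate the values in this
// header in terms of Thumb-2 instruction sizes.
#ifdef USE_HEAP_POISONING
static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET == -(4 + 4),
              "T3 LDR plus a 32-bit unpoisoning instruction");
static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET == -(2 + 2),
              "T1 LDR plus a 16-bit unpoisoning instruction");
#else
static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET == -4, "T3 LDR is 4 bytes");
static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET == -2, "T1 LDR is 2 bytes");
#endif
static_assert(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET == -(4 + 4),
              "T3 LDR followed by the 32-bit branch to the thunk");
static_assert(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET == -(2 + 4),
              "T1 LDR followed by the 32-bit branch to the thunk");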
-#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET -8 +#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET -8 +#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET -6 // Flag for enabling R4 optimization in arm runtime // #define ARM_R4_SUSPEND_FLAG diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc index 6b7247773a..919b0afc40 100644 --- a/runtime/arch/arm/entrypoints_init_arm.cc +++ b/runtime/arch/arm/entrypoints_init_arm.cc @@ -53,8 +53,11 @@ extern "C" mirror::Object* art_quick_read_barrier_mark_reg11(mirror::Object*); extern "C" mirror::Object* art_quick_read_barrier_mark_reg12(mirror::Object*); extern "C" mirror::Object* art_quick_read_barrier_mark_introspection(mirror::Object*); +extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_narrow(mirror::Object*); extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_arrays(mirror::Object*); -extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots(mirror::Object*); +extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots_wide(mirror::Object*); +extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots_narrow( + mirror::Object*); // Used by soft float. // Single-precision FP arithmetics. @@ -86,18 +89,27 @@ void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) { qpoints->pReadBarrierMarkReg10 = is_active ? art_quick_read_barrier_mark_reg10 : nullptr; qpoints->pReadBarrierMarkReg11 = is_active ? art_quick_read_barrier_mark_reg11 : nullptr; - // Check that array switch cases are at appropriate offsets from the introspection entrypoint. // For the alignment check, strip the Thumb mode bit. DCHECK_ALIGNED(reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection) - 1u, 256u); + // Check the field narrow entrypoint offset from the introspection entrypoint. + intptr_t narrow_diff = + reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_narrow) - + reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection); + DCHECK_EQ(BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET, narrow_diff); + // Check array switch cases offsets from the introspection entrypoint. intptr_t array_diff = reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_arrays) - reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection); DCHECK_EQ(BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET, array_diff); - // Check that the GC root entrypoint is at appropriate offset from the introspection entrypoint. - intptr_t gc_roots_diff = - reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots) - + // Check the GC root entrypoint offsets from the introspection entrypoint. + intptr_t gc_roots_wide_diff = + reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots_wide) - + reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection); + DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET, gc_roots_wide_diff); + intptr_t gc_roots_narrow_diff = + reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots_narrow) - reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection); - DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET, gc_roots_diff); + DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET, gc_roots_narrow_diff); // The register 12, i.e. IP, is reserved, so there is no art_quick_read_barrier_mark_reg12. 
// We're using the entry to hold a pointer to the introspection entrypoint instead. qpoints->pReadBarrierMarkReg12 = is_active ? art_quick_read_barrier_mark_introspection : nullptr; diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S index 6be7537d61..31a7f6ae8e 100644 --- a/runtime/arch/arm/quick_entrypoints_arm.S +++ b/runtime/arch/arm/quick_entrypoints_arm.S @@ -2189,7 +2189,7 @@ READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11 .byte (.Lmark_introspection_return_switch_case_bad - .Lmark_introspection_return_table) / 2 .endm -#if BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET != BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET +#if BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET != BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET #error "Array and field introspection code sharing requires same LDR offset." #endif .macro BRBMI_ARRAY_LOAD index_reg @@ -2208,7 +2208,10 @@ READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11 BRBMI_BKPT_FILL_4B .endm -.macro BRBMI_SLOW_PATH ldr_offset +.macro BRBMI_RUNTIME_CALL + // Note: This macro generates exactly 22 bytes of code. The core register + // PUSH and the MOVs are 16-bit instructions, the rest is 32-bit instructions. + push {r0-r3, r7, lr} // Save return address and caller-save registers. .cfi_adjust_cfa_offset 24 .cfi_rel_offset r0, 0 @@ -2234,11 +2237,72 @@ READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11 .cfi_restore r3 .cfi_restore r7 .cfi_restore lr +.endm + +.macro BRBMI_CHECK_NULL_AND_MARKED label_suffix + // If reference is null, just return it in the right register. + cmp ip, #0 + beq .Lmark_introspection_return\label_suffix + // Use R4 as temp and check the mark bit of the reference. + ldr r4, [ip, #MIRROR_OBJECT_LOCK_WORD_OFFSET] + tst r4, #LOCK_WORD_MARK_BIT_MASK_SHIFTED + beq .Lmark_introspection_unmarked\label_suffix +.Lmark_introspection_return\label_suffix: +.endm + +.macro BRBMI_UNMARKED_FORWARDING_ADDRESS_CHECK label_suffix +.Lmark_introspection_unmarked\label_suffix: + // Check if the top two bits are one, if this is the case it is a forwarding address. +#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3) + // To use "CMP ip, #modified-immediate; BHS", we need the lock word state in + // the highest bits and the "forwarding address" state to have all bits set. +#error "Unexpected lock word state shift or forwarding address state value." +#endif + cmp r4, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT) + bhs .Lmark_introspection_forwarding_address\label_suffix +.endm + +.macro BRBMI_EXTRACT_FORWARDING_ADDRESS label_suffix +.Lmark_introspection_forwarding_address\label_suffix: + // Note: This macro generates exactly 22 bytes of code, the branch is near. + // Shift left by the forwarding address shift. This clears out the state bits since they are + // in the top 2 bits of the lock word. + lsl ip, r4, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT + b .Lmark_introspection_return\label_suffix +.endm + +.macro BRBMI_LOAD_RETURN_REG_FROM_CODE_wide ldr_offset // Load the half of the instruction that contains Rt. Adjust for the thumb state in LR. ldrh r4, [lr, #(-1 + \ldr_offset + 2)] - lsr r4, r4, #12 // Extract `ref_reg`. - b .Lmark_introspection_return_switch +.endm + +.macro BRBMI_LOAD_RETURN_REG_FROM_CODE_narrow ldr_offset + // Load the 16-bit instruction. Adjust for the thumb state in LR. 
+ ldrh r4, [lr, #(-1 + \ldr_offset)] +.endm + +.macro BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH gc_root_ldr_offset, label_suffix + .balign 64 + .thumb_func + .type art_quick_read_barrier_mark_introspection_gc_roots\label_suffix, #function + .hidden art_quick_read_barrier_mark_introspection_gc_roots\label_suffix + .global art_quick_read_barrier_mark_introspection_gc_roots\label_suffix +art_quick_read_barrier_mark_introspection_gc_roots\label_suffix: + BRBMI_RUNTIME_CALL + // Load the LDR (or the half of it) that contains Rt. + BRBMI_LOAD_RETURN_REG_FROM_CODE\label_suffix \gc_root_ldr_offset + b .Lmark_introspection_extract_register_and_return\label_suffix + // We've used 28 bytes since the "gc_roots" entrypoint (22 bytes for + // BRBMI_RUNTIME_CALL, 4 bytes for LDRH and 2 bytes for the branch). Squeeze + // the 6 byte forwarding address extraction here across the 32-byte boundary. + BRBMI_EXTRACT_FORWARDING_ADDRESS \label_suffix + // And the slow path taking exactly 30 bytes (6 bytes for the forwarding + // address check, 22 bytes for BRBMI_RUNTIME_CALL and 2 bytes for the near + // branch) shall take the rest of the 32-byte section (within a cache line). + BRBMI_UNMARKED_FORWARDING_ADDRESS_CHECK \label_suffix + BRBMI_RUNTIME_CALL + b .Lmark_introspection_return\label_suffix .endm /* @@ -2249,14 +2313,16 @@ READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11 * * The entrypoint is called through a thunk that differs across load kinds. * For field and array loads the LDR instruction in generated code follows - * the branch to the thunk, i.e. the LDR is at [LR, #(-4 - 1)] where the -1 - * is an adjustment for the Thumb mode bit in LR, and the thunk knows the - * holder and performs the gray bit check, returning to the LDR instruction - * if the object is not gray, so this entrypoint no longer needs to know - * anything about the holder. For GC root loads, the LDR instruction in - * generated code precedes the branch to the thunk, i.e. the LDR is at - * [LR, #(-8 - 1)] where the -1 is again the Thumb mode bit adjustment, and - * the thunk does not do the gray bit check. + * the branch to the thunk, i.e. the LDR is (ignoring the heap poisoning) + * at [LR, #(-4 - 1)] (encoding T3) or [LR, #(-2 - 1)] (encoding T1) where + * the -1 is an adjustment for the Thumb mode bit in LR, and the thunk + * knows the holder and performs the gray bit check, returning to the LDR + * instruction if the object is not gray, so this entrypoint no longer + * needs to know anything about the holder. For GC root loads, the LDR + * instruction in generated code precedes the branch to the thunk, i.e. the + * LDR is at [LR, #(-8 - 1)] (encoding T3) or [LR, #(-6 - 1)] (encoding T1) + * where the -1 is again the Thumb mode bit adjustment, and the thunk does + * not do the gray bit check. * * For field accesses and array loads with a constant index the thunk loads * the reference into IP using introspection and calls the main entrypoint, @@ -2288,11 +2354,29 @@ READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11 * * The code structure is * art_quick_read_barrier_mark_introspection: - * Over 128 bytes for the main entrypoint code. - * Padding to 192 bytes if needed. - * art_quick_read_barrier_mark_introspection_gc_roots: - * GC root entrypoint code. - * Padding to 256 bytes if needed. + * Up to 32 bytes code for main entrypoint fast-path code for fields + * (and array elements with constant offset) with LDR encoding T3; + * jumps to the switch in the "narrow" entrypoint. + * Padding to 32 bytes if needed. 
+ * art_quick_read_barrier_mark_introspection_narrow: + * Up to 48 bytes code for fast path code for fields (and array + * elements with constant offset) with LDR encoding T1, ending in the + * return switch instruction TBB and the table with switch offsets. + * Padding to 80 bytes if needed. + * .Lmark_introspection_return_switch_case_r0: + * Exactly 48 bytes of code for the return switch cases (12 cases, + * including BKPT for the reserved registers). + * Ends at 128 bytes total. + * art_quick_read_barrier_mark_introspection_gc_roots_wide: + * GC root entrypoint code for LDR encoding T3 (28 bytes). + * Forwarding address extraction for LDR encoding T3 (6 bytes). + * Slow path for main entrypoint for LDR encoding T3 (30 bytes). + * Ends at 192 bytes total. + * art_quick_read_barrier_mark_introspection_gc_roots_narrow: + * GC root entrypoint code for LDR encoding T1 (28 bytes). + * Forwarding address extraction for LDR encoding T1 (6 bytes). + * Slow path for main entrypoint for LDR encoding T1 (30 bytes). + * Ends at 256 bytes total. * art_quick_read_barrier_mark_introspection_arrays: * Exactly 128 bytes for array load switch cases (16x2 instructions). */ @@ -2302,17 +2386,30 @@ ENTRY art_quick_read_barrier_mark_introspection // (R4 is reserved for the entrypoint address.) // For heap poisoning, the reference is poisoned, so unpoison it first. UNPOISON_HEAP_REF ip - // If reference is null, just return it in the right register. - cmp ip, #0 - beq .Lmark_introspection_return - // Use R4 as temp and check the mark bit of the reference. - ldr r4, [ip, #MIRROR_OBJECT_LOCK_WORD_OFFSET] - tst r4, #LOCK_WORD_MARK_BIT_MASK_SHIFTED - beq .Lmark_introspection_unmarked -.Lmark_introspection_return: - // Load the half of the instruction that contains Rt. Adjust for the thumb state in LR. - ldrh r4, [lr, #(-1 + BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET + 2)] + // Check for null or marked, lock word is loaded into IP. + BRBMI_CHECK_NULL_AND_MARKED _wide + // Load the half of the instruction that contains Rt. + BRBMI_LOAD_RETURN_REG_FROM_CODE_wide BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET +.Lmark_introspection_extract_register_and_return_wide: lsr r4, r4, #12 // Extract `ref_reg`. + b .Lmark_introspection_return_switch + + .balign 32 + .thumb_func + .type art_quick_read_barrier_mark_introspection_narrow, #function + .hidden art_quick_read_barrier_mark_introspection_narrow + .global art_quick_read_barrier_mark_introspection_narrow +art_quick_read_barrier_mark_introspection_narrow: + // At this point, IP contains the reference, R4 can be freely used. + // (R4 is reserved for the entrypoint address.) + // For heap poisoning, the reference is poisoned, so unpoison it first. + UNPOISON_HEAP_REF ip + // Check for null or marked, lock word is loaded into R4. + BRBMI_CHECK_NULL_AND_MARKED _narrow + // Load the 16-bit instruction. + BRBMI_LOAD_RETURN_REG_FROM_CODE_narrow BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET +.Lmark_introspection_extract_register_and_return_narrow: + and r4, r4, #7 // Extract `ref_reg`. .Lmark_introspection_return_switch: tbb [pc, r4] // Jump to the switch case. .Lmark_introspection_return_table: @@ -2320,32 +2417,8 @@ ENTRY art_quick_read_barrier_mark_introspection .balign 16 BRBMI_FOR_12_REGISTERS BRBMI_RETURN_SWITCH_CASE, BRBMI_BAD_RETURN_SWITCH_CASE - .balign 16 -.Lmark_introspection_unmarked: - // Check if the top two bits are one, if this is the case it is a forwarding address. 
-#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3) - // To use "CMP ip, #modified-immediate; BHS", we need the lock word state in - // the highest bits and the "forwarding address" state to have all bits set. -#error "Unexpected lock word state shift or forwarding address state value." -#endif - cmp r4, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT) - bhs .Lmark_introspection_forwarding_address - BRBMI_SLOW_PATH BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET - - .balign 8 -.Lmark_introspection_forwarding_address: - // Shift left by the forwarding address shift. This clears out the state bits since they are - // in the top 2 bits of the lock word. - lsl ip, r4, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT - b .Lmark_introspection_return - - .balign 64 - .thumb_func - .type art_quick_read_barrier_mark_introspection_gc_roots, #function - .hidden art_quick_read_barrier_mark_introspection_gc_roots - .global art_quick_read_barrier_mark_introspection_gc_roots -art_quick_read_barrier_mark_introspection_gc_roots: - BRBMI_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET + BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET, _wide + BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET, _narrow .balign 256 .thumb_func diff --git a/runtime/dex_file_annotations.cc b/runtime/dex_file_annotations.cc index 13979160bd..f21f1a2704 100644 --- a/runtime/dex_file_annotations.cc +++ b/runtime/dex_file_annotations.cc @@ -1421,11 +1421,20 @@ mirror::ObjectArray<mirror::String>* GetSignatureAnnotationForClass(Handle<mirro } const char* GetSourceDebugExtension(Handle<mirror::Class> klass) { + // Before instantiating ClassData, check that klass has a DexCache + // assigned. The ClassData constructor indirectly dereferences it + // when calling klass->GetDexFile(). + if (klass->GetDexCache() == nullptr) { + DCHECK(klass->IsPrimitive() || klass->IsArrayClass()); + return nullptr; + } + ClassData data(klass); const DexFile::AnnotationSetItem* annotation_set = FindAnnotationSetForClass(data); if (annotation_set == nullptr) { return nullptr; } + const DexFile::AnnotationItem* annotation_item = SearchAnnotationSet( data.GetDexFile(), annotation_set, @@ -1434,6 +1443,7 @@ const char* GetSourceDebugExtension(Handle<mirror::Class> klass) { if (annotation_item == nullptr) { return nullptr; } + const uint8_t* annotation = SearchEncodedAnnotation(data.GetDexFile(), annotation_item->annotation_, "value"); if (annotation == nullptr) { diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc index a450a751b8..8b80f54880 100644 --- a/runtime/gc/collector/concurrent_copying.cc +++ b/runtime/gc/collector/concurrent_copying.cc @@ -616,25 +616,8 @@ void ConcurrentCopying::FlipThreadRoots() { ThreadFlipVisitor thread_flip_visitor(this, heap_->use_tlab_); FlipCallback flip_callback(this); - // This is the point where Concurrent-Copying will pause all threads. We report a pause here, if - // necessary. This is slightly over-reporting, as this includes the time to actually suspend - // threads. 
- { - GcPauseListener* pause_listener = GetHeap()->GetGcPauseListener(); - if (pause_listener != nullptr) { - pause_listener->StartPause(); - } - } - - size_t barrier_count = Runtime::Current()->FlipThreadRoots( - &thread_flip_visitor, &flip_callback, this); - - { - GcPauseListener* pause_listener = GetHeap()->GetGcPauseListener(); - if (pause_listener != nullptr) { - pause_listener->EndPause(); - } - } + size_t barrier_count = Runtime::Current()->GetThreadList()->FlipThreadRoots( + &thread_flip_visitor, &flip_callback, this, GetHeap()->GetGcPauseListener()); { ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun); diff --git a/runtime/oat.h b/runtime/oat.h index a38eebc188..b7c715cc03 100644 --- a/runtime/oat.h +++ b/runtime/oat.h @@ -32,7 +32,7 @@ class InstructionSetFeatures; class PACKED(4) OatHeader { public: static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' }; - static constexpr uint8_t kOatVersion[] = { '1', '2', '4', '\0' }; // New compiler filter names. + static constexpr uint8_t kOatVersion[] = { '1', '2', '5', '\0' }; // ARM Baker narrow thunks. static constexpr const char* kImageLocationKey = "image-location"; static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline"; @@ -175,6 +175,7 @@ class PACKED(4) OatMethodOffsets { ~OatMethodOffsets(); + OatMethodOffsets(const OatMethodOffsets&) = default; OatMethodOffsets& operator=(const OatMethodOffsets&) = default; uint32_t code_offset_; diff --git a/runtime/oat_quick_method_header.h b/runtime/oat_quick_method_header.h index f2a2af2a5f..152b0ba21b 100644 --- a/runtime/oat_quick_method_header.h +++ b/runtime/oat_quick_method_header.h @@ -54,6 +54,7 @@ class PACKED(4) OatQuickMethodHeader { return FromCodePointer(EntryPointToCodePointer(entry_point)); } + OatQuickMethodHeader(const OatQuickMethodHeader&) = default; OatQuickMethodHeader& operator=(const OatQuickMethodHeader&) = default; uintptr_t NativeQuickPcOffset(const uintptr_t pc) const { diff --git a/runtime/openjdkjvmti/OpenjdkJvmTi.cc b/runtime/openjdkjvmti/OpenjdkJvmTi.cc index 0921ceae05..9be486e269 100644 --- a/runtime/openjdkjvmti/OpenjdkJvmTi.cc +++ b/runtime/openjdkjvmti/OpenjdkJvmTi.cc @@ -1205,6 +1205,30 @@ class JvmtiFunctions { return error; } + error = add_extension( + reinterpret_cast<jvmtiExtensionFunction>(HeapExtensions::IterateThroughHeapExt), + "com.android.art.heap.iterate_through_heap_ext", + "Iterate through a heap. This is equivalent to the standard IterateThroughHeap function," + " except for additionally passing the heap id of the current object. The jvmtiHeapCallbacks" + " structure is reused, with the callbacks field overloaded to a signature of " + "jint (*)(jlong, jlong, jlong*, jint length, void*, jint).", + 4, + { // NOLINT [whitespace/braces] [4] + { "heap_filter", JVMTI_KIND_IN, JVMTI_TYPE_JINT, false}, + { "klass", JVMTI_KIND_IN, JVMTI_TYPE_JCLASS, true}, + { "callbacks", JVMTI_KIND_IN_PTR, JVMTI_TYPE_CVOID, false}, + { "user_data", JVMTI_KIND_IN_PTR, JVMTI_TYPE_CVOID, true} + }, + 3, + { // NOLINT [whitespace/braces] [4] + JVMTI_ERROR_MUST_POSSESS_CAPABILITY, + JVMTI_ERROR_INVALID_CLASS, + JVMTI_ERROR_NULL_POINTER + }); + if (error != ERR(NONE)) { + return error; + } + // Copy into output buffer. 
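To show how the extension registered above would be consumed, here is a hedged agent-side sketch (not part of this change; the helper and callback names are made up): it looks up com.android.art.heap.iterate_through_heap_ext through the standard JVMTI GetExtensionFunctions call and passes a callback matching the overloaded signature documented in the registration text.

// Illustrative JVMTI agent code; requires the can_tag_objects capability. Error handling and
// Deallocate() of the returned extension info storage are omitted for brevity.
#include <cstring>
#include <jvmti.h>

using IterateThroughHeapExtFn =
    jvmtiError (*)(jvmtiEnv*, jint, jclass, const jvmtiHeapCallbacks*, const void*);

// Matches the overloaded callback type: jint (*)(jlong, jlong, jlong*, jint, void*, jint).
static jint HeapIterationExtCallback(jlong class_tag, jlong size, jlong* tag_ptr,
                                     jint length, void* user_data, jint heap_id) {
  // Inspect the object here; heap_id distinguishes image, zygote and app heaps.
  return 0;  // Do not set JVMTI_VISIT_ABORT, so iteration continues.
}

static jvmtiError CallIterateThroughHeapExt(jvmtiEnv* env) {
  jint count = 0;
  jvmtiExtensionFunctionInfo* extensions = nullptr;
  jvmtiError error = env->GetExtensionFunctions(&count, &extensions);
  if (error != JVMTI_ERROR_NONE) {
    return error;
  }
  IterateThroughHeapExtFn iterate_ext = nullptr;
  for (jint i = 0; i < count; ++i) {
    if (std::strcmp(extensions[i].id, "com.android.art.heap.iterate_through_heap_ext") == 0) {
      iterate_ext = reinterpret_cast<IterateThroughHeapExtFn>(extensions[i].func);
    }
  }
  if (iterate_ext == nullptr) {
    return JVMTI_ERROR_NOT_AVAILABLE;
  }
  jvmtiHeapCallbacks callbacks;
  std::memset(&callbacks, 0, sizeof(callbacks));
  // The callbacks field is overloaded for this extension, hence the cast.
  callbacks.heap_iteration_callback =
      reinterpret_cast<jvmtiHeapIterationCallback>(HeapIterationExtCallback);
  return iterate_ext(env, /* heap_filter */ 0, /* klass */ nullptr, &callbacks,
                     /* user_data */ nullptr);
}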
*extension_count_ptr = ext_vector.size(); diff --git a/runtime/openjdkjvmti/ti_heap.cc b/runtime/openjdkjvmti/ti_heap.cc index 9b4dcaa9d0..99774c67b5 100644 --- a/runtime/openjdkjvmti/ti_heap.cc +++ b/runtime/openjdkjvmti/ti_heap.cc @@ -651,14 +651,17 @@ void HeapUtil::Unregister() { art::Runtime::Current()->RemoveSystemWeakHolder(&gIndexCachingTable); } +template <typename Callback> struct IterateThroughHeapData { - IterateThroughHeapData(HeapUtil* _heap_util, + IterateThroughHeapData(Callback _cb, + ObjectTagTable* _tag_table, jvmtiEnv* _env, art::ObjPtr<art::mirror::Class> klass, jint _heap_filter, const jvmtiHeapCallbacks* _callbacks, const void* _user_data) - : heap_util(_heap_util), + : cb(_cb), + tag_table(_tag_table), heap_filter(_heap_filter), filter_klass(klass), env(_env), @@ -667,95 +670,89 @@ struct IterateThroughHeapData { stop_reports(false) { } - HeapUtil* heap_util; - const HeapFilter heap_filter; - art::ObjPtr<art::mirror::Class> filter_klass; - jvmtiEnv* env; - const jvmtiHeapCallbacks* callbacks; - const void* user_data; - - bool stop_reports; -}; - -static void IterateThroughHeapObjectCallback(art::mirror::Object* obj, void* arg) - REQUIRES_SHARED(art::Locks::mutator_lock_) { - IterateThroughHeapData* ithd = reinterpret_cast<IterateThroughHeapData*>(arg); - // Early return, as we can't really stop visiting. - if (ithd->stop_reports) { - return; + static void ObjectCallback(art::mirror::Object* obj, void* arg) + REQUIRES_SHARED(art::Locks::mutator_lock_) { + IterateThroughHeapData* ithd = reinterpret_cast<IterateThroughHeapData*>(arg); + ithd->ObjectCallback(obj); } - art::ScopedAssertNoThreadSuspension no_suspension("IterateThroughHeapCallback"); + void ObjectCallback(art::mirror::Object* obj) + REQUIRES_SHARED(art::Locks::mutator_lock_) { + // Early return, as we can't really stop visiting. + if (stop_reports) { + return; + } - jlong tag = 0; - ithd->heap_util->GetTags()->GetTag(obj, &tag); + art::ScopedAssertNoThreadSuspension no_suspension("IterateThroughHeapCallback"); - jlong class_tag = 0; - art::ObjPtr<art::mirror::Class> klass = obj->GetClass(); - ithd->heap_util->GetTags()->GetTag(klass.Ptr(), &class_tag); - // For simplicity, even if we find a tag = 0, assume 0 = not tagged. + jlong tag = 0; + tag_table->GetTag(obj, &tag); - if (!ithd->heap_filter.ShouldReportByHeapFilter(tag, class_tag)) { - return; - } + jlong class_tag = 0; + art::ObjPtr<art::mirror::Class> klass = obj->GetClass(); + tag_table->GetTag(klass.Ptr(), &class_tag); + // For simplicity, even if we find a tag = 0, assume 0 = not tagged. 
- if (ithd->filter_klass != nullptr) { - if (ithd->filter_klass != klass) { + if (!heap_filter.ShouldReportByHeapFilter(tag, class_tag)) { return; } - } - jlong size = obj->SizeOf(); + if (filter_klass != nullptr) { + if (filter_klass != klass) { + return; + } + } - jint length = -1; - if (obj->IsArrayInstance()) { - length = obj->AsArray()->GetLength(); - } + jlong size = obj->SizeOf(); - jlong saved_tag = tag; - jint ret = ithd->callbacks->heap_iteration_callback(class_tag, - size, - &tag, - length, - const_cast<void*>(ithd->user_data)); + jint length = -1; + if (obj->IsArrayInstance()) { + length = obj->AsArray()->GetLength(); + } - if (tag != saved_tag) { - ithd->heap_util->GetTags()->Set(obj, tag); - } + jlong saved_tag = tag; + jint ret = cb(obj, callbacks, class_tag, size, &tag, length, const_cast<void*>(user_data)); - ithd->stop_reports = (ret & JVMTI_VISIT_ABORT) != 0; + if (tag != saved_tag) { + tag_table->Set(obj, tag); + } - if (!ithd->stop_reports) { - jint string_ret = ReportString(obj, - ithd->env, - ithd->heap_util->GetTags(), - ithd->callbacks, - ithd->user_data); - ithd->stop_reports = (string_ret & JVMTI_VISIT_ABORT) != 0; - } + stop_reports = (ret & JVMTI_VISIT_ABORT) != 0; - if (!ithd->stop_reports) { - jint array_ret = ReportPrimitiveArray(obj, - ithd->env, - ithd->heap_util->GetTags(), - ithd->callbacks, - ithd->user_data); - ithd->stop_reports = (array_ret & JVMTI_VISIT_ABORT) != 0; - } + if (!stop_reports) { + jint string_ret = ReportString(obj, env, tag_table, callbacks, user_data); + stop_reports = (string_ret & JVMTI_VISIT_ABORT) != 0; + } + + if (!stop_reports) { + jint array_ret = ReportPrimitiveArray(obj, env, tag_table, callbacks, user_data); + stop_reports = (array_ret & JVMTI_VISIT_ABORT) != 0; + } - if (!ithd->stop_reports) { - ithd->stop_reports = ReportPrimitiveField::Report(obj, - ithd->heap_util->GetTags(), - ithd->callbacks, - ithd->user_data); + if (!stop_reports) { + stop_reports = ReportPrimitiveField::Report(obj, tag_table, callbacks, user_data); + } } -} -jvmtiError HeapUtil::IterateThroughHeap(jvmtiEnv* env, - jint heap_filter, - jclass klass, - const jvmtiHeapCallbacks* callbacks, - const void* user_data) { + Callback cb; + ObjectTagTable* tag_table; + const HeapFilter heap_filter; + art::ObjPtr<art::mirror::Class> filter_klass; + jvmtiEnv* env; + const jvmtiHeapCallbacks* callbacks; + const void* user_data; + + bool stop_reports; +}; + +template <typename T> +static jvmtiError DoIterateThroughHeap(T fn, + jvmtiEnv* env, + ObjectTagTable* tag_table, + jint heap_filter, + jclass klass, + const jvmtiHeapCallbacks* callbacks, + const void* user_data) { if (callbacks == nullptr) { return ERR(NULL_POINTER); } @@ -763,16 +760,46 @@ jvmtiError HeapUtil::IterateThroughHeap(jvmtiEnv* env, art::Thread* self = art::Thread::Current(); art::ScopedObjectAccess soa(self); // Now we know we have the shared lock. 
- IterateThroughHeapData ithd(this, + using Iterator = IterateThroughHeapData<T>; + Iterator ithd(fn, + tag_table, + env, + soa.Decode<art::mirror::Class>(klass), + heap_filter, + callbacks, + user_data); + + art::Runtime::Current()->GetHeap()->VisitObjects(Iterator::ObjectCallback, &ithd); + + return ERR(NONE); +} + +jvmtiError HeapUtil::IterateThroughHeap(jvmtiEnv* env, + jint heap_filter, + jclass klass, + const jvmtiHeapCallbacks* callbacks, + const void* user_data) { + auto JvmtiIterateHeap = [](art::mirror::Object* obj ATTRIBUTE_UNUSED, + const jvmtiHeapCallbacks* cb_callbacks, + jlong class_tag, + jlong size, + jlong* tag, + jint length, + void* cb_user_data) + REQUIRES_SHARED(art::Locks::mutator_lock_) { + return cb_callbacks->heap_iteration_callback(class_tag, + size, + tag, + length, + cb_user_data); + }; + return DoIterateThroughHeap(JvmtiIterateHeap, env, - soa.Decode<art::mirror::Class>(klass), + ArtJvmTiEnv::AsArtJvmTiEnv(env)->object_tag_table.get(), heap_filter, + klass, callbacks, user_data); - - art::Runtime::Current()->GetHeap()->VisitObjects(IterateThroughHeapObjectCallback, &ithd); - - return ERR(NONE); } class FollowReferencesHelper FINAL { @@ -1406,6 +1433,33 @@ static constexpr jint kHeapIdImage = 1; static constexpr jint kHeapIdZygote = 2; static constexpr jint kHeapIdApp = 3; +static jint GetHeapId(art::ObjPtr<art::mirror::Object> obj) + REQUIRES_SHARED(art::Locks::mutator_lock_) { + if (obj == nullptr) { + return -1; + } + + art::gc::Heap* const heap = art::Runtime::Current()->GetHeap(); + const art::gc::space::ContinuousSpace* const space = + heap->FindContinuousSpaceFromObject(obj, true); + jint heap_type = kHeapIdApp; + if (space != nullptr) { + if (space->IsZygoteSpace()) { + heap_type = kHeapIdZygote; + } else if (space->IsImageSpace() && heap->ObjectIsInBootImageSpace(obj)) { + // Only count objects in the boot image as HPROF_HEAP_IMAGE, this leaves app image objects + // as HPROF_HEAP_APP. b/35762934 + heap_type = kHeapIdImage; + } + } else { + const auto* los = heap->GetLargeObjectsSpace(); + if (los->Contains(obj.Ptr()) && los->IsZygoteLargeObject(art::Thread::Current(), obj.Ptr())) { + heap_type = kHeapIdZygote; + } + } + return heap_type; +}; + jvmtiError HeapExtensions::GetObjectHeapId(jvmtiEnv* env, jlong tag, jint* heap_id, ...) { if (heap_id == nullptr) { return ERR(NULL_POINTER); @@ -1416,28 +1470,10 @@ jvmtiError HeapExtensions::GetObjectHeapId(jvmtiEnv* env, jlong tag, jint* heap_ auto work = [&]() REQUIRES_SHARED(art::Locks::mutator_lock_) { ObjectTagTable* tag_table = ArtJvmTiEnv::AsArtJvmTiEnv(env)->object_tag_table.get(); art::ObjPtr<art::mirror::Object> obj = tag_table->Find(tag); - if (obj == nullptr) { + jint heap_type = GetHeapId(obj); + if (heap_type == -1) { return ERR(NOT_FOUND); } - - art::gc::Heap* const heap = art::Runtime::Current()->GetHeap(); - const art::gc::space::ContinuousSpace* const space = - heap->FindContinuousSpaceFromObject(obj, true); - jint heap_type = kHeapIdApp; - if (space != nullptr) { - if (space->IsZygoteSpace()) { - heap_type = kHeapIdZygote; - } else if (space->IsImageSpace() && heap->ObjectIsInBootImageSpace(obj)) { - // Only count objects in the boot image as HPROF_HEAP_IMAGE, this leaves app image objects - // as HPROF_HEAP_APP. 
b/35762934 - heap_type = kHeapIdImage; - } - } else { - const auto* los = heap->GetLargeObjectsSpace(); - if (los->Contains(obj.Ptr()) && los->IsZygoteLargeObject(self, obj.Ptr())) { - heap_type = kHeapIdZygote; - } - } *heap_id = heap_type; return ERR(NONE); }; @@ -1491,4 +1527,36 @@ jvmtiError HeapExtensions::GetHeapName(jvmtiEnv* env, jint heap_id, char** heap_ } } +jvmtiError HeapExtensions::IterateThroughHeapExt(jvmtiEnv* env, + jint heap_filter, + jclass klass, + const jvmtiHeapCallbacks* callbacks, + const void* user_data) { + if (ArtJvmTiEnv::AsArtJvmTiEnv(env)->capabilities.can_tag_objects != 1) { \ + return ERR(MUST_POSSESS_CAPABILITY); \ + } + + // ART extension API: Also pass the heap id. + auto ArtIterateHeap = [](art::mirror::Object* obj, + const jvmtiHeapCallbacks* cb_callbacks, + jlong class_tag, + jlong size, + jlong* tag, + jint length, + void* cb_user_data) + REQUIRES_SHARED(art::Locks::mutator_lock_) { + jint heap_id = GetHeapId(obj); + using ArtExtensionAPI = jint (*)(jlong, jlong, jlong*, jint length, void*, jint); + return reinterpret_cast<ArtExtensionAPI>(cb_callbacks->heap_iteration_callback)( + class_tag, size, tag, length, cb_user_data, heap_id); + }; + return DoIterateThroughHeap(ArtIterateHeap, + env, + ArtJvmTiEnv::AsArtJvmTiEnv(env)->object_tag_table.get(), + heap_filter, + klass, + callbacks, + user_data); +} + } // namespace openjdkjvmti diff --git a/runtime/openjdkjvmti/ti_heap.h b/runtime/openjdkjvmti/ti_heap.h index b4b71ba88e..0c973db199 100644 --- a/runtime/openjdkjvmti/ti_heap.h +++ b/runtime/openjdkjvmti/ti_heap.h @@ -60,6 +60,12 @@ class HeapExtensions { public: static jvmtiError JNICALL GetObjectHeapId(jvmtiEnv* env, jlong tag, jint* heap_id, ...); static jvmtiError JNICALL GetHeapName(jvmtiEnv* env, jint heap_id, char** heap_name, ...); + + static jvmtiError JNICALL IterateThroughHeapExt(jvmtiEnv* env, + jint heap_filter, + jclass klass, + const jvmtiHeapCallbacks* callbacks, + const void* user_data); }; } // namespace openjdkjvmti diff --git a/runtime/runtime.cc b/runtime/runtime.cc index 60fa0828a0..0bc0869044 100644 --- a/runtime/runtime.cc +++ b/runtime/runtime.cc @@ -1816,11 +1816,6 @@ void Runtime::VisitThreadRoots(RootVisitor* visitor, VisitRootFlags flags) { thread_list_->VisitRoots(visitor, flags); } -size_t Runtime::FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback, - gc::collector::GarbageCollector* collector) { - return thread_list_->FlipThreadRoots(thread_flip_visitor, flip_callback, collector); -} - void Runtime::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) { VisitNonConcurrentRoots(visitor, flags); VisitConcurrentRoots(visitor, flags); diff --git a/runtime/runtime.h b/runtime/runtime.h index a2505e2292..4931382e55 100644 --- a/runtime/runtime.h +++ b/runtime/runtime.h @@ -48,9 +48,6 @@ namespace art { namespace gc { class AbstractSystemWeakHolder; class Heap; - namespace collector { - class GarbageCollector; - } // namespace collector } // namespace gc namespace jit { @@ -79,7 +76,6 @@ class ArenaPool; class ArtMethod; class ClassHierarchyAnalysis; class ClassLinker; -class Closure; class CompilerCallbacks; class DexFile; class InternTable; @@ -340,11 +336,6 @@ class Runtime { void VisitTransactionRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_); - // Flip thread roots from from-space refs to to-space refs. 
- size_t FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback, - gc::collector::GarbageCollector* collector) - REQUIRES(!Locks::mutator_lock_); - // Sweep system weaks, the system weak is deleted if the visitor return null. Otherwise, the // system weak is updated to be the visitor's returned value. void SweepSystemWeaks(IsMarkedVisitor* visitor) @@ -948,7 +939,8 @@ class Runtime { std::unique_ptr<RuntimeCallbacks> callbacks_; - std::atomic<uint32_t> deoptimization_counts_[static_cast<uint32_t>(DeoptimizationKind::kLast)]; + std::atomic<uint32_t> deoptimization_counts_[ + static_cast<uint32_t>(DeoptimizationKind::kLast) + 1]; DISALLOW_COPY_AND_ASSIGN(Runtime); }; diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc index b63eaa40ef..dc2af2ae34 100644 --- a/runtime/thread_list.cc +++ b/runtime/thread_list.cc @@ -34,6 +34,7 @@ #include "base/timing_logger.h" #include "debugger.h" #include "gc/collector/concurrent_copying.h" +#include "gc/gc_pause_listener.h" #include "gc/reference_processor.h" #include "jni_internal.h" #include "lock_word.h" @@ -528,7 +529,8 @@ size_t ThreadList::RunCheckpointOnRunnableThreads(Closure* checkpoint_function) // invariant. size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback, - gc::collector::GarbageCollector* collector) { + gc::collector::GarbageCollector* collector, + gc::GcPauseListener* pause_listener) { TimingLogger::ScopedTiming split("ThreadListFlip", collector->GetTimings()); Thread* self = Thread::Current(); Locks::mutator_lock_->AssertNotHeld(self); @@ -542,6 +544,9 @@ size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, // pause. const uint64_t suspend_start_time = NanoTime(); SuspendAllInternal(self, self, nullptr); + if (pause_listener != nullptr) { + pause_listener->StartPause(); + } // Run the flip callback for the collector. Locks::mutator_lock_->ExclusiveLock(self); @@ -549,6 +554,9 @@ size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, flip_callback->Run(self); Locks::mutator_lock_->ExclusiveUnlock(self); collector->RegisterPause(NanoTime() - suspend_start_time); + if (pause_listener != nullptr) { + pause_listener->EndPause(); + } // Resume runnable threads. size_t runnable_thread_count = 0; diff --git a/runtime/thread_list.h b/runtime/thread_list.h index 14bef5e2b9..337574603b 100644 --- a/runtime/thread_list.h +++ b/runtime/thread_list.h @@ -35,6 +35,7 @@ namespace gc { namespace collector { class GarbageCollector; } // namespac collector + class GcPauseListener; } // namespace gc class Closure; class Thread; @@ -121,7 +122,8 @@ class ThreadList { // the concurrent copying collector. size_t FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback, - gc::collector::GarbageCollector* collector) + gc::collector::GarbageCollector* collector, + gc::GcPauseListener* pause_listener) REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); diff --git a/test/115-native-bridge/expected.txt b/test/115-native-bridge/expected.txt index 852ec2e5e9..9c64111027 100644 --- a/test/115-native-bridge/expected.txt +++ b/test/115-native-bridge/expected.txt @@ -62,3 +62,8 @@ trampoline_Java_Main_testNewStringObject called! Getting trampoline for Java_Main_testSignal with shorty I. NB signal handler with signal 11. NB signal handler with signal 4. +Loading invalid library 'libinvalid.so' from Java, which will fail. +Checking for support. +Was to load 'libinvalid.so', force fail. +getError() in native bridge. 
+Catch UnsatisfiedLinkError exception as expected. diff --git a/test/115-native-bridge/nativebridge.cc b/test/115-native-bridge/nativebridge.cc index 87287f8acf..b3b89491bf 100644 --- a/test/115-native-bridge/nativebridge.cc +++ b/test/115-native-bridge/nativebridge.cc @@ -285,6 +285,10 @@ extern "C" bool native_bridge_initialize(const android::NativeBridgeRuntimeCallb } extern "C" void* native_bridge_loadLibrary(const char* libpath, int flag) { + if (strstr(libpath, "libinvalid.so") != nullptr) { + printf("Was to load 'libinvalid.so', force fail.\n"); + return nullptr; + } size_t len = strlen(libpath); char* tmp = new char[len + 10]; strncpy(tmp, libpath, len); @@ -300,7 +304,7 @@ extern "C" void* native_bridge_loadLibrary(const char* libpath, int flag) { printf("Handle = nullptr!\n"); printf("Was looking for %s.\n", libpath); printf("Error = %s.\n", dlerror()); - char cwd[1024]; + char cwd[1024] = {'\0'}; if (getcwd(cwd, sizeof(cwd)) != nullptr) { printf("Current working dir: %s\n", cwd); } @@ -437,8 +441,8 @@ extern "C" int native_bridge_unloadLibrary(void* handle ATTRIBUTE_UNUSED) { } extern "C" const char* native_bridge_getError() { - printf("dlerror() in native bridge.\n"); - return nullptr; + printf("getError() in native bridge.\n"); + return ""; } extern "C" bool native_bridge_isPathSupported(const char* library_path ATTRIBUTE_UNUSED) { diff --git a/test/115-native-bridge/run b/test/115-native-bridge/run index 9290dd3cf4..22f5c67ddc 100644 --- a/test/115-native-bridge/run +++ b/test/115-native-bridge/run @@ -23,6 +23,7 @@ LIBPATH=${LIBPATH##*:} ln -sf ${LIBPATH}/libnativebridgetest.so . touch libarttest.so touch libarttestd.so +touch libinvalid.so ln -sf ${LIBPATH}/libarttest.so libarttest2.so ln -sf ${LIBPATH}/libarttestd.so libarttestd2.so diff --git a/test/115-native-bridge/src/NativeBridgeMain.java b/test/115-native-bridge/src/NativeBridgeMain.java index c298b1b772..e8d1e4e326 100644 --- a/test/115-native-bridge/src/NativeBridgeMain.java +++ b/test/115-native-bridge/src/NativeBridgeMain.java @@ -16,6 +16,7 @@ import java.lang.reflect.Method; import java.lang.System; +import java.lang.Exception; // This is named Main as it is a copy of JniTest, so that we can re-use the native implementations // from libarttest. @@ -33,6 +34,7 @@ class Main { testEnvironment(); testNewStringObject(); testSignalHandler(); + testGetErrorByLoadInvalidLibrary(); } public static native void testFindClassOnAttachedNativeThread(); @@ -183,6 +185,20 @@ class Main { } private static native int testSignal(); + + // Test the path from Java to getError() of NativeBridge. + // + // Load invalid library 'libinvalid.so' from Java. Library loading will fail since it's + // invalid (empty file). ART, NativeLoader actually, calls getError() to dump error message. + // After that in Java, catch UnsatisfiedLinkError exception to confirm. 
+ private static void testGetErrorByLoadInvalidLibrary() { + System.out.println("Loading invalid library 'libinvalid.so' from Java, which will fail."); + try { + System.loadLibrary("invalid"); + } catch (java.lang.UnsatisfiedLinkError e){ + System.out.println("Catch UnsatisfiedLinkError exception as expected."); + } + } } public class NativeBridgeMain { diff --git a/test/409-materialized-condition/src/Main.java b/test/409-materialized-condition/src/Main.java index 8a814a2da1..0c179a99de 100644 --- a/test/409-materialized-condition/src/Main.java +++ b/test/409-materialized-condition/src/Main.java @@ -50,36 +50,6 @@ public class Main { return b; } - public static boolean $noinline$intEq0(int x) { - return x == 0; - } - - public static boolean $noinline$intNe0(int x) { - return x != 0; - } - - public static boolean $noinline$longEq0(long x) { - return x == 0; - } - - public static boolean $noinline$longNe0(long x) { - return x != 0; - } - - public static boolean $noinline$longEqCst(long x) { - return x == 0x0123456789ABCDEFL; - } - - public static boolean $noinline$longNeCst(long x) { - return x != 0x0123456789ABCDEFL; - } - - public static void assertEqual(boolean expected, boolean actual) { - if (expected != actual) { - throw new Error("Assertion failed: " + expected + " != " + actual); - } - } - public static void main(String[] args) { System.out.println("foo1"); int res = foo1(); @@ -92,46 +62,5 @@ public class Main { if (res != 42) { throw new Error("Unexpected return value for foo2: " + res + ", expected 42."); } - - int[] int_inputs = {0, 1, -1, Integer.MIN_VALUE, Integer.MAX_VALUE, 42, -9000}; - long[] long_inputs = { - 0L, 1L, -1L, Long.MIN_VALUE, Long.MAX_VALUE, 0x100000000L, - 0x100000001L, -9000L, 0x0123456789ABCDEFL}; - - boolean[] int_eq_0_expected = {true, false, false, false, false, false, false}; - - for (int i = 0; i < int_inputs.length; i++) { - assertEqual(int_eq_0_expected[i], $noinline$intEq0(int_inputs[i])); - } - - boolean[] int_ne_0_expected = {false, true, true, true, true, true, true}; - - for (int i = 0; i < int_inputs.length; i++) { - assertEqual(int_ne_0_expected[i], $noinline$intNe0(int_inputs[i])); - } - - boolean[] long_eq_0_expected = {true, false, false, false, false, false, false, false, false}; - - for (int i = 0; i < long_inputs.length; i++) { - assertEqual(long_eq_0_expected[i], $noinline$longEq0(long_inputs[i])); - } - - boolean[] long_ne_0_expected = {false, true, true, true, true, true, true, true, true}; - - for (int i = 0; i < long_inputs.length; i++) { - assertEqual(long_ne_0_expected[i], $noinline$longNe0(long_inputs[i])); - } - - boolean[] long_eq_cst_expected = {false, false, false, false, false, false, false, false, true}; - - for (int i = 0; i < long_inputs.length; i++) { - assertEqual(long_eq_cst_expected[i], $noinline$longEqCst(long_inputs[i])); - } - - boolean[] long_ne_cst_expected = {true, true, true, true, true, true, true, true, false}; - - for (int i = 0; i < long_inputs.length; i++) { - assertEqual(long_ne_cst_expected[i], $noinline$longNeCst(long_inputs[i])); - } } } diff --git a/test/527-checker-array-access-simd/expected.txt b/test/527-checker-array-access-simd/expected.txt new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/test/527-checker-array-access-simd/expected.txt diff --git a/test/527-checker-array-access-simd/info.txt b/test/527-checker-array-access-simd/info.txt new file mode 100644 index 0000000000..f147943043 --- /dev/null +++ b/test/527-checker-array-access-simd/info.txt @@ -0,0 +1 @@ +Test arm- and 
arm64-specific array access optimization for simd loops. diff --git a/test/527-checker-array-access-simd/src/Main.java b/test/527-checker-array-access-simd/src/Main.java new file mode 100644 index 0000000000..8af5465faf --- /dev/null +++ b/test/527-checker-array-access-simd/src/Main.java @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class Main { + + public static void assertIntEquals(int expected, int result) { + if (expected != result) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } + + /// CHECK-START-ARM64: void Main.checkIntCase(int[]) instruction_simplifier_arm64 (before) + /// CHECK-DAG: <<Array:l\d+>> ParameterValue + /// CHECK-DAG: <<Const5:i\d+>> IntConstant 5 + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Const5>>] + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array>>,<<Index>>] + /// CHECK-DAG: <<Add:d\d+>> VecAdd [<<Load>>,<<Repl>>] + /// CHECK-DAG: VecStore [<<Array>>,<<Index>>,<<Add>>] + + /// CHECK-START-ARM64: void Main.checkIntCase(int[]) instruction_simplifier_arm64 (after) + /// CHECK-DAG: <<Array:l\d+>> ParameterValue + /// CHECK-DAG: <<Const5:i\d+>> IntConstant 5 + /// CHECK-DAG: <<DataOffset:i\d+>> IntConstant 12 + /// CHECK-DAG: <<Const2:i\d+>> IntConstant 2 + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Const5>>] + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Address1:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>] + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array>>,<<Address1>>] + /// CHECK-DAG: <<Add:d\d+>> VecAdd [<<Load>>,<<Repl>>] + /// CHECK-DAG: <<Address2:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>] + /// CHECK-DAG: VecStore [<<Array>>,<<Address2>>,<<Add>>] + + /// CHECK-START-ARM64: void Main.checkIntCase(int[]) GVN$after_arch (after) + /// CHECK-DAG: <<Array:l\d+>> ParameterValue + /// CHECK-DAG: <<Const5:i\d+>> IntConstant 5 + /// CHECK-DAG: <<DataOffset:i\d+>> IntConstant 12 + /// CHECK-DAG: <<Const2:i\d+>> IntConstant 2 + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Const5>>] + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Address1:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>] + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array>>,<<Address1>>] + /// CHECK-DAG: <<Add:d\d+>> VecAdd [<<Load>>,<<Repl>>] + /// CHECK-NOT: IntermediateAddress + /// CHECK-DAG: VecStore [<<Array>>,<<Address1>>,<<Add>>] + + /// CHECK-START-ARM64: void Main.checkIntCase(int[]) disassembly (after) + /// CHECK: IntermediateAddressIndex + /// CHECK-NEXT: add w{{[0-9]+}}, w{{[0-9]+}}, w{{[0-9]+}}, lsl #2 + public static void checkIntCase(int[] a) { + for (int i = 0; i < 128; i++) { + a[i] += 5; + } + } + + /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) instruction_simplifier_arm64 
(before) + /// CHECK-DAG: <<Array:l\d+>> ParameterValue + /// CHECK-DAG: <<Const5:i\d+>> IntConstant 5 + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Const5>>] + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array>>,<<Index>>] + /// CHECK-DAG: <<Add:d\d+>> VecAdd [<<Load>>,<<Repl>>] + /// CHECK-DAG: VecStore [<<Array>>,<<Index>>,<<Add>>] + + /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) instruction_simplifier_arm64 (after) + /// CHECK-DAG: <<Array:l\d+>> ParameterValue + /// CHECK-DAG: <<Const0:i\d+>> IntConstant 0 + /// CHECK-DAG: <<Const5:i\d+>> IntConstant 5 + /// CHECK-DAG: <<DataOffset:i\d+>> IntConstant 12 + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Const5>>] + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Address1:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>] + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array>>,<<Address1>>] + /// CHECK-DAG: <<Add:d\d+>> VecAdd [<<Load>>,<<Repl>>] + /// CHECK-DAG: <<Address2:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>] + /// CHECK-DAG: VecStore [<<Array>>,<<Address2>>,<<Add>>] + + /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) GVN$after_arch (after) + /// CHECK-DAG: <<Array:l\d+>> ParameterValue + /// CHECK-DAG: <<Const0:i\d+>> IntConstant 0 + /// CHECK-DAG: <<Const5:i\d+>> IntConstant 5 + /// CHECK-DAG: <<DataOffset:i\d+>> IntConstant 12 + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Const5>>] + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Address1:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const0>>] + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array>>,<<Address1>>] + /// CHECK-DAG: <<Add:d\d+>> VecAdd [<<Load>>,<<Repl>>] + /// CHECK-NOT: IntermediateAddress + /// CHECK-DAG: VecStore [<<Array>>,<<Address1>>,<<Add>>] + + /// CHECK-START-ARM64: void Main.checkByteCase(byte[]) disassembly (after) + /// CHECK: IntermediateAddressIndex + /// CHECK-NEXT: add w{{[0-9]+}}, w{{[0-9]+}}, #0x{{[0-9a-fA-F]+}} + /// CHECK: VecLoad + /// CHECK-NEXT: ldr q{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}] + /// CHECK: VecStore + /// CHECK-NEXT: str q{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}] + public static void checkByteCase(byte[] a) { + for (int i = 0; i < 128; i++) { + a[i] += 5; + } + } + + /// CHECK-START-ARM64: void Main.checkSingleAccess(int[]) instruction_simplifier_arm64 (before) + /// CHECK-DAG: <<Array:l\d+>> ParameterValue + /// CHECK-DAG: <<Const5:i\d+>> IntConstant 5 + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Const5>>] + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: VecStore [<<Array>>,<<Index>>,<<Repl>>] + + /// CHECK-START-ARM64: void Main.checkSingleAccess(int[]) instruction_simplifier_arm64 (after) + /// CHECK-DAG: <<Array:l\d+>> ParameterValue + /// CHECK-DAG: <<Const0:i\d+>> IntConstant 0 + /// CHECK-DAG: <<Const5:i\d+>> IntConstant 5 + /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<Const5>>] + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: VecStore [<<Array>>,<<Index>>,<<Repl>>] + /// CHECK-NOT: IntermediateAddress + public static void checkSingleAccess(int[] a) { + for (int i = 0; i < 128; i++) { + a[i] = 5; + } + } + + /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) instruction_simplifier_arm64 (before) + /// CHECK-DAG: <<Array1:l\d+>> ParameterValue + /// 
CHECK-DAG: <<Array2:l\d+>> ParameterValue + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array1>>,<<Index>>] + /// CHECK-DAG: <<Cnv:d\d+>> VecCnv [<<Load>>] + /// CHECK-DAG: VecStore [<<Array2>>,<<Index>>,<<Cnv>>] + + /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) instruction_simplifier_arm64 (after) + /// CHECK-DAG: <<Array1:l\d+>> ParameterValue + /// CHECK-DAG: <<Array2:l\d+>> ParameterValue + /// CHECK-DAG: <<DataOffset:i\d+>> IntConstant 12 + /// CHECK-DAG: <<Const2:i\d+>> IntConstant 2 + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Address1:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>] + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array1>>,<<Address1>>] + /// CHECK-DAG: <<Cnv:d\d+>> VecCnv [<<Load>>] + /// CHECK-DAG: <<Address2:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>] + /// CHECK-DAG: VecStore [<<Array2>>,<<Address2>>,<<Cnv>>] + + /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) GVN$after_arch (after) + /// CHECK-DAG: <<Array1:l\d+>> ParameterValue + /// CHECK-DAG: <<Array2:l\d+>> ParameterValue + /// CHECK-DAG: <<DataOffset:i\d+>> IntConstant 12 + /// CHECK-DAG: <<Const2:i\d+>> IntConstant 2 + // -------------- Loop + /// CHECK-DAG: <<Index:i\d+>> Phi + /// CHECK-DAG: If + /// CHECK-DAG: <<Address1:i\d+>> IntermediateAddressIndex [<<Index>>,<<DataOffset>>,<<Const2>>] + /// CHECK-DAG: <<Load:d\d+>> VecLoad [<<Array1>>,<<Address1>>] + /// CHECK-DAG: <<Cnv:d\d+>> VecCnv [<<Load>>] + /// CHECK-NOT: IntermediateAddress + /// CHECK-DAG: VecStore [<<Array2>>,<<Address1>>,<<Cnv>>] + + /// CHECK-START-ARM64: void Main.checkInt2Float(int[], float[]) disassembly (after) + /// CHECK: IntermediateAddressIndex + /// CHECK-NEXT: add w{{[0-9]+}}, w{{[0-9]+}}, w{{[0-9]+}}, lsl #2 + public static void checkInt2Float(int[] a, float[] b) { + for (int i = 0; i < 128; i++) { + b[i] = (float) a[i]; + } + } + + public static final int ARRAY_SIZE = 1024; + + public static int calcArraySum(int[] a, byte[] b, float[] c) { + int sum = 0; + for (int i = 0; i < 128; i++) { + sum += a[i] + b[i] + (int) c[i]; + } + return sum; + } + + public static void main(String[] args) { + byte[] ba = new byte[ARRAY_SIZE]; + int[] ia = new int[ARRAY_SIZE]; + float[] fa = new float[ARRAY_SIZE]; + + checkSingleAccess(ia); + checkIntCase(ia); + checkByteCase(ba); + checkInt2Float(ia, fa); + + assertIntEquals(3200, calcArraySum(ia, ba, fa)); + } +} diff --git a/test/570-checker-select/src/Main.java b/test/570-checker-select/src/Main.java index 2dad14ce31..3ac6f89c5f 100644 --- a/test/570-checker-select/src/Main.java +++ b/test/570-checker-select/src/Main.java @@ -414,46 +414,6 @@ public class Main { return a > 0x7FFFFFFFFFFFFFFFL ? x : y; } - /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar4(long, long, long) disassembly (after) - /// CHECK: Select - /// CHECK-NEXT: orrs ip, {{r\d+}}, {{r\d+}} - /// CHECK-NOT: cmp - /// CHECK-NOT: sbcs - - public static long $noinline$LongNonmatCondCst_LongVarVar4(long a, long x, long y) { - return a == 0 ? x : y; - } - - /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar5(long, long, long) disassembly (after) - /// CHECK: Select - /// CHECK-NEXT: orrs ip, {{r\d+}}, {{r\d+}} - /// CHECK-NOT: cmp - /// CHECK-NOT: sbcs - - public static long $noinline$LongNonmatCondCst_LongVarVar5(long a, long x, long y) { - return a != 0 ? 
x : y; - } - - /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar6(long, long, long) disassembly (after) - /// CHECK: Select - /// CHECK-NEXT: cmp {{r\d+}}, #0 - /// CHECK-NOT: cmp - /// CHECK-NOT: sbcs - - public static long $noinline$LongNonmatCondCst_LongVarVar6(long a, long x, long y) { - return a >= 0 ? x : y; - } - - /// CHECK-START-ARM: long Main.$noinline$LongNonmatCondCst_LongVarVar7(long, long, long) disassembly (after) - /// CHECK: Select - /// CHECK-NEXT: cmp {{r\d+}}, #0 - /// CHECK-NOT: cmp - /// CHECK-NOT: sbcs - - public static long $noinline$LongNonmatCondCst_LongVarVar7(long a, long x, long y) { - return a < 0 ? x : y; - } - /// CHECK-START: long Main.LongMatCond_LongVarVar(long, long, long, long) register (after) /// CHECK: <<Cond:z\d+>> LessThanOrEqual [{{j\d+}},{{j\d+}}] /// CHECK: <<Sel1:j\d+>> Select [{{j\d+}},{{j\d+}},<<Cond>>] @@ -728,37 +688,6 @@ public class Main { assertEqual(7L, $noinline$LongNonmatCondCst_LongVarVar3(2L, 5L, 7L)); - long[] long_inputs = { - 0L, 1L, -1L, Long.MIN_VALUE, Long.MAX_VALUE, 2L, 0x100000000L, 0xFFFFFFFF00000000L, -9000L}; - - long[] expected_1 = {5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L}; - - for (int i = 0; i < long_inputs.length; i++) { - assertEqual(expected_1[i], $noinline$LongNonmatCondCst_LongVarVar4(long_inputs[i], 5L, 7L)); - } - - long[] expected_2 = {7L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L}; - - for (int i = 0; i < long_inputs.length; i++) { - assertEqual(expected_2[i], $noinline$LongNonmatCondCst_LongVarVar5(long_inputs[i], 5L, 7L)); - } - - long[] expected_3 = {5L, 5L, 7L, 7L, 5L, 5L, 5L, 7L, 7L}; - - for (int i = 0; i < long_inputs.length; i++) { - assertEqual(expected_3[i], $noinline$LongNonmatCondCst_LongVarVar6(long_inputs[i], 5L, 7L)); - } - - long[] expected_4 = {7L, 7L, 5L, 5L, 7L, 7L, 7L, 5L, 5L}; - - for (int i = 0; i < long_inputs.length; i++) { - assertEqual(expected_4[i], $noinline$LongNonmatCondCst_LongVarVar7(long_inputs[i], 5L, 7L)); - } - - assertEqual(7L, $noinline$LongNonmatCondCst_LongVarVar7(0L, 5L, 7L)); - assertEqual(7L, $noinline$LongNonmatCondCst_LongVarVar7(2L, 5L, 7L)); - assertEqual(5L, $noinline$LongNonmatCondCst_LongVarVar7(-9000L, 5L, 7L)); - assertEqual(5, FloatLtNonmatCond_IntVarVar(3, 2, 5, 7)); assertEqual(7, FloatLtNonmatCond_IntVarVar(2, 3, 5, 7)); assertEqual(7, FloatLtNonmatCond_IntVarVar(Float.NaN, 2, 5, 7)); diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc index f39c5f16d7..ec36cebd43 100644 --- a/test/913-heaps/heaps.cc +++ b/test/913-heaps/heaps.cc @@ -823,6 +823,14 @@ static GetObjectHeapId gGetObjectHeapIdFn = nullptr; using GetHeapName = jvmtiError(*)(jvmtiEnv*, jint, char**, ...); static GetHeapName gGetHeapNameFn = nullptr; +using IterateThroughHeapExt = jvmtiError(*)(jvmtiEnv*, + jint, + jclass, + const jvmtiHeapCallbacks*, + const void*); +static IterateThroughHeapExt gIterateThroughHeapExt = nullptr; + + static void FreeExtensionFunctionInfo(jvmtiExtensionFunctionInfo* extensions, jint count) { for (size_t i = 0; i != static_cast<size_t>(count); ++i) { jvmti_env->Deallocate(reinterpret_cast<unsigned char*>(extensions[i].id)); @@ -886,6 +894,38 @@ extern "C" JNIEXPORT void JNICALL Java_art_Test913_checkForExtensionApis( CHECK(extensions[i].errors != nullptr); CHECK(extensions[i].errors[0] == JVMTI_ERROR_ILLEGAL_ARGUMENT); } + + if (strcmp("com.android.art.heap.iterate_through_heap_ext", extensions[i].id) == 0) { + CHECK(gIterateThroughHeapExt == nullptr); + gIterateThroughHeapExt = reinterpret_cast<IterateThroughHeapExt>(extensions[i].func); + + 
CHECK_EQ(extensions[i].param_count, 4); + + CHECK_EQ(strcmp("heap_filter", extensions[i].params[0].name), 0); + CHECK_EQ(extensions[i].params[0].base_type, JVMTI_TYPE_JINT); + CHECK_EQ(extensions[i].params[0].kind, JVMTI_KIND_IN); + + CHECK_EQ(strcmp("klass", extensions[i].params[1].name), 0); + CHECK_EQ(extensions[i].params[1].base_type, JVMTI_TYPE_JCLASS); + CHECK_EQ(extensions[i].params[1].kind, JVMTI_KIND_IN); + CHECK_EQ(extensions[i].params[1].null_ok, true); + + CHECK_EQ(strcmp("callbacks", extensions[i].params[2].name), 0); + CHECK_EQ(extensions[i].params[2].base_type, JVMTI_TYPE_CVOID); + CHECK_EQ(extensions[i].params[2].kind, JVMTI_KIND_IN_PTR); + CHECK_EQ(extensions[i].params[2].null_ok, false); + + CHECK_EQ(strcmp("user_data", extensions[i].params[3].name), 0); + CHECK_EQ(extensions[i].params[3].base_type, JVMTI_TYPE_CVOID); + CHECK_EQ(extensions[i].params[3].kind, JVMTI_KIND_IN_PTR); + CHECK_EQ(extensions[i].params[3].null_ok, true); + + CHECK_EQ(extensions[i].error_count, 3); + CHECK(extensions[i].errors != nullptr); + CHECK(extensions[i].errors[0] == JVMTI_ERROR_MUST_POSSESS_CAPABILITY); + CHECK(extensions[i].errors[1] == JVMTI_ERROR_INVALID_CLASS); + CHECK(extensions[i].errors[2] == JVMTI_ERROR_NULL_POINTER); + } } CHECK(gGetObjectHeapIdFn != nullptr); @@ -1004,5 +1044,39 @@ extern "C" JNIEXPORT void JNICALL Java_art_Test913_checkGetObjectHeapIdInCallbac } } +static bool gFoundExt = false; + +static jint JNICALL HeapIterationExtCallback(jlong class_tag ATTRIBUTE_UNUSED, + jlong size ATTRIBUTE_UNUSED, + jlong* tag_ptr, + jint length ATTRIBUTE_UNUSED, + void* user_data ATTRIBUTE_UNUSED, + jint heap_id) { + // We expect some tagged objects at or above the threshold, where the expected heap id is + // encoded into lowest byte. + constexpr jlong kThreshold = 30000000; + jlong tag = *tag_ptr; + if (tag >= kThreshold) { + jint expected_heap_id = static_cast<jint>(tag - kThreshold); + CHECK_EQ(expected_heap_id, heap_id); + gFoundExt = true; + } + return 0; +} + +extern "C" JNIEXPORT void JNICALL Java_art_Test913_iterateThroughHeapExt( + JNIEnv* env, jclass klass ATTRIBUTE_UNUSED) { + CHECK(gIterateThroughHeapExt != nullptr); + + jvmtiHeapCallbacks callbacks; + memset(&callbacks, 0, sizeof(jvmtiHeapCallbacks)); + callbacks.heap_iteration_callback = + reinterpret_cast<decltype(callbacks.heap_iteration_callback)>(HeapIterationExtCallback); + + jvmtiError ret = gIterateThroughHeapExt(jvmti_env, 0, nullptr, &callbacks, nullptr); + JvmtiErrorToException(env, jvmti_env, ret); + CHECK(gFoundExt); +} + } // namespace Test913Heaps } // namespace art diff --git a/test/913-heaps/src/art/Test913.java b/test/913-heaps/src/art/Test913.java index 6694aad868..97f48eea03 100644 --- a/test/913-heaps/src/art/Test913.java +++ b/test/913-heaps/src/art/Test913.java @@ -261,6 +261,15 @@ public class Test913 { checkGetObjectHeapIdInCallback(100000, objClassExpectedHeapId); checkGetObjectHeapIdInCallback(100001, 3); + long baseTag = 30000000; + setTag(Object.class, baseTag + objClassExpectedHeapId); + setTag(Class.class, baseTag + objClassExpectedHeapId); + Object o = new Object(); + extensionTestHolder.add(o); + setTag(o, baseTag + 3); + + iterateThroughHeapExt(); + extensionTestHolder = null; } @@ -719,4 +728,6 @@ public class Test913 { public static native String[] followReferencesString(Object initialObject); public static native String followReferencesPrimitiveArray(Object initialObject); public static native String followReferencesPrimitiveFields(Object initialObject); + + private static native void 
iterateThroughHeapExt(); }
diff --git a/test/testrunner/testrunner.py b/test/testrunner/testrunner.py
index 9a437cc822..c99159f1ae 100755
--- a/test/testrunner/testrunner.py
+++ b/test/testrunner/testrunner.py
@@ -828,7 +828,15 @@ def get_default_threads(target):
     adb_command = 'adb shell cat /sys/devices/system/cpu/present'
     cpu_info_proc = subprocess.Popen(adb_command.split(), stdout=subprocess.PIPE)
     cpu_info = cpu_info_proc.stdout.read()
-    return int(cpu_info.split('-')[1])
+    if type(cpu_info) is bytes:
+      cpu_info = cpu_info.decode('utf-8')
+    cpu_info_regex = '\d*-(\d*)'
+    match = re.match(cpu_info_regex, cpu_info)
+    if match:
+      return int(match.group(1))
+    else:
+      raise ValueError('Unable to predict the concurrency for the target. '
+                       'Is device connected?')
   else:
     return multiprocessing.cpu_count()
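
For reference, a minimal standalone sketch of the target-concurrency detection that the testrunner.py hunk above implements. The helper name guess_target_threads, the raw-string regex, and the sample sysfs output "0-7" are illustrative assumptions, not part of the change; only the adb command, the bytes-decoding step, the regex shape, and the error message come from the patch.

import re
import subprocess

def guess_target_threads():
    # Read the present-CPU range from the device, e.g. b"0-7\n" on an 8-core target.
    raw = subprocess.Popen(['adb', 'shell', 'cat', '/sys/devices/system/cpu/present'],
                           stdout=subprocess.PIPE).stdout.read()
    if isinstance(raw, bytes):          # Python 3 pipes return bytes, Python 2 returns str.
        raw = raw.decode('utf-8')
    match = re.match(r'\d*-(\d*)', raw) # Capture the highest present CPU index.
    if match:
        return int(match.group(1))      # "0-7" -> 7
    raise ValueError('Unable to predict the concurrency for the target. '
                     'Is device connected?')

Like the patched code, this returns the highest present CPU index (7 for an 8-core device) rather than the CPU count, matching the behaviour of the old split('-')[1] expression; the regex path simply fails with a clearer error when adb returns nothing or a single-CPU string such as "0".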