ARM/AOT: Allow 16-bit LDR for Baker read barrier loads.

Emit the 16-bit LDR (encoding T1) for Baker read barrier field and GC root
loads when the destination and base registers are low (R0-R7) and the offset
is small, record the LDR width in the encoded patch data, and add the narrow
introspection entrypoints with their return-address offsets. Array loads keep
the 32-bit LDR.

Test: m test-art-target-gtest
Test: testrunner.py --target on Nexus 6P.
Test: testrunner.py --target on Nexus 6P with heap poisoning enabled.
Test: Repeat the above tests with ART_USE_OLD_ARM_BACKEND=true.
Bug: 29516974
Bug: 30126666
Bug: 36141117
Change-Id: I458f2ec5fe9abead4db06c7595d992945096fb68
diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc
index ced52ff..a98aedf 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2.cc
@@ -18,6 +18,7 @@
 
 #include "arch/arm/asm_support_arm.h"
 #include "art_method.h"
+#include "base/bit_utils.h"
 #include "compiled_method.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "lock_word.h"
@@ -112,12 +113,22 @@
     // Check that the next instruction matches the expected LDR.
     switch (kind) {
       case BakerReadBarrierKind::kField: {
-        DCHECK_GE(code->size() - literal_offset, 8u);
-        uint32_t next_insn = GetInsn32(code, literal_offset + 4u);
-        // LDR (immediate) with correct base_reg.
-        CheckValidReg((next_insn >> 12) & 0xfu);  // Check destination register.
-        const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
-        CHECK_EQ(next_insn & 0xffff0000u, 0xf8d00000u | (base_reg << 16));
+        BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data);
+        if (width == BakerReadBarrierWidth::kWide) {
+          DCHECK_GE(code->size() - literal_offset, 8u);
+          uint32_t next_insn = GetInsn32(code, literal_offset + 4u);
+          // LDR (immediate), encoding T3, with correct base_reg.
+          CheckValidReg((next_insn >> 12) & 0xfu);  // Check destination register.
+          const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+          CHECK_EQ(next_insn & 0xffff0000u, 0xf8d00000u | (base_reg << 16));
+        } else {
+          DCHECK_GE(code->size() - literal_offset, 6u);
+          uint32_t next_insn = GetInsn16(code, literal_offset + 4u);
+          // LDR (immediate), encoding T1, with correct base_reg.
+          CheckValidReg(next_insn & 0x7u);  // Check destination register.
+          const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+          CHECK_EQ(next_insn & 0xf838u, 0x6800u | (base_reg << 3));
+        }
         break;
       }
       case BakerReadBarrierKind::kArray: {
@@ -131,11 +142,20 @@
         break;
       }
       case BakerReadBarrierKind::kGcRoot: {
-        DCHECK_GE(literal_offset, 4u);
-        uint32_t prev_insn = GetInsn32(code, literal_offset - 4u);
-        // LDR (immediate) with correct root_reg.
-        const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
-        CHECK_EQ(prev_insn & 0xfff0f000u, 0xf8d00000u | (root_reg << 12));
+        BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data);
+        if (width == BakerReadBarrierWidth::kWide) {
+          DCHECK_GE(literal_offset, 4u);
+          uint32_t prev_insn = GetInsn32(code, literal_offset - 4u);
+          // LDR (immediate), encoding T3, with correct root_reg.
+          const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+          CHECK_EQ(prev_insn & 0xfff0f000u, 0xf8d00000u | (root_reg << 12));
+        } else {
+          DCHECK_GE(literal_offset, 2u);
+          uint32_t prev_insn = GetInsn16(code, literal_offset - 2u);
+          // LDR (immediate), encoding T1, with correct root_reg.
+          const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
+          CHECK_EQ(prev_insn & 0xf807u, 0x6800u | root_reg);
+        }
         break;
       }
       default:
@@ -160,7 +180,8 @@
 static void EmitGrayCheckAndFastPath(arm::ArmVIXLAssembler& assembler,
                                      vixl::aarch32::Register base_reg,
                                      vixl::aarch32::MemOperand& lock_word,
-                                     vixl::aarch32::Label* slow_path) {
+                                     vixl::aarch32::Label* slow_path,
+                                     int32_t raw_ldr_offset) {
   using namespace vixl::aarch32;  // NOLINT(build/namespaces)
   // Load the lock word containing the rb_state.
   __ Ldr(ip, lock_word);
@@ -169,14 +190,7 @@
   static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
   __ Tst(ip, Operand(LockWord::kReadBarrierStateMaskShifted));
   __ B(ne, slow_path, /* is_far_target */ false);
-  static_assert(
-      BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET,
-      "Field and array LDR offsets must be the same to reuse the same code.");
-  // Adjust the return address back to the LDR (1 instruction; 2 for heap poisoning).
-  static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                "Field LDR must be 1 instruction (4B) before the return address label; "
-                " 2 instructions (8B) for heap poisoning.");
-  __ Add(lr, lr, BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET);
+  __ Add(lr, lr, raw_ldr_offset);
   // Introduce a dependency on the lock_word including rb_state,
   // to prevent load-load reordering, and without using
   // a memory barrier (which would be more expensive).
@@ -199,6 +213,7 @@
       CheckValidReg(base_reg.GetCode());
       Register holder_reg(BakerReadBarrierSecondRegField::Decode(encoded_data));
       CheckValidReg(holder_reg.GetCode());
+      BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data);
       UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
       temps.Exclude(ip);
       // If base_reg differs from holder_reg, the offset was too large and we must have
@@ -210,16 +225,30 @@
       }
       vixl::aarch32::Label slow_path;
       MemOperand lock_word(holder_reg, mirror::Object::MonitorOffset().Int32Value());
-      EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path);
+      const int32_t raw_ldr_offset = (width == BakerReadBarrierWidth::kWide)
+          ? BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET
+          : BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET;
+      EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path, raw_ldr_offset);
       __ Bind(&slow_path);
       const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 +
-                                 BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET;
-      MemOperand ldr_half_address(lr, ldr_offset + 2);
-      __ Ldrh(ip, ldr_half_address);          // Load the LDR immediate half-word with "Rt | imm12".
-      __ Ubfx(ip, ip, 0, 12);                 // Extract the offset imm12.
-      __ Ldr(ip, MemOperand(base_reg, ip));   // Load the reference.
+                                 raw_ldr_offset;
+      Register ep_reg(kBakerCcEntrypointRegister);
+      if (width == BakerReadBarrierWidth::kWide) {
+        MemOperand ldr_half_address(lr, ldr_offset + 2);
+        __ Ldrh(ip, ldr_half_address);        // Load the LDR immediate half-word with "Rt | imm12".
+        __ Ubfx(ip, ip, 0, 12);               // Extract the offset imm12.
+        __ Ldr(ip, MemOperand(base_reg, ip));   // Load the reference.
+      } else {
+        MemOperand ldr_address(lr, ldr_offset);
+        __ Ldrh(ip, ldr_address);             // Load the LDR immediate, encoding T1.
+        __ Add(ep_reg,                        // Adjust the entrypoint address to the entrypoint
+               ep_reg,                        // for narrow LDR.
+               Operand(BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET));
+        __ Ubfx(ip, ip, 6, 5);                // Extract the imm5, i.e. offset / 4.
+        __ Ldr(ip, MemOperand(base_reg, ip, LSL, 2));   // Load the reference.
+      }
       // Do not unpoison. With heap poisoning enabled, the entrypoint expects a poisoned reference.
-      __ Bx(Register(kBakerCcEntrypointRegister));  // Jump to the entrypoint.
+      __ Bx(ep_reg);                          // Jump to the entrypoint.
       if (holder_reg.Is(base_reg)) {
         // Add null check slow path. The stack map is at the address pointed to by LR.
         __ Bind(&throw_npe);
@@ -233,6 +262,7 @@
       Register base_reg(BakerReadBarrierFirstRegField::Decode(encoded_data));
       CheckValidReg(base_reg.GetCode());
       DCHECK_EQ(kInvalidEncodedReg, BakerReadBarrierSecondRegField::Decode(encoded_data));
+      DCHECK(BakerReadBarrierWidth::kWide == BakerReadBarrierWidthField::Decode(encoded_data));
       UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
       temps.Exclude(ip);
       vixl::aarch32::Label slow_path;
@@ -240,10 +270,11 @@
           mirror::Array::DataOffset(Primitive::ComponentSize(Primitive::kPrimNot)).Int32Value();
       MemOperand lock_word(base_reg, mirror::Object::MonitorOffset().Int32Value() - data_offset);
       DCHECK_LT(lock_word.GetOffsetImmediate(), 0);
-      EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path);
+      const int32_t raw_ldr_offset = BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET;
+      EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path, raw_ldr_offset);
       __ Bind(&slow_path);
       const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 +
-                                 BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET;
+                                 raw_ldr_offset;
       MemOperand ldr_address(lr, ldr_offset + 2);
       __ Ldrb(ip, ldr_address);               // Load the LDR (register) byte with "00 | imm2 | Rm",
                                               // i.e. Rm+32 because the scale in imm2 is 2.
@@ -261,6 +292,7 @@
       Register root_reg(BakerReadBarrierFirstRegField::Decode(encoded_data));
       CheckValidReg(root_reg.GetCode());
       DCHECK_EQ(kInvalidEncodedReg, BakerReadBarrierSecondRegField::Decode(encoded_data));
+      BakerReadBarrierWidth width = BakerReadBarrierWidthField::Decode(encoded_data);
       UseScratchRegisterScope temps(assembler.GetVIXLAssembler());
       temps.Exclude(ip);
       vixl::aarch32::Label return_label, not_marked, forwarding_address;
@@ -280,7 +312,10 @@
       // Adjust the art_quick_read_barrier_mark_introspection address in kBakerCcEntrypointRegister
       // to art_quick_read_barrier_mark_introspection_gc_roots.
       Register ep_reg(kBakerCcEntrypointRegister);
-      __ Add(ep_reg, ep_reg, Operand(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET));
+      int32_t entrypoint_offset = (width == BakerReadBarrierWidth::kWide)
+          ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET
+          : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET;
+      __ Add(ep_reg, ep_reg, Operand(entrypoint_offset));
       __ Mov(ip, root_reg);
       __ Bx(ep_reg);
       __ Bind(&forwarding_address);
@@ -344,7 +379,7 @@
 
 void Thumb2RelativePatcher::SetInsn32(std::vector<uint8_t>* code, uint32_t offset, uint32_t value) {
   DCHECK_LE(offset + 4u, code->size());
-  DCHECK_EQ(offset & 1u, 0u);
+  DCHECK_ALIGNED(offset, 2u);
   uint8_t* addr = &(*code)[offset];
   addr[0] = (value >> 16) & 0xff;
   addr[1] = (value >> 24) & 0xff;
@@ -354,7 +389,7 @@
 
 uint32_t Thumb2RelativePatcher::GetInsn32(ArrayRef<const uint8_t> code, uint32_t offset) {
   DCHECK_LE(offset + 4u, code.size());
-  DCHECK_EQ(offset & 1u, 0u);
+  DCHECK_ALIGNED(offset, 2u);
   const uint8_t* addr = &code[offset];
   return
       (static_cast<uint32_t>(addr[0]) << 16) +
@@ -369,5 +404,18 @@
   return GetInsn32(ArrayRef<const uint8_t>(*code), offset);
 }
 
+uint32_t Thumb2RelativePatcher::GetInsn16(ArrayRef<const uint8_t> code, uint32_t offset) {
+  DCHECK_LE(offset + 2u, code.size());
+  DCHECK_ALIGNED(offset, 2u);
+  const uint8_t* addr = &code[offset];
+  return (static_cast<uint32_t>(addr[0]) << 0) + (static_cast<uint32_t>(addr[1]) << 8);
+}
+
+template <typename Vector>
+uint32_t Thumb2RelativePatcher::GetInsn16(Vector* code, uint32_t offset) {
+  static_assert(std::is_same<typename Vector::value_type, uint8_t>::value, "Invalid value type");
+  return GetInsn16(ArrayRef<const uint8_t>(*code), offset);
+}
+
 }  // namespace linker
 }  // namespace art
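
For reference, a minimal standalone sketch (not part of the patch) of the
16-bit LDR (immediate, encoding T1) layout that the new narrow checks and the
thunk's UBFX above rely on; AssembleLdrT1 is a hypothetical helper:

  #include <cassert>
  #include <cstdint>

  // LDR (immediate), encoding T1: 0b01101 | imm5 | Rn | Rt, where imm5 = offset / 4.
  constexpr uint32_t AssembleLdrT1(uint32_t rt, uint32_t rn, uint32_t offset) {
    return 0x6800u | ((offset / 4u) << 6) | (rn << 3) | rt;
  }

  int main() {
    uint32_t insn = AssembleLdrT1(/* rt */ 3u, /* rn */ 5u, /* offset */ 8u);  // LDR r3, [r5, #8]
    // The kField check masks out Rt and imm5, keeping the opcode and the base register (Rn).
    assert((insn & 0xf838u) == (0x6800u | (5u << 3)));
    // The kGcRoot check masks out Rn and imm5, keeping the opcode and the root register (Rt).
    assert((insn & 0xf807u) == (0x6800u | 3u));
    // The narrow field thunk extracts imm5 with UBFX ip, ip, #6, #5 and rescales it by 4.
    assert(((insn >> 6) & 0x1fu) * 4u == 8u);
    return 0;
  }
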
diff --git a/compiler/linker/arm/relative_patcher_thumb2.h b/compiler/linker/arm/relative_patcher_thumb2.h
index 7fad245..7e787d2 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.h
+++ b/compiler/linker/arm/relative_patcher_thumb2.h
@@ -35,26 +35,37 @@
  public:
   static constexpr uint32_t kBakerCcEntrypointRegister = 4u;
 
-  static uint32_t EncodeBakerReadBarrierFieldData(uint32_t base_reg, uint32_t holder_reg) {
+  static uint32_t EncodeBakerReadBarrierFieldData(uint32_t base_reg,
+                                                  uint32_t holder_reg,
+                                                  bool narrow) {
     CheckValidReg(base_reg);
     CheckValidReg(holder_reg);
+    DCHECK(!narrow || base_reg < 8u) << base_reg;
+    BakerReadBarrierWidth width =
+        narrow ? BakerReadBarrierWidth::kNarrow : BakerReadBarrierWidth::kWide;
     return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kField) |
            BakerReadBarrierFirstRegField::Encode(base_reg) |
-           BakerReadBarrierSecondRegField::Encode(holder_reg);
+           BakerReadBarrierSecondRegField::Encode(holder_reg) |
+           BakerReadBarrierWidthField::Encode(width);
   }
 
   static uint32_t EncodeBakerReadBarrierArrayData(uint32_t base_reg) {
     CheckValidReg(base_reg);
     return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kArray) |
            BakerReadBarrierFirstRegField::Encode(base_reg) |
-           BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg);
+           BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg) |
+           BakerReadBarrierWidthField::Encode(BakerReadBarrierWidth::kWide);
   }
 
-  static uint32_t EncodeBakerReadBarrierGcRootData(uint32_t root_reg) {
+  static uint32_t EncodeBakerReadBarrierGcRootData(uint32_t root_reg, bool narrow) {
     CheckValidReg(root_reg);
+    DCHECK(!narrow || root_reg < 8u) << root_reg;
+    BakerReadBarrierWidth width =
+        narrow ? BakerReadBarrierWidth::kNarrow : BakerReadBarrierWidth::kWide;
     return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kGcRoot) |
            BakerReadBarrierFirstRegField::Encode(root_reg) |
-           BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg);
+           BakerReadBarrierSecondRegField::Encode(kInvalidEncodedReg) |
+           BakerReadBarrierWidthField::Encode(width);
   }
 
   explicit Thumb2RelativePatcher(RelativePatcherTargetProvider* provider);
@@ -86,6 +97,12 @@
     kLast
   };
 
+  enum class BakerReadBarrierWidth : uint8_t {
+    kWide,          // 32-bit LDR (and 32-bit NEG if heap poisoning is enabled).
+    kNarrow,        // 16-bit LDR (and 16-bit NEG if heap poisoning is enabled).
+    kLast
+  };
+
   static constexpr size_t kBitsForBakerReadBarrierKind =
       MinimumBitsToStore(static_cast<size_t>(BakerReadBarrierKind::kLast));
   static constexpr size_t kBitsForRegister = 4u;
@@ -95,9 +112,14 @@
       BitField<uint32_t, kBitsForBakerReadBarrierKind, kBitsForRegister>;
   using BakerReadBarrierSecondRegField =
       BitField<uint32_t, kBitsForBakerReadBarrierKind + kBitsForRegister, kBitsForRegister>;
+  static constexpr size_t kBitsForBakerReadBarrierWidth =
+      MinimumBitsToStore(static_cast<size_t>(BakerReadBarrierWidth::kLast));
+  using BakerReadBarrierWidthField = BitField<BakerReadBarrierWidth,
+                                              kBitsForBakerReadBarrierKind + 2 * kBitsForRegister,
+                                              kBitsForBakerReadBarrierWidth>;
 
   static void CheckValidReg(uint32_t reg) {
-    DCHECK(reg < 12u && reg != kBakerCcEntrypointRegister);
+    DCHECK(reg < 12u && reg != kBakerCcEntrypointRegister) << reg;
   }
 
   void CompileBakerReadBarrierThunk(arm::ArmVIXLAssembler& assembler, uint32_t encoded_data);
@@ -108,6 +130,11 @@
   template <typename Vector>
   static uint32_t GetInsn32(Vector* code, uint32_t offset);
 
+  static uint32_t GetInsn16(ArrayRef<const uint8_t> code, uint32_t offset);
+
+  template <typename Vector>
+  static uint32_t GetInsn16(Vector* code, uint32_t offset);
+
   friend class Thumb2RelativePatcherTest;
 
   DISALLOW_COPY_AND_ASSIGN(Thumb2RelativePatcher);
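
For reference, the encoded thunk data laid out by the BitField declarations
above, written out with plain shifts (illustration only; it assumes the kind
field takes 2 bits and that BakerReadBarrierKind::kField has value 0, neither
of which is shown in this hunk):

  #include <cstdint>

  // bits 0-1: BakerReadBarrierKind, bits 2-5: first register (base/root),
  // bits 6-9: second register (holder or kInvalidEncodedReg),
  // bit 10:   BakerReadBarrierWidth (0 = kWide, 1 = kNarrow).
  constexpr uint32_t SketchEncodeFieldData(uint32_t base_reg, uint32_t holder_reg, bool narrow) {
    return /* assumed BakerReadBarrierKind::kField */ 0u |
           (base_reg << 2) |
           (holder_reg << 6) |
           ((narrow ? 1u : 0u) << 10);
  }
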
diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc
index 2e28349..af5fa40 100644
--- a/compiler/linker/arm/relative_patcher_thumb2_test.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc
@@ -52,6 +52,9 @@
   // BNE +0, 32-bit, encoding T3. Bits 0-10, 11, 13, 16-21, 26 are placeholder for target offset.
   static constexpr uint32_t kBneWPlus0 = 0xf0408000u;
 
+  // LDR immediate, 16-bit, encoding T1. Bits 6-10 are imm5, 0-2 are Rt, 3-5 are Rn.
+  static constexpr uint32_t kLdrInsn = 0x6800u;
+
   // LDR immediate, 32-bit, encoding T3. Bits 0-11 are offset, 12-15 are Rt, 16-20 are Rn.
   static constexpr uint32_t kLdrWInsn = 0xf8d00000u;
 
@@ -223,9 +226,11 @@
   void TestStringReference(uint32_t string_offset);
   void CheckPcRelativePatch(const ArrayRef<const LinkerPatch>& patches, uint32_t target_offset);
 
-  std::vector<uint8_t> CompileBakerOffsetThunk(uint32_t base_reg, uint32_t holder_reg) {
+  std::vector<uint8_t> CompileBakerOffsetThunk(uint32_t base_reg,
+                                               uint32_t holder_reg,
+                                               bool narrow) {
     const LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch(
-        0u, Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg));
+        0u, Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg, narrow));
     ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch);
     return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key);
   }
@@ -237,9 +242,9 @@
     return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key);
   }
 
-  std::vector<uint8_t> CompileBakerGcRootThunk(uint32_t root_reg) {
+  std::vector<uint8_t> CompileBakerGcRootThunk(uint32_t root_reg, bool narrow) {
     LinkerPatch patch = LinkerPatch::BakerReadBarrierBranchPatch(
-        0u, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg));
+        0u, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow));
     ArmBaseRelativePatcher::ThunkKey key = ArmBaseRelativePatcher::GetBakerThunkKey(patch);
     return down_cast<Thumb2RelativePatcher*>(patcher_.get())->CompileThunk(key);
   }
@@ -260,7 +265,8 @@
            (static_cast<uint32_t>(output_[offset + 1]) << 8);
   }
 
-  void TestBakerField(uint32_t offset, uint32_t ref_reg);
+  void TestBakerFieldWide(uint32_t offset, uint32_t ref_reg);
+  void TestBakerFieldNarrow(uint32_t offset, uint32_t ref_reg);
 };
 
 const uint8_t Thumb2RelativePatcherTest::kCallRawCode[] = {
@@ -568,7 +574,7 @@
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
-void Thumb2RelativePatcherTest::TestBakerField(uint32_t offset, uint32_t ref_reg) {
+void Thumb2RelativePatcherTest::TestBakerFieldWide(uint32_t offset, uint32_t ref_reg) {
   uint32_t valid_regs[] = {
       0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
       8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
@@ -584,8 +590,8 @@
       const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr});
       ASSERT_EQ(kMethodCodeSize, raw_code.size());
       ArrayRef<const uint8_t> code(raw_code);
-      uint32_t encoded_data =
-          Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base_reg, holder_reg);
+      uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+          base_reg, holder_reg, /* narrow */ false);
       const LinkerPatch patches[] = {
           LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset, encoded_data),
       };
@@ -608,7 +614,8 @@
       ASSERT_TRUE(
           CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
 
-      std::vector<uint8_t> expected_thunk = CompileBakerOffsetThunk(base_reg, holder_reg);
+      std::vector<uint8_t> expected_thunk =
+          CompileBakerOffsetThunk(base_reg, holder_reg, /* narrow */ false);
       ASSERT_GT(output_.size(), thunk_offset);
       ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
       ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
@@ -666,15 +673,131 @@
   }
 }
 
-#define TEST_BAKER_FIELD(offset, ref_reg)     \
-  TEST_F(Thumb2RelativePatcherTest,           \
-    BakerOffset##offset##_##ref_reg) {        \
-    TestBakerField(offset, ref_reg);          \
+void Thumb2RelativePatcherTest::TestBakerFieldNarrow(uint32_t offset, uint32_t ref_reg) {
+  uint32_t valid_regs[] = {
+      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
+      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
+  };
+  DCHECK_ALIGNED(offset, 4u);
+  DCHECK_LT(offset, 32u);
+  constexpr size_t kMethodCodeSize = 6u;
+  constexpr size_t kLiteralOffset = 0u;
+  uint32_t method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    if (base_reg >= 8u) {
+      continue;
+    }
+    for (uint32_t holder_reg : valid_regs) {
+      uint32_t ldr = kLdrInsn | (offset << (6 - 2)) | (base_reg << 3) | ref_reg;
+      const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr});
+      ASSERT_EQ(kMethodCodeSize, raw_code.size());
+      ArrayRef<const uint8_t> code(raw_code);
+      uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+          base_reg, holder_reg, /* narrow */ true);
+      const LinkerPatch patches[] = {
+          LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset, encoded_data),
+      };
+      ++method_idx;
+      AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
+    }
+  }
+  Link();
+
+  // All thunks are at the end.
+  uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
+  method_idx = 0u;
+  for (uint32_t base_reg : valid_regs) {
+    if (base_reg >= 8u) {
+      continue;
+    }
+    for (uint32_t holder_reg : valid_regs) {
+      ++method_idx;
+      uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
+      uint32_t ldr = kLdrInsn | (offset << (6 - 2)) | (base_reg << 3) | ref_reg;
+      const std::vector<uint8_t> expected_code = RawCode({bne, ldr});
+      ASSERT_EQ(kMethodCodeSize, expected_code.size()) << "bne=0x" << std::hex << bne;
+      ASSERT_TRUE(
+          CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
+
+      std::vector<uint8_t> expected_thunk =
+          CompileBakerOffsetThunk(base_reg, holder_reg, /* narrow */ true);
+      ASSERT_GT(output_.size(), thunk_offset);
+      ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
+      ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
+                                             expected_thunk.size());
+      if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) {
+        DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk);
+        ASSERT_TRUE(false);
+      }
+
+      size_t gray_check_offset = thunk_offset;
+      if (holder_reg == base_reg) {
+        // Verify that the null-check uses the correct register, i.e. holder_reg.
+        if (holder_reg < 8) {
+          ASSERT_GE(output_.size() - gray_check_offset, 2u);
+          ASSERT_EQ(0xb100 | holder_reg, GetOutputInsn16(thunk_offset) & 0xfd07u);
+          gray_check_offset += 2u;
+        } else {
+          ASSERT_GE(output_.size() - gray_check_offset, 6u);
+          ASSERT_EQ(0xf1b00f00u | (holder_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u);
+          ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u);  // BEQ
+          gray_check_offset += 6u;
+        }
+      }
+      // Verify that the lock word for gray bit check is loaded from the holder address.
+      ASSERT_GE(output_.size() - gray_check_offset,
+                4u * /* 32-bit instructions */ 4u + 2u * /* 16-bit instructions */ 2u);
+      const uint32_t load_lock_word =
+          kLdrWInsn |
+          (holder_reg << 16) |
+          (/* IP */ 12 << 12) |
+          mirror::Object::MonitorOffset().Uint32Value();
+      ASSERT_EQ(load_lock_word, GetOutputInsn32(gray_check_offset));
+      // Verify the gray bit check.
+      DCHECK_GE(LockWord::kReadBarrierStateShift, 8u);  // ROR modified immediate.
+      uint32_t ror_shift = 7 + (32 - LockWord::kReadBarrierStateShift);
+      const uint32_t tst_gray_bit_without_offset =
+          0xf0100f00 | (/* IP */ 12 << 16)
+                     | (((ror_shift >> 4) & 1) << 26)   // i
+                     | (((ror_shift >> 1) & 7) << 12)   // imm3
+                     | ((ror_shift & 1) << 7);          // imm8, ROR('1':imm8<7:0>, ror_shift).
+      EXPECT_EQ(tst_gray_bit_without_offset, GetOutputInsn32(gray_check_offset + 4u));
+      EXPECT_EQ(0xd100u, GetOutputInsn16(gray_check_offset + 8u) & 0xff00u);  // BNE
+      // Verify the fake dependency (skip "ADD LR, LR, #ldr_offset").
+      const uint32_t fake_dependency =
+          0xeb000010 |              // ADD Rd, Rn, Rm, LSR 32 (type=01, imm3=000, imm2=00)
+          (/* IP */ 12) |           // Rm = IP
+          (base_reg << 16) |        // Rn = base_reg
+          (base_reg << 8);          // Rd = base_reg
+      EXPECT_EQ(fake_dependency, GetOutputInsn32(gray_check_offset + 14u));
+      // Do not check the rest of the implementation.
+
+      // The next thunk follows on the next aligned offset.
+      thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment);
+    }
+  }
+}
+
+#define TEST_BAKER_FIELD_WIDE(offset, ref_reg)    \
+  TEST_F(Thumb2RelativePatcherTest,               \
+    BakerOffsetWide##offset##_##ref_reg) {        \
+    TestBakerFieldWide(offset, ref_reg);          \
   }
 
-TEST_BAKER_FIELD(/* offset */ 0, /* ref_reg */ 0)
-TEST_BAKER_FIELD(/* offset */ 8, /* ref_reg */ 7)
-TEST_BAKER_FIELD(/* offset */ 0xffc, /* ref_reg */ 11)
+TEST_BAKER_FIELD_WIDE(/* offset */ 0, /* ref_reg */ 0)
+TEST_BAKER_FIELD_WIDE(/* offset */ 8, /* ref_reg */ 3)
+TEST_BAKER_FIELD_WIDE(/* offset */ 28, /* ref_reg */ 7)
+TEST_BAKER_FIELD_WIDE(/* offset */ 0xffc, /* ref_reg */ 11)
+
+#define TEST_BAKER_FIELD_NARROW(offset, ref_reg)  \
+  TEST_F(Thumb2RelativePatcherTest,               \
+    BakerOffsetNarrow##offset##_##ref_reg) {      \
+    TestBakerFieldNarrow(offset, ref_reg);        \
+  }
+
+TEST_BAKER_FIELD_NARROW(/* offset */ 0, /* ref_reg */ 0)
+TEST_BAKER_FIELD_NARROW(/* offset */ 8, /* ref_reg */ 3)
+TEST_BAKER_FIELD_NARROW(/* offset */ 28, /* ref_reg */ 7)
 
 TEST_F(Thumb2RelativePatcherTest, BakerOffsetThunkInTheMiddle) {
   // One thunk in the middle with maximum distance branches to it from both sides.
@@ -682,8 +805,8 @@
   constexpr uint32_t kLiteralOffset1 = 6u;
   const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn});
   ArrayRef<const uint8_t> code1(raw_code1);
-  uint32_t encoded_data =
-      Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0);
+  uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+      /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false);
   const LinkerPatch patches1[] = {
       LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data),
   };
@@ -710,7 +833,8 @@
   //   - thunk size and method 3 pre-header, rounded up (padding in between if needed)
   //   - method 3 code and method 4 pre-header, rounded up (padding in between if needed)
   //   - method 4 header (let there be no padding between method 4 code and method 5 pre-header).
-  size_t thunk_size = CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0).size();
+  size_t thunk_size =
+      CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false).size();
   size_t filler2_size =
       1 * MB - (kLiteralOffset2 + kPcAdjustment)
              - RoundUp(thunk_size + sizeof(OatQuickMethodHeader), kArmAlignment)
@@ -749,8 +873,8 @@
   constexpr uint32_t kLiteralOffset1 = 4u;
   const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kBneWPlus0, kLdrWInsn, kNopInsn});
   ArrayRef<const uint8_t> code1(raw_code1);
-  uint32_t encoded_data =
-      Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0);
+  uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+      /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false);
   const LinkerPatch patches1[] = {
       LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data),
   };
@@ -779,8 +903,8 @@
   constexpr uint32_t kLiteralOffset1 = 6u;
   const std::vector<uint8_t> raw_code1 = RawCode({kNopWInsn, kNopInsn, kBneWPlus0, kLdrWInsn});
   ArrayRef<const uint8_t> code1(raw_code1);
-  uint32_t encoded_data =
-      Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(/* base_reg */ 0, /* holder_reg */ 0);
+  uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
+      /* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false);
   const LinkerPatch patches1[] = {
       LinkerPatch::BakerReadBarrierBranchPatch(kLiteralOffset1, encoded_data),
   };
@@ -809,7 +933,8 @@
   //   - thunk size and method 3 pre-header, rounded up (padding in between if needed)
   //   - method 3 code and method 4 pre-header, rounded up (padding in between if needed)
   //   - method 4 header (let there be no padding between method 4 code and method 5 pre-header).
-  size_t thunk_size = CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0).size();
+  size_t thunk_size =
+      CompileBakerOffsetThunk(/* base_reg */ 0, /* holder_reg */ 0, /* narrow */ false).size();
   size_t filler2_size =
       1 * MB - (kReachableFromOffset2 + kPcAdjustment)
              - RoundUp(thunk_size + sizeof(OatQuickMethodHeader), kArmAlignment)
@@ -929,7 +1054,7 @@
   }
 }
 
-TEST_F(Thumb2RelativePatcherTest, BakerGcRoot) {
+TEST_F(Thumb2RelativePatcherTest, BakerGcRootWide) {
   uint32_t valid_regs[] = {
       0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
       8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
@@ -945,7 +1070,8 @@
     ArrayRef<const uint8_t> code(raw_code);
     const LinkerPatch patches[] = {
         LinkerPatch::BakerReadBarrierBranchPatch(
-            kLiteralOffset, Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg)),
+            kLiteralOffset,
+            Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, /* narrow */ false)),
     };
     AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
   }
@@ -962,7 +1088,67 @@
     ASSERT_EQ(kMethodCodeSize, expected_code.size());
     EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
 
-    std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg);
+    std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg, /* narrow */ false);
+    ASSERT_GT(output_.size(), thunk_offset);
+    ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
+    ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
+                                           expected_thunk.size());
+    if (ArrayRef<const uint8_t>(expected_thunk) != compiled_thunk) {
+      DumpDiff(ArrayRef<const uint8_t>(expected_thunk), compiled_thunk);
+      ASSERT_TRUE(false);
+    }
+
+    // Verify that the fast-path null-check uses the correct register, i.e. root_reg.
+    if (root_reg < 8) {
+      ASSERT_GE(output_.size() - thunk_offset, 2u);
+      ASSERT_EQ(0xb100 | root_reg, GetOutputInsn16(thunk_offset) & 0xfd07u);
+    } else {
+      ASSERT_GE(output_.size() - thunk_offset, 6u);
+      ASSERT_EQ(0xf1b00f00u | (root_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u);
+      ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u);  // BEQ
+    }
+    // Do not check the rest of the implementation.
+
+    // The next thunk follows on the next aligned offset.
+    thunk_offset += RoundUp(expected_thunk.size(), kArmAlignment);
+  }
+}
+
+TEST_F(Thumb2RelativePatcherTest, BakerGcRootNarrow) {
+  uint32_t valid_regs[] = {
+      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
+                                      // Not applicable to high registers.
+  };
+  constexpr size_t kMethodCodeSize = 6u;
+  constexpr size_t kLiteralOffset = 2u;
+  uint32_t method_idx = 0u;
+  for (uint32_t root_reg : valid_regs) {
+    ++method_idx;
+    uint32_t ldr = kLdrInsn | (/* offset */ 8 << (6 - 2)) | (/* base_reg */ 0 << 3) | root_reg;
+    const std::vector<uint8_t> raw_code = RawCode({ldr, kBneWPlus0});
+    ASSERT_EQ(kMethodCodeSize, raw_code.size());
+    ArrayRef<const uint8_t> code(raw_code);
+    const LinkerPatch patches[] = {
+        LinkerPatch::BakerReadBarrierBranchPatch(
+            kLiteralOffset,
+            Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, /* narrow */ true)),
+    };
+    AddCompiledMethod(MethodRef(method_idx), code, ArrayRef<const LinkerPatch>(patches));
+  }
+  Link();
+
+  // All thunks are at the end.
+  uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
+  method_idx = 0u;
+  for (uint32_t root_reg : valid_regs) {
+    ++method_idx;
+    uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
+    uint32_t ldr = kLdrInsn | (/* offset */ 8 << (6 - 2)) | (/* base_reg */ 0 << 3) | root_reg;
+    const std::vector<uint8_t> expected_code = RawCode({ldr, bne});
+    ASSERT_EQ(kMethodCodeSize, expected_code.size());
+    EXPECT_TRUE(CheckLinkedMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(expected_code)));
+
+    std::vector<uint8_t> expected_thunk = CompileBakerGcRootThunk(root_reg, /* narrow */ true);
     ASSERT_GT(output_.size(), thunk_offset);
     ASSERT_GE(output_.size() - thunk_offset, expected_thunk.size());
     ArrayRef<const uint8_t> compiled_thunk(output_.data() + thunk_offset,
@@ -973,14 +1159,8 @@
     }
 
     // Verify that the fast-path null-check CBZ uses the correct register, i.e. root_reg.
-    if (root_reg < 8) {
-      ASSERT_GE(output_.size() - thunk_offset, 2u);
-      ASSERT_EQ(0xb100 | root_reg, GetOutputInsn16(thunk_offset) & 0xfd07u);
-    } else {
-      ASSERT_GE(output_.size() - thunk_offset, 6u);
-      ASSERT_EQ(0xf1b00f00u | (root_reg << 16), GetOutputInsn32(thunk_offset) & 0xfbff8f00u);
-      ASSERT_EQ(0xd000u, GetOutputInsn16(thunk_offset + 4u) & 0xff00u);  // BEQ
-    }
+    ASSERT_GE(output_.size() - thunk_offset, 2u);
+    ASSERT_EQ(0xb100 | root_reg, GetOutputInsn16(thunk_offset) & 0xfd07u);
     // Do not check the rest of the implementation.
 
     // The next thunk follows on the next aligned offset.
@@ -998,7 +1178,8 @@
   patches.reserve(num_patches);
   const uint32_t ldr =
       kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (/* root_reg */ 0 << 12);
-  uint32_t encoded_data = Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 0);
+  uint32_t encoded_data =
+      Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 0, /* narrow */ false);
   for (size_t i = 0; i != num_patches; ++i) {
     PushBackInsn(&code, ldr);
     PushBackInsn(&code, kBneWPlus0);
@@ -1067,7 +1248,7 @@
   // this pushes the first GC root thunk's pending MaxNextOffset() before the method call
   // thunk's pending MaxNextOffset() which needs to be adjusted.
   ASSERT_LT(RoundUp(CompileMethodCallThunk().size(), kArmAlignment) + kArmAlignment,
-            CompileBakerGcRootThunk(/* root_reg */ 0).size());
+            CompileBakerGcRootThunk(/* root_reg */ 0, /* narrow */ false).size());
   static_assert(kArmAlignment == 8, "Code below assumes kArmAlignment == 8");
   constexpr size_t kBakerLiteralOffset1 = kArmAlignment + 2u - kPcAdjustment;
   constexpr size_t kBakerLiteralOffset2 = kBakerLiteralOffset1 + kArmAlignment;
@@ -1080,9 +1261,9 @@
       ldr2, kBneWPlus0,                         // Second GC root LDR with read barrier.
   });
   uint32_t encoded_data1 =
-      Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 1);
+      Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 1, /* narrow */ false);
   uint32_t encoded_data2 =
-      Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 2);
+      Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(/* root_reg */ 2, /* narrow */ false);
   const LinkerPatch last_method_patches[] = {
       LinkerPatch::BakerReadBarrierBranchPatch(kBakerLiteralOffset1, encoded_data1),
       LinkerPatch::BakerReadBarrierBranchPatch(kBakerLiteralOffset2, encoded_data2),
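
For reference, the 0xb100/0xfd07 null-check assertions in the tests above
match the 16-bit CBZ encoding; a small sketch (illustration only, bit layout
per the Thumb2 CBZ encoding T1):

  #include <cassert>
  #include <cstdint>

  int main() {
    // CBZ (encoding T1): 0b1011 | op(0) | 0 | i | 1 | imm5 | Rn.  Masking with
    // 0xfd07 keeps the fixed opcode bits and Rn while ignoring the branch
    // offset encoded in i:imm5.
    uint32_t root_reg = 3u;
    uint32_t cbz = 0xb100u | (/* imm5 */ 5u << 3) | root_reg;  // Some "CBZ r3, <label>".
    assert((cbz & 0xfd07u) == (0xb100u | root_reg));
    return 0;
  }
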
diff --git a/compiler/linker/arm64/relative_patcher_arm64.h b/compiler/linker/arm64/relative_patcher_arm64.h
index d1ab410..02a5b1e 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.h
+++ b/compiler/linker/arm64/relative_patcher_arm64.h
@@ -100,7 +100,7 @@
       BitField<uint32_t, kBitsForBakerReadBarrierKind + kBitsForRegister, kBitsForRegister>;
 
   static void CheckValidReg(uint32_t reg) {
-    DCHECK(reg < 30u && reg != 16u && reg != 17u);
+    DCHECK(reg < 30u && reg != 16u && reg != 17u) << reg;
   }
 
   void CompileBakerReadBarrierThunk(arm64::Arm64Assembler& assembler, uint32_t encoded_data);
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 35dccd6..8650aee 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -90,13 +90,17 @@
 }
 
 static inline void EmitPlaceholderBne(CodeGeneratorARM* codegen, Label* bne_label) {
-  DCHECK(down_cast<Thumb2Assembler*>(codegen->GetAssembler())->IsForced32Bit());
+  ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(codegen->GetAssembler()));
   __ BindTrackedLabel(bne_label);
   Label placeholder_label;
   __ b(&placeholder_label, NE);  // Placeholder, patched at link-time.
   __ Bind(&placeholder_label);
 }
 
+static inline bool CanEmitNarrowLdr(Register rt, Register rn, uint32_t offset) {
+  return ArmAssembler::IsLowRegister(rt) && ArmAssembler::IsLowRegister(rn) && offset < 32u;
+}
+
 static constexpr int kRegListThreshold = 4;
 
 // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers,
@@ -8049,8 +8053,9 @@
         //   return_address:
 
         CheckLastTempIsBakerCcEntrypointRegister(instruction);
+        bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
         uint32_t custom_data =
-            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg);
+            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow);
         Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
         // entrypoint_reg =
@@ -8063,16 +8068,18 @@
         Label return_address;
         __ AdrCode(LR, &return_address);
         __ CmpConstant(kBakerCcEntrypointRegister, 0);
-        static_assert(
-            BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8,
-            "GC root LDR must be 2 32-bit instructions (8B) before the return address label.");
         // Currently the offset is always within range. If that changes,
         // we shall have to split the load the same way as for fields.
         DCHECK_LT(offset, kReferenceLoadMinFarOffset);
-        ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+        DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit());
+        ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow);
+        int old_position = GetAssembler()->GetBuffer()->GetPosition();
         __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
         EmitPlaceholderBne(codegen_, bne_label);
         __ Bind(&return_address);
+        DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+                  narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
+                         : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
       } else {
         // Note that we do not actually check the value of
         // `GetIsGcMarking()` to decide whether to mark the loaded GC
@@ -8172,10 +8179,12 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
     //   gray_return_address:
 
     DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    Register ref_reg = ref.AsRegister<Register>();
+    bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset);
     Register base = obj;
     if (offset >= kReferenceLoadMinFarOffset) {
       base = temp.AsRegister<Register>();
@@ -8183,10 +8192,14 @@
       static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
       __ AddConstant(base, obj, offset & ~(kReferenceLoadMinFarOffset - 1u));
       offset &= (kReferenceLoadMinFarOffset - 1u);
+      // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large
+      // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely
+      // increase the overall code size when taking the generated thunks into account.
+      DCHECK(!narrow);
     }
     CheckLastTempIsBakerCcEntrypointRegister(instruction);
     uint32_t custom_data =
-        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj);
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj, narrow);
     Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
     // entrypoint_reg =
@@ -8199,19 +8212,20 @@
     Label return_address;
     __ AdrCode(LR, &return_address);
     __ CmpConstant(kBakerCcEntrypointRegister, 0);
-    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Field LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
-    Register ref_reg = ref.AsRegister<Register>();
     DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+    DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit());
+    ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow);
+    int old_position = GetAssembler()->GetBuffer()->GetPosition();
     __ LoadFromOffset(kLoadWord, ref_reg, base, offset);
     if (needs_null_check) {
       MaybeRecordImplicitNullCheck(instruction);
     }
     GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
     __ Bind(&return_address);
+    DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+              narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+                     : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET);
     return;
   }
 
@@ -8257,7 +8271,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = data[index];
+    //     HeapReference<mirror::Object> reference = data[index];
     //   gray_return_address:
 
     DCHECK(index.IsValid());
@@ -8282,15 +8296,15 @@
     Label return_address;
     __ AdrCode(LR, &return_address);
     __ CmpConstant(kBakerCcEntrypointRegister, 0);
-    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
+    ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+    int old_position = GetAssembler()->GetBuffer()->GetPosition();
     __ ldr(ref_reg, Address(data_reg, index_reg, LSL, scale_factor));
     DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
     GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
     __ Bind(&return_address);
+    DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+              BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
     return;
   }
 
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index ed0a64c..54aa03c 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -6094,7 +6094,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
     //   gray_return_address:
 
     DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
@@ -6189,7 +6189,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = data[index];
+    //     HeapReference<mirror::Object> reference = data[index];
     //   gray_return_address:
 
     DCHECK(index.IsValid());
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 8417f84..b2e0a91 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -124,6 +124,10 @@
   __ bind(&placeholder_label);
 }
 
+static inline bool CanEmitNarrowLdr(vixl32::Register rt, vixl32::Register rn, uint32_t offset) {
+  return rt.IsLow() && rn.IsLow() && offset < 32u;
+}
+
 class EmitAdrCode {
  public:
   EmitAdrCode(ArmVIXLMacroAssembler* assembler, vixl32::Register rd, vixl32::Label* label)
@@ -8158,8 +8162,9 @@
 
         UseScratchRegisterScope temps(GetVIXLAssembler());
         ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
-        uint32_t custom_data =
-            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg.GetCode());
+        bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
+        uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(
+            root_reg.GetCode(), narrow);
         vixl32::Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
         // entrypoint_reg =
@@ -8174,15 +8179,16 @@
         vixl32::Label return_address;
         EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
         __ cmp(kBakerCcEntrypointRegister, Operand(0));
-        static_assert(
-            BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8,
-            "GC root LDR must be 2 32-bit instructions (8B) before the return address label.");
         // Currently the offset is always within range. If that changes,
         // we shall have to split the load the same way as for fields.
         DCHECK_LT(offset, kReferenceLoadMinFarOffset);
-        __ ldr(EncodingSize(Wide), root_reg, MemOperand(obj, offset));
+        ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
+        __ ldr(EncodingSize(narrow ? Narrow : Wide), root_reg, MemOperand(obj, offset));
         EmitPlaceholderBne(codegen_, bne_label);
         __ Bind(&return_address);
+        DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+                  narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
+                         : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
       } else {
         // Note that we do not actually check the value of
         // `GetIsGcMarking()` to decide whether to mark the loaded GC
@@ -8283,10 +8289,12 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
     //   gray_return_address:
 
     DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
+    bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset);
     vixl32::Register base = obj;
     if (offset >= kReferenceLoadMinFarOffset) {
       base = RegisterFrom(temp);
@@ -8294,12 +8302,15 @@
       static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
       __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
       offset &= (kReferenceLoadMinFarOffset - 1u);
+      // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large
+      // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely
+      // increase the overall code size when taking the generated thunks into account.
+      DCHECK(!narrow);
     }
     UseScratchRegisterScope temps(GetVIXLAssembler());
     ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
     uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
-        base.GetCode(),
-        obj.GetCode());
+        base.GetCode(), obj.GetCode(), narrow);
     vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
     // entrypoint_reg =
@@ -8316,19 +8327,24 @@
     EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
     __ cmp(kBakerCcEntrypointRegister, Operand(0));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Field LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
-    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
-    __ ldr(EncodingSize(Wide), ref_reg, MemOperand(base, offset));
+    ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
+    __ ldr(EncodingSize(narrow ? Narrow : Wide), ref_reg, MemOperand(base, offset));
     if (needs_null_check) {
       MaybeRecordImplicitNullCheck(instruction);
     }
-    // Note: We need a Wide NEG for the unpoisoning.
+    // Note: We need a specific width for the unpoisoning NEG.
     if (kPoisonHeapReferences) {
-      __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
+      if (narrow) {
+        // The only 16-bit encoding is T1 which sets flags outside IT block (i.e. RSBS, not RSB).
+        __ rsbs(EncodingSize(Narrow), ref_reg, ref_reg, Operand(0));
+      } else {
+        __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
+      }
     }
     __ Bind(&return_address);
+    DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+              narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+                     : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET);
     return;
   }
 
@@ -8374,7 +8390,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = data[index];
+    //     HeapReference<mirror::Object> reference = data[index];
     //   gray_return_address:
 
     DCHECK(index.IsValid());
@@ -8404,9 +8420,7 @@
     EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
     __ cmp(kBakerCcEntrypointRegister, Operand(0));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
+    ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
     __ ldr(ref_reg, MemOperand(data_reg, index_reg, vixl32::LSL, scale_factor));
     DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
     // Note: We need a Wide NEG for the unpoisoning.
@@ -8414,6 +8428,8 @@
       __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
     }
     __ Bind(&return_address);
+    DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+              BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
     return;
   }
 
diff --git a/compiler/utils/arm/assembler_thumb2.h b/compiler/utils/arm/assembler_thumb2.h
index 5c36110..2ff9018 100644
--- a/compiler/utils/arm/assembler_thumb2.h
+++ b/compiler/utils/arm/assembler_thumb2.h
@@ -924,9 +924,11 @@
 
 class ScopedForce32Bit {
  public:
-  explicit ScopedForce32Bit(Thumb2Assembler* assembler)
+  explicit ScopedForce32Bit(Thumb2Assembler* assembler, bool force = true)
       : assembler_(assembler), old_force_32bit_(assembler->IsForced32Bit()) {
-    assembler->Force32Bit();
+    if (force) {
+      assembler->Force32Bit();
+    }
   }
 
   ~ScopedForce32Bit() {
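
For reference, a self-contained sketch of the new `force` parameter added to
ScopedForce32Bit above (FakeAssembler and ScopedForce32BitSketch are
hypothetical stand-ins; the real types live in assembler_thumb2.h):

  #include <cassert>

  struct FakeAssembler {
    bool forced_32bit = false;
    bool IsForced32Bit() const { return forced_32bit; }
    void Force32Bit() { forced_32bit = true; }
  };

  class ScopedForce32BitSketch {
   public:
    explicit ScopedForce32BitSketch(FakeAssembler* assembler, bool force = true)
        : assembler_(assembler), old_force_32bit_(assembler->IsForced32Bit()) {
      if (force) {
        assembler->Force32Bit();
      }
    }
    // Restore the previous state on scope exit (mirrors the saved old_force_32bit_).
    ~ScopedForce32BitSketch() { assembler_->forced_32bit = old_force_32bit_; }
   private:
    FakeAssembler* const assembler_;
    const bool old_force_32bit_;
  };

  int main() {
    FakeAssembler assembler;
    bool narrow = true;
    {
      // With `narrow`, do not force 32-bit encodings: the 16-bit LDR stays allowed.
      ScopedForce32BitSketch maybe_force_32bit(&assembler, /* force */ !narrow);
      assert(!assembler.IsForced32Bit());
    }
    assert(!assembler.IsForced32Bit());
    return 0;
  }
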
diff --git a/runtime/arch/arch_test.cc b/runtime/arch/arch_test.cc
index 1a5e39f..d6056c0 100644
--- a/runtime/arch/arch_test.cc
+++ b/runtime/arch/arch_test.cc
@@ -71,11 +71,15 @@
 #undef FRAME_SIZE_SAVE_REFS_AND_ARGS
 static constexpr size_t kFrameSizeSaveEverything = FRAME_SIZE_SAVE_EVERYTHING;
 #undef FRAME_SIZE_SAVE_EVERYTHING
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET
 #undef BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET
-#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET
-#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET
+#undef BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
 #undef BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
-#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
 }  // namespace arm
 
 namespace arm64 {
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index f1f1766..8f2fd6e 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -24,18 +24,25 @@
 #define FRAME_SIZE_SAVE_REFS_AND_ARGS 112
 #define FRAME_SIZE_SAVE_EVERYTHING 192
 
+// The offset from art_quick_read_barrier_mark_introspection (used for field
+// loads with 32-bit LDR) to the entrypoint for field loads with 16-bit LDR,
+// i.e. art_quick_read_barrier_mark_introspection_narrow.
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET 0x20
+// The offsets from art_quick_read_barrier_mark_introspection to the GC root entrypoints,
+// i.e. art_quick_read_barrier_mark_introspection_gc_roots_{wide,narrow}.
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET 0x80
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET 0xc0
 // The offset from art_quick_read_barrier_mark_introspection to the array switch cases,
 // i.e. art_quick_read_barrier_mark_introspection_arrays.
 #define BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET 0x100
-// The offset from art_quick_read_barrier_mark_introspection to the GC root entrypoint,
-// i.e. art_quick_read_barrier_mark_introspection_gc_roots.
-#define BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET 0xc0
 
 // The offset of the reference load LDR from the return address in LR for field loads.
 #ifdef USE_HEAP_POISONING
-#define BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET -8
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET -8
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET -4
 #else
-#define BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET -4
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET -4
+#define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET -2
 #endif
 // The offset of the reference load LDR from the return address in LR for array loads.
 #ifdef USE_HEAP_POISONING
@@ -44,7 +51,8 @@
 #define BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET -4
 #endif
 // The offset of the reference load LDR from the return address in LR for GC root loads.
-#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET -8
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET -8
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET -6
 
 // Flag for enabling R4 optimization in arm runtime
 // #define ARM_R4_SUSPEND_FLAG
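
The constants above pair each load width with its own entrypoint offset (from art_quick_read_barrier_mark_introspection) and LDR offset (from the return address in LR). A standalone sketch of that pairing for the non-heap-poisoning configuration; the function and type names are hypothetical, only the numeric values come from asm_support_arm.h:

    // Illustrative only: selects the entrypoint offset and the LDR offset for a
    // given load kind and LDR width (non-heap-poisoning values).
    #include <cstdint>
    #include <cstdio>

    enum class LoadKind { kField, kGcRoot };
    enum class LdrWidth { kWide, kNarrow };  // 32-bit (T3) vs 16-bit (T1) LDR.

    struct BakerOffsets {
      int32_t entrypoint_offset;  // Added to the introspection entrypoint address.
      int32_t ldr_offset;         // Offset of the LDR from the return address in LR.
    };

    BakerOffsets SelectOffsets(LoadKind kind, LdrWidth width) {
      if (kind == LoadKind::kField) {
        return (width == LdrWidth::kWide)
            ? BakerOffsets{0x00, -4}    // Main entrypoint, FIELD_LDR_WIDE_OFFSET.
            : BakerOffsets{0x20, -2};   // _narrow entrypoint, FIELD_LDR_NARROW_OFFSET.
      }
      return (width == LdrWidth::kWide)
          ? BakerOffsets{0x80, -8}      // _gc_roots_wide, GC_ROOT_LDR_WIDE_OFFSET.
          : BakerOffsets{0xc0, -6};     // _gc_roots_narrow, GC_ROOT_LDR_NARROW_OFFSET.
    }

    int main() {
      BakerOffsets o = SelectOffsets(LoadKind::kGcRoot, LdrWidth::kNarrow);
      std::printf("entrypoint offset: %#x, LDR offset: %d\n", o.entrypoint_offset, o.ldr_offset);
      return 0;
    }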
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index 6b72477..919b0af 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -53,8 +53,11 @@
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg12(mirror::Object*);
 
 extern "C" mirror::Object* art_quick_read_barrier_mark_introspection(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_narrow(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_arrays(mirror::Object*);
-extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots_wide(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots_narrow(
+    mirror::Object*);
 
 // Used by soft float.
 // Single-precision FP arithmetics.
@@ -86,18 +89,27 @@
   qpoints->pReadBarrierMarkReg10 = is_active ? art_quick_read_barrier_mark_reg10 : nullptr;
   qpoints->pReadBarrierMarkReg11 = is_active ? art_quick_read_barrier_mark_reg11 : nullptr;
 
-  // Check that array switch cases are at appropriate offsets from the introspection entrypoint.
   // For the alignment check, strip the Thumb mode bit.
   DCHECK_ALIGNED(reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection) - 1u, 256u);
+  // Check the offset of the narrow field entrypoint from the introspection entrypoint.
+  intptr_t narrow_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_narrow) -
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET, narrow_diff);
+  // Check the array switch cases' offset from the introspection entrypoint.
   intptr_t array_diff =
       reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_arrays) -
       reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
   DCHECK_EQ(BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET, array_diff);
-  // Check that the GC root entrypoint is at appropriate offset from the introspection entrypoint.
-  intptr_t gc_roots_diff =
-      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots) -
+  // Check the GC root entrypoint offsets from the introspection entrypoint.
+  intptr_t gc_roots_wide_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots_wide) -
       reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
-  DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET, gc_roots_diff);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET, gc_roots_wide_diff);
+  intptr_t gc_roots_narrow_diff =
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection_gc_roots_narrow) -
+      reinterpret_cast<intptr_t>(art_quick_read_barrier_mark_introspection);
+  DCHECK_EQ(BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET, gc_roots_narrow_diff);
   // The register 12, i.e. IP, is reserved, so there is no art_quick_read_barrier_mark_reg12.
   // We're using the entry to hold a pointer to the introspection entrypoint instead.
   qpoints->pReadBarrierMarkReg12 = is_active ? art_quick_read_barrier_mark_introspection : nullptr;
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index fa21208..d0c6728 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -2189,7 +2189,7 @@
     .byte   (.Lmark_introspection_return_switch_case_bad - .Lmark_introspection_return_table) / 2
 .endm
 
-#if BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET != BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
+#if BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET != BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET
 #error "Array and field introspection code sharing requires same LDR offset."
 #endif
 .macro BRBMI_ARRAY_LOAD index_reg
@@ -2208,7 +2208,10 @@
     BRBMI_BKPT_FILL_4B
 .endm
 
-.macro BRBMI_SLOW_PATH ldr_offset
+.macro BRBMI_RUNTIME_CALL
+    // Note: This macro generates exactly 22 bytes of code. The core register
+    // PUSH and the MOVs are 16-bit instructions; the rest are 32-bit instructions.
+
     push   {r0-r3, r7, lr}            // Save return address and caller-save registers.
     .cfi_adjust_cfa_offset 24
     .cfi_rel_offset r0, 0
@@ -2234,11 +2237,72 @@
     .cfi_restore r3
     .cfi_restore r7
     .cfi_restore lr
+.endm
 
+.macro BRBMI_CHECK_NULL_AND_MARKED label_suffix
+    // If reference is null, just return it in the right register.
+    cmp     ip, #0
+    beq     .Lmark_introspection_return\label_suffix
+    // Use R4 as temp and check the mark bit of the reference.
+    ldr     r4, [ip, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tst     r4, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    beq     .Lmark_introspection_unmarked\label_suffix
+.Lmark_introspection_return\label_suffix:
+.endm
+
+.macro BRBMI_UNMARKED_FORWARDING_ADDRESS_CHECK label_suffix
+.Lmark_introspection_unmarked\label_suffix:
+    // Check if the top two bits are one; if so, this is a forwarding address.
+#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3)
+    // To use "CMP ip, #modified-immediate; BHS", we need the lock word state in
+    // the highest bits and the "forwarding address" state to have all bits set.
+#error "Unexpected lock word state shift or forwarding address state value."
+#endif
+    cmp     r4, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT)
+    bhs     .Lmark_introspection_forwarding_address\label_suffix
+.endm
+
+.macro BRBMI_EXTRACT_FORWARDING_ADDRESS label_suffix
+.Lmark_introspection_forwarding_address\label_suffix:
+    // Note: This macro generates exactly 6 bytes of code; the branch is near.
+
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    lsl     ip, r4, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    b       .Lmark_introspection_return\label_suffix
+.endm
+
+.macro BRBMI_LOAD_RETURN_REG_FROM_CODE_wide ldr_offset
     // Load the half of the instruction that contains Rt. Adjust for the thumb state in LR.
     ldrh    r4, [lr, #(-1 + \ldr_offset + 2)]
-    lsr     r4, r4, #12               // Extract `ref_reg`.
-    b       .Lmark_introspection_return_switch
+.endm
+
+.macro BRBMI_LOAD_RETURN_REG_FROM_CODE_narrow ldr_offset
+    // Load the 16-bit instruction. Adjust for the thumb state in LR.
+    ldrh    r4, [lr, #(-1 + \ldr_offset)]
+.endm
+
+.macro BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH gc_root_ldr_offset, label_suffix
+    .balign 64
+    .thumb_func
+    .type art_quick_read_barrier_mark_introspection_gc_roots\label_suffix, #function
+    .hidden art_quick_read_barrier_mark_introspection_gc_roots\label_suffix
+    .global art_quick_read_barrier_mark_introspection_gc_roots\label_suffix
+art_quick_read_barrier_mark_introspection_gc_roots\label_suffix:
+    BRBMI_RUNTIME_CALL
+    // Load the 16-bit LDR, or the half of the 32-bit LDR, that contains Rt.
+    BRBMI_LOAD_RETURN_REG_FROM_CODE\label_suffix \gc_root_ldr_offset
+    b       .Lmark_introspection_extract_register_and_return\label_suffix
+    // We've used 28 bytes since the "gc_roots" entrypoint (22 bytes for
+    // BRBMI_RUNTIME_CALL, 4 bytes for LDRH and 2 bytes for the branch). Squeeze
+    // the 6 byte forwarding address extraction here across the 32-byte boundary.
+    BRBMI_EXTRACT_FORWARDING_ADDRESS \label_suffix
+    // The slow path, taking exactly 30 bytes (6 bytes for the forwarding
+    // address check, 22 bytes for BRBMI_RUNTIME_CALL and 2 bytes for the near
+    // branch), takes the rest of the 32-byte section (within a cache line).
+    BRBMI_UNMARKED_FORWARDING_ADDRESS_CHECK \label_suffix
+    BRBMI_RUNTIME_CALL
+    b       .Lmark_introspection_return\label_suffix
 .endm
 
     /*
@@ -2249,14 +2313,16 @@
      *
      * The entrypoint is called through a thunk that differs across load kinds.
      * For field and array loads the LDR instruction in generated code follows
-     * the branch to the thunk, i.e. the LDR is at [LR, #(-4 - 1)] where the -1
-     * is an adjustment for the Thumb mode bit in LR, and the thunk knows the
-     * holder and performs the gray bit check, returning to the LDR instruction
-     * if the object is not gray, so this entrypoint no longer needs to know
-     * anything about the holder. For GC root loads, the LDR instruction in
-     * generated code precedes the branch to the thunk, i.e. the LDR is at
-     * [LR, #(-8 - 1)] where the -1 is again the Thumb mode bit adjustment, and
-     * the thunk does not do the gray bit check.
+     * the branch to the thunk, i.e. the LDR is (ignoring the heap poisoning)
+     * at [LR, #(-4 - 1)] (encoding T3) or [LR, #(-2 - 1)] (encoding T1) where
+     * the -1 is an adjustment for the Thumb mode bit in LR, and the thunk
+     * knows the holder and performs the gray bit check, returning to the LDR
+     * instruction if the object is not gray, so this entrypoint no longer
+     * needs to know anything about the holder. For GC root loads, the LDR
+     * instruction in generated code precedes the branch to the thunk, i.e. the
+     * LDR is at [LR, #(-8 - 1)] (encoding T3) or [LR, #(-6 - 1)] (encoding T1)
+     * where the -1 is again the Thumb mode bit adjustment, and the thunk does
+     * not do the gray bit check.
      *
      * For field accesses and array loads with a constant index the thunk loads
      * the reference into IP using introspection and calls the main entrypoint,
@@ -2288,11 +2354,29 @@
      *
      * The code structure is
      *   art_quick_read_barrier_mark_introspection:
-     *     Over 128 bytes for the main entrypoint code.
-     *     Padding to 192 bytes if needed.
-     *   art_quick_read_barrier_mark_introspection_gc_roots:
-     *     GC root entrypoint code.
-     *     Padding to 256 bytes if needed.
+     *     Up to 32 bytes of fast-path code for the main entrypoint, handling
+     *     fields (and array elements with constant offset) with LDR encoding
+     *     T3; jumps to the switch in the "narrow" entrypoint.
+     *     Padding to 32 bytes if needed.
+     *   art_quick_read_barrier_mark_introspection_narrow:
+     *     Up to 48 bytes of fast-path code for fields (and array elements
+     *     with constant offset) with LDR encoding T1, ending in the return
+     *     switch instruction TBB and the table with switch offsets.
+     *     Padding to 80 bytes if needed.
+     *   .Lmark_introspection_return_switch_case_r0:
+     *     Exactly 48 bytes of code for the return switch cases (12 cases,
+     *     including BKPT for the reserved registers).
+     *     Ends at 128 bytes total.
+     *   art_quick_read_barrier_mark_introspection_gc_roots_wide:
+     *     GC root entrypoint code for LDR encoding T3 (28 bytes).
+     *     Forwarding address extraction for LDR encoding T3 (6 bytes).
+     *     Slow path for main entrypoint for LDR encoding T3 (30 bytes).
+     *     Ends at 192 bytes total.
+     *   art_quick_read_barrier_mark_introspection_gc_roots_narrow:
+     *     GC root entrypoint code for LDR encoding T1 (28 bytes).
+     *     Forwarding address extraction for LDR encoding T1 (6 bytes).
+     *     Slow path for main entrypoint for LDR encoding T1 (30 bytes).
+     *     Ends at 256 bytes total.
      *   art_quick_read_barrier_mark_introspection_arrays:
      *     Exactly 128 bytes for array load switch cases (16x2 instructions).
      */
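
As a sanity check on the byte counts quoted in the layout comment above, the numbers add up to the entrypoint offsets defined in asm_support_arm.h. A standalone sketch (not part of the patch; only the constants and section sizes come from the document):

    // Illustrative only: checks that the described section sizes match the
    // BAKER_MARK_INTROSPECTION_* offsets.
    #include <cstdint>

    constexpr uint32_t kNarrowEntrypointOffset = 0x20;   // _narrow
    constexpr uint32_t kGcRootsWideOffset = 0x80;        // _gc_roots_wide
    constexpr uint32_t kGcRootsNarrowOffset = 0xc0;      // _gc_roots_narrow
    constexpr uint32_t kArraySwitchOffset = 0x100;       // _arrays

    // Wide fast path (up to 32 bytes, padded) ends where the narrow entrypoint begins.
    static_assert(kNarrowEntrypointOffset == 32, "wide fast-path section");
    // Narrow fast path (padded to 80 bytes) plus 48 bytes of return switch cases
    // (12 cases x 4 bytes) end at 128 bytes, where _gc_roots_wide starts.
    static_assert(80 + 12 * 4 == kGcRootsWideOffset, "fast paths + switch cases");
    // Each gc_roots section: 22-byte runtime call + 4-byte LDRH + 2-byte branch (28),
    // then 6 bytes of forwarding address extraction and a 30-byte slow path = 64.
    constexpr uint32_t kGcRootsSectionSize = (22 + 4 + 2) + 6 + (6 + 22 + 2);
    static_assert(kGcRootsSectionSize == 64, "gc_roots section size");
    static_assert(kGcRootsWideOffset + kGcRootsSectionSize == kGcRootsNarrowOffset, "wide section");
    static_assert(kGcRootsNarrowOffset + kGcRootsSectionSize == kArraySwitchOffset, "narrow section");

    int main() { return 0; }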
@@ -2302,17 +2386,30 @@
     // (R4 is reserved for the entrypoint address.)
     // For heap poisoning, the reference is poisoned, so unpoison it first.
     UNPOISON_HEAP_REF ip
-    // If reference is null, just return it in the right register.
-    cmp     ip, #0
-    beq     .Lmark_introspection_return
-    // Use R4 as temp and check the mark bit of the reference.
-    ldr     r4, [ip, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    tst     r4, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
-    beq     .Lmark_introspection_unmarked
-.Lmark_introspection_return:
-    // Load the half of the instruction that contains Rt. Adjust for the thumb state in LR.
-    ldrh    r4, [lr, #(-1 + BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET + 2)]
+    // Check for null or marked, lock word is loaded into R4.
+    BRBMI_CHECK_NULL_AND_MARKED _wide
+    // Load the half of the instruction that contains Rt.
+    BRBMI_LOAD_RETURN_REG_FROM_CODE_wide BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET
+.Lmark_introspection_extract_register_and_return_wide:
     lsr     r4, r4, #12               // Extract `ref_reg`.
+    b       .Lmark_introspection_return_switch
+
+    .balign 32
+    .thumb_func
+    .type art_quick_read_barrier_mark_introspection_narrow, #function
+    .hidden art_quick_read_barrier_mark_introspection_narrow
+    .global art_quick_read_barrier_mark_introspection_narrow
+art_quick_read_barrier_mark_introspection_narrow:
+    // At this point, IP contains the reference, R4 can be freely used.
+    // (R4 is reserved for the entrypoint address.)
+    // For heap poisoning, the reference is poisoned, so unpoison it first.
+    UNPOISON_HEAP_REF ip
+    // Check for null or marked, lock word is loaded into R4.
+    BRBMI_CHECK_NULL_AND_MARKED _narrow
+    // Load the 16-bit instruction.
+    BRBMI_LOAD_RETURN_REG_FROM_CODE_narrow BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+.Lmark_introspection_extract_register_and_return_narrow:
+    and     r4, r4, #7                // Extract `ref_reg`.
 .Lmark_introspection_return_switch:
     tbb     [pc, r4]                  // Jump to the switch case.
 .Lmark_introspection_return_table:
@@ -2320,32 +2417,8 @@
     .balign 16
     BRBMI_FOR_12_REGISTERS BRBMI_RETURN_SWITCH_CASE, BRBMI_BAD_RETURN_SWITCH_CASE
 
-    .balign 16
-.Lmark_introspection_unmarked:
-    // Check if the top two bits are one, if this is the case it is a forwarding address.
-#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3)
-    // To use "CMP ip, #modified-immediate; BHS", we need the lock word state in
-    // the highest bits and the "forwarding address" state to have all bits set.
-#error "Unexpected lock word state shift or forwarding address state value."
-#endif
-    cmp     r4, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT)
-    bhs     .Lmark_introspection_forwarding_address
-    BRBMI_SLOW_PATH BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET
-
-    .balign 8
-.Lmark_introspection_forwarding_address:
-    // Shift left by the forwarding address shift. This clears out the state bits since they are
-    // in the top 2 bits of the lock word.
-    lsl     ip, r4, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
-    b       .Lmark_introspection_return
-
-    .balign 64
-    .thumb_func
-    .type art_quick_read_barrier_mark_introspection_gc_roots, #function
-    .hidden art_quick_read_barrier_mark_introspection_gc_roots
-    .global art_quick_read_barrier_mark_introspection_gc_roots
-art_quick_read_barrier_mark_introspection_gc_roots:
-    BRBMI_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET
+    BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET, _wide
+    BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET, _narrow
 
     .balign 256
     .thumb_func
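
The two BRBMI_LOAD_RETURN_REG_FROM_CODE variants above recover the destination register Rt of the LDR that triggered the read barrier: the wide path reads the second halfword of the 32-bit encoding T3 (Rt in bits 15:12, hence the `lsr r4, r4, #12`), while the narrow path reads the whole 16-bit encoding T1 (Rt in bits 2:0, hence the `and r4, r4, #7`). A standalone sketch of that extraction; the function names and example halfwords are illustrative, not ART code:

    // Illustrative only: mirrors the register extraction done by the introspection
    // entrypoints. `halfword` is the 16 bits loaded by LDRH at the width-specific
    // offset from the return address in LR.
    #include <cassert>
    #include <cstdint>

    // Wide path: second halfword of the 32-bit LDR (encoding T3), Rt in bits 15:12.
    uint32_t ExtractRtWide(uint16_t halfword) { return halfword >> 12; }

    // Narrow path: the whole 16-bit LDR (encoding T1), Rt in bits 2:0.
    uint32_t ExtractRtNarrow(uint16_t halfword) { return halfword & 7u; }

    int main() {
      // Second halfword of "ldr r3, [r0, #8]" in encoding T3: Rt=3, imm12=8 -> 0x3008.
      assert(ExtractRtWide(0x3008) == 3u);
      // 16-bit "ldr r3, [r0, #8]" in encoding T1: 0x6883 -> Rt=3.
      assert(ExtractRtNarrow(0x6883) == 3u);
      return 0;
    }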
diff --git a/runtime/oat.h b/runtime/oat.h
index a38eebc..e119b81 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '1', '2', '4', '\0' };  // New compiler filter names.
+  static constexpr uint8_t kOatVersion[] = { '1', '2', '5', '\0' };  // ARM Baker narrow thunks.
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";