ARM: Use rMR for Baker RB introspection marking.

The marking register (r8 on ARM) is known to hold 1 when
entering the introspection marking entrypoint, so we can
clobber it, use it as a temporary register (instead of r4)
in the runtime entrypoint, and reload the 1 before
returning. The immediate benefits are minor (see the boot
image size below), but this opens the door to further
improvements; for example, we could change rMR to r4, which
would shrink every marking register check by 2 bytes.
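
For illustration, each return switch case in the runtime entrypoint
now restores the marking register before returning; a sketch of the
BRBMI_RETURN_SWITCH_CASE expansion for r0 (per the macro updated in
quick_entrypoints_arm.S below):

    .balign 8                   @ each switch case occupies an 8-byte slot
.Lmark_introspection_return_switch_case_r0:
    mov     rMR, #1             @ reload the marking register (r8) with 1
    mov     r0, ip              @ move the marked reference to the return register
    bx      lr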

ARM boot image (boot*.oat) size in aosp_taimen-userdebug:
  - before: 17861724
  - after: 17858088 (-3636)

Test: Pixel 2 XL boots.
Test: m test-art-host-gtest
Test: testrunner.py --target --optimizing --32
Test: Repeat the above tests with heap poisoning enabled.
Bug: 36141117
Change-Id: I0f625dec3a6b3ee1786f7e5f4377be42b9bc37d3
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 7350b14..58ce9aa 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -107,16 +107,6 @@
 // Marker that code is yet to be, and must, be implemented.
 #define TODO_VIXL32(level) LOG(level) << __PRETTY_FUNCTION__ << " unimplemented "
 
-static inline void ExcludeIPAndBakerCcEntrypointRegister(UseScratchRegisterScope* temps,
-                                                         HInstruction* instruction) {
-  DCHECK(temps->IsAvailable(ip));
-  temps->Exclude(ip);
-  DCHECK(!temps->IsAvailable(kBakerCcEntrypointRegister));
-  DCHECK_NE(instruction->GetLocations()->GetTempCount(), 0u);
-  DCHECK(RegisterFrom(instruction->GetLocations()->GetTemp(
-      instruction->GetLocations()->GetTempCount() - 1u)).Is(kBakerCcEntrypointRegister));
-}
-
 static inline void EmitPlaceholderBne(CodeGeneratorARMVIXL* codegen, vixl32::Label* patch_label) {
   ExactAssemblyScope eas(codegen->GetVIXLAssembler(), kMaxInstructionSizeInBytes);
   __ bind(patch_label);
@@ -5973,8 +5963,6 @@
       if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) {
         locations->AddTemp(Location::RequiresRegister());
       }
-      // And we always need the reserved entrypoint register.
-      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
     } else {
       locations->AddTemp(Location::RequiresRegister());
     }
@@ -6087,11 +6075,11 @@
     case DataType::Type::kReference: {
       // /* HeapReference<Object> */ out = *(base + offset)
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp_loc = locations->GetTemp(0);
+        Location maybe_temp = (locations->GetTempCount() != 0) ? locations->GetTemp(0) : Location();
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier call.
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            instruction, out, base, offset, temp_loc, /* needs_null_check */ true);
+            instruction, out, base, offset, maybe_temp, /* needs_null_check */ true);
         if (is_volatile) {
           codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
         }
@@ -6390,8 +6378,6 @@
         object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
   if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier.
     if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
         !Runtime::Current()->UseJitCompilation() &&
         instruction->GetIndex()->IsConstant()) {
@@ -6404,16 +6390,10 @@
       if (offset >= kReferenceLoadMinFarOffset) {
         locations->AddTemp(Location::RequiresRegister());
       }
-      // And we always need the reserved entrypoint register.
-      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
-    } else if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
-               !Runtime::Current()->UseJitCompilation() &&
-               !instruction->GetIndex()->IsConstant()) {
-      // We need a non-scratch temporary for the array data pointer.
-      locations->AddTemp(Location::RequiresRegister());
-      // And we always need the reserved entrypoint register.
-      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
     } else {
+      // If using introspection, we need a non-scratch temporary for the array data pointer.
+      // Otherwise, we need a temporary register for the read barrier marking slow
+      // path in CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier.
       locations->AddTemp(Location::RequiresRegister());
     }
   } else if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
@@ -6526,20 +6506,22 @@
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier call.
         DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
         if (index.IsConstant()) {
           // Array load with a constant index can be treated as a field load.
+          Location maybe_temp =
+              (locations->GetTempCount() != 0) ? locations->GetTemp(0) : Location();
           data_offset += Int32ConstantFrom(index) << DataType::SizeShift(type);
           codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
                                                           out_loc,
                                                           obj,
                                                           data_offset,
-                                                          locations->GetTemp(0),
+                                                          maybe_temp,
                                                           /* needs_null_check */ false);
         } else {
+          Location temp = locations->GetTemp(0);
           codegen_->GenerateArrayLoadWithBakerReadBarrier(
               instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ false);
         }
@@ -7447,13 +7429,6 @@
       // For non-Baker read barrier we have a temp-clobbering call.
     }
   }
-  if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
-    if (load_kind == HLoadClass::LoadKind::kBssEntry ||
-        (load_kind == HLoadClass::LoadKind::kReferrersClass &&
-            !Runtime::Current()->UseJitCompilation())) {
-      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
-    }
-  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
@@ -7687,9 +7662,6 @@
         // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK()
         // that the the kPrimNot result register is the same as the first argument register.
         locations->SetCustomSlowPathCallerSaves(caller_saves);
-        if (kUseBakerReadBarrier && kBakerReadBarrierLinkTimeThunksEnableForGcRoots) {
-          locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
-        }
       } else {
         // For non-Baker read barrier we have a temp-clobbering call.
       }
@@ -7866,9 +7838,6 @@
   // Note that TypeCheckSlowPathARM uses this register too.
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
   locations->AddRegisterTemps(NumberOfInstanceOfTemps(type_check_kind));
-  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    codegen_->MaybeAddBakerCcEntrypointTempForFields(locations);
-  }
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitInstanceOf(HInstanceOf* instruction) {
@@ -8829,7 +8798,7 @@
         //   return_address:
 
         UseScratchRegisterScope temps(GetVIXLAssembler());
-        ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+        temps.Exclude(ip);
         bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
         uint32_t custom_data = EncodeBakerReadBarrierGcRootData(root_reg.GetCode(), narrow);
         vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
@@ -8897,16 +8866,6 @@
   MaybeGenerateMarkingRegisterCheck(/* code */ 18);
 }
 
-void CodeGeneratorARMVIXL::MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations) {
-  DCHECK(kEmitCompilerReadBarrier);
-  DCHECK(kUseBakerReadBarrier);
-  if (kBakerReadBarrierLinkTimeThunksEnableForFields) {
-    if (!Runtime::Current()->UseJitCompilation()) {
-      locations->AddTemp(Location::RegisterLocation(kBakerCcEntrypointRegister.GetCode()));
-    }
-  }
-}
-
 void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                  Location ref,
                                                                  vixl32::Register obj,
@@ -8944,7 +8903,6 @@
     vixl32::Register base = obj;
     if (offset >= kReferenceLoadMinFarOffset) {
       base = RegisterFrom(temp);
-      DCHECK(!base.Is(kBakerCcEntrypointRegister));
       static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
       __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
       offset &= (kReferenceLoadMinFarOffset - 1u);
@@ -8954,7 +8912,7 @@
       DCHECK(!narrow);
     }
     UseScratchRegisterScope temps(GetVIXLAssembler());
-    ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+    temps.Exclude(ip);
     uint32_t custom_data = EncodeBakerReadBarrierFieldData(base.GetCode(), obj.GetCode(), narrow);
     vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
@@ -9037,10 +8995,9 @@
     vixl32::Register index_reg = RegisterFrom(index, DataType::Type::kInt32);
     vixl32::Register ref_reg = RegisterFrom(ref, DataType::Type::kReference);
     vixl32::Register data_reg = RegisterFrom(temp, DataType::Type::kInt32);  // Raw pointer.
-    DCHECK(!data_reg.Is(kBakerCcEntrypointRegister));
 
     UseScratchRegisterScope temps(GetVIXLAssembler());
-    ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
+    temps.Exclude(ip);
     uint32_t custom_data = EncodeBakerReadBarrierArrayData(data_reg.GetCode());
     vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
@@ -9927,16 +9884,16 @@
 }
 
 // Load the read barrier introspection entrypoint in register `entrypoint`
-static void LoadReadBarrierMarkIntrospectionEntrypoint(ArmVIXLAssembler& assembler,
-                                                       vixl32::Register entrypoint) {
+static vixl32::Register LoadReadBarrierMarkIntrospectionEntrypoint(ArmVIXLAssembler& assembler) {
   // The register where the read barrier introspection entrypoint is loaded
-  // is fixed: `kBakerCcEntrypointRegister` (R4).
-  DCHECK(entrypoint.Is(kBakerCcEntrypointRegister));
+  // is the marking register. We clobber it here and the entrypoint restores it to 1.
+  vixl32::Register entrypoint = mr;
   // entrypoint = Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
   DCHECK_EQ(ip.GetCode(), 12u);
   const int32_t entry_point_offset =
       Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
   __ Ldr(entrypoint, MemOperand(tr, entry_point_offset));
+  return entrypoint;
 }
 
 void CodeGeneratorARMVIXL::CompileBakerReadBarrierThunk(ArmVIXLAssembler& assembler,
@@ -9975,8 +9932,7 @@
       __ Bind(&slow_path);
       const int32_t ldr_offset = /* Thumb state adjustment (LR contains Thumb state). */ -1 +
                                  raw_ldr_offset;
-      vixl32::Register ep_reg(kBakerCcEntrypointRegister);
-      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ep_reg);
+      vixl32::Register ep_reg = LoadReadBarrierMarkIntrospectionEntrypoint(assembler);
       if (width == BakerReadBarrierWidth::kWide) {
         MemOperand ldr_half_address(lr, ldr_offset + 2);
         __ Ldrh(ip, ldr_half_address);        // Load the LDR immediate half-word with "Rt | imm12".
@@ -10016,8 +9972,7 @@
       MemOperand ldr_address(lr, ldr_offset + 2);
       __ Ldrb(ip, ldr_address);               // Load the LDR (register) byte with "00 | imm2 | Rm",
                                               // i.e. Rm+32 because the scale in imm2 is 2.
-      vixl32::Register ep_reg(kBakerCcEntrypointRegister);
-      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ep_reg);
+      vixl32::Register ep_reg = LoadReadBarrierMarkIntrospectionEntrypoint(assembler);
       __ Bfi(ep_reg, ip, 3, 6);               // Insert ip to the entrypoint address to create
                                               // a switch case target based on the index register.
       __ Mov(ip, base_reg);                   // Move the base register to ip0.
@@ -10050,8 +10005,7 @@
                     " the highest bits and the 'forwarding address' state to have all bits set");
       __ Cmp(ip, Operand(0xc0000000));
       __ B(hs, &forwarding_address);
-      vixl32::Register ep_reg(kBakerCcEntrypointRegister);
-      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ep_reg);
+      vixl32::Register ep_reg = LoadReadBarrierMarkIntrospectionEntrypoint(assembler);
       // Adjust the art_quick_read_barrier_mark_introspection address in kBakerCcEntrypointRegister
       // to art_quick_read_barrier_mark_introspection_gc_roots.
       int32_t entrypoint_offset = (width == BakerReadBarrierWidth::kWide)
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 6b9919a..d5b739b 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -113,9 +113,6 @@
 static const size_t kRuntimeParameterFpuRegistersLengthVIXL =
     arraysize(kRuntimeParameterFpuRegistersVIXL);
 
-// The reserved entrypoint register for link-time generated thunks.
-const vixl::aarch32::Register kBakerCcEntrypointRegister = vixl32::r4;
-
 class LoadClassSlowPathARMVIXL;
 class CodeGeneratorARMVIXL;
 
@@ -611,10 +608,6 @@
 
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
-  // Maybe add the reserved entrypoint register as a temporary for field load. This temp
-  // is added only for AOT compilation if link-time generated thunks for fields are enabled.
-  void MaybeAddBakerCcEntrypointTempForFields(LocationSummary* locations);
-
   // Generate a GC root reference load:
   //
   //   root <- *(obj + offset)
@@ -816,7 +809,7 @@
                kBitsForBakerReadBarrierWidth>;
 
   static void CheckValidReg(uint32_t reg) {
-    DCHECK(reg < vixl::aarch32::ip.GetCode() && reg != kBakerCcEntrypointRegister.GetCode()) << reg;
+    DCHECK(reg < vixl::aarch32::ip.GetCode() && reg != mr.GetCode()) << reg;
   }
 
   static uint32_t EncodeBakerReadBarrierFieldData(uint32_t base_reg,
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 29aecbc..5287b4b 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -1802,8 +1802,6 @@
     // is clobbered by ReadBarrierMarkRegX entry points). Get an extra
     // temporary register from the register allocator.
     locations->AddTemp(Location::RequiresRegister());
-    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen_);
-    arm_codegen->MaybeAddBakerCcEntrypointTempForFields(locations);
   }
 }
 
diff --git a/dex2oat/linker/arm/relative_patcher_thumb2_test.cc b/dex2oat/linker/arm/relative_patcher_thumb2_test.cc
index e7b11bd..3fe97e1 100644
--- a/dex2oat/linker/arm/relative_patcher_thumb2_test.cc
+++ b/dex2oat/linker/arm/relative_patcher_thumb2_test.cc
@@ -625,18 +625,23 @@
   ASSERT_LT(GetMethodOffset(1u), 0xfcu);
 }
 
+const uint32_t kBakerValidRegs[] = {
+    0,  1,  2,  3,  4,  5,  6,  7,
+    9, 10, 11,                      // r8 (rMR), IP, SP, LR and PC are reserved.
+};
+
+const uint32_t kBakerValidRegsNarrow[] = {
+    0,  1,  2,  3,  4,  5,  6,  7,
+};
+
 void Thumb2RelativePatcherTest::TestBakerFieldWide(uint32_t offset, uint32_t ref_reg) {
-  uint32_t valid_regs[] = {
-      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
-      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
-  };
   DCHECK_ALIGNED(offset, 4u);
   DCHECK_LT(offset, 4 * KB);
   constexpr size_t kMethodCodeSize = 8u;
   constexpr size_t kLiteralOffset = 0u;
   uint32_t method_idx = 0u;
-  for (uint32_t base_reg : valid_regs) {
-    for (uint32_t holder_reg : valid_regs) {
+  for (uint32_t base_reg : kBakerValidRegs) {
+    for (uint32_t holder_reg : kBakerValidRegs) {
       uint32_t ldr = kLdrWInsn | offset | (base_reg << 16) | (ref_reg << 12);
       const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr});
       ASSERT_EQ(kMethodCodeSize, raw_code.size());
@@ -655,8 +660,8 @@
   // All thunks are at the end.
   uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
   method_idx = 0u;
-  for (uint32_t base_reg : valid_regs) {
-    for (uint32_t holder_reg : valid_regs) {
+  for (uint32_t base_reg : kBakerValidRegs) {
+    for (uint32_t holder_reg : kBakerValidRegs) {
       ++method_idx;
       uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
       uint32_t ldr = kLdrWInsn | offset | (base_reg << 16) | (ref_reg << 12);
@@ -725,20 +730,16 @@
 }
 
 void Thumb2RelativePatcherTest::TestBakerFieldNarrow(uint32_t offset, uint32_t ref_reg) {
-  uint32_t valid_regs[] = {
-      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
-      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
-  };
   DCHECK_ALIGNED(offset, 4u);
   DCHECK_LT(offset, 32u);
   constexpr size_t kMethodCodeSize = 6u;
   constexpr size_t kLiteralOffset = 0u;
   uint32_t method_idx = 0u;
-  for (uint32_t base_reg : valid_regs) {
+  for (uint32_t base_reg : kBakerValidRegs) {
     if (base_reg >= 8u) {
       continue;
     }
-    for (uint32_t holder_reg : valid_regs) {
+    for (uint32_t holder_reg : kBakerValidRegs) {
       uint32_t ldr = kLdrInsn | (offset << (6 - 2)) | (base_reg << 3) | ref_reg;
       const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr});
       ASSERT_EQ(kMethodCodeSize, raw_code.size());
@@ -757,11 +758,11 @@
   // All thunks are at the end.
   uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
   method_idx = 0u;
-  for (uint32_t base_reg : valid_regs) {
+  for (uint32_t base_reg : kBakerValidRegs) {
     if (base_reg >= 8u) {
       continue;
     }
-    for (uint32_t holder_reg : valid_regs) {
+    for (uint32_t holder_reg : kBakerValidRegs) {
       ++method_idx;
       uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
       uint32_t ldr = kLdrInsn | (offset << (6 - 2)) | (base_reg << 3) | ref_reg;
@@ -1021,10 +1022,6 @@
 }
 
 TEST_F(Thumb2RelativePatcherTest, BakerArray) {
-  uint32_t valid_regs[] = {
-      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
-      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
-  };
   auto ldr = [](uint32_t base_reg) {
     uint32_t index_reg = (base_reg == 0u) ? 1u : 0u;
     uint32_t ref_reg = (base_reg == 2) ? 3u : 2u;
@@ -1033,7 +1030,7 @@
   constexpr size_t kMethodCodeSize = 8u;
   constexpr size_t kLiteralOffset = 0u;
   uint32_t method_idx = 0u;
-  for (uint32_t base_reg : valid_regs) {
+  for (uint32_t base_reg : kBakerValidRegs) {
     ++method_idx;
     const std::vector<uint8_t> raw_code = RawCode({kBneWPlus0, ldr(base_reg)});
     ASSERT_EQ(kMethodCodeSize, raw_code.size());
@@ -1049,7 +1046,7 @@
   // All thunks are at the end.
   uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
   method_idx = 0u;
-  for (uint32_t base_reg : valid_regs) {
+  for (uint32_t base_reg : kBakerValidRegs) {
     ++method_idx;
     uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
     const std::vector<uint8_t> expected_code = RawCode({bne, ldr(base_reg)});
@@ -1106,14 +1103,10 @@
 }
 
 TEST_F(Thumb2RelativePatcherTest, BakerGcRootWide) {
-  uint32_t valid_regs[] = {
-      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
-      8,  9, 10, 11,                  // IP, SP, LR and PC are reserved.
-  };
   constexpr size_t kMethodCodeSize = 8u;
   constexpr size_t kLiteralOffset = 4u;
   uint32_t method_idx = 0u;
-  for (uint32_t root_reg : valid_regs) {
+  for (uint32_t root_reg : kBakerValidRegs) {
     ++method_idx;
     uint32_t ldr = kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (root_reg << 12);
     const std::vector<uint8_t> raw_code = RawCode({ldr, kBneWPlus0});
@@ -1130,7 +1123,7 @@
   // All thunks are at the end.
   uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
   method_idx = 0u;
-  for (uint32_t root_reg : valid_regs) {
+  for (uint32_t root_reg : kBakerValidRegs) {
     ++method_idx;
     uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
     uint32_t ldr = kLdrWInsn | (/* offset */ 8) | (/* base_reg */ 0 << 16) | (root_reg << 12);
@@ -1165,14 +1158,10 @@
 }
 
 TEST_F(Thumb2RelativePatcherTest, BakerGcRootNarrow) {
-  uint32_t valid_regs[] = {
-      0,  1,  2,  3,      5,  6,  7,  // R4 is reserved for entrypoint address.
-                                      // Not appplicable to high registers.
-  };
   constexpr size_t kMethodCodeSize = 6u;
   constexpr size_t kLiteralOffset = 2u;
   uint32_t method_idx = 0u;
-  for (uint32_t root_reg : valid_regs) {
+  for (uint32_t root_reg : kBakerValidRegsNarrow) {
     ++method_idx;
     uint32_t ldr = kLdrInsn | (/* offset */ 8 << (6 - 2)) | (/* base_reg */ 0 << 3) | root_reg;
     const std::vector<uint8_t> raw_code = RawCode({ldr, kBneWPlus0});
@@ -1189,7 +1178,7 @@
   // All thunks are at the end.
   uint32_t thunk_offset = GetMethodOffset(method_idx) + RoundUp(kMethodCodeSize, kArmAlignment);
   method_idx = 0u;
-  for (uint32_t root_reg : valid_regs) {
+  for (uint32_t root_reg : kBakerValidRegsNarrow) {
     ++method_idx;
     uint32_t bne = BneWWithOffset(GetMethodOffset(method_idx) + kLiteralOffset, thunk_offset);
     uint32_t ldr = kLdrInsn | (/* offset */ 8 << (6 - 2)) | (/* base_reg */ 0 << 3) | root_reg;
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index ac17303..7123ae7 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -32,8 +32,8 @@
 #define BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET 0x20
 // The offsets from art_quick_read_barrier_mark_introspection to the GC root entrypoints,
 // i.e. art_quick_read_barrier_mark_introspection_gc_roots_{wide,narrow}.
-#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET 0x80
-#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET 0xc0
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET 0xc0
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET 0xe0
 // The offset from art_quick_read_barrier_mark_introspection to the array switch cases,
 // i.e. art_quick_read_barrier_mark_introspection_arrays.
 #define BAKER_MARK_INTROSPECTION_ARRAY_SWITCH_OFFSET 0x100
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 0fd239a..526960b 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -2362,23 +2362,19 @@
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, r11
 
 // Helper macros for Baker CC read barrier mark introspection (BRBMI).
-.macro BRBMI_FOR_12_REGISTERS macro_for_register, macro_for_reserved_register
+.macro BRBMI_FOR_REGISTERS macro_for_register, macro_for_reserved_register
     \macro_for_register r0
     \macro_for_register r1
     \macro_for_register r2
     \macro_for_register r3
-    \macro_for_reserved_register  // R4 is reserved for the entrypoint address.
+    \macro_for_register r4
     \macro_for_register r5
     \macro_for_register r6
     \macro_for_register r7
-    \macro_for_register r8
+    \macro_for_reserved_register  // r8 (rMR) is the marking register.
     \macro_for_register r9
     \macro_for_register r10
     \macro_for_register r11
-.endm
-
-.macro BRBMI_FOR_REGISTERS macro_for_register, macro_for_reserved_register
-    BRBMI_FOR_12_REGISTERS \macro_for_register, \macro_for_reserved_register
     \macro_for_reserved_register  // IP is reserved.
     \macro_for_reserved_register  // SP is reserved.
     \macro_for_reserved_register  // LR is reserved.
@@ -2386,16 +2382,13 @@
 .endm
 
 .macro BRBMI_RETURN_SWITCH_CASE reg
+    .balign 8
 .Lmark_introspection_return_switch_case_\reg:
+    mov     rMR, #1
     mov     \reg, ip
     bx      lr
 .endm
 
-.macro BRBMI_BAD_RETURN_SWITCH_CASE
-.Lmark_introspection_return_switch_case_bad:
-    BRBMI_BKPT_FILL_4B
-.endm
-
 .macro BRBMI_RETURN_SWITCH_CASE_OFFSET reg
     .byte   (.Lmark_introspection_return_switch_case_\reg - .Lmark_introspection_return_table) / 2
 .endm
@@ -2458,9 +2451,9 @@
     // If reference is null, just return it in the right register.
     cmp     ip, #0
     beq     .Lmark_introspection_return\label_suffix
-    // Use R4 as temp and check the mark bit of the reference.
-    ldr     r4, [ip, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    tst     r4, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    // Use rMR as temp and check the mark bit of the reference.
+    ldr     rMR, [ip, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tst     rMR, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
     beq     .Lmark_introspection_unmarked\label_suffix
 .Lmark_introspection_return\label_suffix:
 .endm
@@ -2473,7 +2466,7 @@
     // the highest bits and the "forwarding address" state to have all bits set.
 #error "Unexpected lock word state shift or forwarding address state value."
 #endif
-    cmp     r4, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT)
+    cmp     rMR, #(LOCK_WORD_STATE_FORWARDING_ADDRESS << LOCK_WORD_STATE_SHIFT)
     bhs     .Lmark_introspection_forwarding_address\label_suffix
 .endm
 
@@ -2483,41 +2476,50 @@
 
     // Shift left by the forwarding address shift. This clears out the state bits since they are
     // in the top 2 bits of the lock word.
-    lsl     ip, r4, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    lsl     ip, rMR, #LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
     b       .Lmark_introspection_return\label_suffix
 .endm
 
 .macro BRBMI_LOAD_RETURN_REG_FROM_CODE_wide ldr_offset
     // Load the half of the instruction that contains Rt. Adjust for the thumb state in LR.
-    ldrh    r4, [lr, #(-1 + \ldr_offset + 2)]
+    ldrh    rMR, [lr, #(-1 + \ldr_offset + 2)]
 .endm
 
 .macro BRBMI_LOAD_RETURN_REG_FROM_CODE_narrow ldr_offset
     // Load the 16-bit instruction. Adjust for the thumb state in LR.
-    ldrh    r4, [lr, #(-1 + \ldr_offset)]
+    ldrh    rMR, [lr, #(-1 + \ldr_offset)]
 .endm
 
-.macro BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH gc_root_ldr_offset, label_suffix
-    .balign 64
+.macro BRBMI_EXTRACT_RETURN_REG_wide
+    lsr     rMR, rMR, #12             // Extract `ref_reg`.
+.endm
+
+.macro BRBMI_EXTRACT_RETURN_REG_narrow
+    and     rMR, rMR, #7              // Extract `ref_reg`.
+.endm
+
+.macro BRBMI_LOAD_AND_EXTRACT_RETURN_REG ldr_offset, label_suffix
+    BRBMI_LOAD_RETURN_REG_FROM_CODE\label_suffix \ldr_offset
+    BRBMI_EXTRACT_RETURN_REG\label_suffix
+.endm
+
+.macro BRBMI_GC_ROOT gc_root_ldr_offset, label_suffix
+    .balign 32
     .thumb_func
     .type art_quick_read_barrier_mark_introspection_gc_roots\label_suffix, #function
     .hidden art_quick_read_barrier_mark_introspection_gc_roots\label_suffix
     .global art_quick_read_barrier_mark_introspection_gc_roots\label_suffix
 art_quick_read_barrier_mark_introspection_gc_roots\label_suffix:
-    BRBMI_RUNTIME_CALL
-    // Load the LDR (or the half of it) that contains Rt.
-    BRBMI_LOAD_RETURN_REG_FROM_CODE\label_suffix \gc_root_ldr_offset
-    b       .Lmark_introspection_extract_register_and_return\label_suffix
-    // We've used 28 bytes since the "gc_roots" entrypoint (22 bytes for
-    // BRBMI_RUNTIME_CALL, 4 bytes for LDRH and 2 bytes for the branch). Squeeze
-    // the 6 byte forwarding address extraction here across the 32-byte boundary.
-    BRBMI_EXTRACT_FORWARDING_ADDRESS \label_suffix
-    // And the slow path taking exactly 30 bytes (6 bytes for the forwarding
-    // address check, 22 bytes for BRBMI_RUNTIME_CALL and 2 bytes for the near
-    // branch) shall take the rest of the 32-byte section (within a cache line).
+    BRBMI_LOAD_AND_EXTRACT_RETURN_REG \gc_root_ldr_offset, \label_suffix
+.endm
+
+.macro BRBMI_FIELD_SLOW_PATH ldr_offset, label_suffix
+    .balign 16
+.Lmark_introspection_unmarked\label_suffix:
+    // Note: Generates exactly 16 bytes of code.
     BRBMI_UNMARKED_FORWARDING_ADDRESS_CHECK \label_suffix
-    BRBMI_RUNTIME_CALL
-    b       .Lmark_introspection_return\label_suffix
+    BRBMI_LOAD_AND_EXTRACT_RETURN_REG \ldr_offset, \label_suffix
+    b .Lmark_introspection_runtime_call
 .endm
 
     /*
@@ -2540,9 +2542,12 @@
      * not do the gray bit check.
      *
      * For field accesses and array loads with a constant index the thunk loads
-     * the reference into IP using introspection and calls the main entrypoint,
-     * art_quick_read_barrier_mark_introspection. With heap poisoning enabled,
-     * the passed reference is poisoned.
+     * the reference into IP using introspection and calls the main entrypoint
+     * ("wide", for 32-bit LDR) art_quick_read_barrier_mark_introspection or
+     * the "narrow" entrypoint (for 16-bit LDR). The latter is at a known
+     * offset (BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_ENTRYPOINT_OFFSET)
+     * from the main entrypoint and the thunk adjusts the entrypoint pointer.
+     * With heap poisoning enabled, the passed reference is poisoned.
      *
      * For array accesses with non-constant index, the thunk inserts the bits
      * 0-5 of the LDR instruction to the entrypoint address, effectively
@@ -2560,53 +2565,61 @@
      * (And even with heap poisoning enabled, GC roots are not poisoned.)
      * To re-use the same entrypoint pointer in generated code, we make sure
      * that the gc root entrypoint (a copy of the entrypoint with a different
-     * offset for introspection loads) is located at a known offset (128 bytes,
-     * or BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET) from the main
-     * entrypoint and the GC root thunk adjusts the entrypoint pointer, moves
-     * the root register to IP and jumps to the customized entrypoint,
-     * art_quick_read_barrier_mark_introspection_gc_roots. The thunk also
-     * performs all the fast-path checks, so we need just the slow path.
+     * offset for introspection loads) is located at a known offset (0xc0/0xe0
+     * bytes, or BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET/
+     * BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET) from the
+     * main entrypoint and the GC root thunk adjusts the entrypoint pointer,
+     * moves the root register to IP and jumps to the customized entrypoint,
+     * art_quick_read_barrier_mark_introspection_gc_roots_{wide,narrow}.
+     * The thunk also performs all the fast-path checks, so we need just the
+     * slow path.
      *
      * The code structure is
-     *   art_quick_read_barrier_mark_introspection:
+     *   art_quick_read_barrier_mark_introspection:                   // @0x00
      *     Up to 32 bytes code for main entrypoint fast-path code for fields
      *     (and array elements with constant offset) with LDR encoding T3;
      *     jumps to the switch in the "narrow" entrypoint.
-     *     Padding to 32 bytes if needed.
-     *   art_quick_read_barrier_mark_introspection_narrow:
+     *   art_quick_read_barrier_mark_introspection_narrow:            // @0x20
      *     Up to 48 bytes code for fast path code for fields (and array
      *     elements with constant offset) with LDR encoding T1, ending in the
      *     return switch instruction TBB and the table with switch offsets.
-     *     Padding to 80 bytes if needed.
-     *   .Lmark_introspection_return_switch_case_r0:
-     *     Exactly 48 bytes of code for the return switch cases (12 cases,
-     *     including BKPT for the reserved registers).
-     *     Ends at 128 bytes total.
-     *   art_quick_read_barrier_mark_introspection_gc_roots_wide:
-     *     GC root entrypoint code for LDR encoding T3 (28 bytes).
-     *     Forwarding address extraction for LDR encoding T3 (6 bytes).
-     *     Slow path for main entrypoint for LDR encoding T3 (30 bytes).
-     *     Ends at 192 bytes total.
-     *   art_quick_read_barrier_mark_introspection_gc_roots_narrow:
-     *     GC root entrypoint code for LDR encoding T1 (28 bytes).
-     *     Forwarding address extraction for LDR encoding T1 (6 bytes).
-     *     Slow path for main entrypoint for LDR encoding T1 (30 bytes).
-     *     Ends at 256 bytes total.
-     *   art_quick_read_barrier_mark_introspection_arrays:
+     *   .Lmark_introspection_return_switch_case_r0:                  // @0x50
+     *     Exactly 88 bytes of code for the return switch cases (8 bytes per
+     *     case, 11 cases; no code for reserved registers).
+     *   .Lmark_introspection_forwarding_address_narrow:              // @0xa8
+     *     Exactly 6 bytes to extract the forwarding address and jump to the
+     *     "narrow" entrypoint fast path.
+     *   .Lmark_introspection_return_switch_case_bad:                 // @0xae
+     *     Exactly 2 bytes, bkpt for unexpected return register.
+     *   .Lmark_introspection_unmarked_narrow:                        // @0xb0
+     *     Exactly 16 bytes for "narrow" entrypoint slow path.
+     *   art_quick_read_barrier_mark_introspection_gc_roots_wide:     // @0xc0
+     *     GC root entrypoint code for LDR encoding T3 (10 bytes); loads and
+     *     extracts the return register and jumps to the runtime call.
+     *   .Lmark_introspection_forwarding_address_wide:                // @0xca
+     *     Exactly 6 bytes to extract the forwarding address and jump to the
+     *     "wide" entrypoint fast path.
+     *   .Lmark_introspection_unmarked_wide:                          // @0xd0
+     *     Exactly 16 bytes for "wide" entrypoint slow path.
+     *   art_quick_read_barrier_mark_introspection_gc_roots_narrow:   // @0xe0
+     *     GC root entrypoint code for LDR encoding T1 (8 bytes); loads and
+     *     extracts the return register and falls through to the runtime call.
+     *   .Lmark_introspection_runtime_call:                           // @0xe8
+     *     Exactly 24 bytes for the runtime call to MarkReg() and jump to the
+     *     return switch.
+     *   art_quick_read_barrier_mark_introspection_arrays:            // @0x100
      *     Exactly 128 bytes for array load switch cases (16x2 instructions).
      */
     .balign 512
 ENTRY art_quick_read_barrier_mark_introspection
-    // At this point, IP contains the reference, R4 can be freely used.
-    // (R4 is reserved for the entrypoint address.)
+    // At this point, IP contains the reference, rMR is clobbered by the thunk
+    // and can be freely used as it will be set back to 1 before returning.
     // For heap poisoning, the reference is poisoned, so unpoison it first.
     UNPOISON_HEAP_REF ip
-    // Check for null or marked, lock word is loaded into IP.
+    // Check for null or marked, lock word is loaded into rMR.
     BRBMI_CHECK_NULL_AND_MARKED _wide
-    // Load the half of the instruction that contains Rt.
-    BRBMI_LOAD_RETURN_REG_FROM_CODE_wide BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET
-.Lmark_introspection_extract_register_and_return_wide:
-    lsr     r4, r4, #12               // Extract `ref_reg`.
+    // Load and extract the return register from the instruction.
+    BRBMI_LOAD_AND_EXTRACT_RETURN_REG BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET, _wide
     b       .Lmark_introspection_return_switch
 
     .balign 32
@@ -2615,25 +2628,45 @@
     .hidden art_quick_read_barrier_mark_introspection_narrow
     .global art_quick_read_barrier_mark_introspection_narrow
 art_quick_read_barrier_mark_introspection_narrow:
-    // At this point, IP contains the reference, R4 can be freely used.
-    // (R4 is reserved for the entrypoint address.)
+    // At this point, IP contains the reference, rMR is clobbered by the thunk
+    // and can be freely used as it will be set back to 1 before returning.
     // For heap poisoning, the reference is poisoned, so unpoison it first.
     UNPOISON_HEAP_REF ip
-    // Check for null or marked, lock word is loaded into R4.
+    // Check for null or marked, lock word is loaded into rMR.
     BRBMI_CHECK_NULL_AND_MARKED _narrow
-    // Load the 16-bit instruction.
-    BRBMI_LOAD_RETURN_REG_FROM_CODE_narrow BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
-.Lmark_introspection_extract_register_and_return_narrow:
-    and     r4, r4, #7                // Extract `ref_reg`.
+    // Load and extract the return register from the instruction.
+    BRBMI_LOAD_AND_EXTRACT_RETURN_REG BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET, _narrow
 .Lmark_introspection_return_switch:
-    tbb     [pc, r4]                  // Jump to the switch case.
+    tbb     [pc, rMR]                 // Jump to the switch case.
 .Lmark_introspection_return_table:
     BRBMI_FOR_REGISTERS BRBMI_RETURN_SWITCH_CASE_OFFSET, BRBMI_BAD_RETURN_SWITCH_CASE_OFFSET
-    .balign 16
-    BRBMI_FOR_12_REGISTERS BRBMI_RETURN_SWITCH_CASE, BRBMI_BAD_RETURN_SWITCH_CASE
+    BRBMI_FOR_REGISTERS BRBMI_RETURN_SWITCH_CASE, /* no code */
 
-    BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET, _wide
-    BRBMI_GC_ROOT_AND_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET, _narrow
+    .balign 8
+    BRBMI_EXTRACT_FORWARDING_ADDRESS _narrow  // 6 bytes
+.Lmark_introspection_return_switch_case_bad:
+    bkpt                              // 2 bytes
+
+    BRBMI_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET, _narrow
+
+    // 8 bytes for the loading and extracting of the return register.
+    BRBMI_GC_ROOT BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET, _wide
+    // 2 bytes for near branch to the runtime call.
+    b .Lmark_introspection_runtime_call
+
+    BRBMI_EXTRACT_FORWARDING_ADDRESS _wide  // Not even 4-byte aligned.
+
+    BRBMI_FIELD_SLOW_PATH BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET, _wide
+
+    // 8 bytes for the loading and extracting of the return register.
+    BRBMI_GC_ROOT BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET, _narrow
+    // And the runtime call and branch to the switch taking exactly 24 bytes
+    // (22 bytes for BRBMI_RUNTIME_CALL and 2 bytes for the near branch)
+    // shall take the rest of the 32-byte section (within a cache line).
+.Lmark_introspection_runtime_call:
+    BRBMI_RUNTIME_CALL
+    b       .Lmark_introspection_return_switch
+
 
     .balign 256
     .thumb_func
diff --git a/runtime/oat.h b/runtime/oat.h
index 01d3914..0318606 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  // Last oat version changed reason: Retrieve Class* and String* from .data.bimg.rel.ro .
-  static constexpr uint8_t kOatVersion[] = { '1', '4', '0', '\0' };
+  // Last oat version changed reason: Use rMR as temp in Baker RB introspection marking.
+  static constexpr uint8_t kOatVersion[] = { '1', '4', '1', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";