Introduce a Marking Register in ARM64 code generation.

When generating code for ARM64, maintain the status of
Thread::Current()->GetIsGcMarking() in register X20,
dubbed MR (Marking Register), and check the value of that
register (instead of loading and checking a read barrier
marking entrypoint) in read barriers.

Test: m test-art-target
Test: m test-art-target with tree built with ART_USE_READ_BARRIER=false
Test: ARM64 device boot test
Bug: 37707231
Change-Id: Ibe9bc5c99a2176b0a0476e9e9ad7fcc9f745017b
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index 2710ae9..acb8a57 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -89,7 +89,8 @@
     0xF3, 0x53, 0x46, 0xA9, 0xF5, 0x5B, 0x47, 0xA9, 0xF7, 0x63, 0x48, 0xA9,
     0xF9, 0x6B, 0x49, 0xA9, 0xFB, 0x73, 0x4A, 0xA9, 0xFD, 0x7B, 0x4B, 0xA9,
     0xE8, 0x27, 0x42, 0x6D, 0xEA, 0x2F, 0x43, 0x6D, 0xEC, 0x37, 0x44, 0x6D,
-    0xEE, 0x3F, 0x45, 0x6D, 0xFF, 0x03, 0x03, 0x91, 0xC0, 0x03, 0x5F, 0xD6,
+    0xEE, 0x3F, 0x45, 0x6D, 0x74, 0x36, 0x40, 0xB9, 0xFF, 0x03, 0x03, 0x91,
+    0xC0, 0x03, 0x5F, 0xD6,
 };
 static constexpr uint8_t expected_cfi_kArm64[] = {
     0x44, 0x0E, 0xC0, 0x01, 0x44, 0x93, 0x18, 0x94, 0x16, 0x44, 0x95, 0x14,
@@ -101,7 +102,7 @@
     0xD3, 0xD4, 0x44, 0xD5, 0xD6, 0x44, 0xD7, 0xD8, 0x44, 0xD9, 0xDA, 0x44,
     0xDB, 0xDC, 0x44, 0xDD, 0xDE, 0x44, 0x06, 0x48, 0x06, 0x49, 0x44, 0x06,
     0x4A, 0x06, 0x4B, 0x44, 0x06, 0x4C, 0x06, 0x4D, 0x44, 0x06, 0x4E, 0x06,
-    0x4F, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0xC0, 0x01,
+    0x4F, 0x48, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0xC0, 0x01,
 };
 // 0x00000000: sub sp, sp, #0xc0 (192)
 // 0x00000004: .cfi_def_cfa_offset: 192
@@ -175,11 +176,12 @@
 // 0x0000006c: ldp d14, d15, [sp, #80]
 // 0x00000070: .cfi_restore_extended: r78
 // 0x00000070: .cfi_restore_extended: r79
-// 0x00000070: add sp, sp, #0xc0 (192)
-// 0x00000074: .cfi_def_cfa_offset: 0
-// 0x00000074: ret
-// 0x00000078: .cfi_restore_state
-// 0x00000078: .cfi_def_cfa_offset: 192
+// 0x00000070: ldr w20, [tr, #52] ; is_gc_marking
+// 0x00000074: add sp, sp, #0xc0 (192)
+// 0x00000078: .cfi_def_cfa_offset: 0
+// 0x00000078: ret
+// 0x0000007c: .cfi_restore_state
+// 0x0000007c: .cfi_def_cfa_offset: 192
 
 static constexpr uint8_t expected_asm_kX86[] = {
     0x57, 0x56, 0x55, 0x83, 0xC4, 0xE4, 0x50, 0x89, 0x4C, 0x24, 0x34, 0xF3,
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index b34d938..6ce7d75 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -49,6 +49,9 @@
   return count + 1;
 }
 
+// TODO: In the Baker read barrier configuration, add checks to ensure
+// the Marking Register's value is correct.
+
 namespace art {
 
 enum class JniKind {
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index 33f4d77..e086455 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -108,11 +108,25 @@
 
 // Calling convention
 ManagedRegister Arm64ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
-  return Arm64ManagedRegister::FromXRegister(X20);  // saved on entry restored on exit
+  // X20 is safe to use as a scratch register:
+  // - with Baker read barriers, it is reserved as Marking Register,
+  //   and thus does not actually need to be saved/restored; it is
+  //   refreshed on exit (see Arm64JNIMacroAssembler::RemoveFrame);
+  // - in other cases, it is saved on entry (in
+  //   Arm64JNIMacroAssembler::BuildFrame) and restored on exit (in
+  //   Arm64JNIMacroAssembler::RemoveFrame).
+  return Arm64ManagedRegister::FromXRegister(X20);
 }
 
 ManagedRegister Arm64JniCallingConvention::InterproceduralScratchRegister() {
-  return Arm64ManagedRegister::FromXRegister(X20);  // saved on entry restored on exit
+  // X20 is safe to use as a scratch register:
+  // - with Baker read barriers, it is reserved as Marking Register,
+  //   and thus does not actually need to be saved/restored; it is
+  //   refreshed on exit (see Arm64JNIMacroAssembler::RemoveFrame);
+  // - in other cases, it is saved on entry (in
+  //   Arm64JNIMacroAssembler::BuildFrame) and restored on exit (in
+  //   Arm64JNIMacroAssembler::RemoveFrame).
+  return Arm64ManagedRegister::FromXRegister(X20);
 }
 
 static ManagedRegister ReturnRegisterForShorty(const char* shorty) {
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index bc21607..38c732b 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -381,6 +381,21 @@
   // Note: The fake dependency is unnecessary for the slow path.
 }
 
+// Load the read barrier mark introspection entrypoint into register `entrypoint`.
+static void LoadReadBarrierMarkIntrospectionEntrypoint(arm64::Arm64Assembler& assembler,
+                                                       vixl::aarch64::Register entrypoint) {
+  using vixl::aarch64::MemOperand;
+  using vixl::aarch64::ip0;
+  // Thread Register.
+  const vixl::aarch64::Register tr = vixl::aarch64::x19;
+
+  // entrypoint = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection.
+  DCHECK_EQ(ip0.GetCode(), 16u);
+  const int32_t entry_point_offset =
+      Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode());
+  __ Ldr(entrypoint, MemOperand(tr, entry_point_offset));
+}
+
 void Arm64RelativePatcher::CompileBakerReadBarrierThunk(arm64::Arm64Assembler& assembler,
                                                         uint32_t encoded_data) {
   using namespace vixl::aarch64;  // NOLINT(build/namespaces)
@@ -412,6 +427,7 @@
       __ Bind(&slow_path);
       MemOperand ldr_address(lr, BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET);
       __ Ldr(ip0.W(), ldr_address);         // Load the LDR (immediate) unsigned offset.
+      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1);
       __ Ubfx(ip0.W(), ip0.W(), 10, 12);    // Extract the offset.
       __ Ldr(ip0.W(), MemOperand(base_reg, ip0, LSL, 2));   // Load the reference.
       // Do not unpoison. With heap poisoning enabled, the entrypoint expects a poisoned reference.
@@ -441,6 +457,7 @@
       __ Bind(&slow_path);
       MemOperand ldr_address(lr, BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
       __ Ldr(ip0.W(), ldr_address);         // Load the LDR (register) unsigned offset.
+      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1);
       __ Ubfx(ip0, ip0, 16, 6);             // Extract the index register, plus 32 (bit 21 is set).
       __ Bfi(ip1, ip0, 3, 6);               // Insert ip0 to the entrypoint address to create
                                             // a switch case target based on the index register.
@@ -469,6 +486,7 @@
       __ Bind(&not_marked);
       __ Tst(ip0.W(), Operand(ip0.W(), LSL, 1));
       __ B(&forwarding_address, mi);
+      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1);
       // Adjust the art_quick_read_barrier_mark_introspection address in IP1 to
       // art_quick_read_barrier_mark_introspection_gc_roots.
       __ Add(ip1, ip1, Operand(BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRYPOINT_OFFSET));
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 7bf43f7..73202b4 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -404,17 +404,6 @@
   // accessing the String's `value` field in String intrinsics.
   static uint32_t GetArrayDataOffset(HArrayGet* array_get);
 
-  // Return the entry point offset for ReadBarrierMarkRegX, where X is `reg`.
-  template <PointerSize pointer_size>
-  static int32_t GetReadBarrierMarkEntryPointsOffset(size_t reg) {
-    // The entry point list defines 30 ReadBarrierMarkRegX entry points.
-    DCHECK_LT(reg, 30u);
-    // The ReadBarrierMarkRegX entry points are ordered by increasing
-    // register number in Thread::tls_Ptr_.quick_entrypoints.
-    return QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00).Int32Value()
-        + static_cast<size_t>(pointer_size) * reg;
-  }
-
   void EmitParallelMoves(Location from1,
                          Location to1,
                          Primitive::Type type1,
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 6b9f232..92467fe 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -729,7 +729,7 @@
     } else {
       // Entrypoint is not already loaded, load from the thread.
       int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
+          Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg);
       // This runtime call does not require a stack map.
       arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     }
@@ -8428,7 +8428,7 @@
         //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
         DCHECK_EQ(IP, 12);
         const int32_t entry_point_offset =
-            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+            Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
         __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
 
         Label return_address;
@@ -8469,7 +8469,7 @@
 
         // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
         const int32_t entry_point_offset =
-            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+            Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
         // Loading the entrypoint does not require a load acquire since it is only changed when
         // threads are suspended or running a checkpoint.
         __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
@@ -8572,7 +8572,7 @@
     //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
     DCHECK_EQ(IP, 12);
     const int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+        Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
     __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
 
     Label return_address;
@@ -8655,7 +8655,7 @@
     //     Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection.
     DCHECK_EQ(IP, 12);
     const int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
+        Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(IP);
     __ LoadFromOffset(kLoadWord, kBakerCcEntrypointRegister, TR, entry_point_offset);
     __ AddConstant(data_reg, obj, data_offset);
 
@@ -8736,7 +8736,7 @@
 
   // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
   const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
+      Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
   // Loading the entrypoint does not require a load acquire since it is only changed when
   // threads are suspended or running a checkpoint.
   __ LoadFromOffset(kLoadWord, temp2.AsRegister<Register>(), TR, entry_point_offset);
@@ -8805,7 +8805,7 @@
 
   // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
   const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
+      Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
   // Loading the entrypoint does not require a load acquire since it is only changed when
   // threads are suspended or running a checkpoint.
   __ LoadFromOffset(kLoadWord, temp3.AsRegister<Register>(), TR, entry_point_offset);
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 2561ed0..7e5b1a0 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -672,7 +672,9 @@
 // `ref`.
 //
 // Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked.
+// barrier marking runtime entry point to be invoked or an empty
+// location; in the latter case, the read barrier marking runtime
+// entry point will be loaded by the slow path code itself.
 class ReadBarrierMarkSlowPathBaseARM64 : public SlowPathCodeARM64 {
  protected:
   ReadBarrierMarkSlowPathBaseARM64(HInstruction* instruction, Location ref, Location entrypoint)
@@ -716,7 +718,7 @@
     } else {
       // Entrypoint is not already loaded, load from the thread.
       int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg());
+          Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg());
       // This runtime call does not require a stack map.
       arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     }
@@ -743,9 +745,10 @@
 // another thread, or if another thread installed another object
 // reference (different from `ref`) in `obj.field`).
 //
-// If `entrypoint` is a valid location it is assumed to already be
-// holding the entrypoint. The case where the entrypoint is passed in
-// is when the decision to mark is based on whether the GC is marking.
+// Argument `entrypoint` must be a register location holding the read
+// barrier marking runtime entry point to be invoked or an empty
+// location; in the latter case, the read barrier marking runtime
+// entry point will be loaded by the slow path code itself.
 class ReadBarrierMarkSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 {
  public:
   ReadBarrierMarkSlowPathARM64(HInstruction* instruction,
@@ -791,7 +794,9 @@
 // reference (different from `ref`) in `obj.field`).
 //
 // Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked.
+// barrier marking runtime entry point to be invoked or an empty
+// location; in the latter case, the read barrier marking runtime
+// entry point will be loaded by the slow path code itself.
 class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 {
  public:
   LoadReferenceWithBakerReadBarrierSlowPathARM64(HInstruction* instruction,
@@ -803,7 +808,7 @@
                                                  bool needs_null_check,
                                                  bool use_load_acquire,
                                                  Register temp,
-                                                 Location entrypoint)
+                                                 Location entrypoint = Location::NoLocation())
       : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint),
         obj_(obj),
         offset_(offset),
@@ -947,20 +952,23 @@
 // another object reference (different from `ref`) in `obj.field`).
 //
 // Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked.
+// barrier marking runtime entry point to be invoked or an empty
+// location; in the latter case, the read barrier marking runtime
+// entry point will be loaded by the slow path code itself.
 class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64
     : public ReadBarrierMarkSlowPathBaseARM64 {
  public:
-  LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(HInstruction* instruction,
-                                                               Location ref,
-                                                               Register obj,
-                                                               uint32_t offset,
-                                                               Location index,
-                                                               size_t scale_factor,
-                                                               bool needs_null_check,
-                                                               bool use_load_acquire,
-                                                               Register temp,
-                                                               Location entrypoint)
+  LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(
+      HInstruction* instruction,
+      Location ref,
+      Register obj,
+      uint32_t offset,
+      Location index,
+      size_t scale_factor,
+      bool needs_null_check,
+      bool use_load_acquire,
+      Register temp,
+      Location entrypoint = Location::NoLocation())
       : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint),
         obj_(obj),
         offset_(offset),
@@ -1655,7 +1663,7 @@
   // Blocked core registers:
   //      lr        : Runtime reserved.
   //      tr        : Runtime reserved.
-  //      xSuspend  : Runtime reserved. TODO: Unblock this when the runtime stops using it.
+  //      mr        : Runtime reserved.
   //      ip1       : VIXL core temp.
   //      ip0       : VIXL core temp.
   //
@@ -5921,20 +5929,17 @@
       // Baker's read barrier are used.
       if (kBakerReadBarrierLinkTimeThunksEnableForGcRoots &&
           !Runtime::Current()->UseJitCompilation()) {
-        // Note that we do not actually check the value of `GetIsGcMarking()`
-        // to decide whether to mark the loaded GC root or not.  Instead, we
-        // load into `temp` (actually IP1) the read barrier mark introspection
-        // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is
-        // false, and vice versa.
+        // Query `art::Thread::Current()->GetIsGcMarking()` (stored in
+        // the Marking Register) to decide whether we need to enter
+        // the slow path to mark the GC root.
         //
         // We use link-time generated thunks for the slow path. That thunk
         // checks the reference and jumps to the entrypoint if needed.
         //
-        //     temp = Thread::Current()->pReadBarrierMarkIntrospection
         //     lr = &return_address;
         //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-        //     if (temp != nullptr) {
-        //        goto gc_root_thunk<root_reg>(lr)
+        //     if (mr) {  // Thread::Current()->GetIsGcMarking()
+        //       goto gc_root_thunk<root_reg>(lr)
         //     }
         //   return_address:
 
@@ -5946,11 +5951,6 @@
             linker::Arm64RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg.GetCode());
         vixl::aarch64::Label* cbnz_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
-        // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection.
-        DCHECK_EQ(ip0.GetCode(), 16u);
-        const int32_t entry_point_offset =
-            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode());
-        __ Ldr(ip1, MemOperand(tr, entry_point_offset));
         EmissionCheckScope guard(GetVIXLAssembler(), 3 * vixl::aarch64::kInstructionSize);
         vixl::aarch64::Label return_address;
         __ adr(lr, &return_address);
@@ -5961,36 +5961,26 @@
                       "GC root LDR must be 2 instruction (8B) before the return address label.");
         __ ldr(root_reg, MemOperand(obj.X(), offset));
         __ Bind(cbnz_label);
-        __ cbnz(ip1, static_cast<int64_t>(0));  // Placeholder, patched at link-time.
+        __ cbnz(mr, static_cast<int64_t>(0));  // Placeholder, patched at link-time.
         __ Bind(&return_address);
       } else {
-        // Note that we do not actually check the value of
-        // `GetIsGcMarking()` to decide whether to mark the loaded GC
-        // root or not.  Instead, we load into `temp` the read barrier
-        // mark entry point corresponding to register `root`. If `temp`
-        // is null, it means that `GetIsGcMarking()` is false, and vice
-        // versa.
+        // Query `art::Thread::Current()->GetIsGcMarking()` (stored in
+        // the Marking Register) to decide whether we need to enter
+        // the slow path to mark the GC root.
         //
-        //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
         //   GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
-        //   if (temp != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+        //   if (mr) {  // Thread::Current()->GetIsGcMarking()
         //     // Slow path.
-        //     root = temp(root);  // root = ReadBarrier::Mark(root);  // Runtime entry point call.
+        //     entrypoint = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+        //     root = entrypoint(root);  // root = ReadBarrier::Mark(root);  // Entry point call.
         //   }
 
-        // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`.
-        Register temp = lr;
-        SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(
-            instruction, root, /* entrypoint */ LocationFrom(temp));
+        // Slow path marking the GC root `root`. The entrypoint will
+        // be loaded by the slow path code.
+        SlowPathCodeARM64* slow_path =
+            new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root);
         codegen_->AddSlowPath(slow_path);
 
-        // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-        const int32_t entry_point_offset =
-            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg());
-        // Loading the entrypoint does not require a load acquire since it is only changed when
-        // threads are suspended or running a checkpoint.
-        __ Ldr(temp, MemOperand(tr, entry_point_offset));
-
         // /* GcRoot<mirror::Object> */ root = *(obj + offset)
         if (fixup_label == nullptr) {
           __ Ldr(root_reg, MemOperand(obj, offset));
@@ -6005,9 +5995,7 @@
                       "art::mirror::CompressedReference<mirror::Object> and int32_t "
                       "have different sizes.");
 
-        // The entrypoint is null when the GC is not marking, this prevents one load compared to
-        // checking GetIsGcMarking.
-        __ Cbnz(temp, slow_path->GetEntryLabel());
+        __ Cbnz(mr, slow_path->GetEntryLabel());
         __ Bind(slow_path->GetExitLabel());
       }
     } else {
@@ -6048,20 +6036,19 @@
   if (kBakerReadBarrierLinkTimeThunksEnableForFields &&
       !use_load_acquire &&
       !Runtime::Current()->UseJitCompilation()) {
-    // Note that we do not actually check the value of `GetIsGcMarking()`
-    // to decide whether to mark the loaded reference or not.  Instead, we
-    // load into `temp` (actually IP1) the read barrier mark introspection
-    // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is
-    // false, and vice versa.
+    // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+    // Marking Register) to decide whether we need to enter the slow
+    // path to mark the reference. Then, in the slow path, check the
+    // gray bit in the lock word of the reference's holder (`obj`) to
+    // decide whether to mark `ref` or not.
     //
     // We use link-time generated thunks for the slow path. That thunk checks
     // the holder and jumps to the entrypoint if needed. If the holder is not
     // gray, it creates a fake dependency and returns to the LDR instruction.
     //
-    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
     //     lr = &gray_return_address;
-    //     if (temp != nullptr) {
-    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     if (mr) {  // Thread::Current()->GetIsGcMarking()
+    //       goto field_thunk<holder_reg, base_reg>(lr)
     //     }
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
@@ -6087,17 +6074,12 @@
         obj.GetCode());
     vixl::aarch64::Label* cbnz_label = NewBakerReadBarrierPatch(custom_data);
 
-    // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection.
-    DCHECK_EQ(ip0.GetCode(), 16u);
-    const int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode());
-    __ Ldr(ip1, MemOperand(tr, entry_point_offset));
     EmissionCheckScope guard(GetVIXLAssembler(),
                              (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize);
     vixl::aarch64::Label return_address;
     __ adr(lr, &return_address);
     __ Bind(cbnz_label);
-    __ cbnz(ip1, static_cast<int64_t>(0));  // Placeholder, patched at link-time.
+    __ cbnz(mr, static_cast<int64_t>(0));  // Placeholder, patched at link-time.
     static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
                   "Field LDR must be 1 instruction (4B) before the return address label; "
                   " 2 instructions (8B) for heap poisoning.");
@@ -6143,20 +6125,19 @@
 
   if (kBakerReadBarrierLinkTimeThunksEnableForArrays &&
       !Runtime::Current()->UseJitCompilation()) {
-    // Note that we do not actually check the value of `GetIsGcMarking()`
-    // to decide whether to mark the loaded reference or not.  Instead, we
-    // load into `temp` (actually IP1) the read barrier mark introspection
-    // entrypoint. If `temp` is null, it means that `GetIsGcMarking()` is
-    // false, and vice versa.
+    // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+    // Marking Register) to decide whether we need to enter the slow
+    // path to mark the reference. Then, in the slow path, check the
+    // gray bit in the lock word of the reference's holder (`obj`) to
+    // decide whether to mark `ref` or not.
     //
     // We use link-time generated thunks for the slow path. That thunk checks
     // the holder and jumps to the entrypoint if needed. If the holder is not
     // gray, it creates a fake dependency and returns to the LDR instruction.
     //
-    //     temp = Thread::Current()->pReadBarrierMarkIntrospection
     //     lr = &gray_return_address;
-    //     if (temp != nullptr) {
-    //        goto field_thunk<holder_reg, base_reg>(lr)
+    //     if (mr) {  // Thread::Current()->GetIsGcMarking()
+    //       goto array_thunk<base_reg>(lr)
     //     }
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
@@ -6176,18 +6157,13 @@
         linker::Arm64RelativePatcher::EncodeBakerReadBarrierArrayData(temp.GetCode());
     vixl::aarch64::Label* cbnz_label = NewBakerReadBarrierPatch(custom_data);
 
-    // ip1 = Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection.
-    DCHECK_EQ(ip0.GetCode(), 16u);
-    const int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ip0.GetCode());
-    __ Ldr(ip1, MemOperand(tr, entry_point_offset));
     __ Add(temp.X(), obj.X(), Operand(data_offset));
     EmissionCheckScope guard(GetVIXLAssembler(),
                              (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize);
     vixl::aarch64::Label return_address;
     __ adr(lr, &return_address);
     __ Bind(cbnz_label);
-    __ cbnz(ip1, static_cast<int64_t>(0));  // Placeholder, patched at link-time.
+    __ cbnz(mr, static_cast<int64_t>(0));  // Placeholder, patched at link-time.
     static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
                   "Array LDR must be 1 instruction (4B) before the return address label; "
                   " 2 instructions (8B) for heap poisoning.");
@@ -6231,35 +6207,28 @@
   // `instruction->IsArrayGet()` => `!use_load_acquire`.
   DCHECK(!instruction->IsArrayGet() || !use_load_acquire);
 
-  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
-  // whether we need to enter the slow path to mark the reference.
-  // Then, in the slow path, check the gray bit in the lock word of
-  // the reference's holder (`obj`) to decide whether to mark `ref` or
-  // not.
+  // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+  // Marking Register) to decide whether we need to enter the slow
+  // path to mark the reference. Then, in the slow path, check the
+  // gray bit in the lock word of the reference's holder (`obj`) to
+  // decide whether to mark `ref` or not.
   //
-  // Note that we do not actually check the value of `GetIsGcMarking()`;
-  // instead, we load into `temp2` the read barrier mark entry point
-  // corresponding to register `ref`. If `temp2` is null, it means
-  // that `GetIsGcMarking()` is false, and vice versa.
-  //
-  //   temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //   if (temp2 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //   if (mr) {  // Thread::Current()->GetIsGcMarking()
   //     // Slow path.
   //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
   //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
   //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //     bool is_gray = (rb_state == ReadBarrier::GrayState());
   //     if (is_gray) {
-  //       ref = temp2(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //       entrypoint = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //       ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
   //     }
   //   } else {
   //     HeapReference<mirror::Object> ref = *src;  // Original reference load.
   //   }
 
   // Slow path marking the object `ref` when the GC is marking. The
-  // entrypoint will already be loaded in `temp2`.
-  Register temp2 = lr;
-  Location temp2_loc = LocationFrom(temp2);
+  // entrypoint will be loaded by the slow path code.
   SlowPathCodeARM64* slow_path =
       new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64(
           instruction,
@@ -6270,19 +6239,10 @@
           scale_factor,
           needs_null_check,
           use_load_acquire,
-          temp,
-          /* entrypoint */ temp2_loc);
+          temp);
   AddSlowPath(slow_path);
 
-  // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
-  const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg());
-  // Loading the entrypoint does not require a load acquire since it is only changed when
-  // threads are suspended or running a checkpoint.
-  __ Ldr(temp2, MemOperand(tr, entry_point_offset));
-  // The entrypoint is null when the GC is not marking, this prevents one load compared to
-  // checking GetIsGcMarking.
-  __ Cbnz(temp2, slow_path->GetEntryLabel());
+  __ Cbnz(mr, slow_path->GetEntryLabel());
   // Fast path: the GC is not marking: just load the reference.
   GenerateRawReferenceLoad(
       instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire);
@@ -6303,19 +6263,14 @@
   // `instruction->IsArrayGet()` => `!use_load_acquire`.
   DCHECK(!instruction->IsArrayGet() || !use_load_acquire);
 
-  // Query `art::Thread::Current()->GetIsGcMarking()` to decide
-  // whether we need to enter the slow path to update the reference
-  // field within `obj`.  Then, in the slow path, check the gray bit
-  // in the lock word of the reference's holder (`obj`) to decide
-  // whether to mark `ref` and update the field or not.
+  // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+  // Marking Register) to decide whether we need to enter the slow
+  // path to update the reference field within `obj`. Then, in the
+  // slow path, check the gray bit in the lock word of the reference's
+  // holder (`obj`) to decide whether to mark `ref` and update the
+  // field or not.
   //
-  // Note that we do not actually check the value of `GetIsGcMarking()`;
-  // instead, we load into `temp2` the read barrier mark entry point
-  // corresponding to register `ref`. If `temp2` is null, it means
-  // that `GetIsGcMarking()` is false, and vice versa.
-  //
-  //   temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //   if (temp2 != nullptr) {  // <=> Thread::Current()->GetIsGcMarking()
+  //   if (mr) {  // Thread::Current()->GetIsGcMarking()
   //     // Slow path.
   //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
   //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
@@ -6323,15 +6278,14 @@
   //     bool is_gray = (rb_state == ReadBarrier::GrayState());
   //     if (is_gray) {
   //       old_ref = ref;
-  //       ref = temp2(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
+  //       entrypoint = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
+  //       ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
   //       compareAndSwapObject(obj, field_offset, old_ref, ref);
   //     }
   //   }
 
   // Slow path updating the object reference at address `obj + field_offset`
-  // when the GC is marking. The entrypoint will already be loaded in `temp2`.
-  Register temp2 = lr;
-  Location temp2_loc = LocationFrom(temp2);
+  // when the GC is marking. The entrypoint will be loaded by the slow path code.
   SlowPathCodeARM64* slow_path =
       new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(
           instruction,
@@ -6342,19 +6296,10 @@
           /* scale_factor */ 0u /* "times 1" */,
           needs_null_check,
           use_load_acquire,
-          temp,
-          /* entrypoint */ temp2_loc);
+          temp);
   AddSlowPath(slow_path);
 
-  // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
-  const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg());
-  // Loading the entrypoint does not require a load acquire since it is only changed when
-  // threads are suspended or running a checkpoint.
-  __ Ldr(temp2, MemOperand(tr, entry_point_offset));
-  // The entrypoint is null when the GC is not marking, this prevents one load compared to
-  // checking GetIsGcMarking.
-  __ Cbnz(temp2, slow_path->GetEntryLabel());
+  __ Cbnz(mr, slow_path->GetEntryLabel());
   // Fast path: the GC is not marking: nothing to do (the field is
   // up-to-date, and we don't need to load the reference).
   __ Bind(slow_path->GetExitLabel());
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index d9c49d1..584eead 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -70,21 +70,32 @@
 };
 static constexpr size_t kParameterFPRegistersLength = arraysize(kParameterFPRegisters);
 
-// Thread Register
+// Thread Register.
 const vixl::aarch64::Register tr = vixl::aarch64::x19;
+// Marking Register.
+const vixl::aarch64::Register mr = vixl::aarch64::x20;
 // Method register on invoke.
 static const vixl::aarch64::Register kArtMethodRegister = vixl::aarch64::x0;
 const vixl::aarch64::CPURegList vixl_reserved_core_registers(vixl::aarch64::ip0,
                                                              vixl::aarch64::ip1);
 const vixl::aarch64::CPURegList vixl_reserved_fp_registers(vixl::aarch64::d31);
 
-const vixl::aarch64::CPURegList runtime_reserved_core_registers(tr, vixl::aarch64::lr);
+const vixl::aarch64::CPURegList runtime_reserved_core_registers =
+    vixl::aarch64::CPURegList(
+        tr,
+        // Reserve X20 as Marking Register when emitting Baker read barriers.
+        ((kEmitCompilerReadBarrier && kUseBakerReadBarrier) ? mr : vixl::aarch64::NoCPUReg),
+        vixl::aarch64::lr);
 
-// Callee-saved registers AAPCS64 (without x19 - Thread Register)
-const vixl::aarch64::CPURegList callee_saved_core_registers(vixl::aarch64::CPURegister::kRegister,
-                                                            vixl::aarch64::kXRegSize,
-                                                            vixl::aarch64::x20.GetCode(),
-                                                            vixl::aarch64::x30.GetCode());
+// Callee-saved core registers (AAPCS64), excluding x19 (Thread Register) and,
+// when emitting Baker read barriers, x20 (Marking Register).
+const vixl::aarch64::CPURegList callee_saved_core_registers(
+    vixl::aarch64::CPURegister::kRegister,
+    vixl::aarch64::kXRegSize,
+    ((kEmitCompilerReadBarrier && kUseBakerReadBarrier)
+         ? vixl::aarch64::x21.GetCode()
+         : vixl::aarch64::x20.GetCode()),
+    vixl::aarch64::x30.GetCode());
 const vixl::aarch64::CPURegList callee_saved_fp_registers(vixl::aarch64::CPURegister::kFPRegister,
                                                           vixl::aarch64::kDRegSize,
                                                           vixl::aarch64::d8.GetCode(),
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 9a2402b..7334678 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -786,7 +786,7 @@
     } else {
       // Entrypoint is not already loaded, load from the thread.
       int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
+          Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
       // This runtime call does not require a stack map.
       arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     }
@@ -8559,7 +8559,7 @@
         //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
         DCHECK_EQ(ip.GetCode(), 12u);
         const int32_t entry_point_offset =
-            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+            Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
         __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
 
         vixl::EmissionCheckScope guard(GetVIXLAssembler(),
@@ -8601,7 +8601,7 @@
 
         // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
         const int32_t entry_point_offset =
-            CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+            Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
         // Loading the entrypoint does not require a load acquire since it is only changed when
         // threads are suspended or running a checkpoint.
         GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
@@ -8705,7 +8705,7 @@
     //     Thread::Current()->pReadBarrierMarkReg12, i.e. pReadBarrierMarkIntrospection.
     DCHECK_EQ(ip.GetCode(), 12u);
     const int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+        Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
     __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
 
     vixl::EmissionCheckScope guard(
@@ -8797,7 +8797,7 @@
     //     Thread::Current()->pReadBarrierMarkReg16, i.e. pReadBarrierMarkIntrospection.
     DCHECK_EQ(ip.GetCode(), 12u);
     const int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
+        Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ip.GetCode());
     __ Ldr(kBakerCcEntrypointRegister, MemOperand(tr, entry_point_offset));
     __ Add(data_reg, obj, Operand(data_offset));
 
@@ -8883,7 +8883,7 @@
 
   // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
   const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
+      Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
   // Loading the entrypoint does not require a load acquire since it is only changed when
   // threads are suspended or running a checkpoint.
   GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp2), tr, entry_point_offset);
@@ -8951,7 +8951,7 @@
 
   // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg()
   const int32_t entry_point_offset =
-      CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
+      Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg());
   // Loading the entrypoint does not require a load acquire since it is only changed when
   // threads are suspended or running a checkpoint.
   GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp3), tr, entry_point_offset);
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index abe1d70..be8f9e9 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -656,7 +656,7 @@
       __ NopIfNoReordering();
     } else {
       int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1);
+          Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1);
       // This runtime call does not require a stack map.
       mips_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset,
                                                         instruction_,
@@ -750,7 +750,7 @@
     //   rX <- ReadBarrierMarkRegX(rX)
     //
     int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1);
+        Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(ref_reg - 1);
     // This runtime call does not require a stack map.
     mips_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset,
                                                       instruction_,
@@ -6497,7 +6497,7 @@
 
       // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
       const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(root.reg() - 1);
+          Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(root.reg() - 1);
       // Loading the entrypoint does not require a load acquire since it is only changed when
       // threads are suspended or running a checkpoint.
       __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 232241c..cf6b3d5 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -606,7 +606,7 @@
       __ Nop();
     } else {
       int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1);
+          Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1);
       // This runtime call does not require a stack map.
       mips64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset,
                                                           instruction_,
@@ -699,7 +699,7 @@
     //   rX <- ReadBarrierMarkRegX(rX)
     //
     int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1);
+        Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(ref_reg - 1);
     // This runtime call does not require a stack map.
     mips64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset,
                                                         instruction_,
@@ -4421,7 +4421,7 @@
 
       // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
       const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(root.reg() - 1);
+          Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(root.reg() - 1);
       // Loading the entrypoint does not require a load acquire since it is only changed when
       // threads are suspended or running a checkpoint.
       __ LoadFromOffset(kLoadDoubleword, temp.AsRegister<GpuRegister>(), TR, entry_point_offset);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 79fccfe..af0e646 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -509,8 +509,7 @@
     //
     //   rX <- ReadBarrierMarkRegX(rX)
     //
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(ref_reg);
+    int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(ref_reg);
     // This runtime call does not require a stack map.
     x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     __ jmp(GetExitLabel());
@@ -595,8 +594,7 @@
     //
     //   rX <- ReadBarrierMarkRegX(rX)
     //
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(ref_reg);
+    int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(ref_reg);
     // This runtime call does not require a stack map.
     x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
 
@@ -7153,7 +7151,7 @@
 
       // Test the entrypoint (`Thread::Current()->pReadBarrierMarkReg ## root.reg()`).
       const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(root.reg());
+          Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(root.reg());
       __ fs()->cmpl(Address::Absolute(entry_point_offset), Immediate(0));
       // The entrypoint is null when the GC is not marking.
       __ j(kNotEqual, slow_path->GetEntryLabel());
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 57319ce..86f6d51 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -524,7 +524,7 @@
     //   rX <- ReadBarrierMarkRegX(rX)
     //
     int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(ref_reg);
+        Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(ref_reg);
     // This runtime call does not require a stack map.
     x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     __ jmp(GetExitLabel());
@@ -615,7 +615,7 @@
     //   rX <- ReadBarrierMarkRegX(rX)
     //
     int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(ref_reg);
+        Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(ref_reg);
     // This runtime call does not require a stack map.
     x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
 
@@ -6540,7 +6540,7 @@
 
       // Test the `Thread::Current()->pReadBarrierMarkReg ## root.reg()` entrypoint.
       const int32_t entry_point_offset =
-          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(root.reg());
+          Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(root.reg());
       __ gs()->cmpl(Address::Absolute(entry_point_offset, /* no_rip */ true), Immediate(0));
       // The entrypoint is null when the GC is not marking.
       __ j(kNotEqual, slow_path->GetEntryLabel());
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index ae5f8d1..3795866 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -154,8 +154,7 @@
     DCHECK(0 <= tmp && tmp < kNumberOfCoreRegisters) << tmp;
     // TODO: Load the entrypoint once before the loop, instead of
     // loading it at every iteration.
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp);
+    int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp);
     // This runtime call does not require a stack map.
     arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     __ MaybePoisonHeapReference(tmp);
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 37d7981..aec1ec7 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -205,7 +205,7 @@
     // TODO: Load the entrypoint once before the loop, instead of
     // loading it at every iteration.
     int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg());
+        Thread::ReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg());
     // This runtime call does not require a stack map.
     codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 3c9b613..ced931b 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -226,7 +226,7 @@
     // TODO: Load the entrypoint once before the loop, instead of
     // loading it at every iteration.
     int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp.GetCode());
+        Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp.GetCode());
     // This runtime call does not require a stack map.
     arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     assembler->MaybePoisonHeapReference(tmp);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 6b4851d..a18b0cc 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -143,8 +143,7 @@
     // explanations.)
     DCHECK_NE(temp2, ESP);
     DCHECK(0 <= temp2 && temp2 < kNumberOfCpuRegisters) << temp2;
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2);
+    int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2);
     // This runtime call does not require a stack map.
     x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     __ MaybePoisonHeapReference(temp2);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index ef98b7b..5abdb1d 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -105,8 +105,7 @@
     // No need to save live registers; it's taken care of by the
     // entrypoint. Also, there is no need to update the stack mask,
     // as this runtime call will not trigger a garbage collection.
-    int32_t entry_point_offset =
-        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP);
+    int32_t entry_point_offset = Thread::ReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP);
     // This runtime call does not require a stack map.
     x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
     __ MaybePoisonHeapReference(CpuRegister(TMP));
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index 60af2b4..abab431 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -31,21 +31,21 @@
 // 0x00000010: .cfi_def_cfa_offset: 64
 
 static constexpr uint8_t expected_asm_kArm64[] = {
-    0xFF, 0x03, 0x01, 0xD1, 0xF4, 0x17, 0x00, 0xF9, 0xF5, 0x7B, 0x03, 0xA9,
-    0xE8, 0xA7, 0x01, 0x6D, 0xE8, 0xA7, 0x41, 0x6D, 0xF4, 0x17, 0x40, 0xF9,
-    0xF5, 0x7B, 0x43, 0xA9, 0xFF, 0x03, 0x01, 0x91, 0xC0, 0x03, 0x5F, 0xD6,
+    0xFF, 0x03, 0x01, 0xD1, 0xF5, 0x17, 0x00, 0xF9, 0xF6, 0x7B, 0x03, 0xA9,
+    0xE8, 0xA7, 0x01, 0x6D, 0xE8, 0xA7, 0x41, 0x6D, 0xF5, 0x17, 0x40, 0xF9,
+    0xF6, 0x7B, 0x43, 0xA9, 0xFF, 0x03, 0x01, 0x91, 0xC0, 0x03, 0x5F, 0xD6,
 };
 static constexpr uint8_t expected_cfi_kArm64[] = {
-    0x44, 0x0E, 0x40, 0x44, 0x94, 0x06, 0x44, 0x95, 0x04, 0x9E, 0x02, 0x44,
+    0x44, 0x0E, 0x40, 0x44, 0x95, 0x06, 0x44, 0x96, 0x04, 0x9E, 0x02, 0x44,
     0x05, 0x48, 0x0A, 0x05, 0x49, 0x08, 0x0A, 0x44, 0x06, 0x48, 0x06, 0x49,
-    0x44, 0xD4, 0x44, 0xD5, 0xDE, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40,
+    0x44, 0xD5, 0x44, 0xD6, 0xDE, 0x44, 0x0E, 0x00, 0x44, 0x0B, 0x0E, 0x40,
 };
 // 0x00000000: sub sp, sp, #0x40 (64)
 // 0x00000004: .cfi_def_cfa_offset: 64
-// 0x00000004: str x20, [sp, #40]
-// 0x00000008: .cfi_offset: r20 at cfa-24
-// 0x00000008: stp x21, lr, [sp, #48]
-// 0x0000000c: .cfi_offset: r21 at cfa-16
+// 0x00000004: str x21, [sp, #40]
+// 0x00000008: .cfi_offset: r21 at cfa-24
+// 0x00000008: stp x22, lr, [sp, #48]
+// 0x0000000c: .cfi_offset: r22 at cfa-16
 // 0x0000000c: .cfi_offset: r30 at cfa-8
 // 0x0000000c: stp d8, d9, [sp, #24]
 // 0x00000010: .cfi_offset_extended: r72 at cfa-40
@@ -54,10 +54,10 @@
 // 0x00000010: ldp d8, d9, [sp, #24]
 // 0x00000014: .cfi_restore_extended: r72
 // 0x00000014: .cfi_restore_extended: r73
-// 0x00000014: ldr x20, [sp, #40]
-// 0x00000018: .cfi_restore: r20
-// 0x00000018: ldp x21, lr, [sp, #48]
-// 0x0000001c: .cfi_restore: r21
+// 0x00000014: ldr x21, [sp, #40]
+// 0x00000018: .cfi_restore: r21
+// 0x00000018: ldp x22, lr, [sp, #48]
+// 0x0000001c: .cfi_restore: r22
 // 0x0000001c: .cfi_restore: r30
 // 0x0000001c: add sp, sp, #0x40 (64)
 // 0x00000020: .cfi_def_cfa_offset: 0
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index 9cd6884..c436fd9 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -772,6 +772,13 @@
   asm_.UnspillRegisters(core_reg_list, frame_size - core_reg_size);
   asm_.UnspillRegisters(fp_reg_list, frame_size - core_reg_size - fp_reg_size);
 
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // Refresh Marking Register.
+    // TODO: Refresh MR only if suspend is taken.
+    ___ Ldr(reg_w(MR),
+            MemOperand(reg_x(TR), Thread::IsGcMarkingOffset<kArm64PointerSize>().Int32Value()));
+  }
+
   // Decrease frame size to start of callee saved regs.
   DecreaseFrameSize(frame_size);
 
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 676efc4..b909bda 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1331,7 +1331,7 @@
 // r0: type r1: component_count r2: total_size r9: Thread::Current, r3, r12: free.
 // Need to preserve r0 and r1 to the slow path.
 .macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel
-    and    r2, r2, #OBJECT_ALIGNMENT_MASK_TOGGLED             // Apply alignemnt mask
+    and    r2, r2, #OBJECT_ALIGNMENT_MASK_TOGGLED             // Apply alignment mask
                                                               // (addr + 7) & ~7.
 
                                                               // Load thread_local_pos (r3) and
diff --git a/runtime/arch/arm64/asm_support_arm64.S b/runtime/arch/arm64/asm_support_arm64.S
index bcf55e3..715fc35 100644
--- a/runtime/arch/arm64/asm_support_arm64.S
+++ b/runtime/arch/arm64/asm_support_arm64.S
@@ -33,6 +33,12 @@
 #define xIP1 x17
 #define wIP1 w17
 
+#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+// Marking Register, holding Thread::Current()->GetIsGcMarking().
+// Only used with the Concurrent Copying (CC) garbage
+// collector, with the Baker read barrier configuration.
+#define wMR w20
+#endif
 
 .macro ENTRY name
     .type \name, #function
@@ -55,14 +61,14 @@
     END \name
 .endm
 
-// Macros to poison (negate) the reference for heap poisoning.
+// Macro to poison (negate) the reference for heap poisoning.
 .macro POISON_HEAP_REF rRef
 #ifdef USE_HEAP_POISONING
     neg \rRef, \rRef
 #endif  // USE_HEAP_POISONING
 .endm
 
-// Macros to unpoison (negate) the reference for heap poisoning.
+// Macro to unpoison (negate) the reference for heap poisoning.
 .macro UNPOISON_HEAP_REF rRef
 #ifdef USE_HEAP_POISONING
     neg \rRef, \rRef
diff --git a/runtime/arch/arm64/context_arm64.cc b/runtime/arch/arm64/context_arm64.cc
index 0465c1e..0f0814a 100644
--- a/runtime/arch/arm64/context_arm64.cc
+++ b/runtime/arch/arm64/context_arm64.cc
@@ -137,7 +137,9 @@
   for (size_t i = 0; i < kNumberOfDRegisters; ++i) {
     fprs[i] = fprs_[i] != nullptr ? *fprs_[i] : Arm64Context::kBadFprBase + i;
   }
+  // Ensure the Thread Register contains the address of the current thread.
   DCHECK_EQ(reinterpret_cast<uintptr_t>(Thread::Current()), gprs[TR]);
+  // The Marking Register will be updated by art_quick_do_long_jump.
   art_quick_do_long_jump(gprs, fprs);
 }
 
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 138dbf9..e097a33 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -39,6 +39,18 @@
     .cfi_restore \reg
 .endm
 
+.macro SAVE_REG_INCREASE_FRAME reg, frame_adjustment
+    str \reg, [sp, #-(\frame_adjustment)]!
+    .cfi_adjust_cfa_offset (\frame_adjustment)
+    .cfi_rel_offset \reg, 0
+.endm
+
+.macro RESTORE_REG_DECREASE_FRAME reg, frame_adjustment
+    ldr \reg, [sp], #(\frame_adjustment)
+    .cfi_restore \reg
+    .cfi_adjust_cfa_offset -(\frame_adjustment)
+.endm
+
 .macro SAVE_TWO_REGS reg1, reg2, offset
     stp \reg1, \reg2, [sp, #(\offset)]
     .cfi_rel_offset \reg1, (\offset)
@@ -140,6 +152,9 @@
     SAVE_TWO_REGS x29, xLR, 80
 
     // Store ArtMethod* Runtime::callee_save_methods_[kSaveRefsOnly].
+    // Note: We could avoid saving X20 in the case of Baker read
+    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
     stp xIP0, x20, [sp]
     .cfi_rel_offset x20, 8
 
@@ -151,6 +166,9 @@
 // TODO: Probably no need to restore registers preserved by aapcs64.
 .macro RESTORE_SAVE_REFS_ONLY_FRAME
     // Callee-saves.
+    // Note: Likewise, we could avoid restoring X20 in the case of Baker
+    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
     RESTORE_REG x20, 8
     RESTORE_TWO_REGS x21, x22, 16
     RESTORE_TWO_REGS x23, x24, 32
@@ -165,11 +183,6 @@
     DECREASE_FRAME 96
 .endm
 
-.macro RESTORE_SAVE_REFS_ONLY_FRAME_AND_RETURN
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    ret
-.endm
-
 
 .macro SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL
     INCREASE_FRAME 224
@@ -192,6 +205,9 @@
     SAVE_TWO_REGS x5, x6, 112
 
     // x7, Callee-saves.
+    // Note: We could avoid saving X20 in the case of Baker read
+    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
     SAVE_TWO_REGS x7, x20, 128
     SAVE_TWO_REGS x21, x22, 144
     SAVE_TWO_REGS x23, x24, 160
@@ -250,6 +266,9 @@
     RESTORE_TWO_REGS x5, x6, 112
 
     // x7, Callee-saves.
+    // Note: Likewise, we could avoid restoring X20 in the case of Baker
+    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
     RESTORE_TWO_REGS x7, x20, 128
     RESTORE_TWO_REGS x21, x22, 144
     RESTORE_TWO_REGS x23, x24, 160
@@ -358,7 +377,7 @@
     ldp d29, d30, [sp, #240]
     ldr d31,      [sp, #256]
 
-    // Restore core registers.
+    // Restore core registers, except x0.
     RESTORE_TWO_REGS  x1,  x2, 272
     RESTORE_TWO_REGS  x3,  x4, 288
     RESTORE_TWO_REGS  x5,  x6, 304
@@ -379,10 +398,21 @@
 .endm
 
 .macro RESTORE_SAVE_EVERYTHING_FRAME
-    RESTORE_REG            x0, 264
+    RESTORE_REG       x0,      264
     RESTORE_SAVE_EVERYTHING_FRAME_KEEP_X0
 .endm
 
+// Macro to refresh the Marking Register (W20).
+//
+// This macro must be called at the end of functions implementing
+// entrypoints that possibly (directly or indirectly) perform a
+// suspend check (before they return).
+.macro REFRESH_MARKING_REGISTER
+#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+    ldr wMR, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
+#endif
+.endm
+
 .macro RETURN_IF_RESULT_IS_ZERO
     cbnz x0, 1f                // result non-zero branch over
     ret                        // return
@@ -562,6 +592,7 @@
     bl     \cxx_name                      // (method_idx, this, Thread*, SP)
     mov    xIP0, x1                       // save Method*->code_
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
     cbz    x0, 1f                         // did we find the target? if not go to exception delivery
     br     xIP0                           // tail call to target
 1:
@@ -661,13 +692,15 @@
 
 .macro INVOKE_STUB_CALL_AND_RETURN
 
+    REFRESH_MARKING_REGISTER
+
     // load method-> METHOD_QUICK_CODE_OFFSET
     ldr x9, [x0, #ART_METHOD_QUICK_CODE_OFFSET_64]
     // Branch to method.
     blr x9
 
     // Restore return value address and shorty address.
-    ldp x4,x5, [xFP, #16]
+    ldp x4, x5, [xFP, #16]
     .cfi_restore x4
     .cfi_restore x5
 
@@ -1046,6 +1079,7 @@
     stp x3, x4, [sp, #16]                 // Save result and shorty addresses.
     stp xFP, xLR, [sp]                    // Store LR & FP.
     mov xSELF, x5                         // Move thread pointer into SELF register.
+    REFRESH_MARKING_REGISTER
 
     sub sp, sp, #16
     str xzr, [sp]                         // Store null for ArtMethod* slot
@@ -1152,7 +1186,7 @@
     ldp x24, x25, [x0], #-16
     ldp x22, x23, [x0], #-16
     ldp x20, x21, [x0], #-16
-    ldp x18, x19, [x0], #-16
+    ldp x18, x19, [x0], #-16         // X18 & xSELF
     ldp x16, x17, [x0], #-16
     ldp x14, x15, [x0], #-16
     ldp x12, x13, [x0], #-16
@@ -1163,6 +1197,8 @@
     ldp x2, x3, [x0], #-16
     mov sp, x1
 
+    REFRESH_MARKING_REGISTER
+
     // Need to load PC, it's at the end (after the space for the unused XZR). Use x1.
     ldr x1, [x0, #33*8]
     // And the value of x0.
@@ -1213,6 +1249,7 @@
     mov    x1, xSELF                  // pass Thread::Current
     bl     artLockObjectFromCode      // (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_lock_object
 
@@ -1221,6 +1258,7 @@
     mov    x1, xSELF                  // pass Thread::Current
     bl     artLockObjectFromCode      // (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_lock_object_no_inline
 
@@ -1275,6 +1313,7 @@
     mov    x1, xSELF                  // pass Thread::Current
     bl     artUnlockObjectFromCode    // (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_unlock_object
 
@@ -1283,6 +1322,7 @@
     mov    x1, xSELF                  // pass Thread::Current
     bl     artUnlockObjectFromCode    // (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_unlock_object_no_inline
 
@@ -1356,7 +1396,7 @@
      */
 .macro READ_BARRIER xDest, wDest, xObj, xTemp, wTemp, offset, number
 #ifdef USE_READ_BARRIER
-#ifdef USE_BAKER_READ_BARRIER
+# ifdef USE_BAKER_READ_BARRIER
     ldr \wTemp, [\xObj, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
     tbnz \wTemp, #LOCK_WORD_READ_BARRIER_STATE_SHIFT, .Lrb_slowpath\number
     // False dependency to avoid needing load/load fence.
@@ -1364,7 +1404,7 @@
     ldr \wDest, [\xObj, #\offset]   // Heap reference = 32b. This also zero-extends to \xDest.
     UNPOISON_HEAP_REF \wDest
     b .Lrb_exit\number
-#endif
+# endif  // USE_BAKER_READ_BARRIER
 .Lrb_slowpath\number:
     // Store registers used in art_quick_aput_obj (x0-x4, LR), stack is 16B aligned.
     SAVE_TWO_REGS_INCREASE_FRAME x0, x1, 48
@@ -1471,6 +1511,7 @@
     mov    x1, xSELF                  // pass Thread::Current
     bl     \entrypoint                // (uint32_t type_idx, Method* method, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -1483,6 +1524,7 @@
     mov    x2, xSELF                  // pass Thread::Current
     bl     \entrypoint                // (uint32_t type_idx, Method* method, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -1495,6 +1537,7 @@
     mov    x3, xSELF                  // pass Thread::Current
     bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -1507,8 +1550,8 @@
     mov    x4, xSELF                  // pass Thread::Current
     bl     \entrypoint                //
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
-    DELIVER_PENDING_EXCEPTION
 END \name
 .endm
 
@@ -1520,6 +1563,7 @@
     mov    x1, xSELF                  // pass Thread::Current
     bl     \entrypoint                // (uint32_t type_idx, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -1531,6 +1575,7 @@
     mov    x2, xSELF                  // pass Thread::Current
     bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -1542,6 +1587,7 @@
     mov    x3, xSELF                  // pass Thread::Current
     bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     \return
 END \name
 .endm
@@ -1556,6 +1602,7 @@
     cbz   w0, 1f                      // If result is null, deliver the OOME.
     .cfi_remember_state
     RESTORE_SAVE_EVERYTHING_FRAME_KEEP_X0
+    REFRESH_MARKING_REGISTER
     ret                        // return
     .cfi_restore_state
     .cfi_def_cfa_offset FRAME_SIZE_SAVE_EVERYTHING  // workaround for clang bug: 31975598
@@ -1588,6 +1635,9 @@
 ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode
 ONE_ARG_SAVE_EVERYTHING_DOWNCALL art_quick_resolve_string, artResolveStringFromCode
 
+// Note: Functions `art{Get,Set}<Kind>{Static,Instance}FromCompiledCode` are
+// defined by macros in runtime/entrypoints/quick/quick_field_entrypoints.cc.
+
 ONE_ARG_REF_DOWNCALL art_quick_get_boolean_static, artGetBooleanStaticFromCompiledCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
 ONE_ARG_REF_DOWNCALL art_quick_get_byte_static, artGetByteStaticFromCompiledCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
 ONE_ARG_REF_DOWNCALL art_quick_get_char_static, artGetCharStaticFromCompiledCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
@@ -1752,6 +1802,7 @@
     mov    x1, xSELF                                // pass Thread::Current
     bl     \cxx_name
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 END \c_name
 .endm
@@ -1815,6 +1866,7 @@
     mov    x1, xSELF                           // Pass Thread::Current.
     bl     \entrypoint                         // (mirror::Class*, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 END \name
 .endm
@@ -1825,7 +1877,7 @@
 GENERATE_ALLOC_OBJECT_RESOLVED_TLAB art_quick_alloc_object_initialized_tlab, artAllocObjectFromCodeInitializedTLAB, /* isInitialized */ 1
 
 .macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel, xClass, wClass, xCount, wCount, xTemp0, wTemp0, xTemp1, wTemp1, xTemp2, wTemp2
-    and    \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED64 // Apply alignemnt mask
+    and    \xTemp1, \xTemp1, #OBJECT_ALIGNMENT_MASK_TOGGLED64 // Apply alignment mask
                                                               // (addr + 7) & ~7. The mask must
                                                               // be 64 bits to keep high bits in
                                                               // case of overflow.
@@ -1887,6 +1939,7 @@
     mov    x2, xSELF                  // pass Thread::Current
     bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 END \name
 .endm
@@ -1937,8 +1990,8 @@
     add    \xTemp1, \xTemp1, #(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
 .endm
 
-# TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm64, remove
-# the entrypoint once all backends have been updated to use the size variants.
+// TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm64, remove
+// the entrypoint once all backends have been updated to use the size variants.
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
@@ -1959,6 +2012,7 @@
     mov    x0, xSELF
     bl     artTestSuspendFromCode             // (Thread*)
     RESTORE_SAVE_EVERYTHING_FRAME
+    REFRESH_MARKING_REGISTER
     ret
 END art_quick_test_suspend
 
@@ -1966,7 +2020,9 @@
     mov    x0, xSELF
     SETUP_SAVE_REFS_ONLY_FRAME                // save callee saves for stack crawl
     bl     artTestSuspendFromCode             // (Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME_AND_RETURN
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
+    ret
 END art_quick_implicit_suspend
 
      /*
@@ -1983,6 +2039,7 @@
     ldr     x2, [xSELF, THREAD_EXCEPTION_OFFSET]
     cbnz    x2, .Lexception_in_proxy    // success if no exception is pending
     RESTORE_SAVE_REFS_AND_ARGS_FRAME    // Restore frame
+    REFRESH_MARKING_REGISTER
     fmov    d0, x0                      // Store result in d0 in case it was float or double
     ret                                 // return on success
 .Lexception_in_proxy:
@@ -2035,6 +2092,7 @@
     mov xIP0, x0            // Remember returned code pointer in xIP0.
     ldr x0, [sp, #0]        // artQuickResolutionTrampoline puts called method in *SP.
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
     br xIP0
 1:
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
@@ -2170,6 +2228,7 @@
 
     // Tear down the callee-save frame.
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
 
     // store into fpr, for when it's a fpr return...
     fmov d0, x0
@@ -2202,6 +2261,7 @@
     bl   artQuickToInterpreterBridge
 
     RESTORE_SAVE_REFS_AND_ARGS_FRAME       // TODO: no need to restore arguments in this case.
+    REFRESH_MARKING_REGISTER
 
     fmov d0, x0
 
@@ -2231,6 +2291,7 @@
     mov   x0, x20             // Reload method reference.
 
     RESTORE_SAVE_REFS_AND_ARGS_FRAME  // Note: will restore xSELF
+    REFRESH_MARKING_REGISTER
     cbz   xIP0, 1f            // Deliver the pending exception if method is null.
     adr   xLR, art_quick_instrumentation_exit
     br    xIP0                // Tail-call method with lr set to art_quick_instrumentation_exit.
@@ -2263,6 +2324,7 @@
     .cfi_adjust_cfa_offset -16
 
     RESTORE_SAVE_REFS_ONLY_FRAME
+    REFRESH_MARKING_REGISTER
     cbz   xIP0, 1f            // Handle error
     br    xIP0                // Tail-call out.
 1:
@@ -2831,6 +2893,7 @@
 .Lcleanup_and_return:
     DECREASE_FRAME 16
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
     RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
 
     .section    .rodata                           // Place handler table in read-only section away from text.
diff --git a/runtime/arch/arm64/registers_arm64.h b/runtime/arch/arm64/registers_arm64.h
index 4683fc3..d4c9192 100644
--- a/runtime/arch/arm64/registers_arm64.h
+++ b/runtime/arch/arm64/registers_arm64.h
@@ -61,6 +61,7 @@
   kNumberOfXRegisters = 33,
   // Aliases.
   TR  = X19,     // ART Thread Register - Managed Runtime (Callee Saved Reg)
+  MR  = X20,     // ART Marking Register - Managed Runtime (Callee Saved Reg)
   IP0 = X16,     // Used as scratch by VIXL.
   IP1 = X17,     // Used as scratch by ART JNI Assembler.
   FP  = X29,
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index 2b3525b..fbfa756 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -53,7 +53,7 @@
 .endm
 
 // Generate the allocation entrypoints for each allocator. This is used as an alternative to
-// GNERATE_ALL_ALLOC_ENTRYPOINTS for selectively implementing allocation fast paths in
+// GENERATE_ALL_ALLOC_ENTRYPOINTS for selectively implementing allocation fast paths in
 // hand-written assembly.
 #define GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(c_suffix, cxx_suffix) \
   ONE_ARG_DOWNCALL art_quick_alloc_object_resolved ## c_suffix, artAllocObjectFromCodeResolved ## cxx_suffix, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 458e830..8d3c62f 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -166,7 +166,7 @@
   }
   if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) {
     // Switch to read barrier mark entrypoints before we gray the objects. This is required in case
-    // a mutator sees a gray bit and dispatches on the entrpoint. (b/37876887).
+    // a mutator sees a gray bit and dispatches on the entrypoint. (b/37876887).
     ActivateReadBarrierEntrypoints();
     // Gray dirty immune objects concurrently to reduce GC pause times. We re-process gray cards in
     // the pause.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 36ecd33..3a3a5a0 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2862,6 +2862,7 @@
   DO_THREAD_OFFSET(SelfOffset<ptr_size>(), "self")
   DO_THREAD_OFFSET(StackEndOffset<ptr_size>(), "stack_end")
   DO_THREAD_OFFSET(ThinLockIdOffset<ptr_size>(), "thin_lock_thread_id")
+  DO_THREAD_OFFSET(IsGcMarkingOffset<ptr_size>(), "is_gc_marking")
   DO_THREAD_OFFSET(TopOfManagedStackOffset<ptr_size>(), "top_quick_frame_method")
   DO_THREAD_OFFSET(TopShadowFrameOffset<ptr_size>(), "top_shadow_frame")
   DO_THREAD_OFFSET(TopHandleScopeOffset<ptr_size>(), "top_handle_scope")
diff --git a/runtime/thread.h b/runtime/thread.h
index e785ddc..24d126f 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -656,6 +656,17 @@
         OFFSETOF_MEMBER(tls_ptr_sized_values, jni_entrypoints) + jni_entrypoint_offset);
   }
 
+  // Return the entry point offset integer value for ReadBarrierMarkRegX, where X is `reg`.
+  template <PointerSize pointer_size>
+  static int32_t ReadBarrierMarkEntryPointsOffset(size_t reg) {
+    // The entry point list defines 30 ReadBarrierMarkRegX entry points.
+    DCHECK_LT(reg, 30u);
+    // The ReadBarrierMarkRegX entry points are ordered by increasing
+    // register number in Thread::tlsPtr_.quick_entrypoints.
+    return QUICK_ENTRYPOINT_OFFSET(pointer_size, pReadBarrierMarkReg00).Int32Value()
+        + static_cast<size_t>(pointer_size) * reg;
+  }
+
   template<PointerSize pointer_size>
   static ThreadOffset<pointer_size> SelfOffset() {
     return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, self));
diff --git a/test/990-method-handle-and-mr/build b/test/990-method-handle-and-mr/build
new file mode 100755
index 0000000..5e5f36e
--- /dev/null
+++ b/test/990-method-handle-and-mr/build
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Copyright 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Exit on failure.
+set -e
+
+if [[ $@ != *"--jvm"* ]]; then
+  # Use Jack unless building for the JVM (--jvm); nothing to set in that case.
+  export USE_JACK=true
+fi
+
+./default-build "$@" --experimental method-handles
diff --git a/test/990-method-handle-and-mr/expected.txt b/test/990-method-handle-and-mr/expected.txt
new file mode 100644
index 0000000..8483fb5
--- /dev/null
+++ b/test/990-method-handle-and-mr/expected.txt
@@ -0,0 +1,4 @@
+Test
+Test
+Test
+passed
diff --git a/test/990-method-handle-and-mr/info.txt b/test/990-method-handle-and-mr/info.txt
new file mode 100644
index 0000000..85a957c
--- /dev/null
+++ b/test/990-method-handle-and-mr/info.txt
@@ -0,0 +1,2 @@
+Test stressing code generated for invoke-polymorphic instructions with
+respect to Marking Register (on architectures supporting MR).
diff --git a/test/990-method-handle-and-mr/src/Main.java b/test/990-method-handle-and-mr/src/Main.java
new file mode 100644
index 0000000..739b8eb
--- /dev/null
+++ b/test/990-method-handle-and-mr/src/Main.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This test was inspired by benchmarks.MicroMethodHandles.java.MicroMethodHandles.
+
+import java.io.PrintStream;
+import java.lang.invoke.MethodHandle;
+import java.lang.invoke.MethodHandles;
+import java.lang.invoke.MethodType;
+
+class A {
+  public Long binaryFunction(int x, double y) {
+    return 1000l;
+  }
+}
+
+class Test {
+  Test() throws Throwable {
+    this.handle = MethodHandles.lookup().findVirtual(A.class, "binaryFunction",
+                                                     MethodType.methodType(Long.class, int.class,
+                                                                           double.class));
+    this.a = new A();
+    this.x = new Integer(72);
+    this.y = new Double(-1.39e-31);
+  }
+
+  void execute() {
+    try {
+      executeFor(2000);
+      System.out.println(getName());
+    } catch (Throwable t) {
+      System.err.println("Exception during the execution of " + getName());
+      System.err.println(t);
+      t.printStackTrace(new PrintStream(System.err));
+      System.exit(1);
+    }
+  }
+
+  void executeFor(long timeMinimumMillis) throws Throwable {
+    long startTime = System.currentTimeMillis();
+    long elapsed = 0;
+    while (elapsed < timeMinimumMillis) {
+      exercise();
+      elapsed = System.currentTimeMillis() - startTime;
+    }
+  }
+
+  void exercise() throws Throwable {
+    for (int i = 0; i < EXERCISE_ITERATIONS; ++i) {
+      run();
+    }
+  }
+
+  void run() throws Throwable {
+    long result = (long) handle.invoke(a, x, y);
+  }
+
+  String getName() {
+    return getClass().getSimpleName();
+  }
+
+  private static final int EXERCISE_ITERATIONS = 500;
+
+  private MethodHandle handle;
+  private A a;
+  private Integer x;
+  private Double y;
+}
+
+public class Main {
+  public static void main(String[] args) throws Throwable {
+    Test[] tests = new Test[] { new Test(), new Test(), new Test() };
+    for (Test test : tests) {
+      test.execute();
+    }
+    System.out.println("passed");
+  }
+}