MIPS: Reduce Baker read barrier code size overhead
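
Use introspection-based thunks for the Baker read barrier fast paths
on MIPS32 and MIPS64. Instead of reserving a temporary register per
reference load and branching to an out-of-line marking slow path, the
compiled code loads the introspection entrypoint (null when the GC is
not marking) into T9 and calls a small per-register thunk that checks
the reference/holder and jumps to the marking entrypoint only when
needed. This removes the extra temp from field and array gets and
shortens the inline sequences for GC root loads.

A rough sketch of the new R6 fast path for a field load with a short
offset (register and label names illustrative):

    lw    t9, pReadBarrierMarkReg00(rSELF)  # introspection entrypoint;
                                            # null if GC is not marking
    beqzc t9, 1f
    nop                                     # in forbidden slot
    jialc t9, thunk_disp                    # thunk chosen by holder reg
  1:
    lw    ref, offset(obj)                  # original reference load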

Test: booted MIPS64 (with 2nd arch MIPS32R6) in QEMU
Test: test-art-target-gtest
Test: testrunner.py --target --optimizing
Test: same tests as above on CI20
Test: booted MIPS32 and MIPS64 in QEMU with poisoning
      in configurations:
      - with Baker read barrier thunks
      - without Baker read barrier thunks
      - ART_READ_BARRIER_TYPE=TABLELOOKUP

Change-Id: I79f320bf8862a04215c76cfeff3118ebc87f7ef2
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 23d188d..b6eb5c1 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_mips.h"
 
+#include "arch/mips/asm_support_mips.h"
 #include "arch/mips/entrypoints_direct_mips.h"
 #include "arch/mips/instruction_set_features_mips.h"
 #include "art_method.h"
@@ -40,6 +41,11 @@
 static constexpr int kCurrentMethodStackOffset = 0;
 static constexpr Register kMethodRegisterArgument = A0;
 
+// Flags controlling the use of thunks for Baker read barriers.
+constexpr bool kBakerReadBarrierThunksEnableForFields = true;
+constexpr bool kBakerReadBarrierThunksEnableForArrays = true;
+constexpr bool kBakerReadBarrierThunksEnableForGcRoots = true;
+
 Location MipsReturnLocation(Primitive::Type return_type) {
   switch (return_type) {
     case Primitive::kPrimBoolean:
@@ -1486,7 +1492,8 @@
         __ Mfc1(dst_low, src);
         __ MoveFromFpuHigh(dst_high, src);
       } else {
-        DCHECK(source.IsDoubleStackSlot()) << "Cannot move from " << source << " to " << destination;
+        DCHECK(source.IsDoubleStackSlot())
+            << "Cannot move from " << source << " to " << destination;
         int32_t off = source.GetStackIndex();
         Register r = destination.AsRegisterPairLow<Register>();
         __ LoadFromOffset(kLoadDoubleword, r, SP, off);
@@ -1539,7 +1546,8 @@
       } else if (source.IsFpuRegister()) {
         __ StoreDToOffset(source.AsFpuRegister<FRegister>(), SP, dst_offset);
       } else {
-        DCHECK(source.IsDoubleStackSlot()) << "Cannot move from " << source << " to " << destination;
+        DCHECK(source.IsDoubleStackSlot())
+            << "Cannot move from " << source << " to " << destination;
         __ LoadFromOffset(kLoadWord, TMP, SP, source.GetStackIndex());
         __ StoreToOffset(kStoreWord, TMP, SP, dst_offset);
         __ LoadFromOffset(kLoadWord, TMP, SP, source.GetStackIndex() + 4);
@@ -1763,8 +1771,10 @@
   }
   // A following instruction will add the sign-extended low half of the 32-bit
   // offset to `out` (e.g. lw, jialc, addiu).
-  DCHECK_EQ(info_low->patch_info_high, info_high);
-  __ Bind(&info_low->label);
+  if (info_low != nullptr) {
+    DCHECK_EQ(info_low->patch_info_high, info_high);
+    __ Bind(&info_low->label);
+  }
 }
 
 CodeGeneratorMIPS::JitPatchInfo* CodeGeneratorMIPS::NewJitRootStringPatch(
@@ -1791,25 +1801,26 @@
                                         const uint8_t* roots_data,
                                         const CodeGeneratorMIPS::JitPatchInfo& info,
                                         uint64_t index_in_table) const {
-  uint32_t literal_offset = GetAssembler().GetLabelLocation(&info.high_label);
+  uint32_t high_literal_offset = GetAssembler().GetLabelLocation(&info.high_label);
+  uint32_t low_literal_offset = GetAssembler().GetLabelLocation(&info.low_label);
   uintptr_t address =
       reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
   uint32_t addr32 = dchecked_integral_cast<uint32_t>(address);
   // lui reg, addr32_high
-  DCHECK_EQ(code[literal_offset + 0], 0x34);
-  DCHECK_EQ(code[literal_offset + 1], 0x12);
-  DCHECK_EQ((code[literal_offset + 2] & 0xE0), 0x00);
-  DCHECK_EQ(code[literal_offset + 3], 0x3C);
+  DCHECK_EQ(code[high_literal_offset + 0], 0x34);
+  DCHECK_EQ(code[high_literal_offset + 1], 0x12);
+  DCHECK_EQ((code[high_literal_offset + 2] & 0xE0), 0x00);
+  DCHECK_EQ(code[high_literal_offset + 3], 0x3C);
   // instr reg, reg, addr32_low
-  DCHECK_EQ(code[literal_offset + 4], 0x78);
-  DCHECK_EQ(code[literal_offset + 5], 0x56);
+  DCHECK_EQ(code[low_literal_offset + 0], 0x78);
+  DCHECK_EQ(code[low_literal_offset + 1], 0x56);
   addr32 += (addr32 & 0x8000) << 1;  // Account for sign extension in "instr reg, reg, addr32_low".
   // lui reg, addr32_high
-  code[literal_offset + 0] = static_cast<uint8_t>(addr32 >> 16);
-  code[literal_offset + 1] = static_cast<uint8_t>(addr32 >> 24);
+  code[high_literal_offset + 0] = static_cast<uint8_t>(addr32 >> 16);
+  code[high_literal_offset + 1] = static_cast<uint8_t>(addr32 >> 24);
   // instr reg, reg, addr32_low
-  code[literal_offset + 4] = static_cast<uint8_t>(addr32 >> 0);
-  code[literal_offset + 5] = static_cast<uint8_t>(addr32 >> 8);
+  code[low_literal_offset + 0] = static_cast<uint8_t>(addr32 >> 0);
+  code[low_literal_offset + 1] = static_cast<uint8_t>(addr32 >> 8);
 }
 
 void CodeGeneratorMIPS::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
@@ -2545,7 +2556,12 @@
   // We need a temporary register for the read barrier marking slow
   // path in CodeGeneratorMIPS::GenerateArrayLoadWithBakerReadBarrier.
   if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
-    locations->AddTemp(Location::RequiresRegister());
+    bool temp_needed = instruction->GetIndex()->IsConstant()
+        ? !kBakerReadBarrierThunksEnableForFields
+        : !kBakerReadBarrierThunksEnableForArrays;
+    if (temp_needed) {
+      locations->AddTemp(Location::RequiresRegister());
+    }
   }
 }
 
@@ -2681,16 +2697,32 @@
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp = locations->GetTemp(0);
+        bool temp_needed = index.IsConstant()
+            ? !kBakerReadBarrierThunksEnableForFields
+            : !kBakerReadBarrierThunksEnableForArrays;
+        Location temp = temp_needed ? locations->GetTemp(0) : Location::NoLocation();
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorMIPS::GenerateArrayLoadWithBakerReadBarrier call.
-        codegen_->GenerateArrayLoadWithBakerReadBarrier(instruction,
-                                                        out_loc,
-                                                        obj,
-                                                        data_offset,
-                                                        index,
-                                                        temp,
-                                                        /* needs_null_check */ true);
+        DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
+        if (index.IsConstant()) {
+          // Array load with a constant index can be treated as a field load.
+          size_t offset =
+              (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
+                                                          out_loc,
+                                                          obj,
+                                                          offset,
+                                                          temp,
+                                                          /* needs_null_check */ false);
+        } else {
+          codegen_->GenerateArrayLoadWithBakerReadBarrier(instruction,
+                                                          out_loc,
+                                                          obj,
+                                                          data_offset,
+                                                          index,
+                                                          temp,
+                                                          /* needs_null_check */ false);
+        }
       } else {
         Register out = out_loc.AsRegister<Register>();
         if (index.IsConstant()) {
@@ -3093,6 +3125,7 @@
 // Temp is used for read barrier.
 static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
   if (kEmitCompilerReadBarrier &&
+      !(kUseBakerReadBarrier && kBakerReadBarrierThunksEnableForFields) &&
       (kUseBakerReadBarrier ||
        type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
@@ -6096,7 +6129,9 @@
     if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
       // We need a temporary register for the read barrier marking slow
       // path in CodeGeneratorMIPS::GenerateFieldLoadWithBakerReadBarrier.
-      locations->AddTemp(Location::RequiresRegister());
+      if (!kBakerReadBarrierThunksEnableForFields) {
+        locations->AddTemp(Location::RequiresRegister());
+      }
     }
   }
 }
@@ -6171,7 +6206,8 @@
     if (type == Primitive::kPrimNot) {
       // /* HeapReference<Object> */ dst = *(obj + offset)
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp_loc = locations->GetTemp(0);
+        Location temp_loc =
+            kBakerReadBarrierThunksEnableForFields ? Location::NoLocation() : locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorMIPS::GenerateFieldLoadWithBakerReadBarrier call.
         codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
@@ -6395,7 +6431,9 @@
   Register out_reg = out.AsRegister<Register>();
   if (read_barrier_option == kWithReadBarrier) {
     CHECK(kEmitCompilerReadBarrier);
-    DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+    if (!kUseBakerReadBarrier || !kBakerReadBarrierThunksEnableForFields) {
+      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+    }
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
@@ -6435,7 +6473,9 @@
   if (read_barrier_option == kWithReadBarrier) {
     CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
-      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+      if (!kBakerReadBarrierThunksEnableForFields) {
+        DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+      }
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
@@ -6458,67 +6498,172 @@
   }
 }
 
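+// Maps a register that can hold a reference to a dense thunk index. The displacements
+// computed below are relative to the introspection entrypoint held in T9: the field/array
+// marking thunks for long offsets come first, then the short-offset variants, then the
+// GC root thunks (layout given by the BAKER_MARK_INTROSPECTION_* constants in
+// asm_support_mips.h).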
+static inline int GetBakerMarkThunkNumber(Register reg) {
+  static_assert(BAKER_MARK_INTROSPECTION_REGISTER_COUNT == 21, "Expecting equal");
+  if (reg >= V0 && reg <= T7) {  // 14 consecutive regs.
+    return reg - V0;
+  } else if (reg >= S2 && reg <= S7) {  // 6 consecutive regs.
+    return 14 + (reg - S2);
+  } else if (reg == FP) {  // One more.
+    return 20;
+  }
+  LOG(FATAL) << "Unexpected register " << reg;
+  UNREACHABLE();
+}
+
+static inline int GetBakerMarkFieldArrayThunkDisplacement(Register reg, bool short_offset) {
+  int num = GetBakerMarkThunkNumber(reg) +
+      (short_offset ? BAKER_MARK_INTROSPECTION_REGISTER_COUNT : 0);
+  return num * BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE;
+}
+
+static inline int GetBakerMarkGcRootThunkDisplacement(Register reg) {
+  return GetBakerMarkThunkNumber(reg) * BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRY_SIZE +
+      BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET;
+}
+
 void InstructionCodeGeneratorMIPS::GenerateGcRootFieldLoad(HInstruction* instruction,
                                                            Location root,
                                                            Register obj,
                                                            uint32_t offset,
-                                                           ReadBarrierOption read_barrier_option) {
+                                                           ReadBarrierOption read_barrier_option,
+                                                           MipsLabel* label_low) {
+  bool reordering;
+  if (label_low != nullptr) {
+    DCHECK_EQ(offset, 0x5678u);
+  }
   Register root_reg = root.AsRegister<Register>();
   if (read_barrier_option == kWithReadBarrier) {
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used:
-      //
-      //   root = obj.field;
-      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   if (temp != null) {
-      //     root = temp(root)
-      //   }
+      if (kBakerReadBarrierThunksEnableForGcRoots) {
+        // Note that we do not actually check the value of `GetIsGcMarking()`
+        // to decide whether to mark the loaded GC root or not.  Instead, we
+        // load into `temp` (T9) the read barrier mark introspection entrypoint.
+        // If `temp` is null, it means that `GetIsGcMarking()` is false, and
+        // vice versa.
+        //
+        // We use thunks for the slow path. That thunk checks the reference
+        // and jumps to the entrypoint if needed.
+        //
+        //     temp = Thread::Current()->pReadBarrierMarkReg00
+        //     // AKA &art_quick_read_barrier_mark_introspection.
+        //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+        //     if (temp != nullptr) {
+        //        temp = &gc_root_thunk<root_reg>
+        //        root = temp(root)
+        //     }
 
-      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-      __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
-      static_assert(
-          sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
-          "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
-          "have different sizes.");
-      static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
-                    "art::mirror::CompressedReference<mirror::Object> and int32_t "
-                    "have different sizes.");
+        bool isR6 = codegen_->GetInstructionSetFeatures().IsR6();
+        const int32_t entry_point_offset =
+            Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(0);
+        const int thunk_disp = GetBakerMarkGcRootThunkDisplacement(root_reg);
+        int16_t offset_low = Low16Bits(offset);
+        int16_t offset_high = High16Bits(offset - offset_low);  // Accounts for sign
+                                                                // extension in lw.
+        bool short_offset = IsInt<16>(static_cast<int32_t>(offset));
+        Register base = short_offset ? obj : TMP;
+        // Loading the entrypoint does not require a load acquire since it is only changed when
+        // threads are suspended or running a checkpoint.
+        __ LoadFromOffset(kLoadWord, T9, TR, entry_point_offset);
+        reordering = __ SetReorder(false);
+        if (!short_offset) {
+          DCHECK(!label_low);
+          __ AddUpper(base, obj, offset_high);
+        }
+        __ Beqz(T9, (isR6 ? 2 : 4));  // Skip jialc / addiu+jalr+nop.
+        if (label_low != nullptr) {
+          DCHECK(short_offset);
+          __ Bind(label_low);
+        }
+        // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+        __ LoadFromOffset(kLoadWord, root_reg, base, offset_low);  // Single instruction
+                                                                   // in delay slot.
+        if (isR6) {
+          __ Jialc(T9, thunk_disp);
+        } else {
+          __ Addiu(T9, T9, thunk_disp);
+          __ Jalr(T9);
+          __ Nop();
+        }
+        __ SetReorder(reordering);
+      } else {
+        // Note that we do not actually check the value of `GetIsGcMarking()`
+        // to decide whether to mark the loaded GC root or not.  Instead, we
+        // load into `temp` (T9) the read barrier mark entry point corresponding
+        // to register `root`. If `temp` is null, it means that `GetIsGcMarking()`
+        // is false, and vice versa.
+        //
+        //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+        //     temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+        //     if (temp != null) {
+        //       root = temp(root)
+        //     }
 
-      // Slow path marking the GC root `root`.
-      Location temp = Location::RegisterLocation(T9);
-      SlowPathCodeMIPS* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS(
-              instruction,
-              root,
-              /*entrypoint*/ temp);
-      codegen_->AddSlowPath(slow_path);
+        if (label_low != nullptr) {
+          reordering = __ SetReorder(false);
+          __ Bind(label_low);
+        }
+        // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+        __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
+        if (label_low != nullptr) {
+          __ SetReorder(reordering);
+        }
+        static_assert(
+            sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
+            "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
+            "have different sizes.");
+        static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
+                      "art::mirror::CompressedReference<mirror::Object> and int32_t "
+                      "have different sizes.");
 
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(root.reg() - 1);
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
-      // The entrypoint is null when the GC is not marking, this prevents one load compared to
-      // checking GetIsGcMarking.
-      __ Bnez(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
-      __ Bind(slow_path->GetExitLabel());
+        // Slow path marking the GC root `root`.
+        Location temp = Location::RegisterLocation(T9);
+        SlowPathCodeMIPS* slow_path =
+            new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS(
+                instruction,
+                root,
+                /*entrypoint*/ temp);
+        codegen_->AddSlowPath(slow_path);
+
+        const int32_t entry_point_offset =
+            Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(root.reg() - 1);
+        // Loading the entrypoint does not require a load acquire since it is only changed when
+        // threads are suspended or running a checkpoint.
+        __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset);
+        __ Bnez(temp.AsRegister<Register>(), slow_path->GetEntryLabel());
+        __ Bind(slow_path->GetExitLabel());
+      }
     } else {
+      if (label_low != nullptr) {
+        reordering = __ SetReorder(false);
+        __ Bind(label_low);
+      }
       // GC root loaded through a slow path for read barriers other
       // than Baker's.
       // /* GcRoot<mirror::Object>* */ root = obj + offset
       __ Addiu32(root_reg, obj, offset);
+      if (label_low != nullptr) {
+        __ SetReorder(reordering);
+      }
       // /* mirror::Object* */ root = root->Read()
       codegen_->GenerateReadBarrierForRootSlow(instruction, root, root);
     }
   } else {
+    if (label_low != nullptr) {
+      reordering = __ SetReorder(false);
+      __ Bind(label_low);
+    }
     // Plain GC root load with no read barrier.
     // /* GcRoot<mirror::Object> */ root = *(obj + offset)
     __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
     // Note that GC roots are not affected by heap poisoning, thus we
     // do not have to unpoison `root_reg` here.
+    if (label_low != nullptr) {
+      __ SetReorder(reordering);
+    }
   }
 }
 
@@ -6531,6 +6676,88 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  if (kBakerReadBarrierThunksEnableForFields) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (T9) the read barrier mark introspection entrypoint.
+    // If `temp` is null, it means that `GetIsGcMarking()` is false, and
+    // vice versa.
+    //
+    // We use thunks for the slow path. That thunk checks the reference
+    // and jumps to the entrypoint if needed. If the holder is not gray,
+    // it issues a load-load memory barrier and returns to the original
+    // reference load.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkReg00
+    //     // AKA &art_quick_read_barrier_mark_introspection.
+    //     if (temp != nullptr) {
+    //        temp = &field_array_thunk<holder_reg>
+    //        temp()
+    //     }
+    //   not_gray_return_address:
+    //     // If the offset is too large to fit into the lw instruction, we
+    //     // use an adjusted base register (TMP) here. This register
+    //     // receives bits 16 ... 31 of the offset before the thunk invocation
+    //     // and the thunk benefits from it.
+    //     HeapReference<mirror::Object> reference = *(obj+offset);  // Original reference load.
+    //   gray_return_address:
+
+    DCHECK(temp.IsInvalid());
+    bool isR6 = GetInstructionSetFeatures().IsR6();
+    int16_t offset_low = Low16Bits(offset);
+    int16_t offset_high = High16Bits(offset - offset_low);  // Accounts for sign extension in lw.
+    bool short_offset = IsInt<16>(static_cast<int32_t>(offset));
+    bool reordering = __ SetReorder(false);
+    const int32_t entry_point_offset =
+        Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(0);
+    // If the field offset is smaller than the page size, there may or may not have been
+    // an implicit null check earlier.
+    // If this is actually a load from an array, a null check must have been done already.
+    // The thunk performs an explicit null check anyway, as doing so is easier than not.
+    if (instruction->IsArrayGet()) {
+      DCHECK(!needs_null_check);
+    }
+    const int thunk_disp = GetBakerMarkFieldArrayThunkDisplacement(obj, short_offset);
+    // Loading the entrypoint does not require a load acquire since it is only changed when
+    // threads are suspended or running a checkpoint.
+    __ LoadFromOffset(kLoadWord, T9, TR, entry_point_offset);
+    Register ref_reg = ref.AsRegister<Register>();
+    Register base = short_offset ? obj : TMP;
+    if (short_offset) {
+      if (isR6) {
+        __ Beqzc(T9, 2);  // Skip jialc.
+        __ Nop();  // In forbidden slot.
+        __ Jialc(T9, thunk_disp);
+      } else {
+        __ Beqz(T9, 3);  // Skip jalr+nop.
+        __ Addiu(T9, T9, thunk_disp);  // In delay slot.
+        __ Jalr(T9);
+        __ Nop();  // In delay slot.
+      }
+    } else {
+      if (isR6) {
+        __ Beqz(T9, 2);  // Skip jialc.
+        __ Aui(base, obj, offset_high);  // In delay slot.
+        __ Jialc(T9, thunk_disp);
+      } else {
+        __ Lui(base, offset_high);
+        __ Beqz(T9, 2);  // Skip jalr.
+        __ Addiu(T9, T9, thunk_disp);  // In delay slot.
+        __ Jalr(T9);
+        __ Addu(base, base, obj);  // In delay slot.
+      }
+    }
+    // /* HeapReference<Object> */ ref = *(obj + offset)
+    __ LoadFromOffset(kLoadWord, ref_reg, base, offset_low);  // Single instruction.
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    __ MaybeUnpoisonHeapReference(ref_reg);
+    __ SetReorder(reordering);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
   ScaleFactor no_scale_factor = TIMES_1;
@@ -6557,9 +6784,69 @@
   static_assert(
       sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
       "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  ScaleFactor scale_factor = TIMES_4;
+
+  if (kBakerReadBarrierThunksEnableForArrays) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (T9) the read barrier mark introspection entrypoint.
+    // If `temp` is null, it means that `GetIsGcMarking()` is false, and
+    // vice versa.
+    //
+    // We use thunks for the slow path. That thunk checks the reference
+    // and jumps to the entrypoint if needed. If the holder is not gray,
+    // it issues a load-load memory barrier and returns to the original
+    // reference load.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkReg00
+    //     // AKA &art_quick_read_barrier_mark_introspection.
+    //     if (temp != nullptr) {
+    //        temp = &field_array_thunk<holder_reg>
+    //        temp()
+    //     }
+    //   not_gray_return_address:
+    //     // The element address is pre-calculated in the TMP register before the
+    //     // thunk invocation and the thunk benefits from it.
+    //     HeapReference<mirror::Object> reference = data[index];  // Original reference load.
+    //   gray_return_address:
+
+    DCHECK(temp.IsInvalid());
+    DCHECK(index.IsValid());
+    bool reordering = __ SetReorder(false);
+    const int32_t entry_point_offset =
+        Thread::ReadBarrierMarkEntryPointsOffset<kMipsPointerSize>(0);
+    // We will not do the explicit null check in the thunk as some form of a null check
+    // must've been done earlier.
+    DCHECK(!needs_null_check);
+    const int thunk_disp = GetBakerMarkFieldArrayThunkDisplacement(obj, /* short_offset */ false);
+    // Loading the entrypoint does not require a load acquire since it is only changed when
+    // threads are suspended or running a checkpoint.
+    __ LoadFromOffset(kLoadWord, T9, TR, entry_point_offset);
+    Register ref_reg = ref.AsRegister<Register>();
+    Register index_reg = index.IsRegisterPair()
+        ? index.AsRegisterPairLow<Register>()
+        : index.AsRegister<Register>();
+    if (GetInstructionSetFeatures().IsR6()) {
+      __ Beqz(T9, 2);  // Skip jialc.
+      __ Lsa(TMP, index_reg, obj, scale_factor);  // In delay slot.
+      __ Jialc(T9, thunk_disp);
+    } else {
+      __ Sll(TMP, index_reg, scale_factor);
+      __ Beqz(T9, 2);  // Skip jalr.
+      __ Addiu(T9, T9, thunk_disp);  // In delay slot.
+      __ Jalr(T9);
+      __ Addu(TMP, TMP, obj);  // In delay slot.
+    }
+    // /* HeapReference<Object> */ ref = *(obj + data_offset + (index << scale_factor))
+    DCHECK(IsInt<16>(static_cast<int32_t>(data_offset))) << data_offset;
+    __ LoadFromOffset(kLoadWord, ref_reg, TMP, data_offset);  // Single instruction.
+    __ MaybeUnpoisonHeapReference(ref_reg);
+    __ SetReorder(reordering);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-  ScaleFactor scale_factor = TIMES_4;
   GenerateReferenceLoadWithBakerReadBarrier(instruction,
                                             ref,
                                             obj,
@@ -7461,10 +7748,14 @@
       bool reordering = __ SetReorder(false);
       codegen_->EmitPcRelativeAddressPlaceholderHigh(bss_info_high,
                                                      temp,
-                                                     base_or_current_method_reg,
-                                                     info_low);
-      GenerateGcRootFieldLoad(cls, out_loc, temp, /* placeholder */ 0x5678, read_barrier_option);
+                                                     base_or_current_method_reg);
       __ SetReorder(reordering);
+      GenerateGcRootFieldLoad(cls,
+                              out_loc,
+                              temp,
+                              /* placeholder */ 0x5678,
+                              read_barrier_option,
+                              &info_low->label);
       generate_null_check = true;
       break;
     }
@@ -7475,8 +7766,13 @@
       bool reordering = __ SetReorder(false);
       __ Bind(&info->high_label);
       __ Lui(out, /* placeholder */ 0x1234);
-      GenerateGcRootFieldLoad(cls, out_loc, out, /* placeholder */ 0x5678, read_barrier_option);
       __ SetReorder(reordering);
+      GenerateGcRootFieldLoad(cls,
+                              out_loc,
+                              out,
+                              /* placeholder */ 0x5678,
+                              read_barrier_option,
+                              &info->low_label);
       break;
     }
     case HLoadClass::LoadKind::kRuntimeCall:
@@ -7623,14 +7919,14 @@
       bool reordering = __ SetReorder(false);
       codegen_->EmitPcRelativeAddressPlaceholderHigh(info_high,
                                                      temp,
-                                                     base_or_current_method_reg,
-                                                     info_low);
+                                                     base_or_current_method_reg);
+      __ SetReorder(reordering);
       GenerateGcRootFieldLoad(load,
                               out_loc,
                               temp,
                               /* placeholder */ 0x5678,
-                              kCompilerReadBarrierOption);
-      __ SetReorder(reordering);
+                              kCompilerReadBarrierOption,
+                              &info_low->label);
       SlowPathCodeMIPS* slow_path =
           new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load, info_high);
       codegen_->AddSlowPath(slow_path);
@@ -7646,12 +7942,13 @@
       bool reordering = __ SetReorder(false);
       __ Bind(&info->high_label);
       __ Lui(out, /* placeholder */ 0x1234);
+      __ SetReorder(reordering);
       GenerateGcRootFieldLoad(load,
                               out_loc,
                               out,
                               /* placeholder */ 0x5678,
-                              kCompilerReadBarrierOption);
-      __ SetReorder(reordering);
+                              kCompilerReadBarrierOption,
+                              &info->low_label);
       return;
     }
     default:
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index 52ee852..7195b9d 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -285,7 +285,8 @@
                                Location root,
                                Register obj,
                                uint32_t offset,
-                               ReadBarrierOption read_barrier_option);
+                               ReadBarrierOption read_barrier_option,
+                               MipsLabel* label_low = nullptr);
 
   void GenerateIntCompare(IfCondition cond, LocationSummary* locations);
   // When the function returns `false` it means that the condition holds if `dst` is non-zero
@@ -637,7 +638,7 @@
   void EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info_high,
                                             Register out,
                                             Register base,
-                                            PcRelativePatchInfo* info_low);
+                                            PcRelativePatchInfo* info_low = nullptr);
 
   // The JitPatchInfo is used for JIT string and class loads.
   struct JitPatchInfo {
@@ -649,8 +650,9 @@
     // String/type index.
     uint64_t index;
     // Label for the instruction loading the most significant half of the address.
-    // The least significant half is loaded with the instruction that follows immediately.
     MipsLabel high_label;
+    // Label for the instruction supplying the least significant half of the address.
+    MipsLabel low_label;
   };
 
   void PatchJitRootUse(uint8_t* code,
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 454a2dd..3e79f47 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_mips64.h"
 
+#include "arch/mips64/asm_support_mips64.h"
 #include "art_method.h"
 #include "code_generator_utils.h"
 #include "compiled_method.h"
@@ -38,6 +39,11 @@
 static constexpr int kCurrentMethodStackOffset = 0;
 static constexpr GpuRegister kMethodRegisterArgument = A0;
 
+// Flags controlling the use of thunks for Baker read barriers.
+constexpr bool kBakerReadBarrierThunksEnableForFields = true;
+constexpr bool kBakerReadBarrierThunksEnableForArrays = true;
+constexpr bool kBakerReadBarrierThunksEnableForGcRoots = true;
+
 Location Mips64ReturnLocation(Primitive::Type return_type) {
   switch (return_type) {
     case Primitive::kPrimBoolean:
@@ -1649,8 +1655,10 @@
   __ Auipc(out, /* placeholder */ 0x1234);
   // A following instruction will add the sign-extended low half of the 32-bit
   // offset to `out` (e.g. ld, jialc, daddiu).
-  DCHECK_EQ(info_low->patch_info_high, info_high);
-  __ Bind(&info_low->label);
+  if (info_low != nullptr) {
+    DCHECK_EQ(info_low->patch_info_high, info_high);
+    __ Bind(&info_low->label);
+  }
 }
 
 Literal* CodeGeneratorMIPS64::DeduplicateJitStringLiteral(const DexFile& dex_file,
@@ -2117,7 +2125,12 @@
   // We need a temporary register for the read barrier marking slow
   // path in CodeGeneratorMIPS64::GenerateArrayLoadWithBakerReadBarrier.
   if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
-    locations->AddTemp(Location::RequiresRegister());
+    bool temp_needed = instruction->GetIndex()->IsConstant()
+        ? !kBakerReadBarrierThunksEnableForFields
+        : !kBakerReadBarrierThunksEnableForArrays;
+    if (temp_needed) {
+      locations->AddTemp(Location::RequiresRegister());
+    }
   }
 }
 
@@ -2254,16 +2267,32 @@
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp = locations->GetTemp(0);
+        bool temp_needed = index.IsConstant()
+            ? !kBakerReadBarrierThunksEnableForFields
+            : !kBakerReadBarrierThunksEnableForArrays;
+        Location temp = temp_needed ? locations->GetTemp(0) : Location::NoLocation();
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorMIPS64::GenerateArrayLoadWithBakerReadBarrier call.
-        codegen_->GenerateArrayLoadWithBakerReadBarrier(instruction,
-                                                        out_loc,
-                                                        obj,
-                                                        data_offset,
-                                                        index,
-                                                        temp,
-                                                        /* needs_null_check */ true);
+        DCHECK(!instruction->CanDoImplicitNullCheckOn(instruction->InputAt(0)));
+        if (index.IsConstant()) {
+          // Array load with a constant index can be treated as a field load.
+          size_t offset =
+              (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
+                                                          out_loc,
+                                                          obj,
+                                                          offset,
+                                                          temp,
+                                                          /* needs_null_check */ false);
+        } else {
+          codegen_->GenerateArrayLoadWithBakerReadBarrier(instruction,
+                                                          out_loc,
+                                                          obj,
+                                                          data_offset,
+                                                          index,
+                                                          temp,
+                                                          /* needs_null_check */ false);
+        }
       } else {
         GpuRegister out = out_loc.AsRegister<GpuRegister>();
         if (index.IsConstant()) {
@@ -2666,6 +2695,7 @@
 // Temp is used for read barrier.
 static size_t NumberOfInstanceOfTemps(TypeCheckKind type_check_kind) {
   if (kEmitCompilerReadBarrier &&
+      !(kUseBakerReadBarrier && kBakerReadBarrierThunksEnableForFields) &&
       (kUseBakerReadBarrier ||
        type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
@@ -4118,7 +4148,9 @@
   if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
     // We need a temporary register for the read barrier marking slow
     // path in CodeGeneratorMIPS64::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
+    if (!kBakerReadBarrierThunksEnableForFields) {
+      locations->AddTemp(Location::RequiresRegister());
+    }
   }
 }
 
@@ -4168,7 +4200,8 @@
     if (type == Primitive::kPrimNot) {
       // /* HeapReference<Object> */ dst = *(obj + offset)
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp_loc = locations->GetTemp(0);
+        Location temp_loc =
+            kBakerReadBarrierThunksEnableForFields ? Location::NoLocation() : locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorMIPS64::GenerateFieldLoadWithBakerReadBarrier call.
         codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
@@ -4318,7 +4351,9 @@
   GpuRegister out_reg = out.AsRegister<GpuRegister>();
   if (read_barrier_option == kWithReadBarrier) {
     CHECK(kEmitCompilerReadBarrier);
-    DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+    if (!kUseBakerReadBarrier || !kBakerReadBarrierThunksEnableForFields) {
+      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+    }
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
@@ -4358,7 +4393,9 @@
   if (read_barrier_option == kWithReadBarrier) {
     CHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
-      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+      if (!kBakerReadBarrierThunksEnableForFields) {
+        DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+      }
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(instruction,
@@ -4381,55 +4418,134 @@
   }
 }
 
-void InstructionCodeGeneratorMIPS64::GenerateGcRootFieldLoad(
-    HInstruction* instruction,
-    Location root,
-    GpuRegister obj,
-    uint32_t offset,
-    ReadBarrierOption read_barrier_option) {
+static inline int GetBakerMarkThunkNumber(GpuRegister reg) {
+  static_assert(BAKER_MARK_INTROSPECTION_REGISTER_COUNT == 20, "Expecting equal");
+  if (reg >= V0 && reg <= T2) {  // 13 consecutive regs.
+    return reg - V0;
+  } else if (reg >= S2 && reg <= S7) {  // 6 consecutive regs.
+    return 13 + (reg - S2);
+  } else if (reg == S8) {  // One more.
+    return 19;
+  }
+  LOG(FATAL) << "Unexpected register " << reg;
+  UNREACHABLE();
+}
+
+static inline int GetBakerMarkFieldArrayThunkDisplacement(GpuRegister reg, bool short_offset) {
+  int num = GetBakerMarkThunkNumber(reg) +
+      (short_offset ? BAKER_MARK_INTROSPECTION_REGISTER_COUNT : 0);
+  return num * BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE;
+}
+
+static inline int GetBakerMarkGcRootThunkDisplacement(GpuRegister reg) {
+  return GetBakerMarkThunkNumber(reg) * BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRY_SIZE +
+      BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET;
+}
+
+void InstructionCodeGeneratorMIPS64::GenerateGcRootFieldLoad(HInstruction* instruction,
+                                                             Location root,
+                                                             GpuRegister obj,
+                                                             uint32_t offset,
+                                                             ReadBarrierOption read_barrier_option,
+                                                             Mips64Label* label_low) {
+  if (label_low != nullptr) {
+    DCHECK_EQ(offset, 0x5678u);
+  }
   GpuRegister root_reg = root.AsRegister<GpuRegister>();
   if (read_barrier_option == kWithReadBarrier) {
     DCHECK(kEmitCompilerReadBarrier);
     if (kUseBakerReadBarrier) {
       // Fast path implementation of art::ReadBarrier::BarrierForRoot when
       // Baker's read barrier are used:
-      //
-      //   root = obj.field;
-      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      //   if (temp != null) {
-      //     root = temp(root)
-      //   }
+      if (kBakerReadBarrierThunksEnableForGcRoots) {
+        // Note that we do not actually check the value of `GetIsGcMarking()`
+        // to decide whether to mark the loaded GC root or not.  Instead, we
+        // load into `temp` (T9) the read barrier mark introspection entrypoint.
+        // If `temp` is null, it means that `GetIsGcMarking()` is false, and
+        // vice versa.
+        //
+        // We use thunks for the slow path. That thunk checks the reference
+        // and jumps to the entrypoint if needed.
+        //
+        //     temp = Thread::Current()->pReadBarrierMarkReg00
+        //     // AKA &art_quick_read_barrier_mark_introspection.
+        //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+        //     if (temp != nullptr) {
+        //        temp = &gc_root_thunk<root_reg>
+        //        root = temp(root)
+        //     }
 
-      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
-      __ LoadFromOffset(kLoadUnsignedWord, root_reg, obj, offset);
-      static_assert(
-          sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
-          "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
-          "have different sizes.");
-      static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
-                    "art::mirror::CompressedReference<mirror::Object> and int32_t "
-                    "have different sizes.");
+        const int32_t entry_point_offset =
+            Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(0);
+        const int thunk_disp = GetBakerMarkGcRootThunkDisplacement(root_reg);
+        int16_t offset_low = Low16Bits(offset);
+        int16_t offset_high = High16Bits(offset - offset_low);  // Accounts for sign
+                                                                // extension in lwu.
+        bool short_offset = IsInt<16>(static_cast<int32_t>(offset));
+        GpuRegister base = short_offset ? obj : TMP;
+        // Loading the entrypoint does not require a load acquire since it is only changed when
+        // threads are suspended or running a checkpoint.
+        __ LoadFromOffset(kLoadDoubleword, T9, TR, entry_point_offset);
+        if (!short_offset) {
+          DCHECK(!label_low);
+          __ Daui(base, obj, offset_high);
+        }
+        __ Beqz(T9, 2);  // Skip jialc.
+        if (label_low != nullptr) {
+          DCHECK(short_offset);
+          __ Bind(label_low);
+        }
+        // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+        __ LoadFromOffset(kLoadUnsignedWord, root_reg, base, offset_low);  // Single instruction
+                                                                           // in delay slot.
+        __ Jialc(T9, thunk_disp);
+      } else {
+        // Note that we do not actually check the value of `GetIsGcMarking()`
+        // to decide whether to mark the loaded GC root or not.  Instead, we
+        // load into `temp` (T9) the read barrier mark entry point corresponding
+        // to register `root`. If `temp` is null, it means that `GetIsGcMarking()`
+        // is false, and vice versa.
+        //
+        //     GcRoot<mirror::Object> root = *(obj+offset);  // Original reference load.
+        //     temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+        //     if (temp != null) {
+        //       root = temp(root)
+        //     }
 
-      // Slow path marking the GC root `root`.
-      Location temp = Location::RegisterLocation(T9);
-      SlowPathCodeMIPS64* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS64(
-              instruction,
-              root,
-              /*entrypoint*/ temp);
-      codegen_->AddSlowPath(slow_path);
+        if (label_low != nullptr) {
+          __ Bind(label_low);
+        }
+        // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+        __ LoadFromOffset(kLoadUnsignedWord, root_reg, obj, offset);
+        static_assert(
+            sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
+            "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
+            "have different sizes.");
+        static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
+                      "art::mirror::CompressedReference<mirror::Object> and int32_t "
+                      "have different sizes.");
 
-      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-      const int32_t entry_point_offset =
-          Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(root.reg() - 1);
-      // Loading the entrypoint does not require a load acquire since it is only changed when
-      // threads are suspended or running a checkpoint.
-      __ LoadFromOffset(kLoadDoubleword, temp.AsRegister<GpuRegister>(), TR, entry_point_offset);
-      // The entrypoint is null when the GC is not marking, this prevents one load compared to
-      // checking GetIsGcMarking.
-      __ Bnezc(temp.AsRegister<GpuRegister>(), slow_path->GetEntryLabel());
-      __ Bind(slow_path->GetExitLabel());
+        // Slow path marking the GC root `root`.
+        Location temp = Location::RegisterLocation(T9);
+        SlowPathCodeMIPS64* slow_path =
+            new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathMIPS64(
+                instruction,
+                root,
+                /*entrypoint*/ temp);
+        codegen_->AddSlowPath(slow_path);
+
+        const int32_t entry_point_offset =
+            Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(root.reg() - 1);
+        // Loading the entrypoint does not require a load acquire since it is only changed when
+        // threads are suspended or running a checkpoint.
+        __ LoadFromOffset(kLoadDoubleword, temp.AsRegister<GpuRegister>(), TR, entry_point_offset);
+        __ Bnezc(temp.AsRegister<GpuRegister>(), slow_path->GetEntryLabel());
+        __ Bind(slow_path->GetExitLabel());
+      }
     } else {
+      if (label_low != nullptr) {
+        __ Bind(label_low);
+      }
       // GC root loaded through a slow path for read barriers other
       // than Baker's.
       // /* GcRoot<mirror::Object>* */ root = obj + offset
@@ -4438,6 +4554,9 @@
       codegen_->GenerateReadBarrierForRootSlow(instruction, root, root);
     }
   } else {
+    if (label_low != nullptr) {
+      __ Bind(label_low);
+    }
     // Plain GC root load with no read barrier.
     // /* GcRoot<mirror::Object> */ root = *(obj + offset)
     __ LoadFromOffset(kLoadUnsignedWord, root_reg, obj, offset);
@@ -4455,6 +4574,71 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
+  if (kBakerReadBarrierThunksEnableForFields) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (T9) the read barrier mark introspection entrypoint.
+    // If `temp` is null, it means that `GetIsGcMarking()` is false, and
+    // vice versa.
+    //
+    // We use thunks for the slow path. That thunk checks the reference
+    // and jumps to the entrypoint if needed. If the holder is not gray,
+    // it issues a load-load memory barrier and returns to the original
+    // reference load.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkReg00
+    //     // AKA &art_quick_read_barrier_mark_introspection.
+    //     if (temp != nullptr) {
+    //        temp = &field_array_thunk<holder_reg>
+    //        temp()
+    //     }
+    //   not_gray_return_address:
+    //     // If the offset is too large to fit into the lw instruction, we
+    //     // use an adjusted base register (TMP) here. This register
+    //     // receives bits 16 ... 31 of the offset before the thunk invocation
+    //     // and the thunk benefits from it.
+    //     HeapReference<mirror::Object> reference = *(obj+offset);  // Original reference load.
+    //   gray_return_address:
+
+    DCHECK(temp.IsInvalid());
+    bool short_offset = IsInt<16>(static_cast<int32_t>(offset));
+    const int32_t entry_point_offset =
+        Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(0);
+    // If the field offset is smaller than the page size, there may or may not have been
+    // an implicit null check earlier.
+    // If this is actually a load from an array, a null check must have been done already.
+    // The thunk performs an explicit null check anyway, as doing so is easier than not.
+    if (instruction->IsArrayGet()) {
+      DCHECK(!needs_null_check);
+    }
+    const int thunk_disp = GetBakerMarkFieldArrayThunkDisplacement(obj, short_offset);
+    // Loading the entrypoint does not require a load acquire since it is only changed when
+    // threads are suspended or running a checkpoint.
+    __ LoadFromOffset(kLoadDoubleword, T9, TR, entry_point_offset);
+    GpuRegister ref_reg = ref.AsRegister<GpuRegister>();
+    if (short_offset) {
+      __ Beqzc(T9, 2);  // Skip jialc.
+      __ Nop();  // In forbidden slot.
+      __ Jialc(T9, thunk_disp);
+      // /* HeapReference<Object> */ ref = *(obj + offset)
+      __ LoadFromOffset(kLoadUnsignedWord, ref_reg, obj, offset);  // Single instruction.
+    } else {
+      int16_t offset_low = Low16Bits(offset);
+      int16_t offset_high = High16Bits(offset - offset_low);  // Accounts for sign extension in lwu.
+      __ Beqz(T9, 2);  // Skip jialc.
+      __ Daui(TMP, obj, offset_high);  // In delay slot.
+      __ Jialc(T9, thunk_disp);
+      // /* HeapReference<Object> */ ref = *(obj + offset)
+      __ LoadFromOffset(kLoadUnsignedWord, ref_reg, TMP, offset_low);  // Single instruction.
+    }
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    __ MaybeUnpoisonHeapReference(ref_reg);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Location no_index = Location::NoLocation();
   ScaleFactor no_scale_factor = TIMES_1;
@@ -4481,9 +4665,57 @@
   static_assert(
       sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
       "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  ScaleFactor scale_factor = TIMES_4;
+
+  if (kBakerReadBarrierThunksEnableForArrays) {
+    // Note that we do not actually check the value of `GetIsGcMarking()`
+    // to decide whether to mark the loaded reference or not.  Instead, we
+    // load into `temp` (T9) the read barrier mark introspection entrypoint.
+    // If `temp` is null, it means that `GetIsGcMarking()` is false, and
+    // vice versa.
+    //
+    // We use thunks for the slow path. That thunk checks the reference
+    // and jumps to the entrypoint if needed. If the holder is not gray,
+    // it issues a load-load memory barrier and returns to the original
+    // reference load.
+    //
+    //     temp = Thread::Current()->pReadBarrierMarkReg00
+    //     // AKA &art_quick_read_barrier_mark_introspection.
+    //     if (temp != nullptr) {
+    //        temp = &field_array_thunk<holder_reg>
+    //        temp()
+    //     }
+    //   not_gray_return_address:
+    //     // The element address is pre-calculated in the TMP register before the
+    //     // thunk invocation and the thunk benefits from it.
+    //     HeapReference<mirror::Object> reference = data[index];  // Original reference load.
+    //   gray_return_address:
+
+    DCHECK(temp.IsInvalid());
+    DCHECK(index.IsValid());
+    const int32_t entry_point_offset =
+        Thread::ReadBarrierMarkEntryPointsOffset<kMips64PointerSize>(0);
+    // We will not do the explicit null check in the thunk as some form of a null check
+    // must've been done earlier.
+    DCHECK(!needs_null_check);
+    const int thunk_disp = GetBakerMarkFieldArrayThunkDisplacement(obj, /* short_offset */ false);
+    // Loading the entrypoint does not require a load acquire since it is only changed when
+    // threads are suspended or running a checkpoint.
+    __ LoadFromOffset(kLoadDoubleword, T9, TR, entry_point_offset);
+    __ Beqz(T9, 2);  // Skip jialc.
+    GpuRegister ref_reg = ref.AsRegister<GpuRegister>();
+    GpuRegister index_reg = index.AsRegister<GpuRegister>();
+    __ Dlsa(TMP, index_reg, obj, scale_factor);  // In delay slot.
+    __ Jialc(T9, thunk_disp);
+    // /* HeapReference<Object> */ ref = *(obj + data_offset + (index << scale_factor))
+    DCHECK(IsInt<16>(static_cast<int32_t>(data_offset))) << data_offset;
+    __ LoadFromOffset(kLoadUnsignedWord, ref_reg, TMP, data_offset);  // Single instruction.
+    __ MaybeUnpoisonHeapReference(ref_reg);
+    return;
+  }
+
   // /* HeapReference<Object> */ ref =
   //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
-  ScaleFactor scale_factor = TIMES_4;
   GenerateReferenceLoadWithBakerReadBarrier(instruction,
                                             ref,
                                             obj,
@@ -5278,8 +5510,13 @@
       GpuRegister temp = non_baker_read_barrier
           ? out
           : locations->GetTemp(0).AsRegister<GpuRegister>();
-      codegen_->EmitPcRelativeAddressPlaceholderHigh(bss_info_high, temp, info_low);
-      GenerateGcRootFieldLoad(cls, out_loc, temp, /* placeholder */ 0x5678, read_barrier_option);
+      codegen_->EmitPcRelativeAddressPlaceholderHigh(bss_info_high, temp);
+      GenerateGcRootFieldLoad(cls,
+                              out_loc,
+                              temp,
+                              /* placeholder */ 0x5678,
+                              read_barrier_option,
+                              &info_low->label);
       generate_null_check = true;
       break;
     }
@@ -5399,12 +5636,13 @@
       GpuRegister temp = non_baker_read_barrier
           ? out
           : locations->GetTemp(0).AsRegister<GpuRegister>();
-      codegen_->EmitPcRelativeAddressPlaceholderHigh(info_high, temp, info_low);
+      codegen_->EmitPcRelativeAddressPlaceholderHigh(info_high, temp);
       GenerateGcRootFieldLoad(load,
                               out_loc,
                               temp,
                               /* placeholder */ 0x5678,
-                              kCompilerReadBarrierOption);
+                              kCompilerReadBarrierOption,
+                              &info_low->label);
       SlowPathCodeMIPS64* slow_path =
           new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load, info_high);
       codegen_->AddSlowPath(slow_path);
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index c94cc93..d03a9ea 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -281,7 +281,8 @@
                                Location root,
                                GpuRegister obj,
                                uint32_t offset,
-                               ReadBarrierOption read_barrier_option);
+                               ReadBarrierOption read_barrier_option,
+                               Mips64Label* label_low = nullptr);
 
   void GenerateTestAndBranch(HInstruction* instruction,
                              size_t condition_input_index,
@@ -592,7 +593,7 @@
 
   void EmitPcRelativeAddressPlaceholderHigh(PcRelativePatchInfo* info_high,
                                             GpuRegister out,
-                                            PcRelativePatchInfo* info_low);
+                                            PcRelativePatchInfo* info_low = nullptr);
 
   void PatchJitRootUse(uint8_t* code,
                        const uint8_t* roots_data,
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index c581f1c..24e3450 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -828,6 +828,22 @@
   DsFsmInstrRrr(EmitI(0xf, rs, rt, imm16), rt, rt, rs);
 }
 
+// Compute rt = rs + (imm16 << 16): a single AUI on R6; on R2 it expands to
+// LUI+ADDU, and `tmp` is used (and must differ from `rs`) only when rs == rt.
+void MipsAssembler::AddUpper(Register rt, Register rs, uint16_t imm16, Register tmp) {
+  bool increment = (rs == rt);
+  if (increment) {
+    CHECK_NE(rs, tmp);
+  }
+  if (IsR6()) {
+    Aui(rt, rs, imm16);
+  } else if (increment) {
+    Lui(tmp, imm16);
+    Addu(rt, rs, tmp);
+  } else {
+    Lui(rt, imm16);
+    Addu(rt, rs, rt);
+  }
+}
+
 void MipsAssembler::Sync(uint32_t stype) {
   DsFsmInstrNop(EmitR(0, ZERO, ZERO, ZERO, stype & 0x1f, 0xf));
 }
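Because the load that follows sign-extends its 16-bit immediate, the high half passed
to AddUpper must be carry-adjusted when splitting a 32-bit offset. A hedged usage
sketch (registers, variables, and the splitting arithmetic are illustrative):

    // Split `offset` so that (ofs_hi << 16) + sign_extend(ofs_lo) == offset.
    uint16_t ofs_lo = static_cast<uint16_t>(offset);
    uint16_t ofs_hi = static_cast<uint16_t>((offset + 0x8000) >> 16);  // Carry-adjust.
    __ AddUpper(TMP, obj, ofs_hi);  // TMP = obj + (ofs_hi << 16); AT used on R2 if TMP == obj.
    __ LoadFromOffset(kLoadWord, out, TMP, static_cast<int16_t>(ofs_lo));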
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 33803bb..e42bb3f 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -280,6 +280,7 @@
   void Lwpc(Register rs, uint32_t imm19);  // R6
   void Lui(Register rt, uint16_t imm16);
   void Aui(Register rt, Register rs, uint16_t imm16);  // R6
+  void AddUpper(Register rt, Register rs, uint16_t imm16, Register tmp = AT);
   void Sync(uint32_t stype);
   void Mfhi(Register rd);  // R2
   void Mflo(Register rd);  // R2
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index 24900a7..9039854 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -795,6 +795,10 @@
   EmitFI(0x11, 0xD, ft, imm16);
 }
 
+// R2-encoded BEQZ (BEQ against $zero, opcode 0x4). Unlike the compact BEQZC, it
+// has a delay slot, which the Baker thunk callers fill with a useful instruction
+// that must execute on both paths (e.g. the Dlsa computing the element address).
+void Mips64Assembler::Beqz(GpuRegister rt, uint16_t imm16) {
+  EmitI(0x4, ZERO, rt, imm16);
+}
+
 void Mips64Assembler::EmitBcondc(BranchCondition cond,
                                  GpuRegister rs,
                                  GpuRegister rt,
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 773db9b..5e88033 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -563,6 +563,7 @@
   void Bnezc(GpuRegister rs, uint32_t imm21);
   void Bc1eqz(FpuRegister ft, uint16_t imm16);
   void Bc1nez(FpuRegister ft, uint16_t imm16);
+  void Beqz(GpuRegister rt, uint16_t imm16);
 
   void AddS(FpuRegister fd, FpuRegister fs, FpuRegister ft);
   void SubS(FpuRegister fd, FpuRegister fs, FpuRegister ft);
diff --git a/runtime/arch/arch_test.cc b/runtime/arch/arch_test.cc
index 838ae40..dd98f51 100644
--- a/runtime/arch/arch_test.cc
+++ b/runtime/arch/arch_test.cc
@@ -129,6 +129,10 @@
 #undef FRAME_SIZE_SAVE_REFS_AND_ARGS
 static constexpr size_t kFrameSizeSaveEverything = FRAME_SIZE_SAVE_EVERYTHING;
 #undef FRAME_SIZE_SAVE_EVERYTHING
+#undef BAKER_MARK_INTROSPECTION_REGISTER_COUNT
+#undef BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRY_SIZE
 }  // namespace mips
 
 namespace mips64 {
@@ -141,6 +145,10 @@
 #undef FRAME_SIZE_SAVE_REFS_AND_ARGS
 static constexpr size_t kFrameSizeSaveEverything = FRAME_SIZE_SAVE_EVERYTHING;
 #undef FRAME_SIZE_SAVE_EVERYTHING
+#undef BAKER_MARK_INTROSPECTION_REGISTER_COUNT
+#undef BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET
+#undef BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRY_SIZE
 }  // namespace mips64
 
 namespace x86 {
diff --git a/runtime/arch/mips/asm_support_mips.S b/runtime/arch/mips/asm_support_mips.S
index 948b06c..50095ae 100644
--- a/runtime/arch/mips/asm_support_mips.S
+++ b/runtime/arch/mips/asm_support_mips.S
@@ -127,6 +127,13 @@
 #endif  // USE_HEAP_POISONING
 .endm
 
+// Byte size of the instructions (un)poisoning heap references.
+#ifdef USE_HEAP_POISONING
+#define HEAP_POISON_INSTR_SIZE 4
+#else
+#define HEAP_POISON_INSTR_SIZE 0
+#endif  // USE_HEAP_POISONING
+
 // Based on contents of creg select the minimum integer
 // At the end of the macro the original value of creg is lost
 .macro MINint dreg,rreg,sreg,creg
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 7437774..9d8572f 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -24,4 +24,24 @@
 #define FRAME_SIZE_SAVE_REFS_AND_ARGS 112
 #define FRAME_SIZE_SAVE_EVERYTHING 256
 
+// &art_quick_read_barrier_mark_introspection is the first of many entry points:
+//   21 entry points for long field offsets, large array indices and variable array indices
+//     (see macro BRB_FIELD_LONG_OFFSET_ENTRY)
+//   21 entry points for short field offsets and small array indices
+//     (see macro BRB_FIELD_SHORT_OFFSET_ENTRY)
+//   21 entry points for GC roots
+//     (see macro BRB_GC_ROOT_ENTRY)
+
+// There are as many entry points of each kind as there are registers that
+// can hold a reference: V0-V1, A0-A3, T0-T7, S2-S8.
+#define BAKER_MARK_INTROSPECTION_REGISTER_COUNT 21
+
+#define BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE (8 * 4)  // 8 instructions in
+                                                                 // BRB_FIELD_*_OFFSET_ENTRY.
+
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET \
+    (2 * BAKER_MARK_INTROSPECTION_REGISTER_COUNT * BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE)
+
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRY_SIZE (4 * 4)  // 4 instructions in BRB_GC_ROOT_ENTRY.
+
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_H_
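With these values the MIPS32 layout is fully determined: the GC root entries begin
2 * 21 * (8 * 4) = 1344 bytes past art_quick_read_barrier_mark_introspection, and the
21 GC root entries occupy a further 21 * (4 * 4) = 336 bytes. UpdateReadBarrierEntrypoints()
below DCHECKs exactly these two distances against the assembled code.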
diff --git a/runtime/arch/mips/entrypoints_init_mips.cc b/runtime/arch/mips/entrypoints_init_mips.cc
index 9978da5..3010246 100644
--- a/runtime/arch/mips/entrypoints_init_mips.cc
+++ b/runtime/arch/mips/entrypoints_init_mips.cc
@@ -16,6 +16,7 @@
 
 #include <string.h>
 
+#include "arch/mips/asm_support_mips.h"
 #include "atomic.h"
 #include "entrypoints/jni/jni_entrypoints.h"
 #include "entrypoints/quick/quick_alloc_entrypoints.h"
@@ -59,6 +60,10 @@
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg22(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg29(mirror::Object*);
 
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots(mirror::Object*);
+extern "C" void art_quick_read_barrier_mark_introspection_end_of_entries(void);
+
 // Math entrypoints.
 extern int32_t CmpgDouble(double a, double b);
 extern int32_t CmplDouble(double a, double b);
@@ -87,6 +92,23 @@
 extern "C" int64_t __moddi3(int64_t, int64_t);
 
 void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
+  intptr_t introspection_field_array_entries_size =
+      reinterpret_cast<intptr_t>(&art_quick_read_barrier_mark_introspection_gc_roots) -
+      reinterpret_cast<intptr_t>(&art_quick_read_barrier_mark_introspection);
+  static_assert(
+      BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET == 2 *
+          BAKER_MARK_INTROSPECTION_REGISTER_COUNT * BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE,
+      "Expecting equal");
+  DCHECK_EQ(introspection_field_array_entries_size,
+            BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET);
+  intptr_t introspection_gc_root_entries_size =
+      reinterpret_cast<intptr_t>(&art_quick_read_barrier_mark_introspection_end_of_entries) -
+      reinterpret_cast<intptr_t>(&art_quick_read_barrier_mark_introspection_gc_roots);
+  DCHECK_EQ(introspection_gc_root_entries_size,
+            BAKER_MARK_INTROSPECTION_REGISTER_COUNT * BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRY_SIZE);
+  qpoints->pReadBarrierMarkReg00 = is_active ? art_quick_read_barrier_mark_introspection : nullptr;
+  static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg00),
+                "Non-direct C stub marked direct.");
   qpoints->pReadBarrierMarkReg01 = is_active ? art_quick_read_barrier_mark_reg01 : nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg01),
                 "Non-direct C stub marked direct.");
@@ -416,9 +438,6 @@
   // Cannot use the following registers to pass arguments:
   // 0(ZERO), 1(AT), 16(S0), 17(S1), 24(T8), 25(T9), 26(K0), 27(K1), 28(GP), 29(SP), 31(RA).
   // Note that there are 30 entry points only: 00 for register 1(AT), ..., 29 for register 30(S8).
-  qpoints->pReadBarrierMarkReg00 = nullptr;
-  static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg00),
-                "Non-direct C stub marked direct.");
   qpoints->pReadBarrierMarkReg15 = nullptr;
   static_assert(!IsDirectEntrypoint(kQuickReadBarrierMarkReg15),
                 "Non-direct C stub marked direct.");
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 00e3d67..d9abaa0 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -2721,6 +2721,385 @@
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, $s8
 // RA (register 31) is reserved.
 
+// Caller code:
+// Short constant offset/index:
+// R2:                           | R6:
+//  lw      $t9, pReadBarrierMarkReg00
+//  beqz    $t9, skip_call       |  beqzc   $t9, skip_call
+//  addiu   $t9, $t9, thunk_disp |  nop
+//  jalr    $t9                  |  jialc   $t9, thunk_disp
+//  nop                          |
+// skip_call:                    | skip_call:
+//  lw      `out`, ofs(`obj`)    |  lw      `out`, ofs(`obj`)
+// [subu    `out`, $zero, `out`] | [subu    `out`, $zero, `out`]  # Unpoison reference.
+.macro BRB_FIELD_SHORT_OFFSET_ENTRY obj
+1:
+    # Explicit null check. May be redundant (for array elements or when the field
+    # offset is larger than the page size, 4KB).
+    # $ra will be adjusted to point to lw's stack map when throwing NPE.
+    beqz    \obj, .Lintrospection_throw_npe
+#if defined(_MIPS_ARCH_MIPS32R6)
+    lapc    $gp, .Lintrospection_exits                  # $gp = address of .Lintrospection_exits.
+#else
+    addiu   $gp, $t9, (.Lintrospection_exits - 1b)      # $gp = address of .Lintrospection_exits.
+#endif
+    .set push
+    .set noat
+    lw      $at, MIRROR_OBJECT_LOCK_WORD_OFFSET(\obj)
+    sll     $at, $at, 31 - LOCK_WORD_READ_BARRIER_STATE_SHIFT   # Move barrier state bit
+                                                                # to sign bit.
+    bltz    $at, .Lintrospection_field_array            # If gray, load reference, mark.
+    move    $t8, \obj                                   # Move `obj` to $t8 for common code.
+    .set pop
+    jalr    $zero, $ra                                  # Otherwise, load-load barrier and return.
+    sync
+.endm
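The sll/bltz pair above is a branch-free test of a single lock word bit: shift the
read barrier state bit into the sign position, then branch on the sign. The same test
in C++ (a sketch; assumes the LOCK_WORD constant is visible):

    #include <cstdint>
    // True when the object's read barrier state is "gray" (reference must be marked).
    static inline bool IsGray(uint32_t lock_word) {
      return static_cast<int32_t>(lock_word << (31 - LOCK_WORD_READ_BARRIER_STATE_SHIFT)) < 0;
    }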
+
+// Caller code (R2):
+// Long constant offset/index:   | Variable index:
+//  lw      $t9, pReadBarrierMarkReg00
+//  lui     $t8, ofs_hi          |  sll     $t8, `index`, 2
+//  beqz    $t9, skip_call       |  beqz    $t9, skip_call
+//  addiu   $t9, $t9, thunk_disp |  addiu   $t9, $t9, thunk_disp
+//  jalr    $t9                  |  jalr    $t9
+// skip_call:                    | skip_call:
+//  addu    $t8, $t8, `obj`      |  addu    $t8, $t8, `obj`
+//  lw      `out`, ofs_lo($t8)   |  lw      `out`, ofs($t8)
+// [subu    `out`, $zero, `out`] | [subu    `out`, $zero, `out`]  # Unpoison reference.
+//
+// Caller code (R6):
+// Long constant offset/index:   | Variable index:
+//  lw      $t9, pReadBarrierMarkReg00
+//  beqz    $t9, skip_call       |  beqz    $t9, skip_call
+//  aui     $t8, `obj`, ofs_hi   |  lsa     $t8, `index`, `obj`, 2
+//  jialc   $t9, thunk_disp      |  jialc   $t9, thunk_disp
+// skip_call:                    | skip_call:
+//  lw      `out`, ofs_lo($t8)   |  lw      `out`, ofs($t8)
+// [subu    `out`, $zero, `out`] | [subu    `out`, $zero, `out`]  # Unpoison reference.
+.macro BRB_FIELD_LONG_OFFSET_ENTRY obj
+1:
+    # No explicit null check for variable indices or large constant indices/offsets
+    # as it must have been done earlier.
+#if defined(_MIPS_ARCH_MIPS32R6)
+    lapc    $gp, .Lintrospection_exits                  # $gp = address of .Lintrospection_exits.
+#else
+    addiu   $gp, $t9, (.Lintrospection_exits - 1b)      # $gp = address of .Lintrospection_exits.
+#endif
+    .set push
+    .set noat
+    lw      $at, MIRROR_OBJECT_LOCK_WORD_OFFSET(\obj)
+    sll     $at, $at, 31 - LOCK_WORD_READ_BARRIER_STATE_SHIFT   # Move barrier state bit
+                                                                # to sign bit.
+    bltz    $at, .Lintrospection_field_array            # If gray, load reference, mark.
+    nop
+    .set pop
+    jalr    $zero, $ra                                  # Otherwise, load-load barrier and return.
+    sync
+    break                                               # Padding to 8 instructions.
+.endm
+
+.macro BRB_GC_ROOT_ENTRY root
+1:
+#if defined(_MIPS_ARCH_MIPS32R6)
+    lapc    $gp, .Lintrospection_exit_\root             # $gp = exit point address.
+#else
+    addiu   $gp, $t9, (.Lintrospection_exit_\root - 1b)  # $gp = exit point address.
+#endif
+    bnez    \root, .Lintrospection_common
+    move    $t8, \root                                  # Move reference to $t8 for common code.
+    jalr    $zero, $ra                                  # Return if null.
+    # The next instruction (from the following BRB_GC_ROOT_ENTRY) fills the delay slot.
+    # This instruction has no effect (actual NOP for the last entry; otherwise changes $gp,
+    # which is unused after that anyway).
+.endm
+
+.macro BRB_FIELD_EXIT out
+.Lintrospection_exit_\out:
+    jalr    $zero, $ra
+    move    \out, $t8                                   # Return reference in expected register.
+.endm
+
+.macro BRB_FIELD_EXIT_BREAK
+    break
+    break
+.endm
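The exit-table dispatch in .Lintrospection_field_array below exploits the MIPS I-type
encoding: the destination register of the `lw` occupies bits 16-20, which on
little-endian is the low 5 bits of the instruction's third byte, hence the `lb` plus
`andi 31`. The equivalent decode in C++ (an illustrative helper, not part of this
change):

    #include <cstdint>
    // Recover `out` from the caller's load instruction and index the 8-byte exit stubs.
    static inline uintptr_t ExitPointFor(uint32_t load_instruction, uintptr_t exits_base) {
      uint32_t out_reg = (load_instruction >> 16) & 31;  // rt field of the I-type load.
      return exits_base + out_reg * 8;  // BRB_FIELD_EXIT/BRB_FIELD_EXIT_BREAK: 8 bytes each.
    }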
+
+ENTRY_NO_GP art_quick_read_barrier_mark_introspection
+    # Entry points for offsets/indices not fitting into int16_t and for variable indices.
+    BRB_FIELD_LONG_OFFSET_ENTRY $v0
+    BRB_FIELD_LONG_OFFSET_ENTRY $v1
+    BRB_FIELD_LONG_OFFSET_ENTRY $a0
+    BRB_FIELD_LONG_OFFSET_ENTRY $a1
+    BRB_FIELD_LONG_OFFSET_ENTRY $a2
+    BRB_FIELD_LONG_OFFSET_ENTRY $a3
+    BRB_FIELD_LONG_OFFSET_ENTRY $t0
+    BRB_FIELD_LONG_OFFSET_ENTRY $t1
+    BRB_FIELD_LONG_OFFSET_ENTRY $t2
+    BRB_FIELD_LONG_OFFSET_ENTRY $t3
+    BRB_FIELD_LONG_OFFSET_ENTRY $t4
+    BRB_FIELD_LONG_OFFSET_ENTRY $t5
+    BRB_FIELD_LONG_OFFSET_ENTRY $t6
+    BRB_FIELD_LONG_OFFSET_ENTRY $t7
+    BRB_FIELD_LONG_OFFSET_ENTRY $s2
+    BRB_FIELD_LONG_OFFSET_ENTRY $s3
+    BRB_FIELD_LONG_OFFSET_ENTRY $s4
+    BRB_FIELD_LONG_OFFSET_ENTRY $s5
+    BRB_FIELD_LONG_OFFSET_ENTRY $s6
+    BRB_FIELD_LONG_OFFSET_ENTRY $s7
+    BRB_FIELD_LONG_OFFSET_ENTRY $s8
+
+    # Entry points for offsets/indices fitting into int16_t.
+    BRB_FIELD_SHORT_OFFSET_ENTRY $v0
+    BRB_FIELD_SHORT_OFFSET_ENTRY $v1
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a0
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a1
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a2
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a3
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t0
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t1
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t2
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t3
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t4
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t5
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t6
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t7
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s2
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s3
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s4
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s5
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s6
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s7
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s8
+
+    .global art_quick_read_barrier_mark_introspection_gc_roots
+art_quick_read_barrier_mark_introspection_gc_roots:
+    # Entry points for GC roots.
+    BRB_GC_ROOT_ENTRY $v0
+    BRB_GC_ROOT_ENTRY $v1
+    BRB_GC_ROOT_ENTRY $a0
+    BRB_GC_ROOT_ENTRY $a1
+    BRB_GC_ROOT_ENTRY $a2
+    BRB_GC_ROOT_ENTRY $a3
+    BRB_GC_ROOT_ENTRY $t0
+    BRB_GC_ROOT_ENTRY $t1
+    BRB_GC_ROOT_ENTRY $t2
+    BRB_GC_ROOT_ENTRY $t3
+    BRB_GC_ROOT_ENTRY $t4
+    BRB_GC_ROOT_ENTRY $t5
+    BRB_GC_ROOT_ENTRY $t6
+    BRB_GC_ROOT_ENTRY $t7
+    BRB_GC_ROOT_ENTRY $s2
+    BRB_GC_ROOT_ENTRY $s3
+    BRB_GC_ROOT_ENTRY $s4
+    BRB_GC_ROOT_ENTRY $s5
+    BRB_GC_ROOT_ENTRY $s6
+    BRB_GC_ROOT_ENTRY $s7
+    BRB_GC_ROOT_ENTRY $s8
+    .global art_quick_read_barrier_mark_introspection_end_of_entries
+art_quick_read_barrier_mark_introspection_end_of_entries:
+    nop                         # Fill the delay slot of the last BRB_GC_ROOT_ENTRY.
+
+.Lintrospection_throw_npe:
+    b       art_quick_throw_null_pointer_exception
+    addiu   $ra, $ra, 4         # Skip lw, make $ra point to lw's stack map.
+
+    .set push
+    .set noat
+
+    // Fields and array elements.
+
+.Lintrospection_field_array:
+    // Get the field/element address using $t8 and the offset from the lw instruction.
+    lh      $at, 0($ra)         # $ra points to lw: $at = low 16 bits of field/element offset.
+    addiu   $ra, $ra, 4 + HEAP_POISON_INSTR_SIZE  # Skip lw(+subu).
+    addu    $t8, $t8, $at       # $t8 = field/element address.
+
+    // Calculate the address of the exit point, store it in $gp and load the reference into $t8.
+    lb      $at, (-HEAP_POISON_INSTR_SIZE - 2)($ra)   # $ra-HEAP_POISON_INSTR_SIZE-4 points to
+                                                      # "lw `out`, ...".
+    andi    $at, $at, 31        # Extract `out` from lw.
+    sll     $at, $at, 3         # Multiply `out` by the exit point size (BRB_FIELD_EXIT* macros).
+
+    lw      $t8, 0($t8)         # $t8 = reference.
+    UNPOISON_HEAP_REF $t8
+
+    // Return if null reference.
+    bnez    $t8, .Lintrospection_common
+    addu    $gp, $gp, $at       # $gp = address of the exit point.
+
+    // Early return through the exit point.
+.Lintrospection_return_early:
+    jalr    $zero, $gp          # Move $t8 to `out` and return.
+    nop
+
+    // Code common for GC roots, fields and array elements.
+
+.Lintrospection_common:
+    // Check lock word for mark bit, if marked return.
+    lw      $t9, MIRROR_OBJECT_LOCK_WORD_OFFSET($t8)
+    sll     $at, $t9, 31 - LOCK_WORD_MARK_BIT_SHIFT     # Move mark bit to sign bit.
+    bltz    $at, .Lintrospection_return_early
+#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3)
+    // The below code depends on the lock word state being in the highest bits
+    // and the "forwarding address" state having all bits set.
+#error "Unexpected lock word state shift or forwarding address state value."
+#endif
+    // Test that both forwarding address state bits are 1.
+    sll     $at, $t9, 1
+    and     $at, $at, $t9                               # Sign bit = 1 IFF both bits are 1.
+    bgez    $at, .Lintrospection_mark
+    nop
+
+    .set pop
+
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    jalr    $zero, $gp          # Move $t8 to `out` and return.
+    sll     $t8, $t9, LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+
+.Lintrospection_mark:
+    // Partially set up the stack frame preserving only $ra.
+    addiu   $sp, $sp, -160      # Includes 16 bytes of space for argument registers $a0-$a3.
+    .cfi_adjust_cfa_offset 160
+    sw      $ra, 156($sp)
+    .cfi_rel_offset 31, 156
+
+    // Set up $gp, clobbering $ra and using the branch delay slot for a useful instruction.
+    bal     1f
+    sw      $gp, 152($sp)       # Preserve the exit point address.
+1:
+    .cpload $ra
+
+    // Finalize the stack frame and call.
+    sw      $t7, 148($sp)
+    .cfi_rel_offset 15, 148
+    sw      $t6, 144($sp)
+    .cfi_rel_offset 14, 144
+    sw      $t5, 140($sp)
+    .cfi_rel_offset 13, 140
+    sw      $t4, 136($sp)
+    .cfi_rel_offset 12, 136
+    sw      $t3, 132($sp)
+    .cfi_rel_offset 11, 132
+    sw      $t2, 128($sp)
+    .cfi_rel_offset 10, 128
+    sw      $t1, 124($sp)
+    .cfi_rel_offset 9, 124
+    sw      $t0, 120($sp)
+    .cfi_rel_offset 8, 120
+    sw      $a3, 116($sp)
+    .cfi_rel_offset 7, 116
+    sw      $a2, 112($sp)
+    .cfi_rel_offset 6, 112
+    sw      $a1, 108($sp)
+    .cfi_rel_offset 5, 108
+    sw      $a0, 104($sp)
+    .cfi_rel_offset 4, 104
+    sw      $v1, 100($sp)
+    .cfi_rel_offset 3, 100
+    sw      $v0, 96($sp)
+    .cfi_rel_offset 2, 96
+
+    la      $t9, artReadBarrierMark
+
+    sdc1    $f18, 88($sp)
+    sdc1    $f16, 80($sp)
+    sdc1    $f14, 72($sp)
+    sdc1    $f12, 64($sp)
+    sdc1    $f10, 56($sp)
+    sdc1    $f8,  48($sp)
+    sdc1    $f6,  40($sp)
+    sdc1    $f4,  32($sp)
+    sdc1    $f2,  24($sp)
+    sdc1    $f0,  16($sp)
+
+    jalr    $t9                 # $v0 <- artReadBarrierMark(reference)
+    move    $a0, $t8            # Pass reference in $a0.
+    move    $t8, $v0
+
+    lw      $ra, 156($sp)
+    .cfi_restore 31
+    lw      $gp, 152($sp)       # $gp = address of the exit point.
+    lw      $t7, 148($sp)
+    .cfi_restore 15
+    lw      $t6, 144($sp)
+    .cfi_restore 14
+    lw      $t5, 140($sp)
+    .cfi_restore 13
+    lw      $t4, 136($sp)
+    .cfi_restore 12
+    lw      $t3, 132($sp)
+    .cfi_restore 11
+    lw      $t2, 128($sp)
+    .cfi_restore 10
+    lw      $t1, 124($sp)
+    .cfi_restore 9
+    lw      $t0, 120($sp)
+    .cfi_restore 8
+    lw      $a3, 116($sp)
+    .cfi_restore 7
+    lw      $a2, 112($sp)
+    .cfi_restore 6
+    lw      $a1, 108($sp)
+    .cfi_restore 5
+    lw      $a0, 104($sp)
+    .cfi_restore 4
+    lw      $v1, 100($sp)
+    .cfi_restore 3
+    lw      $v0, 96($sp)
+    .cfi_restore 2
+
+    ldc1    $f18, 88($sp)
+    ldc1    $f16, 80($sp)
+    ldc1    $f14, 72($sp)
+    ldc1    $f12, 64($sp)
+    ldc1    $f10, 56($sp)
+    ldc1    $f8,  48($sp)
+    ldc1    $f6,  40($sp)
+    ldc1    $f4,  32($sp)
+    ldc1    $f2,  24($sp)
+    ldc1    $f0,  16($sp)
+
+    // Return through the exit point.
+    jalr    $zero, $gp          # Move $t8 to `out` and return.
+    addiu   $sp, $sp, 160
+    .cfi_adjust_cfa_offset -160
+
+.Lintrospection_exits:
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT $v0
+    BRB_FIELD_EXIT $v1
+    BRB_FIELD_EXIT $a0
+    BRB_FIELD_EXIT $a1
+    BRB_FIELD_EXIT $a2
+    BRB_FIELD_EXIT $a3
+    BRB_FIELD_EXIT $t0
+    BRB_FIELD_EXIT $t1
+    BRB_FIELD_EXIT $t2
+    BRB_FIELD_EXIT $t3
+    BRB_FIELD_EXIT $t4
+    BRB_FIELD_EXIT $t5
+    BRB_FIELD_EXIT $t6
+    BRB_FIELD_EXIT $t7
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT $s2
+    BRB_FIELD_EXIT $s3
+    BRB_FIELD_EXIT $s4
+    BRB_FIELD_EXIT $s5
+    BRB_FIELD_EXIT $s6
+    BRB_FIELD_EXIT $s7
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT $s8
+    BRB_FIELD_EXIT_BREAK
+END art_quick_read_barrier_mark_introspection
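The lock word fast paths above, restated in C++ (a sketch; it relies on the two
properties the #error above guards: the state lives in the top two bits and the
forwarding-address state is 3):

    #include <cstdint>
    static inline bool IsForwardingAddress(uint32_t lock_word) {
      // The sign bit of (lw << 1) & lw is set iff both of the top two bits are set.
      return static_cast<int32_t>((lock_word << 1) & lock_word) < 0;
    }
    static inline uint32_t ForwardingAddress(uint32_t lock_word) {
      // Shifting out the two state bits leaves the (aligned) forwarding address.
      return lock_word << LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT;
    }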
+
 .extern artInvokePolymorphic
 ENTRY art_quick_invoke_polymorphic
     SETUP_SAVE_REFS_AND_ARGS_FRAME
diff --git a/runtime/arch/mips64/asm_support_mips64.S b/runtime/arch/mips64/asm_support_mips64.S
index ef82bd2..a6b249a 100644
--- a/runtime/arch/mips64/asm_support_mips64.S
+++ b/runtime/arch/mips64/asm_support_mips64.S
@@ -83,6 +83,13 @@
 #endif  // USE_HEAP_POISONING
 .endm
 
+// Byte size of the instructions (un)poisoning heap references.
+#ifdef USE_HEAP_POISONING
+#define HEAP_POISON_INSTR_SIZE 8
+#else
+#define HEAP_POISON_INSTR_SIZE 0
+#endif  // USE_HEAP_POISONING
+
 // Based on contents of creg select the minimum integer
 // At the end of the macro the original value of creg is lost
 .macro MINint dreg,rreg,sreg,creg
diff --git a/runtime/arch/mips64/asm_support_mips64.h b/runtime/arch/mips64/asm_support_mips64.h
index 9063d20..7185da5 100644
--- a/runtime/arch/mips64/asm_support_mips64.h
+++ b/runtime/arch/mips64/asm_support_mips64.h
@@ -28,4 +28,24 @@
 // $f0-$f31, $at, $v0-$v1, $a0-$a7, $t0-$t3, $s0-$s7, $t8-$t9, $gp, $s8, $ra + padding + method*
 #define FRAME_SIZE_SAVE_EVERYTHING 496
 
+// &art_quick_read_barrier_mark_introspection is the first of many entry points:
+//   20 entry points for long field offsets, large array indices and variable array indices
+//     (see macro BRB_FIELD_LONG_OFFSET_ENTRY)
+//   20 entry points for short field offsets and small array indices
+//     (see macro BRB_FIELD_SHORT_OFFSET_ENTRY)
+//   20 entry points for GC roots
+//     (see macro BRB_GC_ROOT_ENTRY)
+
+// There are as many entry points of each kind as there are registers that
+// can hold a reference: V0-V1, A0-A7, T0-T2, S2-S8.
+#define BAKER_MARK_INTROSPECTION_REGISTER_COUNT 20
+
+#define BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE (8 * 4)  // 8 instructions in
+                                                                 // BRB_FIELD_*_OFFSET_ENTRY.
+
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET \
+    (2 * BAKER_MARK_INTROSPECTION_REGISTER_COUNT * BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE)
+
+#define BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRY_SIZE (4 * 4)  // 4 instructions in BRB_GC_ROOT_ENTRY.
+
 #endif  // ART_RUNTIME_ARCH_MIPS64_ASM_SUPPORT_MIPS64_H_
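For MIPS64 the corresponding distances are 2 * 20 * (8 * 4) = 1280 bytes from
art_quick_read_barrier_mark_introspection to the GC root entries and 20 * (4 * 4) = 320
bytes of GC root entries; UpdateReadBarrierEntrypoints() below DCHECKs both.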
diff --git a/runtime/arch/mips64/entrypoints_init_mips64.cc b/runtime/arch/mips64/entrypoints_init_mips64.cc
index 007f7b3..5e58827 100644
--- a/runtime/arch/mips64/entrypoints_init_mips64.cc
+++ b/runtime/arch/mips64/entrypoints_init_mips64.cc
@@ -17,6 +17,7 @@
 #include <math.h>
 #include <string.h>
 
+#include "arch/mips64/asm_support_mips64.h"
 #include "atomic.h"
 #include "entrypoints/jni/jni_entrypoints.h"
 #include "entrypoints/quick/quick_alloc_entrypoints.h"
@@ -59,6 +60,10 @@
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg22(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg29(mirror::Object*);
 
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection(mirror::Object*);
+extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots(mirror::Object*);
+extern "C" void art_quick_read_barrier_mark_introspection_end_of_entries(void);
+
 // Math entrypoints.
 extern int32_t CmpgDouble(double a, double b);
 extern int32_t CmplDouble(double a, double b);
@@ -88,6 +93,21 @@
 
 // No read barrier entrypoints for marking registers.
 void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active) {
+  intptr_t introspection_field_array_entries_size =
+      reinterpret_cast<intptr_t>(&art_quick_read_barrier_mark_introspection_gc_roots) -
+      reinterpret_cast<intptr_t>(&art_quick_read_barrier_mark_introspection);
+  static_assert(
+      BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET == 2 *
+          BAKER_MARK_INTROSPECTION_REGISTER_COUNT * BAKER_MARK_INTROSPECTION_FIELD_ARRAY_ENTRY_SIZE,
+      "Expecting equal");
+  DCHECK_EQ(introspection_field_array_entries_size,
+            BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRIES_OFFSET);
+  intptr_t introspection_gc_root_entries_size =
+      reinterpret_cast<intptr_t>(&art_quick_read_barrier_mark_introspection_end_of_entries) -
+      reinterpret_cast<intptr_t>(&art_quick_read_barrier_mark_introspection_gc_roots);
+  DCHECK_EQ(introspection_gc_root_entries_size,
+            BAKER_MARK_INTROSPECTION_REGISTER_COUNT * BAKER_MARK_INTROSPECTION_GC_ROOT_ENTRY_SIZE);
+  qpoints->pReadBarrierMarkReg00 = is_active ? art_quick_read_barrier_mark_introspection : nullptr;
   qpoints->pReadBarrierMarkReg01 = is_active ? art_quick_read_barrier_mark_reg01 : nullptr;
   qpoints->pReadBarrierMarkReg02 = is_active ? art_quick_read_barrier_mark_reg02 : nullptr;
   qpoints->pReadBarrierMarkReg03 = is_active ? art_quick_read_barrier_mark_reg03 : nullptr;
@@ -173,7 +193,6 @@
   // Cannot use the following registers to pass arguments:
   // 0(ZERO), 1(AT), 15(T3), 16(S0), 17(S1), 24(T8), 25(T9), 26(K0), 27(K1), 28(GP), 29(SP), 31(RA).
   // Note that there are 30 entry points only: 00 for register 1(AT), ..., 29 for register 30(S8).
-  qpoints->pReadBarrierMarkReg00 = nullptr;
   qpoints->pReadBarrierMarkReg14 = nullptr;
   qpoints->pReadBarrierMarkReg15 = nullptr;
   qpoints->pReadBarrierMarkReg16 = nullptr;
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index d427fe3..fcbed0e 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -847,7 +847,7 @@
     dla  $t9, artThrowNullPointerExceptionFromSignal
    jalr $zero, $t9                 # artThrowNullPointerExceptionFromSignal(uintptr_t, Thread*)
     move $a1, rSELF                 # pass Thread::Current
-END art_quick_throw_null_pointer_exception
+END art_quick_throw_null_pointer_exception_from_signal
 
     /*
      * Called by managed code to create and deliver an ArithmeticException
@@ -2567,6 +2567,375 @@
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, $s8
 // RA (register 31) is reserved.
 
+// Caller code:
+// Short constant offset/index:
+//  ld      $t9, pReadBarrierMarkReg00
+//  beqzc   $t9, skip_call
+//  nop
+//  jialc   $t9, thunk_disp
+// skip_call:
+//  lwu     `out`, ofs(`obj`)
+// [dsubu   `out`, $zero, `out`
+//  dext    `out`, `out`, 0, 32]  # Unpoison reference.
+.macro BRB_FIELD_SHORT_OFFSET_ENTRY obj
+    # Explicit null check. May be redundant (for array elements or when the field
+    # offset is larger than the page size, 4KB).
+    # $ra will be adjusted to point to lwu's stack map when throwing NPE.
+    beqzc   \obj, .Lintrospection_throw_npe
+    lapc    $t3, .Lintrospection_exits                  # $t3 = address of .Lintrospection_exits.
+    .set push
+    .set noat
+    lw      $at, MIRROR_OBJECT_LOCK_WORD_OFFSET(\obj)
+    sll     $at, $at, 31 - LOCK_WORD_READ_BARRIER_STATE_SHIFT   # Move barrier state bit
+                                                                # to sign bit.
+    bltz    $at, .Lintrospection_field_array            # If gray, load reference, mark.
+    move    $t8, \obj                                   # Move `obj` to $t8 for common code.
+    .set pop
+    jalr    $zero, $ra                                  # Otherwise, load-load barrier and return.
+    sync
+.endm
+
+// Caller code:
+// Long constant offset/index:   | Variable index:
+//  ld      $t9, pReadBarrierMarkReg00
+//  beqz    $t9, skip_call       |  beqz    $t9, skip_call
+//  daui    $t8, `obj`, ofs_hi   |  dlsa    $t8, `index`, `obj`, 2
+//  jialc   $t9, thunk_disp      |  jialc   $t9, thunk_disp
+// skip_call:                    | skip_call:
+//  lwu     `out`, ofs_lo($t8)   |  lwu     `out`, ofs($t8)
+// [dsubu   `out`, $zero, `out`  | [dsubu   `out`, $zero, `out`
+//  dext    `out`, `out`, 0, 32] |  dext    `out`, `out`, 0, 32]  # Unpoison reference.
+.macro BRB_FIELD_LONG_OFFSET_ENTRY obj
+    # No explicit null check for variable indices or large constant indices/offsets
+    # as it must have been done earlier.
+    lapc    $t3, .Lintrospection_exits                  # $t3 = address of .Lintrospection_exits.
+    .set push
+    .set noat
+    lw      $at, MIRROR_OBJECT_LOCK_WORD_OFFSET(\obj)
+    sll     $at, $at, 31 - LOCK_WORD_READ_BARRIER_STATE_SHIFT   # Move barrier state bit
+                                                                # to sign bit.
+    bltzc   $at, .Lintrospection_field_array            # If gray, load reference, mark.
+    .set pop
+    sync                                                # Otherwise, load-load barrier and return.
+    jic     $ra, 0
+    break                                               # Padding to 8 instructions.
+    break
+.endm
+
+.macro BRB_GC_ROOT_ENTRY root
+    lapc    $t3, .Lintrospection_exit_\root             # $t3 = exit point address.
+    bnez    \root, .Lintrospection_common
+    move    $t8, \root                                  # Move reference to $t8 for common code.
+    jic     $ra, 0                                      # Return if null.
+.endm
+
+.macro BRB_FIELD_EXIT out
+.Lintrospection_exit_\out:
+    jalr    $zero, $ra
+    move    \out, $t8                                   # Return reference in expected register.
+.endm
+
+.macro BRB_FIELD_EXIT_BREAK
+    break
+    break
+.endm
+
+ENTRY_NO_GP art_quick_read_barrier_mark_introspection
+    # Entry points for offsets/indices not fitting into int16_t and for variable indices.
+    BRB_FIELD_LONG_OFFSET_ENTRY $v0
+    BRB_FIELD_LONG_OFFSET_ENTRY $v1
+    BRB_FIELD_LONG_OFFSET_ENTRY $a0
+    BRB_FIELD_LONG_OFFSET_ENTRY $a1
+    BRB_FIELD_LONG_OFFSET_ENTRY $a2
+    BRB_FIELD_LONG_OFFSET_ENTRY $a3
+    BRB_FIELD_LONG_OFFSET_ENTRY $a4
+    BRB_FIELD_LONG_OFFSET_ENTRY $a5
+    BRB_FIELD_LONG_OFFSET_ENTRY $a6
+    BRB_FIELD_LONG_OFFSET_ENTRY $a7
+    BRB_FIELD_LONG_OFFSET_ENTRY $t0
+    BRB_FIELD_LONG_OFFSET_ENTRY $t1
+    BRB_FIELD_LONG_OFFSET_ENTRY $t2
+    BRB_FIELD_LONG_OFFSET_ENTRY $s2
+    BRB_FIELD_LONG_OFFSET_ENTRY $s3
+    BRB_FIELD_LONG_OFFSET_ENTRY $s4
+    BRB_FIELD_LONG_OFFSET_ENTRY $s5
+    BRB_FIELD_LONG_OFFSET_ENTRY $s6
+    BRB_FIELD_LONG_OFFSET_ENTRY $s7
+    BRB_FIELD_LONG_OFFSET_ENTRY $s8
+
+    # Entry points for offsets/indices fitting into int16_t.
+    BRB_FIELD_SHORT_OFFSET_ENTRY $v0
+    BRB_FIELD_SHORT_OFFSET_ENTRY $v1
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a0
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a1
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a2
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a3
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a4
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a5
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a6
+    BRB_FIELD_SHORT_OFFSET_ENTRY $a7
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t0
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t1
+    BRB_FIELD_SHORT_OFFSET_ENTRY $t2
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s2
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s3
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s4
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s5
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s6
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s7
+    BRB_FIELD_SHORT_OFFSET_ENTRY $s8
+
+    .global art_quick_read_barrier_mark_introspection_gc_roots
+art_quick_read_barrier_mark_introspection_gc_roots:
+    # Entry points for GC roots.
+    BRB_GC_ROOT_ENTRY $v0
+    BRB_GC_ROOT_ENTRY $v1
+    BRB_GC_ROOT_ENTRY $a0
+    BRB_GC_ROOT_ENTRY $a1
+    BRB_GC_ROOT_ENTRY $a2
+    BRB_GC_ROOT_ENTRY $a3
+    BRB_GC_ROOT_ENTRY $a4
+    BRB_GC_ROOT_ENTRY $a5
+    BRB_GC_ROOT_ENTRY $a6
+    BRB_GC_ROOT_ENTRY $a7
+    BRB_GC_ROOT_ENTRY $t0
+    BRB_GC_ROOT_ENTRY $t1
+    BRB_GC_ROOT_ENTRY $t2
+    BRB_GC_ROOT_ENTRY $s2
+    BRB_GC_ROOT_ENTRY $s3
+    BRB_GC_ROOT_ENTRY $s4
+    BRB_GC_ROOT_ENTRY $s5
+    BRB_GC_ROOT_ENTRY $s6
+    BRB_GC_ROOT_ENTRY $s7
+    BRB_GC_ROOT_ENTRY $s8
+    .global art_quick_read_barrier_mark_introspection_end_of_entries
+art_quick_read_barrier_mark_introspection_end_of_entries:
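+    # No delay-slot filler is needed here: BRB_GC_ROOT_ENTRY ends in a compact jic.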
+
+.Lintrospection_throw_npe:
+    b       art_quick_throw_null_pointer_exception
+    daddiu  $ra, $ra, 4         # Skip lwu, make $ra point to lwu's stack map.
+
+    .set push
+    .set noat
+
+    // Fields and array elements.
+
+.Lintrospection_field_array:
+    // Get the field/element address using $t8 and the offset from the lwu instruction.
+    lh      $at, 0($ra)         # $ra points to lwu: $at = low 16 bits of field/element offset.
+    daddiu  $ra, $ra, 4 + HEAP_POISON_INSTR_SIZE   # Skip lwu(+dsubu+dext).
+    daddu   $t8, $t8, $at       # $t8 = field/element address.
+
+    // Calculate the address of the exit point, store it in $t3 and load the reference into $t8.
+    lb      $at, (-HEAP_POISON_INSTR_SIZE - 2)($ra)   # $ra-HEAP_POISON_INSTR_SIZE-4 points to
+                                                      # "lwu `out`, ...".
+    andi    $at, $at, 31        # Extract `out` from lwu.
+
+    lwu     $t8, 0($t8)         # $t8 = reference.
+    UNPOISON_HEAP_REF $t8
+
+    // Return if null reference.
+    bnez    $t8, .Lintrospection_common
+    dlsa    $t3, $at, $t3, 3    # $t3 = address of the exit point
+                                # (BRB_FIELD_EXIT* macro is 8 bytes).
+
+    // Early return through the exit point.
+.Lintrospection_return_early:
+    jic     $t3, 0              # Move $t8 to `out` and return.
+
+    // Code common for GC roots, fields and array elements.
+
+.Lintrospection_common:
+    // Check lock word for mark bit, if marked return.
+    lw      $t9, MIRROR_OBJECT_LOCK_WORD_OFFSET($t8)
+    sll     $at, $t9, 31 - LOCK_WORD_MARK_BIT_SHIFT     # Move mark bit to sign bit.
+    bltzc   $at, .Lintrospection_return_early
+#if (LOCK_WORD_STATE_SHIFT != 30) || (LOCK_WORD_STATE_FORWARDING_ADDRESS != 3)
+    // The below code depends on the lock word state being in the highest bits
+    // and the "forwarding address" state having all bits set.
+#error "Unexpected lock word state shift or forwarding address state value."
+#endif
+    // Test that both forwarding address state bits are 1.
+    sll     $at, $t9, 1
+    and     $at, $at, $t9                               # Sign bit = 1 IFF both bits are 1.
+    bgezc   $at, .Lintrospection_mark
+
+    .set pop
+
+    // Shift left by the forwarding address shift. This clears out the state bits since they are
+    // in the top 2 bits of the lock word.
+    sll     $t8, $t9, LOCK_WORD_STATE_FORWARDING_ADDRESS_SHIFT
+    jalr    $zero, $t3          # Move $t8 to `out` and return.
+    dext    $t8, $t8, 0, 32     # Make sure the address is zero-extended.
+
+.Lintrospection_mark:
+    // Partially set up the stack frame preserving only $ra.
+    daddiu  $sp, $sp, -320
+    .cfi_adjust_cfa_offset 320
+    sd      $ra, 312($sp)
+    .cfi_rel_offset 31, 312
+
+    // Set up $gp, clobbering $ra.
+    lapc    $ra, 1f
+1:
+    .cpsetup $ra, 304, 1b       # Save old $gp in 304($sp).
+
+    // Finalize the stack frame and call.
+    sd      $t3, 296($sp)       # Preserve the exit point address.
+    sd      $t2, 288($sp)
+    .cfi_rel_offset 14, 288
+    sd      $t1, 280($sp)
+    .cfi_rel_offset 13, 280
+    sd      $t0, 272($sp)
+    .cfi_rel_offset 12, 272
+    sd      $a7, 264($sp)
+    .cfi_rel_offset 11, 264
+    sd      $a6, 256($sp)
+    .cfi_rel_offset 10, 256
+    sd      $a5, 248($sp)
+    .cfi_rel_offset 9, 248
+    sd      $a4, 240($sp)
+    .cfi_rel_offset 8, 240
+    sd      $a3, 232($sp)
+    .cfi_rel_offset 7, 232
+    sd      $a2, 224($sp)
+    .cfi_rel_offset 6, 224
+    sd      $a1, 216($sp)
+    .cfi_rel_offset 5, 216
+    sd      $a0, 208($sp)
+    .cfi_rel_offset 4, 208
+    sd      $v1, 200($sp)
+    .cfi_rel_offset 3, 200
+    sd      $v0, 192($sp)
+    .cfi_rel_offset 2, 192
+
+    dla     $t9, artReadBarrierMark
+
+    sdc1    $f23, 184($sp)
+    sdc1    $f22, 176($sp)
+    sdc1    $f21, 168($sp)
+    sdc1    $f20, 160($sp)
+    sdc1    $f19, 152($sp)
+    sdc1    $f18, 144($sp)
+    sdc1    $f17, 136($sp)
+    sdc1    $f16, 128($sp)
+    sdc1    $f15, 120($sp)
+    sdc1    $f14, 112($sp)
+    sdc1    $f13, 104($sp)
+    sdc1    $f12,  96($sp)
+    sdc1    $f11,  88($sp)
+    sdc1    $f10,  80($sp)
+    sdc1    $f9,   72($sp)
+    sdc1    $f8,   64($sp)
+    sdc1    $f7,   56($sp)
+    sdc1    $f6,   48($sp)
+    sdc1    $f5,   40($sp)
+    sdc1    $f4,   32($sp)
+    sdc1    $f3,   24($sp)
+    sdc1    $f2,   16($sp)
+    sdc1    $f1,    8($sp)
+    sdc1    $f0,    0($sp)
+
+    jalr    $t9                 # $v0 <- artReadBarrierMark(reference)
+    move    $a0, $t8            # Pass reference in $a0.
+    move    $t8, $v0
+
+    ld      $ra, 312($sp)
+    .cfi_restore 31
+    .cpreturn                   # Restore old $gp from 304($sp).
+    ld      $t3, 296($sp)       # $t3 = address of the exit point.
+    ld      $t2, 288($sp)
+    .cfi_restore 14
+    ld      $t1, 280($sp)
+    .cfi_restore 13
+    ld      $t0, 272($sp)
+    .cfi_restore 12
+    ld      $a7, 264($sp)
+    .cfi_restore 11
+    ld      $a6, 256($sp)
+    .cfi_restore 10
+    ld      $a5, 248($sp)
+    .cfi_restore 9
+    ld      $a4, 240($sp)
+    .cfi_restore 8
+    ld      $a3, 232($sp)
+    .cfi_restore 7
+    ld      $a2, 224($sp)
+    .cfi_restore 6
+    ld      $a1, 216($sp)
+    .cfi_restore 5
+    ld      $a0, 208($sp)
+    .cfi_restore 4
+    ld      $v1, 200($sp)
+    .cfi_restore 3
+    ld      $v0, 192($sp)
+    .cfi_restore 2
+
+    ldc1    $f23, 184($sp)
+    ldc1    $f22, 176($sp)
+    ldc1    $f21, 168($sp)
+    ldc1    $f20, 160($sp)
+    ldc1    $f19, 152($sp)
+    ldc1    $f18, 144($sp)
+    ldc1    $f17, 136($sp)
+    ldc1    $f16, 128($sp)
+    ldc1    $f15, 120($sp)
+    ldc1    $f14, 112($sp)
+    ldc1    $f13, 104($sp)
+    ldc1    $f12,  96($sp)
+    ldc1    $f11,  88($sp)
+    ldc1    $f10,  80($sp)
+    ldc1    $f9,   72($sp)
+    ldc1    $f8,   64($sp)
+    ldc1    $f7,   56($sp)
+    ldc1    $f6,   48($sp)
+    ldc1    $f5,   40($sp)
+    ldc1    $f4,   32($sp)
+    ldc1    $f3,   24($sp)
+    ldc1    $f2,   16($sp)
+    ldc1    $f1,    8($sp)
+    ldc1    $f0,    0($sp)
+
+    // Return through the exit point.
+    jalr    $zero, $t3          # Move $t8 to `out` and return.
+    daddiu  $sp, $sp, 320
+    .cfi_adjust_cfa_offset -320
+
+.Lintrospection_exits:
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT $v0
+    BRB_FIELD_EXIT $v1
+    BRB_FIELD_EXIT $a0
+    BRB_FIELD_EXIT $a1
+    BRB_FIELD_EXIT $a2
+    BRB_FIELD_EXIT $a3
+    BRB_FIELD_EXIT $a4
+    BRB_FIELD_EXIT $a5
+    BRB_FIELD_EXIT $a6
+    BRB_FIELD_EXIT $a7
+    BRB_FIELD_EXIT $t0
+    BRB_FIELD_EXIT $t1
+    BRB_FIELD_EXIT $t2
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT $s2
+    BRB_FIELD_EXIT $s3
+    BRB_FIELD_EXIT $s4
+    BRB_FIELD_EXIT $s5
+    BRB_FIELD_EXIT $s6
+    BRB_FIELD_EXIT $s7
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT_BREAK
+    BRB_FIELD_EXIT $s8
+    BRB_FIELD_EXIT_BREAK
+END art_quick_read_barrier_mark_introspection
+
 .extern artInvokePolymorphic
 ENTRY art_quick_invoke_polymorphic
     SETUP_SAVE_REFS_AND_ARGS_FRAME
diff --git a/runtime/oat.h b/runtime/oat.h
index de4b942..c4a983e 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  // Last oat version changed reason: update kMultiDexSeparator from ':' to '!'.
-  static constexpr uint8_t kOatVersion[] = { '1', '3', '0', '\0' };
+  // Last oat version changed reason: MIPS Baker thunks.
+  static constexpr uint8_t kOatVersion[] = { '1', '3', '1', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";