ARM64: Introspection Baker RB for volatile fields.

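Extend the ARM64 introspection Baker read barrier thunks to volatile
reference field loads. A new thunk kind, BakerReadBarrierKind::kAcquire,
covers loads emitted with LDAR: the compiler always materializes the
field address in the fixed temp register (LDAR takes no offset) and the
thunk's slow path reloads the reference with LDAR before jumping to the
introspection mark entrypoint. Volatile field gets now use the fixed
temp location instead of a register-allocated temp, matching
non-volatile gets with large offsets.

For a volatile reference field get, the emitted sequence is roughly the
following (a sketch in the style of the in-code pseudocode; the thunk
and register names are illustrative):

      // LDAR takes no offset, so pre-compute the address in the fixed temp.
      base = obj + offset;
      lr = &gray_return_address;
      if (mr) {  // Thread::Current()->GetIsGcMarking()
        goto acquire_thunk<holder_reg, base_reg>(lr)
      }
    not_gray_return_address:
      HeapReference<mirror::Object> reference = LoadAcquire(base);  // LDAR
    gray_return_address:
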
Test: Already covered by 160-read-barrier-stress.
Test: m test-art-host-gtest
Test: Pixel 2 XL boots.
Test: m test-art-target-gtest
Test: testrunner.py --target --optimizing --jit --64
Bug: 36141117
Change-Id: I2f9a707587d1ee27c0efb19d77becba7ec7ffec4
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 723446b..7aaa7bf 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1417,13 +1417,20 @@
       BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data);
       // Check that the next instruction matches the expected LDR.
       switch (kind) {
-        case BakerReadBarrierKind::kField: {
+        case BakerReadBarrierKind::kField:
+        case BakerReadBarrierKind::kAcquire: {
           DCHECK_GE(code.size() - literal_offset, 8u);
           uint32_t next_insn = GetInsn(literal_offset + 4u);
-          // LDR (immediate) with correct base_reg.
           CheckValidReg(next_insn & 0x1fu);  // Check destination register.
           const uint32_t base_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
-          CHECK_EQ(next_insn & 0xffc003e0u, 0xb9400000u | (base_reg << 5));
+          if (kind == BakerReadBarrierKind::kField) {
+            // LDR (immediate) with correct base_reg.
+            CHECK_EQ(next_insn & 0xffc003e0u, 0xb9400000u | (base_reg << 5));
+          } else {
+            DCHECK(kind == BakerReadBarrierKind::kAcquire);
+            // LDAR with correct base_reg.
+            CHECK_EQ(next_insn & 0xffffffe0u, 0x88dffc00u | (base_reg << 5));
+          }
           break;
         }
         case BakerReadBarrierKind::kArray: {
@@ -2275,17 +2282,12 @@
                                                            : LocationSummary::kNoCall);
   if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
     locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
-    if (!field_info.IsVolatile()) {
-      // We need a temporary register for the read barrier load in
-      // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier()
-      // only if the offset is too big.
-      if (field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) {
-        locations->AddTemp(FixedTempLocation());
-      }
-    } else {
-      // Volatile fields need a temporary register for the read barrier marking slow
-      // path in CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier().
-      locations->AddTemp(Location::RequiresRegister());
+    // We need a temporary register for the read barrier load in
+    // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier()
+    // only if the field is volatile or the offset is too big.
+    if (field_info.IsVolatile() ||
+        field_info.GetFieldOffset().Uint32Value() >= kReferenceLoadMinFarOffset) {
+      locations->AddTemp(FixedTempLocation());
     }
   }
   locations->SetInAt(0, Location::RequiresRegister());
@@ -6294,81 +6296,76 @@
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
-  if (!use_load_acquire) {
-    // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
-    // Marking Register) to decide whether we need to enter the slow
-    // path to mark the reference. Then, in the slow path, check the
-    // gray bit in the lock word of the reference's holder (`obj`) to
-    // decide whether to mark `ref` or not.
-    //
-    // We use shared thunks for the slow path; shared within the method
-    // for JIT, across methods for AOT. That thunk checks the holder
-    // and jumps to the entrypoint if needed. If the holder is not gray,
-    // it creates a fake dependency and returns to the LDR instruction.
-    //
-    //     lr = &gray_return_address;
-    //     if (mr) {  // Thread::Current()->GetIsGcMarking()
-    //       goto field_thunk<holder_reg, base_reg>(lr)
-    //     }
-    //   not_gray_return_address:
-    //     // Original reference load. If the offset is too large to fit
-    //     // into LDR, we use an adjusted base register here.
-    //     HeapReference<mirror::Object> reference = *(obj+offset);
-    //   gray_return_address:
+  // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
+  // Marking Register) to decide whether we need to enter the slow
+  // path to mark the reference. Then, in the slow path, check the
+  // gray bit in the lock word of the reference's holder (`obj`) to
+  // decide whether to mark `ref` or not.
+  //
+  // We use shared thunks for the slow path; shared within the method
+  // for JIT, across methods for AOT. That thunk checks the holder
+  // and jumps to the entrypoint if needed. If the holder is not gray,
+  // it creates a fake dependency and returns to the LDR (or LDAR) instruction.
+  //
+  //     lr = &gray_return_address;
+  //     if (mr) {  // Thread::Current()->GetIsGcMarking()
+  //       goto field_thunk<holder_reg, base_reg, use_load_acquire>(lr)
+  //     }
+  //   not_gray_return_address:
+  //     // Original reference load. If the offset is too large to fit into LDR,
+  //     // or the load needs acquire semantics (LDAR), we use an adjusted base register here.
+  //     HeapReference<mirror::Object> reference = *(obj+offset);
+  //   gray_return_address:
 
-    DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
-    Register base = obj;
-    if (offset >= kReferenceLoadMinFarOffset) {
-      DCHECK(maybe_temp.IsRegister());
-      base = WRegisterFrom(maybe_temp);
-      static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
-      __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
-      offset &= (kReferenceLoadMinFarOffset - 1u);
-    }
-    UseScratchRegisterScope temps(GetVIXLAssembler());
-    DCHECK(temps.IsAvailable(ip0));
-    DCHECK(temps.IsAvailable(ip1));
-    temps.Exclude(ip0, ip1);
-    uint32_t custom_data = EncodeBakerReadBarrierFieldData(base.GetCode(), obj.GetCode());
-
-    {
-      ExactAssemblyScope guard(GetVIXLAssembler(),
-                               (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize);
-      vixl::aarch64::Label return_address;
-      __ adr(lr, &return_address);
-      EmitBakerReadBarrierCbnz(custom_data);
-      static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                    "Field LDR must be 1 instruction (4B) before the return address label; "
-                    " 2 instructions (8B) for heap poisoning.");
-      Register ref_reg = RegisterFrom(ref, DataType::Type::kReference);
-      __ ldr(ref_reg, MemOperand(base.X(), offset));
-      if (needs_null_check) {
-        MaybeRecordImplicitNullCheck(instruction);
-      }
-      // Unpoison the reference explicitly if needed. MaybeUnpoisonHeapReference() uses
-      // macro instructions disallowed in ExactAssemblyScope.
-      if (kPoisonHeapReferences) {
-        __ neg(ref_reg, Operand(ref_reg));
-      }
-      __ bind(&return_address);
-    }
-    MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__, /* temp_loc */ LocationFrom(ip1));
-    return;
+  DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+  Register base = obj;
+  if (use_load_acquire) {
+    DCHECK(maybe_temp.IsRegister());
+    base = WRegisterFrom(maybe_temp);
+    __ Add(base, obj, offset);  // LDAR takes no offset, so compute the full address.
+    offset = 0u;
+  } else if (offset >= kReferenceLoadMinFarOffset) {
+    DCHECK(maybe_temp.IsRegister());
+    base = WRegisterFrom(maybe_temp);
+    static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
+    __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
+    offset &= (kReferenceLoadMinFarOffset - 1u);
   }
+  UseScratchRegisterScope temps(GetVIXLAssembler());
+  DCHECK(temps.IsAvailable(ip0));
+  DCHECK(temps.IsAvailable(ip1));
+  temps.Exclude(ip0, ip1);
+  uint32_t custom_data = use_load_acquire
+      ? EncodeBakerReadBarrierAcquireData(base.GetCode(), obj.GetCode())
+      : EncodeBakerReadBarrierFieldData(base.GetCode(), obj.GetCode());
 
-  // /* HeapReference<Object> */ ref = *(obj + offset)
-  Register temp = WRegisterFrom(maybe_temp);
-  Location no_index = Location::NoLocation();
-  size_t no_scale_factor = 0u;
-  GenerateReferenceLoadWithBakerReadBarrier(instruction,
-                                            ref,
-                                            obj,
-                                            offset,
-                                            no_index,
-                                            no_scale_factor,
-                                            temp,
-                                            needs_null_check,
-                                            use_load_acquire);
+  {
+    ExactAssemblyScope guard(GetVIXLAssembler(),
+                             (kPoisonHeapReferences ? 4u : 3u) * vixl::aarch64::kInstructionSize);
+    vixl::aarch64::Label return_address;
+    __ adr(lr, &return_address);
+    EmitBakerReadBarrierCbnz(custom_data);
+    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
+                  "Field LDR must be 1 instruction (4B) before the return address label; "
+                  " 2 instructions (8B) for heap poisoning.");
+    Register ref_reg = RegisterFrom(ref, DataType::Type::kReference);
+    if (use_load_acquire) {
+      DCHECK_EQ(offset, 0u);
+      __ ldar(ref_reg, MemOperand(base.X()));
+    } else {
+      __ ldr(ref_reg, MemOperand(base.X(), offset));
+    }
+    if (needs_null_check) {
+      MaybeRecordImplicitNullCheck(instruction);
+    }
+    // Unpoison the reference explicitly if needed. MaybeUnpoisonHeapReference() uses
+    // macro instructions disallowed in ExactAssemblyScope.
+    if (kPoisonHeapReferences) {
+      __ neg(ref_reg, Operand(ref_reg));
+    }
+    __ bind(&return_address);
+  }
+  MaybeGenerateMarkingRegisterCheck(/* code */ __LINE__, /* temp_loc */ LocationFrom(ip1));
 }
 
 void CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier(Location ref,
@@ -6806,7 +6803,8 @@
                                                       /*out*/ std::string* debug_name) {
   BakerReadBarrierKind kind = BakerReadBarrierKindField::Decode(encoded_data);
   switch (kind) {
-    case BakerReadBarrierKind::kField: {
+    case BakerReadBarrierKind::kField:
+    case BakerReadBarrierKind::kAcquire: {
       auto base_reg =
           Register::GetXRegFromCode(BakerReadBarrierFirstRegField::Decode(encoded_data));
       CheckValidReg(base_reg.GetCode());
@@ -6832,11 +6830,18 @@
       MemOperand lock_word(holder_reg, mirror::Object::MonitorOffset().Int32Value());
       EmitGrayCheckAndFastPath(assembler, base_reg, lock_word, &slow_path, throw_npe);
       __ Bind(&slow_path);
-      MemOperand ldr_address(lr, BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET);
-      __ Ldr(ip0.W(), ldr_address);         // Load the LDR (immediate) unsigned offset.
-      LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1);
-      __ Ubfx(ip0.W(), ip0.W(), 10, 12);    // Extract the offset.
-      __ Ldr(ip0.W(), MemOperand(base_reg, ip0, LSL, 2));   // Load the reference.
+      if (kind == BakerReadBarrierKind::kField) {
+        MemOperand ldr_address(lr, BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET);
+        __ Ldr(ip0.W(), ldr_address);         // Load the LDR (immediate) unsigned offset.
+        LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1);
+        __ Ubfx(ip0.W(), ip0.W(), 10, 12);    // Extract the offset.
+        __ Ldr(ip0.W(), MemOperand(base_reg, ip0, LSL, 2));   // Load the reference.
+      } else {
+        DCHECK(kind == BakerReadBarrierKind::kAcquire);
+        DCHECK(!base_reg.Is(holder_reg));
+        LoadReadBarrierMarkIntrospectionEntrypoint(assembler, ip1);
+        __ Ldar(ip0.W(), MemOperand(base_reg));   // Load the reference with acquire semantics.
+      }
       // Do not unpoison. With heap poisoning enabled, the entrypoint expects a poisoned reference.
       __ Br(ip1);                           // Jump to the entrypoint.
       break;
@@ -6917,6 +6922,10 @@
         oss << "Field_r" << BakerReadBarrierFirstRegField::Decode(encoded_data)
             << "_r" << BakerReadBarrierSecondRegField::Decode(encoded_data);
         break;
+      case BakerReadBarrierKind::kAcquire:
+        oss << "Acquire_r" << BakerReadBarrierFirstRegField::Decode(encoded_data)
+            << "_r" << BakerReadBarrierSecondRegField::Decode(encoded_data);
+        break;
       case BakerReadBarrierKind::kArray:
         oss << "Array_r" << BakerReadBarrierFirstRegField::Decode(encoded_data);
         DCHECK_EQ(kBakerReadBarrierInvalidEncodedReg,
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 5aeb0b4..6a358a4 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -797,9 +797,10 @@
   // Encoding of thunk type and data for link-time generated thunks for Baker read barriers.
 
   enum class BakerReadBarrierKind : uint8_t {
-    kField,   // Field get or array get with constant offset (i.e. constant index).
-    kArray,   // Array get with index in register.
-    kGcRoot,  // GC root load.
+    kField,     // Field get or array get with constant offset (i.e. constant index).
+    kAcquire,   // Volatile field get.
+    kArray,     // Array get with index in register.
+    kGcRoot,    // GC root load.
     kLast = kGcRoot
   };
 
@@ -832,6 +833,15 @@
            BakerReadBarrierSecondRegField::Encode(holder_reg);
   }
 
+  static inline uint32_t EncodeBakerReadBarrierAcquireData(uint32_t base_reg, uint32_t holder_reg) {
+    CheckValidReg(base_reg);
+    CheckValidReg(holder_reg);
+    DCHECK_NE(base_reg, holder_reg);
+    return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kAcquire) |
+           BakerReadBarrierFirstRegField::Encode(base_reg) |
+           BakerReadBarrierSecondRegField::Encode(holder_reg);
+  }
+
   static inline uint32_t EncodeBakerReadBarrierArrayData(uint32_t base_reg) {
     CheckValidReg(base_reg);
     return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kArray) |
diff --git a/test/635-checker-arm64-volatile-load-cc/src/Main.java b/test/635-checker-arm64-volatile-load-cc/src/Main.java
index 6a26e94..89fad4c 100644
--- a/test/635-checker-arm64-volatile-load-cc/src/Main.java
+++ b/test/635-checker-arm64-volatile-load-cc/src/Main.java
@@ -255,9 +255,9 @@
 
   /// CHECK-START-ARM64: void Main.testStaticVolatileFieldGetWithLargeOffset() disassembly (after)
   /// CHECK:               StaticFieldGet
-  /// CHECK:                 mov x17, #<<Offset:0x[0-9a-f]{4}>>
-  /// CHECK:                 add x16, {{x\d+}}, x17
-  /// CHECK:                 ldar {{w\d+}}, [x16]
+  /// CHECK:                 mov <<Kind:x|w>><<Temp1:\d+>>, #<<Offset:0x[0-9a-f]{4}>>
+  /// CHECK:                 add <<Kind>><<Temp2:\d+>>, <<Kind>>{{\d+}}, <<Kind>><<Temp1>>
+  /// CHECK:                 ldar {{w\d+}}, [x<<Temp2>>]
   static void testStaticVolatileFieldGetWithLargeOffset() {
     // The offset of this static field cannot be encoded as an immediate on ARM64.
     Object s = s999;
@@ -265,9 +265,9 @@
 
   /// CHECK-START-ARM64: void Main.testInstanceVolatileFieldGetWithLargeOffset() disassembly (after)
   /// CHECK:               InstanceFieldGet
-  /// CHECK:                 mov x17, #<<Offset:0x[0-9a-f]{4}>>
-  /// CHECK:                 add x16, {{x\d+}}, x17
-  /// CHECK:                 ldar {{w\d+}}, [x16]
+  /// CHECK:                 mov <<Kind:x|w>><<Temp1:\d+>>, #<<Offset:0x[0-9a-f]{4}>>
+  /// CHECK:                 add <<Kind>><<Temp2:\d+>>, <<Kind>>{{\d+}}, <<Kind>><<Temp1>>
+  /// CHECK:                 ldar {{w\d+}}, [x<<Temp2>>]
   void testInstanceVolatileFieldGetWithLargeOffset() {
     // The offset of this instance field cannot be encoded as an immediate on ARM64.
     Object i = i1029;