ARM: Reimplement the UnsafeCASObject intrinsic.

Clean up the shared code for all the UnsafeCAS* intrinsics
to avoid deprecated instruction forms (an IT instruction
should be followed by only a single 16-bit instruction) and
simplify the heap poisoning code, trading a little
performance for readability.
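
For example, the conditional store in the shared CAS loop
changes from a deprecated two-instruction IT block (sketch
condensed from the diff below; names as in the code):

    __ itt(eq);
    __ strex(eq, tmp, value, MemOperand(tmp_ptr));
    __ cmp(eq, tmp, 1);

to a plain compare-and-branch sequence:

    __ Subs(tmp, tmp, expected);
    __ B(ne, failure);
    __ Strex(tmp, value, MemOperand(tmp_ptr));
    __ Cmp(tmp, 0);
    __ B(ne, &loop_head);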

For UnsafeCASObject with Baker read barriers, drop the old
code that updated the field. Perform the main path CAS loop
and, on failure, branch to a slow path that marks the old
value and compares it with the expected value (when not
marking, this is just a few instructions to determine that
they differ). If they match, the old value is known to be
the from-space reference to the expected object, and the
slow path performs a modified CAS loop that accepts both
the from-space and to-space references to that object.
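
In pseudocode (mirroring the comments in the code below;
names are illustrative):

    // Main path:
    do {
      tmp = [obj + offset] - expected;
    } while (tmp == 0 && failure([obj + offset] <- value));
    if (tmp != 0) goto slow_path;

    // Slow path: the main path's `tmp` is saved as
    // `adjusted_old_value`; reconstruct and mark the old value.
    old_value = Mark(adjusted_old_value + expected);
    if (old_value != expected) return false;
    // The unmarked old value is the from-space reference to the
    // expected object; retry the CAS accepting either reference.
    do {
      tmp = [obj + offset] - expected;
    } while ((tmp == 0 || tmp == adjusted_old_value) &&
             failure([obj + offset] <- value));
    result = (tmp == 0 || tmp == adjusted_old_value);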

Also replace all uses of vixl's EmissionCheckScope with
ExactAssemblyScope in the arm32 codegen; all of these uses
are related to the introspection-based Baker read barriers.

Test: Already covered by 160-read-barrier-stress.
Test: m test-art-host-gtest
Test: Pixel 2 XL boots.
Test: testrunner.py --target --optimizing --32
Test: ART_HEAP_POISONING=true testrunner.py --target --optimizing --32
Bug: 36141117
Change-Id: If0b77d278826eb909b437ea9a1336afec575ccdf
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index ce93afc..4a76106 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -111,7 +111,7 @@
  public:
   EmitAdrCode(ArmVIXLMacroAssembler* assembler, vixl32::Register rd, vixl32::Label* label)
       : assembler_(assembler), rd_(rd), label_(label) {
-    ExactAssemblyScope aas(assembler, kMaxInstructionSizeInBytes);
+    DCHECK(!assembler->AllowMacroInstructions());  // In ExactAssemblyScope.
     adr_location_ = assembler->GetCursorOffset();
     assembler->adr(EncodingSize(Wide), rd, label);
   }
@@ -715,294 +715,6 @@
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARMVIXL);
 };
 
-// Abstract base class for read barrier slow paths marking a reference
-// `ref`.
-//
-// Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked or an empty
-// location; in the latter case, the read barrier marking runtime
-// entry point will be loaded by the slow path code itself.
-class ReadBarrierMarkSlowPathBaseARMVIXL : public SlowPathCodeARMVIXL {
- protected:
-  ReadBarrierMarkSlowPathBaseARMVIXL(HInstruction* instruction, Location ref, Location entrypoint)
-      : SlowPathCodeARMVIXL(instruction), ref_(ref), entrypoint_(entrypoint) {
-    DCHECK(kEmitCompilerReadBarrier);
-  }
-
-  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARMVIXL"; }
-
-  // Generate assembly code calling the read barrier marking runtime
-  // entry point (ReadBarrierMarkRegX).
-  void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) {
-    vixl32::Register ref_reg = RegisterFrom(ref_);
-
-    // No need to save live registers; it's taken care of by the
-    // entrypoint. Also, there is no need to update the stack mask,
-    // as this runtime call will not trigger a garbage collection.
-    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
-    DCHECK(!ref_reg.Is(sp));
-    DCHECK(!ref_reg.Is(lr));
-    DCHECK(!ref_reg.Is(pc));
-    // IP is used internally by the ReadBarrierMarkRegX entry point
-    // as a temporary, it cannot be the entry point's input/output.
-    DCHECK(!ref_reg.Is(ip));
-    DCHECK(ref_reg.IsRegister()) << ref_reg;
-    // "Compact" slow path, saving two moves.
-    //
-    // Instead of using the standard runtime calling convention (input
-    // and output in R0):
-    //
-    //   R0 <- ref
-    //   R0 <- ReadBarrierMark(R0)
-    //   ref <- R0
-    //
-    // we just use rX (the register containing `ref`) as input and output
-    // of a dedicated entrypoint:
-    //
-    //   rX <- ReadBarrierMarkRegX(rX)
-    //
-    if (entrypoint_.IsValid()) {
-      arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
-      __ Blx(RegisterFrom(entrypoint_));
-    } else {
-      // Entrypoint is not already loaded, load from the thread.
-      int32_t entry_point_offset =
-          Thread::ReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
-      // This runtime call does not require a stack map.
-      arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
-    }
-  }
-
-  // The location (register) of the marked object reference.
-  const Location ref_;
-
-  // The location of the entrypoint if already loaded.
-  const Location entrypoint_;
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARMVIXL);
-};
-
-// Slow path loading `obj`'s lock word, loading a reference from
-// object `*(obj + offset + (index << scale_factor))` into `ref`, and
-// marking `ref` if `obj` is gray according to the lock word (Baker
-// read barrier). If needed, this slow path also atomically updates
-// the field `obj.field` in the object `obj` holding this reference
-// after marking.
-//
-// This means that after the execution of this slow path, both `ref`
-// and `obj.field` will be up-to-date; i.e., after the flip, both will
-// hold the same to-space reference (unless another thread installed
-// another object reference (different from `ref`) in `obj.field`).
-//
-// Argument `entrypoint` must be a register location holding the read
-// barrier marking runtime entry point to be invoked or an empty
-// location; in the latter case, the read barrier marking runtime
-// entry point will be loaded by the slow path code itself.
-class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL
-    : public ReadBarrierMarkSlowPathBaseARMVIXL {
- public:
-  LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(
-      HInstruction* instruction,
-      Location ref,
-      vixl32::Register obj,
-      uint32_t offset,
-      Location index,
-      ScaleFactor scale_factor,
-      bool needs_null_check,
-      vixl32::Register temp1,
-      vixl32::Register temp2,
-      Location entrypoint = Location::NoLocation())
-      : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint),
-        obj_(obj),
-        offset_(offset),
-        index_(index),
-        scale_factor_(scale_factor),
-        needs_null_check_(needs_null_check),
-        temp1_(temp1),
-        temp2_(temp2) {
-    DCHECK(kEmitCompilerReadBarrier);
-    DCHECK(kUseBakerReadBarrier);
-  }
-
-  const char* GetDescription() const OVERRIDE {
-    return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL";
-  }
-
-  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
-    LocationSummary* locations = instruction_->GetLocations();
-    vixl32::Register ref_reg = RegisterFrom(ref_);
-    DCHECK(locations->CanCall());
-    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg;
-    DCHECK_NE(ref_.reg(), LocationFrom(temp1_).reg());
-
-    // This slow path is only used by the UnsafeCASObject intrinsic at the moment.
-    DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()))
-        << "Unexpected instruction in read barrier marking and field updating slow path: "
-        << instruction_->DebugName();
-    DCHECK(instruction_->GetLocations()->Intrinsified());
-    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject);
-    DCHECK_EQ(offset_, 0u);
-    DCHECK_EQ(scale_factor_, ScaleFactor::TIMES_1);
-    Location field_offset = index_;
-    DCHECK(field_offset.IsRegisterPair()) << field_offset;
-
-    // Temporary register `temp1_`, used to store the lock word, must
-    // not be IP, as we may use it to emit the reference load (in the
-    // call to GenerateRawReferenceLoad below), and we need the lock
-    // word to still be in `temp1_` after the reference load.
-    DCHECK(!temp1_.Is(ip));
-
-    __ Bind(GetEntryLabel());
-
-    // The implementation is:
-    //
-    //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-    //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-    //   HeapReference<mirror::Object> ref = *src;  // Original reference load.
-    //   bool is_gray = (rb_state == ReadBarrier::GrayState());
-    //   if (is_gray) {
-    //     old_ref = ref;
-    //     ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
-    //     compareAndSwapObject(obj, field_offset, old_ref, ref);
-    //   }
-
-    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
-
-    // /* int32_t */ monitor = obj->monitor_
-    uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
-    arm_codegen->GetAssembler()->LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset);
-    if (needs_null_check_) {
-      codegen->MaybeRecordImplicitNullCheck(instruction_);
-    }
-    // /* LockWord */ lock_word = LockWord(monitor)
-    static_assert(sizeof(LockWord) == sizeof(int32_t),
-                  "art::LockWord and int32_t have different sizes.");
-
-    // Introduce a dependency on the lock_word including the rb_state,
-    // which shall prevent load-load reordering without using
-    // a memory barrier (which would be more expensive).
-    // `obj` is unchanged by this operation, but its value now depends
-    // on `temp`.
-    __ Add(obj_, obj_, Operand(temp1_, ShiftType::LSR, 32));
-
-    // The actual reference load.
-    // A possible implicit null check has already been handled above.
-    arm_codegen->GenerateRawReferenceLoad(
-        instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false);
-
-    // Mark the object `ref` when `obj` is gray.
-    //
-    //   if (rb_state == ReadBarrier::GrayState())
-    //     ref = ReadBarrier::Mark(ref);
-    //
-    // Given the numeric representation, it's enough to check the low bit of the
-    // rb_state. We do that by shifting the bit out of the lock word with LSRS
-    // which can be a 16-bit instruction unlike the TST immediate.
-    static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
-    static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
-    __ Lsrs(temp1_, temp1_, LockWord::kReadBarrierStateShift + 1);
-    __ B(cc, GetExitLabel());  // Carry flag is the last bit shifted out by LSRS.
-
-    // Save the old value of the reference before marking it.
-    // Note that we cannot use IP to save the old reference, as IP is
-    // used internally by the ReadBarrierMarkRegX entry point, and we
-    // need the old reference after the call to that entry point.
-    DCHECK(!temp1_.Is(ip));
-    __ Mov(temp1_, ref_reg);
-
-    GenerateReadBarrierMarkRuntimeCall(codegen);
-
-    // If the new reference is different from the old reference,
-    // update the field in the holder (`*(obj_ + field_offset)`).
-    //
-    // Note that this field could also hold a different object, if
-    // another thread had concurrently changed it. In that case, the
-    // LDREX/CMP/BNE sequence of instructions in the compare-and-set
-    // (CAS) operation below would abort the CAS, leaving the field
-    // as-is.
-    __ Cmp(temp1_, ref_reg);
-    __ B(eq, GetExitLabel());
-
-    // Update the holder's field atomically.  This may fail if the
-    // mutator updates it before us, but that's OK.  This is achieved
-    // using a strong compare-and-set (CAS) operation with relaxed
-    // memory synchronization ordering, where the expected value is
-    // the old reference and the desired value is the new reference.
-
-    UseScratchRegisterScope temps(arm_codegen->GetVIXLAssembler());
-    // Convenience aliases.
-    vixl32::Register base = obj_;
-    // The UnsafeCASObject intrinsic uses a register pair as field
-    // offset ("long offset"), of which only the low part contains
-    // data.
-    vixl32::Register offset = LowRegisterFrom(field_offset);
-    vixl32::Register expected = temp1_;
-    vixl32::Register value = ref_reg;
-    vixl32::Register tmp_ptr = temps.Acquire();       // Pointer to actual memory.
-    vixl32::Register tmp = temp2_;                    // Value in memory.
-
-    __ Add(tmp_ptr, base, offset);
-
-    if (kPoisonHeapReferences) {
-      arm_codegen->GetAssembler()->PoisonHeapReference(expected);
-      if (value.Is(expected)) {
-        // Do not poison `value`, as it is the same register as
-        // `expected`, which has just been poisoned.
-      } else {
-        arm_codegen->GetAssembler()->PoisonHeapReference(value);
-      }
-    }
-
-    // do {
-    //   tmp = [r_ptr] - expected;
-    // } while (tmp == 0 && failure([r_ptr] <- r_new_value));
-
-    vixl32::Label loop_head, comparison_failed, exit_loop;
-    __ Bind(&loop_head);
-    __ Ldrex(tmp, MemOperand(tmp_ptr));
-    __ Cmp(tmp, expected);
-    __ B(ne, &comparison_failed, /* far_target */ false);
-    __ Strex(tmp, value, MemOperand(tmp_ptr));
-    __ CompareAndBranchIfZero(tmp, &exit_loop, /* far_target */ false);
-    __ B(&loop_head);
-    __ Bind(&comparison_failed);
-    __ Clrex();
-    __ Bind(&exit_loop);
-
-    if (kPoisonHeapReferences) {
-      arm_codegen->GetAssembler()->UnpoisonHeapReference(expected);
-      if (value.Is(expected)) {
-        // Do not unpoison `value`, as it is the same register as
-        // `expected`, which has just been unpoisoned.
-      } else {
-        arm_codegen->GetAssembler()->UnpoisonHeapReference(value);
-      }
-    }
-
-    __ B(GetExitLabel());
-  }
-
- private:
-  // The register containing the object holding the marked object reference field.
-  const vixl32::Register obj_;
-  // The offset, index and scale factor to access the reference in `obj_`.
-  uint32_t offset_;
-  Location index_;
-  ScaleFactor scale_factor_;
-  // Is a null check required?
-  bool needs_null_check_;
-  // A temporary register used to hold the lock word of `obj_`; and
-  // also to hold the original reference value, when the reference is
-  // marked.
-  const vixl32::Register temp1_;
-  // A temporary register used in the implementation of the CAS, to
-  // update the object's reference field.
-  const vixl32::Register temp2_;
-
-  DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL);
-};
-
 // Slow path generating a read barrier for a heap reference.
 class ReadBarrierForHeapReferenceSlowPathARMVIXL : public SlowPathCodeARMVIXL {
  public:
@@ -2283,9 +1995,12 @@
           if (width == BakerReadBarrierWidth::kWide) {
             DCHECK_GE(literal_offset, 4u);
             uint32_t prev_insn = GetInsn32(literal_offset - 4u);
-            // LDR (immediate), encoding T3, with correct root_reg.
             const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
-            CHECK_EQ(prev_insn & 0xfff0f000u, 0xf8d00000u | (root_reg << 12));
+            // Usually this is LDR (immediate), encoding T3, with the correct root_reg,
+            // but we may have an "ADD root_reg, adjusted_old_value, expected" for UnsafeCASObject.
+            if ((prev_insn & 0xfff0fff0u) != (0xeb000000u | (root_reg << 8))) {   // ADD?
+              CHECK_EQ(prev_insn & 0xfff0f000u, 0xf8d00000u | (root_reg << 12));  // LDR?
+            }
           } else {
             DCHECK_GE(literal_offset, 2u);
             uint32_t prev_insn = GetInsn16(literal_offset - 2u);
@@ -8610,7 +8325,11 @@
       bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
       uint32_t custom_data = EncodeBakerReadBarrierGcRootData(root_reg.GetCode(), narrow);
 
-      vixl::EmissionCheckScope guard(GetVIXLAssembler(), 4 * vixl32::kMaxInstructionSizeInBytes);
+      size_t narrow_instructions = /* CMP */ (mr.IsLow() ? 1u : 0u) + /* LDR */ (narrow ? 1u : 0u);
+      size_t wide_instructions = /* ADR+CMP+LDR+BNE */ 4u - narrow_instructions;
+      size_t exact_size = wide_instructions * vixl32::k32BitT32InstructionSizeInBytes +
+                          narrow_instructions * vixl32::k16BitT32InstructionSizeInBytes;
+      ExactAssemblyScope guard(GetVIXLAssembler(), exact_size);
       vixl32::Label return_address;
       EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
       __ cmp(mr, Operand(0));
@@ -8620,7 +8339,7 @@
       ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
       __ ldr(EncodingSize(narrow ? Narrow : Wide), root_reg, MemOperand(obj, offset));
       EmitBakerReadBarrierBne(custom_data);
-      __ Bind(&return_address);
+      __ bind(&return_address);
       DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
                 narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
                        : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
@@ -8642,6 +8361,32 @@
   MaybeGenerateMarkingRegisterCheck(/* code */ 19);
 }
 
+void CodeGeneratorARMVIXL::GenerateUnsafeCasOldValueAddWithBakerReadBarrier(
+    vixl::aarch32::Register old_value,
+    vixl::aarch32::Register adjusted_old_value,
+    vixl::aarch32::Register expected) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+
+  // Similar to the Baker RB path in GenerateGcRootFieldLoad(), with an ADD instead of LDR.
+  uint32_t custom_data = EncodeBakerReadBarrierGcRootData(old_value.GetCode(), /* narrow */ false);
+
+  size_t narrow_instructions = /* CMP */ (mr.IsLow() ? 1u : 0u);
+  size_t wide_instructions = /* ADR+CMP+ADD+BNE */ 4u - narrow_instructions;
+  size_t exact_size = wide_instructions * vixl32::k32BitT32InstructionSizeInBytes +
+                      narrow_instructions * vixl32::k16BitT32InstructionSizeInBytes;
+  ExactAssemblyScope guard(GetVIXLAssembler(), exact_size);
+  vixl32::Label return_address;
+  EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
+  __ cmp(mr, Operand(0));
+  ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
+  __ add(EncodingSize(Wide), old_value, adjusted_old_value, Operand(expected));  // Preserves flags.
+  EmitBakerReadBarrierBne(custom_data);
+  __ bind(&return_address);
+  DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+            BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
+}
+
 void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                  Location ref,
                                                                  vixl32::Register obj,
@@ -8682,9 +8427,14 @@
       EncodeBakerReadBarrierFieldData(src.GetBaseRegister().GetCode(), obj.GetCode(), narrow);
 
   {
-    vixl::EmissionCheckScope guard(
-        GetVIXLAssembler(),
-        (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
+    size_t narrow_instructions =
+        /* CMP */ (mr.IsLow() ? 1u : 0u) +
+        /* LDR+unpoison? */ (narrow ? (kPoisonHeapReferences ? 2u : 1u) : 0u);
+    size_t wide_instructions =
+        /* ADR+CMP+LDR+BNE+unpoison? */ (kPoisonHeapReferences ? 5u : 4u) - narrow_instructions;
+    size_t exact_size = wide_instructions * vixl32::k32BitT32InstructionSizeInBytes +
+                        narrow_instructions * vixl32::k16BitT32InstructionSizeInBytes;
+    ExactAssemblyScope guard(GetVIXLAssembler(), exact_size);
     vixl32::Label return_address;
     EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
     __ cmp(mr, Operand(0));
@@ -8703,7 +8453,7 @@
         __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
       }
     }
-    __ Bind(&return_address);
+    __ bind(&return_address);
     DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
               narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
                      : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET);
@@ -8775,9 +8525,12 @@
 
   __ Add(data_reg, obj, Operand(data_offset));
   {
-    vixl::EmissionCheckScope guard(
-        GetVIXLAssembler(),
-        (kPoisonHeapReferences ? 5u : 4u) * vixl32::kMaxInstructionSizeInBytes);
+    size_t narrow_instructions = /* CMP */ (mr.IsLow() ? 1u : 0u);
+    size_t wide_instructions =
+        /* ADR+CMP+BNE+LDR+unpoison? */ (kPoisonHeapReferences ? 5u : 4u) - narrow_instructions;
+    size_t exact_size = wide_instructions * vixl32::k32BitT32InstructionSizeInBytes +
+                        narrow_instructions * vixl32::k16BitT32InstructionSizeInBytes;
+    ExactAssemblyScope guard(GetVIXLAssembler(), exact_size);
     vixl32::Label return_address;
     EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
     __ cmp(mr, Operand(0));
@@ -8789,127 +8542,13 @@
     if (kPoisonHeapReferences) {
       __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
     }
-    __ Bind(&return_address);
+    __ bind(&return_address);
     DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
               BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
   }
   MaybeGenerateMarkingRegisterCheck(/* code */ 21, /* temp_loc */ LocationFrom(ip));
 }
 
-void CodeGeneratorARMVIXL::UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
-                                                                    Location ref,
-                                                                    vixl32::Register obj,
-                                                                    Location field_offset,
-                                                                    Location temp,
-                                                                    bool needs_null_check,
-                                                                    vixl32::Register temp2) {
-  DCHECK(kEmitCompilerReadBarrier);
-  DCHECK(kUseBakerReadBarrier);
-
-  // Query `art::Thread::Current()->GetIsGcMarking()` (stored in the
-  // Marking Register) to decide whether we need to enter the slow
-  // path to update the reference field within `obj`. Then, in the
-  // slow path, check the gray bit in the lock word of the reference's
-  // holder (`obj`) to decide whether to mark `ref` and update the
-  // field or not.
-  //
-  //   if (mr) {  // Thread::Current()->GetIsGcMarking()
-  //     // Slow path.
-  //     uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
-  //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
-  //     HeapReference<mirror::Object> ref = *(obj + field_offset);  // Reference load.
-  //     bool is_gray = (rb_state == ReadBarrier::GrayState());
-  //     if (is_gray) {
-  //       old_ref = ref;
-  //       entrypoint = Thread::Current()->pReadBarrierMarkReg ## root.reg()
-  //       ref = entrypoint(ref);  // ref = ReadBarrier::Mark(ref);  // Runtime entry point call.
-  //       compareAndSwapObject(obj, field_offset, old_ref, ref);
-  //     }
-  //   }
-
-  vixl32::Register temp_reg = RegisterFrom(temp);
-
-  // Slow path updating the object reference at address `obj + field_offset`
-  // when the GC is marking. The entrypoint will be loaded by the slow path code.
-  SlowPathCodeARMVIXL* slow_path =
-      new (GetScopedAllocator()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(
-          instruction,
-          ref,
-          obj,
-          /* offset */ 0u,
-          /* index */ field_offset,
-          /* scale_factor */ ScaleFactor::TIMES_1,
-          needs_null_check,
-          temp_reg,
-          temp2);
-  AddSlowPath(slow_path);
-
-  __ CompareAndBranchIfNonZero(mr, slow_path->GetEntryLabel());
-  // Fast path: the GC is not marking: nothing to do (the field is
-  // up-to-date, and we don't need to load the reference).
-  __ Bind(slow_path->GetExitLabel());
-  MaybeGenerateMarkingRegisterCheck(/* code */ 23);
-}
-
-void CodeGeneratorARMVIXL::GenerateRawReferenceLoad(HInstruction* instruction,
-                                                    Location ref,
-                                                    vixl32::Register obj,
-                                                    uint32_t offset,
-                                                    Location index,
-                                                    ScaleFactor scale_factor,
-                                                    bool needs_null_check) {
-  DataType::Type type = DataType::Type::kReference;
-  vixl32::Register ref_reg = RegisterFrom(ref, type);
-
-  // If needed, vixl::EmissionCheckScope guards are used to ensure
-  // that no pools are emitted between the load (macro) instruction
-  // and MaybeRecordImplicitNullCheck.
-
-  if (index.IsValid()) {
-    // Load types involving an "index": ArrayGet,
-    // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
-    // intrinsics.
-    // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor))
-    if (index.IsConstant()) {
-      size_t computed_offset =
-          (Int32ConstantFrom(index) << scale_factor) + offset;
-      vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-      GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset);
-      if (needs_null_check) {
-        MaybeRecordImplicitNullCheck(instruction);
-      }
-    } else {
-      // Handle the special case of the
-      // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
-      // intrinsics, which use a register pair as index ("long
-      // offset"), of which only the low part contains data.
-      vixl32::Register index_reg = index.IsRegisterPair()
-          ? LowRegisterFrom(index)
-          : RegisterFrom(index);
-      UseScratchRegisterScope temps(GetVIXLAssembler());
-      vixl32::Register temp = temps.Acquire();
-      __ Add(temp, obj, Operand(index_reg, ShiftType::LSL, scale_factor));
-      {
-        vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-        GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp, offset);
-        if (needs_null_check) {
-          MaybeRecordImplicitNullCheck(instruction);
-        }
-      }
-    }
-  } else {
-    // /* HeapReference<mirror::Object> */ ref = *(obj + offset)
-    vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
-    GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, offset);
-    if (needs_null_check) {
-      MaybeRecordImplicitNullCheck(instruction);
-    }
-  }
-
-  // Object* ref = ref_addr->AsMirrorPtr()
-  GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
-}
-
 void CodeGeneratorARMVIXL::MaybeGenerateMarkingRegisterCheck(int code, Location temp_loc) {
   // The following condition is a compile-time one, so it does not have a run-time cost.
   if (kEmitCompilerReadBarrier && kUseBakerReadBarrier && kIsDebugBuild) {
@@ -9199,7 +8838,7 @@
 }
 
 void CodeGeneratorARMVIXL::EmitBakerReadBarrierBne(uint32_t custom_data) {
-  ExactAssemblyScope eas(GetVIXLAssembler(), 1 * k32BitT32InstructionSizeInBytes);
+  DCHECK(!__ AllowMacroInstructions());  // In ExactAssemblyScope.
   if (Runtime::Current()->UseJitCompilation()) {
     auto it = jit_baker_read_barrier_slow_paths_.FindOrAdd(custom_data);
     vixl::aarch32::Label* slow_path_entry = &it->second.label;
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 2fd18ca..f9eb876 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -622,6 +622,11 @@
                                vixl::aarch32::Register obj,
                                uint32_t offset,
                                ReadBarrierOption read_barrier_option);
+  // Generate an ADD for UnsafeCASObject to reconstruct the old value from
+  // `adjusted_old_value` (i.e. old value - expected) and mark it with a Baker read barrier.
+  void GenerateUnsafeCasOldValueAddWithBakerReadBarrier(vixl::aarch32::Register old_value,
+                                                        vixl::aarch32::Register adjusted_old_value,
+                                                        vixl::aarch32::Register expected);
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
   // Overload suitable for Unsafe.getObject/-Volatile() intrinsic.
@@ -647,35 +652,6 @@
                                              Location temp,
                                              bool needs_null_check);
 
-  // Generate code checking whether the reference field at the
-  // address `obj + field_offset`, held by object `obj`, needs to be
-  // marked, and if so, marking it and updating the field within `obj`
-  // with the marked value.
-  //
-  // This routine is used for the implementation of the
-  // UnsafeCASObject intrinsic with Baker read barriers.
-  //
-  // This method has a structure similar to
-  // GenerateReferenceLoadWithBakerReadBarrier, but note that argument
-  // `ref` is only as a temporary here, and thus its value should not
-  // be used afterwards.
-  void UpdateReferenceFieldWithBakerReadBarrier(HInstruction* instruction,
-                                                Location ref,
-                                                vixl::aarch32::Register obj,
-                                                Location field_offset,
-                                                Location temp,
-                                                bool needs_null_check,
-                                                vixl::aarch32::Register temp2);
-
-  // Generate a heap reference load (with no read barrier).
-  void GenerateRawReferenceLoad(HInstruction* instruction,
-                                Location ref,
-                                vixl::aarch32::Register obj,
-                                uint32_t offset,
-                                Location index,
-                                ScaleFactor scale_factor,
-                                bool needs_null_check);
-
   // Emit code checking the status of the Marking Register, and
   // aborting the program if MR does not match the value stored in the
   // art::Thread object. Code is only emitted in debug mode and if
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index b920750..e5d8693 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -936,9 +936,7 @@
                codegen_);
 }
 
-static void CreateIntIntIntIntIntToIntPlusTemps(ArenaAllocator* allocator,
-                                                HInvoke* invoke,
-                                                DataType::Type type) {
+static void CreateIntIntIntIntIntToIntPlusTemps(ArenaAllocator* allocator, HInvoke* invoke) {
   bool can_call = kEmitCompilerReadBarrier &&
       kUseBakerReadBarrier &&
       (invoke->GetIntrinsic() == Intrinsics::kUnsafeCASObject);
@@ -948,20 +946,16 @@
                                           ? LocationSummary::kCallOnSlowPath
                                           : LocationSummary::kNoCall,
                                       kIntrinsified);
+  if (can_call) {
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  }
   locations->SetInAt(0, Location::NoLocation());        // Unused receiver.
   locations->SetInAt(1, Location::RequiresRegister());
   locations->SetInAt(2, Location::RequiresRegister());
   locations->SetInAt(3, Location::RequiresRegister());
   locations->SetInAt(4, Location::RequiresRegister());
 
-  // If heap poisoning is enabled, we don't want the unpoisoning
-  // operations to potentially clobber the output. Likewise when
-  // emitting a (Baker) read barrier, which may call.
-  Location::OutputOverlap overlaps =
-      ((kPoisonHeapReferences && type == DataType::Type::kReference) || can_call)
-      ? Location::kOutputOverlap
-      : Location::kNoOutputOverlap;
-  locations->SetOut(Location::RequiresRegister(), overlaps);
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
 
   // Temporary registers used in CAS. In the object case
   // (UnsafeCASObject intrinsic), these are also used for
@@ -970,24 +964,92 @@
   locations->AddTemp(Location::RequiresRegister());  // Temp 1.
 }
 
+class BakerReadBarrierCasSlowPathARMVIXL : public SlowPathCodeARMVIXL {
+ public:
+  explicit BakerReadBarrierCasSlowPathARMVIXL(HInvoke* invoke)
+      : SlowPathCodeARMVIXL(invoke) {}
+
+  const char* GetDescription() const OVERRIDE { return "BakerReadBarrierCasSlowPathARMVIXL"; }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
+    ArmVIXLAssembler* assembler = arm_codegen->GetAssembler();
+    __ Bind(GetEntryLabel());
+
+    LocationSummary* locations = instruction_->GetLocations();
+    vixl32::Register base = InputRegisterAt(instruction_, 1);           // Object pointer.
+    vixl32::Register offset = LowRegisterFrom(locations->InAt(2));      // Offset (discard high 4B).
+    vixl32::Register expected = InputRegisterAt(instruction_, 3);       // Expected.
+    vixl32::Register value = InputRegisterAt(instruction_, 4);          // Value.
+
+    vixl32::Register tmp_ptr = RegisterFrom(locations->GetTemp(0));     // Pointer to actual memory.
+    vixl32::Register tmp = RegisterFrom(locations->GetTemp(1));         // Temporary.
+
+    // `tmp` was initialized to `[tmp_ptr] - expected` in the main path. Reconstruct
+    // and mark the old value, then compare it with `expected`. We clobber `tmp_ptr`
+    // in the process, for lack of other temps suitable for the read barrier.
+    arm_codegen->GenerateUnsafeCasOldValueAddWithBakerReadBarrier(tmp_ptr, tmp, expected);
+    __ Cmp(tmp_ptr, expected);
+    __ B(ne, GetExitLabel());
+
+    // The old value we have read did not match `expected` (which is always a to-space reference)
+    // but after the read barrier in GenerateUnsafeCasOldValueAddWithBakerReadBarrier() the marked
+    // to-space value matched, so the old value must be a from-space reference to the same object.
+    // Do the same CAS loop as the main path but check for both `expected` and the unmarked
+    // old value representing the to-space and from-space references for the same object.
+
+    UseScratchRegisterScope temps(assembler->GetVIXLAssembler());
+    vixl32::Register adjusted_old_value = temps.Acquire();      // For saved `tmp` from main path.
+
+    // Recalculate `tmp_ptr` clobbered above and save the old `tmp` in `adjusted_old_value` (IP).
+    __ Add(tmp_ptr, base, offset);
+    __ Mov(adjusted_old_value, tmp);
+
+    // do {
+    //   tmp = [r_ptr] - expected;
+    // } while ((tmp == 0 || tmp == adjusted_old_value) && failure([r_ptr] <- r_new_value));
+    // result = (tmp == 0 || tmp == adjusted_old_value);
+
+    vixl32::Label loop_head;
+    __ Bind(&loop_head);
+    __ Ldrex(tmp, MemOperand(tmp_ptr));  // This can now load null stored by another thread.
+    assembler->MaybeUnpoisonHeapReference(tmp);
+    __ Subs(tmp, tmp, expected);         // Use SUBS to get non-zero value if both compares fail.
+    {
+      // If the newly loaded value did not match `expected`, compare with `adjusted_old_value`.
+      ExactAssemblyScope aas(assembler->GetVIXLAssembler(), 2 * k16BitT32InstructionSizeInBytes);
+      __ it(ne);
+      __ cmp(ne, tmp, adjusted_old_value);
+    }
+    __ B(ne, GetExitLabel());
+    assembler->MaybePoisonHeapReference(value);
+    __ Strex(tmp, value, MemOperand(tmp_ptr));
+    assembler->MaybeUnpoisonHeapReference(value);
+    __ Cmp(tmp, 0);
+    __ B(ne, &loop_head, /* far_target */ false);
+    __ B(GetExitLabel());
+  }
+};
+
 static void GenCas(HInvoke* invoke, DataType::Type type, CodeGeneratorARMVIXL* codegen) {
   DCHECK_NE(type, DataType::Type::kInt64);
 
   ArmVIXLAssembler* assembler = codegen->GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
 
-  Location out_loc = locations->Out();
   vixl32::Register out = OutputRegister(invoke);                      // Boolean result.
 
   vixl32::Register base = InputRegisterAt(invoke, 1);                 // Object pointer.
-  Location offset_loc = locations->InAt(2);
-  vixl32::Register offset = LowRegisterFrom(offset_loc);              // Offset (discard high 4B).
+  vixl32::Register offset = LowRegisterFrom(locations->InAt(2));      // Offset (discard high 4B).
   vixl32::Register expected = InputRegisterAt(invoke, 3);             // Expected.
   vixl32::Register value = InputRegisterAt(invoke, 4);                // Value.
 
-  Location tmp_ptr_loc = locations->GetTemp(0);
-  vixl32::Register tmp_ptr = RegisterFrom(tmp_ptr_loc);               // Pointer to actual memory.
-  vixl32::Register tmp = RegisterFrom(locations->GetTemp(1));         // Value in memory.
+  vixl32::Register tmp_ptr = RegisterFrom(locations->GetTemp(0));     // Pointer to actual memory.
+  vixl32::Register tmp = RegisterFrom(locations->GetTemp(1));         // Temporary.
+
+  vixl32::Label loop_exit_label;
+  vixl32::Label* loop_exit = &loop_exit_label;
+  vixl32::Label* failure = &loop_exit_label;
 
   if (type == DataType::Type::kReference) {
     // The only read barrier implementation supporting the
@@ -1000,87 +1062,62 @@
     codegen->MarkGCCard(tmp_ptr, tmp, base, value, value_can_be_null);
 
     if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-      // Need to make sure the reference stored in the field is a to-space
-      // one before attempting the CAS or the CAS could fail incorrectly.
-      codegen->UpdateReferenceFieldWithBakerReadBarrier(
-          invoke,
-          out_loc,  // Unused, used only as a "temporary" within the read barrier.
-          base,
-          /* field_offset */ offset_loc,
-          tmp_ptr_loc,
-          /* needs_null_check */ false,
-          tmp);
+      // If marking, check if the stored reference is a from-space reference to the same
+      // object as the to-space reference `expected`. If so, perform a custom CAS loop.
+      BakerReadBarrierCasSlowPathARMVIXL* slow_path =
+          new (codegen->GetScopedAllocator()) BakerReadBarrierCasSlowPathARMVIXL(invoke);
+      codegen->AddSlowPath(slow_path);
+      failure = slow_path->GetEntryLabel();
+      loop_exit = slow_path->GetExitLabel();
     }
   }
 
   // Prevent reordering with prior memory operations.
   // Emit a DMB ISH instruction instead of a DMB ISHST one, as the
-  // latter allows a preceding load to be delayed past the STXR
+  // latter allows a preceding load to be delayed past the STREX
   // instruction below.
   __ Dmb(vixl32::ISH);
 
   __ Add(tmp_ptr, base, offset);
 
-  if (kPoisonHeapReferences && type == DataType::Type::kReference) {
-    codegen->GetAssembler()->PoisonHeapReference(expected);
-    if (value.Is(expected)) {
-      // Do not poison `value`, as it is the same register as
-      // `expected`, which has just been poisoned.
-    } else {
-      codegen->GetAssembler()->PoisonHeapReference(value);
-    }
-  }
-
   // do {
   //   tmp = [r_ptr] - expected;
   // } while (tmp == 0 && failure([r_ptr] <- r_new_value));
-  // result = tmp != 0;
+  // result = tmp == 0;
 
   vixl32::Label loop_head;
   __ Bind(&loop_head);
-
   __ Ldrex(tmp, MemOperand(tmp_ptr));
-
-  __ Subs(tmp, tmp, expected);
-
-  {
-    ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
-                           3 * kMaxInstructionSizeInBytes,
-                           CodeBufferCheckScope::kMaximumSize);
-
-    __ itt(eq);
-    __ strex(eq, tmp, value, MemOperand(tmp_ptr));
-    __ cmp(eq, tmp, 1);
+  if (type == DataType::Type::kReference) {
+    assembler->MaybeUnpoisonHeapReference(tmp);
   }
+  __ Subs(tmp, tmp, expected);
+  __ B(ne, failure, (failure == loop_exit) ? kNear : kBranchWithoutHint);
+  if (type == DataType::Type::kReference) {
+    assembler->MaybePoisonHeapReference(value);
+  }
+  __ Strex(tmp, value, MemOperand(tmp_ptr));
+  if (type == DataType::Type::kReference) {
+    assembler->MaybeUnpoisonHeapReference(value);
+  }
+  __ Cmp(tmp, 0);
+  __ B(ne, &loop_head, /* far_target */ false);
 
-  __ B(eq, &loop_head, /* far_target */ false);
+  __ Bind(loop_exit);
 
   __ Dmb(vixl32::ISH);
 
-  __ Rsbs(out, tmp, 1);
+  // out = (tmp == 0): CLZ yields 32 for a zero input (32 >> 5 == 1), else <= 31 (>> 5 == 0).
+  __ Clz(out, tmp);
+  __ Lsr(out, out, WhichPowerOf2(out.GetSizeInBits()));
 
-  {
-    ExactAssemblyScope aas(assembler->GetVIXLAssembler(),
-                           2 * kMaxInstructionSizeInBytes,
-                           CodeBufferCheckScope::kMaximumSize);
-
-    __ it(cc);
-    __ mov(cc, out, 0);
-  }
-
-  if (kPoisonHeapReferences && type == DataType::Type::kReference) {
-    codegen->GetAssembler()->UnpoisonHeapReference(expected);
-    if (value.Is(expected)) {
-      // Do not unpoison `value`, as it is the same register as
-      // `expected`, which has just been unpoisoned.
-    } else {
-      codegen->GetAssembler()->UnpoisonHeapReference(value);
-    }
+  if (type == DataType::Type::kReference) {
+    codegen->MaybeGenerateMarkingRegisterCheck(/* code */ 128);
   }
 }
 
 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeCASInt(HInvoke* invoke) {
-  CreateIntIntIntIntIntToIntPlusTemps(allocator_, invoke, DataType::Type::kInt32);
+  CreateIntIntIntIntIntToIntPlusTemps(allocator_, invoke);
 }
 void IntrinsicLocationsBuilderARMVIXL::VisitUnsafeCASObject(HInvoke* invoke) {
   // The only read barrier implementation supporting the
@@ -1089,7 +1126,7 @@
     return;
   }
 
-  CreateIntIntIntIntIntToIntPlusTemps(allocator_, invoke, DataType::Type::kReference);
+  CreateIntIntIntIntIntToIntPlusTemps(allocator_, invoke);
 }
 void IntrinsicCodeGeneratorARMVIXL::VisitUnsafeCASInt(HInvoke* invoke) {
   GenCas(invoke, DataType::Type::kInt32, codegen_);