arm: Implement VarHandle CAS intrinsics.

Using benchmarks provided by [reference missing]
on blueline little cores with fixed frequency 1420800:
                                           before after
CompareAndSetStaticFieldInt                26.452 0.031
CompareAndSetStaticFieldString             31.672 0.037
CompareAndSetFieldInt                      29.569 0.033
CompareAndSetFieldString                   34.095 0.042
WeakCompareAndSetStaticFieldInt            26.470 0.031
WeakCompareAndSetStaticFieldString         31.604 0.038
WeakCompareAndSetFieldInt                  29.619 0.033
WeakCompareAndSetFieldString               34.058 0.040
WeakCompareAndSetPlainStaticFieldInt       26.508 0.026
WeakCompareAndSetPlainStaticFieldString    31.675 0.031
WeakCompareAndSetPlainFieldInt             29.635 0.028
WeakCompareAndSetPlainFieldString          34.116 0.034
WeakCompareAndSetAcquireStaticFieldInt     26.512 0.030
WeakCompareAndSetAcquireStaticFieldString  31.661 0.035
WeakCompareAndSetAcquireFieldInt           29.661 0.032
WeakCompareAndSetAcquireFieldString        34.120 0.038
WeakCompareAndSetReleaseStaticFieldInt     26.566 0.027
WeakCompareAndSetReleaseStaticFieldString  31.659 0.034
WeakCompareAndSetReleaseFieldInt           29.676 0.029
WeakCompareAndSetReleaseFieldString        34.204 0.037
CompareAndExchangeStaticFieldInt           25.550 0.031
CompareAndExchangeStaticFieldString        31.219 0.039
CompareAndExchangeFieldInt                 28.923 0.032
CompareAndExchangeFieldString              33.622 0.040
CompareAndExchangeAcquireStaticFieldInt    25.559 0.029
CompareAndExchangeAcquireStaticFieldString 31.177 0.037
CompareAndExchangeAcquireFieldInt          28.807 0.031
CompareAndExchangeAcquireFieldString       33.524 0.038
CompareAndExchangeReleaseStaticFieldInt    25.481 0.027
CompareAndExchangeReleaseStaticFieldString 31.132 0.036
CompareAndExchangeReleaseFieldInt          28.825 0.029
CompareAndExchangeReleaseFieldString       33.511 0.038

Oddly, this rewrite makes the Unsafe CAS benchmarks regress
a bit on this configuration. However, experiments show that
adding a useless CLZ+LSR sequence operating on a temporary
register (corresponding to the old code's result calculation)
would restore the performance to the old level. We prefer not
to add these useless instructions, as the situation is likely
to be reversed on different CPU cores.

Test: Covered by existing tests.
Test: testrunner.py --target --32 --optimizing
Bug: 71781600
Change-Id: I591009d7494533cdf60a47be2f8826144e059ff5
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index e7f49f0..a310545 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -658,11 +658,9 @@
                                vixl::aarch32::Register obj,
                                uint32_t offset,
                                ReadBarrierOption read_barrier_option);
-  // Generate ADD for UnsafeCASObject to reconstruct the old value from
-  // `old_value - expected` and mark it with Baker read barrier.
-  void GenerateUnsafeCasOldValueAddWithBakerReadBarrier(vixl::aarch32::Register old_value,
-                                                        vixl::aarch32::Register adjusted_old_value,
-                                                        vixl::aarch32::Register expected);
+  // Generate MOV for an intrinsic CAS to mark the old value with Baker read barrier.
+  void GenerateIntrinsicCasMoveWithBakerReadBarrier(vixl::aarch32::Register marked_old_value,
+                                                    vixl::aarch32::Register old_value);
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference field load when Baker's read barriers are used.
   // Overload suitable for Unsafe.getObject/-Volatile() intrinsic.
@@ -710,6 +708,18 @@
   virtual void MaybeGenerateMarkingRegisterCheck(int code,
                                                  Location temp_loc = Location::NoLocation());
+  // Create slow path for a read barrier for a heap reference within `instruction`.
+  //
+  // This is a helper function for GenerateReadBarrierSlow() that has the same
+  // arguments. The creation and adding of the slow path is exposed for intrinsics
+  // that cannot use GenerateReadBarrierSlow() from their own slow paths.
+  SlowPathCodeARMVIXL* AddReadBarrierSlowPath(HInstruction* instruction,
+                                              Location out,
+                                              Location ref,
+                                              Location obj,
+                                              uint32_t offset,
+                                              Location index);
   // Generate a read barrier for a heap reference within `instruction`
   // using a slow path.
@@ -799,11 +809,11 @@
   // Encoding of thunk type and data for link-time generated thunks for Baker read barriers.
   enum class BakerReadBarrierKind : uint8_t {
-    kField,       // Field get or array get with constant offset (i.e. constant index).
-    kArray,       // Array get with index in register.
-    kGcRoot,      // GC root load.
-    kUnsafeCas,   // UnsafeCASObject intrinsic.
-    kLast = kUnsafeCas
+    kField,         // Field get or array get with constant offset (i.e. constant index).
+    kArray,         // Array get with index in register.
+    kGcRoot,        // GC root load.
+    kIntrinsicCas,  // Unsafe/VarHandle CAS intrinsic.
+    kLast = kIntrinsicCas
   enum class BakerReadBarrierWidth : uint8_t {
@@ -870,9 +880,9 @@
-  static uint32_t EncodeBakerReadBarrierUnsafeCasData(uint32_t root_reg) {
+  static uint32_t EncodeBakerReadBarrierIntrinsicCasData(uint32_t root_reg) {
-    return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kUnsafeCas) |
+    return BakerReadBarrierKindField::Encode(BakerReadBarrierKind::kIntrinsicCas) |
            BakerReadBarrierFirstRegField::Encode(root_reg) |
            BakerReadBarrierSecondRegField::Encode(kBakerReadBarrierInvalidEncodedReg) |