arm: Fix VarHandle.compareAndSet() with read barriers.

A stress test will be added in a separate change,
    https://android-review.googlesource.com/2003811 .

Test: testrunner.py --target --32 --optimizing
Bug: 208156527
Change-Id: I9d5f122235a27b6ab629e187afbc3e2f5d4a75ce
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 255f605..f651518 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3485,6 +3485,7 @@
     __ cmp(eq, HighRegisterFrom(old_value), HighRegisterFrom(expected));
   } else if (expected.IsRegisterPair()) {
     DCHECK_EQ(type, DataType::Type::kReference);
+    DCHECK(!expected.Contains(old_value));
     // Check if the loaded value matches any of the two registers in `expected`.
     __ Cmp(RegisterFrom(old_value), LowRegisterFrom(expected));
     ExactAssemblyScope aas(assembler->GetVIXLAssembler(), 2 * k16BitT32InstructionSizeInBytes);
@@ -4893,10 +4894,10 @@
 
   if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
     // The `old_value_temp` is used first for the marked `old_value` and then for the unmarked
-    // reloaded old value for subsequent CAS in the slow path.
-    vixl32::Register old_value_temp = store_result;
+    // reloaded old value for subsequent CAS in the slow path. This must not clobber `old_value`.
+    vixl32::Register old_value_temp = return_success ? RegisterFrom(out) : store_result;
     // The slow path store result must not clobber `old_value`.
-    vixl32::Register slow_path_store_result = return_success ? RegisterFrom(out) : store_result;
+    vixl32::Register slow_path_store_result = old_value_temp;
     ReadBarrierCasSlowPathARMVIXL* rb_slow_path =
         new (codegen->GetScopedAllocator()) ReadBarrierCasSlowPathARMVIXL(
             invoke,