ARM: Implement VarHandle CAS intrinsics.

Using benchmarks provided by
    https://android-review.googlesource.com/1420959
on blueline little cores with frequency fixed at 1420800 kHz:
                                           before after
CompareAndSetStaticFieldInt                26.452 0.031
CompareAndSetStaticFieldString             31.672 0.037
CompareAndSetFieldInt                      29.569 0.033
CompareAndSetFieldString                   34.095 0.042
WeakCompareAndSetStaticFieldInt            26.470 0.031
WeakCompareAndSetStaticFieldString         31.604 0.038
WeakCompareAndSetFieldInt                  29.619 0.033
WeakCompareAndSetFieldString               34.058 0.040
WeakCompareAndSetPlainStaticFieldInt       26.508 0.026
WeakCompareAndSetPlainStaticFieldString    31.675 0.031
WeakCompareAndSetPlainFieldInt             29.635 0.028
WeakCompareAndSetPlainFieldString          34.116 0.034
WeakCompareAndSetAcquireStaticFieldInt     26.512 0.030
WeakCompareAndSetAcquireStaticFieldString  31.661 0.035
WeakCompareAndSetAcquireFieldInt           29.661 0.032
WeakCompareAndSetAcquireFieldString        34.120 0.038
WeakCompareAndSetReleaseStaticFieldInt     26.566 0.027
WeakCompareAndSetReleaseStaticFieldString  31.659 0.034
WeakCompareAndSetReleaseFieldInt           29.676 0.029
WeakCompareAndSetReleaseFieldString        34.204 0.037
CompareAndExchangeStaticFieldInt           25.550 0.031
CompareAndExchangeStaticFieldString        31.219 0.039
CompareAndExchangeFieldInt                 28.923 0.032
CompareAndExchangeFieldString              33.622 0.040
CompareAndExchangeAcquireStaticFieldInt    25.559 0.029
CompareAndExchangeAcquireStaticFieldString 31.177 0.037
CompareAndExchangeAcquireFieldInt          28.807 0.031
CompareAndExchangeAcquireFieldString       33.524 0.038
CompareAndExchangeReleaseStaticFieldInt    25.481 0.027
CompareAndExchangeReleaseStaticFieldString 31.132 0.036
CompareAndExchangeReleaseFieldInt          28.825 0.029
CompareAndExchangeReleaseFieldString       33.511 0.038

Oddly, this rewrite makes the Unsafe CAS benchmarks regress
a bit on this configuration. However, experiments show that
adding a useless CLZ+LSR pair operating on a temporary
register (corresponding to the old code's result calculation)
would restore the performance to the old level. We prefer not
to add these useless instructions, as the situation is likely
to be reversed on different CPU cores.
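
For reference, the old code's result calculation was the classic
CLZ+LSR idiom that converts a value which is zero exactly on
success (such as the STREX status) into a boolean. A sketch in
VIXL macro-assembler style; `tmp` and `out` are placeholder
registers, not necessarily the registers the old code used:

    __ Clz(out, tmp);     // CLZ of 0 is 32; CLZ of non-zero is at most 31.
    __ Lsr(out, out, 5);  // 32 >> 5 == 1 (success); smaller values >> 5 == 0.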

Test: Covered by existing tests.
Test: testrunner.py --target --32 --optimizing
Test: Repeat with ART_USE_READ_BARRIER=false ART_HEAP_POISONING=true.
Test: Repeat with ART_READ_BARRIER_TYPE=TABLELOOKUP.
Test: run-gtests.sh
Bug: 71781600
Change-Id: I591009d7494533cdf60a47be2f8826144e059ff5
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index cdd5d22..44fd128 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -839,7 +839,11 @@
         DCHECK(intrinsic == Intrinsics::kUnsafeGetObject ||
                intrinsic == Intrinsics::kUnsafeGetObjectVolatile ||
                mirror::VarHandle::GetAccessModeTemplateByIntrinsic(intrinsic) ==
-                   mirror::VarHandle::AccessModeTemplate::kGet)
+                   mirror::VarHandle::AccessModeTemplate::kGet ||
+               mirror::VarHandle::GetAccessModeTemplateByIntrinsic(intrinsic) ==
+                   mirror::VarHandle::AccessModeTemplate::kCompareAndSet ||
+               mirror::VarHandle::GetAccessModeTemplateByIntrinsic(intrinsic) ==
+                   mirror::VarHandle::AccessModeTemplate::kCompareAndExchange)
             << instruction_->AsInvoke()->GetIntrinsic();
         DCHECK_EQ(offset_, 0U);
         // Though UnsafeGet's offset location is a register pair, we only pass the low
@@ -2019,18 +2023,22 @@
           } else {
             DCHECK_GE(literal_offset, 2u);
             uint32_t prev_insn = GetInsn16(literal_offset - 2u);
-            // LDR (immediate), encoding T1, with correct root_reg.
             const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
-            CHECK_EQ(prev_insn & 0xf807u, 0x6800u | root_reg);
+            // Usually LDR (immediate), encoding T1, with correct root_reg, but we may have
+            // a `MOV marked, old_value` for intrinsic CAS where `marked` is a low register.
+            if ((prev_insn & 0xff87u) != (0x4600u | root_reg)) {
+              CHECK_EQ(prev_insn & 0xf807u, 0x6800u | root_reg);
+            }
           }
           break;
         }
-        case BakerReadBarrierKind::kUnsafeCas: {
+        case BakerReadBarrierKind::kIntrinsicCas: {
           DCHECK_GE(literal_offset, 4u);
           uint32_t prev_insn = GetInsn32(literal_offset - 4u);
-          // ADD (register), encoding T3, with correct root_reg.
+          // MOV (register), encoding T3, with correct root_reg.
           const uint32_t root_reg = BakerReadBarrierFirstRegField::Decode(encoded_data);
-          CHECK_EQ(prev_insn & 0xfff0fff0u, 0xeb000000u | (root_reg << 8));
+          DCHECK_GE(root_reg, 8u);  // Used only for high registers.
+          CHECK_EQ(prev_insn & 0xfffffff0u, 0xea4f0000u | (root_reg << 8));
           break;
         }
         default:
@@ -8801,18 +8809,22 @@
   MaybeGenerateMarkingRegisterCheck(/* code= */ 20);
 }
 
-void CodeGeneratorARMVIXL::GenerateUnsafeCasOldValueAddWithBakerReadBarrier(
-    vixl::aarch32::Register old_value,
-    vixl::aarch32::Register adjusted_old_value,
-    vixl::aarch32::Register expected) {
+void CodeGeneratorARMVIXL::GenerateIntrinsicCasMoveWithBakerReadBarrier(
+    vixl::aarch32::Register marked_old_value,
+    vixl::aarch32::Register old_value) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
-  // Similar to the Baker RB path in GenerateGcRootFieldLoad(), with an ADD instead of LDR.
-  uint32_t custom_data = EncodeBakerReadBarrierUnsafeCasData(old_value.GetCode());
+  // Similar to the Baker RB path in GenerateGcRootFieldLoad(), with a MOV instead of LDR.
+  // For low registers, we can reuse the GC root narrow entrypoint; for high registers
+  // we use a specialized entrypoint because the register bits are 8-11 instead of 12-15.
+  bool narrow_mov = marked_old_value.IsLow();
+  uint32_t custom_data = narrow_mov
+      ? EncodeBakerReadBarrierGcRootData(marked_old_value.GetCode(), /*narrow=*/ true)
+      : EncodeBakerReadBarrierIntrinsicCasData(marked_old_value.GetCode());
 
-  size_t narrow_instructions = /* CMP */ (mr.IsLow() ? 1u : 0u);
-  size_t wide_instructions = /* ADR+CMP+ADD+BNE */ 4u - narrow_instructions;
+  size_t narrow_instructions = /* CMP */ (mr.IsLow() ? 1u : 0u) + /* MOV */ (narrow_mov ? 1u : 0u);
+  size_t wide_instructions = /* ADR+CMP+MOV+BNE */ 4u - narrow_instructions;
   size_t exact_size = wide_instructions * vixl32::k32BitT32InstructionSizeInBytes +
                       narrow_instructions * vixl32::k16BitT32InstructionSizeInBytes;
   ExactAssemblyScope guard(GetVIXLAssembler(), exact_size);
@@ -8820,11 +8832,13 @@
   EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
   __ cmp(mr, Operand(0));
   ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
-  __ add(EncodingSize(Wide), old_value, adjusted_old_value, Operand(expected));  // Preserves flags.
+  __ mov(EncodingSize(narrow_mov ? Narrow : Wide), marked_old_value, old_value);
   EmitBakerReadBarrierBne(custom_data);
   __ bind(&return_address);
   DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
-            BAKER_MARK_INTROSPECTION_UNSAFE_CAS_ADD_OFFSET);
+            narrow_mov
+                ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
+                : BAKER_MARK_INTROSPECTION_INTRINSIC_CAS_MOV_OFFSET);
 }
 
 void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -9003,6 +9017,18 @@
   }
 }
 
+SlowPathCodeARMVIXL* CodeGeneratorARMVIXL::AddReadBarrierSlowPath(HInstruction* instruction,
+                                                                  Location out,
+                                                                  Location ref,
+                                                                  Location obj,
+                                                                  uint32_t offset,
+                                                                  Location index) {
+  SlowPathCodeARMVIXL* slow_path = new (GetScopedAllocator())
+      ReadBarrierForHeapReferenceSlowPathARMVIXL(instruction, out, ref, obj, offset, index);
+  AddSlowPath(slow_path);
+  return slow_path;
+}
+
 void CodeGeneratorARMVIXL::GenerateReadBarrierSlow(HInstruction* instruction,
                                                    Location out,
                                                    Location ref,
@@ -9022,9 +9048,8 @@
   // not used by the artReadBarrierSlow entry point.
   //
   // TODO: Unpoison `ref` when it is used by artReadBarrierSlow.
-  SlowPathCodeARMVIXL* slow_path = new (GetScopedAllocator())
-      ReadBarrierForHeapReferenceSlowPathARMVIXL(instruction, out, ref, obj, offset, index);
-  AddSlowPath(slow_path);
+  SlowPathCodeARMVIXL* slow_path =
+      AddReadBarrierSlowPath(instruction, out, ref, obj, offset, index);
 
   __ B(slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
@@ -9932,7 +9957,7 @@
       break;
     }
     case BakerReadBarrierKind::kGcRoot:
-    case BakerReadBarrierKind::kUnsafeCas: {
+    case BakerReadBarrierKind::kIntrinsicCas: {
       // Check if the reference needs to be marked and if so (i.e. not null, not marked yet
       // and it does not have a forwarding address), call the correct introspection entrypoint;
       // otherwise return the reference (or the extracted forwarding address).
@@ -9959,15 +9984,19 @@
       __ Cmp(ip, Operand(0xc0000000));
       __ B(hs, &forwarding_address);
       vixl32::Register ep_reg = LoadReadBarrierMarkIntrospectionEntrypoint(assembler);
-      // Adjust the art_quick_read_barrier_mark_introspection address in kBakerCcEntrypointRegister
-      // to one of art_quick_read_barrier_mark_introspection_{gc_roots_{wide,narrow},unsafe_cas}.
-      DCHECK(kind != BakerReadBarrierKind::kUnsafeCas || width == BakerReadBarrierWidth::kWide);
+      // Adjust the art_quick_read_barrier_mark_introspection address
+      // in kBakerCcEntrypointRegister to one of
+      //     art_quick_read_barrier_mark_introspection_{gc_roots_{wide,narrow},intrinsic_cas}.
+      if (kind == BakerReadBarrierKind::kIntrinsicCas) {
+        DCHECK(width == BakerReadBarrierWidth::kWide);
+        DCHECK(!root_reg.IsLow());
+      }
       int32_t entrypoint_offset =
           (kind == BakerReadBarrierKind::kGcRoot)
               ? (width == BakerReadBarrierWidth::kWide)
                   ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_ENTRYPOINT_OFFSET
                   : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_ENTRYPOINT_OFFSET
-              : BAKER_MARK_INTROSPECTION_UNSAFE_CAS_ENTRYPOINT_OFFSET;
+              : BAKER_MARK_INTROSPECTION_INTRINSIC_CAS_ENTRYPOINT_OFFSET;
       __ Add(ep_reg, ep_reg, Operand(entrypoint_offset));
       __ Mov(ip, root_reg);
       __ Bx(ep_reg);
@@ -10011,8 +10040,8 @@
         DCHECK_EQ(kBakerReadBarrierInvalidEncodedReg,
                   BakerReadBarrierSecondRegField::Decode(encoded_data));
         break;
-      case BakerReadBarrierKind::kUnsafeCas:
-        oss << "UnsafeCas_r" << BakerReadBarrierFirstRegField::Decode(encoded_data);
+      case BakerReadBarrierKind::kIntrinsicCas:
+        oss << "IntrinsicCas_r" << BakerReadBarrierFirstRegField::Decode(encoded_data);
         DCHECK_EQ(kBakerReadBarrierInvalidEncodedReg,
                   BakerReadBarrierSecondRegField::Decode(encoded_data));
         DCHECK(BakerReadBarrierWidthField::Decode(encoded_data) == BakerReadBarrierWidth::kWide);