arm: Implement VarHandle.get/set intrinsics.

This includes the Opaque, Acquire/Release and Volatile variants.
Refactor Unsafe.get/put operations to share code with the
new VarHandle intrinsics, fixing potentially non-atomic
64-bit "Ordered" operations in the process.

Results of the benchmarks provided by
    https://android-review.googlesource.com/1420959
on blueline little cores with fixed frequency 1420800:
                             before after
GetStaticFieldInt            23.937 0.014
GetStaticFieldString         24.497 0.019
GetFieldInt                  27.510 0.016
GetFieldString               28.000 0.021
GetAcquireStaticFieldInt     23.953 0.017
GetAcquireStaticFieldString  24.532 0.021
GetAcquireFieldInt           27.457 0.020
GetAcquireFieldString        28.137 0.023
GetOpaqueStaticFieldInt      23.955 0.014
GetOpaqueStaticFieldString   24.530 0.019
GetOpaqueFieldInt            27.461 0.016
GetOpaqueFieldString         28.164 0.021
GetVolatileStaticFieldInt    23.971 0.017
GetVolatileStaticFieldString 24.612 0.021
GetVolatileFieldInt          27.518 0.020
GetVolatileFieldString       28.178 0.023
SetStaticFieldInt            25.291 0.014
SetStaticFieldString         28.873 0.018
SetFieldInt                  28.676 0.016
SetFieldString               32.286 0.021
SetVolatileStaticFieldInt    25.339 0.021
SetVolatileStaticFieldString 28.904 0.028
SetVolatileFieldInt          28.730 0.023
SetVolatileFieldString       32.322 0.030
SetOpaqueStaticFieldInt      25.343 0.014
SetOpaqueStaticFieldString   28.992 0.018
SetOpaqueFieldInt            28.749 0.016
SetOpaqueFieldString         32.317 0.022
SetReleaseStaticFieldInt     25.354 0.016
SetReleaseStaticFieldString  28.906 0.025
SetReleaseFieldInt           28.678 0.017
SetReleaseFieldString        32.262 0.027

Test: Covered by existing test 712-varhandle-invocations.
Test: testrunner.py --target --32 --optimizing
Test: Repeat with ART_USE_READ_BARRIER=false ART_HEAP_POISONING=true.
Test: Repeat with ART_READ_BARRIER_TYPE=TABLELOOKUP.
Bug: 71781600
Change-Id: I0ac6d0c154791d787d5c4abd8095e3c2eee9abbb
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 940f521..fcc4e06 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -36,6 +36,7 @@
 #include "linker/linker_patch.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
+#include "mirror/var_handle.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread.h"
 #include "utils/arm/assembler_arm_vixl.h"
@@ -834,14 +835,18 @@
         // to an object field within an object.
         DCHECK(instruction_->IsInvoke()) << instruction_->DebugName();
         DCHECK(instruction_->GetLocations()->Intrinsified());
-        DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
-               (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
+        Intrinsics intrinsic = instruction_->AsInvoke()->GetIntrinsic();
+        DCHECK(intrinsic == Intrinsics::kUnsafeGetObject ||
+               intrinsic == Intrinsics::kUnsafeGetObjectVolatile ||
+               mirror::VarHandle::GetAccessModeTemplateByIntrinsic(intrinsic) ==
+                   mirror::VarHandle::AccessModeTemplate::kGet)
             << instruction_->AsInvoke()->GetIntrinsic();
         DCHECK_EQ(offset_, 0U);
-        DCHECK(index_.IsRegisterPair());
-        // UnsafeGet's offset location is a register pair, the low
-        // part contains the correct offset.
-        index = index_.ToLow();
+        // Though UnsafeGet's offset location is a register pair, we only pass the low
+        // part (high part is irrelevant for 32-bit addresses) to the slow path.
+        // For VarHandle intrinsics, the index is always just a register.
+        DCHECK(index_.IsRegister());
+        index = index_;
       }
     }
 
@@ -923,7 +928,9 @@
     vixl32::Register reg_out = RegisterFrom(out_);
     DCHECK(locations->CanCall());
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out.GetCode()));
-    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString())
+    DCHECK(instruction_->IsLoadClass() ||
+           instruction_->IsLoadString() ||
+           (instruction_->IsInvoke() && instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier for GC root slow path: "
         << instruction_->DebugName();
 
@@ -6729,7 +6736,7 @@
         }
       }
 
-      codegen_->MarkGCCard(temp1, temp2, array, value, /* can_be_null= */ false);
+      codegen_->MarkGCCard(temp1, temp2, array, value, /* value_can_be_null= */ false);
 
       if (can_value_be_null) {
         DCHECK(do_store.IsReferenced());
@@ -6960,10 +6967,10 @@
                                       vixl32::Register card,
                                       vixl32::Register object,
                                       vixl32::Register value,
-                                      bool can_be_null) {
+                                      bool value_can_be_null) {
   vixl32::Label is_null;
-  if (can_be_null) {
-    __ CompareAndBranchIfZero(value, &is_null);
+  if (value_can_be_null) {
+    __ CompareAndBranchIfZero(value, &is_null, /* is_far_target=*/ false);
   }
   // Load the address of the card table into `card`.
   GetAssembler()->LoadFromOffset(
@@ -6985,7 +6992,7 @@
   // of the card to mark; and 2. to load the `kCardDirty` value) saves a load
   // (no need to explicitly load `kCardDirty` as an immediate value).
   __ Strb(card, MemOperand(card, temp));
-  if (can_be_null) {
+  if (value_can_be_null) {
     __ Bind(&is_null);
   }
 }
@@ -9711,18 +9718,10 @@
     return;
   }
 
-  // TODO: Consider pairs in the parallel move resolver, then this could be nicely merged
-  //       with the last branch.
-  if (type == DataType::Type::kInt64) {
-    TODO_VIXL32(FATAL);
-  } else if (type == DataType::Type::kFloat64) {
-    TODO_VIXL32(FATAL);
-  } else {
-    // Let the parallel move resolver take care of all of this.
-    HParallelMove parallel_move(GetGraph()->GetAllocator());
-    parallel_move.AddMove(return_loc, trg, type, nullptr);
-    GetMoveResolver()->EmitNativeCode(&parallel_move);
-  }
+  // Let the parallel move resolver take care of all of this.
+  HParallelMove parallel_move(GetGraph()->GetAllocator());
+  parallel_move.AddMove(return_loc, trg, type, nullptr);
+  GetMoveResolver()->EmitNativeCode(&parallel_move);
 }
 
 void LocationsBuilderARMVIXL::VisitClassTableGet(HClassTableGet* instruction) {