x86_64: Implement VarHandle.compareAnd* for arrays and byte array views.

Benchmark improvements (measured with the benchmarks provided by
https://android-review.googlesource.com/1420959):

  benchmark                                   before   after
  ----------------------------------------------------------
  CompareAndSetArrayElementInt                2.870    0.007
  CompareAndSetArrayElementString             3.447    0.008
  CompareAndExchangeArrayElementInt           2.826    0.007
  CompareAndExchangeArrayElementString        3.437    0.009
  CompareAndSetByteArrayViewInt               2.929    0.007
  CompareAndExchangeByteArrayViewInt          2.878    0.007
  CompareAndSetByteArrayViewBigEndianInt      2.930    0.007
  CompareAndExchangeByteArrayViewBigEndianInt 2.875    0.007

Bug: 71781600
Test: lunch aosp_cf_x86_64_phone-userdebug \
  && art/test.py --host -r -t 712-varhandle-invocations --64
Test: Repeat with ART_USE_READ_BARRIER=false.
Test: Repeat with ART_HEAP_POISONING=true.
Change-Id: I48fb24ff53b7badfe55ee9d6f394d99ae59efb39
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index be3af2d..c276b8f 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2215,8 +2215,15 @@
                                           DataType::Type type,
                                           Address field_addr,
                                           Location value,
-                                          bool is_cmpxchg) {
+                                          bool is_cmpxchg,
+                                          bool byte_swap) {
   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
+  InstructionCodeGeneratorX86_64* instr_codegen = codegen->GetInstructionCodegen();
+
+  if (byte_swap) {
+    instr_codegen->Bswap(Location::RegisterLocation(RAX), type);
+    instr_codegen->Bswap(value, type);
+  }
 
   switch (type) {
     case DataType::Type::kBool:
@@ -2240,8 +2247,16 @@
   }
   // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
 
+  if (byte_swap) {
+    // Restore byte order for value.
+    instr_codegen->Bswap(value, type);
+  }
+
   CpuRegister rax(RAX);
   if (is_cmpxchg) {
+    if (byte_swap) {
+      instr_codegen->Bswap(Location::RegisterLocation(RAX), type);
+    }
     // Sign-extend or zero-extend the result as necessary.
     switch (type) {
       case DataType::Type::kBool:
@@ -2271,15 +2286,27 @@
                                          Location expected,
                                          Location out,
                                          bool is64bit,
-                                         bool is_cmpxchg) {
+                                         bool is_cmpxchg,
+                                         bool byte_swap) {
   X86_64Assembler* assembler = down_cast<X86_64Assembler*>(codegen->GetAssembler());
+  InstructionCodeGeneratorX86_64* instr_codegen = codegen->GetInstructionCodegen();
+
+  Location rax_loc = Location::RegisterLocation(RAX);
+  Location temp_loc = Location::RegisterLocation(temp.AsRegister());
+
+  DataType::Type type = is64bit ? DataType::Type::kUint64 : DataType::Type::kUint32;
 
   // Copy `expected` to RAX (required by the CMPXCHG instruction).
-  codegen->Move(Location::RegisterLocation(RAX), expected);
+  codegen->Move(rax_loc, expected);
 
   // Copy value to some other register (ensure it's not RAX).
   DCHECK_NE(temp.AsRegister(), RAX);
-  codegen->Move(Location::RegisterLocation(temp.AsRegister()), value);
+  codegen->Move(temp_loc, value);
+
+  if (byte_swap) {
+    instr_codegen->Bswap(rax_loc, type);
+    instr_codegen->Bswap(temp_loc, type);
+  }
 
   if (is64bit) {
     __ LockCmpxchgq(field_addr, temp);
@@ -2287,8 +2314,12 @@
     __ LockCmpxchgl(field_addr, temp);
   }
   // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  // No need to restore byte order for temporary register.
 
   if (is_cmpxchg) {
+    if (byte_swap) {
+      instr_codegen->Bswap(rax_loc, type);
+    }
     __ movd(out.AsFpuRegister<XmmRegister>(), CpuRegister(RAX), is64bit);
   } else {
     GenZFlagToResult(assembler, out.AsRegister<CpuRegister>());
@@ -2403,7 +2434,8 @@
                                        Location new_value,
                                        Location expected,
                                        Location out,
-                                       bool is_cmpxchg) {
+                                       bool is_cmpxchg,
+                                       bool byte_swap) {
   LocationSummary* locations = invoke->GetLocations();
   Address field_address(base, offset, TIMES_1, 0);
 
@@ -2413,7 +2445,7 @@
     DCHECK(RegsAreAllDifferent({base, offset, temp, CpuRegister(RAX)}));
 
     GenCompareAndSetOrExchangeFP(
-        codegen, field_address, temp, new_value, expected, out, is64bit, is_cmpxchg);
+        codegen, field_address, temp, new_value, expected, out, is64bit, is_cmpxchg, byte_swap);
   } else {
     // Both the expected value for CMPXCHG and the output are in RAX.
     DCHECK_EQ(RAX, expected.AsRegister<Register>());
@@ -2428,10 +2460,11 @@
           : CpuRegister(kNoRegister);
       DCHECK(RegsAreAllDifferent({base, offset, temp1, temp2, temp3}));
 
+      DCHECK(!byte_swap);
       GenCompareAndSetOrExchangeRef(
           codegen, invoke, base, offset, new_value_reg, temp1, temp2, temp3, is_cmpxchg);
     } else {
-      GenCompareAndSetOrExchangeInt(codegen, type, field_address, new_value, is_cmpxchg);
+      GenCompareAndSetOrExchangeInt(codegen, type, field_address, new_value, is_cmpxchg, byte_swap);
     }
   }
 }
@@ -2449,7 +2482,8 @@
                              /*new_value=*/ locations->InAt(4),
                              /*expected=*/ locations->InAt(3),
                              locations->Out(),
-                             /*is_cmpxchg=*/ false);
+                             /*is_cmpxchg=*/ false,
+                             /*byte_swap=*/ false);
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitUnsafeCASInt(HInvoke* invoke) {
@@ -3611,6 +3645,8 @@
     switch (mirror::VarHandle::GetAccessModeTemplateByIntrinsic(invoke->GetIntrinsic())) {
       case mirror::VarHandle::AccessModeTemplate::kGet:
       case mirror::VarHandle::AccessModeTemplate::kSet:
+      case mirror::VarHandle::AccessModeTemplate::kCompareAndSet:
+      case mirror::VarHandle::AccessModeTemplate::kCompareAndExchange:
         break;
       default:
         // TODO: Add support for all intrinsics.
@@ -3894,7 +3930,8 @@
 
 static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
                                                      CodeGeneratorX86_64* codegen,
-                                                     bool is_cmpxchg) {
+                                                     bool is_cmpxchg,
+                                                     bool byte_swap = false) {
   DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
 
   X86_64Assembler* assembler = codegen->GetAssembler();
@@ -3905,9 +3942,13 @@
   uint32_t new_value_index = number_of_arguments - 1;
   DataType::Type type = GetDataTypeFromShorty(invoke, expected_value_index);
 
-  VarHandleSlowPathX86_64* slow_path = GenerateVarHandleChecks(invoke, codegen, type);
+  VarHandleSlowPathX86_64* slow_path = nullptr;
   VarHandleTarget target = GetVarHandleTarget(invoke);
-  GenerateVarHandleTarget(invoke, target, codegen);
+  if (!byte_swap) {
+    slow_path = GenerateVarHandleChecks(invoke, codegen, type);
+    GenerateVarHandleTarget(invoke, target, codegen);
+    __ Bind(slow_path->GetNativeByteOrderLabel());
+  }
 
   uint32_t temp_count = locations->GetTempCount();
   GenCompareAndSetOrExchange(codegen,
@@ -3921,12 +3962,15 @@
                              locations->InAt(new_value_index),
                              locations->InAt(expected_value_index),
                              locations->Out(),
-                             is_cmpxchg);
+                             is_cmpxchg,
+                             byte_swap);
 
   // We are using LOCK CMPXCHG in all cases because there is no CAS equivalent that has weak
   // failure semantics. LOCK CMPXCHG has full barrier semantics, so we don't need barriers.
 
-  __ Bind(slow_path->GetExitLabel());
+  if (!byte_swap) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitVarHandleCompareAndSet(HInvoke* invoke) {
@@ -4664,6 +4708,14 @@
     case mirror::VarHandle::AccessModeTemplate::kSet:
       GenerateVarHandleSet(invoke, codegen, is_volatile_, is_atomic_, /*byte_swap=*/ true);
       break;
+    case mirror::VarHandle::AccessModeTemplate::kCompareAndSet:
+      GenerateVarHandleCompareAndSetOrExchange(
+          invoke, codegen, /*is_cmpxchg=*/ false, /*byte_swap=*/ true);
+      break;
+    case mirror::VarHandle::AccessModeTemplate::kCompareAndExchange:
+      GenerateVarHandleCompareAndSetOrExchange(
+          invoke, codegen, /*is_cmpxchg=*/ true, /*byte_swap=*/ true);
+      break;
     default:
       DCHECK(false);
       UNREACHABLE();