path: root/compiler/optimizing
author Ulya Trofimovich <skvadrik@google.com> 2021-11-19 12:48:28 +0000
committer Ulya Trofimovich <skvadrik@google.com> 2021-12-06 10:13:55 +0000
commit 3322cc5ff93b83c49e694bff14a8f43c893b2ed8 (patch)
tree deca812b3035d104ba0db9937dd0fbbc1f103125 /compiler/optimizing
parent 3a50f34c84f79048dfa19b5a4a66e38a0d148a5c (diff)
x86_64: Implement VarHandle.getAnd* for arrays and byte array views.
Benchmark improvements (using benchmarks provided by
https://android-review.googlesource.com/1420959):

  benchmark                                    before   after
  ----------------------------------------------------------
  GetAndSetArrayElementInt                      2.771   0.013
  GetAndSetArrayElementDouble                   2.709   0.011
  GetAndSetArrayElementString                   3.105   0.012
  GetAndAddArrayElementInt                      2.769   0.013
  GetAndAddArrayElementDouble                   2.713   0.011
  GetAndBitwiseXorArrayElementInt               2.767   0.011
  GetAndSetByteArrayViewInt                     2.819   0.013
  GetAndAddByteArrayViewInt                     2.830   0.013
  GetAndBitwiseXorByteArrayViewInt              2.822   0.012
  GetAndSetByteArrayViewBigEndianInt            2.829   0.012
  GetAndAddByteArrayViewBigEndianInt            2.826   0.012
  GetAndBitwiseXorByteArrayViewBigEndianInt     2.824   0.012

Bug: 71781600
Test: lunch aosp_cf_x86_64_phone-userdebug \
      && art/test.py --host -r -t 712-varhandle-invocations --64
Test: Repeat with ART_USE_READ_BARRIER=false.
Test: Repeat with ART_HEAP_POISONING=true.
Change-Id: Icdc066bb5cbef7b7afe5513842ec8e1ee1d8eb94
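For context, the getAnd* access modes atomically return the field's old value while installing a new one; for byte array views with non-native byte order, the generated code wraps the atomic access in byte swaps. A minimal stand-alone sketch of the getAndSet case in plain C++ (illustrative only, not ART code; assumes a little-endian host, a big-endian 32-bit view, and the GCC/Clang __builtin_bswap32 builtin):

  #include <atomic>
  #include <cstdint>

  // getAndSet on a big-endian int view of little-endian memory: swap the new
  // value to storage order, do the atomic exchange (XCHG on x86-64), then
  // swap the returned old value back to native order.
  uint32_t GetAndSetBigEndian(std::atomic<uint32_t>* field, uint32_t new_value) {
    uint32_t old_in_storage_order = field->exchange(__builtin_bswap32(new_value));
    return __builtin_bswap32(old_in_storage_order);
  }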
Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_x86_64.cc    4
-rw-r--r--  compiler/optimizing/instruction_builder.cc      5
-rw-r--r--  compiler/optimizing/intrinsics_x86_64.cc      372
3 files changed, 247 insertions, 134 deletions
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index fa61c6713f..0747a3a5cd 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -5124,12 +5124,12 @@ void InstructionCodeGeneratorX86_64::Bswap(Location value,
CpuRegister* temp) {
switch (type) {
case DataType::Type::kInt16:
- // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
+ // This should sign-extend, even if reimplemented with an XCHG of 8-bit registers.
__ bswapl(value.AsRegister<CpuRegister>());
__ sarl(value.AsRegister<CpuRegister>(), Immediate(16));
break;
case DataType::Type::kUint16:
- // TODO: Can be done with an xchg of 8b registers. This is straight from Quick.
+ // TODO: Can be done with an XCHG of 8-bit registers. This is straight from Quick.
__ bswapl(value.AsRegister<CpuRegister>());
__ shrl(value.AsRegister<CpuRegister>(), Immediate(16));
break;
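For readers following the Bswap helper: BSWAPL reverses all four bytes of the 32-bit register, leaving the swapped halfword in the upper 16 bits; the shift brings it back down, with SARL vs. SHRL choosing sign- vs. zero-extension. Roughly, in plain C++ (a sketch using the GCC/Clang byte-swap builtin, not the generated code itself):

  #include <cstdint>

  int32_t BswapInt16(int32_t v) {
    // BSWAPL, then SARL 16: the arithmetic shift sign-extends the halfword.
    return static_cast<int32_t>(__builtin_bswap32(static_cast<uint32_t>(v))) >> 16;
  }

  uint32_t BswapUint16(uint32_t v) {
    // BSWAPL, then SHRL 16: the logical shift zero-extends it.
    return __builtin_bswap32(v) >> 16;
  }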
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index ed760f190d..f8bf126eeb 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -1302,6 +1302,11 @@ static void DecideVarHandleIntrinsic(HInvoke* invoke) {
if (IsVarHandleGetAndAdd(invoke) &&
(value_type == DataType::Type::kReference || value_type == DataType::Type::kBool)) {
// We should only add numerical types.
+ //
+ // For byte array views, floating-point types are not allowed; see the javadoc comments for
+ // java.lang.invoke.MethodHandles.byteArrayViewVarHandle(). But ART treats them as numeric
+ // types in ByteArrayViewVarHandle::Access(). Consequently we do generate intrinsic code,
+ // but it always fails the access mode check at runtime.
optimizations.SetDoNotIntrinsify();
return;
} else if (IsVarHandleGetAndBitwiseOp(invoke) && !DataType::IsIntegralType(value_type)) {
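A condensed restatement of the decision above, as a hypothetical helper (illustrative, not ART's actual API): getAndAdd is only meaningful for numeric values, so references and booleans are never intrinsified and the call is left to the runtime, which rejects the access mode.

  // Hypothetical predicate mirroring the check in DecideVarHandleIntrinsic.
  // References and booleans cannot be added, so the intrinsic is suppressed.
  bool CanIntrinsifyGetAndAdd(DataType::Type value_type) {
    return value_type != DataType::Type::kReference &&
           value_type != DataType::Type::kBool;
  }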
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index c716201313..66d0743022 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -3224,6 +3224,14 @@ void IntrinsicCodeGeneratorX86_64::VisitMathMultiplyHigh(HInvoke* invoke) {
__ imulq(y);
}
+enum class GetAndUpdateOp {
+ kSet,
+ kAdd,
+ kBitwiseAnd,
+ kBitwiseOr,
+ kBitwiseXor
+};
+
class VarHandleSlowPathX86_64 : public IntrinsicSlowPathX86_64 {
public:
explicit VarHandleSlowPathX86_64(HInvoke* invoke)
@@ -3238,6 +3246,18 @@ class VarHandleSlowPathX86_64 : public IntrinsicSlowPathX86_64 {
is_atomic_ = is_atomic;
}
+ void SetNeedAnyStoreBarrier(bool need_any_store_barrier) {
+ need_any_store_barrier_ = need_any_store_barrier;
+ }
+
+ void SetNeedAnyAnyBarrier(bool need_any_any_barrier) {
+ need_any_any_barrier_ = need_any_any_barrier;
+ }
+
+ void SetGetAndUpdateOp(GetAndUpdateOp get_and_update_op) {
+ get_and_update_op_ = get_and_update_op;
+ }
+
Label* GetByteArrayViewCheckLabel() {
return &byte_array_view_check_label_;
}
@@ -3270,6 +3290,9 @@ class VarHandleSlowPathX86_64 : public IntrinsicSlowPathX86_64 {
// Arguments forwarded to specific methods.
bool is_volatile_;
bool is_atomic_;
+ bool need_any_store_barrier_;
+ bool need_any_any_barrier_;
+ GetAndUpdateOp get_and_update_op_;
};
static void GenerateMathFma(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
@@ -3641,19 +3664,6 @@ static bool HasVarHandleIntrinsicImplementation(HInvoke* invoke) {
size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
DCHECK_LE(expected_coordinates_count, 2u); // Filtered by the `DoNotIntrinsify` flag above.
- if (expected_coordinates_count > 1u) {
- switch (mirror::VarHandle::GetAccessModeTemplateByIntrinsic(invoke->GetIntrinsic())) {
- case mirror::VarHandle::AccessModeTemplate::kGet:
- case mirror::VarHandle::AccessModeTemplate::kSet:
- case mirror::VarHandle::AccessModeTemplate::kCompareAndSet:
- case mirror::VarHandle::AccessModeTemplate::kCompareAndExchange:
- break;
- default:
- // TODO: Add support for all intrinsics.
- return false;
- }
- }
-
return true;
}
@@ -4037,14 +4047,6 @@ void IntrinsicCodeGeneratorX86_64::VisitVarHandleCompareAndExchangeRelease(HInvo
GenerateVarHandleCompareAndSetOrExchange(invoke, codegen_, /*is_cmpxchg=*/ true);
}
-enum class GetAndUpdateOp {
- kSet,
- kAdd,
- kBitwiseAnd,
- kBitwiseOr,
- kBitwiseXor
-};
-
static void CreateVarHandleGetAndSetLocations(HInvoke* invoke) {
if (!HasVarHandleIntrinsicImplementation(invoke)) {
return;
@@ -4079,16 +4081,15 @@ static void CreateVarHandleGetAndSetLocations(HInvoke* invoke) {
}
}
-static void GenerateVarHandleGetAndSet(CodeGeneratorX86_64* codegen,
- HInvoke* invoke,
- LocationSummary* locations,
- uint32_t value_index,
+static void GenerateVarHandleGetAndSet(HInvoke* invoke,
+ CodeGeneratorX86_64* codegen,
+ Location value,
DataType::Type type,
Address field_addr,
- CpuRegister ref) {
+ CpuRegister ref,
+ bool byte_swap) {
X86_64Assembler* assembler = codegen->GetAssembler();
-
- Location value = locations->InAt(value_index);
+ LocationSummary* locations = invoke->GetLocations();
Location out = locations->Out();
uint32_t temp_count = locations->GetTempCount();
@@ -4098,15 +4099,24 @@ static void GenerateVarHandleGetAndSet(CodeGeneratorX86_64* codegen,
Location temp = locations->GetTemp(temp_count - 1);
codegen->Move(temp, value);
bool is64bit = (type == DataType::Type::kFloat64);
+ DataType::Type bswap_type = is64bit ? DataType::Type::kUint64 : DataType::Type::kUint32;
+ if (byte_swap) {
+ codegen->GetInstructionCodegen()->Bswap(temp, bswap_type);
+ }
if (is64bit) {
__ xchgq(temp.AsRegister<CpuRegister>(), field_addr);
} else {
__ xchgl(temp.AsRegister<CpuRegister>(), field_addr);
}
+ if (byte_swap) {
+ codegen->GetInstructionCodegen()->Bswap(temp, bswap_type);
+ }
__ movd(out.AsFpuRegister<XmmRegister>(), temp.AsRegister<CpuRegister>(), is64bit);
} else if (type == DataType::Type::kReference) {
// `getAndSet` for references: load reference and atomically exchange it with the field.
// Output register is the same as the one holding new value, so no need to move the result.
+ DCHECK(!byte_swap);
+
CpuRegister temp1 = locations->GetTemp(temp_count - 1).AsRegister<CpuRegister>();
CpuRegister temp2 = locations->GetTemp(temp_count - 2).AsRegister<CpuRegister>();
CpuRegister valreg = value.AsRegister<CpuRegister>();
@@ -4139,6 +4149,9 @@ static void GenerateVarHandleGetAndSet(CodeGeneratorX86_64* codegen,
} else {
// `getAndSet` for integral types: atomically exchange the new value with the field. Output
// register is the same as the one holding new value. Do sign extend / zero extend as needed.
+ if (byte_swap) {
+ codegen->GetInstructionCodegen()->Bswap(value, type);
+ }
CpuRegister valreg = value.AsRegister<CpuRegister>();
DCHECK_EQ(valreg, out.AsRegister<CpuRegister>());
switch (type) {
@@ -4171,6 +4184,9 @@ static void GenerateVarHandleGetAndSet(CodeGeneratorX86_64* codegen,
DCHECK(false) << "unexpected type in getAndSet intrinsic";
UNREACHABLE();
}
+ if (byte_swap) {
+ codegen->GetInstructionCodegen()->Bswap(value, type);
+ }
}
}
@@ -4191,64 +4207,96 @@ static void CreateVarHandleGetAndBitwiseOpLocations(HInvoke* invoke) {
// A temporary to compute the bitwise operation on the old and the new values.
locations->AddTemp(Location::RequiresRegister());
+ // We need the value to be either in a register or a 32-bit constant (as there are no
+ // arithmetic instructions that accept a 64-bit immediate on x86_64).
+ locations->SetInAt(new_value_index, DataType::Is64BitType(type)
+ ? Location::RequiresRegister()
+ : Location::RegisterOrConstant(invoke->InputAt(new_value_index)));
// Output is in RAX to accommodate CMPXCHG. It is also used as a temporary.
locations->SetOut(Location::RegisterLocation(RAX));
}
-static void GenerateVarHandleGetAndBitwiseOp(CodeGeneratorX86_64* codegen,
- LocationSummary* locations,
- uint32_t value_index,
- DataType::Type type,
- Address field_addr,
- GetAndUpdateOp get_and_update_op) {
+static void GenerateVarHandleGetAndOp(HInvoke* invoke,
+ CodeGeneratorX86_64* codegen,
+ Location value,
+ DataType::Type type,
+ Address field_addr,
+ GetAndUpdateOp get_and_update_op,
+ bool byte_swap) {
X86_64Assembler* assembler = codegen->GetAssembler();
-
- Location value = locations->InAt(value_index);
+ LocationSummary* locations = invoke->GetLocations();
Location temp_loc = locations->GetTemp(locations->GetTempCount() - 1);
+ Location rax_loc = locations->Out();
CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
- CpuRegister rax = locations->Out().AsRegister<CpuRegister>();
+ CpuRegister rax = rax_loc.AsRegister<CpuRegister>();
DCHECK_EQ(rax.AsRegister(), RAX);
+ bool is64Bit = DataType::Is64BitType(type);
NearLabel retry;
__ Bind(&retry);
- // Load field value into register.
+ // Load field value into RAX and copy it into a temporary register for the operation.
codegen->LoadFromMemoryNoReference(type, Location::RegisterLocation(RAX), field_addr);
+ codegen->Move(temp_loc, rax_loc);
+ if (byte_swap) {
+ // Byte swap the temporary, since we need to perform the operation in native endianness.
+ codegen->GetInstructionCodegen()->Bswap(temp_loc, type);
+ }
- // Move the new value into a temporary register.
- codegen->Move(temp_loc, value);
+ DCHECK(!value.IsConstant() || !is64Bit);
+ int32_t const_value = value.IsConstant()
+ ? CodeGenerator::GetInt32ValueOf(value.GetConstant())
+ : 0;
- // Use temporary register as first operand and destination of the bitwise operations, as the
- // operation is symmetric and we need the old value in RAX for CMPXCHG. Use 32-bit registers for
- // 8/16/32-bit types to save on the REX prefix that might be needed for 64 bits.
- bool is64Bit = DataType::Is64BitType(type);
+ // Use 32-bit registers for 8/16/32-bit types to save on the REX prefix.
switch (get_and_update_op) {
+ case GetAndUpdateOp::kAdd:
+ DCHECK(byte_swap); // The non-byte-swapping path should use a faster XADD instruction.
+ if (is64Bit) {
+ __ addq(temp, value.AsRegister<CpuRegister>());
+ } else if (value.IsConstant()) {
+ __ addl(temp, Immediate(const_value));
+ } else {
+ __ addl(temp, value.AsRegister<CpuRegister>());
+ }
+ break;
case GetAndUpdateOp::kBitwiseAnd:
if (is64Bit) {
- __ andq(temp, rax);
+ __ andq(temp, value.AsRegister<CpuRegister>());
+ } else if (value.IsConstant()) {
+ __ andl(temp, Immediate(const_value));
} else {
- __ andl(temp, rax);
+ __ andl(temp, value.AsRegister<CpuRegister>());
}
break;
case GetAndUpdateOp::kBitwiseOr:
if (is64Bit) {
- __ orq(temp, rax);
+ __ orq(temp, value.AsRegister<CpuRegister>());
+ } else if (value.IsConstant()) {
+ __ orl(temp, Immediate(const_value));
} else {
- __ orl(temp, rax);
+ __ orl(temp, value.AsRegister<CpuRegister>());
}
break;
case GetAndUpdateOp::kBitwiseXor:
if (is64Bit) {
- __ xorq(temp, rax);
+ __ xorq(temp, value.AsRegister<CpuRegister>());
+ } else if (value.IsConstant()) {
+ __ xorl(temp, Immediate(const_value));
} else {
- __ xorl(temp, rax);
+ __ xorl(temp, value.AsRegister<CpuRegister>());
}
break;
default:
- DCHECK(false) << "expected bitwise operation";
+ DCHECK(false) << "unexpected operation";
UNREACHABLE();
}
+ if (byte_swap) {
+ // RAX still contains the original value, but we need to byte swap the temporary back.
+ codegen->GetInstructionCodegen()->Bswap(temp_loc, type);
+ }
+
switch (type) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
@@ -4274,23 +4322,10 @@ static void GenerateVarHandleGetAndBitwiseOp(CodeGeneratorX86_64* codegen,
__ j(kNotZero, &retry);
- // Sign-extend or zero-extend the result if necessary. Output is in RAX after CMPXCHG.
- switch (type) {
- case DataType::Type::kBool:
- case DataType::Type::kUint8:
- __ movzxb(rax, rax);
- break;
- case DataType::Type::kInt8:
- __ movsxb(rax, rax);
- break;
- case DataType::Type::kUint16:
- __ movzxw(rax, rax);
- break;
- case DataType::Type::kInt16:
- __ movsxw(rax, rax);
- break;
- default:
- break;
+ // The result is in RAX after CMPXCHG. Byte swap if necessary, but do not sign/zero extend,
+ // as it has already been done by `LoadFromMemoryNoReference` above (and not altered by CMPXCHG).
+ if (byte_swap) {
+ codegen->GetInstructionCodegen()->Bswap(rax_loc, type);
}
}
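The sequence above is the classic compare-exchange retry loop: load the old value into RAX, apply the operation on a temporary (byte-swapping around it when the view is not in native byte order), and LOCK CMPXCHG until no other thread has raced. A rough plain-C++ equivalent for a big-endian 32-bit view (a sketch under those assumptions, not ART code):

  #include <atomic>
  #include <cstdint>

  uint32_t GetAndAddBigEndian(std::atomic<uint32_t>* field, uint32_t arg) {
    uint32_t old_storage = field->load();
    uint32_t desired;
    do {
      // Swap to native order, apply the operation, swap back to storage order.
      desired = __builtin_bswap32(__builtin_bswap32(old_storage) + arg);
      // On failure compare_exchange_weak reloads old_storage, mirroring the
      // LOCK CMPXCHG + retry jump in the generated code.
    } while (!field->compare_exchange_weak(old_storage, desired));
    return __builtin_bswap32(old_storage);  // Old value, in native order.
  }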
@@ -4312,92 +4347,150 @@ static void CreateVarHandleGetAndAddLocations(HInvoke* invoke) {
locations->SetInAt(new_value_index, Location::RequiresFpuRegister());
// CMPXCHG clobbers RAX.
locations->AddTemp(Location::RegisterLocation(RAX));
- // A temporary to hold the new value for CMPXCHG.
- locations->AddTemp(Location::RequiresRegister());
// An FP temporary to load the old value from the field and perform FP addition.
locations->AddTemp(Location::RequiresFpuRegister());
+ // A temporary to hold the new value for CMPXCHG.
+ locations->AddTemp(Location::RequiresRegister());
} else {
DCHECK_NE(type, DataType::Type::kReference);
// Use the same register for both the new value and output to take advantage of XADD.
- // It doesn't have to be RAX, but we need to choose some to make sure it's the same.
+ // It should be RAX, because the byte-swapping path of GenerateVarHandleGetAndAdd falls
+ // back to GenerateVarHandleGetAndOp, which expects the output in RAX.
locations->SetOut(Location::RegisterLocation(RAX));
locations->SetInAt(new_value_index, Location::RegisterLocation(RAX));
+ if (GetExpectedVarHandleCoordinatesCount(invoke) == 2) {
+ // For byte array views with non-native endianness we need extra BSWAP operations, so we
+ // cannot use XADD and have to fall back to a generic implementation based on CMPXCHG. In
+ // that case we need two temporary registers: one to hold the value instead of RAX (which
+ // may get clobbered by repeated CMPXCHG) and one for performing the operation. At compile
+ // time we cannot distinguish this case from arrays or native-endian byte array views.
+ locations->AddTemp(Location::RequiresRegister());
+ locations->AddTemp(Location::RequiresRegister());
+ }
}
}
-static void GenerateVarHandleGetAndAdd(CodeGeneratorX86_64* codegen,
- LocationSummary* locations,
- uint32_t value_index,
+static void GenerateVarHandleGetAndAdd(HInvoke* invoke,
+ CodeGeneratorX86_64* codegen,
+ Location value,
DataType::Type type,
- Address field_addr) {
+ Address field_addr,
+ bool byte_swap) {
X86_64Assembler* assembler = codegen->GetAssembler();
-
- Location value = locations->InAt(value_index);
+ LocationSummary* locations = invoke->GetLocations();
Location out = locations->Out();
uint32_t temp_count = locations->GetTempCount();
if (DataType::IsFloatingPointType(type)) {
+ if (byte_swap) {
+ // This code should never be executed: it is the case of a byte array view (since it requires
+ // a byte swap), and varhandles for byte array views support numeric atomic update access
+ // modes only for int and long, but not for floating-point types (see the javadoc comments
+ // for java.lang.invoke.MethodHandles.byteArrayViewVarHandle()). But the ART varhandle
+ // implementation for byte array views treats floating-point types as numeric types in
+ // ByteArrayViewVarHandle::Access(). Therefore we do generate intrinsic code, but it always
+ // fails the access mode check at runtime prior to reaching this point. The illegal
+ // instruction UD2 ensures that if control flow gets here by mistake, we will notice.
+ __ ud2();
+ }
+
// `getAndAdd` for floating-point types: load the old FP value into a temporary FP register and
// in RAX for CMPXCHG, add the new FP value to the old one, move it to a non-FP temporary for
// CMPXCHG and loop until CMPXCHG succeeds. Move the result from RAX to the output FP register.
bool is64bit = (type == DataType::Type::kFloat64);
- XmmRegister fptemp = locations->GetTemp(temp_count - 1).AsFpuRegister<XmmRegister>();
- CpuRegister temp = locations->GetTemp(temp_count - 2).AsRegister<CpuRegister>();
+ DataType::Type bswap_type = is64bit ? DataType::Type::kUint64 : DataType::Type::kUint32;
+ XmmRegister fptemp = locations->GetTemp(temp_count - 2).AsFpuRegister<XmmRegister>();
+ Location rax_loc = Location::RegisterLocation(RAX);
+ Location temp_loc = locations->GetTemp(temp_count - 1);
+ CpuRegister temp = temp_loc.AsRegister<CpuRegister>();
NearLabel retry;
__ Bind(&retry);
+
+ // Read the value from memory into an FP register and copy it into RAX.
if (is64bit) {
__ movsd(fptemp, field_addr);
- __ movd(CpuRegister(RAX), fptemp, is64bit);
- __ addsd(fptemp, value.AsFpuRegister<XmmRegister>());
- __ movd(temp, fptemp, is64bit);
- __ LockCmpxchgq(field_addr, temp);
} else {
__ movss(fptemp, field_addr);
- __ movd(CpuRegister(RAX), fptemp, is64bit);
+ }
+ __ movd(CpuRegister(RAX), fptemp, is64bit);
+ // If necessary, byte swap RAX and update the value in the FP register to also be byte-swapped.
+ if (byte_swap) {
+ codegen->GetInstructionCodegen()->Bswap(rax_loc, bswap_type);
+ __ movd(fptemp, CpuRegister(RAX), is64bit);
+ }
+ // Perform the FP addition and move it to a temporary register to prepare for CMPXCHG.
+ if (is64bit) {
+ __ addsd(fptemp, value.AsFpuRegister<XmmRegister>());
+ } else {
__ addss(fptemp, value.AsFpuRegister<XmmRegister>());
- __ movd(temp, fptemp, is64bit);
+ }
+ __ movd(temp, fptemp, is64bit);
+ // If necessary, byte swap RAX before CMPXCHG and the temporary before copying it to the FP register.
+ if (byte_swap) {
+ codegen->GetInstructionCodegen()->Bswap(temp_loc, bswap_type);
+ codegen->GetInstructionCodegen()->Bswap(rax_loc, bswap_type);
+ }
+ if (is64bit) {
+ __ LockCmpxchgq(field_addr, temp);
+ } else {
__ LockCmpxchgl(field_addr, temp);
}
+
__ j(kNotZero, &retry);
- // The old value is in RAX.
+ // The old value is in RAX; byte swap it if necessary.
+ if (byte_swap) {
+ codegen->GetInstructionCodegen()->Bswap(rax_loc, bswap_type);
+ }
__ movd(out.AsFpuRegister<XmmRegister>(), CpuRegister(RAX), is64bit);
} else {
- // `getAndAdd` for integral types: atomically exchange the new value with the field and add the
- // old value to the field. Output register is the same as the one holding new value. Do sign
- // extend / zero extend as needed.
- CpuRegister valreg = value.AsRegister<CpuRegister>();
- DCHECK_EQ(valreg, out.AsRegister<CpuRegister>());
- switch (type) {
- case DataType::Type::kBool:
- case DataType::Type::kUint8:
- __ LockXaddb(field_addr, valreg);
- __ movzxb(valreg, valreg);
- break;
- case DataType::Type::kInt8:
- __ LockXaddb(field_addr, valreg);
- __ movsxb(valreg, valreg);
- break;
- case DataType::Type::kUint16:
- __ LockXaddw(field_addr, valreg);
- __ movzxw(valreg, valreg);
- break;
- case DataType::Type::kInt16:
- __ LockXaddw(field_addr, valreg);
- __ movsxw(valreg, valreg);
- break;
- case DataType::Type::kInt32:
- case DataType::Type::kUint32:
- __ LockXaddl(field_addr, valreg);
- break;
- case DataType::Type::kInt64:
- case DataType::Type::kUint64:
- __ LockXaddq(field_addr, valreg);
- break;
- default:
- DCHECK(false) << "unexpected type in getAndAdd intrinsic";
- UNREACHABLE();
+ if (byte_swap) {
+ // We cannot use XADD since we need to byte-swap the old value when reading it from memory,
+ // and then byte-swap the sum before writing it to memory. So fall back to the slower
+ // generic implementation that is also used for bitwise operations.
+ // Move the value from RAX to a temporary, as RAX may get clobbered by repeated CMPXCHG.
+ DCHECK_EQ(GetExpectedVarHandleCoordinatesCount(invoke), 2u);
+ Location temp = locations->GetTemp(temp_count - 2);
+ codegen->Move(temp, value);
+ GenerateVarHandleGetAndOp(
+ invoke, codegen, temp, type, field_addr, GetAndUpdateOp::kAdd, byte_swap);
+ } else {
+ // `getAndAdd` for integral types: atomically exchange the new value with the field and add
+ // the old value to the field. Output register is the same as the one holding new value. Do
+ // sign extend / zero extend as needed.
+ CpuRegister valreg = value.AsRegister<CpuRegister>();
+ DCHECK_EQ(valreg, out.AsRegister<CpuRegister>());
+ switch (type) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ __ LockXaddb(field_addr, valreg);
+ __ movzxb(valreg, valreg);
+ break;
+ case DataType::Type::kInt8:
+ __ LockXaddb(field_addr, valreg);
+ __ movsxb(valreg, valreg);
+ break;
+ case DataType::Type::kUint16:
+ __ LockXaddw(field_addr, valreg);
+ __ movzxw(valreg, valreg);
+ break;
+ case DataType::Type::kInt16:
+ __ LockXaddw(field_addr, valreg);
+ __ movsxw(valreg, valreg);
+ break;
+ case DataType::Type::kInt32:
+ case DataType::Type::kUint32:
+ __ LockXaddl(field_addr, valreg);
+ break;
+ case DataType::Type::kInt64:
+ case DataType::Type::kUint64:
+ __ LockXaddq(field_addr, valreg);
+ break;
+ default:
+ DCHECK(false) << "unexpected type in getAndAdd intrinsic";
+ UNREACHABLE();
+ }
}
}
}
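Taken together: the integral fast path is a single LOCK XADD (what std::atomic::fetch_add lowers to on x86-64), while the floating-point path has no atomic FP add and must round-trip through the integer bit pattern and CMPXCHG. Both ideas for the 32-bit case in plain C++ (illustrative sketches, not ART code):

  #include <atomic>
  #include <cstdint>
  #include <cstring>

  // Integral fast path: fetch_add compiles to LOCK XADD; the new value goes
  // in and the old value comes back in the same register.
  uint32_t GetAndAddInt(std::atomic<uint32_t>* field, uint32_t delta) {
    return field->fetch_add(delta);
  }

  // FP path: add on a copy of the bit pattern and publish with a CAS loop,
  // like the MOVD / ADDSS / LOCK CMPXCHG sequence above.
  float GetAndAddFloat(std::atomic<uint32_t>* field, float arg) {
    uint32_t old_bits = field->load();
    float old_value;
    do {
      std::memcpy(&old_value, &old_bits, sizeof(old_value));  // MOVD, roughly.
      float new_value = old_value + arg;                      // ADDSS.
      uint32_t new_bits;
      std::memcpy(&new_bits, &new_value, sizeof(new_bits));
      // On failure compare_exchange_weak refreshes old_bits and we retry.
    } while (!field->compare_exchange_weak(old_bits, new_bits));
    return old_value;  // The field's value before the addition.
  }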
@@ -4406,19 +4499,27 @@ static void GenerateVarHandleGetAndUpdate(HInvoke* invoke,
CodeGeneratorX86_64* codegen,
GetAndUpdateOp get_and_update_op,
bool need_any_store_barrier,
- bool need_any_any_barrier) {
+ bool need_any_any_barrier,
+ bool byte_swap = false) {
DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
X86_64Assembler* assembler = codegen->GetAssembler();
LocationSummary* locations = invoke->GetLocations();
uint32_t number_of_arguments = invoke->GetNumberOfArguments();
- uint32_t value_index = number_of_arguments - 1;
+ Location value = locations->InAt(number_of_arguments - 1);
DataType::Type type = invoke->GetType();
- VarHandleSlowPathX86_64* slow_path = GenerateVarHandleChecks(invoke, codegen, type);
+ VarHandleSlowPathX86_64* slow_path = nullptr;
VarHandleTarget target = GetVarHandleTarget(invoke);
- GenerateVarHandleTarget(invoke, target, codegen);
+ if (!byte_swap) {
+ slow_path = GenerateVarHandleChecks(invoke, codegen, type);
+ slow_path->SetGetAndUpdateOp(get_and_update_op);
+ slow_path->SetNeedAnyStoreBarrier(need_any_store_barrier);
+ slow_path->SetNeedAnyAnyBarrier(need_any_any_barrier);
+ GenerateVarHandleTarget(invoke, target, codegen);
+ __ Bind(slow_path->GetNativeByteOrderLabel());
+ }
CpuRegister ref(target.object);
Address field_addr(ref, CpuRegister(target.offset), TIMES_1, 0);
@@ -4429,16 +4530,16 @@ static void GenerateVarHandleGetAndUpdate(HInvoke* invoke,
switch (get_and_update_op) {
case GetAndUpdateOp::kSet:
- GenerateVarHandleGetAndSet(codegen, invoke, locations, value_index, type, field_addr, ref);
+ GenerateVarHandleGetAndSet(invoke, codegen, value, type, field_addr, ref, byte_swap);
break;
case GetAndUpdateOp::kAdd:
- GenerateVarHandleGetAndAdd(codegen, locations, value_index, type, field_addr);
+ GenerateVarHandleGetAndAdd(invoke, codegen, value, type, field_addr, byte_swap);
break;
case GetAndUpdateOp::kBitwiseAnd:
case GetAndUpdateOp::kBitwiseOr:
case GetAndUpdateOp::kBitwiseXor:
- GenerateVarHandleGetAndBitwiseOp(
- codegen, locations, value_index, type, field_addr, get_and_update_op);
+ GenerateVarHandleGetAndOp(
+ invoke, codegen, value, type, field_addr, get_and_update_op, byte_swap);
break;
}
@@ -4446,7 +4547,9 @@ static void GenerateVarHandleGetAndUpdate(HInvoke* invoke,
codegen->GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
}
- __ Bind(slow_path->GetExitLabel());
+ if (!byte_swap) {
+ __ Bind(slow_path->GetExitLabel());
+ }
}
void IntrinsicLocationsBuilderX86_64::VisitVarHandleGetAndSet(HInvoke* invoke) {
@@ -4722,9 +4825,14 @@ void VarHandleSlowPathX86_64::EmitByteArrayViewCode(CodeGeneratorX86_64* codegen
GenerateVarHandleCompareAndSetOrExchange(
invoke, codegen, /*is_cmpxchg=*/ true, /*byte_swap=*/ true);
break;
- default:
- DCHECK(false);
- UNREACHABLE();
+ case mirror::VarHandle::AccessModeTemplate::kGetAndUpdate:
+ GenerateVarHandleGetAndUpdate(invoke,
+ codegen,
+ get_and_update_op_,
+ need_any_store_barrier_,
+ need_any_any_barrier_,
+ /*byte_swap=*/ true);
+ break;
}
__ jmp(GetExitLabel());