author     2023-11-20 10:06:45 +0000
committer  2023-11-24 11:52:40 +0000
commit     aeefe81cc03267d7ba9a6bfaf8daef91fbd33aa0 (patch)
tree       9ea7db94ab2a5b454d42c04200a8efa44580372c /compiler/optimizing
parent     0ac7d570c99ddb1a22429a2f51e05d48a26daa36 (diff)
riscv64: Implement VarHandle.GetAndUpdate intrinsics.
Also fix `GenerateCompareAndSet()` to avoid ANDN inside
an LR/SC sequence, as ANDN is not part of the
base "I" instruction set.
Test: testrunner.py --target --64 --ndebug --optimizing
Bug: 283082089
Change-Id: I09caa0486d9bedf93a40f0f15cab1e6bef19969c
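
For context on the `GenerateCompareAndSet()` fix: the LR/SC sequences in this change restrict themselves to base "I" instructions, so the masked merge is done with XOR/AND/XOR instead of ANDN+OR. A minimal sketch of the identity being used (plain C++, hypothetical helper name, not part of the change itself):

    #include <cstdint>

    // ((old ^ upd) & mask) ^ old keeps the bits of `old` where `mask` is 0 and
    // takes the bits of `upd` where `mask` is 1 -- the same result as
    // (old & ~mask) | (upd & mask), but without materializing ~mask or using ANDN.
    uint32_t MergeMasked(uint32_t old_value, uint32_t updated, uint32_t mask) {
      uint32_t tmp = old_value ^ updated;  // bits that differ between old and updated
      tmp &= mask;                         // keep only the lanes being replaced
      return tmp ^ old_value;              // flip exactly those bits in the old value
    }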
Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_riscv64.cc |   2
-rw-r--r--  compiler/optimizing/code_generator_riscv64.h  |  20
-rw-r--r--  compiler/optimizing/intrinsics_riscv64.cc     | 678
3 files changed, 629 insertions, 71 deletions
diff --git a/compiler/optimizing/code_generator_riscv64.cc b/compiler/optimizing/code_generator_riscv64.cc
index 5006371377..fb44abce55 100644
--- a/compiler/optimizing/code_generator_riscv64.cc
+++ b/compiler/optimizing/code_generator_riscv64.cc
@@ -786,7 +786,7 @@ inline void InstructionCodeGeneratorRISCV64::FpBinOp(
   }
 }
 
-inline void InstructionCodeGeneratorRISCV64::FAdd(
+void InstructionCodeGeneratorRISCV64::FAdd(
     FRegister rd, FRegister rs1, FRegister rs2, DataType::Type type) {
   FpBinOp<FRegister, &Riscv64Assembler::FAddS, &Riscv64Assembler::FAddD>(rd, rs1, rs2, type);
 }
diff --git a/compiler/optimizing/code_generator_riscv64.h b/compiler/optimizing/code_generator_riscv64.h
index b1c45dea82..20c48db12c 100644
--- a/compiler/optimizing/code_generator_riscv64.h
+++ b/compiler/optimizing/code_generator_riscv64.h
@@ -160,21 +160,6 @@ static constexpr int32_t kFClassNaNMinValue = 0x100;
   V(CRC32UpdateByteBuffer) \
   V(MethodHandleInvokeExact) \
   V(MethodHandleInvoke) \
-  V(VarHandleGetAndAdd) \
-  V(VarHandleGetAndAddAcquire) \
-  V(VarHandleGetAndAddRelease) \
-  V(VarHandleGetAndBitwiseAnd) \
-  V(VarHandleGetAndBitwiseAndAcquire) \
-  V(VarHandleGetAndBitwiseAndRelease) \
-  V(VarHandleGetAndBitwiseOr) \
-  V(VarHandleGetAndBitwiseOrAcquire) \
-  V(VarHandleGetAndBitwiseOrRelease) \
-  V(VarHandleGetAndBitwiseXor) \
-  V(VarHandleGetAndBitwiseXorAcquire) \
-  V(VarHandleGetAndBitwiseXorRelease) \
-  V(VarHandleGetAndSet) \
-  V(VarHandleGetAndSetAcquire) \
-  V(VarHandleGetAndSetRelease) \
   V(ByteValueOf) \
   V(ShortValueOf) \
   V(CharacterValueOf) \
@@ -183,6 +168,9 @@ static constexpr int32_t kFClassNaNMinValue = 0x100;
 // Method register on invoke.
 static const XRegister kArtMethodRegister = A0;
 
+// Helper used by codegen as well as intrinsics.
+XRegister InputXRegisterOrZero(Location location);
+
 class CodeGeneratorRISCV64;
 
 class InvokeRuntimeCallingConvention : public CallingConvention<XRegister, FRegister> {
@@ -367,6 +355,7 @@ class InstructionCodeGeneratorRISCV64 : public InstructionCodeGenerator {
 
   void GenerateMemoryBarrier(MemBarrierKind kind);
 
+  void FAdd(FRegister rd, FRegister rs1, FRegister rs2, DataType::Type type);
   void FClass(XRegister rd, FRegister rs1, DataType::Type type);
 
   void Load(Location out, XRegister rs1, int32_t offset, DataType::Type type);
@@ -473,7 +462,6 @@ class InstructionCodeGeneratorRISCV64 : public InstructionCodeGenerator {
             void (Riscv64Assembler::*opS)(Reg, FRegister, FRegister),
             void (Riscv64Assembler::*opD)(Reg, FRegister, FRegister)>
   void FpBinOp(Reg rd, FRegister rs1, FRegister rs2, DataType::Type type);
-  void FAdd(FRegister rd, FRegister rs1, FRegister rs2, DataType::Type type);
   void FSub(FRegister rd, FRegister rs1, FRegister rs2, DataType::Type type);
   void FDiv(FRegister rd, FRegister rs1, FRegister rs2, DataType::Type type);
   void FMul(FRegister rd, FRegister rs1, FRegister rs2, DataType::Type type);
diff --git a/compiler/optimizing/intrinsics_riscv64.cc b/compiler/optimizing/intrinsics_riscv64.cc
index 225809d58a..fb3f11357a 100644
--- a/compiler/optimizing/intrinsics_riscv64.cc
+++ b/compiler/optimizing/intrinsics_riscv64.cc
@@ -553,6 +553,35 @@ void IntrinsicCodeGeneratorRISCV64::VisitStringIndexOfAfter(HInvoke* invoke) {
   GenerateVisitStringIndexOf(invoke, GetAssembler(), codegen_, /* start_at_zero= */ false);
 }
 
+std::pair<AqRl, AqRl> GetLrScAqRl(std::memory_order order) {
+  AqRl load_aqrl = AqRl::kNone;
+  AqRl store_aqrl = AqRl::kNone;
+  if (order == std::memory_order_acquire) {
+    load_aqrl = AqRl::kAcquire;
+  } else if (order == std::memory_order_release) {
+    store_aqrl = AqRl::kRelease;
+  } else if (order == std::memory_order_seq_cst) {
+    load_aqrl = AqRl::kAqRl;
+    store_aqrl = AqRl::kRelease;
+  } else {
+    DCHECK(order == std::memory_order_relaxed);
+  }
+  return {load_aqrl, store_aqrl};
+}
+
+AqRl GetAmoAqRl(std::memory_order order) {
+  AqRl amo_aqrl = AqRl::kNone;
+  if (order == std::memory_order_acquire) {
+    amo_aqrl = AqRl::kAcquire;
+  } else if (order == std::memory_order_release) {
+    amo_aqrl = AqRl::kRelease;
+  } else {
+    DCHECK(order == std::memory_order_seq_cst);
+    amo_aqrl = AqRl::kAqRl;
+  }
+  return amo_aqrl;
+}
+
 static void EmitLoadReserved(Riscv64Assembler* assembler,
                              DataType::Type type,
                              XRegister ptr,
@@ -620,18 +649,7 @@ static void GenerateCompareAndSet(Riscv64Assembler* assembler,
   // from the main path attempt to emit CAS when the marked old value matched `expected`.
   DCHECK_IMPLIES(expected2 != kNoXRegister, type == DataType::Type::kReference);
 
-  AqRl load_aqrl = AqRl::kNone;
-  AqRl store_aqrl = AqRl::kNone;
-  if (order == std::memory_order_acquire) {
-    load_aqrl = AqRl::kAcquire;
-  } else if (order == std::memory_order_release) {
-    store_aqrl = AqRl::kRelease;
-  } else if (order == std::memory_order_seq_cst) {
-    load_aqrl = AqRl::kAqRl;
-    store_aqrl = AqRl::kRelease;
-  } else {
-    DCHECK(order == std::memory_order_relaxed);
-  }
+  auto [load_aqrl, store_aqrl] = GetLrScAqRl(order);
 
   // repeat: {
   //   old_value = [ptr];  // Load exclusive.
@@ -665,9 +683,9 @@ static void GenerateCompareAndSet(Riscv64Assembler* assembler,
     // The `old_value` does not need to be preserved as the caller shall use `masked`
     // to return the old value if needed.
     to_store = old_value;
-    // TODO(riscv64): We could XOR the old and new value before the loop and use XOR here
-    // instead of the ANDN+OR. (The `new_value` is either Zero or a temporary we can clobber.)
-    __ Andn(to_store, old_value, mask);
+    // TODO(riscv64): We could XOR the old and new value before the loop and use a single XOR here
+    // instead of the XOR+OR. (The `new_value` is either Zero or a temporary we can clobber.)
+    __ Xor(to_store, old_value, masked);
     __ Or(to_store, to_store, new_value);
   } else if (expected2 != kNoXRegister) {
     Riscv64Label match2;
@@ -846,12 +864,109 @@ class ReadBarrierCasSlowPathRISCV64 : public SlowPathCodeRISCV64 {
 enum class GetAndUpdateOp {
   kSet,
   kAdd,
-  kAddWithByteSwap,
   kAnd,
   kOr,
   kXor
 };
 
+// Generate a GetAndUpdate operation.
+//
+// Only 32-bit and 64-bit atomics are currently supported, therefore smaller types need
+// special handling. The caller emits code to prepare aligned `ptr` and adjusted `arg`
+// and extract the needed bits from `old_value`. For bitwise operations, no extra
+// handling is needed here. For `GetAndUpdateOp::kSet` and `GetAndUpdateOp::kAdd` we
+// also use a special LR/SC sequence that uses a `mask` to update only the desired bits.
+// Note: The `mask` must contain the bits to keep for `GetAndUpdateOp::kSet` and
+// the bits to replace for `GetAndUpdateOp::kAdd`.
+static void GenerateGetAndUpdate(CodeGeneratorRISCV64* codegen,
+                                 GetAndUpdateOp get_and_update_op,
+                                 DataType::Type type,
+                                 std::memory_order order,
+                                 XRegister ptr,
+                                 XRegister arg,
+                                 XRegister old_value,
+                                 XRegister mask,
+                                 XRegister temp) {
+  DCHECK_EQ(mask != kNoXRegister, temp != kNoXRegister);
+  DCHECK_IMPLIES(mask != kNoXRegister, type == DataType::Type::kInt32);
+  DCHECK_IMPLIES(
+      mask != kNoXRegister,
+      (get_and_update_op == GetAndUpdateOp::kSet) || (get_and_update_op == GetAndUpdateOp::kAdd));
+  Riscv64Assembler* assembler = codegen->GetAssembler();
+  AqRl amo_aqrl = GetAmoAqRl(order);
+  switch (get_and_update_op) {
+    case GetAndUpdateOp::kSet:
+      if (type == DataType::Type::kInt64) {
+        __ AmoSwapD(old_value, arg, ptr, amo_aqrl);
+      } else if (mask == kNoXRegister) {
+        DCHECK_EQ(type, DataType::Type::kInt32);
+        __ AmoSwapW(old_value, arg, ptr, amo_aqrl);
+      } else {
+        DCHECK_EQ(type, DataType::Type::kInt32);
+        DCHECK_NE(temp, kNoXRegister);
+        auto [load_aqrl, store_aqrl] = GetLrScAqRl(order);
+        Riscv64Label retry;
+        __ Bind(&retry);
+        __ LrW(old_value, ptr, load_aqrl);
+        __ And(temp, old_value, mask);
+        __ Or(temp, temp, arg);
+        __ ScW(temp, temp, ptr, store_aqrl);
+        __ Bnez(temp, &retry, /*is_bare=*/ true);  // Bare: `TMP` shall not be clobbered.
+      }
+      break;
+    case GetAndUpdateOp::kAdd:
+      if (type == DataType::Type::kInt64) {
+        __ AmoAddD(old_value, arg, ptr, amo_aqrl);
+      } else if (mask == kNoXRegister) {
+        DCHECK_EQ(type, DataType::Type::kInt32);
+        __ AmoAddW(old_value, arg, ptr, amo_aqrl);
+      } else {
+        DCHECK_EQ(type, DataType::Type::kInt32);
+        DCHECK_NE(temp, kNoXRegister);
+        auto [load_aqrl, store_aqrl] = GetLrScAqRl(order);
+        Riscv64Label retry;
+        __ Bind(&retry);
+        __ LrW(old_value, ptr, load_aqrl);
+        __ Add(temp, old_value, arg);
+        // We use `(A ^ B) ^ A == B` and with the masking `((A ^ B) & mask) ^ A`, the result
+        // contains bits from `B` for bits specified in `mask` and bits from `A` elsewhere.
+        // Note: These instructions directly depend on each other, so it's not necessarily the
+        // fastest approach but for `(A ^ ~mask) | (B & mask)` we would need an extra register for
+        // `~mask` because ANDN is not in the "I" instruction set as required for a LR/SC sequence.
+        __ Xor(temp, temp, old_value);
+        __ And(temp, temp, mask);
+        __ Xor(temp, temp, old_value);
+        __ ScW(temp, temp, ptr, store_aqrl);
+        __ Bnez(temp, &retry, /*is_bare=*/ true);  // Bare: `TMP` shall not be clobbered.
+      }
+      break;
+    case GetAndUpdateOp::kAnd:
+      if (type == DataType::Type::kInt64) {
+        __ AmoAndD(old_value, arg, ptr, amo_aqrl);
+      } else {
+        DCHECK_EQ(type, DataType::Type::kInt32);
+        __ AmoAndW(old_value, arg, ptr, amo_aqrl);
+      }
+      break;
+    case GetAndUpdateOp::kOr:
+      if (type == DataType::Type::kInt64) {
+        __ AmoOrD(old_value, arg, ptr, amo_aqrl);
+      } else {
+        DCHECK_EQ(type, DataType::Type::kInt32);
+        __ AmoOrW(old_value, arg, ptr, amo_aqrl);
+      }
+      break;
+    case GetAndUpdateOp::kXor:
+      if (type == DataType::Type::kInt64) {
+        __ AmoXorD(old_value, arg, ptr, amo_aqrl);
+      } else {
+        DCHECK_EQ(type, DataType::Type::kInt32);
+        __ AmoXorW(old_value, arg, ptr, amo_aqrl);
+      }
+      break;
+  }
+}
+
 class VarHandleSlowPathRISCV64 : public IntrinsicSlowPathRISCV64 {
  public:
   VarHandleSlowPathRISCV64(HInvoke* invoke, std::memory_order order)
@@ -1610,21 +1725,21 @@ static void CreateVarHandleCompareAndSetOrExchangeLocations(HInvoke* invoke,
   Location expected = locations->InAt(expected_index);
   Location new_value = locations->InAt(new_value_index);
   size_t data_size = DataType::Size(value_type);
-  bool small = (data_size < 4u);
-  bool byte_swap =
+  bool is_small = (data_size < 4u);
+  bool can_byte_swap =
       (expected_index == 3u) && (value_type != DataType::Type::kReference && data_size != 1u);
-  bool fp = DataType::IsFloatingPointType(value_type);
+  bool is_fp = DataType::IsFloatingPointType(value_type);
   size_t temps_needed =
       // The offset temp is used for the `tmp_ptr`.
       1u +
       // For small values, we need a temp for the `mask`, `masked` and maybe also for the `shift`.
-      (small ? (return_success ? 2u : 3u) : 0u) +
+      (is_small ? (return_success ? 2u : 3u) : 0u) +
       // Some cases need modified copies of `new_value` and `expected`.
-      (ScratchXRegisterNeeded(expected, value_type, byte_swap) ? 1u : 0u) +
-      (ScratchXRegisterNeeded(new_value, value_type, byte_swap) ? 1u : 0u) +
+      (ScratchXRegisterNeeded(expected, value_type, can_byte_swap) ? 1u : 0u) +
+      (ScratchXRegisterNeeded(new_value, value_type, can_byte_swap) ? 1u : 0u) +
       // We need a scratch register either for the old value or for the result of SC.
       // If we need to return a floating point old value, we need a temp for each.
-      ((!return_success && fp) ? 2u : 1u);
+      ((!return_success && is_fp) ? 2u : 1u);
   size_t scratch_registers_available = 2u;
   DCHECK_EQ(scratch_registers_available,
             ScratchRegisterScope(codegen->GetAssembler()).AvailableXRegisters());
@@ -1643,7 +1758,7 @@ static XRegister PrepareXRegister(CodeGeneratorRISCV64* codegen,
                                   XRegister mask,
                                   bool byte_swap,
                                   ScratchRegisterScope* srs) {
-  DCHECK_EQ(shift == kNoXRegister, mask == kNoXRegister);
+  DCHECK_IMPLIES(mask != kNoXRegister, shift != kNoXRegister);
   DCHECK_EQ(shift == kNoXRegister, DataType::Size(type) >= 4u);
   if (loc.IsConstant()) {
     // The `shift`/`mask` and `byte_swap` are irrelevant for zero input.
@@ -1671,13 +1786,35 @@ static XRegister PrepareXRegister(CodeGeneratorRISCV64* codegen,
     Riscv64Assembler* assembler = codegen->GetAssembler();
     __ Sllw(result.AsRegister<XRegister>(), loc.AsRegister<XRegister>(), shift);
     DCHECK_NE(type, DataType::Type::kUint8);
-    if (type != DataType::Type::kUint16 && type != DataType::Type::kBool) {
+    if (mask != kNoXRegister && type != DataType::Type::kUint16 && type != DataType::Type::kBool) {
       __ And(result.AsRegister<XRegister>(), result.AsRegister<XRegister>(), mask);
     }
   }
   return result.AsRegister<XRegister>();
 }
 
+static void GenerateByteSwapAndExtract(Riscv64Assembler* assembler,
+                                       Location rd,
+                                       XRegister rs1,
+                                       XRegister shift,
+                                       DataType::Type type) {
+  // Do not apply shift in `GenerateReverseBytes()` for small types.
+  DCHECK_EQ(shift != kNoXRegister, DataType::Size(type) < 4u);
+  DataType::Type swap_type = (shift != kNoXRegister) ? DataType::Type::kInt32 : type;
+  // Also handles moving to FP registers.
+  GenerateReverseBytes(assembler, rd, rs1, swap_type);
+  if (shift != kNoXRegister) {
+    DCHECK_EQ(rs1, rd.AsRegister<XRegister>());
+    __ Sllw(rs1, rs1, shift);
+    if (type == DataType::Type::kUint16) {
+      __ Srliw(rs1, rs1, 16);
+    } else {
+      DCHECK_EQ(type, DataType::Type::kInt16);
+      __ Sraiw(rs1, rs1, 16);
+    }
+  }
+}
+
 static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
                                                      CodeGeneratorRISCV64* codegen,
                                                      std::memory_order order,
@@ -1727,7 +1864,8 @@ static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
   DCHECK_EQ(target.offset, locations->GetTemp(0u).AsRegister<XRegister>());
   size_t next_temp = 1u;
   XRegister tmp_ptr = target.offset;
-  if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
+  bool is_reference = (value_type == DataType::Type::kReference);
+  if (is_reference && codegen->EmitReadBarrier()) {
     DCHECK_EQ(available_scratch_registers, 2u);
     available_scratch_registers -= 1u;
     DCHECK_EQ(expected_index, 1u + GetExpectedVarHandleCoordinatesCount(invoke));
@@ -1751,15 +1889,16 @@ static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
   XRegister mask = kNoXRegister;
   XRegister masked = kNoXRegister;
   size_t data_size = DataType::Size(value_type);
-  if (data_size < 4u) {
+  bool is_small = (data_size < 4u);
+  if (is_small) {
     // When returning "success" and not the old value, we shall not need the `shift` after
     // the raw CAS operation, so use the output register as a temporary here.
     shift = return_success ? locations->Out().AsRegister<XRegister>() : get_temp();
     mask = get_temp();
     masked = get_temp();
-    __ Andi(shift, tmp_ptr, 3);
+    // Upper bits of the shift are not used, so we do not need to clear them.
+    __ Slli(shift, tmp_ptr, WhichPowerOf2(kBitsPerByte));
     __ Andi(tmp_ptr, tmp_ptr, -4);
-    __ Slli(shift, shift, WhichPowerOf2(kBitsPerByte));
     __ Li(mask, (1 << (data_size * kBitsPerByte)) - 1);
     __ Sllw(mask, mask, shift);
   }
@@ -1770,9 +1909,10 @@ static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
       PrepareXRegister(codegen, expected, value_type, shift, mask, byte_swap, &srs);
   XRegister new_value_reg =
       PrepareXRegister(codegen, new_value, value_type, shift, mask, byte_swap, &srs);
-  DataType::Type cas_type = DataType::IsFloatingPointType(value_type)
+  bool is_fp = DataType::IsFloatingPointType(value_type);
+  DataType::Type cas_type = is_fp
       ? IntTypeForFloatingPointType(value_type)
-      : (data_size >= 4u ? value_type : DataType::Type::kInt32);
+      : (is_small ? DataType::Type::kInt32 : value_type);
 
   // Prepare registers for old value and the result of the store conditional.
   XRegister old_value;
@@ -1781,7 +1921,7 @@ static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
     // Use a temp for the old value and the output register for the store conditional result.
     old_value = get_temp();
     store_result = out.AsRegister<XRegister>();
-  } else if (DataType::IsFloatingPointType(value_type)) {
+  } else if (is_fp) {
     // We need two temporary registers.
    old_value = get_temp();
    store_result = get_temp();
@@ -1795,7 +1935,7 @@
   Riscv64Label* exit_loop = &exit_loop_label;
   Riscv64Label* cmp_failure = &exit_loop_label;
 
-  if (value_type == DataType::Type::kReference && codegen->EmitReadBarrier()) {
+  if (is_reference && codegen->EmitReadBarrier()) {
     // The `old_value_temp` is used first for marking the `old_value` and then for the unmarked
     // reloaded old value for subsequent CAS in the slow path. It cannot be a scratch register.
     XRegister old_value_temp = locations->GetTemp(next_temp).AsRegister<XRegister>();
@@ -1849,24 +1989,11 @@ static void GenerateVarHandleCompareAndSetOrExchange(HInvoke* invoke,
   if (return_success) {
     // Nothing to do, the result register already contains 1 on success and 0 on failure.
   } else if (byte_swap) {
-    // Do not apply shift in `GenerateReverseBytes()` for small types.
-    DataType::Type swap_type = data_size < 4u ? DataType::Type::kInt32 : value_type;
-    // Also handles moving to FP registers.
-    GenerateReverseBytes(assembler, out, old_value, swap_type);
-    if (data_size < 4u) {
-      DCHECK(Location::RegisterLocation(old_value).Equals(out));
-      __ Sllw(old_value, old_value, shift);
-      if (value_type == DataType::Type::kUint16) {
-        __ Srliw(old_value, old_value, 16);
-      } else {
-        DCHECK_EQ(value_type, DataType::Type::kInt16);
-        __ Sraiw(old_value, old_value, 16);
-      }
-    }
-  } else if (DataType::IsFloatingPointType(value_type)) {
+    GenerateByteSwapAndExtract(assembler, out, old_value, shift, value_type);
+  } else if (is_fp) {
     codegen->MoveLocation(out, Location::RegisterLocation(old_value), value_type);
-  } else if (data_size < 4u) {
-    __ Srl(old_value, masked, shift);
+  } else if (is_small) {
+    __ Srlw(old_value, masked, shift);
     if (value_type == DataType::Type::kInt8) {
       __ SextB(old_value, old_value);
     } else if (value_type == DataType::Type::kInt16) {
@@ -1881,8 +2008,7 @@
 
   // Check that we have allocated the right number of temps. We may need more registers
   // for byte swapped CAS in the slow path, so skip this check for the main path in that case.
-  bool has_byte_swap =
-      (expected_index == 3u) && (value_type != DataType::Type::kReference && data_size != 1u);
+  bool has_byte_swap = (expected_index == 3u) && (!is_reference && data_size != 1u);
   if ((!has_byte_swap || byte_swap) && next_temp != locations->GetTempCount()) {
     // We allocate a temporary register for the class object for a static field `VarHandle` but
     // we do not update the `next_temp` if it's otherwise unused after the address calculation.
@@ -1964,13 +2090,457 @@ void IntrinsicCodeGeneratorRISCV64::VisitVarHandleWeakCompareAndSetRelease(HInvo
       invoke, codegen_, std::memory_order_release, /*return_success=*/ true, /*strong=*/ false);
 }
 
+static void CreateVarHandleGetAndUpdateLocations(HInvoke* invoke,
+                                                 CodeGeneratorRISCV64* codegen,
+                                                 GetAndUpdateOp get_and_update_op) {
+  VarHandleOptimizations optimizations(invoke);
+  if (optimizations.GetDoNotIntrinsify()) {
+    return;
+  }
+
+  if (invoke->GetType() == DataType::Type::kReference && codegen->EmitNonBakerReadBarrier()) {
+    // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
+    // the passed reference and reloads it from the field, thus seeing the new value
+    // that we have just stored. (And it also gets the memory visibility wrong.) b/173104084
+    return;
+  }
+
+  LocationSummary* locations = CreateVarHandleCommonLocations(invoke, codegen);
+  uint32_t arg_index = invoke->GetNumberOfArguments() - 1;
+  DCHECK_EQ(arg_index, 1u + GetExpectedVarHandleCoordinatesCount(invoke));
+  DataType::Type value_type = invoke->GetType();
+  DCHECK_EQ(value_type, GetDataTypeFromShorty(invoke, arg_index));
+  Location arg = locations->InAt(arg_index);
+
+  bool is_fp = DataType::IsFloatingPointType(value_type);
+  if (is_fp) {
+    if (get_and_update_op == GetAndUpdateOp::kAdd) {
+      // For ADD, do not use ZR for zero bit pattern (+0.0f or +0.0).
+      locations->SetInAt(invoke->GetNumberOfArguments() - 1u, Location::RequiresFpuRegister());
+    } else {
+      DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
+    }
+  }
+
+  size_t data_size = DataType::Size(value_type);
+  bool can_byte_swap =
+      (arg_index == 3u) && (value_type != DataType::Type::kReference && data_size != 1u);
+  bool can_use_cas = (get_and_update_op == GetAndUpdateOp::kAdd) && (can_byte_swap || is_fp);
+  bool is_small = (data_size < 4u);
+  bool is_small_and = is_small && (get_and_update_op == GetAndUpdateOp::kAnd);
+  bool is_bitwise =
+      (get_and_update_op != GetAndUpdateOp::kSet && get_and_update_op != GetAndUpdateOp::kAdd);
+
+  size_t temps_needed =
+      // The offset temp is used for the `tmp_ptr`.
+      1u +
+      // For small values, we need temps for `shift` and maybe also `mask` and `temp`.
+      (is_small ? (is_bitwise ? 1u : 3u) : 0u) +
+      // Some cases need modified copies of `arg`.
+      (is_small_and || ScratchXRegisterNeeded(arg, value_type, can_byte_swap) ? 1u : 0u) +
+      // For FP types, we need a temp for `old_value` which cannot be loaded directly to `out`.
+      (is_fp ? 1u : 0u);
+  if (can_use_cas) {
+    size_t cas_temps_needed =
+        // The offset temp is used for the `tmp_ptr`.
+        1u +
+        // For small values, we need a temp for `shift`.
+        (is_small ? 1u : 0u) +
+        // And we always need temps for `old_value`, `new_value` and `reloaded_old_value`.
+        3u;
+    DCHECK_GE(cas_temps_needed, temps_needed);
+    temps_needed = cas_temps_needed;
+  }
+
+  size_t scratch_registers_available = 2u;
+  DCHECK_EQ(scratch_registers_available,
+            ScratchRegisterScope(codegen->GetAssembler()).AvailableXRegisters());
+  size_t old_temp_count = locations->GetTempCount();
+  DCHECK_EQ(old_temp_count, (arg_index == 1u) ? 2u : 1u);
+  if (temps_needed > old_temp_count + scratch_registers_available) {
+    locations->AddRegisterTemps(temps_needed - (old_temp_count + scratch_registers_available));
+  }
+}
+
 static void GenerateVarHandleGetAndUpdate(HInvoke* invoke,
                                           CodeGeneratorRISCV64* codegen,
                                           GetAndUpdateOp get_and_update_op,
                                           std::memory_order order,
                                           bool byte_swap = false) {
-  UNUSED(invoke, codegen, get_and_update_op, order, byte_swap);
-  LOG(FATAL) << "Unimplemented!";
+  uint32_t arg_index = invoke->GetNumberOfArguments() - 1;
+  DCHECK_EQ(arg_index, 1u + GetExpectedVarHandleCoordinatesCount(invoke));
+  DataType::Type value_type = invoke->GetType();
+  DCHECK_EQ(value_type, GetDataTypeFromShorty(invoke, arg_index));
+
+  Riscv64Assembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+  Location arg = locations->InAt(arg_index);
+  DCHECK_IMPLIES(arg.IsConstant(), arg.GetConstant()->IsZeroBitPattern());
+  Location out = locations->Out();
+
+  VarHandleTarget target = GetVarHandleTarget(invoke);
+  VarHandleSlowPathRISCV64* slow_path = nullptr;
+  if (!byte_swap) {
+    slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
+    GenerateVarHandleTarget(invoke, target, codegen);
+    if (slow_path != nullptr) {
+      slow_path->SetGetAndUpdateOp(get_and_update_op);
+      __ Bind(slow_path->GetNativeByteOrderLabel());
+    }
+  }
+
+  // This needs to be before the temp registers, as MarkGCCard also uses scratch registers.
+  if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(arg_index))) {
+    DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
+    // Mark card for object, the new value shall be stored.
+    bool new_value_can_be_null = true;  // TODO: Worth finding out this information?
+    codegen->MarkGCCard(target.object, arg.AsRegister<XRegister>(), new_value_can_be_null);
+  }
+
+  size_t data_size = DataType::Size(value_type);
+  bool is_fp = DataType::IsFloatingPointType(value_type);
+  bool use_cas = (get_and_update_op == GetAndUpdateOp::kAdd) && (byte_swap || is_fp);
+  bool is_small = (data_size < 4u);
+  bool is_small_and = is_small && (get_and_update_op == GetAndUpdateOp::kAnd);
+  bool is_reference = (value_type == DataType::Type::kReference);
+  DataType::Type op_type = is_fp
+      ? IntTypeForFloatingPointType(value_type)
+      : (is_small || is_reference ? DataType::Type::kInt32 : value_type);
+
+  ScratchRegisterScope srs(assembler);
+  DCHECK_EQ(srs.AvailableXRegisters(), 2u);
+  size_t available_scratch_registers = use_cas
+      // We use scratch registers differently for the CAS path.
+      ? 0u
+      // Reserve one scratch register for `PrepareXRegister()` or similar `arg_reg` allocation.
+      : (is_small_and || ScratchXRegisterNeeded(arg, value_type, byte_swap) ? 1u : 2u);
+
+  // Reuse the `target.offset` temporary for the pointer to the target location,
+  // except for references that need the offset for the non-Baker read barrier.
+  DCHECK_EQ(target.offset, locations->GetTemp(0u).AsRegister<XRegister>());
+  size_t next_temp = 1u;
+  XRegister tmp_ptr = target.offset;
+  if (is_reference && codegen->EmitNonBakerReadBarrier()) {
+    DCHECK_EQ(available_scratch_registers, 2u);
+    available_scratch_registers -= 1u;
+    tmp_ptr = srs.AllocateXRegister();
+  }
+  __ Add(tmp_ptr, target.object, target.offset);
+
+  auto get_temp = [&]() {
+    if (available_scratch_registers != 0u) {
+      available_scratch_registers -= 1u;
+      return srs.AllocateXRegister();
+    } else {
+      XRegister temp = locations->GetTemp(next_temp).AsRegister<XRegister>();
+      next_temp += 1u;
+      return temp;
+    }
+  };
+
+  XRegister shift = kNoXRegister;
+  XRegister mask = kNoXRegister;
+  XRegister prepare_mask = kNoXRegister;
+  XRegister temp = kNoXRegister;
+  XRegister arg_reg = kNoXRegister;
+  if (is_small) {
+    shift = get_temp();
+    // Upper bits of the shift are not used, so we do not need to clear them.
+    __ Slli(shift, tmp_ptr, WhichPowerOf2(kBitsPerByte));
+    __ Andi(tmp_ptr, tmp_ptr, -4);
+    switch (get_and_update_op) {
+      case GetAndUpdateOp::kAdd:
+        if (byte_swap) {
+          // The mask is not needed in the CAS path.
+          DCHECK(use_cas);
+          break;
+        }
+        FALLTHROUGH_INTENDED;
+      case GetAndUpdateOp::kSet:
+        mask = get_temp();
+        temp = get_temp();
+        __ Li(mask, (1 << (data_size * kBitsPerByte)) - 1);
+        __ Sllw(mask, mask, shift);
+        // The argument does not need to be masked for `GetAndUpdateOp::kAdd`,
+        // the mask shall be applied after the ADD instruction.
+        prepare_mask = (get_and_update_op == GetAndUpdateOp::kSet) ? mask : kNoXRegister;
+        break;
+      case GetAndUpdateOp::kAnd:
+        // We need to set all other bits, so we always need a temp.
+        arg_reg = srs.AllocateXRegister();
+        if (data_size == 1u) {
+          __ Ori(arg_reg, InputXRegisterOrZero(arg), ~0xff);
+          DCHECK(!byte_swap);
+        } else {
+          DCHECK_EQ(data_size, 2u);
+          __ Li(arg_reg, ~0xffff);
+          __ Or(arg_reg, InputXRegisterOrZero(arg), arg_reg);
+          if (byte_swap) {
+            __ Rev8(arg_reg, arg_reg);
+            __ Rori(arg_reg, arg_reg, 48);
+          }
+        }
+        __ Rolw(arg_reg, arg_reg, shift);
+        break;
+      case GetAndUpdateOp::kOr:
+      case GetAndUpdateOp::kXor:
+        // Signed values need to be truncated but we're keeping `prepare_mask == kNoXRegister`.
+        if (value_type == DataType::Type::kInt8 && !arg.IsConstant()) {
+          DCHECK(!byte_swap);
+          arg_reg = srs.AllocateXRegister();
+          __ ZextB(arg_reg, arg.AsRegister<XRegister>());
+          __ Sllw(arg_reg, arg_reg, shift);
+        } else if (value_type == DataType::Type::kInt16 && !arg.IsConstant() && !byte_swap) {
+          arg_reg = srs.AllocateXRegister();
+          __ ZextH(arg_reg, arg.AsRegister<XRegister>());
+          __ Sllw(arg_reg, arg_reg, shift);
+        }  // else handled by `PrepareXRegister()` below.
+        break;
+    }
+  }
+  if (arg_reg == kNoXRegister && !use_cas) {
+    arg_reg = PrepareXRegister(codegen, arg, value_type, shift, prepare_mask, byte_swap, &srs);
+  }
+  if (mask != kNoXRegister && get_and_update_op == GetAndUpdateOp::kSet) {
+    __ Not(mask, mask);  // We need to flip the mask for `kSet`, see `GenerateGetAndUpdate()`.
+  }
+
+  if (use_cas) {
+    // Allocate scratch registers for temps that can theoretically be clobbered on retry.
+    // (Even though the `retry` label shall never be far enough for `TMP` to be clobbered.)
+    DCHECK_EQ(available_scratch_registers, 0u);  // Reserved for the two uses below.
+    XRegister old_value = srs.AllocateXRegister();
+    XRegister new_value = srs.AllocateXRegister();
+    // Allocate other needed temporaries.
+    XRegister reloaded_old_value = get_temp();
+    XRegister store_result = reloaded_old_value;  // Clobber reloaded old value by store result.
+    FRegister ftmp = is_fp ? srs.AllocateFRegister() : kNoFRegister;
+
+    Riscv64Label retry;
+    __ Bind(&retry);
+    codegen->GetInstructionVisitor()->Load(
+        Location::RegisterLocation(old_value), tmp_ptr, /*offset=*/ 0, op_type);
+    if (byte_swap) {
+      GenerateByteSwapAndExtract(assembler, out, old_value, shift, value_type);
+    } else {
+      DCHECK(is_fp);
+      codegen->MoveLocation(out, Location::RegisterLocation(old_value), value_type);
+    }
+    if (is_fp) {
+      codegen->GetInstructionVisitor()->FAdd(
+          ftmp, out.AsFpuRegister<FRegister>(), arg.AsFpuRegister<FRegister>(), value_type);
+      codegen->MoveLocation(
+          Location::RegisterLocation(new_value), Location::FpuRegisterLocation(ftmp), op_type);
+    } else if (value_type == DataType::Type::kInt64) {
+      __ Add(new_value, out.AsRegister<XRegister>(), arg.AsRegister<XRegister>());
+    } else {
+      DCHECK_EQ(op_type, DataType::Type::kInt32);
+      __ Addw(new_value, out.AsRegister<XRegister>(), arg.AsRegister<XRegister>());
+    }
+    if (byte_swap) {
+      DataType::Type swap_type = op_type;
+      if (is_small) {
+        DCHECK_EQ(data_size, 2u);
+        // We want to update only 16 bits of the 32-bit location. The 16 bits we want to replace
+        // are present in both `old_value` and `out` but in different bits and byte order.
+        // To update the 16 bits, we can XOR the new value with the `out`, byte swap as Uint16
+        // (extracting only the bits we want to update), shift and XOR with the old value.
+        swap_type = DataType::Type::kUint16;
+        __ Xor(new_value, new_value, out.AsRegister<XRegister>());
+      }
+      GenerateReverseBytes(assembler, Location::RegisterLocation(new_value), new_value, swap_type);
+      if (is_small) {
+        __ Sllw(new_value, new_value, shift);
+        __ Xor(new_value, new_value, old_value);
+      }
+    }
+    GenerateCompareAndSet(assembler,
+                          op_type,
+                          order,
+                          /*strong=*/ true,
+                          /*cmp_failure=*/ &retry,
+                          tmp_ptr,
+                          new_value,
+                          /*old_value=*/ reloaded_old_value,
+                          /*mask=*/ kNoXRegister,
+                          /*masked=*/ kNoXRegister,
+                          store_result,
+                          /*expected=*/ old_value);
+  } else {
+    XRegister old_value = is_fp ? get_temp() : out.AsRegister<XRegister>();
+    GenerateGetAndUpdate(
+        codegen, get_and_update_op, op_type, order, tmp_ptr, arg_reg, old_value, mask, temp);
+    if (byte_swap) {
+      GenerateByteSwapAndExtract(assembler, out, old_value, shift, value_type);
+    } else if (is_fp) {
+      codegen->MoveLocation(out, Location::RegisterLocation(old_value), value_type);
+    } else if (is_small) {
+      __ Srlw(old_value, old_value, shift);
+      DCHECK_NE(value_type, DataType::Type::kUint8);
+      if (value_type == DataType::Type::kInt8) {
+        __ SextB(old_value, old_value);
+      } else if (value_type == DataType::Type::kBool) {
+        __ ZextB(old_value, old_value);
+      } else if (value_type == DataType::Type::kInt16) {
+        __ SextH(old_value, old_value);
+      } else {
+        DCHECK_EQ(value_type, DataType::Type::kUint16);
+        __ ZextH(old_value, old_value);
+      }
+    } else if (is_reference) {
+      __ ZextW(old_value, old_value);
+      if (codegen->EmitBakerReadBarrier()) {
+        // Use RA as temp. It is clobbered in the slow path anyway.
+        static constexpr Location kBakerReadBarrierTemp = Location::RegisterLocation(RA);
+        SlowPathCodeRISCV64* rb_slow_path =
+            codegen->AddGcRootBakerBarrierBarrierSlowPath(invoke, out, kBakerReadBarrierTemp);
+        codegen->EmitBakerReadBarierMarkingCheck(rb_slow_path, out, kBakerReadBarrierTemp);
+      } else if (codegen->EmitNonBakerReadBarrier()) {
+        Location base_loc = Location::RegisterLocation(target.object);
+        Location index = Location::RegisterLocation(target.offset);
+        SlowPathCodeRISCV64* rb_slow_path = codegen->AddReadBarrierSlowPath(
+            invoke, out, out, base_loc, /*offset=*/ 0u, index);
+        __ J(rb_slow_path->GetEntryLabel());
+        __ Bind(rb_slow_path->GetExitLabel());
+      }
+    }
+  }
+
+  if (slow_path != nullptr) {
+    DCHECK(!byte_swap);
+    __ Bind(slow_path->GetExitLabel());
+  }
+
+  // Check that we have allocated the right number of temps. We may need more registers
+  // for byte swapped CAS in the slow path, so skip this check for the main path in that case.
+  bool has_byte_swap = (arg_index == 3u) && (!is_reference && data_size != 1u);
+  if ((!has_byte_swap || byte_swap) && next_temp != locations->GetTempCount()) {
+    // We allocate a temporary register for the class object for a static field `VarHandle` but
+    // we do not update the `next_temp` if it's otherwise unused after the address calculation.
+    CHECK_EQ(arg_index, 1u);
+    CHECK_EQ(next_temp, 1u);
+    CHECK_EQ(locations->GetTempCount(), 2u);
+  }
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndSet(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndSet(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_seq_cst);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndSetAcquire(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndSetAcquire(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_acquire);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndSetRelease(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kSet);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndSetRelease(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kSet, std::memory_order_release);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndAdd(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndAdd(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_seq_cst);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndAddAcquire(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndAddAcquire(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_acquire);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndAddRelease(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAdd);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndAddRelease(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAdd, std::memory_order_release);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseAnd(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseAnd(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_seq_cst);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseAndAcquire(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseAndAcquire(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_acquire);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseAndRelease(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kAnd);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseAndRelease(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kAnd, std::memory_order_release);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseOr(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseOr(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_seq_cst);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseOrAcquire(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseOrAcquire(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_acquire);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseOrRelease(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kOr);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseOrRelease(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kOr, std::memory_order_release);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseXor(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseXor(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_seq_cst);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseXorAcquire(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseXorAcquire(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_acquire);
+}
+
+void IntrinsicLocationsBuilderRISCV64::VisitVarHandleGetAndBitwiseXorRelease(HInvoke* invoke) {
+  CreateVarHandleGetAndUpdateLocations(invoke, codegen_, GetAndUpdateOp::kXor);
+}
+
+void IntrinsicCodeGeneratorRISCV64::VisitVarHandleGetAndBitwiseXorRelease(HInvoke* invoke) {
+  GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_release);
 }
 
 void VarHandleSlowPathRISCV64::EmitByteArrayViewCode(CodeGenerator* codegen_in) {