author     2017-03-09 13:02:12 +0000
committer  2017-03-09 13:02:12 +0000
commit     5ed51e3176f3dc4ff2e50ba4bf52743d404b5b4f (patch)
tree       1638115757601e4d41d1dc3f3cb9045f5d3d6dd9 /compiler/optimizing
parent     079f5fd58799a23aa5d60a5f85008a4663a33f2a (diff)
parent     54f869ed3c7910e6eb7bade924d41570e9a4cb14 (diff)
Merge changes Ia26b07f0,Id3d2758c
* changes:
Revert "Revert "Use the holder's gray bit in Baker read barrier slow paths (ARM, ARM64).""
Revert "Revert "Use the "GC is marking" information in compiler read barriers (ARM, ARM64).""
Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_arm.cc        590
-rw-r--r--  compiler/optimizing/code_generator_arm.h           12
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc       659
-rw-r--r--  compiler/optimizing/code_generator_arm64.h         14
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.cc    616
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.h      14
-rw-r--r--  compiler/optimizing/intrinsics_arm.cc                2
-rw-r--r--  compiler/optimizing/intrinsics_arm64.cc             46
-rw-r--r--  compiler/optimizing/intrinsics_arm_vixl.cc           2
9 files changed, 1358 insertions, 597 deletions
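The first change listed above moves the holder's gray-bit test into the Baker read barrier slow path itself, so the common case only pays for the entrypoint null test. A hedged C++ sketch of the resulting control flow, mirroring the pseudocode in the diff's comments; the stand-in types and the assumed kReadBarrierStateShift value are illustrative, and the generated assembly additionally inserts an address dependency between the lock-word load and the reference load, which plain C++ cannot express.

#include <cstdint>

struct Object {
  uint32_t monitor;   // Lock word; one bit of it is the Baker read barrier state.
  Object* field;      // The reference field being loaded (obj.field).
};

using MarkEntrypoint = Object* (*)(Object*);

// Assumed bit position of the read barrier state in the lock word; the real
// value is LockWord::kReadBarrierStateShift in ART.
constexpr uint32_t kReadBarrierStateShift = 28;

// Sketch of GenerateReferenceLoadWithBakerReadBarrier after this change: the
// "is the GC marking" test selects the slow path, and the slow path consults
// the holder's gray bit before calling the mark entrypoint.
inline Object* LoadFieldWithBakerReadBarrier(Object* obj, MarkEntrypoint entrypoint) {
  if (entrypoint == nullptr) {      // <=> !Thread::Current()->GetIsGcMarking()
    return obj->field;              // Fast path: just load the reference.
  }
  // Slow path.
  uint32_t rb_state = (obj->monitor >> kReadBarrierStateShift) & 1u;
  Object* ref = obj->field;                         // Original reference load.
  bool is_gray = (rb_state == 1u);                  // ReadBarrier::GrayState()
  if (is_gray) {
    ref = entrypoint(ref);                          // ref = ReadBarrier::Mark(ref)
  }
  return ref;
}

Keeping the gray-bit test in the slow path means the fast path is a single load-and-branch on the cached entrypoint, whereas the previous code loaded and tested the holder's lock word on every reference load.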
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 511bd9b7ef..2b0ab3e20e 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -636,10 +636,75 @@ class ArraySetSlowPathARM : public SlowPathCodeARM { DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARM); }; +// Abstract base class for read barrier slow paths marking a reference +// `ref`. +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. +class ReadBarrierMarkSlowPathBaseARM : public SlowPathCodeARM { + protected: + ReadBarrierMarkSlowPathBaseARM(HInstruction* instruction, Location ref, Location entrypoint) + : SlowPathCodeARM(instruction), ref_(ref), entrypoint_(entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARM"; } + + // Generate assembly code calling the read barrier marking runtime + // entry point (ReadBarrierMarkRegX). + void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) { + Register ref_reg = ref_.AsRegister<Register>(); + + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + DCHECK_NE(ref_reg, SP); + DCHECK_NE(ref_reg, LR); + DCHECK_NE(ref_reg, PC); + // IP is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(ref_reg, IP); + DCHECK(0 <= ref_reg && ref_reg < kNumberOfCoreRegisters) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in R0): + // + // R0 <- ref + // R0 <- ReadBarrierMark(R0) + // ref <- R0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + if (entrypoint_.IsValid()) { + arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + __ blx(entrypoint_.AsRegister<Register>()); + } else { + // Entrypoint is not already loaded, load from the thread. + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg); + // This runtime call does not require a stack map. + arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + } + } + + // The location (register) of the marked object reference. + const Location ref_; + + // The location of the entrypoint if it is already loaded. + const Location entrypoint_; + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARM); +}; + // Slow path marking an object reference `ref` during a read // barrier. The field `obj.field` in the object `obj` holding this -// reference does not get updated by this slow path after marking (see -// ReadBarrierMarkAndUpdateFieldSlowPathARM below for that). +// reference does not get updated by this slow path after marking. // // This means that after the execution of this slow path, `ref` will // always be up-to-date, but `obj.field` may not; i.e., after the @@ -650,13 +715,13 @@ class ArraySetSlowPathARM : public SlowPathCodeARM { // // If `entrypoint` is a valid location it is assumed to already be // holding the entrypoint. 
The case where the entrypoint is passed in -// is for the GcRoot read barrier. -class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { +// is when the decision to mark is based on whether the GC is marking. +class ReadBarrierMarkSlowPathARM : public ReadBarrierMarkSlowPathBaseARM { public: ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location ref, Location entrypoint = Location::NoLocation()) - : SlowPathCodeARM(instruction), ref_(ref), entrypoint_(entrypoint) { + : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint) { DCHECK(kEmitCompilerReadBarrier); } @@ -664,15 +729,77 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + if (kIsDebugBuild) { + Register ref_reg = ref_.AsRegister<Register>(); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + } + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + GenerateReadBarrierMarkRuntimeCall(codegen); + __ b(GetExitLabel()); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM); +}; + +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). The field `obj.field` in the object `obj` holding +// this reference does not get updated by this slow path after marking +// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM +// below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class LoadReferenceWithBakerReadBarrierSlowPathARM : public ReadBarrierMarkSlowPathBaseARM { + public: + LoadReferenceWithBakerReadBarrierSlowPathARM(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check, + Register temp, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint), + obj_(obj), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), + temp_(temp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "LoadReferenceWithBakerReadBarrierSlowPathARM"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); Register ref_reg = ref_.AsRegister<Register>(); DCHECK(locations->CanCall()); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + DCHECK_NE(ref_reg, temp_); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || instruction_->IsArraySet() || - instruction_->IsLoadClass() || - instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || @@ -686,145 +813,202 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); __ Bind(GetEntryLabel()); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); - DCHECK_NE(ref_reg, SP); - DCHECK_NE(ref_reg, LR); - DCHECK_NE(ref_reg, PC); - // IP is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. - DCHECK_NE(ref_reg, IP); - DCHECK(0 <= ref_reg && ref_reg < kNumberOfCoreRegisters) << ref_reg; - // "Compact" slow path, saving two moves. + + // When using MaybeGenerateReadBarrierSlow, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: // - // Instead of using the standard runtime calling convention (input - // and output in R0): + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } // - // R0 <- ref - // R0 <- ReadBarrierMark(R0) - // ref <- R0 - // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. 
+ + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + __ LoadFromOffset(kLoadWord, temp_, obj_, monitor_offset); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ add(obj_, obj_, ShifterOperand(temp_, LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + arm_codegen->GenerateRawReferenceLoad( + instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false); + + // Mark the object `ref` when `obj` is gray. // - // rX <- ReadBarrierMarkRegX(rX) + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); // - if (entrypoint_.IsValid()) { - arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); - __ blx(entrypoint_.AsRegister<Register>()); - } else { - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg); - // This runtime call does not require a stack map. - arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); - } + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp_, temp_, LockWord::kReadBarrierStateShift + 1); + __ b(GetExitLabel(), CC); // Carry flag is the last bit shifted out by LSRS. + GenerateReadBarrierMarkRuntimeCall(codegen); + __ b(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; - - // The location of the entrypoint if already loaded. - const Location entrypoint_; - - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM); + // The register containing the object holding the marked object reference field. + Register obj_; + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + ScaleFactor scale_factor_; + // Is a null check required? + bool needs_null_check_; + // A temporary register used to hold the lock word of `obj_`. + Register temp_; + + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARM); }; -// Slow path marking an object reference `ref` during a read barrier, -// and if needed, atomically updating the field `obj.field` in the -// object `obj` holding this reference after marking (contrary to -// ReadBarrierMarkSlowPathARM above, which never tries to update -// `obj.field`). +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). 
If needed, this slow path also atomically updates +// the field `obj.field` in the object `obj` holding this reference +// after marking (contrary to +// LoadReferenceWithBakerReadBarrierSlowPathARM above, which never +// tries to update `obj.field`). // // This means that after the execution of this slow path, both `ref` // and `obj.field` will be up-to-date; i.e., after the flip, both will // hold the same to-space reference (unless another thread installed // another object reference (different from `ref`) in `obj.field`). -class ReadBarrierMarkAndUpdateFieldSlowPathARM : public SlowPathCodeARM { +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. +class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM + : public ReadBarrierMarkSlowPathBaseARM { public: - ReadBarrierMarkAndUpdateFieldSlowPathARM(HInstruction* instruction, - Location ref, - Register obj, - Location field_offset, - Register temp1, - Register temp2) - : SlowPathCodeARM(instruction), - ref_(ref), + LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check, + Register temp1, + Register temp2, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint), obj_(obj), - field_offset_(field_offset), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), temp1_(temp1), temp2_(temp2) { DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); } - const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkAndUpdateFieldSlowPathARM"; } + const char* GetDescription() const OVERRIDE { + return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM"; + } void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); Register ref_reg = ref_.AsRegister<Register>(); DCHECK(locations->CanCall()); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; - // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK_NE(ref_reg, temp1_); + + // This slow path is only used by the UnsafeCASObject intrinsic at the moment. DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking and field updating slow path: " << instruction_->DebugName(); DCHECK(instruction_->GetLocations()->Intrinsified()); DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); - DCHECK(field_offset_.IsRegisterPair()) << field_offset_; + DCHECK_EQ(offset_, 0u); + DCHECK_EQ(scale_factor_, ScaleFactor::TIMES_1); + // The location of the offset of the marked reference field within `obj_`. + Location field_offset = index_; + DCHECK(field_offset.IsRegisterPair()) << field_offset; __ Bind(GetEntryLabel()); - // Save the old reference. 
+ // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + __ LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp1`. + __ add(obj_, obj_, ShifterOperand(temp1_, LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + arm_codegen->GenerateRawReferenceLoad( + instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false); + + // Mark the object `ref` when `obj` is gray. + // + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp1_, temp1_, LockWord::kReadBarrierStateShift + 1); + __ b(GetExitLabel(), CC); // Carry flag is the last bit shifted out by LSRS. + + // Save the old value of the reference before marking it. // Note that we cannot use IP to save the old reference, as IP is // used internally by the ReadBarrierMarkRegX entry point, and we // need the old reference after the call to that entry point. DCHECK_NE(temp1_, IP); __ Mov(temp1_, ref_reg); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); - DCHECK_NE(ref_reg, SP); - DCHECK_NE(ref_reg, LR); - DCHECK_NE(ref_reg, PC); - // IP is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. - DCHECK_NE(ref_reg, IP); - DCHECK(0 <= ref_reg && ref_reg < kNumberOfCoreRegisters) << ref_reg; - // "Compact" slow path, saving two moves. - // - // Instead of using the standard runtime calling convention (input - // and output in R0): - // - // R0 <- ref - // R0 <- ReadBarrierMark(R0) - // ref <- R0 - // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: - // - // rX <- ReadBarrierMarkRegX(rX) - // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg); - // This runtime call does not require a stack map. - arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + GenerateReadBarrierMarkRuntimeCall(codegen); // If the new reference is different from the old reference, - // update the field in the holder (`*(obj_ + field_offset_)`). + // update the field in the holder (`*(obj_ + field_offset)`). 
// // Note that this field could also hold a different object, if // another thread had concurrently changed it. In that case, the // LDREX/SUBS/ITNE sequence of instructions in the compare-and-set // (CAS) operation below would abort the CAS, leaving the field // as-is. - Label done; __ cmp(temp1_, ShifterOperand(ref_reg)); - __ b(&done, EQ); + __ b(GetExitLabel(), EQ); // Update the the holder's field atomically. This may fail if // mutator updates before us, but it's OK. This is achieved @@ -837,7 +1021,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM : public SlowPathCodeARM { // The UnsafeCASObject intrinsic uses a register pair as field // offset ("long offset"), of which only the low part contains // data. - Register offset = field_offset_.AsRegisterPairLow<Register>(); + Register offset = field_offset.AsRegisterPairLow<Register>(); Register expected = temp1_; Register value = ref_reg; Register tmp_ptr = IP; // Pointer to actual memory. @@ -887,22 +1071,27 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM : public SlowPathCodeARM { } } - __ Bind(&done); __ b(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; // The register containing the object holding the marked object reference field. const Register obj_; - // The location of the offset of the marked reference field within `obj_`. - Location field_offset_; - + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + ScaleFactor scale_factor_; + // Is a null check required? + bool needs_null_check_; + // A temporary register used to hold the lock word of `obj_`; and + // also to hold the original reference value, when the reference is + // marked. const Register temp1_; + // A temporary register used in the implementation of the CAS, to + // update the object's reference field. const Register temp2_; - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARM); + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM); }; // Slow path generating a read barrier for a heap reference. @@ -7183,14 +7372,35 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct DCHECK(kEmitCompilerReadBarrier); if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when - // Baker's read barrier are used: + // Baker's read barrier are used. + // + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. // - // root = obj.field; // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // if (temp != null) { - // root = temp(root) + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. // } + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. 
+ Location temp = Location::RegisterLocation(LR); + SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( + instruction, root, /* entrypoint */ temp); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); + // /* GcRoot<mirror::Object> */ root = *(obj + offset) __ LoadFromOffset(kLoadWord, root_reg, obj, offset); static_assert( @@ -7201,21 +7411,6 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct "art::mirror::CompressedReference<mirror::Object> and int32_t " "have different sizes."); - // Slow path marking the GC root `root`. - Location temp = Location::RegisterLocation(LR); - SlowPathCodeARM* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( - instruction, - root, - /*entrypoint*/ temp); - codegen_->AddSlowPath(slow_path); - - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel()); @@ -7286,51 +7481,101 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); - // In slow path based read barriers, the read barrier call is - // inserted after the original load. However, in fast path based - // Baker's read barriers, we need to perform the load of - // mirror::Object::monitor_ *before* the original reference load. - // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to mark the reference. + // Then, in the slow path, check the gray bit in the lock word of + // the reference's holder (`obj`) to decide whether to mark `ref` or + // not. // - // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // HeapReference<Object> ref = *src; // Original reference load. - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. - // } + // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp3` the read barrier mark entry point + // corresponding to register `ref`. If `temp3` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. // - // Note: the original implementation in ReadBarrier::Barrier is - // slightly more complex as it performs additional checks that we do - // not do here for performance reasons. 
+ // temp3 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp3 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = temp3(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } + // } else { + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // } - Register ref_reg = ref.AsRegister<Register>(); Register temp_reg = temp.AsRegister<Register>(); - uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); - // /* int32_t */ monitor = obj->monitor_ - __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); - if (needs_null_check) { - MaybeRecordImplicitNullCheck(instruction); + // Slow path marking the object `ref` when the GC is marking. The + // entrypoint will already be loaded in `temp3`. + Location temp3 = Location::RegisterLocation(LR); + SlowPathCodeARM* slow_path; + if (always_update_field) { + DCHECK(temp2 != nullptr); + // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM only + // supports address of the form `obj + field_offset`, where `obj` + // is a register and `field_offset` is a register pair (of which + // only the lower half is used). Thus `offset` and `scale_factor` + // above are expected to be null in this code path. + DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); + Location field_offset = index; + slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM( + instruction, + ref, + obj, + offset, + /* index */ field_offset, + scale_factor, + needs_null_check, + temp_reg, + *temp2, + /* entrypoint */ temp3); + } else { + slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + temp_reg, + /* entrypoint */ temp3); } - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); + AddSlowPath(slow_path); - // Introduce a dependency on the lock_word including the rb_state, - // which shall prevent load-load reordering without using - // a memory barrier (which would be more expensive). - // `obj` is unchanged by this operation, but its value now depends - // on `temp_reg`. - __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32)); + // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadWord, temp3.AsRegister<Register>(), TR, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(temp3.AsRegister<Register>(), slow_path->GetEntryLabel()); + // Fast path: just load the reference. 
+ GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorARM::GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check) { + Register ref_reg = ref.AsRegister<Register>(); - // The actual reference load. if (index.IsValid()) { // Load types involving an "index": ArrayGet, // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject // intrinsics. - // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor)) if (index.IsConstant()) { size_t computed_offset = (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset; @@ -7347,41 +7592,16 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i __ LoadFromOffset(kLoadWord, ref_reg, IP, offset); } } else { - // /* HeapReference<Object> */ ref = *(obj + offset) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset) __ LoadFromOffset(kLoadWord, ref_reg, obj, offset); } - // Object* ref = ref_addr->AsMirrorPtr() - __ MaybeUnpoisonHeapReference(ref_reg); - - // Slow path marking the object `ref` when it is gray. - SlowPathCodeARM* slow_path; - if (always_update_field) { - DCHECK(temp2 != nullptr); - // ReadBarrierMarkAndUpdateFieldSlowPathARM only supports address - // of the form `obj + field_offset`, where `obj` is a register and - // `field_offset` is a register pair (of which only the lower half - // is used). Thus `offset` and `scale_factor` above are expected - // to be null in this code path. - DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM( - instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2); - } else { - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); } - AddSlowPath(slow_path); - // if (rb_state == ReadBarrier::GrayState()) - // ref = ReadBarrier::Mark(ref); - // Given the numeric representation, it's enough to check the low bit of the - // rb_state. We do that by shifting the bit out of the lock word with LSRS - // which can be a 16-bit instruction unlike the TST immediate. - static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1); - __ b(slow_path->GetEntryLabel(), CS); // Carry flag is the last bit shifted out by LSRS. - __ Bind(slow_path->GetExitLabel()); + // Object* ref = ref_addr->AsMirrorPtr() + __ MaybeUnpoisonHeapReference(ref_reg); } void CodeGeneratorARM::GenerateReadBarrierSlow(HInstruction* instruction, diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index e993756b3b..f081a910ee 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -520,9 +520,6 @@ class CodeGeneratorARM : public CodeGenerator { Location index, Location temp, bool needs_null_check); - // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier - // and GenerateArrayLoadWithBakerReadBarrier. 
- // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier, // GenerateArrayLoadWithBakerReadBarrier and some intrinsics. // @@ -545,6 +542,15 @@ class CodeGeneratorARM : public CodeGenerator { bool always_update_field = false, Register* temp2 = nullptr); + // Generate a heap reference load (with no read barrier). + void GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check); + // Generate a read barrier for a heap reference within `instruction` // using a slow path. // diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index f5038fb1c0..7d1ae7d28b 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -633,10 +633,73 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { } } +// Abstract base class for read barrier slow paths marking a reference +// `ref`. +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. +class ReadBarrierMarkSlowPathBaseARM64 : public SlowPathCodeARM64 { + protected: + ReadBarrierMarkSlowPathBaseARM64(HInstruction* instruction, Location ref, Location entrypoint) + : SlowPathCodeARM64(instruction), ref_(ref), entrypoint_(entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARM64"; } + + // Generate assembly code calling the read barrier marking runtime + // entry point (ReadBarrierMarkRegX). + void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) { + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); + DCHECK_NE(ref_.reg(), LR); + DCHECK_NE(ref_.reg(), WSP); + DCHECK_NE(ref_.reg(), WZR); + // IP0 is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(ref_.reg(), IP0); + DCHECK(0 <= ref_.reg() && ref_.reg() < kNumberOfWRegisters) << ref_.reg(); + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in W0): + // + // W0 <- ref + // W0 <- ReadBarrierMark(W0) + // ref <- W0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + if (entrypoint_.IsValid()) { + arm64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + __ Blr(XRegisterFrom(entrypoint_)); + } else { + // Entrypoint is not already loaded, load from the thread. + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg()); + // This runtime call does not require a stack map. + arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + } + } + + // The location (register) of the marked object reference. + const Location ref_; + + // The location of the entrypoint if it is already loaded. + const Location entrypoint_; + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARM64); +}; + // Slow path marking an object reference `ref` during a read // barrier. 
The field `obj.field` in the object `obj` holding this -// reference does not get updated by this slow path after marking (see -// ReadBarrierMarkAndUpdateFieldSlowPathARM64 below for that). +// reference does not get updated by this slow path after marking. // // This means that after the execution of this slow path, `ref` will // always be up-to-date, but `obj.field` may not; i.e., after the @@ -647,15 +710,13 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { // // If `entrypoint` is a valid location it is assumed to already be // holding the entrypoint. The case where the entrypoint is passed in -// is for the GcRoot read barrier. -class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { +// is when the decision to mark is based on whether the GC is marking. +class ReadBarrierMarkSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 { public: ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location ref, Location entrypoint = Location::NoLocation()) - : SlowPathCodeARM64(instruction), - ref_(ref), - entrypoint_(entrypoint) { + : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint) { DCHECK(kEmitCompilerReadBarrier); } @@ -666,12 +727,75 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { DCHECK(locations->CanCall()); DCHECK(ref_.IsRegister()) << ref_; DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg(); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + GenerateReadBarrierMarkRuntimeCall(codegen); + __ B(GetExitLabel()); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64); +}; + +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). The field `obj.field` in the object `obj` holding +// this reference does not get updated by this slow path after marking +// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 +// below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 { + public: + LoadReferenceWithBakerReadBarrierSlowPathARM64(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire, + Register temp, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint), + obj_(obj), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), + use_load_acquire_(use_load_acquire), + temp_(temp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "LoadReferenceWithBakerReadBarrierSlowPathARM64"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(ref_.IsRegister()) << ref_; + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg(); + DCHECK(obj_.IsW()); + DCHECK_NE(ref_.reg(), LocationFrom(temp_).reg()); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || instruction_->IsArraySet() || - instruction_->IsLoadClass() || - instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || @@ -684,82 +808,138 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { DCHECK(!(instruction_->IsArrayGet() && instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); + // Temporary register `temp_`, used to store the lock word, must + // not be IP0 nor IP1, as we may use them to emit the reference + // load (in the call to GenerateRawReferenceLoad below), and we + // need the lock word to still be in `temp_` after the reference + // load. + DCHECK_NE(LocationFrom(temp_).reg(), IP0); + DCHECK_NE(LocationFrom(temp_).reg(), IP1); + __ Bind(GetEntryLabel()); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - DCHECK_NE(ref_.reg(), LR); - DCHECK_NE(ref_.reg(), WSP); - DCHECK_NE(ref_.reg(), WZR); - // IP0 is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. - DCHECK_NE(ref_.reg(), IP0); - DCHECK(0 <= ref_.reg() && ref_.reg() < kNumberOfWRegisters) << ref_.reg(); - // "Compact" slow path, saving two moves. - // - // Instead of using the standard runtime calling convention (input - // and output in W0): + + // When using MaybeGenerateReadBarrierSlow, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: // - // W0 <- ref - // W0 <- ReadBarrierMark(W0) - // ref <- W0 + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. 
+ // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. + + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + __ Ldr(temp_, HeapOperand(obj_, monitor_offset)); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj_.X(), obj_.X(), Operand(temp_.X(), LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); + arm64_codegen->GenerateRawReferenceLoad(instruction_, + ref_, + obj_, + offset_, + index_, + scale_factor_, + /* needs_null_check */ false, + use_load_acquire_); + + // Mark the object `ref` when `obj` is gray. // - // rX <- ReadBarrierMarkRegX(rX) + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); // - if (entrypoint_.IsValid()) { - arm64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); - __ Blr(XRegisterFrom(entrypoint_)); - } else { - // Entrypoint is not already loaded, load from the thread. - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg()); - // This runtime call does not require a stack map. - arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); - } + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Tbz(temp_, LockWord::kReadBarrierStateShift, GetExitLabel()); + GenerateReadBarrierMarkRuntimeCall(codegen); + __ B(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; - - // The location of the entrypoint if it is already loaded. - const Location entrypoint_; - - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64); + // The register containing the object holding the marked object reference field. + Register obj_; + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + size_t scale_factor_; + // Is a null check required? + bool needs_null_check_; + // Should this reference load use Load-Acquire semantics? + bool use_load_acquire_; + // A temporary register used to hold the lock word of `obj_`. 
+ Register temp_; + + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARM64); }; -// Slow path marking an object reference `ref` during a read barrier, -// and if needed, atomically updating the field `obj.field` in the -// object `obj` holding this reference after marking (contrary to -// ReadBarrierMarkSlowPathARM64 above, which never tries to update -// `obj.field`). +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). If needed, this slow path also atomically updates +// the field `obj.field` in the object `obj` holding this reference +// after marking (contrary to +// LoadReferenceWithBakerReadBarrierSlowPathARM64 above, which never +// tries to update `obj.field`). // // This means that after the execution of this slow path, both `ref` // and `obj.field` will be up-to-date; i.e., after the flip, both will // hold the same to-space reference (unless another thread installed // another object reference (different from `ref`) in `obj.field`). -class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 { +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. +class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 + : public ReadBarrierMarkSlowPathBaseARM64 { public: - ReadBarrierMarkAndUpdateFieldSlowPathARM64(HInstruction* instruction, - Location ref, - Register obj, - Location field_offset, - Register temp) - : SlowPathCodeARM64(instruction), - ref_(ref), + LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire, + Register temp, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint), obj_(obj), - field_offset_(field_offset), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), + use_load_acquire_(use_load_acquire), temp_(temp) { DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); } const char* GetDescription() const OVERRIDE { - return "ReadBarrierMarkAndUpdateFieldSlowPathARM64"; + return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64"; } void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { @@ -768,64 +948,90 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 { DCHECK(locations->CanCall()); DCHECK(ref_.IsRegister()) << ref_; DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg(); - // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK(obj_.IsW()); + DCHECK_NE(ref_.reg(), LocationFrom(temp_).reg()); + + // This slow path is only used by the UnsafeCASObject intrinsic at the moment. 
DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking and field updating slow path: " << instruction_->DebugName(); DCHECK(instruction_->GetLocations()->Intrinsified()); DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); - DCHECK(field_offset_.IsRegister()) << field_offset_; + DCHECK_EQ(offset_, 0u); + DCHECK_EQ(scale_factor_, 0u); + DCHECK_EQ(use_load_acquire_, false); + // The location of the offset of the marked reference field within `obj_`. + Location field_offset = index_; + DCHECK(field_offset.IsRegister()) << field_offset; + + // Temporary register `temp_`, used to store the lock word, must + // not be IP0 nor IP1, as we may use them to emit the reference + // load (in the call to GenerateRawReferenceLoad below), and we + // need the lock word to still be in `temp_` after the reference + // load. + DCHECK_NE(LocationFrom(temp_).reg(), IP0); + DCHECK_NE(LocationFrom(temp_).reg(), IP1); __ Bind(GetEntryLabel()); - // Save the old reference. + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + __ Ldr(temp_, HeapOperand(obj_, monitor_offset)); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj_.X(), obj_.X(), Operand(temp_.X(), LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); + arm64_codegen->GenerateRawReferenceLoad(instruction_, + ref_, + obj_, + offset_, + index_, + scale_factor_, + /* needs_null_check */ false, + use_load_acquire_); + + // Mark the object `ref` when `obj` is gray. + // + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Tbz(temp_, LockWord::kReadBarrierStateShift, GetExitLabel()); + + // Save the old value of the reference before marking it. // Note that we cannot use IP to save the old reference, as IP is // used internally by the ReadBarrierMarkRegX entry point, and we // need the old reference after the call to that entry point. DCHECK_NE(LocationFrom(temp_).reg(), IP0); __ Mov(temp_.W(), ref_reg); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - DCHECK_NE(ref_.reg(), LR); - DCHECK_NE(ref_.reg(), WSP); - DCHECK_NE(ref_.reg(), WZR); - // IP0 is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. 
- DCHECK_NE(ref_.reg(), IP0); - DCHECK(0 <= ref_.reg() && ref_.reg() < kNumberOfWRegisters) << ref_.reg(); - // "Compact" slow path, saving two moves. - // - // Instead of using the standard runtime calling convention (input - // and output in W0): - // - // W0 <- ref - // W0 <- ReadBarrierMark(W0) - // ref <- W0 - // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: - // - // rX <- ReadBarrierMarkRegX(rX) - // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg()); - // This runtime call does not require a stack map. - arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + GenerateReadBarrierMarkRuntimeCall(codegen); // If the new reference is different from the old reference, - // update the field in the holder (`*(obj_ + field_offset_)`). + // update the field in the holder (`*(obj_ + field_offset)`). // // Note that this field could also hold a different object, if // another thread had concurrently changed it. In that case, the // LDXR/CMP/BNE sequence of instructions in the compare-and-set // (CAS) operation below would abort the CAS, leaving the field // as-is. - vixl::aarch64::Label done; __ Cmp(temp_.W(), ref_reg); - __ B(eq, &done); + __ B(eq, GetExitLabel()); // Update the the holder's field atomically. This may fail if // mutator updates before us, but it's OK. This is achieved @@ -838,7 +1044,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 { // Convenience aliases. Register base = obj_.W(); - Register offset = XRegisterFrom(field_offset_); + Register offset = XRegisterFrom(field_offset); Register expected = temp_.W(); Register value = ref_reg; Register tmp_ptr = temps.AcquireX(); // Pointer to actual memory. @@ -882,21 +1088,26 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 { } } - __ Bind(&done); __ B(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; // The register containing the object holding the marked object reference field. const Register obj_; - // The location of the offset of the marked reference field within `obj_`. - Location field_offset_; - + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + size_t scale_factor_; + // Is a null check required? + bool needs_null_check_; + // Should this reference load use Load-Acquire semantics? + bool use_load_acquire_; + // A temporary register used to hold the lock word of `obj_`; and + // also to hold the original reference value, when the reference is + // marked. const Register temp_; - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARM64); + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64); }; // Slow path generating a read barrier for a heap reference. @@ -2425,6 +2636,9 @@ void LocationsBuilderARM64::VisitArrayGet(HArrayGet* instruction) { LocationSummary::kNoCall); if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier. 
+ locations->AddTemp(Location::RequiresRegister()); } locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); @@ -2460,7 +2674,7 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Object ArrayGet with Baker's read barrier case. - Register temp = temps.AcquireW(); + Register temp = WRegisterFrom(locations->GetTemp(0)); // Note that a potential implicit null check is handled in the // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. codegen_->GenerateArrayLoadWithBakerReadBarrier( @@ -5604,14 +5818,35 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( DCHECK(kEmitCompilerReadBarrier); if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when - // Baker's read barrier are used: + // Baker's read barrier are used. + // + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. // - // root = obj.field; // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // if (temp != null) { - // root = temp(root) + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. // } + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. + Register temp = lr; + SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64( + instruction, root, /* entrypoint */ LocationFrom(temp)); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ Ldr(temp, MemOperand(tr, entry_point_offset)); + // /* GcRoot<mirror::Object> */ root = *(obj + offset) if (fixup_label == nullptr) { __ Ldr(root_reg, MemOperand(obj, offset)); @@ -5626,20 +5861,6 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( "art::mirror::CompressedReference<mirror::Object> and int32_t " "have different sizes."); - Register temp = lr; - - // Slow path marking the GC root `root`. The entrypoint will alrady be loaded in temp. - SlowPathCodeARM64* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, - root, - LocationFrom(temp)); - codegen_->AddSlowPath(slow_path); - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg()); - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ Ldr(temp, MemOperand(tr, entry_point_offset)); // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. 
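For GC roots, the decision to mark is taken after the root load: the per-register mark entrypoint doubles as the `GetIsGcMarking()` flag, so one load from the Thread replaces a flag load plus an entrypoint load. Roughly, in a sketch where `MarkFn` and the entrypoint array are hypothetical stand-ins for the thread-local pReadBarrierMarkRegN slots:

    #include <cstdint>

    using MarkFn = uint32_t (*)(uint32_t);

    // `mark_entrypoints` stands in for the per-register pReadBarrierMarkRegN
    // slots in the Thread; each slot is null while the GC is not marking.
    uint32_t LoadGcRoot(const MarkFn* mark_entrypoints, uint32_t* root_addr, int root_reg) {
      MarkFn mark = mark_entrypoints[root_reg];  // Single load, replaces the flag check.
      uint32_t root = *root_addr;                // Original root load.
      if (mark != nullptr) {                     // <=> Thread::Current()->GetIsGcMarking().
        root = mark(root);                       // root = ReadBarrier::Mark(root).
      }
      return root;
    }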
__ Cbnz(temp, slow_path->GetEntryLabel()); @@ -5741,54 +5962,103 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // `instruction->IsArrayGet()` => `!use_load_acquire`. DCHECK(!instruction->IsArrayGet() || !use_load_acquire); - MacroAssembler* masm = GetVIXLAssembler(); - UseScratchRegisterScope temps(masm); - - // In slow path based read barriers, the read barrier call is - // inserted after the original load. However, in fast path based - // Baker's read barriers, we need to perform the load of - // mirror::Object::monitor_ *before* the original reference load. - // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to mark the reference. + // Then, in the slow path, check the gray bit in the lock word of + // the reference's holder (`obj`) to decide whether to mark `ref` or + // not. // - // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // HeapReference<Object> ref = *src; // Original reference load. - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. - // } + // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp2` the read barrier mark entry point + // corresponding to register `ref`. If `temp2` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. // - // Note: the original implementation in ReadBarrier::Barrier is - // slightly more complex as it performs additional checks that we do - // not do here for performance reasons. + // temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp2 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = temp2(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } + // } else { + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // } + + // Slow path marking the object `ref` when the GC is marking. The + // entrypoint will already be loaded in `temp2`. + Register temp2 = lr; + Location temp2_loc = LocationFrom(temp2); + SlowPathCodeARM64* slow_path; + if (always_update_field) { + // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 + // only supports address of the form `obj + field_offset`, where + // `obj` is a register and `field_offset` is a register. Thus + // `offset` and `scale_factor` above are expected to be null in + // this code path. 
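The pseudocode comment above boils down to two nested tests: enter the slow path only while the GC is marking (signalled by a non-null per-register entrypoint), and inside the slow path mark only when the holder is gray. A compact sketch of that control flow, with hypothetical names and an illustrative bit position standing in for LockWord::kReadBarrierStateShift:

    #include <cstdint>

    using MarkFn = uint32_t (*)(uint32_t);

    struct ObjectHeader {
      uint32_t monitor;  // Lock word; the gray bit sits at kReadBarrierStateShift.
    };

    constexpr unsigned kReadBarrierStateShift = 28;  // Illustrative value only.

    uint32_t LoadFieldWithBakerReadBarrier(MarkFn mark_entrypoint,  // Null unless the GC is marking.
                                           ObjectHeader* obj,
                                           uint32_t* field_addr) {
      if (mark_entrypoint == nullptr) {
        return *field_addr;  // Fast path: plain reference load.
      }
      // Slow path: load the lock word *before* the reference, then mark if gray.
      uint32_t lock_word = obj->monitor;
      uint32_t ref = *field_addr;
      bool is_gray = ((lock_word >> kReadBarrierStateShift) & 1u) != 0u;
      return is_gray ? mark_entrypoint(ref) : ref;
    }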
+ DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, 0u); /* "times 1" */ + Location field_offset = index; + slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64( + instruction, + ref, + obj, + offset, + /* index */ field_offset, + scale_factor, + needs_null_check, + use_load_acquire, + temp, + /* entrypoint */ temp2_loc); + } else { + slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + use_load_acquire, + temp, + /* entrypoint */ temp2_loc); + } + AddSlowPath(slow_path); + // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ Ldr(temp2, MemOperand(tr, entry_point_offset)); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ Cbnz(temp2, slow_path->GetEntryLabel()); + // Fast path: just load the reference. + GenerateRawReferenceLoad( + instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorARM64::GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire) { + DCHECK(obj.IsW()); Primitive::Type type = Primitive::kPrimNot; Register ref_reg = RegisterFrom(ref, type); - DCHECK(obj.IsW()); - uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); - - { - // Ensure that between load and MaybeRecordImplicitNullCheck there are no pools emitted. - EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); - // /* int32_t */ monitor = obj->monitor_ - __ Ldr(temp, HeapOperand(obj, monitor_offset)); - if (needs_null_check) { - MaybeRecordImplicitNullCheck(instruction); - } - } - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); - // Introduce a dependency on the lock_word including rb_state, - // to prevent load-load reordering, and without using - // a memory barrier (which would be more expensive). - // `obj` is unchanged by this operation, but its value now depends - // on `temp`. - __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32)); + // If needed, vixl::EmissionCheckScope guards are used to ensure + // that no pools are emitted between the load (macro) instruction + // and MaybeRecordImplicitNullCheck. - // The actual reference load. 
if (index.IsValid()) { // Load types involving an "index": ArrayGet, // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject @@ -5803,59 +6073,50 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* << instruction->AsInvoke()->GetIntrinsic(); DCHECK_EQ(offset, 0u); DCHECK_EQ(scale_factor, 0u); - DCHECK_EQ(needs_null_check, 0u); - // /* HeapReference<Object> */ ref = *(obj + index) + DCHECK_EQ(needs_null_check, false); + // /* HeapReference<mirror::Object> */ ref = *(obj + index) MemOperand field = HeapOperand(obj, XRegisterFrom(index)); LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false); } else { - // ArrayGet and UnsafeGetObject intrinsics cases. - // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + // ArrayGet and UnsafeGetObject and UnsafeCASObject intrinsics cases. + // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor)) if (index.IsConstant()) { uint32_t computed_offset = offset + (Int64ConstantFrom(index) << scale_factor); + EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); Load(type, ref_reg, HeapOperand(obj, computed_offset)); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } } else { - Register temp3 = temps.AcquireW(); - __ Add(temp3, obj, offset); - Load(type, ref_reg, HeapOperand(temp3, XRegisterFrom(index), LSL, scale_factor)); - temps.Release(temp3); + UseScratchRegisterScope temps(GetVIXLAssembler()); + Register temp = temps.AcquireW(); + __ Add(temp, obj, offset); + { + EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); + Load(type, ref_reg, HeapOperand(temp, XRegisterFrom(index), LSL, scale_factor)); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + } } } } else { - // /* HeapReference<Object> */ ref = *(obj + offset) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset) MemOperand field = HeapOperand(obj, offset); if (use_load_acquire) { - LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false); + // Implicit null checks are handled by CodeGeneratorARM64::LoadAcquire. + LoadAcquire(instruction, ref_reg, field, needs_null_check); } else { + EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); Load(type, ref_reg, field); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } } } // Object* ref = ref_addr->AsMirrorPtr() GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); - - // Slow path marking the object `ref` when it is gray. - SlowPathCodeARM64* slow_path; - if (always_update_field) { - // ReadBarrierMarkAndUpdateFieldSlowPathARM64 only supports - // address of the form `obj + field_offset`, where `obj` is a - // register and `field_offset` is a register. Thus `offset` and - // `scale_factor` above are expected to be null in this code path. - DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, 0u); /* "times 1" */ - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM64( - instruction, ref, obj, /* field_offset */ index, temp); - } else { - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref); - } - AddSlowPath(slow_path); - - // if (rb_state == ReadBarrier::GrayState()) - // ref = ReadBarrier::Mark(ref); - // Given the numeric representation, it's enough to check the low bit of the rb_state. 
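All of the cases in GenerateRawReferenceLoad compute the same effective address; they differ only in which parts are constant and whether the load needs acquire semantics. The address itself, as a sketch:

    #include <cstddef>
    #include <cstdint>

    // ref_addr = obj + offset + (index << scale_factor). For a plain field
    // access there is no index, so only the immediate offset contributes.
    uint32_t* RawReferenceAddress(uint8_t* obj,
                                  uint32_t offset,
                                  uint64_t index,
                                  size_t scale_factor) {
      return reinterpret_cast<uint32_t*>(obj + offset + (index << scale_factor));
    }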
- static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Tbnz(temp, LockWord::kReadBarrierStateShift, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); } void CodeGeneratorARM64::GenerateReadBarrierSlow(HInstruction* instruction, diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 24a602400e..7471cd5f12 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -616,8 +616,8 @@ class CodeGeneratorARM64 : public CodeGenerator { Location index, vixl::aarch64::Register temp, bool needs_null_check); - // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier - // and GenerateArrayLoadWithBakerReadBarrier. + // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier, + // GenerateArrayLoadWithBakerReadBarrier and some intrinsics. // // Load the object reference located at the address // `obj + offset + (index << scale_factor)`, held by object `obj`, into @@ -636,6 +636,16 @@ class CodeGeneratorARM64 : public CodeGenerator { bool use_load_acquire, bool always_update_field = false); + // Generate a heap reference load (with no read barrier). + void GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + vixl::aarch64::Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire); + // Generate a read barrier for a heap reference within `instruction` // using a slow path. // diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 58bf2de70b..180db923bf 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -657,52 +657,25 @@ class ArraySetSlowPathARMVIXL : public SlowPathCodeARMVIXL { DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARMVIXL); }; -// Slow path marking an object reference `ref` during a read -// barrier. The field `obj.field` in the object `obj` holding this -// reference does not get updated by this slow path after marking (see -// ReadBarrierMarkAndUpdateFieldSlowPathARM below for that). +// Abstract base class for read barrier slow paths marking a reference +// `ref`. // -// This means that after the execution of this slow path, `ref` will -// always be up-to-date, but `obj.field` may not; i.e., after the -// flip, `ref` will be a to-space reference, but `obj.field` will -// probably still be a from-space reference (unless it gets updated by -// another thread, or if another thread installed another object -// reference (different from `ref`) in `obj.field`). -class ReadBarrierMarkSlowPathARMVIXL : public SlowPathCodeARMVIXL { - public: - ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction, - Location ref, - Location entrypoint = Location::NoLocation()) +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class ReadBarrierMarkSlowPathBaseARMVIXL : public SlowPathCodeARMVIXL { + protected: + ReadBarrierMarkSlowPathBaseARMVIXL(HInstruction* instruction, Location ref, Location entrypoint) : SlowPathCodeARMVIXL(instruction), ref_(ref), entrypoint_(entrypoint) { DCHECK(kEmitCompilerReadBarrier); } - const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARMVIXL"; } + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARMVIXL"; } - void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { - LocationSummary* locations = instruction_->GetLocations(); + // Generate assembly code calling the read barrier marking runtime + // entry point (ReadBarrierMarkRegX). + void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) { vixl32::Register ref_reg = RegisterFrom(ref_); - DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg; - DCHECK(instruction_->IsInstanceFieldGet() || - instruction_->IsStaticFieldGet() || - instruction_->IsArrayGet() || - instruction_->IsArraySet() || - instruction_->IsLoadClass() || - instruction_->IsLoadString() || - instruction_->IsInstanceOf() || - instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || - (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) - << "Unexpected instruction in read barrier marking slow path: " - << instruction_->DebugName(); - // The read barrier instrumentation of object ArrayGet - // instructions does not support the HIntermediateAddress - // instruction. - DCHECK(!(instruction_->IsArrayGet() && - instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); - __ Bind(GetEntryLabel()); // No need to save live registers; it's taken care of by the // entrypoint. Also, there is no need to update the stack mask, // as this runtime call will not trigger a garbage collection. @@ -732,53 +705,258 @@ class ReadBarrierMarkSlowPathARMVIXL : public SlowPathCodeARMVIXL { arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); __ Blx(RegisterFrom(entrypoint_)); } else { + // Entrypoint is not already loaded, load from the thread. int32_t entry_point_offset = CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode()); // This runtime call does not require a stack map. arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); } - __ B(GetExitLabel()); } - private: // The location (register) of the marked object reference. const Location ref_; // The location of the entrypoint if already loaded. const Location entrypoint_; + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARMVIXL); +}; + +// Slow path marking an object reference `ref` during a read +// barrier. The field `obj.field` in the object `obj` holding this +// reference does not get updated by this slow path after marking. +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// If `entrypoint` is a valid location it is assumed to already be +// holding the entrypoint. 
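GenerateReadBarrierMarkRuntimeCall picks one of a family of per-register entrypoints so that the register holding `ref` is both input and output, saving the two moves a standard R0-based runtime call would need. The offset lookup it relies on is essentially base-plus-stride, as in this sketch where `mark_reg00_offset` is a placeholder rather than the actual Thread layout:

    #include <cstddef>

    // Offset of pReadBarrierMarkRegN within the Thread, for register number
    // `reg`. `mark_reg00_offset` stands in for the offset of
    // pReadBarrierMarkReg00, which comes from the Thread's entrypoint table.
    constexpr size_t ReadBarrierMarkEntryPointOffset(size_t mark_reg00_offset,
                                                     size_t pointer_size,
                                                     unsigned reg) {
      return mark_reg00_offset + reg * pointer_size;
    }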
The case where the entrypoint is passed in +// is when the decision to mark is based on whether the GC is marking. +class ReadBarrierMarkSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL { + public: + ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction, + Location ref, + Location entrypoint = Location::NoLocation()) + : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARMVIXL"; } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(ref_.IsRegister()) << ref_; + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg(); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + GenerateReadBarrierMarkRuntimeCall(codegen); + __ B(GetExitLabel()); + } + + private: DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARMVIXL); }; -// Slow path marking an object reference `ref` during a read barrier, -// and if needed, atomically updating the field `obj.field` in the -// object `obj` holding this reference after marking (contrary to -// ReadBarrierMarkSlowPathARM above, which never tries to update -// `obj.field`). +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). The field `obj.field` in the object `obj` holding +// this reference does not get updated by this slow path after marking +// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL +// below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class LoadReferenceWithBakerReadBarrierSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL { + public: + LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(HInstruction* instruction, + Location ref, + vixl32::Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check, + vixl32::Register temp, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint), + obj_(obj), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), + temp_(temp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "LoadReferenceWithBakerReadBarrierSlowPathARMVIXL"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + vixl32::Register ref_reg = RegisterFrom(ref_); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg; + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsArraySet() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + // The read barrier instrumentation of object ArrayGet + // instructions does not support the HIntermediateAddress + // instruction. + DCHECK(!(instruction_->IsArrayGet() && + instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); + + // Temporary register `temp_`, used to store the lock word, must + // not be IP, as we may use it to emit the reference load (in the + // call to GenerateRawReferenceLoad below), and we need the lock + // word to still be in `temp_` after the reference load. + DCHECK(!temp_.Is(ip)); + + __ Bind(GetEntryLabel()); + + // When using MaybeGenerateReadBarrierSlow, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } + // + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. 
+ + CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen); + + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + arm_codegen->GetAssembler()->LoadFromOffset(kLoadWord, temp_, obj_, monitor_offset); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj_, obj_, Operand(temp_, ShiftType::LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + arm_codegen->GenerateRawReferenceLoad( + instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false); + + // Mark the object `ref` when `obj` is gray. + // + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp_, temp_, LockWord::kReadBarrierStateShift + 1); + __ B(cc, GetExitLabel()); // Carry flag is the last bit shifted out by LSRS. + GenerateReadBarrierMarkRuntimeCall(codegen); + + __ B(GetExitLabel()); + } + + private: + // The register containing the object holding the marked object reference field. + vixl32::Register obj_; + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + ScaleFactor scale_factor_; + // Is a null check required? + bool needs_null_check_; + // A temporary register used to hold the lock word of `obj_`. + vixl32::Register temp_; + + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARMVIXL); +}; + +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). If needed, this slow path also atomically updates +// the field `obj.field` in the object `obj` holding this reference +// after marking (contrary to +// LoadReferenceWithBakerReadBarrierSlowPathARMVIXL above, which never +// tries to update `obj.field`). // // This means that after the execution of this slow path, both `ref` // and `obj.field` will be up-to-date; i.e., after the flip, both will // hold the same to-space reference (unless another thread installed // another object reference (different from `ref`) in `obj.field`). -class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL { +// +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
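The `Lsrs` / `B(cc)` pair above is a Thumb-friendly way to test a single bit: shifting right by kReadBarrierStateShift + 1 drops the gray bit into the carry flag, so carry clear means white (fall through to the exit) and carry set means gray (mark). The predicate it computes, as a sketch:

    #include <cstdint>

    // After `lock_word >> (shift + 1)`, the ARM carry flag holds bit `shift`
    // of the original lock word, i.e. the read barrier (gray) state.
    bool HolderIsGray(uint32_t lock_word, unsigned read_barrier_state_shift) {
      return ((lock_word >> read_barrier_state_shift) & 1u) != 0u;
    }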
+class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL + : public ReadBarrierMarkSlowPathBaseARMVIXL { public: - ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction, - Location ref, - vixl32::Register obj, - Location field_offset, - vixl32::Register temp1, - vixl32::Register temp2) - : SlowPathCodeARMVIXL(instruction), - ref_(ref), + LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction, + Location ref, + vixl32::Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check, + vixl32::Register temp1, + vixl32::Register temp2, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint), obj_(obj), - field_offset_(field_offset), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), temp1_(temp1), temp2_(temp2) { DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); } const char* GetDescription() const OVERRIDE { - return "ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL"; + return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL"; } void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { @@ -786,64 +964,83 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL vixl32::Register ref_reg = RegisterFrom(ref_); DCHECK(locations->CanCall()); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg; - // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK_NE(ref_.reg(), LocationFrom(temp1_).reg()); + + // This slow path is only used by the UnsafeCASObject intrinsic at the moment. DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking and field updating slow path: " << instruction_->DebugName(); DCHECK(instruction_->GetLocations()->Intrinsified()); DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); - DCHECK(field_offset_.IsRegisterPair()) << field_offset_; + DCHECK_EQ(offset_, 0u); + DCHECK_EQ(scale_factor_, ScaleFactor::TIMES_1); + Location field_offset = index_; + DCHECK(field_offset.IsRegisterPair()) << field_offset; + + // Temporary register `temp1_`, used to store the lock word, must + // not be IP, as we may use it to emit the reference load (in the + // call to GenerateRawReferenceLoad below), and we need the lock + // word to still be in `temp1_` after the reference load. + DCHECK(!temp1_.Is(ip)); __ Bind(GetEntryLabel()); - // Save the old reference. + CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen); + + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + arm_codegen->GetAssembler()->LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj_, obj_, Operand(temp1_, ShiftType::LSR, 32)); + + // The actual reference load. 
+ // A possible implicit null check has already been handled above. + arm_codegen->GenerateRawReferenceLoad( + instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false); + + // Mark the object `ref` when `obj` is gray. + // + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp1_, temp1_, LockWord::kReadBarrierStateShift + 1); + __ B(cc, GetExitLabel()); // Carry flag is the last bit shifted out by LSRS. + + // Save the old value of the reference before marking it. // Note that we cannot use IP to save the old reference, as IP is // used internally by the ReadBarrierMarkRegX entry point, and we // need the old reference after the call to that entry point. DCHECK(!temp1_.Is(ip)); __ Mov(temp1_, ref_reg); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen); - DCHECK(!ref_reg.Is(sp)); - DCHECK(!ref_reg.Is(lr)); - DCHECK(!ref_reg.Is(pc)); - // IP is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. - DCHECK(!ref_reg.Is(ip)); - DCHECK(ref_reg.IsRegister()) << ref_reg; - // "Compact" slow path, saving two moves. - // - // Instead of using the standard runtime calling convention (input - // and output in R0): - // - // R0 <- ref - // R0 <- ReadBarrierMark(R0) - // ref <- R0 - // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: - // - // rX <- ReadBarrierMarkRegX(rX) - // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode()); - // This runtime call does not require a stack map. - arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + GenerateReadBarrierMarkRuntimeCall(codegen); // If the new reference is different from the old reference, - // update the field in the holder (`*(obj_ + field_offset_)`). + // update the field in the holder (`*(obj_ + field_offset)`). // // Note that this field could also hold a different object, if // another thread had concurrently changed it. In that case, the // LDREX/SUBS/ITNE sequence of instructions in the compare-and-set // (CAS) operation below would abort the CAS, leaving the field // as-is. - vixl32::Label done; __ Cmp(temp1_, ref_reg); - __ B(eq, &done, /* far_target */ false); + __ B(eq, GetExitLabel()); // Update the the holder's field atomically. This may fail if // mutator updates before us, but it's OK. This is achieved @@ -857,7 +1054,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL // The UnsafeCASObject intrinsic uses a register pair as field // offset ("long offset"), of which only the low part contains // data. 
- vixl32::Register offset = LowRegisterFrom(field_offset_); + vixl32::Register offset = LowRegisterFrom(field_offset); vixl32::Register expected = temp1_; vixl32::Register value = ref_reg; vixl32::Register tmp_ptr = temps.Acquire(); // Pointer to actual memory. @@ -913,22 +1110,27 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL } } - __ Bind(&done); __ B(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; // The register containing the object holding the marked object reference field. const vixl32::Register obj_; - // The location of the offset of the marked reference field within `obj_`. - Location field_offset_; - + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + ScaleFactor scale_factor_; + // Is a null check required? + bool needs_null_check_; + // A temporary register used to hold the lock word of `obj_`; and + // also to hold the original reference value, when the reference is + // marked. const vixl32::Register temp1_; + // A temporary register used in the implementation of the CAS, to + // update the object's reference field. const vixl32::Register temp2_; - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL); + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL); }; // Slow path generating a read barrier for a heap reference. @@ -7261,14 +7463,35 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( DCHECK(kEmitCompilerReadBarrier); if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when - // Baker's read barrier are used: + // Baker's read barrier are used. + // + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. // - // root = obj.field; // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // if (temp != null) { - // root = temp(root) + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. // } + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. + Location temp = LocationFrom(lr); + SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL( + instruction, root, /* entrypoint */ temp); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. 
+ GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); + // /* GcRoot<mirror::Object> */ root = *(obj + offset) GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset); static_assert( @@ -7279,21 +7502,6 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( "art::mirror::CompressedReference<mirror::Object> and int32_t " "have different sizes."); - // Slow path marking the GC root `root`. - Location temp = LocationFrom(lr); - SlowPathCodeARMVIXL* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL( - instruction, - root, - /*entrypoint*/ temp); - codegen_->AddSlowPath(slow_path); - - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel()); @@ -7364,55 +7572,114 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); - // In slow path based read barriers, the read barrier call is - // inserted after the original load. However, in fast path based - // Baker's read barriers, we need to perform the load of - // mirror::Object::monitor_ *before* the original reference load. - // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to mark the reference. + // Then, in the slow path, check the gray bit in the lock word of + // the reference's holder (`obj`) to decide whether to mark `ref` or + // not. // - // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // HeapReference<Object> ref = *src; // Original reference load. - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. - // } + // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp3` the read barrier mark entry point + // corresponding to register `ref`. If `temp3` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. // - // Note: the original implementation in ReadBarrier::Barrier is - // slightly more complex as it performs additional checks that we do - // not do here for performance reasons. + // temp3 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp3 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = temp3(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. 
+ // } + // } else { + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // } - vixl32::Register ref_reg = RegisterFrom(ref); vixl32::Register temp_reg = RegisterFrom(temp); - uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); - // /* int32_t */ monitor = obj->monitor_ - GetAssembler()->LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); - if (needs_null_check) { - MaybeRecordImplicitNullCheck(instruction); + // Slow path marking the object `ref` when the GC is marking. The + // entrypoint will already be loaded in `temp3`. + Location temp3 = LocationFrom(lr); + SlowPathCodeARMVIXL* slow_path; + if (always_update_field) { + DCHECK(temp2 != nullptr); + // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL + // only supports address of the form `obj + field_offset`, where + // `obj` is a register and `field_offset` is a register pair (of + // which only the lower half is used). Thus `offset` and + // `scale_factor` above are expected to be null in this code path. + DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); + Location field_offset = index; + slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL( + instruction, + ref, + obj, + offset, + /* index */ field_offset, + scale_factor, + needs_null_check, + temp_reg, + *temp2, + /* entrypoint */ temp3); + } else { + slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + temp_reg, + /* entrypoint */ temp3); } - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); + AddSlowPath(slow_path); + + // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp3), tr, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(RegisterFrom(temp3), slow_path->GetEntryLabel()); + // Fast path: just load the reference. + GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check); + __ Bind(slow_path->GetExitLabel()); +} - // Introduce a dependency on the lock_word including the rb_state, - // which shall prevent load-load reordering without using - // a memory barrier (which would be more expensive). - // `obj` is unchanged by this operation, but its value now depends - // on `temp_reg`. - __ Add(obj, obj, Operand(temp_reg, ShiftType::LSR, 32)); +void CodeGeneratorARMVIXL::GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + vixl::aarch32::Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check) { + Primitive::Type type = Primitive::kPrimNot; + vixl32::Register ref_reg = RegisterFrom(ref, type); + + // If needed, vixl::EmissionCheckScope guards are used to ensure + // that no pools are emitted between the load (macro) instruction + // and MaybeRecordImplicitNullCheck. - // The actual reference load. 
if (index.IsValid()) { // Load types involving an "index": ArrayGet, // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject // intrinsics. - // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor)) if (index.IsConstant()) { size_t computed_offset = (Int32ConstantFrom(index) << scale_factor) + offset; + vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } } else { // Handle the special case of the // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject @@ -7422,46 +7689,27 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio ? LowRegisterFrom(index) : RegisterFrom(index); UseScratchRegisterScope temps(GetVIXLAssembler()); - const vixl32::Register temp3 = temps.Acquire(); - __ Add(temp3, obj, Operand(index_reg, ShiftType::LSL, scale_factor)); - GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp3, offset); + vixl32::Register temp = temps.Acquire(); + __ Add(temp, obj, Operand(index_reg, ShiftType::LSL, scale_factor)); + { + vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); + GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp, offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + } } } else { - // /* HeapReference<Object> */ ref = *(obj + offset) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset) + vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } } // Object* ref = ref_addr->AsMirrorPtr() GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); - - // Slow path marking the object `ref` when it is gray. - SlowPathCodeARMVIXL* slow_path; - if (always_update_field) { - DCHECK(temp2 != nullptr); - // ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL only supports address - // of the form `obj + field_offset`, where `obj` is a register and - // `field_offset` is a register pair (of which only the lower half - // is used). Thus `offset` and `scale_factor` above are expected - // to be null in this code path. - DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL( - instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2); - } else { - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(instruction, ref); - } - AddSlowPath(slow_path); - - // if (rb_state == ReadBarrier::GrayState()) - // ref = ReadBarrier::Mark(ref); - // Given the numeric representation, it's enough to check the low bit of the - // rb_state. We do that by shifting the bit out of the lock word with LSRS - // which can be a 16-bit instruction unlike the TST immediate. - static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1); - __ B(cs, slow_path->GetEntryLabel()); // Carry flag is the last bit shifted out by LSRS. 
- __ Bind(slow_path->GetExitLabel()); } void CodeGeneratorARMVIXL::GenerateReadBarrierSlow(HInstruction* instruction, diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h index 92e922d8f9..5ff7dd69e7 100644 --- a/compiler/optimizing/code_generator_arm_vixl.h +++ b/compiler/optimizing/code_generator_arm_vixl.h @@ -45,6 +45,11 @@ static constexpr bool kArmUseVIXL32 = true; namespace art { namespace arm { +// This constant is used as an approximate margin when emission of veneer and literal pools +// must be blocked. +static constexpr int kMaxMacroInstructionSizeInBytes = + 15 * vixl::aarch32::kMaxInstructionSizeInBytes; + static const vixl::aarch32::Register kParameterCoreRegistersVIXL[] = { vixl::aarch32::r1, vixl::aarch32::r2, @@ -625,6 +630,15 @@ class CodeGeneratorARMVIXL : public CodeGenerator { bool always_update_field = false, vixl::aarch32::Register* temp2 = nullptr); + // Generate a heap reference load (with no read barrier). + void GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + vixl::aarch32::Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check); + // Generate a read barrier for a heap reference within `instruction` // using a slow path. // diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 86000e9356..28095c4d3f 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -1947,6 +1947,8 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { } if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + // The base destination address is computed later, as `temp2` is // used for intermediate computations. diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 6c3938c1a9..934ba1b9fb 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -853,7 +853,6 @@ static void GenUnsafeGet(HInvoke* invoke, DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong) || (type == Primitive::kPrimNot)); - MacroAssembler* masm = codegen->GetVIXLAssembler(); Location base_loc = locations->InAt(1); Register base = WRegisterFrom(base_loc); // Object pointer. Location offset_loc = locations->InAt(2); @@ -863,8 +862,7 @@ static void GenUnsafeGet(HInvoke* invoke, if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case. - UseScratchRegisterScope temps(masm); - Register temp = temps.AcquireW(); + Register temp = WRegisterFrom(locations->GetTemp(0)); codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke, trg_loc, base, @@ -901,6 +899,9 @@ static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke kIntrinsified); if (can_call && kUseBakerReadBarrier) { locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); } locations->SetInAt(0, Location::NoLocation()); // Unused receiver. 
locations->SetInAt(1, Location::RequiresRegister()); @@ -2381,9 +2382,14 @@ void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { // Temporary register IP0, obtained from the VIXL scratch register // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64 // (because that register is clobbered by ReadBarrierMarkRegX - // entry points). Get an extra temporary register from the - // register allocator. + // entry points). It cannot be used in calls to + // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier + // either. For these reasons, get a third extra temporary register + // from the register allocator. locations->AddTemp(Location::RequiresRegister()); + } else { + // Cases other than Baker read barriers: the third temporary will + // be acquired from the VIXL scratch register pool. } } @@ -2494,11 +2500,12 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { // We use a block to end the scratch scope before the write barrier, thus // freeing the temporary registers so they can be used in `MarkGCCard`. UseScratchRegisterScope temps(masm); - // Note: Because it is acquired from VIXL's scratch register pool, - // `temp3` might be IP0, and thus cannot be used as `ref` argument - // of CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier - // calls below (see ReadBarrierMarkSlowPathARM64 for more details). - Register temp3 = temps.AcquireW(); + Register temp3; + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + temp3 = WRegisterFrom(locations->GetTemp(2)); + } else { + temp3 = temps.AcquireW(); + } if (!optimizations.GetDoesNotNeedTypeCheck()) { // Check whether all elements of the source array are assignable to the component @@ -2704,19 +2711,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { Register src_curr_addr = temp1.X(); Register dst_curr_addr = temp2.X(); - Register src_stop_addr; - if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { - // Temporary register IP0, obtained from the VIXL scratch - // register pool as `temp3`, cannot be used in - // ReadBarrierSystemArrayCopySlowPathARM64 (because that - // register is clobbered by ReadBarrierMarkRegX entry points). - // So another temporary register allocated by the register - // allocator instead. - DCHECK_EQ(LocationFrom(temp3).reg(), IP0); - src_stop_addr = XRegisterFrom(locations->GetTemp(2)); - } else { - src_stop_addr = temp3.X(); - } + Register src_stop_addr = temp3.X(); GenSystemArrayCopyAddresses(masm, Primitive::kPrimNot, @@ -2732,6 +2727,8 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + // SystemArrayCopy implementation for Baker read barriers (see // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): // @@ -2758,10 +2755,11 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { __ Cmp(src_curr_addr, src_stop_addr); __ B(&done, eq); - Register tmp = temps.AcquireW(); // Make sure `tmp` is not IP0, as it is clobbered by // ReadBarrierMarkRegX entry points in // ReadBarrierSystemArrayCopySlowPathARM64. 
+ temps.Exclude(ip0); + Register tmp = temps.AcquireW(); DCHECK_NE(LocationFrom(tmp).reg(), IP0); // /* int32_t */ monitor = src->monitor_ diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index aa89deae34..60bcf2cfd5 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -2265,6 +2265,8 @@ void IntrinsicCodeGeneratorARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) { } if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + // The base destination address is computed later, as `temp2` is // used for intermediate computations. |