-rw-r--r--  compiler/optimizing/code_generator_arm.cc      | 590
-rw-r--r--  compiler/optimizing/code_generator_arm.h       |  12
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc    | 659
-rw-r--r--  compiler/optimizing/code_generator_arm64.h     |  14
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.cc | 616
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.h  |  14
-rw-r--r--  compiler/optimizing/intrinsics_arm.cc          |   2
-rw-r--r--  compiler/optimizing/intrinsics_arm64.cc        |  46
-rw-r--r--  compiler/optimizing/intrinsics_arm_vixl.cc     |   2
-rw-r--r--  runtime/lock_word.h                            |   3
-rw-r--r--  runtime/type_lookup_table.h                    |   2
11 files changed, 1361 insertions, 599 deletions
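
The diff below restructures the Baker read barrier code paths in the ARM, ARM64 and ARM (VIXL) code generators: the marking slow paths now share a ReadBarrierMarkSlowPathBase* base class, the lock-word (gray-bit) check moves into the new LoadReferenceWithBakerReadBarrier* slow paths, and the decision to enter them is made by testing the per-register mark entrypoint, which the runtime leaves null while the GC is not marking. As a rough orientation before the diff itself, here is a minimal C++ sketch of that control flow in one place; the types, constants and helper names in it are simplified placeholders for illustration, not the actual ART declarations.

// Simplified stand-ins for art::mirror::Object and the lock word; the real
// layouts live in runtime/mirror/object.h and runtime/lock_word.h.
#include <cstdint>

struct Object {
  uint32_t monitor_;  // Lock word; one bit of it is the read barrier state.
  Object* field_;     // The reference field being loaded.
};

// Shape of the per-register ReadBarrierMarkRegX entrypoints.
using MarkEntrypoint = Object* (*)(Object*);

constexpr uint32_t kReadBarrierStateShift = 28;  // Placeholder bit position.
constexpr uint32_t kGrayState = 1u;              // "Expecting gray to have value 1."

// Control flow emitted by GenerateReferenceLoadWithBakerReadBarrier after this
// change. `entrypoint` models Thread::Current()->pReadBarrierMarkReg ## ref.reg(),
// loaded into a temporary register; it is null whenever the GC is not marking.
Object* LoadReferenceWithBakerReadBarrier(Object* obj, MarkEntrypoint entrypoint) {
  if (entrypoint == nullptr) {
    // Fast path: GC not marking, plain reference load (GenerateRawReferenceLoad).
    return obj->field_;
  }
  // Slow path: load the lock word *before* the reference. The generated code
  // enforces this load-load order with an address dependency
  // (`add obj, obj, temp, LSR #32`) instead of a memory barrier.
  uint32_t rb_state = (obj->monitor_ >> kReadBarrierStateShift) & 1u;
  Object* ref = obj->field_;  // Original reference load.
  if (rb_state == kGrayState) {
    ref = entrypoint(ref);    // ref = ReadBarrier::Mark(ref); ReadBarrierMarkRegX call.
  }
  return ref;
}

Testing the entrypoint instead of calling GetIsGcMarking() saves one load when the GC is idle, and moving the gray-bit test into the slow path keeps the common case down to the raw reference load; the *AndUpdateField variants additionally CAS the marked reference back into `obj.field` for the UnsafeCASObject intrinsic.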
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 511bd9b7ef..2b0ab3e20e 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -636,10 +636,75 @@ class ArraySetSlowPathARM : public SlowPathCodeARM { DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARM); }; +// Abstract base class for read barrier slow paths marking a reference +// `ref`. +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. +class ReadBarrierMarkSlowPathBaseARM : public SlowPathCodeARM { + protected: + ReadBarrierMarkSlowPathBaseARM(HInstruction* instruction, Location ref, Location entrypoint) + : SlowPathCodeARM(instruction), ref_(ref), entrypoint_(entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARM"; } + + // Generate assembly code calling the read barrier marking runtime + // entry point (ReadBarrierMarkRegX). + void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) { + Register ref_reg = ref_.AsRegister<Register>(); + + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + DCHECK_NE(ref_reg, SP); + DCHECK_NE(ref_reg, LR); + DCHECK_NE(ref_reg, PC); + // IP is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(ref_reg, IP); + DCHECK(0 <= ref_reg && ref_reg < kNumberOfCoreRegisters) << ref_reg; + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in R0): + // + // R0 <- ref + // R0 <- ReadBarrierMark(R0) + // ref <- R0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + if (entrypoint_.IsValid()) { + arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + __ blx(entrypoint_.AsRegister<Register>()); + } else { + // Entrypoint is not already loaded, load from the thread. + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg); + // This runtime call does not require a stack map. + arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + } + } + + // The location (register) of the marked object reference. + const Location ref_; + + // The location of the entrypoint if it is already loaded. + const Location entrypoint_; + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARM); +}; + // Slow path marking an object reference `ref` during a read // barrier. The field `obj.field` in the object `obj` holding this -// reference does not get updated by this slow path after marking (see -// ReadBarrierMarkAndUpdateFieldSlowPathARM below for that). +// reference does not get updated by this slow path after marking. // // This means that after the execution of this slow path, `ref` will // always be up-to-date, but `obj.field` may not; i.e., after the @@ -650,13 +715,13 @@ class ArraySetSlowPathARM : public SlowPathCodeARM { // // If `entrypoint` is a valid location it is assumed to already be // holding the entrypoint. 
The case where the entrypoint is passed in -// is for the GcRoot read barrier. -class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { +// is when the decision to mark is based on whether the GC is marking. +class ReadBarrierMarkSlowPathARM : public ReadBarrierMarkSlowPathBaseARM { public: ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location ref, Location entrypoint = Location::NoLocation()) - : SlowPathCodeARM(instruction), ref_(ref), entrypoint_(entrypoint) { + : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint) { DCHECK(kEmitCompilerReadBarrier); } @@ -664,15 +729,77 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + if (kIsDebugBuild) { + Register ref_reg = ref_.AsRegister<Register>(); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + } + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + GenerateReadBarrierMarkRuntimeCall(codegen); + __ b(GetExitLabel()); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM); +}; + +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). The field `obj.field` in the object `obj` holding +// this reference does not get updated by this slow path after marking +// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM +// below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class LoadReferenceWithBakerReadBarrierSlowPathARM : public ReadBarrierMarkSlowPathBaseARM { + public: + LoadReferenceWithBakerReadBarrierSlowPathARM(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check, + Register temp, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint), + obj_(obj), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), + temp_(temp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "LoadReferenceWithBakerReadBarrierSlowPathARM"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); Register ref_reg = ref_.AsRegister<Register>(); DCHECK(locations->CanCall()); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; + DCHECK_NE(ref_reg, temp_); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || instruction_->IsArraySet() || - instruction_->IsLoadClass() || - instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || @@ -686,145 +813,202 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); __ Bind(GetEntryLabel()); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); - DCHECK_NE(ref_reg, SP); - DCHECK_NE(ref_reg, LR); - DCHECK_NE(ref_reg, PC); - // IP is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. - DCHECK_NE(ref_reg, IP); - DCHECK(0 <= ref_reg && ref_reg < kNumberOfCoreRegisters) << ref_reg; - // "Compact" slow path, saving two moves. + + // When using MaybeGenerateReadBarrierSlow, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: // - // Instead of using the standard runtime calling convention (input - // and output in R0): + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } // - // R0 <- ref - // R0 <- ReadBarrierMark(R0) - // ref <- R0 - // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. 
+ + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + __ LoadFromOffset(kLoadWord, temp_, obj_, monitor_offset); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ add(obj_, obj_, ShifterOperand(temp_, LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + arm_codegen->GenerateRawReferenceLoad( + instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false); + + // Mark the object `ref` when `obj` is gray. // - // rX <- ReadBarrierMarkRegX(rX) + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); // - if (entrypoint_.IsValid()) { - arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); - __ blx(entrypoint_.AsRegister<Register>()); - } else { - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg); - // This runtime call does not require a stack map. - arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); - } + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp_, temp_, LockWord::kReadBarrierStateShift + 1); + __ b(GetExitLabel(), CC); // Carry flag is the last bit shifted out by LSRS. + GenerateReadBarrierMarkRuntimeCall(codegen); + __ b(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; - - // The location of the entrypoint if already loaded. - const Location entrypoint_; - - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM); + // The register containing the object holding the marked object reference field. + Register obj_; + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + ScaleFactor scale_factor_; + // Is a null check required? + bool needs_null_check_; + // A temporary register used to hold the lock word of `obj_`. + Register temp_; + + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARM); }; -// Slow path marking an object reference `ref` during a read barrier, -// and if needed, atomically updating the field `obj.field` in the -// object `obj` holding this reference after marking (contrary to -// ReadBarrierMarkSlowPathARM above, which never tries to update -// `obj.field`). +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). 
If needed, this slow path also atomically updates +// the field `obj.field` in the object `obj` holding this reference +// after marking (contrary to +// LoadReferenceWithBakerReadBarrierSlowPathARM above, which never +// tries to update `obj.field`). // // This means that after the execution of this slow path, both `ref` // and `obj.field` will be up-to-date; i.e., after the flip, both will // hold the same to-space reference (unless another thread installed // another object reference (different from `ref`) in `obj.field`). -class ReadBarrierMarkAndUpdateFieldSlowPathARM : public SlowPathCodeARM { +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. +class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM + : public ReadBarrierMarkSlowPathBaseARM { public: - ReadBarrierMarkAndUpdateFieldSlowPathARM(HInstruction* instruction, - Location ref, - Register obj, - Location field_offset, - Register temp1, - Register temp2) - : SlowPathCodeARM(instruction), - ref_(ref), + LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check, + Register temp1, + Register temp2, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARM(instruction, ref, entrypoint), obj_(obj), - field_offset_(field_offset), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), temp1_(temp1), temp2_(temp2) { DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); } - const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkAndUpdateFieldSlowPathARM"; } + const char* GetDescription() const OVERRIDE { + return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM"; + } void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { LocationSummary* locations = instruction_->GetLocations(); Register ref_reg = ref_.AsRegister<Register>(); DCHECK(locations->CanCall()); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg)) << ref_reg; - // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK_NE(ref_reg, temp1_); + + // This slow path is only used by the UnsafeCASObject intrinsic at the moment. DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking and field updating slow path: " << instruction_->DebugName(); DCHECK(instruction_->GetLocations()->Intrinsified()); DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); - DCHECK(field_offset_.IsRegisterPair()) << field_offset_; + DCHECK_EQ(offset_, 0u); + DCHECK_EQ(scale_factor_, ScaleFactor::TIMES_1); + // The location of the offset of the marked reference field within `obj_`. + Location field_offset = index_; + DCHECK(field_offset.IsRegisterPair()) << field_offset; __ Bind(GetEntryLabel()); - // Save the old reference. 
+ // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + __ LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp1`. + __ add(obj_, obj_, ShifterOperand(temp1_, LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); + arm_codegen->GenerateRawReferenceLoad( + instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false); + + // Mark the object `ref` when `obj` is gray. + // + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp1_, temp1_, LockWord::kReadBarrierStateShift + 1); + __ b(GetExitLabel(), CC); // Carry flag is the last bit shifted out by LSRS. + + // Save the old value of the reference before marking it. // Note that we cannot use IP to save the old reference, as IP is // used internally by the ReadBarrierMarkRegX entry point, and we // need the old reference after the call to that entry point. DCHECK_NE(temp1_, IP); __ Mov(temp1_, ref_reg); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen); - DCHECK_NE(ref_reg, SP); - DCHECK_NE(ref_reg, LR); - DCHECK_NE(ref_reg, PC); - // IP is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. - DCHECK_NE(ref_reg, IP); - DCHECK(0 <= ref_reg && ref_reg < kNumberOfCoreRegisters) << ref_reg; - // "Compact" slow path, saving two moves. - // - // Instead of using the standard runtime calling convention (input - // and output in R0): - // - // R0 <- ref - // R0 <- ReadBarrierMark(R0) - // ref <- R0 - // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: - // - // rX <- ReadBarrierMarkRegX(rX) - // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg); - // This runtime call does not require a stack map. - arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + GenerateReadBarrierMarkRuntimeCall(codegen); // If the new reference is different from the old reference, - // update the field in the holder (`*(obj_ + field_offset_)`). + // update the field in the holder (`*(obj_ + field_offset)`). 
// // Note that this field could also hold a different object, if // another thread had concurrently changed it. In that case, the // LDREX/SUBS/ITNE sequence of instructions in the compare-and-set // (CAS) operation below would abort the CAS, leaving the field // as-is. - Label done; __ cmp(temp1_, ShifterOperand(ref_reg)); - __ b(&done, EQ); + __ b(GetExitLabel(), EQ); // Update the the holder's field atomically. This may fail if // mutator updates before us, but it's OK. This is achieved @@ -837,7 +1021,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM : public SlowPathCodeARM { // The UnsafeCASObject intrinsic uses a register pair as field // offset ("long offset"), of which only the low part contains // data. - Register offset = field_offset_.AsRegisterPairLow<Register>(); + Register offset = field_offset.AsRegisterPairLow<Register>(); Register expected = temp1_; Register value = ref_reg; Register tmp_ptr = IP; // Pointer to actual memory. @@ -887,22 +1071,27 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM : public SlowPathCodeARM { } } - __ Bind(&done); __ b(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; // The register containing the object holding the marked object reference field. const Register obj_; - // The location of the offset of the marked reference field within `obj_`. - Location field_offset_; - + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + ScaleFactor scale_factor_; + // Is a null check required? + bool needs_null_check_; + // A temporary register used to hold the lock word of `obj_`; and + // also to hold the original reference value, when the reference is + // marked. const Register temp1_; + // A temporary register used in the implementation of the CAS, to + // update the object's reference field. const Register temp2_; - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARM); + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM); }; // Slow path generating a read barrier for a heap reference. @@ -7183,14 +7372,35 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct DCHECK(kEmitCompilerReadBarrier); if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when - // Baker's read barrier are used: + // Baker's read barrier are used. + // + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. // - // root = obj.field; // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // if (temp != null) { - // root = temp(root) + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. // } + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. 
+ Location temp = Location::RegisterLocation(LR); + SlowPathCodeARM* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( + instruction, root, /* entrypoint */ temp); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); + // /* GcRoot<mirror::Object> */ root = *(obj + offset) __ LoadFromOffset(kLoadWord, root_reg, obj, offset); static_assert( @@ -7201,21 +7411,6 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct "art::mirror::CompressedReference<mirror::Object> and int32_t " "have different sizes."); - // Slow path marking the GC root `root`. - Location temp = Location::RegisterLocation(LR); - SlowPathCodeARM* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( - instruction, - root, - /*entrypoint*/ temp); - codegen_->AddSlowPath(slow_path); - - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ LoadFromOffset(kLoadWord, temp.AsRegister<Register>(), TR, entry_point_offset); // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. __ CompareAndBranchIfNonZero(temp.AsRegister<Register>(), slow_path->GetEntryLabel()); @@ -7286,51 +7481,101 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); - // In slow path based read barriers, the read barrier call is - // inserted after the original load. However, in fast path based - // Baker's read barriers, we need to perform the load of - // mirror::Object::monitor_ *before* the original reference load. - // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to mark the reference. + // Then, in the slow path, check the gray bit in the lock word of + // the reference's holder (`obj`) to decide whether to mark `ref` or + // not. // - // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // HeapReference<Object> ref = *src; // Original reference load. - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. - // } + // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp3` the read barrier mark entry point + // corresponding to register `ref`. If `temp3` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. // - // Note: the original implementation in ReadBarrier::Barrier is - // slightly more complex as it performs additional checks that we do - // not do here for performance reasons. 
+ // temp3 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp3 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = temp3(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } + // } else { + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // } - Register ref_reg = ref.AsRegister<Register>(); Register temp_reg = temp.AsRegister<Register>(); - uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); - // /* int32_t */ monitor = obj->monitor_ - __ LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); - if (needs_null_check) { - MaybeRecordImplicitNullCheck(instruction); + // Slow path marking the object `ref` when the GC is marking. The + // entrypoint will already be loaded in `temp3`. + Location temp3 = Location::RegisterLocation(LR); + SlowPathCodeARM* slow_path; + if (always_update_field) { + DCHECK(temp2 != nullptr); + // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM only + // supports address of the form `obj + field_offset`, where `obj` + // is a register and `field_offset` is a register pair (of which + // only the lower half is used). Thus `offset` and `scale_factor` + // above are expected to be null in this code path. + DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); + Location field_offset = index; + slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM( + instruction, + ref, + obj, + offset, + /* index */ field_offset, + scale_factor, + needs_null_check, + temp_reg, + *temp2, + /* entrypoint */ temp3); + } else { + slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + temp_reg, + /* entrypoint */ temp3); } - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); + AddSlowPath(slow_path); - // Introduce a dependency on the lock_word including the rb_state, - // which shall prevent load-load reordering without using - // a memory barrier (which would be more expensive). - // `obj` is unchanged by this operation, but its value now depends - // on `temp_reg`. - __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32)); + // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadWord, temp3.AsRegister<Register>(), TR, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(temp3.AsRegister<Register>(), slow_path->GetEntryLabel()); + // Fast path: just load the reference. 
+ GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorARM::GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check) { + Register ref_reg = ref.AsRegister<Register>(); - // The actual reference load. if (index.IsValid()) { // Load types involving an "index": ArrayGet, // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject // intrinsics. - // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor)) if (index.IsConstant()) { size_t computed_offset = (index.GetConstant()->AsIntConstant()->GetValue() << scale_factor) + offset; @@ -7347,41 +7592,16 @@ void CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* i __ LoadFromOffset(kLoadWord, ref_reg, IP, offset); } } else { - // /* HeapReference<Object> */ ref = *(obj + offset) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset) __ LoadFromOffset(kLoadWord, ref_reg, obj, offset); } - // Object* ref = ref_addr->AsMirrorPtr() - __ MaybeUnpoisonHeapReference(ref_reg); - - // Slow path marking the object `ref` when it is gray. - SlowPathCodeARM* slow_path; - if (always_update_field) { - DCHECK(temp2 != nullptr); - // ReadBarrierMarkAndUpdateFieldSlowPathARM only supports address - // of the form `obj + field_offset`, where `obj` is a register and - // `field_offset` is a register pair (of which only the lower half - // is used). Thus `offset` and `scale_factor` above are expected - // to be null in this code path. - DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM( - instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2); - } else { - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); } - AddSlowPath(slow_path); - // if (rb_state == ReadBarrier::GrayState()) - // ref = ReadBarrier::Mark(ref); - // Given the numeric representation, it's enough to check the low bit of the - // rb_state. We do that by shifting the bit out of the lock word with LSRS - // which can be a 16-bit instruction unlike the TST immediate. - static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1); - __ b(slow_path->GetEntryLabel(), CS); // Carry flag is the last bit shifted out by LSRS. - __ Bind(slow_path->GetExitLabel()); + // Object* ref = ref_addr->AsMirrorPtr() + __ MaybeUnpoisonHeapReference(ref_reg); } void CodeGeneratorARM::GenerateReadBarrierSlow(HInstruction* instruction, diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h index e993756b3b..f081a910ee 100644 --- a/compiler/optimizing/code_generator_arm.h +++ b/compiler/optimizing/code_generator_arm.h @@ -520,9 +520,6 @@ class CodeGeneratorARM : public CodeGenerator { Location index, Location temp, bool needs_null_check); - // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier - // and GenerateArrayLoadWithBakerReadBarrier. 
- // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier, // GenerateArrayLoadWithBakerReadBarrier and some intrinsics. // @@ -545,6 +542,15 @@ class CodeGeneratorARM : public CodeGenerator { bool always_update_field = false, Register* temp2 = nullptr); + // Generate a heap reference load (with no read barrier). + void GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check); + // Generate a read barrier for a heap reference within `instruction` // using a slow path. // diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index f5038fb1c0..7d1ae7d28b 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -633,10 +633,73 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { } } +// Abstract base class for read barrier slow paths marking a reference +// `ref`. +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. +class ReadBarrierMarkSlowPathBaseARM64 : public SlowPathCodeARM64 { + protected: + ReadBarrierMarkSlowPathBaseARM64(HInstruction* instruction, Location ref, Location entrypoint) + : SlowPathCodeARM64(instruction), ref_(ref), entrypoint_(entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARM64"; } + + // Generate assembly code calling the read barrier marking runtime + // entry point (ReadBarrierMarkRegX). + void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) { + // No need to save live registers; it's taken care of by the + // entrypoint. Also, there is no need to update the stack mask, + // as this runtime call will not trigger a garbage collection. + CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); + DCHECK_NE(ref_.reg(), LR); + DCHECK_NE(ref_.reg(), WSP); + DCHECK_NE(ref_.reg(), WZR); + // IP0 is used internally by the ReadBarrierMarkRegX entry point + // as a temporary, it cannot be the entry point's input/output. + DCHECK_NE(ref_.reg(), IP0); + DCHECK(0 <= ref_.reg() && ref_.reg() < kNumberOfWRegisters) << ref_.reg(); + // "Compact" slow path, saving two moves. + // + // Instead of using the standard runtime calling convention (input + // and output in W0): + // + // W0 <- ref + // W0 <- ReadBarrierMark(W0) + // ref <- W0 + // + // we just use rX (the register containing `ref`) as input and output + // of a dedicated entrypoint: + // + // rX <- ReadBarrierMarkRegX(rX) + // + if (entrypoint_.IsValid()) { + arm64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + __ Blr(XRegisterFrom(entrypoint_)); + } else { + // Entrypoint is not already loaded, load from the thread. + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg()); + // This runtime call does not require a stack map. + arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + } + } + + // The location (register) of the marked object reference. + const Location ref_; + + // The location of the entrypoint if it is already loaded. + const Location entrypoint_; + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARM64); +}; + // Slow path marking an object reference `ref` during a read // barrier. 
The field `obj.field` in the object `obj` holding this -// reference does not get updated by this slow path after marking (see -// ReadBarrierMarkAndUpdateFieldSlowPathARM64 below for that). +// reference does not get updated by this slow path after marking. // // This means that after the execution of this slow path, `ref` will // always be up-to-date, but `obj.field` may not; i.e., after the @@ -647,15 +710,13 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { // // If `entrypoint` is a valid location it is assumed to already be // holding the entrypoint. The case where the entrypoint is passed in -// is for the GcRoot read barrier. -class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { +// is when the decision to mark is based on whether the GC is marking. +class ReadBarrierMarkSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 { public: ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location ref, Location entrypoint = Location::NoLocation()) - : SlowPathCodeARM64(instruction), - ref_(ref), - entrypoint_(entrypoint) { + : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint) { DCHECK(kEmitCompilerReadBarrier); } @@ -666,12 +727,75 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { DCHECK(locations->CanCall()); DCHECK(ref_.IsRegister()) << ref_; DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg(); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + GenerateReadBarrierMarkRuntimeCall(codegen); + __ B(GetExitLabel()); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64); +}; + +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). The field `obj.field` in the object `obj` holding +// this reference does not get updated by this slow path after marking +// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 +// below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class LoadReferenceWithBakerReadBarrierSlowPathARM64 : public ReadBarrierMarkSlowPathBaseARM64 { + public: + LoadReferenceWithBakerReadBarrierSlowPathARM64(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire, + Register temp, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint), + obj_(obj), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), + use_load_acquire_(use_load_acquire), + temp_(temp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "LoadReferenceWithBakerReadBarrierSlowPathARM64"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(ref_.IsRegister()) << ref_; + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg(); + DCHECK(obj_.IsW()); + DCHECK_NE(ref_.reg(), LocationFrom(temp_).reg()); DCHECK(instruction_->IsInstanceFieldGet() || instruction_->IsStaticFieldGet() || instruction_->IsArrayGet() || instruction_->IsArraySet() || - instruction_->IsLoadClass() || - instruction_->IsLoadString() || instruction_->IsInstanceOf() || instruction_->IsCheckCast() || (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || @@ -684,82 +808,138 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { DCHECK(!(instruction_->IsArrayGet() && instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); + // Temporary register `temp_`, used to store the lock word, must + // not be IP0 nor IP1, as we may use them to emit the reference + // load (in the call to GenerateRawReferenceLoad below), and we + // need the lock word to still be in `temp_` after the reference + // load. + DCHECK_NE(LocationFrom(temp_).reg(), IP0); + DCHECK_NE(LocationFrom(temp_).reg(), IP1); + __ Bind(GetEntryLabel()); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - DCHECK_NE(ref_.reg(), LR); - DCHECK_NE(ref_.reg(), WSP); - DCHECK_NE(ref_.reg(), WZR); - // IP0 is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. - DCHECK_NE(ref_.reg(), IP0); - DCHECK(0 <= ref_.reg() && ref_.reg() < kNumberOfWRegisters) << ref_.reg(); - // "Compact" slow path, saving two moves. - // - // Instead of using the standard runtime calling convention (input - // and output in W0): + + // When using MaybeGenerateReadBarrierSlow, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: // - // W0 <- ref - // W0 <- ReadBarrierMark(W0) - // ref <- W0 + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. 
+ // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. + + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + __ Ldr(temp_, HeapOperand(obj_, monitor_offset)); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj_.X(), obj_.X(), Operand(temp_.X(), LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); + arm64_codegen->GenerateRawReferenceLoad(instruction_, + ref_, + obj_, + offset_, + index_, + scale_factor_, + /* needs_null_check */ false, + use_load_acquire_); + + // Mark the object `ref` when `obj` is gray. // - // rX <- ReadBarrierMarkRegX(rX) + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); // - if (entrypoint_.IsValid()) { - arm64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); - __ Blr(XRegisterFrom(entrypoint_)); - } else { - // Entrypoint is not already loaded, load from the thread. - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg()); - // This runtime call does not require a stack map. - arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); - } + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Tbz(temp_, LockWord::kReadBarrierStateShift, GetExitLabel()); + GenerateReadBarrierMarkRuntimeCall(codegen); + __ B(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; - - // The location of the entrypoint if it is already loaded. - const Location entrypoint_; - - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64); + // The register containing the object holding the marked object reference field. + Register obj_; + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + size_t scale_factor_; + // Is a null check required? + bool needs_null_check_; + // Should this reference load use Load-Acquire semantics? + bool use_load_acquire_; + // A temporary register used to hold the lock word of `obj_`. 
+ Register temp_; + + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARM64); }; -// Slow path marking an object reference `ref` during a read barrier, -// and if needed, atomically updating the field `obj.field` in the -// object `obj` holding this reference after marking (contrary to -// ReadBarrierMarkSlowPathARM64 above, which never tries to update -// `obj.field`). +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). If needed, this slow path also atomically updates +// the field `obj.field` in the object `obj` holding this reference +// after marking (contrary to +// LoadReferenceWithBakerReadBarrierSlowPathARM64 above, which never +// tries to update `obj.field`). // // This means that after the execution of this slow path, both `ref` // and `obj.field` will be up-to-date; i.e., after the flip, both will // hold the same to-space reference (unless another thread installed // another object reference (different from `ref`) in `obj.field`). -class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 { +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. +class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 + : public ReadBarrierMarkSlowPathBaseARM64 { public: - ReadBarrierMarkAndUpdateFieldSlowPathARM64(HInstruction* instruction, - Location ref, - Register obj, - Location field_offset, - Register temp) - : SlowPathCodeARM64(instruction), - ref_(ref), + LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire, + Register temp, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARM64(instruction, ref, entrypoint), obj_(obj), - field_offset_(field_offset), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), + use_load_acquire_(use_load_acquire), temp_(temp) { DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); } const char* GetDescription() const OVERRIDE { - return "ReadBarrierMarkAndUpdateFieldSlowPathARM64"; + return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64"; } void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { @@ -768,64 +948,90 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 { DCHECK(locations->CanCall()); DCHECK(ref_.IsRegister()) << ref_; DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg(); - // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK(obj_.IsW()); + DCHECK_NE(ref_.reg(), LocationFrom(temp_).reg()); + + // This slow path is only used by the UnsafeCASObject intrinsic at the moment. 
DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking and field updating slow path: " << instruction_->DebugName(); DCHECK(instruction_->GetLocations()->Intrinsified()); DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); - DCHECK(field_offset_.IsRegister()) << field_offset_; + DCHECK_EQ(offset_, 0u); + DCHECK_EQ(scale_factor_, 0u); + DCHECK_EQ(use_load_acquire_, false); + // The location of the offset of the marked reference field within `obj_`. + Location field_offset = index_; + DCHECK(field_offset.IsRegister()) << field_offset; + + // Temporary register `temp_`, used to store the lock word, must + // not be IP0 nor IP1, as we may use them to emit the reference + // load (in the call to GenerateRawReferenceLoad below), and we + // need the lock word to still be in `temp_` after the reference + // load. + DCHECK_NE(LocationFrom(temp_).reg(), IP0); + DCHECK_NE(LocationFrom(temp_).reg(), IP1); __ Bind(GetEntryLabel()); - // Save the old reference. + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + __ Ldr(temp_, HeapOperand(obj_, monitor_offset)); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including rb_state, + // to prevent load-load reordering, and without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj_.X(), obj_.X(), Operand(temp_.X(), LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); + arm64_codegen->GenerateRawReferenceLoad(instruction_, + ref_, + obj_, + offset_, + index_, + scale_factor_, + /* needs_null_check */ false, + use_load_acquire_); + + // Mark the object `ref` when `obj` is gray. + // + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // + // Given the numeric representation, it's enough to check the low bit of the rb_state. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Tbz(temp_, LockWord::kReadBarrierStateShift, GetExitLabel()); + + // Save the old value of the reference before marking it. // Note that we cannot use IP to save the old reference, as IP is // used internally by the ReadBarrierMarkRegX entry point, and we // need the old reference after the call to that entry point. DCHECK_NE(LocationFrom(temp_).reg(), IP0); __ Mov(temp_.W(), ref_reg); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); - DCHECK_NE(ref_.reg(), LR); - DCHECK_NE(ref_.reg(), WSP); - DCHECK_NE(ref_.reg(), WZR); - // IP0 is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. 
- DCHECK_NE(ref_.reg(), IP0); - DCHECK(0 <= ref_.reg() && ref_.reg() < kNumberOfWRegisters) << ref_.reg(); - // "Compact" slow path, saving two moves. - // - // Instead of using the standard runtime calling convention (input - // and output in W0): - // - // W0 <- ref - // W0 <- ReadBarrierMark(W0) - // ref <- W0 - // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: - // - // rX <- ReadBarrierMarkRegX(rX) - // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref_.reg()); - // This runtime call does not require a stack map. - arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + GenerateReadBarrierMarkRuntimeCall(codegen); // If the new reference is different from the old reference, - // update the field in the holder (`*(obj_ + field_offset_)`). + // update the field in the holder (`*(obj_ + field_offset)`). // // Note that this field could also hold a different object, if // another thread had concurrently changed it. In that case, the // LDXR/CMP/BNE sequence of instructions in the compare-and-set // (CAS) operation below would abort the CAS, leaving the field // as-is. - vixl::aarch64::Label done; __ Cmp(temp_.W(), ref_reg); - __ B(eq, &done); + __ B(eq, GetExitLabel()); // Update the the holder's field atomically. This may fail if // mutator updates before us, but it's OK. This is achieved @@ -838,7 +1044,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 { // Convenience aliases. Register base = obj_.W(); - Register offset = XRegisterFrom(field_offset_); + Register offset = XRegisterFrom(field_offset); Register expected = temp_.W(); Register value = ref_reg; Register tmp_ptr = temps.AcquireX(); // Pointer to actual memory. @@ -882,21 +1088,26 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARM64 : public SlowPathCodeARM64 { } } - __ Bind(&done); __ B(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; // The register containing the object holding the marked object reference field. const Register obj_; - // The location of the offset of the marked reference field within `obj_`. - Location field_offset_; - + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + size_t scale_factor_; + // Is a null check required? + bool needs_null_check_; + // Should this reference load use Load-Acquire semantics? + bool use_load_acquire_; + // A temporary register used to hold the lock word of `obj_`; and + // also to hold the original reference value, when the reference is + // marked. const Register temp_; - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARM64); + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64); }; // Slow path generating a read barrier for a heap reference. @@ -2425,6 +2636,9 @@ void LocationsBuilderARM64::VisitArrayGet(HArrayGet* instruction) { LocationSummary::kNoCall); if (object_array_get_with_read_barrier && kUseBakerReadBarrier) { locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier. 
+ locations->AddTemp(Location::RequiresRegister()); } locations->SetInAt(0, Location::RequiresRegister()); locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); @@ -2460,7 +2674,7 @@ void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) { if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // Object ArrayGet with Baker's read barrier case. - Register temp = temps.AcquireW(); + Register temp = WRegisterFrom(locations->GetTemp(0)); // Note that a potential implicit null check is handled in the // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call. codegen_->GenerateArrayLoadWithBakerReadBarrier( @@ -5604,14 +5818,35 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( DCHECK(kEmitCompilerReadBarrier); if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when - // Baker's read barrier are used: + // Baker's read barrier are used. + // + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. // - // root = obj.field; // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // if (temp != null) { - // root = temp(root) + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. // } + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. + Register temp = lr; + SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64( + instruction, root, /* entrypoint */ LocationFrom(temp)); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ Ldr(temp, MemOperand(tr, entry_point_offset)); + // /* GcRoot<mirror::Object> */ root = *(obj + offset) if (fixup_label == nullptr) { __ Ldr(root_reg, MemOperand(obj, offset)); @@ -5626,20 +5861,6 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad( "art::mirror::CompressedReference<mirror::Object> and int32_t " "have different sizes."); - Register temp = lr; - - // Slow path marking the GC root `root`. The entrypoint will alrady be loaded in temp. - SlowPathCodeARM64* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, - root, - LocationFrom(temp)); - codegen_->AddSlowPath(slow_path); - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(root.reg()); - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - __ Ldr(temp, MemOperand(tr, entry_point_offset)); // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. 
__ Cbnz(temp, slow_path->GetEntryLabel()); @@ -5741,54 +5962,103 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* // `instruction->IsArrayGet()` => `!use_load_acquire`. DCHECK(!instruction->IsArrayGet() || !use_load_acquire); - MacroAssembler* masm = GetVIXLAssembler(); - UseScratchRegisterScope temps(masm); - - // In slow path based read barriers, the read barrier call is - // inserted after the original load. However, in fast path based - // Baker's read barriers, we need to perform the load of - // mirror::Object::monitor_ *before* the original reference load. - // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to mark the reference. + // Then, in the slow path, check the gray bit in the lock word of + // the reference's holder (`obj`) to decide whether to mark `ref` or + // not. // - // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // HeapReference<Object> ref = *src; // Original reference load. - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. - // } + // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp2` the read barrier mark entry point + // corresponding to register `ref`. If `temp2` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. // - // Note: the original implementation in ReadBarrier::Barrier is - // slightly more complex as it performs additional checks that we do - // not do here for performance reasons. + // temp2 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp2 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = temp2(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } + // } else { + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // } + + // Slow path marking the object `ref` when the GC is marking. The + // entrypoint will already be loaded in `temp2`. + Register temp2 = lr; + Location temp2_loc = LocationFrom(temp2); + SlowPathCodeARM64* slow_path; + if (always_update_field) { + // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64 + // only supports address of the form `obj + field_offset`, where + // `obj` is a register and `field_offset` is a register. Thus + // `offset` and `scale_factor` above are expected to be null in + // this code path. 
+ DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, 0u); /* "times 1" */ + Location field_offset = index; + slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARM64( + instruction, + ref, + obj, + offset, + /* index */ field_offset, + scale_factor, + needs_null_check, + use_load_acquire, + temp, + /* entrypoint */ temp2_loc); + } else { + slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARM64( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + use_load_acquire, + temp, + /* entrypoint */ temp2_loc); + } + AddSlowPath(slow_path); + // temp2 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ Ldr(temp2, MemOperand(tr, entry_point_offset)); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ Cbnz(temp2, slow_path->GetEntryLabel()); + // Fast path: just load the reference. + GenerateRawReferenceLoad( + instruction, ref, obj, offset, index, scale_factor, needs_null_check, use_load_acquire); + __ Bind(slow_path->GetExitLabel()); +} + +void CodeGeneratorARM64::GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire) { + DCHECK(obj.IsW()); Primitive::Type type = Primitive::kPrimNot; Register ref_reg = RegisterFrom(ref, type); - DCHECK(obj.IsW()); - uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); - - { - // Ensure that between load and MaybeRecordImplicitNullCheck there are no pools emitted. - EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); - // /* int32_t */ monitor = obj->monitor_ - __ Ldr(temp, HeapOperand(obj, monitor_offset)); - if (needs_null_check) { - MaybeRecordImplicitNullCheck(instruction); - } - } - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); - // Introduce a dependency on the lock_word including rb_state, - // to prevent load-load reordering, and without using - // a memory barrier (which would be more expensive). - // `obj` is unchanged by this operation, but its value now depends - // on `temp`. - __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32)); + // If needed, vixl::EmissionCheckScope guards are used to ensure + // that no pools are emitted between the load (macro) instruction + // and MaybeRecordImplicitNullCheck. - // The actual reference load. 
if (index.IsValid()) { // Load types involving an "index": ArrayGet, // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject @@ -5803,59 +6073,50 @@ void CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* << instruction->AsInvoke()->GetIntrinsic(); DCHECK_EQ(offset, 0u); DCHECK_EQ(scale_factor, 0u); - DCHECK_EQ(needs_null_check, 0u); - // /* HeapReference<Object> */ ref = *(obj + index) + DCHECK_EQ(needs_null_check, false); + // /* HeapReference<mirror::Object> */ ref = *(obj + index) MemOperand field = HeapOperand(obj, XRegisterFrom(index)); LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false); } else { - // ArrayGet and UnsafeGetObject intrinsics cases. - // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + // ArrayGet and UnsafeGetObject and UnsafeCASObject intrinsics cases. + // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor)) if (index.IsConstant()) { uint32_t computed_offset = offset + (Int64ConstantFrom(index) << scale_factor); + EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); Load(type, ref_reg, HeapOperand(obj, computed_offset)); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } } else { - Register temp3 = temps.AcquireW(); - __ Add(temp3, obj, offset); - Load(type, ref_reg, HeapOperand(temp3, XRegisterFrom(index), LSL, scale_factor)); - temps.Release(temp3); + UseScratchRegisterScope temps(GetVIXLAssembler()); + Register temp = temps.AcquireW(); + __ Add(temp, obj, offset); + { + EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); + Load(type, ref_reg, HeapOperand(temp, XRegisterFrom(index), LSL, scale_factor)); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + } } } } else { - // /* HeapReference<Object> */ ref = *(obj + offset) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset) MemOperand field = HeapOperand(obj, offset); if (use_load_acquire) { - LoadAcquire(instruction, ref_reg, field, /* needs_null_check */ false); + // Implicit null checks are handled by CodeGeneratorARM64::LoadAcquire. + LoadAcquire(instruction, ref_reg, field, needs_null_check); } else { + EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); Load(type, ref_reg, field); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } } } // Object* ref = ref_addr->AsMirrorPtr() GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); - - // Slow path marking the object `ref` when it is gray. - SlowPathCodeARM64* slow_path; - if (always_update_field) { - // ReadBarrierMarkAndUpdateFieldSlowPathARM64 only supports - // address of the form `obj + field_offset`, where `obj` is a - // register and `field_offset` is a register. Thus `offset` and - // `scale_factor` above are expected to be null in this code path. - DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, 0u); /* "times 1" */ - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARM64( - instruction, ref, obj, /* field_offset */ index, temp); - } else { - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref); - } - AddSlowPath(slow_path); - - // if (rb_state == ReadBarrier::GrayState()) - // ref = ReadBarrier::Mark(ref); - // Given the numeric representation, it's enough to check the low bit of the rb_state. 
- static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Tbnz(temp, LockWord::kReadBarrierStateShift, slow_path->GetEntryLabel()); - __ Bind(slow_path->GetExitLabel()); } void CodeGeneratorARM64::GenerateReadBarrierSlow(HInstruction* instruction, diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 24a602400e..7471cd5f12 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -616,8 +616,8 @@ class CodeGeneratorARM64 : public CodeGenerator { Location index, vixl::aarch64::Register temp, bool needs_null_check); - // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier - // and GenerateArrayLoadWithBakerReadBarrier. + // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier, + // GenerateArrayLoadWithBakerReadBarrier and some intrinsics. // // Load the object reference located at the address // `obj + offset + (index << scale_factor)`, held by object `obj`, into @@ -636,6 +636,16 @@ class CodeGeneratorARM64 : public CodeGenerator { bool use_load_acquire, bool always_update_field = false); + // Generate a heap reference load (with no read barrier). + void GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + vixl::aarch64::Register obj, + uint32_t offset, + Location index, + size_t scale_factor, + bool needs_null_check, + bool use_load_acquire); + // Generate a read barrier for a heap reference within `instruction` // using a slow path. // diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc index 58bf2de70b..180db923bf 100644 --- a/compiler/optimizing/code_generator_arm_vixl.cc +++ b/compiler/optimizing/code_generator_arm_vixl.cc @@ -657,52 +657,25 @@ class ArraySetSlowPathARMVIXL : public SlowPathCodeARMVIXL { DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARMVIXL); }; -// Slow path marking an object reference `ref` during a read -// barrier. The field `obj.field` in the object `obj` holding this -// reference does not get updated by this slow path after marking (see -// ReadBarrierMarkAndUpdateFieldSlowPathARM below for that). +// Abstract base class for read barrier slow paths marking a reference +// `ref`. // -// This means that after the execution of this slow path, `ref` will -// always be up-to-date, but `obj.field` may not; i.e., after the -// flip, `ref` will be a to-space reference, but `obj.field` will -// probably still be a from-space reference (unless it gets updated by -// another thread, or if another thread installed another object -// reference (different from `ref`) in `obj.field`). -class ReadBarrierMarkSlowPathARMVIXL : public SlowPathCodeARMVIXL { - public: - ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction, - Location ref, - Location entrypoint = Location::NoLocation()) +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class ReadBarrierMarkSlowPathBaseARMVIXL : public SlowPathCodeARMVIXL { + protected: + ReadBarrierMarkSlowPathBaseARMVIXL(HInstruction* instruction, Location ref, Location entrypoint) : SlowPathCodeARMVIXL(instruction), ref_(ref), entrypoint_(entrypoint) { DCHECK(kEmitCompilerReadBarrier); } - const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARMVIXL"; } + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathBaseARMVIXL"; } - void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { - LocationSummary* locations = instruction_->GetLocations(); + // Generate assembly code calling the read barrier marking runtime + // entry point (ReadBarrierMarkRegX). + void GenerateReadBarrierMarkRuntimeCall(CodeGenerator* codegen) { vixl32::Register ref_reg = RegisterFrom(ref_); - DCHECK(locations->CanCall()); - DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg; - DCHECK(instruction_->IsInstanceFieldGet() || - instruction_->IsStaticFieldGet() || - instruction_->IsArrayGet() || - instruction_->IsArraySet() || - instruction_->IsLoadClass() || - instruction_->IsLoadString() || - instruction_->IsInstanceOf() || - instruction_->IsCheckCast() || - (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || - (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) - << "Unexpected instruction in read barrier marking slow path: " - << instruction_->DebugName(); - // The read barrier instrumentation of object ArrayGet - // instructions does not support the HIntermediateAddress - // instruction. - DCHECK(!(instruction_->IsArrayGet() && - instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); - __ Bind(GetEntryLabel()); // No need to save live registers; it's taken care of by the // entrypoint. Also, there is no need to update the stack mask, // as this runtime call will not trigger a garbage collection. @@ -732,53 +705,258 @@ class ReadBarrierMarkSlowPathARMVIXL : public SlowPathCodeARMVIXL { arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); __ Blx(RegisterFrom(entrypoint_)); } else { + // Entrypoint is not already loaded, load from the thread. int32_t entry_point_offset = CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode()); // This runtime call does not require a stack map. arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); } - __ B(GetExitLabel()); } - private: // The location (register) of the marked object reference. const Location ref_; // The location of the entrypoint if already loaded. const Location entrypoint_; + private: + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathBaseARMVIXL); +}; + +// Slow path marking an object reference `ref` during a read +// barrier. The field `obj.field` in the object `obj` holding this +// reference does not get updated by this slow path after marking. +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// If `entrypoint` is a valid location it is assumed to already be +// holding the entrypoint. 
The case where the entrypoint is passed in +// is when the decision to mark is based on whether the GC is marking. +class ReadBarrierMarkSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL { + public: + ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction, + Location ref, + Location entrypoint = Location::NoLocation()) + : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint) { + DCHECK(kEmitCompilerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARMVIXL"; } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + DCHECK(locations->CanCall()); + DCHECK(ref_.IsRegister()) << ref_; + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_.reg())) << ref_.reg(); + DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString()) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + + __ Bind(GetEntryLabel()); + GenerateReadBarrierMarkRuntimeCall(codegen); + __ B(GetExitLabel()); + } + + private: DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARMVIXL); }; -// Slow path marking an object reference `ref` during a read barrier, -// and if needed, atomically updating the field `obj.field` in the -// object `obj` holding this reference after marking (contrary to -// ReadBarrierMarkSlowPathARM above, which never tries to update -// `obj.field`). +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). The field `obj.field` in the object `obj` holding +// this reference does not get updated by this slow path after marking +// (see LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL +// below for that). +// +// This means that after the execution of this slow path, `ref` will +// always be up-to-date, but `obj.field` may not; i.e., after the +// flip, `ref` will be a to-space reference, but `obj.field` will +// probably still be a from-space reference (unless it gets updated by +// another thread, or if another thread installed another object +// reference (different from `ref`) in `obj.field`). +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class LoadReferenceWithBakerReadBarrierSlowPathARMVIXL : public ReadBarrierMarkSlowPathBaseARMVIXL { + public: + LoadReferenceWithBakerReadBarrierSlowPathARMVIXL(HInstruction* instruction, + Location ref, + vixl32::Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check, + vixl32::Register temp, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint), + obj_(obj), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), + temp_(temp) { + DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); + } + + const char* GetDescription() const OVERRIDE { + return "LoadReferenceWithBakerReadBarrierSlowPathARMVIXL"; + } + + void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { + LocationSummary* locations = instruction_->GetLocations(); + vixl32::Register ref_reg = RegisterFrom(ref_); + DCHECK(locations->CanCall()); + DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg; + DCHECK(instruction_->IsInstanceFieldGet() || + instruction_->IsStaticFieldGet() || + instruction_->IsArrayGet() || + instruction_->IsArraySet() || + instruction_->IsInstanceOf() || + instruction_->IsCheckCast() || + (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) || + (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified())) + << "Unexpected instruction in read barrier marking slow path: " + << instruction_->DebugName(); + // The read barrier instrumentation of object ArrayGet + // instructions does not support the HIntermediateAddress + // instruction. + DCHECK(!(instruction_->IsArrayGet() && + instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress())); + + // Temporary register `temp_`, used to store the lock word, must + // not be IP, as we may use it to emit the reference load (in the + // call to GenerateRawReferenceLoad below), and we need the lock + // word to still be in `temp_` after the reference load. + DCHECK(!temp_.Is(ip)); + + __ Bind(GetEntryLabel()); + + // When using MaybeGenerateReadBarrierSlow, the read barrier call is + // inserted after the original load. However, in fast path based + // Baker's read barriers, we need to perform the load of + // mirror::Object::monitor_ *before* the original reference load. + // This load-load ordering is required by the read barrier. + // The fast path/slow path (for Baker's algorithm) should look like: + // + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = entrypoint(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. + // } + // + // Note: the original implementation in ReadBarrier::Barrier is + // slightly more complex as it performs additional checks that we do + // not do here for performance reasons. 
+ + CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen); + + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + arm_codegen->GetAssembler()->LoadFromOffset(kLoadWord, temp_, obj_, monitor_offset); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj_, obj_, Operand(temp_, ShiftType::LSR, 32)); + + // The actual reference load. + // A possible implicit null check has already been handled above. + arm_codegen->GenerateRawReferenceLoad( + instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false); + + // Mark the object `ref` when `obj` is gray. + // + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp_, temp_, LockWord::kReadBarrierStateShift + 1); + __ B(cc, GetExitLabel()); // Carry flag is the last bit shifted out by LSRS. + GenerateReadBarrierMarkRuntimeCall(codegen); + + __ B(GetExitLabel()); + } + + private: + // The register containing the object holding the marked object reference field. + vixl32::Register obj_; + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + ScaleFactor scale_factor_; + // Is a null check required? + bool needs_null_check_; + // A temporary register used to hold the lock word of `obj_`. + vixl32::Register temp_; + + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierSlowPathARMVIXL); +}; + +// Slow path loading `obj`'s lock word, loading a reference from +// object `*(obj + offset + (index << scale_factor))` into `ref`, and +// marking `ref` if `obj` is gray according to the lock word (Baker +// read barrier). If needed, this slow path also atomically updates +// the field `obj.field` in the object `obj` holding this reference +// after marking (contrary to +// LoadReferenceWithBakerReadBarrierSlowPathARMVIXL above, which never +// tries to update `obj.field`). // // This means that after the execution of this slow path, both `ref` // and `obj.field` will be up-to-date; i.e., after the flip, both will // hold the same to-space reference (unless another thread installed // another object reference (different from `ref`) in `obj.field`). -class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL { +// +// +// Argument `entrypoint` must be a register location holding the read +// barrier marking runtime entry point to be invoked. 
+class LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL + : public ReadBarrierMarkSlowPathBaseARMVIXL { public: - ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction, - Location ref, - vixl32::Register obj, - Location field_offset, - vixl32::Register temp1, - vixl32::Register temp2) - : SlowPathCodeARMVIXL(instruction), - ref_(ref), + LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction, + Location ref, + vixl32::Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check, + vixl32::Register temp1, + vixl32::Register temp2, + Location entrypoint) + : ReadBarrierMarkSlowPathBaseARMVIXL(instruction, ref, entrypoint), obj_(obj), - field_offset_(field_offset), + offset_(offset), + index_(index), + scale_factor_(scale_factor), + needs_null_check_(needs_null_check), temp1_(temp1), temp2_(temp2) { DCHECK(kEmitCompilerReadBarrier); + DCHECK(kUseBakerReadBarrier); } const char* GetDescription() const OVERRIDE { - return "ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL"; + return "LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL"; } void EmitNativeCode(CodeGenerator* codegen) OVERRIDE { @@ -786,64 +964,83 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL vixl32::Register ref_reg = RegisterFrom(ref_); DCHECK(locations->CanCall()); DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg; - // This slow path is only used by the UnsafeCASObject intrinsic. + DCHECK_NE(ref_.reg(), LocationFrom(temp1_).reg()); + + // This slow path is only used by the UnsafeCASObject intrinsic at the moment. DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified())) << "Unexpected instruction in read barrier marking and field updating slow path: " << instruction_->DebugName(); DCHECK(instruction_->GetLocations()->Intrinsified()); DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject); - DCHECK(field_offset_.IsRegisterPair()) << field_offset_; + DCHECK_EQ(offset_, 0u); + DCHECK_EQ(scale_factor_, ScaleFactor::TIMES_1); + Location field_offset = index_; + DCHECK(field_offset.IsRegisterPair()) << field_offset; + + // Temporary register `temp1_`, used to store the lock word, must + // not be IP, as we may use it to emit the reference load (in the + // call to GenerateRawReferenceLoad below), and we need the lock + // word to still be in `temp1_` after the reference load. + DCHECK(!temp1_.Is(ip)); __ Bind(GetEntryLabel()); - // Save the old reference. + CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen); + + // /* int32_t */ monitor = obj->monitor_ + uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); + arm_codegen->GetAssembler()->LoadFromOffset(kLoadWord, temp1_, obj_, monitor_offset); + if (needs_null_check_) { + codegen->MaybeRecordImplicitNullCheck(instruction_); + } + // /* LockWord */ lock_word = LockWord(monitor) + static_assert(sizeof(LockWord) == sizeof(int32_t), + "art::LockWord and int32_t have different sizes."); + + // Introduce a dependency on the lock_word including the rb_state, + // which shall prevent load-load reordering without using + // a memory barrier (which would be more expensive). + // `obj` is unchanged by this operation, but its value now depends + // on `temp`. + __ Add(obj_, obj_, Operand(temp1_, ShiftType::LSR, 32)); + + // The actual reference load. 
+ // A possible implicit null check has already been handled above. + arm_codegen->GenerateRawReferenceLoad( + instruction_, ref_, obj_, offset_, index_, scale_factor_, /* needs_null_check */ false); + + // Mark the object `ref` when `obj` is gray. + // + // if (rb_state == ReadBarrier::GrayState()) + // ref = ReadBarrier::Mark(ref); + // + // Given the numeric representation, it's enough to check the low bit of the + // rb_state. We do that by shifting the bit out of the lock word with LSRS + // which can be a 16-bit instruction unlike the TST immediate. + static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); + static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); + __ Lsrs(temp1_, temp1_, LockWord::kReadBarrierStateShift + 1); + __ B(cc, GetExitLabel()); // Carry flag is the last bit shifted out by LSRS. + + // Save the old value of the reference before marking it. // Note that we cannot use IP to save the old reference, as IP is // used internally by the ReadBarrierMarkRegX entry point, and we // need the old reference after the call to that entry point. DCHECK(!temp1_.Is(ip)); __ Mov(temp1_, ref_reg); - // No need to save live registers; it's taken care of by the - // entrypoint. Also, there is no need to update the stack mask, - // as this runtime call will not trigger a garbage collection. - CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen); - DCHECK(!ref_reg.Is(sp)); - DCHECK(!ref_reg.Is(lr)); - DCHECK(!ref_reg.Is(pc)); - // IP is used internally by the ReadBarrierMarkRegX entry point - // as a temporary, it cannot be the entry point's input/output. - DCHECK(!ref_reg.Is(ip)); - DCHECK(ref_reg.IsRegister()) << ref_reg; - // "Compact" slow path, saving two moves. - // - // Instead of using the standard runtime calling convention (input - // and output in R0): - // - // R0 <- ref - // R0 <- ReadBarrierMark(R0) - // ref <- R0 - // - // we just use rX (the register containing `ref`) as input and output - // of a dedicated entrypoint: - // - // rX <- ReadBarrierMarkRegX(rX) - // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode()); - // This runtime call does not require a stack map. - arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + GenerateReadBarrierMarkRuntimeCall(codegen); // If the new reference is different from the old reference, - // update the field in the holder (`*(obj_ + field_offset_)`). + // update the field in the holder (`*(obj_ + field_offset)`). // // Note that this field could also hold a different object, if // another thread had concurrently changed it. In that case, the // LDREX/SUBS/ITNE sequence of instructions in the compare-and-set // (CAS) operation below would abort the CAS, leaving the field // as-is. - vixl32::Label done; __ Cmp(temp1_, ref_reg); - __ B(eq, &done, /* far_target */ false); + __ B(eq, GetExitLabel()); // Update the the holder's field atomically. This may fail if // mutator updates before us, but it's OK. This is achieved @@ -857,7 +1054,7 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL // The UnsafeCASObject intrinsic uses a register pair as field // offset ("long offset"), of which only the low part contains // data. 
- vixl32::Register offset = LowRegisterFrom(field_offset_); + vixl32::Register offset = LowRegisterFrom(field_offset); vixl32::Register expected = temp1_; vixl32::Register value = ref_reg; vixl32::Register tmp_ptr = temps.Acquire(); // Pointer to actual memory. @@ -913,22 +1110,27 @@ class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL } } - __ Bind(&done); __ B(GetExitLabel()); } private: - // The location (register) of the marked object reference. - const Location ref_; // The register containing the object holding the marked object reference field. const vixl32::Register obj_; - // The location of the offset of the marked reference field within `obj_`. - Location field_offset_; - + // The offset, index and scale factor to access the reference in `obj_`. + uint32_t offset_; + Location index_; + ScaleFactor scale_factor_; + // Is a null check required? + bool needs_null_check_; + // A temporary register used to hold the lock word of `obj_`; and + // also to hold the original reference value, when the reference is + // marked. const vixl32::Register temp1_; + // A temporary register used in the implementation of the CAS, to + // update the object's reference field. const vixl32::Register temp2_; - DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL); + DISALLOW_COPY_AND_ASSIGN(LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL); }; // Slow path generating a read barrier for a heap reference. @@ -7261,14 +7463,35 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( DCHECK(kEmitCompilerReadBarrier); if (kUseBakerReadBarrier) { // Fast path implementation of art::ReadBarrier::BarrierForRoot when - // Baker's read barrier are used: + // Baker's read barrier are used. + // + // Note that we do not actually check the value of + // `GetIsGcMarking()` to decide whether to mark the loaded GC + // root or not. Instead, we load into `temp` the read barrier + // mark entry point corresponding to register `root`. If `temp` + // is null, it means that `GetIsGcMarking()` is false, and vice + // versa. // - // root = obj.field; // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - // if (temp != null) { - // root = temp(root) + // GcRoot<mirror::Object> root = *(obj+offset); // Original reference load. + // if (temp != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // root = temp(root); // root = ReadBarrier::Mark(root); // Runtime entry point call. // } + // Slow path marking the GC root `root`. The entrypoint will already be loaded in `temp`. + Location temp = LocationFrom(lr); + SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL( + instruction, root, /* entrypoint */ temp); + codegen_->AddSlowPath(slow_path); + + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. 
+ GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); + // /* GcRoot<mirror::Object> */ root = *(obj + offset) GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset); static_assert( @@ -7279,21 +7502,6 @@ void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad( "art::mirror::CompressedReference<mirror::Object> and int32_t " "have different sizes."); - // Slow path marking the GC root `root`. - Location temp = LocationFrom(lr); - SlowPathCodeARMVIXL* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL( - instruction, - root, - /*entrypoint*/ temp); - codegen_->AddSlowPath(slow_path); - - // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() - const int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg()); - // Loading the entrypoint does not require a load acquire since it is only changed when - // threads are suspended or running a checkpoint. - GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset); // The entrypoint is null when the GC is not marking, this prevents one load compared to // checking GetIsGcMarking. __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel()); @@ -7364,55 +7572,114 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio DCHECK(kEmitCompilerReadBarrier); DCHECK(kUseBakerReadBarrier); - // In slow path based read barriers, the read barrier call is - // inserted after the original load. However, in fast path based - // Baker's read barriers, we need to perform the load of - // mirror::Object::monitor_ *before* the original reference load. - // This load-load ordering is required by the read barrier. - // The fast path/slow path (for Baker's algorithm) should look like: + // Query `art::Thread::Current()->GetIsGcMarking()` to decide + // whether we need to enter the slow path to mark the reference. + // Then, in the slow path, check the gray bit in the lock word of + // the reference's holder (`obj`) to decide whether to mark `ref` or + // not. // - // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); - // lfence; // Load fence or artificial data dependency to prevent load-load reordering - // HeapReference<Object> ref = *src; // Original reference load. - // bool is_gray = (rb_state == ReadBarrier::GrayState()); - // if (is_gray) { - // ref = ReadBarrier::Mark(ref); // Performed by runtime entrypoint slow path. - // } + // Note that we do not actually check the value of `GetIsGcMarking()`; + // instead, we load into `temp3` the read barrier mark entry point + // corresponding to register `ref`. If `temp3` is null, it means + // that `GetIsGcMarking()` is false, and vice versa. // - // Note: the original implementation in ReadBarrier::Barrier is - // slightly more complex as it performs additional checks that we do - // not do here for performance reasons. + // temp3 = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp3 != nullptr) { // <=> Thread::Current()->GetIsGcMarking() + // // Slow path. + // uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState(); + // lfence; // Load fence or artificial data dependency to prevent load-load reordering + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // bool is_gray = (rb_state == ReadBarrier::GrayState()); + // if (is_gray) { + // ref = temp3(ref); // ref = ReadBarrier::Mark(ref); // Runtime entry point call. 
+ // } + // } else { + // HeapReference<mirror::Object> ref = *src; // Original reference load. + // } - vixl32::Register ref_reg = RegisterFrom(ref); vixl32::Register temp_reg = RegisterFrom(temp); - uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value(); - // /* int32_t */ monitor = obj->monitor_ - GetAssembler()->LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset); - if (needs_null_check) { - MaybeRecordImplicitNullCheck(instruction); + // Slow path marking the object `ref` when the GC is marking. The + // entrypoint will already be loaded in `temp3`. + Location temp3 = LocationFrom(lr); + SlowPathCodeARMVIXL* slow_path; + if (always_update_field) { + DCHECK(temp2 != nullptr); + // LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL + // only supports address of the form `obj + field_offset`, where + // `obj` is a register and `field_offset` is a register pair (of + // which only the lower half is used). Thus `offset` and + // `scale_factor` above are expected to be null in this code path. + DCHECK_EQ(offset, 0u); + DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); + Location field_offset = index; + slow_path = + new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierAndUpdateFieldSlowPathARMVIXL( + instruction, + ref, + obj, + offset, + /* index */ field_offset, + scale_factor, + needs_null_check, + temp_reg, + *temp2, + /* entrypoint */ temp3); + } else { + slow_path = new (GetGraph()->GetArena()) LoadReferenceWithBakerReadBarrierSlowPathARMVIXL( + instruction, + ref, + obj, + offset, + index, + scale_factor, + needs_null_check, + temp_reg, + /* entrypoint */ temp3); } - // /* LockWord */ lock_word = LockWord(monitor) - static_assert(sizeof(LockWord) == sizeof(int32_t), - "art::LockWord and int32_t have different sizes."); + AddSlowPath(slow_path); + + // temp3 = Thread::Current()->pReadBarrierMarkReg ## ref.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp3), tr, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(RegisterFrom(temp3), slow_path->GetEntryLabel()); + // Fast path: just load the reference. + GenerateRawReferenceLoad(instruction, ref, obj, offset, index, scale_factor, needs_null_check); + __ Bind(slow_path->GetExitLabel()); +} - // Introduce a dependency on the lock_word including the rb_state, - // which shall prevent load-load reordering without using - // a memory barrier (which would be more expensive). - // `obj` is unchanged by this operation, but its value now depends - // on `temp_reg`. - __ Add(obj, obj, Operand(temp_reg, ShiftType::LSR, 32)); +void CodeGeneratorARMVIXL::GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + vixl::aarch32::Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check) { + Primitive::Type type = Primitive::kPrimNot; + vixl32::Register ref_reg = RegisterFrom(ref, type); + + // If needed, vixl::EmissionCheckScope guards are used to ensure + // that no pools are emitted between the load (macro) instruction + // and MaybeRecordImplicitNullCheck. - // The actual reference load. 
if (index.IsValid()) { // Load types involving an "index": ArrayGet, // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject // intrinsics. - // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor)) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset + (index << scale_factor)) if (index.IsConstant()) { size_t computed_offset = (Int32ConstantFrom(index) << scale_factor) + offset; + vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } } else { // Handle the special case of the // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject @@ -7422,46 +7689,27 @@ void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstructio ? LowRegisterFrom(index) : RegisterFrom(index); UseScratchRegisterScope temps(GetVIXLAssembler()); - const vixl32::Register temp3 = temps.Acquire(); - __ Add(temp3, obj, Operand(index_reg, ShiftType::LSL, scale_factor)); - GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp3, offset); + vixl32::Register temp = temps.Acquire(); + __ Add(temp, obj, Operand(index_reg, ShiftType::LSL, scale_factor)); + { + vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); + GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp, offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } + } } } else { - // /* HeapReference<Object> */ ref = *(obj + offset) + // /* HeapReference<mirror::Object> */ ref = *(obj + offset) + vixl::EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes); GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, offset); + if (needs_null_check) { + MaybeRecordImplicitNullCheck(instruction); + } } // Object* ref = ref_addr->AsMirrorPtr() GetAssembler()->MaybeUnpoisonHeapReference(ref_reg); - - // Slow path marking the object `ref` when it is gray. - SlowPathCodeARMVIXL* slow_path; - if (always_update_field) { - DCHECK(temp2 != nullptr); - // ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL only supports address - // of the form `obj + field_offset`, where `obj` is a register and - // `field_offset` is a register pair (of which only the lower half - // is used). Thus `offset` and `scale_factor` above are expected - // to be null in this code path. - DCHECK_EQ(offset, 0u); - DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1); - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL( - instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2); - } else { - slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(instruction, ref); - } - AddSlowPath(slow_path); - - // if (rb_state == ReadBarrier::GrayState()) - // ref = ReadBarrier::Mark(ref); - // Given the numeric representation, it's enough to check the low bit of the - // rb_state. We do that by shifting the bit out of the lock word with LSRS - // which can be a 16-bit instruction unlike the TST immediate. - static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0"); - static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1"); - __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1); - __ B(cs, slow_path->GetEntryLabel()); // Carry flag is the last bit shifted out by LSRS. 
- __ Bind(slow_path->GetExitLabel()); } void CodeGeneratorARMVIXL::GenerateReadBarrierSlow(HInstruction* instruction, diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h index 92e922d8f9..5ff7dd69e7 100644 --- a/compiler/optimizing/code_generator_arm_vixl.h +++ b/compiler/optimizing/code_generator_arm_vixl.h @@ -45,6 +45,11 @@ static constexpr bool kArmUseVIXL32 = true; namespace art { namespace arm { +// This constant is used as an approximate margin when emission of veneer and literal pools +// must be blocked. +static constexpr int kMaxMacroInstructionSizeInBytes = + 15 * vixl::aarch32::kMaxInstructionSizeInBytes; + static const vixl::aarch32::Register kParameterCoreRegistersVIXL[] = { vixl::aarch32::r1, vixl::aarch32::r2, @@ -625,6 +630,15 @@ class CodeGeneratorARMVIXL : public CodeGenerator { bool always_update_field = false, vixl::aarch32::Register* temp2 = nullptr); + // Generate a heap reference load (with no read barrier). + void GenerateRawReferenceLoad(HInstruction* instruction, + Location ref, + vixl::aarch32::Register obj, + uint32_t offset, + Location index, + ScaleFactor scale_factor, + bool needs_null_check); + // Generate a read barrier for a heap reference within `instruction` // using a slow path. // diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc index 86000e9356..28095c4d3f 100644 --- a/compiler/optimizing/intrinsics_arm.cc +++ b/compiler/optimizing/intrinsics_arm.cc @@ -1947,6 +1947,8 @@ void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) { } if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + // The base destination address is computed later, as `temp2` is // used for intermediate computations. diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 6c3938c1a9..934ba1b9fb 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -853,7 +853,6 @@ static void GenUnsafeGet(HInvoke* invoke, DCHECK((type == Primitive::kPrimInt) || (type == Primitive::kPrimLong) || (type == Primitive::kPrimNot)); - MacroAssembler* masm = codegen->GetVIXLAssembler(); Location base_loc = locations->InAt(1); Register base = WRegisterFrom(base_loc); // Object pointer. Location offset_loc = locations->InAt(2); @@ -863,8 +862,7 @@ static void GenUnsafeGet(HInvoke* invoke, if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) { // UnsafeGetObject/UnsafeGetObjectVolatile with Baker's read barrier case. - UseScratchRegisterScope temps(masm); - Register temp = temps.AcquireW(); + Register temp = WRegisterFrom(locations->GetTemp(0)); codegen->GenerateReferenceLoadWithBakerReadBarrier(invoke, trg_loc, base, @@ -901,6 +899,9 @@ static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke kIntrinsified); if (can_call && kUseBakerReadBarrier) { locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty()); // No caller-save registers. + // We need a temporary register for the read barrier marking slow + // path in CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier. + locations->AddTemp(Location::RequiresRegister()); } locations->SetInAt(0, Location::NoLocation()); // Unused receiver. 
locations->SetInAt(1, Location::RequiresRegister()); @@ -2381,9 +2382,14 @@ void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) { // Temporary register IP0, obtained from the VIXL scratch register // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64 // (because that register is clobbered by ReadBarrierMarkRegX - // entry points). Get an extra temporary register from the - // register allocator. + // entry points). It cannot be used in calls to + // CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier + // either. For these reasons, get a third extra temporary register + // from the register allocator. locations->AddTemp(Location::RequiresRegister()); + } else { + // Cases other than Baker read barriers: the third temporary will + // be acquired from the VIXL scratch register pool. } } @@ -2494,11 +2500,12 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { // We use a block to end the scratch scope before the write barrier, thus // freeing the temporary registers so they can be used in `MarkGCCard`. UseScratchRegisterScope temps(masm); - // Note: Because it is acquired from VIXL's scratch register pool, - // `temp3` might be IP0, and thus cannot be used as `ref` argument - // of CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier - // calls below (see ReadBarrierMarkSlowPathARM64 for more details). - Register temp3 = temps.AcquireW(); + Register temp3; + if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + temp3 = WRegisterFrom(locations->GetTemp(2)); + } else { + temp3 = temps.AcquireW(); + } if (!optimizations.GetDoesNotNeedTypeCheck()) { // Check whether all elements of the source array are assignable to the component @@ -2704,19 +2711,7 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { Register src_curr_addr = temp1.X(); Register dst_curr_addr = temp2.X(); - Register src_stop_addr; - if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { - // Temporary register IP0, obtained from the VIXL scratch - // register pool as `temp3`, cannot be used in - // ReadBarrierSystemArrayCopySlowPathARM64 (because that - // register is clobbered by ReadBarrierMarkRegX entry points). - // So another temporary register allocated by the register - // allocator instead. - DCHECK_EQ(LocationFrom(temp3).reg(), IP0); - src_stop_addr = XRegisterFrom(locations->GetTemp(2)); - } else { - src_stop_addr = temp3.X(); - } + Register src_stop_addr = temp3.X(); GenSystemArrayCopyAddresses(masm, Primitive::kPrimNot, @@ -2732,6 +2727,8 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot); if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + // SystemArrayCopy implementation for Baker read barriers (see // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier): // @@ -2758,10 +2755,11 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) { __ Cmp(src_curr_addr, src_stop_addr); __ B(&done, eq); - Register tmp = temps.AcquireW(); // Make sure `tmp` is not IP0, as it is clobbered by // ReadBarrierMarkRegX entry points in // ReadBarrierSystemArrayCopySlowPathARM64. 
+ temps.Exclude(ip0); + Register tmp = temps.AcquireW(); DCHECK_NE(LocationFrom(tmp).reg(), IP0); // /* int32_t */ monitor = src->monitor_ diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index aa89deae34..60bcf2cfd5 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -2265,6 +2265,8 @@ void IntrinsicCodeGeneratorARMVIXL::VisitSystemArrayCopy(HInvoke* invoke) { } if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) { + // TODO: Also convert this intrinsic to the IsGcMarking strategy? + // The base destination address is computed later, as `temp2` is // used for intermediate computations. diff --git a/runtime/lock_word.h b/runtime/lock_word.h index 2f2565b9d0..edc64f35a1 100644 --- a/runtime/lock_word.h +++ b/runtime/lock_word.h @@ -57,7 +57,8 @@ class Monitor; * |10|9|87654321098765432109876543210| * |11|0| ForwardingAddress | * - * The rb bits store the read barrier state. + * The `r` bit stores the read barrier state. + * The `m` bit stores the mark state. */ class LockWord { public: diff --git a/runtime/type_lookup_table.h b/runtime/type_lookup_table.h index 3f6f76f510..fd68deb71c 100644 --- a/runtime/type_lookup_table.h +++ b/runtime/type_lookup_table.h @@ -148,7 +148,7 @@ class TypeLookupTable { return mask_; } - // Attempt to set an entry on it's hash' slot. If there is alrady something there, return false. + // Attempt to set an entry on its hash's slot. If there is already something there, return false. // Otherwise return true. bool SetOnInitialPos(const Entry& entry, uint32_t hash); |
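
The change above repeatedly relies on one strategy: instead of testing Thread::Current()->GetIsGcMarking(), the generated code loads the per-register mark entrypoint (pReadBarrierMarkRegN) for the destination register and treats a null value as "the GC is not marking", branching to the slow path only when it is non-null. What follows is a minimal standalone C++ model of that dispatch, not ART code; Object, MarkFn, LoadReference and the 30-entry entrypoint table are invented names for illustration only.

    #include <array>
    #include <cstdio>

    struct Object { int forwarded = 0; };

    // One mark routine per register, as with pReadBarrierMarkReg0..29; a null
    // slot stands for "the GC is not currently marking".
    using MarkFn = Object* (*)(Object*);
    std::array<MarkFn, 30> read_barrier_mark_entrypoints{};  // all null initially

    Object* MarkToSpace(Object* ref) { ref->forwarded = 1; return ref; }

    Object* LoadReference(Object** slot, int dest_reg) {
      MarkFn entrypoint = read_barrier_mark_entrypoints[dest_reg];  // Ldr temp, [tr, #entry_point_offset]
      Object* ref = *slot;                                          // original reference load
      if (entrypoint != nullptr) {                                  // Cbnz temp, slow_path
        ref = entrypoint(ref);                                      // rX <- ReadBarrierMarkRegX(rX)
      }
      return ref;
    }

    int main() {
      Object o;
      Object* field = &o;
      std::printf("not marking: forwarded=%d\n", LoadReference(&field, 5)->forwarded);
      read_barrier_mark_entrypoints.fill(&MarkToSpace);  // the GC flips to the marking phase
      std::printf("marking:     forwarded=%d\n", LoadReference(&field, 5)->forwarded);
      return 0;
    }

Loading the entrypoint once per reference load costs one extra load on the fast path, but it avoids a separate GetIsGcMarking check and keeps the runtime call "compact" (same register used as input and output).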
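
The new GenerateRawReferenceLoad helpers and the LoadReferenceWithBakerReadBarrier* slow paths all take the same (obj, offset, index, scale_factor) quadruple; the reference is always located at obj + offset + (index << scale_factor), with field gets using only the offset and ArrayGet / Unsafe intrinsics supplying an index. A small sketch of that addressing, with example values chosen arbitrarily for illustration:

    #include <cstdint>
    #include <cstdio>

    // ref_addr = obj + offset + (index << scale_factor)
    uintptr_t ReferenceAddress(uintptr_t obj, uint32_t offset, uintptr_t index, size_t scale_factor) {
      return obj + offset + (index << scale_factor);
    }

    int main() {
      // Field-style access: offset only, no index.
      std::printf("field: %#lx\n", static_cast<unsigned long>(ReferenceAddress(0x1000, 8, 0, 0)));
      // ArrayGet-style access: data offset plus index scaled by the 4-byte reference size.
      std::printf("array: %#lx\n", static_cast<unsigned long>(ReferenceAddress(0x1000, 12, 3, 2)));
      return 0;
    }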
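
Inside the slow paths, the decision to call the mark entrypoint is made from the holder's lock word: the read barrier state bit is shifted out with LSRS so that it ends up in the carry flag (a 16-bit encoding, unlike a TST with an immediate), and an artificial data dependency on the lock word (adding its value shifted right by 32, i.e. zero, to obj) orders the monitor load before the reference load without a memory barrier. Below is a standalone sketch of the gray test only; the shift value 28 is an assumption made for illustration, the authoritative layout is in runtime/lock_word.h.

    #include <cstdint>
    #include <cstdio>

    // Assumed position of the read barrier state bit in the lock word.
    constexpr uint32_t kReadBarrierStateShift = 28;
    constexpr uint32_t kWhiteState = 0;  // ReadBarrier::WhiteState()
    constexpr uint32_t kGrayState = 1;   // ReadBarrier::GrayState()

    bool IsGray(uint32_t monitor_word) {
      // Equivalent of "Lsrs temp, temp, kReadBarrierStateShift + 1" followed by a
      // branch on the carry flag: the rb_state bit is the last bit shifted out.
      return ((monitor_word >> kReadBarrierStateShift) & 1u) == kGrayState;
    }

    int main() {
      std::printf("white object is gray? %d\n", IsGray(kWhiteState << kReadBarrierStateShift));
      std::printf("gray object is gray?  %d\n", IsGray(kGrayState << kReadBarrierStateShift));
      return 0;
    }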
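
The ...AndUpdateField slow paths additionally write the marked reference back into the holder's field when it changed, using a compare-and-set that is allowed to fail if another mutator stored a different value first. This is a hedged sketch using std::atomic in place of the LDREX/STREX (ARM) or LDXR/STXR (ARM64) sequences emitted by the code generators; MarkAndUpdateField and the mark callback are illustrative names, not ART APIs.

    #include <atomic>
    #include <cstdio>

    struct Object {};

    // `mark` stands for the ReadBarrierMarkRegX entrypoint already loaded into a
    // register by the fast-path code.
    void MarkAndUpdateField(std::atomic<Object*>& field, Object* (*mark)(Object*)) {
      Object* old_ref = field.load(std::memory_order_relaxed);   // original reference load
      Object* ref = mark(old_ref);                               // rX <- ReadBarrierMarkRegX(rX)
      if (ref != old_ref) {                                      // Cmp temp, ref; B(eq, exit)
        // Losing this race because another mutator already updated the field is fine;
        // the generated code likewise gives up when the expected value no longer matches.
        field.compare_exchange_strong(old_ref, ref, std::memory_order_relaxed);
      }
    }

    int main() {
      static Object from_space, to_space;
      std::atomic<Object*> field{&from_space};
      MarkAndUpdateField(field, [](Object*) { return &to_space; });
      std::printf("field now points to to-space? %d\n", field.load() == &to_space);
      return 0;
    }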