From fe814e89965ddf9a8b603863bd28259f8dd7be35 Mon Sep 17 00:00:00 2001 From: Mathieu Chartier Date: Wed, 9 Nov 2016 14:32:49 -0800 Subject: Use entrypoint switching to reduce code size of GcRoot read barrier Set the read barrier mark register entrypoints to null when the GC is not marking. The compiler uses this to avoid needing to load the is_gc_marking boolean. Code size results on ritzperf CC: arm32: 13439400 -> 13242792 (-1.5%) arm64: 16380544 -> 16208512 (-1.05%) Implemented for arm32 and arm64. TODO: Consider implementing on x86. Bug: 32638713 Bug: 29516974 Test: test-art-host + run ritzperf Change-Id: I527ca5dc4cd43950ba43b872d0ac81e1eb5791eb --- compiler/optimizing/code_generator_arm.cc | 46 +++++++++++++++++------- compiler/optimizing/code_generator_arm64.cc | 54 ++++++++++++++++++++--------- 2 files changed, 71 insertions(+), 29 deletions(-) (limited to 'compiler/optimizing') diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc index 8ca8b8a57b..74a74ee193 100644 --- a/compiler/optimizing/code_generator_arm.cc +++ b/compiler/optimizing/code_generator_arm.cc @@ -620,8 +620,10 @@ class ArraySetSlowPathARM : public SlowPathCodeARM { // reference (different from `ref`) in `obj.field`). class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { public: - ReadBarrierMarkSlowPathARM(HInstruction* instruction, Location ref) - : SlowPathCodeARM(instruction), ref_(ref) { + ReadBarrierMarkSlowPathARM(HInstruction* instruction, + Location ref, + Location entrypoint = Location::NoLocation()) + : SlowPathCodeARM(instruction), ref_(ref), entrypoint_(entrypoint) { DCHECK(kEmitCompilerReadBarrier); } @@ -676,10 +678,15 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { // // rX <- ReadBarrierMarkRegX(rX) // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset(ref_reg); - // This runtime call does not require a stack map. 
- arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + if (entrypoint_.IsValid()) { + arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + __ blx(entrypoint_.AsRegister()); + } else { + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset(ref_reg); + // This runtime call does not require a stack map. + arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + } __ b(GetExitLabel()); } @@ -687,6 +694,9 @@ class ReadBarrierMarkSlowPathARM : public SlowPathCodeARM { // The location (register) of the marked object reference. const Location ref_; + // The location of the entrypoint if already loaded. + const Location entrypoint_; + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM); }; @@ -6829,8 +6839,9 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct // Baker's read barrier are used: // // root = obj.field; - // if (Thread::Current()->GetIsGcMarking()) { - // root = ReadBarrier::Mark(root) + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp != null) { + // root = temp(root) // } // /* GcRoot */ root = *(obj + offset) @@ -6844,14 +6855,23 @@ void InstructionCodeGeneratorARM::GenerateGcRootFieldLoad(HInstruction* instruct "have different sizes."); // Slow path marking the GC root `root`. 
+ Location temp = Location::RegisterLocation(LR); SlowPathCodeARM* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM( + instruction, + root, + /*entrypoint*/ temp); codegen_->AddSlowPath(slow_path); - // IP = Thread::Current()->GetIsGcMarking() - __ LoadFromOffset( - kLoadWord, IP, TR, Thread::IsGcMarkingOffset().Int32Value()); - __ CompareAndBranchIfNonZero(IP, slow_path->GetEntryLabel()); + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset(root.reg()); + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ LoadFromOffset(kLoadWord, temp.AsRegister(), TR, entry_point_offset); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. + __ CompareAndBranchIfNonZero(temp.AsRegister(), slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } else { // GC root loaded through a slow path for read barriers other diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 6f55b422fa..7860138e04 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -607,10 +607,16 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) { // probably still be a from-space reference (unless it gets updated by // another thread, or if another thread installed another object // reference (different from `ref`) in `obj.field`). +// If entrypoint is a valid location it is assumed to already be holding the entrypoint. The case +// where the entrypoint is passed in is for the GcRoot read barrier. 
class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { public: - ReadBarrierMarkSlowPathARM64(HInstruction* instruction, Location ref) - : SlowPathCodeARM64(instruction), ref_(ref) { + ReadBarrierMarkSlowPathARM64(HInstruction* instruction, + Location ref, + Location entrypoint = Location::NoLocation()) + : SlowPathCodeARM64(instruction), + ref_(ref), + entrypoint_(entrypoint) { DCHECK(kEmitCompilerReadBarrier); } @@ -665,10 +671,16 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { // // rX <- ReadBarrierMarkRegX(rX) // - int32_t entry_point_offset = - CodeGenerator::GetReadBarrierMarkEntryPointsOffset(ref_.reg()); - // This runtime call does not require a stack map. - arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + if (entrypoint_.IsValid()) { + arm64_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this); + __ Blr(XRegisterFrom(entrypoint_)); + } else { + // Entrypoint is not already loaded, load from the thread. + int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset(ref_.reg()); + // This runtime call does not require a stack map. + arm64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this); + } __ B(GetExitLabel()); } @@ -676,6 +688,9 @@ class ReadBarrierMarkSlowPathARM64 : public SlowPathCodeARM64 { // The location (register) of the marked object reference. const Location ref_; + // The location of the entrypoint if it is already loaded. 
+ const Location entrypoint_; + DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARM64); }; @@ -5371,8 +5386,9 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru // Baker's read barrier are used: // // root = obj.field; - // if (Thread::Current()->GetIsGcMarking()) { - // root = ReadBarrier::Mark(root) + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // if (temp != null) { + // root = temp(root) // } // /* GcRoot */ root = *(obj + offset) @@ -5389,16 +5405,22 @@ void InstructionCodeGeneratorARM64::GenerateGcRootFieldLoad(HInstruction* instru "art::mirror::CompressedReference and int32_t " "have different sizes."); - // Slow path marking the GC root `root`. + Register temp = lr; + + // Slow path marking the GC root `root`. The entrypoint will already be loaded in temp. SlowPathCodeARM64* slow_path = - new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root); + new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, + root, + LocationFrom(temp)); codegen_->AddSlowPath(slow_path); - - MacroAssembler* masm = GetVIXLAssembler(); - UseScratchRegisterScope temps(masm); - Register temp = temps.AcquireW(); - // temp = Thread::Current()->GetIsGcMarking() - __ Ldr(temp, MemOperand(tr, Thread::IsGcMarkingOffset().Int32Value())); + const int32_t entry_point_offset = + CodeGenerator::GetReadBarrierMarkEntryPointsOffset(root.reg()); + // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg() + // Loading the entrypoint does not require a load acquire since it is only changed when + // threads are suspended or running a checkpoint. + __ Ldr(temp, MemOperand(tr, entry_point_offset)); + // The entrypoint is null when the GC is not marking, this prevents one load compared to + // checking GetIsGcMarking. __ Cbnz(temp, slow_path->GetEntryLabel()); __ Bind(slow_path->GetExitLabel()); } else { -- cgit v1.2.3-59-g8ed1b