author     2022-11-04 10:57:53 +0000
committer  2022-11-08 10:43:08 +0000
commit     5eff6b3307dde2feff372decd0bc7e7e80e340da (patch)
tree       6e8ccdb81cc8691645f7fe918abbb235f04e22bd
parent     00ca1b514f69bec767d56259dea75711e6c939f9 (diff)
Use ShouldDeoptimizeFlag to check if method exit hooks are needed
When we want to execute a particular method in the switch interpreter,
we update the entry point of the method, and any invocations of that
method that are currently on the stack need to be deoptimized. All
future invocations will use the correct entry point, so we only need to
check for a deopt for the methods that are currently on the stack. In
the current implementation, we check a global flag to decide whether to
call method exit hooks (which then check if a deopt of the caller is
necessary), which means we call method exit hooks more often than
necessary. This CL changes that: we use a bit on the ShouldDeoptimizeFlag
and set that bit for all method invocations that are currently on the
stack.
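To illustrate the mechanism, here is a minimal C++ sketch (not the ART
sources) of the per-frame flag this CL relies on. The enum values mirror
DeoptimizeFlagValue from runtime/deoptimization_kind.h in the diff below;
the helper functions are hypothetical stand-ins for the
SetShouldDeoptimizeFlag / UnsetShouldDeoptimizeFlag helpers in
runtime/stack.h.

    #include <cstdint>

    // Bit layout of the per-frame ShouldDeoptimizeFlag stack slot after
    // this CL (mirrors DeoptimizeFlagValue in runtime/deoptimization_kind.h).
    enum class DeoptimizeFlagValue : uint8_t {
      kCHA = 0b001,                        // caller relies on a CHA-based optimization
      kForceDeoptForRedefinition = 0b010,  // this method was redefined and must deopt
      kCheckCallerForDeopt = 0b100,        // new: caller may need a deopt check on exit
    };

    // Hypothetical helper: set a bit on a frame that is currently on the stack.
    inline void SetFlag(uint8_t* slot, DeoptimizeFlagValue value) {
      *slot = *slot | static_cast<uint8_t>(value);
    }

    // Hypothetical helper: clear the bit again when instrumentation is removed.
    inline void UnsetFlag(uint8_t* slot, DeoptimizeFlagValue value) {
      *slot = *slot & ~static_cast<uint8_t>(value);
    }

The stack walk only sets kCheckCallerForDeopt on frames that are already
on the stack, so method exit hooks stay cheap for code compiled afterwards.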
We still have to call method exit hooks for any future invocations when
method exit listeners are installed. So the JITed code is now updated to
call method exit hooks when either the stack slot indicates a deopt
check is necessary or method exit listeners are installed.
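As a rough sketch, the condition the updated JITed exit sequence checks
looks like the following, written in plain C++ rather than the
per-architecture assemblers used by the code generators; the function
name and parameters are illustrative, not ART APIs.

    #include <cstdint>

    // Returns true when the generated method-exit code should take the slow
    // path: either this frame was flagged while it was on the stack (any bit
    // set is enough, per the comment in the diff) or method exit listeners
    // are installed for future invocations.
    bool MethodExitNeedsSlowPath(uint8_t should_deoptimize_flag,
                                 bool have_method_exit_listeners) {
      return should_deoptimize_flag != 0 || have_method_exit_listeners;
    }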
This improves the performance of golem benchmarks by close to 8x,
bringing the performance close to what it was before adding a
breakpoint.
Bug: 253232638
Test: art/test.py
Change-Id: Ic70a568c3099bc9df8d72f423b33b4f148209de9
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc     | 12
-rw-r--r--  compiler/optimizing/code_generator_arm_vixl.cc  | 15
-rw-r--r--  compiler/optimizing/code_generator_x86.cc       | 12
-rw-r--r--  compiler/optimizing/code_generator_x86_64.cc    | 13
-rw-r--r--  runtime/deoptimization_kind.h                    |  7
-rw-r--r--  runtime/instrumentation.cc                       | 34
-rw-r--r--  runtime/instrumentation.h                        |  9
-rw-r--r--  runtime/stack.h                                  | 10
8 files changed, 85 insertions, 27 deletions
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 40928585f6..7fb3b2408b 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1171,9 +1171,19 @@ void InstructionCodeGeneratorARM64::GenerateMethodEntryExitHook(HInstruction* in
       new (codegen_->GetScopedAllocator()) MethodEntryExitHooksSlowPathARM64(instruction);
   codegen_->AddSlowPath(slow_path);
 
+  if (instruction->IsMethodExitHook()) {
+    // Check if we are required to check if the caller needs a deoptimization. Strictly speaking it
+    // would be sufficient to check if CheckCallerForDeopt bit is set. Though it is faster to check
+    // if it is just non-zero. kCHA bit isn't used in debuggable runtimes as cha optimization is
+    // disabled in debuggable runtime. The other bit is used when this method itself requires a
+    // deoptimization due to redefinition. So it is safe to just check for non-zero value here.
+    __ Ldr(value, MemOperand(sp, codegen_->GetStackOffsetOfShouldDeoptimizeFlag()));
+    __ Cbnz(value, slow_path->GetEntryLabel());
+  }
+
   uint64_t address = reinterpret_cast64<uint64_t>(Runtime::Current()->GetInstrumentation());
   MemberOffset offset = instruction->IsMethodExitHook() ?
-      instrumentation::Instrumentation::NeedsExitHooksOffset() :
+      instrumentation::Instrumentation::HaveMethodExitListenersOffset() :
       instrumentation::Instrumentation::HaveMethodEntryListenersOffset();
   __ Mov(temp, address + offset.Int32Value());
   __ Ldrb(value, MemOperand(temp, 0));
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index d9bb63723f..002ca79802 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -2170,8 +2170,21 @@ void InstructionCodeGeneratorARMVIXL::GenerateMethodEntryExitHook(HInstruction*
       new (codegen_->GetScopedAllocator()) MethodEntryExitHooksSlowPathARMVIXL(instruction);
   codegen_->AddSlowPath(slow_path);
 
+  if (instruction->IsMethodExitHook()) {
+    // Check if we are required to check if the caller needs a deoptimization. Strictly speaking it
+    // would be sufficient to check if CheckCallerForDeopt bit is set. Though it is faster to check
+    // if it is just non-zero. kCHA bit isn't used in debuggable runtimes as cha optimization is
+    // disabled in debuggable runtime. The other bit is used when this method itself requires a
+    // deoptimization due to redefinition. So it is safe to just check for non-zero value here.
+    GetAssembler()->LoadFromOffset(kLoadWord,
+                                   temp,
+                                   sp,
+                                   codegen_->GetStackOffsetOfShouldDeoptimizeFlag());
+    __ CompareAndBranchIfNonZero(temp, slow_path->GetEntryLabel());
+  }
+
   MemberOffset offset = instruction->IsMethodExitHook() ?
-      instrumentation::Instrumentation::NeedsExitHooksOffset() :
+      instrumentation::Instrumentation::HaveMethodExitListenersOffset() :
       instrumentation::Instrumentation::HaveMethodEntryListenersOffset();
   uint32_t address = reinterpret_cast32<uint32_t>(Runtime::Current()->GetInstrumentation());
   __ Mov(temp, address + offset.Int32Value());
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 254e20a07a..5daa73ea35 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1199,9 +1199,19 @@ void InstructionCodeGeneratorX86::GenerateMethodEntryExitHook(HInstruction* inst
       new (codegen_->GetScopedAllocator()) MethodEntryExitHooksSlowPathX86(instruction);
   codegen_->AddSlowPath(slow_path);
 
+  if (instruction->IsMethodExitHook()) {
+    // Check if we are required to check if the caller needs a deoptimization. Strictly speaking it
+    // would be sufficient to check if CheckCallerForDeopt bit is set. Though it is faster to check
+    // if it is just non-zero. kCHA bit isn't used in debuggable runtimes as cha optimization is
+    // disabled in debuggable runtime. The other bit is used when this method itself requires a
+    // deoptimization due to redefinition. So it is safe to just check for non-zero value here.
+    __ cmpl(Address(ESP, codegen_->GetStackOffsetOfShouldDeoptimizeFlag()), Immediate(0));
+    __ j(kNotEqual, slow_path->GetEntryLabel());
+  }
+
   uint64_t address = reinterpret_cast64<uint64_t>(Runtime::Current()->GetInstrumentation());
   MemberOffset offset = instruction->IsMethodExitHook() ?
-      instrumentation::Instrumentation::NeedsExitHooksOffset() :
+      instrumentation::Instrumentation::HaveMethodExitListenersOffset() :
       instrumentation::Instrumentation::HaveMethodEntryListenersOffset();
   __ cmpb(Address::Absolute(address + offset.Int32Value()), Immediate(0));
   __ j(kNotEqual, slow_path->GetEntryLabel());
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ad2509f56a..d7867110db 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1564,9 +1564,20 @@ void InstructionCodeGeneratorX86_64::GenerateMethodEntryExitHook(HInstruction* i
       new (codegen_->GetScopedAllocator()) MethodEntryExitHooksSlowPathX86_64(instruction);
   codegen_->AddSlowPath(slow_path);
 
+  if (instruction->IsMethodExitHook()) {
+    // Check if we are required to check if the caller needs a deoptimization. Strictly speaking it
+    // would be sufficient to check if CheckCallerForDeopt bit is set. Though it is faster to check
+    // if it is just non-zero. kCHA bit isn't used in debuggable runtimes as cha optimization is
+    // disabled in debuggable runtime. The other bit is used when this method itself requires a
+    // deoptimization due to redefinition. So it is safe to just check for non-zero value here.
+    __ cmpl(Address(CpuRegister(RSP), codegen_->GetStackOffsetOfShouldDeoptimizeFlag()),
+            Immediate(0));
+    __ j(kNotEqual, slow_path->GetEntryLabel());
+  }
+
   uint64_t address = reinterpret_cast64<uint64_t>(Runtime::Current()->GetInstrumentation());
   MemberOffset offset = instruction->IsMethodExitHook() ?
-      instrumentation::Instrumentation::NeedsExitHooksOffset()
+      instrumentation::Instrumentation::HaveMethodExitListenersOffset()
       : instrumentation::Instrumentation::HaveMethodEntryListenersOffset();
   __ movq(CpuRegister(TMP), Immediate(address + offset.Int32Value()));
   __ cmpb(Address(CpuRegister(TMP), 0), Immediate(0));
diff --git a/runtime/deoptimization_kind.h b/runtime/deoptimization_kind.h
index c2e6a6585a..65a1cf1f2f 100644
--- a/runtime/deoptimization_kind.h
+++ b/runtime/deoptimization_kind.h
@@ -56,9 +56,10 @@ std::ostream& operator<<(std::ostream& os, const DeoptimizationKind& kind);
 // for functions that are already on stack. The value in the slot specifies the
 // reason we need to deoptimize.
 enum class DeoptimizeFlagValue: uint8_t {
-  kCHA = 0b01,
-  kDebug = 0b10,
-  kAll = kCHA | kDebug
+  kCHA = 0b001,
+  kForceDeoptForRedefinition = 0b010,
+  kCheckCallerForDeopt = 0b100,
+  kAll = kCHA | kForceDeoptForRedefinition | kCheckCallerForDeopt
 };
 
 }  // namespace art
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index dea36a86a7..c7393eb25f 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -558,8 +558,9 @@ void InstrumentationInstallStack(Thread* thread, void* arg, bool deopt_all_frame
     if (method_header != nullptr && method_header->HasShouldDeoptimizeFlag()) {
       if (deopt_all_frames_) {
         runtime_methods_need_deopt_check_ = true;
-        SetShouldDeoptimizeFlag(DeoptimizeFlagValue::kDebug);
+        SetShouldDeoptimizeFlag(DeoptimizeFlagValue::kForceDeoptForRedefinition);
       }
+      SetShouldDeoptimizeFlag(DeoptimizeFlagValue::kCheckCallerForDeopt);
       return true;
     }
     CHECK_NE(return_pc, 0U);
@@ -676,9 +677,6 @@ static void InstrumentationRestoreStack(Thread* thread, void* arg)
         runtime_methods_need_deopt_check_(false) {}
 
   bool VisitFrame() override REQUIRES_SHARED(Locks::mutator_lock_) {
-    if (instrumentation_stack_->size() == 0) {
-      return false;  // Stop.
-    }
     ArtMethod* m = GetMethod();
     if (GetCurrentQuickFrame() == nullptr) {
       if (kVerboseInstrumentation) {
@@ -695,9 +693,10 @@ static void InstrumentationRestoreStack(Thread* thread, void* arg)
     }
     const OatQuickMethodHeader* method_header = GetCurrentOatQuickMethodHeader();
     if (method_header != nullptr && method_header->HasShouldDeoptimizeFlag()) {
-      if (IsShouldDeoptimizeFlagForDebugSet()) {
+      if (ShouldForceDeoptForRedefinition()) {
        runtime_methods_need_deopt_check_ = true;
      }
+      UnsetShouldDeoptimizeFlag(DeoptimizeFlagValue::kCheckCallerForDeopt);
     }
     auto it = instrumentation_stack_->find(GetReturnPcAddr());
     if (it != instrumentation_stack_->end()) {
@@ -743,19 +742,17 @@ static void InstrumentationRestoreStack(Thread* thread, void* arg)
  }
   std::map<uintptr_t, instrumentation::InstrumentationStackFrame>* stack =
       thread->GetInstrumentationStack();
-  if (stack->size() > 0) {
-    Instrumentation* instrumentation = reinterpret_cast<Instrumentation*>(arg);
-    uintptr_t instrumentation_exit_pc =
-        reinterpret_cast<uintptr_t>(GetQuickInstrumentationExitPc());
-    RestoreStackVisitor visitor(thread, instrumentation_exit_pc, instrumentation);
-    visitor.WalkStack(true);
-    DCHECK_IMPLIES(visitor.runtime_methods_need_deopt_check_, thread->IsDeoptCheckRequired());
-    if (!visitor.runtime_methods_need_deopt_check_) {
-      thread->SetDeoptCheckRequired(false);
-    }
-    CHECK_EQ(visitor.frames_removed_, stack->size());
-    stack->clear();
+  Instrumentation* instrumentation = reinterpret_cast<Instrumentation*>(arg);
+  uintptr_t instrumentation_exit_pc =
+      reinterpret_cast<uintptr_t>(GetQuickInstrumentationExitPc());
+  RestoreStackVisitor visitor(thread, instrumentation_exit_pc, instrumentation);
+  visitor.WalkStack(true);
+  DCHECK_IMPLIES(visitor.runtime_methods_need_deopt_check_, thread->IsDeoptCheckRequired());
+  if (!visitor.runtime_methods_need_deopt_check_) {
+    thread->SetDeoptCheckRequired(false);
   }
+  CHECK_EQ(visitor.frames_removed_, stack->size());
+  stack->clear();
 }
 
 void Instrumentation::DeoptimizeAllThreadFrames() {
@@ -1720,7 +1717,8 @@ bool Instrumentation::ShouldDeoptimizeCaller(Thread* self,
     DCHECK(header->IsOptimized());
     uint8_t* should_deopt_flag_addr =
         reinterpret_cast<uint8_t*>(caller_sp) + header->GetShouldDeoptimizeFlagOffset();
-    if ((*should_deopt_flag_addr & static_cast<uint8_t>(DeoptimizeFlagValue::kDebug)) != 0) {
+    if ((*should_deopt_flag_addr &
+         static_cast<uint8_t>(DeoptimizeFlagValue::kForceDeoptForRedefinition)) != 0) {
       needs_deopt = true;
     }
   }
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index 25e787f433..390d84a87a 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -218,6 +218,15 @@ class Instrumentation {
     return MemberOffset(OFFSETOF_MEMBER(Instrumentation, have_method_entry_listeners_));
   }
 
+  static constexpr MemberOffset HaveMethodExitListenersOffset() {
+    // Assert that have_method_exit_listeners_ is 8bits wide. If the size changes
+    // update the compare instructions in the code generator when generating checks for
+    // MethodEntryExitHooks.
+    static_assert(sizeof(have_method_exit_listeners_) == 1,
+                  "have_method_exit_listeners_ isn't expected size");
+    return MemberOffset(OFFSETOF_MEMBER(Instrumentation, have_method_exit_listeners_));
+  }
+
   // Add a listener to be notified of the masked together sent of instrumentation events. This
   // suspend the runtime to install stubs. You are expected to hold the mutator lock as a proxy
   // for saying you should have suspended all threads (installing stubs while threads are running
diff --git a/runtime/stack.h b/runtime/stack.h
index 56f38aecef..a4bcf17ca5 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -308,13 +308,19 @@ class StackVisitor {
     *should_deoptimize_addr = *should_deoptimize_addr | static_cast<uint8_t>(value);
   };
 
+  void UnsetShouldDeoptimizeFlag(DeoptimizeFlagValue value) REQUIRES_SHARED(Locks::mutator_lock_) {
+    uint8_t* should_deoptimize_addr = GetShouldDeoptimizeFlagAddr();
+    *should_deoptimize_addr = *should_deoptimize_addr & ~static_cast<uint8_t>(value);
+  };
+
   uint8_t GetShouldDeoptimizeFlag() const REQUIRES_SHARED(Locks::mutator_lock_) {
     return *GetShouldDeoptimizeFlagAddr();
   }
 
-  bool IsShouldDeoptimizeFlagForDebugSet() const REQUIRES_SHARED(Locks::mutator_lock_) {
+  bool ShouldForceDeoptForRedefinition() const REQUIRES_SHARED(Locks::mutator_lock_) {
     uint8_t should_deopt_flag = GetShouldDeoptimizeFlag();
-    return (should_deopt_flag & static_cast<uint8_t>(DeoptimizeFlagValue::kDebug)) != 0;
+    return (should_deopt_flag &
+            static_cast<uint8_t>(DeoptimizeFlagValue::kForceDeoptForRedefinition)) != 0;
   }
 
   // Return the number of dex register in the map from the outermost frame to the number of inlined