Add support for calling entry / exit hooks directly from JIT code

The idea of this CL is to avoid maintaining the instrumentation stack
and manipulating the return addresses on the stack in order to call
the entry / exit hooks. This CL only addresses this for JITed code. In
follow-up CLs, we will extend this to the other cases (native, nterp).
Once everything is in place, we can remove the complexity of the
instrumentation stack.

This CL introduces new nodes (HMethodEntryHook / HMethodExitHook) that
generate code to call the trace entry / exit hooks when
instrumentation stubs are installed. Currently these are introduced
only for JITed code in debuggable mode. The entry / exit hooks roughly
do the same thing as the instrumentation entry / exit points.
We also extend the JITed frame by adding a ShouldDeoptimize slot. This
will be used to force deoptimization of frames when requested by jvmti
(for example, structural redefinition).

Test: art/testrunner.py
Change-Id: Id4aa439731d214a8d2b820a67e75415ca1d5424e
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 74efc9e..d455614 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -822,6 +822,31 @@
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathARM64);
 };
 
+class MethodEntryExitHooksSlowPathARM64 : public SlowPathCodeARM64 {
+ public:
+  explicit MethodEntryExitHooksSlowPathARM64(HInstruction* instruction)
+      : SlowPathCodeARM64(instruction) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) override {
+    LocationSummary* locations = instruction_->GetLocations();
+    QuickEntrypointEnum entry_point =
+        (instruction_->IsMethodEntryHook()) ? kQuickMethodEntryHook : kQuickMethodExitHook;
+    CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+    arm64_codegen->InvokeRuntime(entry_point, instruction_, instruction_->GetDexPc(), this);
+    RestoreLiveRegisters(codegen, locations);
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const override {
+    return "MethodEntryExitHooksSlowPath";
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(MethodEntryExitHooksSlowPathARM64);
+};
+
 #undef __
 
 Location InvokeDexCallingConventionVisitorARM64::GetNextLocation(DataType::Type type) {
@@ -1113,6 +1138,47 @@
   codegen_->MoveLocation(move->GetDestination(), move->GetSource(), DataType::Type::kVoid);
 }
 
+void LocationsBuilderARM64::VisitMethodExitHook(HMethodExitHook* method_hook) {
+  LocationSummary* locations = new (GetGraph()->GetAllocator())
+      LocationSummary(method_hook, LocationSummary::kCallOnSlowPath);
+  DataType::Type return_type = method_hook->InputAt(0)->GetType();
+  locations->SetInAt(0, ARM64ReturnLocation(return_type));
+}
+
+void InstructionCodeGeneratorARM64::GenerateMethodEntryExitHook(HInstruction* instruction) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope temps(masm);
+  Register temp = temps.AcquireX();
+  Register value = temps.AcquireW();
+
+  SlowPathCodeARM64* slow_path =
+      new (codegen_->GetScopedAllocator()) MethodEntryExitHooksSlowPathARM64(instruction);
+  codegen_->AddSlowPath(slow_path);
+
+  uint64_t address = reinterpret_cast64<uint64_t>(Runtime::Current()->GetInstrumentation());
+  int offset = instrumentation::Instrumentation::NeedsEntryExitHooksOffset().Int32Value();
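+  // Check whether the runtime needs the entry / exit hooks; only take the
+  // slow path, which calls into the runtime, when the flag is set.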
+  __ Mov(temp, address + offset);
+  __ Ldrh(value, MemOperand(temp, 0));
+  __ Cbnz(value, slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void InstructionCodeGeneratorARM64::VisitMethodExitHook(HMethodExitHook* instruction) {
+  DCHECK(codegen_->GetCompilerOptions().IsJitCompiler() && GetGraph()->IsDebuggable());
+  DCHECK(codegen_->RequiresCurrentMethod());
+  GenerateMethodEntryExitHook(instruction);
+}
+
+void LocationsBuilderARM64::VisitMethodEntryHook(HMethodEntryHook* method_hook) {
+  new (GetGraph()->GetAllocator()) LocationSummary(method_hook, LocationSummary::kCallOnSlowPath);
+}
+
+void InstructionCodeGeneratorARM64::VisitMethodEntryHook(HMethodEntryHook* instruction) {
+  DCHECK(codegen_->GetCompilerOptions().IsJitCompiler() && GetGraph()->IsDebuggable());
+  DCHECK(codegen_->RequiresCurrentMethod());
+  GenerateMethodEntryExitHook(instruction);
+}
+
 void CodeGeneratorARM64::MaybeIncrementHotness(bool is_frame_entry) {
   MacroAssembler* masm = GetVIXLAssembler();
   if (GetCompilerOptions().CountHotnessInCompiledCode()) {
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index d4546e5..750151a 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -388,6 +388,7 @@
   void GenerateIntRemForConstDenom(HRem *instruction);
   void GenerateIntRemForPower2Denom(HRem *instruction);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
+  void GenerateMethodEntryExitHook(HInstruction* instruction);
 
   // Helpers to set up locations for vector memory operations. Returns the memory operand and,
   // if used, sets the output parameter scratch to a temporary register used in this operand,
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 700202b..bf0c77d 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -971,6 +971,31 @@
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathARMVIXL);
 };
 
+class MethodEntryExitHooksSlowPathARMVIXL : public SlowPathCodeARMVIXL {
+ public:
+  explicit MethodEntryExitHooksSlowPathARMVIXL(HInstruction* instruction)
+      : SlowPathCodeARMVIXL(instruction) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) override {
+    LocationSummary* locations = instruction_->GetLocations();
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
+    QuickEntrypointEnum entry_point =
+        (instruction_->IsMethodEntryHook()) ? kQuickMethodEntryHook : kQuickMethodExitHook;
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+    arm_codegen->InvokeRuntime(entry_point, instruction_, instruction_->GetDexPc(), this);
+    RestoreLiveRegisters(codegen, locations);
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const override {
+    return "MethodEntryExitHooksSlowPath";
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(MethodEntryExitHooksSlowPathARMVIXL);
+};
+
 inline vixl32::Condition ARMCondition(IfCondition cond) {
   switch (cond) {
     case kCondEQ: return eq;
@@ -2111,6 +2136,44 @@
   }
 }
 
+void LocationsBuilderARMVIXL::VisitMethodExitHook(HMethodExitHook* method_hook) {
+  LocationSummary* locations = new (GetGraph()->GetAllocator())
+      LocationSummary(method_hook, LocationSummary::kCallOnSlowPath);
+  locations->SetInAt(0, parameter_visitor_.GetReturnLocation(method_hook->InputAt(0)->GetType()));
+}
+
+void InstructionCodeGeneratorARMVIXL::GenerateMethodEntryExitHook(HInstruction* instruction) {
+  UseScratchRegisterScope temps(GetVIXLAssembler());
+  vixl32::Register temp = temps.Acquire();
+
+  SlowPathCodeARMVIXL* slow_path =
+      new (codegen_->GetScopedAllocator()) MethodEntryExitHooksSlowPathARMVIXL(instruction);
+  codegen_->AddSlowPath(slow_path);
+
+  int offset = instrumentation::Instrumentation::NeedsEntryExitHooksOffset().Int32Value();
+  uint32_t address = reinterpret_cast32<uint32_t>(Runtime::Current()->GetInstrumentation());
+  __ Mov(temp, address + offset);
+  __ Ldrh(temp, MemOperand(temp, 0));
+  __ CompareAndBranchIfNonZero(temp, slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitMethodExitHook(HMethodExitHook* instruction) {
+  DCHECK(codegen_->GetCompilerOptions().IsJitCompiler() && GetGraph()->IsDebuggable());
+  DCHECK(codegen_->RequiresCurrentMethod());
+  GenerateMethodEntryExitHook(instruction);
+}
+
+void LocationsBuilderARMVIXL::VisitMethodEntryHook(HMethodEntryHook* method_hook) {
+  new (GetGraph()->GetAllocator()) LocationSummary(method_hook, LocationSummary::kCallOnSlowPath);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitMethodEntryHook(HMethodEntryHook* instruction) {
+  DCHECK(codegen_->GetCompilerOptions().IsJitCompiler() && GetGraph()->IsDebuggable());
+  DCHECK(codegen_->RequiresCurrentMethod());
+  GenerateMethodEntryExitHook(instruction);
+}
+
 void CodeGeneratorARMVIXL::MaybeIncrementHotness(bool is_frame_entry) {
   if (GetCompilerOptions().CountHotnessInCompiledCode()) {
     UseScratchRegisterScope temps(GetVIXLAssembler());
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index b797c30..aa40755 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -431,6 +431,7 @@
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
   void GenerateDivRemConstantIntegral(HBinaryOperation* instruction);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
+  void GenerateMethodEntryExitHook(HInstruction* instruction);
 
   vixl::aarch32::MemOperand VecAddress(
       HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index c49b08b..a04b412 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -942,6 +942,30 @@
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathX86);
 };
 
+class MethodEntryExitHooksSlowPathX86 : public SlowPathCode {
+ public:
+  explicit MethodEntryExitHooksSlowPathX86(HInstruction* instruction) : SlowPathCode(instruction) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) override {
+    CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    QuickEntrypointEnum entry_point =
+        (instruction_->IsMethodEntryHook()) ? kQuickMethodEntryHook : kQuickMethodExitHook;
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+    x86_codegen->InvokeRuntime(entry_point, instruction_, instruction_->GetDexPc(), this);
+    RestoreLiveRegisters(codegen, locations);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const override {
+    return "MethodEntryExitHooksSlowPath";
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(MethodEntryExitHooksSlowPathX86);
+};
+
 #undef __
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
 #define __ down_cast<X86Assembler*>(GetAssembler())->  // NOLINT
@@ -1097,6 +1121,70 @@
   return dwarf::Reg::X86Core(static_cast<int>(reg));
 }
 
+void SetInForReturnValue(HInstruction* ret, LocationSummary* locations) {
+  switch (ret->InputAt(0)->GetType()) {
+    case DataType::Type::kReference:
+    case DataType::Type::kBool:
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+    case DataType::Type::kInt32:
+      locations->SetInAt(0, Location::RegisterLocation(EAX));
+      break;
+
+    case DataType::Type::kInt64:
+      locations->SetInAt(0, Location::RegisterPairLocation(EAX, EDX));
+      break;
+
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      locations->SetInAt(0, Location::FpuRegisterLocation(XMM0));
+      break;
+
+    case DataType::Type::kVoid:
+      locations->SetInAt(0, Location::NoLocation());
+      break;
+
+    default:
+      LOG(FATAL) << "Unknown return type " << ret->InputAt(0)->GetType();
+  }
+}
+
+void LocationsBuilderX86::VisitMethodExitHook(HMethodExitHook* method_hook) {
+  LocationSummary* locations = new (GetGraph()->GetAllocator())
+      LocationSummary(method_hook, LocationSummary::kCallOnSlowPath);
+  SetInForReturnValue(method_hook, locations);
+}
+
+void InstructionCodeGeneratorX86::GenerateMethodEntryExitHook(HInstruction* instruction) {
+  SlowPathCode* slow_path =
+      new (codegen_->GetScopedAllocator()) MethodEntryExitHooksSlowPathX86(instruction);
+  codegen_->AddSlowPath(slow_path);
+
+  uint64_t address = reinterpret_cast64<uint64_t>(Runtime::Current()->GetInstrumentation());
+  int offset = instrumentation::Instrumentation::NeedsEntryExitHooksOffset().Int32Value();
+  __ cmpw(Address::Absolute(address + offset), Immediate(0));
+  __ j(kNotEqual, slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void InstructionCodeGeneratorX86::VisitMethodExitHook(HMethodExitHook* instruction) {
+  DCHECK(codegen_->GetCompilerOptions().IsJitCompiler() && GetGraph()->IsDebuggable());
+  DCHECK(codegen_->RequiresCurrentMethod());
+  GenerateMethodEntryExitHook(instruction);
+}
+
+void LocationsBuilderX86::VisitMethodEntryHook(HMethodEntryHook* method_hook) {
+  new (GetGraph()->GetAllocator()) LocationSummary(method_hook, LocationSummary::kCallOnSlowPath);
+}
+
+void InstructionCodeGeneratorX86::VisitMethodEntryHook(HMethodEntryHook* instruction) {
+  DCHECK(codegen_->GetCompilerOptions().IsJitCompiler() && GetGraph()->IsDebuggable());
+  DCHECK(codegen_->RequiresCurrentMethod());
+  GenerateMethodEntryExitHook(instruction);
+}
+
 void CodeGeneratorX86::MaybeIncrementHotness(bool is_frame_entry) {
   if (GetCompilerOptions().CountHotnessInCompiledCode()) {
     Register reg = EAX;
@@ -2408,31 +2496,7 @@
 void LocationsBuilderX86::VisitReturn(HReturn* ret) {
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(ret, LocationSummary::kNoCall);
-  switch (ret->InputAt(0)->GetType()) {
-    case DataType::Type::kReference:
-    case DataType::Type::kBool:
-    case DataType::Type::kUint8:
-    case DataType::Type::kInt8:
-    case DataType::Type::kUint16:
-    case DataType::Type::kInt16:
-    case DataType::Type::kInt32:
-      locations->SetInAt(0, Location::RegisterLocation(EAX));
-      break;
-
-    case DataType::Type::kInt64:
-      locations->SetInAt(
-          0, Location::RegisterPairLocation(EAX, EDX));
-      break;
-
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      locations->SetInAt(
-          0, Location::FpuRegisterLocation(XMM0));
-      break;
-
-    default:
-      LOG(FATAL) << "Unknown return type " << ret->InputAt(0)->GetType();
-  }
+  SetInForReturnValue(ret, locations);
 }
 
 void InstructionCodeGeneratorX86::VisitReturn(HReturn* ret) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 94f010e..75c5ceb 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -344,6 +344,8 @@
   bool CpuHasAvxFeatureFlag();
   bool CpuHasAvx2FeatureFlag();
 
+  void GenerateMethodEntryExitHook(HInstruction* instruction);
+
   X86Assembler* const assembler_;
   CodeGeneratorX86* const codegen_;
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index dae2ae2..4ec2dd7 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -965,6 +965,31 @@
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathX86_64);
 };
 
+class MethodEntryExitHooksSlowPathX86_64 : public SlowPathCode {
+ public:
+  explicit MethodEntryExitHooksSlowPathX86_64(HInstruction* instruction)
+      : SlowPathCode(instruction) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) override {
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    QuickEntrypointEnum entry_point =
+        (instruction_->IsMethodEntryHook()) ? kQuickMethodEntryHook : kQuickMethodExitHook;
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+    x86_64_codegen->InvokeRuntime(entry_point, instruction_, instruction_->GetDexPc(), this);
+    RestoreLiveRegisters(codegen, locations);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const override {
+    return "MethodEntryExitHooksSlowPath";
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(MethodEntryExitHooksSlowPathX86_64);
+};
+
 #undef __
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
 #define __ down_cast<X86_64Assembler*>(GetAssembler())->  // NOLINT
@@ -1494,6 +1519,68 @@
   return dwarf::Reg::X86_64Fp(static_cast<int>(reg));
 }
 
+void LocationsBuilderX86_64::VisitMethodEntryHook(HMethodEntryHook* method_hook) {
+  new (GetGraph()->GetAllocator()) LocationSummary(method_hook, LocationSummary::kCallOnSlowPath);
+}
+
+void InstructionCodeGeneratorX86_64::GenerateMethodEntryExitHook(HInstruction* instruction) {
+  SlowPathCode* slow_path =
+      new (codegen_->GetScopedAllocator()) MethodEntryExitHooksSlowPathX86_64(instruction);
+  codegen_->AddSlowPath(slow_path);
+
+  uint64_t address = reinterpret_cast64<uint64_t>(Runtime::Current()->GetInstrumentation());
+  int offset = instrumentation::Instrumentation::NeedsEntryExitHooksOffset().Int32Value();
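+  // The 64-bit flag address cannot be encoded as an absolute memory operand,
+  // so materialize it in TMP and compare the flag against zero there.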
+  __ movq(CpuRegister(TMP), Immediate(address + offset));
+  __ cmpw(Address(CpuRegister(TMP), 0), Immediate(0));
+  __ j(kNotEqual, slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void InstructionCodeGeneratorX86_64::VisitMethodEntryHook(HMethodEntryHook* instruction) {
+  DCHECK(codegen_->GetCompilerOptions().IsJitCompiler() && GetGraph()->IsDebuggable());
+  DCHECK(codegen_->RequiresCurrentMethod());
+  GenerateMethodEntryExitHook(instruction);
+}
+
+void SetInForReturnValue(HInstruction* instr, LocationSummary* locations) {
+  switch (instr->InputAt(0)->GetType()) {
+    case DataType::Type::kReference:
+    case DataType::Type::kBool:
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+    case DataType::Type::kInt32:
+    case DataType::Type::kInt64:
+      locations->SetInAt(0, Location::RegisterLocation(RAX));
+      break;
+
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      locations->SetInAt(0, Location::FpuRegisterLocation(XMM0));
+      break;
+
+    case DataType::Type::kVoid:
+      locations->SetInAt(0, Location::NoLocation());
+      break;
+
+    default:
+      LOG(FATAL) << "Unexpected return type " << instr->InputAt(0)->GetType();
+  }
+}
+
+void LocationsBuilderX86_64::VisitMethodExitHook(HMethodExitHook* method_hook) {
+  LocationSummary* locations = new (GetGraph()->GetAllocator())
+      LocationSummary(method_hook, LocationSummary::kCallOnSlowPath);
+  SetInForReturnValue(method_hook, locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitMethodExitHook(HMethodExitHook* instruction) {
+  DCHECK(codegen_->GetCompilerOptions().IsJitCompiler() && GetGraph()->IsDebuggable());
+  DCHECK(codegen_->RequiresCurrentMethod());
+  GenerateMethodEntryExitHook(instruction);
+}
+
 void CodeGeneratorX86_64::MaybeIncrementHotness(bool is_frame_entry) {
   if (GetCompilerOptions().CountHotnessInCompiledCode()) {
     NearLabel overflow;
@@ -2542,26 +2629,7 @@
 void LocationsBuilderX86_64::VisitReturn(HReturn* ret) {
   LocationSummary* locations =
       new (GetGraph()->GetAllocator()) LocationSummary(ret, LocationSummary::kNoCall);
-  switch (ret->InputAt(0)->GetType()) {
-    case DataType::Type::kReference:
-    case DataType::Type::kBool:
-    case DataType::Type::kUint8:
-    case DataType::Type::kInt8:
-    case DataType::Type::kUint16:
-    case DataType::Type::kInt16:
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      locations->SetInAt(0, Location::RegisterLocation(RAX));
-      break;
-
-    case DataType::Type::kFloat32:
-    case DataType::Type::kFloat64:
-      locations->SetInAt(0, Location::FpuRegisterLocation(XMM0));
-      break;
-
-    default:
-      LOG(FATAL) << "Unexpected return type " << ret->InputAt(0)->GetType();
-  }
+  SetInForReturnValue(ret, locations);
 }
 
 void InstructionCodeGeneratorX86_64::VisitReturn(HReturn* ret) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 3e601bb..1115c83 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -276,6 +276,7 @@
   void GenerateMinMaxInt(LocationSummary* locations, bool is_min, DataType::Type type);
   void GenerateMinMaxFP(LocationSummary* locations, bool is_min, DataType::Type type);
   void GenerateMinMax(HBinaryOperation* minmax, bool is_min);
+  void GenerateMethodEntryExitHook(HInstruction* instruction);
 
   // Generate a heap reference load using one register `out`:
   //
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 3abbbae..c742682 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -812,6 +812,11 @@
                            HBasicBlock* bb_cursor) {
   HShouldDeoptimizeFlag* deopt_flag = new (graph_->GetAllocator())
       HShouldDeoptimizeFlag(graph_->GetAllocator(), dex_pc);
+  // ShouldDeoptimizeFlag is used to perform a deoptimization because of a CHA
+  // invalidation or for debugging reasons. It is OK to just check for a
+  // non-zero value here instead of the specific CHA value: when a debugging
+  // deopt is requested we deoptimize before executing any code, so we
+  // shouldn't see that case here.
   HInstruction* compare = new (graph_->GetAllocator()) HNotEqual(
       deopt_flag, graph_->GetIntConstant(0, dex_pc));
   HInstruction* deopt = new (graph_->GetAllocator()) HDeoptimize(
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index 390a2bb..ed760f1 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -372,6 +372,9 @@
     if (current_block_->IsEntryBlock()) {
       InitializeParameters();
       AppendInstruction(new (allocator_) HSuspendCheck(0u));
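+      // For JITed code of debuggable apps, call the method entry hook before
+      // any of the method's instructions execute.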
+      if (graph_->IsDebuggable() && code_generator_->GetCompilerOptions().IsJitCompiler()) {
+        AppendInstruction(new (allocator_) HMethodEntryHook(0u));
+      }
       AppendInstruction(new (allocator_) HGoto(0u));
       continue;
     } else if (current_block_->IsExitBlock()) {
@@ -822,10 +825,18 @@
           compilation_stats_,
           MethodCompilationStat::kConstructorFenceGeneratedFinal);
     }
+    if (graph_->IsDebuggable() && code_generator_->GetCompilerOptions().IsJitCompiler()) {
+      // The return value is not used for void functions. We pass a
+      // NullConstant to avoid special cases when generating code.
+      AppendInstruction(new (allocator_) HMethodExitHook(graph_->GetNullConstant(), dex_pc));
+    }
     AppendInstruction(new (allocator_) HReturnVoid(dex_pc));
   } else {
     DCHECK(!RequiresConstructorBarrier(dex_compilation_unit_));
     HInstruction* value = LoadLocal(instruction.VRegA(), type);
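+    // For JITed code of debuggable apps, report the method exit (and the
+    // return value) before the actual return.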
+    if (graph_->IsDebuggable() && code_generator_->GetCompilerOptions().IsJitCompiler()) {
+      AppendInstruction(new (allocator_) HMethodExitHook(value, dex_pc));
+    }
     AppendInstruction(new (allocator_) HReturn(value, dex_pc));
   }
   current_block_ = nullptr;
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 17080f0..2478693 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -2913,7 +2913,10 @@
     } else if (current->IsCurrentMethod()) {
       replacement = outer_graph->GetCurrentMethod();
     } else {
-      DCHECK(current->IsGoto() || current->IsSuspendCheck());
+      // It is OK to ignore the MethodEntryHook of an inlined function: in
+      // debuggable mode we don't inline, and in release mode method tracing
+      // is best effort, so it is fine to drop the hook here.
+      DCHECK(current->IsGoto() || current->IsSuspendCheck() || current->IsMethodEntryHook());
       entry_block_->RemoveInstruction(current);
     }
     if (replacement != nullptr) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 06fb88e..978e7c4 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -21,6 +21,7 @@
 #include <array>
 #include <type_traits>
 
+#include "art_method.h"
 #include "base/arena_allocator.h"
 #include "base/arena_bit_vector.h"
 #include "base/arena_containers.h"
@@ -32,7 +33,6 @@
 #include "base/quasi_atomic.h"
 #include "base/stl_util.h"
 #include "base/transform_array_ref.h"
-#include "art_method.h"
 #include "block_namer.h"
 #include "class_root.h"
 #include "compilation_kind.h"
@@ -680,7 +680,7 @@
   }
 
   bool HasShouldDeoptimizeFlag() const {
-    return number_of_cha_guards_ != 0;
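+    // The slot is also needed for debuggable code so that frames already on
+    // the stack can be deoptimized when requested, for example by jvmti.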
+    return number_of_cha_guards_ != 0 || debuggable_;
   }
 
   bool HasTryCatch() const { return has_try_catch_; }
@@ -1530,6 +1530,8 @@
   M(LongConstant, Constant)                                             \
   M(Max, Instruction)                                                   \
   M(MemoryBarrier, Instruction)                                         \
+  M(MethodEntryHook, Instruction)                                       \
+  M(MethodExitHook, Instruction)                                        \
   M(Min, BinaryOperation)                                               \
   M(MonitorOperation, Instruction)                                      \
   M(Mul, BinaryOperation)                                               \
@@ -2991,6 +2993,38 @@
   friend class SsaBuilder;
 };
 
+class HMethodEntryHook : public HExpression<0> {
+ public:
+  explicit HMethodEntryHook(uint32_t dex_pc)
+      : HExpression(kMethodEntryHook, SideEffects::All(), dex_pc) {}
+
+  bool NeedsEnvironment() const override {
+    return true;
+  }
+
+  DECLARE_INSTRUCTION(MethodEntryHook);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(MethodEntryHook);
+};
+
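+// Marks the exit of a method and takes the return value (a null constant for
+// void methods) as its input. Generates code that calls the method exit hook
+// when instrumentation requires it.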
+class HMethodExitHook : public HExpression<1> {
+ public:
+  HMethodExitHook(HInstruction* value, uint32_t dex_pc)
+      : HExpression(kMethodExitHook, SideEffects::All(), dex_pc) {
+    SetRawInputAt(0, value);
+  }
+
+  bool NeedsEnvironment() const override {
+    return true;
+  }
+
+  DECLARE_INSTRUCTION(MethodExitHook);
+
+ protected:
+  DEFAULT_COPY_CONSTRUCTOR(MethodExitHook);
+};
+
 // Represents dex's RETURN_VOID opcode. A HReturnVoid is a control flow
 // instruction that branches to the exit block.
 class HReturnVoid final : public HExpression<0> {
diff --git a/dex2oat/linker/oat_writer_test.cc b/dex2oat/linker/oat_writer_test.cc
index 7bcff2b..0d7e0e5 100644
--- a/dex2oat/linker/oat_writer_test.cc
+++ b/dex2oat/linker/oat_writer_test.cc
@@ -505,7 +505,7 @@
   EXPECT_EQ(64U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(4U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(169 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
+  EXPECT_EQ(171 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
             sizeof(QuickEntryPoints));
 }
 
diff --git a/openjdkjvmti/deopt_manager.cc b/openjdkjvmti/deopt_manager.cc
index bf1b4f0..cf28a71 100644
--- a/openjdkjvmti/deopt_manager.cc
+++ b/openjdkjvmti/deopt_manager.cc
@@ -492,7 +492,12 @@
                                          art::gc::GcCause::kGcCauseDebugger,
                                          art::gc::CollectorType::kCollectorTypeDebugger);
   art::ScopedSuspendAll ssa("Instrument thread stack");
-  art::Runtime::Current()->GetInstrumentation()->InstrumentThreadStack(target);
+  // Prepare the stack so methods can be deoptimized as and when required.
+  // This by itself doesn't cause any methods to deoptimize but enables
+  // deoptimization on demand.
+  art::Runtime::Current()->GetInstrumentation()->InstrumentThreadStack(
+      target,
+      /* deopt_all_frames= */ false);
 }
 
 extern DeoptManager* gDeoptManager;
diff --git a/openjdkjvmti/ti_heap.cc b/openjdkjvmti/ti_heap.cc
index 27fed28..bd9d2dd 100644
--- a/openjdkjvmti/ti_heap.cc
+++ b/openjdkjvmti/ti_heap.cc
@@ -1780,7 +1780,7 @@
       // already have.
       // TODO We technically only need to do this if the frames are not already being interpreted.
       // The cost for doing an extra stack walk is unlikely to be worth it though.
-      instr->InstrumentThreadStack(t);
+      instr->InstrumentThreadStack(t, /* deopt_all_frames= */ true);
     }
   }
 }
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index f5f1274..5ef1d3e 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -2525,3 +2525,36 @@
     // artCompileOptimized doesn't allow thread suspension.
     blx lr
 END art_quick_compile_optimized
+
+// On entry, method is at the bottom of the stack.
+ENTRY art_quick_method_entry_hook
+    SETUP_SAVE_EVERYTHING_FRAME r0
+    ldr r0, [sp, FRAME_SIZE_SAVE_EVERYTHING] @ pass ArtMethod
+    mov r1, rSELF                            @ pass Thread::Current
+    bl  artMethodEntryHook                   @ (ArtMethod*, Thread*)
+    RESTORE_SAVE_EVERYTHING_FRAME
+    REFRESH_MARKING_REGISTER
+    blx lr
+END art_quick_method_entry_hook
+
+ENTRY art_quick_method_exit_hook
+    SETUP_SAVE_EVERYTHING_FRAME r2
+
+    add r3, sp, #8                            @ store fpr_res pointer, in kSaveEverything frame
+    add r2, sp, #136                          @ store gpr_res pointer, in kSaveEverything frame
+    ldr r1, [sp, #FRAME_SIZE_SAVE_EVERYTHING] @ pass ArtMethod*
+    mov r0, rSELF                             @ pass Thread::Current
+    blx artMethodExitHook                     @ (Thread*, ArtMethod*, gpr_res*, fpr_res*)
+
+    .cfi_remember_state
+    cbnz r0, .Ldo_deliver_instrumentation_exception_exit @ Deliver exception
+
+    // Normal return.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    REFRESH_MARKING_REGISTER
+    blx lr
+.Ldo_deliver_instrumentation_exception_exit:
+    .cfi_restore_state
+    .cfi_def_cfa sp, FRAME_SIZE_SAVE_EVERYTHING
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END art_quick_method_exit_hook
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 022a0e4..e5dbeda 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2630,3 +2630,40 @@
     // artCompileOptimized doesn't allow thread suspension.
     ret
 END art_quick_compile_optimized
+
+    .extern artMethodEntryHook
+ENTRY art_quick_method_entry_hook
+    SETUP_SAVE_EVERYTHING_FRAME
+
+    ldr x0, [sp, #FRAME_SIZE_SAVE_EVERYTHING] // pass ArtMethod*
+    mov x1, xSELF                             // pass Thread::Current
+    bl  artMethodEntryHook                    // (ArtMethod*, Thread*)
+
+    RESTORE_SAVE_EVERYTHING_FRAME             // Note: will restore xSELF
+    REFRESH_MARKING_REGISTER
+    ret
+END art_quick_method_entry_hook
+
+    .extern artMethodExitHook
+ENTRY art_quick_method_exit_hook
+    SETUP_SAVE_EVERYTHING_FRAME
+
+    add x3, sp, #16                           // floating-point result ptr in kSaveEverything frame
+    add x2, sp, #272                          // integer result ptr in kSaveEverything frame
+    ldr x1, [sp, #FRAME_SIZE_SAVE_EVERYTHING] // ArtMethod*
+    mov x0, xSELF                             // Thread::Current
+    bl  artMethodExitHook                     // (Thread*, ArtMethod*, gpr_res*, fpr_res*)
+
+    .cfi_remember_state
+    cbnz x0, .Ldo_deliver_instrumentation_exception_exit // Handle exception
+
+    // Normal return.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    REFRESH_MARKING_REGISTER
+    ret
+.Ldo_deliver_instrumentation_exception_exit:
+    .cfi_restore_state
+    .cfi_def_cfa sp, FRAME_SIZE_SAVE_EVERYTHING
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END art_quick_method_exit_hook
+
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index cda98d2..2f6af4f 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -2381,3 +2381,62 @@
     RESTORE_SAVE_EVERYTHING_FRAME
     ret
 END_FUNCTION art_quick_compile_optimized
+
+DEFINE_FUNCTION art_quick_method_entry_hook
+    SETUP_SAVE_EVERYTHING_FRAME edx
+    mov FRAME_SIZE_SAVE_EVERYTHING(%esp), %eax // Fetch ArtMethod
+    subl LITERAL(8), %esp
+    CFI_ADJUST_CFA_OFFSET(8)
+
+    pushl %fs:THREAD_SELF_OFFSET    // Pass Thread::Current().
+    CFI_ADJUST_CFA_OFFSET(4)
+    pushl %eax                      // Pass Method*.
+    CFI_ADJUST_CFA_OFFSET(4)
+
+    call SYMBOL(artMethodEntryHook) // (Method*, Thread*)
+
+    addl LITERAL(16), %esp          // Pop arguments.
+    CFI_ADJUST_CFA_OFFSET(-16)
+
+    RESTORE_SAVE_EVERYTHING_FRAME
+    ret
+END_FUNCTION art_quick_method_entry_hook
+
+DEFINE_FUNCTION art_quick_method_exit_hook
+    SETUP_SAVE_EVERYTHING_FRAME ebx
+
+    mov FRAME_SIZE_SAVE_EVERYTHING(%esp), %ebx // Remember ArtMethod*
+    subl LITERAL(8), %esp                      // Align stack.
+    CFI_ADJUST_CFA_OFFSET(8)
+    PUSH_ARG edx                   // Save gpr return value. edx and eax need to be together
+                                   // which isn't the case in kSaveEverything frame.
+    PUSH_ARG eax
+    movl %esp, %edx                // Get pointer to gpr_result
+    leal 32(%esp), %eax            // Get pointer to fpr_result, in kSaveEverything frame
+    PUSH_ARG eax                   // Pass fpr_result
+    PUSH_ARG edx                   // Pass gpr_result
+    PUSH_ARG ebx                   // Pass ArtMethod*
+    pushl %fs:THREAD_SELF_OFFSET   // Pass Thread::Current.
+    CFI_ADJUST_CFA_OFFSET(4)
+    call SYMBOL(artMethodExitHook) // (Thread*, ArtMethod*, gpr_result*, fpr_result*)
+
+    // Return result could have been changed if it's a reference.
+    movl 16(%esp), %ecx
+    movl %ecx, (80+32)(%esp)
+    addl LITERAL(32), %esp         // Pop arguments and gpr_result.
+    CFI_ADJUST_CFA_OFFSET(-32)
+
+    cmpl LITERAL(1), %eax          // Check if we returned error.
+    CFI_REMEMBER_STATE
+    je .Ldo_deliver_instrumentation_exception_exit
+
+    // Normal return.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    ret
+.Ldo_deliver_instrumentation_exception_exit:
+    CFI_RESTORE_STATE_AND_DEF_CFA esp, FRAME_SIZE_SAVE_EVERYTHING
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END_FUNCTION art_quick_method_exit_hook
+
+
+
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 8c21384..136198f 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -2208,3 +2208,40 @@
     RESTORE_SAVE_EVERYTHING_FRAME               // restore frame up to return address
     ret
 END_FUNCTION art_quick_compile_optimized
+
+// On entry, method is at the bottom of the stack.
+DEFINE_FUNCTION art_quick_method_entry_hook
+    SETUP_SAVE_EVERYTHING_FRAME
+
+    movq FRAME_SIZE_SAVE_EVERYTHING(%rsp), %rdi // pass ArtMethod
+    movq %gs:THREAD_SELF_OFFSET, %rsi           // pass Thread::Current()
+
+    call SYMBOL(artMethodEntryHook)              // (ArtMethod*, Thread*)
+
+    RESTORE_SAVE_EVERYTHING_FRAME
+    ret
+END_FUNCTION art_quick_method_entry_hook
+
+// On entry, method is at the bottom of the stack.
+// and r8 has should_deopt_frame value.
+DEFINE_FUNCTION art_quick_method_exit_hook
+    SETUP_SAVE_EVERYTHING_FRAME
+
+    leaq 16(%rsp), %rcx                         // floating-point result pointer in kSaveEverything
+                                                // frame
+    leaq 144(%rsp), %rdx                        // integer result pointer in kSaveEverything frame
+    movq FRAME_SIZE_SAVE_EVERYTHING(%rsp), %rsi // ArtMethod
+    movq %gs:THREAD_SELF_OFFSET, %rdi           // Thread::Current
+    call SYMBOL(artMethodExitHook)              // (Thread*, ArtMethod*, gpr_res*, fpr_res*)
+
+    cmpq LITERAL(1), %rax
+    CFI_REMEMBER_STATE
+    je .Ldo_deliver_instrumentation_exception_exit
+
+    // Normal return.
+    RESTORE_SAVE_EVERYTHING_FRAME
+    ret
+.Ldo_deliver_instrumentation_exception_exit:
+    CFI_RESTORE_STATE_AND_DEF_CFA rsp, FRAME_SIZE_SAVE_EVERYTHING
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END_FUNCTION art_quick_method_exit_hook
diff --git a/runtime/cha.cc b/runtime/cha.cc
index c345af8..392b35c 100644
--- a/runtime/cha.cc
+++ b/runtime/cha.cc
@@ -219,27 +219,12 @@
     }
 
     // The compiled code on stack is not valid anymore. Need to deoptimize.
-    SetShouldDeoptimizeFlag();
+    SetShouldDeoptimizeFlag(DeoptimizeFlagValue::kCHA);
 
     return true;
   }
 
  private:
-  void SetShouldDeoptimizeFlag() REQUIRES_SHARED(Locks::mutator_lock_) {
-    QuickMethodFrameInfo frame_info = GetCurrentQuickFrameInfo();
-    size_t frame_size = frame_info.FrameSizeInBytes();
-    uint8_t* sp = reinterpret_cast<uint8_t*>(GetCurrentQuickFrame());
-    size_t core_spill_size = POPCOUNT(frame_info.CoreSpillMask()) *
-        GetBytesPerGprSpillLocation(kRuntimeISA);
-    size_t fpu_spill_size = POPCOUNT(frame_info.FpSpillMask()) *
-        GetBytesPerFprSpillLocation(kRuntimeISA);
-    size_t offset = frame_size - core_spill_size - fpu_spill_size - kShouldDeoptimizeFlagSize;
-    uint8_t* should_deoptimize_addr = sp + offset;
-    // Set deoptimization flag to 1.
-    DCHECK(*should_deoptimize_addr == 0 || *should_deoptimize_addr == 1);
-    *should_deoptimize_addr = 1;
-  }
-
   // Set of method headers for compiled code that should be deoptimized.
   const std::unordered_set<OatQuickMethodHeader*>& method_headers_;
 
diff --git a/runtime/deoptimization_kind.h b/runtime/deoptimization_kind.h
index 5be6f3d..c2e6a65 100644
--- a/runtime/deoptimization_kind.h
+++ b/runtime/deoptimization_kind.h
@@ -29,6 +29,7 @@
   kLoopNullBCE,
   kBlockBCE,
   kCHA,
+  kDebugging,
   kFullFrame,
   kLast = kFullFrame
 };
@@ -42,6 +43,7 @@
     case DeoptimizationKind::kLoopNullBCE: return "loop bounds check elimination on null";
     case DeoptimizationKind::kBlockBCE: return "block bounds check elimination";
     case DeoptimizationKind::kCHA: return "class hierarchy analysis";
+    case DeoptimizationKind::kDebugging: return "Deopt requested for debug support";
     case DeoptimizationKind::kFullFrame: return "full frame";
   }
   LOG(FATAL) << "Unexpected kind " << static_cast<size_t>(kind);
@@ -50,6 +52,15 @@
 
 std::ostream& operator<<(std::ostream& os, const DeoptimizationKind& kind);
 
+// We use the ShouldDeoptimize stack slot to record whether a deoptimization is
+// required for functions that are already on the stack. The value in the slot
+// specifies the reason we need to deoptimize.
+enum class DeoptimizeFlagValue: uint8_t {
+  kCHA = 0b01,
+  kDebug = 0b10,
+  kAll = kCHA | kDebug
+};
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_DEOPTIMIZATION_KIND_H_
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index f3fc97e..3fc23ee 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -131,6 +131,10 @@
   qpoints->pUpdateInlineCache = art_quick_update_inline_cache;
   qpoints->pCompileOptimized = art_quick_compile_optimized;
 
+  // Tracing hooks
+  qpoints->pMethodEntryHook = art_quick_method_entry_hook;
+  qpoints->pMethodExitHook = art_quick_method_exit_hook;
+
   bool should_report = false;
   PaletteShouldReportJniInvocations(&should_report);
   if (should_report) {
diff --git a/runtime/entrypoints/quick/quick_entrypoints_list.h b/runtime/entrypoints/quick/quick_entrypoints_list.h
index 5deb557..f69ab1d 100644
--- a/runtime/entrypoints/quick/quick_entrypoints_list.h
+++ b/runtime/entrypoints/quick/quick_entrypoints_list.h
@@ -208,6 +208,8 @@
   V(ReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t) \
   V(ReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*) \
 \
+  V(MethodEntryHook, void, ArtMethod*, Thread*) \
+  V(MethodExitHook, int32_t, Thread*, ArtMethod*, uint64_t*, uint64_t*)
 
 #endif  // ART_RUNTIME_ENTRYPOINTS_QUICK_QUICK_ENTRYPOINTS_LIST_H_
 #undef ART_RUNTIME_ENTRYPOINTS_QUICK_QUICK_ENTRYPOINTS_LIST_H_   // #define is only for lint.
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 3279f7d..be9d949 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -60,6 +60,9 @@
 
 namespace art {
 
+extern "C" NO_RETURN void artDeoptimizeFromCompiledCode(DeoptimizationKind kind, Thread* self);
+extern "C" NO_RETURN void artDeoptimize(Thread* self);
+
 // Visits the arguments as saved to the stack by a CalleeSaveType::kRefAndArgs callee save frame.
 class QuickArgumentVisitor {
   // Number of bytes for each out register in the caller method's frame.
@@ -2588,4 +2591,74 @@
   return result.GetJ();
 }
 
+extern "C" void artMethodEntryHook(ArtMethod* method, Thread* self, ArtMethod** sp ATTRIBUTE_UNUSED)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  instrumentation::Instrumentation* instr = Runtime::Current()->GetInstrumentation();
+  instr->MethodEnterEvent(self, method);
+  if (instr->IsDeoptimized(method)) {
+    // Instrumentation can request deoptimizing only a particular method (for
+    // ex: when there are break points on the method). In such cases deoptimize
+    // only this method. FullFrame deoptimizations are handled on method exits.
+    artDeoptimizeFromCompiledCode(DeoptimizationKind::kDebugging, self);
+  }
+}
+
+extern "C" int artMethodExitHook(Thread* self,
+                                 ArtMethod* method,
+                                 uint64_t* gpr_result,
+                                 uint64_t* fpr_result)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  DCHECK_EQ(reinterpret_cast<uintptr_t>(self), reinterpret_cast<uintptr_t>(Thread::Current()));
+  CHECK(gpr_result != nullptr);
+  CHECK(fpr_result != nullptr);
+  // Instrumentation exit stub must not be entered with a pending exception.
+  CHECK(!self->IsExceptionPending())
+      << "Enter instrumentation exit stub with pending exception " << self->GetException()->Dump();
+
+  instrumentation::Instrumentation* instr = Runtime::Current()->GetInstrumentation();
+  bool is_ref;
+  JValue return_value = instr->GetReturnValue(self, method, &is_ref, gpr_result, fpr_result);
+  bool deoptimize = false;
+  {
+    StackHandleScope<1> hs(self);
+    MutableHandle<mirror::Object> res(hs.NewHandle<mirror::Object>(nullptr));
+    if (is_ref) {
+      // Take a handle to the return value so we won't lose it if we suspend.
+      res.Assign(return_value.GetL());
+    }
+    uint32_t dex_pc = dex::kDexNoIndex;
+    DCHECK(!method->IsRuntimeMethod());
+    instr->MethodExitEvent(self,
+                           ObjPtr<mirror::Object>(),
+                           method,
+                           dex_pc,
+                           /* frame= */ {},
+                           return_value);
+
+    // Deoptimize if the caller needs to continue execution in the interpreter. Do nothing if we get
+    // back to an upcall.
+    NthCallerVisitor visitor(self, 1, true);
+    visitor.WalkStack(true);
+    deoptimize = instr->ShouldDeoptimizeMethod(self, visitor);
+
+    if (is_ref) {
+      // Restore the return value if it's a reference since it might have moved.
+      *reinterpret_cast<mirror::Object**>(gpr_result) = res.Get();
+    }
+  }
+
+  if (self->IsExceptionPending() || self->ObserveAsyncException()) {
+    return 1;
+  }
+
+  if (deoptimize) {
+    DeoptimizationMethodType deopt_method_type = instr->GetDeoptimizationMethodType(method);
+    self->PushDeoptimizationContext(return_value, is_ref, nullptr, false, deopt_method_type);
+    artDeoptimize(self);
+    UNREACHABLE();
+  }
+
+  return 0;
+}
+
 }  // namespace art
diff --git a/runtime/entrypoints/runtime_asm_entrypoints.h b/runtime/entrypoints/runtime_asm_entrypoints.h
index d2096ec..c4e62e5 100644
--- a/runtime/entrypoints/runtime_asm_entrypoints.h
+++ b/runtime/entrypoints/runtime_asm_entrypoints.h
@@ -96,6 +96,8 @@
 
 extern "C" void* art_quick_string_builder_append(uint32_t format);
 extern "C" void art_quick_compile_optimized(ArtMethod*, Thread*);
+extern "C" void art_quick_method_entry_hook(ArtMethod*, Thread*);
+extern "C" int32_t art_quick_method_exit_hook(Thread*, ArtMethod*, uint64_t*, uint64_t*);
 
 }  // namespace art
 
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 73f97bc..b515245 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -405,9 +405,13 @@
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pReadBarrierMarkReg29, pReadBarrierSlow, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pReadBarrierSlow, pReadBarrierForRootSlow,
                          sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(
+        QuickEntryPoints, pReadBarrierForRootSlow, pMethodEntryHook, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pMethodEntryHook, pMethodExitHook, sizeof(void*));
 
-    CHECKED(OFFSETOF_MEMBER(QuickEntryPoints, pReadBarrierForRootSlow)
-            + sizeof(void*) == sizeof(QuickEntryPoints), QuickEntryPoints_all);
+    CHECKED(OFFSETOF_MEMBER(QuickEntryPoints, pMethodExitHook) + sizeof(void*) ==
+                sizeof(QuickEntryPoints),
+            QuickEntryPoints_all);
   }
 };
 
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 97dad8c..91c30c7 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -231,6 +231,26 @@
          !method->IsProxyMethod();
 }
 
+bool Instrumentation::CodeNeedsEntryExitStub(const void* code) {
+  // In some tests the runtime isn't set up fully, so the entry points could
+  // be nullptr.
+  if (code == nullptr) {
+    return true;
+  }
+  // When JITing code for debuggable apps we generate the code to call the
+  // method entry / exit hooks when required. Hence there is no need to update
+  // the entry point to the instrumentation entry point for JITed code in
+  // debuggable mode.
+  if (!Runtime::Current()->IsJavaDebuggable()) {
+    return true;
+  }
+
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  if (jit != nullptr && jit->GetCodeCache()->ContainsPc(code)) {
+    return false;
+  }
+  return true;
+}
+
 void Instrumentation::InstallStubsForMethod(ArtMethod* method) {
   if (!method->IsInvokable() || method->IsProxyMethod()) {
     // Do not change stubs for these methods.
@@ -274,7 +294,12 @@
         if (entry_exit_stubs_installed_) {
           // This needs to be checked first since the instrumentation entrypoint will be able to
           // find the actual JIT compiled code that corresponds to this method.
-          new_quick_code = GetQuickInstrumentationEntryPoint();
+          const void* code = method->GetEntryPointFromQuickCompiledCodePtrSize(kRuntimePointerSize);
+          if (CodeNeedsEntryExitStub(code)) {
+            new_quick_code = GetQuickInstrumentationEntryPoint();
+          } else {
+            new_quick_code = code;
+          }
         } else if (NeedDebugVersionFor(method)) {
           // It would be great to search the JIT for its implementation here but we cannot due to
           // the locks we hold. Instead just set to the interpreter bridge and that code will search
@@ -292,23 +317,30 @@
 }
 
 // Places the instrumentation exit pc as the return PC for every quick frame. This also allows
-// deoptimization of quick frames to interpreter frames.
+// deoptimization of quick frames to interpreter frames. When force_deopt is
+// true, the frames have to be deoptimized. If the frame has a deoptimization
+// stack slot (all JITed frames), the slot is set to indicate this. For frames
+// that do not have this slot, the force_deopt_id on the InstrumentationStack
+// is used to check if the frame needs to be deoptimized. When force_deopt is
+// false, we just instrument the stack for method entry / exit hooks.
 // Since we may already have done this previously, we need to push new instrumentation frame before
 // existing instrumentation frames.
-void InstrumentationInstallStack(Thread* thread, void* arg)
+void InstrumentationInstallStack(Thread* thread, void* arg, bool deopt_all_frames)
     REQUIRES(Locks::mutator_lock_) {
   Locks::mutator_lock_->AssertExclusiveHeld(Thread::Current());
   struct InstallStackVisitor final : public StackVisitor {
     InstallStackVisitor(Thread* thread_in,
                         Context* context,
                         uintptr_t instrumentation_exit_pc,
-                        uint64_t force_deopt_id)
+                        uint64_t force_deopt_id,
+                        bool deopt_all_frames)
         : StackVisitor(thread_in, context, kInstrumentationStackWalk),
           instrumentation_stack_(thread_in->GetInstrumentationStack()),
           instrumentation_exit_pc_(instrumentation_exit_pc),
           reached_existing_instrumentation_frames_(false),
           last_return_pc_(0),
-          force_deopt_id_(force_deopt_id) {}
+          force_deopt_id_(force_deopt_id),
+          deopt_all_frames_(deopt_all_frames) {}
 
     bool VisitFrame() override REQUIRES_SHARED(Locks::mutator_lock_) {
       ArtMethod* m = GetMethod();
@@ -366,6 +398,15 @@
           LOG(INFO) << "Ignoring already instrumented " << frame.Dump();
         }
       } else {
+        // If it is a JITed frame, just set the deopt bit if required and move
+        // on to the next frame; otherwise fall through.
+        const OatQuickMethodHeader* method_header = GetCurrentOatQuickMethodHeader();
+        if (deopt_all_frames_ &&
+            method_header != nullptr &&
+            method_header->HasShouldDeoptimizeFlag()) {
+          SetShouldDeoptimizeFlag(DeoptimizeFlagValue::kDebug);
+          return true;
+        }
         CHECK_NE(return_pc, 0U);
         if (UNLIKELY(reached_existing_instrumentation_frames_ && !m->IsRuntimeMethod())) {
           // We already saw an existing instrumentation frame so this should be a runtime-method
@@ -373,9 +414,8 @@
           std::string thread_name;
           GetThread()->GetThreadName(thread_name);
           uint32_t dex_pc = dex::kDexNoIndex;
-          if (last_return_pc_ != 0 && GetCurrentOatQuickMethodHeader() != nullptr) {
-            dex_pc = GetCurrentOatQuickMethodHeader()->ToDexPc(
-                GetCurrentQuickFrame(), last_return_pc_);
+          if (last_return_pc_ != 0 && method_header != nullptr) {
+            dex_pc = method_header->ToDexPc(GetCurrentQuickFrame(), last_return_pc_);
           }
           LOG(FATAL) << "While walking " << thread_name << " found unexpected non-runtime method"
                      << " without instrumentation exit return or interpreter frame."
@@ -413,6 +453,7 @@
     bool reached_existing_instrumentation_frames_;
     uintptr_t last_return_pc_;
     uint64_t force_deopt_id_;
+    bool deopt_all_frames_;
   };
   if (kVerboseInstrumentation) {
     std::string thread_name;
@@ -423,8 +464,11 @@
   Instrumentation* instrumentation = reinterpret_cast<Instrumentation*>(arg);
   std::unique_ptr<Context> context(Context::Create());
   uintptr_t instrumentation_exit_pc = reinterpret_cast<uintptr_t>(GetQuickInstrumentationExitPc());
-  InstallStackVisitor visitor(
-      thread, context.get(), instrumentation_exit_pc, instrumentation->current_force_deopt_id_);
+  InstallStackVisitor visitor(thread,
+                              context.get(),
+                              instrumentation_exit_pc,
+                              instrumentation->current_force_deopt_id_,
+                              deopt_all_frames);
   visitor.WalkStack(true);
   CHECK_EQ(visitor.dex_pcs_.size(), thread->GetInstrumentationStack()->size());
 
@@ -449,9 +493,9 @@
   thread->VerifyStack();
 }
 
-void Instrumentation::InstrumentThreadStack(Thread* thread) {
+void Instrumentation::InstrumentThreadStack(Thread* thread, bool force_deopt) {
   instrumentation_stubs_installed_ = true;
-  InstrumentationInstallStack(thread, this);
+  InstrumentationInstallStack(thread, this, force_deopt);
 }
 
 // Removes the instrumentation exit pc as the return PC for every quick frame.
@@ -548,7 +592,7 @@
   ThreadList* tl = Runtime::Current()->GetThreadList();
   tl->ForEach([&](Thread* t) {
     Locks::mutator_lock_->AssertExclusiveHeld(self);
-    InstrumentThreadStack(t);
+    InstrumentThreadStack(t, /* deopt_all_frames= */ true);
   });
   current_force_deopt_id_++;
 }
@@ -800,7 +844,9 @@
     runtime->GetClassLinker()->VisitClasses(&visitor);
     instrumentation_stubs_installed_ = true;
     MutexLock mu(self, *Locks::thread_list_lock_);
-    runtime->GetThreadList()->ForEach(InstrumentationInstallStack, this);
+    for (Thread* thread : Runtime::Current()->GetThreadList()->GetList()) {
+      InstrumentThreadStack(thread, /* deopt_all_frames= */ false);
+    }
   } else {
     interpreter_stubs_installed_ = false;
     entry_exit_stubs_installed_ = false;
@@ -924,7 +970,8 @@
                  // implementation directly and this will confuse the instrumentation trampolines.
                  // TODO We should remove the need for this since it makes it impossible to profile
                  // Proxy.<init> correctly in all cases.
-                 method != jni::DecodeArtMethod(WellKnownClasses::java_lang_reflect_Proxy_init)) {
+                 method != jni::DecodeArtMethod(WellKnownClasses::java_lang_reflect_Proxy_init) &&
+                 CodeNeedsEntryExitStub(quick_code)) {
         new_quick_code = GetQuickInstrumentationEntryPoint();
       } else {
         new_quick_code = quick_code;
@@ -1017,7 +1064,12 @@
     // these previously so it will only cover the newly created frames.
     instrumentation_stubs_installed_ = true;
     MutexLock mu(self, *Locks::thread_list_lock_);
-    Runtime::Current()->GetThreadList()->ForEach(InstrumentationInstallStack, this);
+    for (Thread* thread : Runtime::Current()->GetThreadList()->GetList()) {
+      // This isn't a strong deopt. We deopt this method if it is still in the
+      // deopt methods list. If, by the time we hit this frame, we no longer
+      // need a deopt it is safe to continue, so we don't mark the frame.
+      InstrumentThreadStack(thread, /* deopt_all_frames= */ false);
+    }
   }
 }
 
@@ -1451,28 +1503,8 @@
   return shorty;
 }
 
-TwoWordReturn Instrumentation::PopInstrumentationStackFrame(Thread* self,
-                                                            uintptr_t* return_pc_addr,
-                                                            uint64_t* gpr_result,
-                                                            uint64_t* fpr_result) {
-  DCHECK(gpr_result != nullptr);
-  DCHECK(fpr_result != nullptr);
-  // Do the pop.
-  std::map<uintptr_t, instrumentation::InstrumentationStackFrame>* stack =
-      self->GetInstrumentationStack();
-  CHECK_GT(stack->size(), 0U);
-  auto it = stack->find(reinterpret_cast<uintptr_t>(return_pc_addr));
-  CHECK(it != stack->end());
-  InstrumentationStackFrame instrumentation_frame = it->second;
-  stack->erase(it);
-
-  // Set return PC and check the consistency of the stack.
-  // We don't cache the return pc value in a local as it may change after
-  // sending a method exit event.
-  *return_pc_addr = instrumentation_frame.return_pc_;
-  self->VerifyStack();
-
-  ArtMethod* method = instrumentation_frame.method_;
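+// Reads the raw result registers and packages them into a JValue based on the
+// method's return type. Sets *is_ref when the return value is a reference.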
+JValue Instrumentation::GetReturnValue(
+    Thread* self, ArtMethod* method, bool* is_ref, uint64_t* gpr_result, uint64_t* fpr_result) {
   uint32_t length;
   const PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
   char return_shorty;
@@ -1503,9 +1535,7 @@
     return_shorty = method->GetInterfaceMethodIfProxy(pointer_size)->GetShorty(&length)[0];
   }
 
-  bool is_ref = return_shorty == '[' || return_shorty == 'L';
-  StackHandleScope<1> hs(self);
-  MutableHandle<mirror::Object> res(hs.NewHandle<mirror::Object>(nullptr));
+  *is_ref = return_shorty == '[' || return_shorty == 'L';
   JValue return_value;
   if (return_shorty == 'V') {
     return_value.SetJ(0);
@@ -1514,6 +1544,59 @@
   } else {
     return_value.SetJ(*gpr_result);
   }
+  return return_value;
+}
+
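+// Returns true when the caller frame described by `visitor` has to continue
+// in the interpreter, either because of existing deoptimization requests or
+// because the frame's ShouldDeoptimize slot has the kDebug bit set.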
+bool Instrumentation::ShouldDeoptimizeMethod(Thread* self, const NthCallerVisitor& visitor) {
+  bool should_deoptimize_frame = false;
+  const OatQuickMethodHeader* header = visitor.GetCurrentOatQuickMethodHeader();
+  if (header != nullptr && header->HasShouldDeoptimizeFlag()) {
+    uint8_t should_deopt_flag = visitor.GetShouldDeoptimizeFlag();
+    // DeoptimizeFlag could be set for debugging or for CHA invalidations.
+    // Deoptimize here only if it was requested for debugging. CHA
+    // invalidations are handled in the JITed code.
+    if ((should_deopt_flag & static_cast<uint8_t>(DeoptimizeFlagValue::kDebug)) != 0) {
+      should_deoptimize_frame = true;
+    }
+  }
+  return (visitor.caller != nullptr) &&
+         (interpreter_stubs_installed_ || IsDeoptimized(visitor.caller) ||
+          self->IsForceInterpreter() ||
+          // NB Since structurally obsolete compiled methods might have the offsets of
+          // methods/fields compiled in we need to go back to interpreter whenever we hit
+          // them.
+          visitor.caller->GetDeclaringClass()->IsObsoleteObject() ||
+          Dbg::IsForcedInterpreterNeededForUpcall(self, visitor.caller) ||
+          should_deoptimize_frame);
+}
+
+TwoWordReturn Instrumentation::PopInstrumentationStackFrame(Thread* self,
+                                                            uintptr_t* return_pc_addr,
+                                                            uint64_t* gpr_result,
+                                                            uint64_t* fpr_result) {
+  DCHECK(gpr_result != nullptr);
+  DCHECK(fpr_result != nullptr);
+  // Do the pop.
+  std::map<uintptr_t, instrumentation::InstrumentationStackFrame>* stack =
+      self->GetInstrumentationStack();
+  CHECK_GT(stack->size(), 0U);
+  auto it = stack->find(reinterpret_cast<uintptr_t>(return_pc_addr));
+  CHECK(it != stack->end());
+  InstrumentationStackFrame instrumentation_frame = it->second;
+  stack->erase(it);
+
+  // Set return PC and check the consistency of the stack.
+  // We don't cache the return pc value in a local as it may change after
+  // sending a method exit event.
+  *return_pc_addr = instrumentation_frame.return_pc_;
+  self->VerifyStack();
+
+  ArtMethod* method = instrumentation_frame.method_;
+
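+  // Decode the return value of the popped method from the raw GPR / FPR contents.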
+  bool is_ref;
+  JValue return_value = GetReturnValue(self, method, &is_ref, gpr_result, fpr_result);
+  StackHandleScope<1> hs(self);
+  MutableHandle<mirror::Object> res(hs.NewHandle<mirror::Object>(nullptr));
   if (is_ref) {
     // Take a handle to the return value so we won't lose it if we suspend.
     res.Assign(return_value.GetL());
@@ -1532,17 +1615,11 @@
   // back to an upcall.
   NthCallerVisitor visitor(self, 1, true);
   visitor.WalkStack(true);
-  bool deoptimize = (visitor.caller != nullptr) &&
-                    (interpreter_stubs_installed_ || IsDeoptimized(visitor.caller) ||
-                    self->IsForceInterpreter() ||
-                    // NB Since structurally obsolete compiled methods might have the offsets of
-                    // methods/fields compiled in we need to go back to interpreter whenever we hit
-                    // them.
-                    visitor.caller->GetDeclaringClass()->IsObsoleteObject() ||
-                    // Check if we forced all threads to deoptimize in the time between this frame
-                    // being created and now.
-                    instrumentation_frame.force_deopt_id_ != current_force_deopt_id_ ||
-                    Dbg::IsForcedInterpreterNeededForUpcall(self, visitor.caller));
+  // Check if we forced all threads to deoptimize in the time between this frame being created and
+  // now.
+  bool should_deoptimize_frame = instrumentation_frame.force_deopt_id_ != current_force_deopt_id_;
+  bool deoptimize = ShouldDeoptimizeMethod(self, visitor) || should_deoptimize_frame;
+
   if (is_ref) {
     // Restore the return value if it's a reference since it might have moved.
     *reinterpret_cast<mirror::Object**>(gpr_result) = res.Get();
@@ -1560,8 +1637,8 @@
     }
     DeoptimizationMethodType deopt_method_type = GetDeoptimizationMethodType(method);
     self->PushDeoptimizationContext(return_value,
-                                    return_shorty == 'L' || return_shorty == '[',
-                                    /* exception= */ nullptr ,
+                                    is_ref,
+                                    /* exception= */ nullptr,
                                     /* from_code= */ false,
                                     deopt_method_type);
     return GetTwoWordSuccessValue(*return_pc_addr,
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index bdeaf30..c49d672 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -17,12 +17,13 @@
 #ifndef ART_RUNTIME_INSTRUMENTATION_H_
 #define ART_RUNTIME_INSTRUMENTATION_H_
 
-#include <functional>
 #include <stdint.h>
+
+#include <functional>
 #include <list>
 #include <memory>
-#include <unordered_set>
 #include <optional>
+#include <unordered_set>
 
 #include "arch/instruction_set.h"
 #include "base/enums.h"
@@ -30,6 +31,7 @@
 #include "base/macros.h"
 #include "base/safe_map.h"
 #include "gc_root.h"
+#include "offsets.h"
 
 namespace art {
 namespace mirror {
@@ -41,6 +43,7 @@
 class ArtMethod;
 template <typename T> class Handle;
 template <typename T> class MutableHandle;
+struct NthCallerVisitor;
 union JValue;
 class SHARED_LOCKABLE ReaderWriterMutex;
 class ShadowFrame;
@@ -207,6 +210,10 @@
 
   Instrumentation();
 
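+  // Offset of the instrumentation_stubs_installed_ field; generated code reads it to decide
+  // whether the method entry / exit hooks need to be called.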
+  static constexpr MemberOffset NeedsEntryExitHooksOffset() {
+    return MemberOffset(OFFSETOF_MEMBER(Instrumentation, instrumentation_stubs_installed_));
+  }
+
   // Add a listener to be notified of the masked together set of instrumentation events. This
   // suspends the runtime to install stubs. You are expected to hold the mutator lock as a proxy
   // for saying you should have suspended all threads (installing stubs while threads are running
@@ -485,6 +492,14 @@
   void ExceptionHandledEvent(Thread* thread, ObjPtr<mirror::Throwable> exception_object) const
       REQUIRES_SHARED(Locks::mutator_lock_);
 
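+  // Reconstructs the return value of the given method from the raw GPR / FPR contents and sets
+  // *is_ref to true if the return type is a reference.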
+  JValue GetReturnValue(Thread* self,
+                        ArtMethod* method,
+                        bool* is_ref,
+                        uint64_t* gpr_result,
+                        uint64_t* fpr_result) REQUIRES_SHARED(Locks::mutator_lock_);
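+  // Returns true if the caller frame seen by the visitor has to be deoptimized, either because
+  // deoptimization was requested (debugger, forced interpreter, obsolete class) or because the
+  // frame's ShouldDeoptimize flag was set for debugging.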
+  bool ShouldDeoptimizeMethod(Thread* self, const NthCallerVisitor& visitor)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Called when an instrumented method is entered. The intended link register (lr) is saved so
   // that returning causes a branch to the method exit stub. Generates method enter events.
   void PushInstrumentationStackFrame(Thread* self,
@@ -530,10 +545,13 @@
                !GetDeoptimizedMethodsLock());
 
   // Install instrumentation exit stub on every method of the stack of the given thread.
-  // This is used by the debugger to cause a deoptimization of the thread's stack after updating
-  // local variable(s).
-  void InstrumentThreadStack(Thread* thread)
-      REQUIRES(Locks::mutator_lock_);
+  // This is used by:
+  //  - the debugger to cause a deoptimization of all the frames in the thread's stack (for
+  //    example, after updating local variables)
+  //  - method tracing, to call method entry / exit hooks. For this we instrument
+  //    the stack frames to run the entry / exit hooks but don't need to deoptimize.
+  // deopt_all_frames indicates whether the frames need to be deoptimized.
+  void InstrumentThreadStack(Thread* thread, bool deopt_all_frames) REQUIRES(Locks::mutator_lock_);
 
   // Force all currently running frames to be deoptimized back to interpreter. This should only be
   // used in cases where basically all compiled code has been invalidated.
@@ -557,6 +575,10 @@
   // False otherwise.
   bool RequiresInstrumentationInstallation(InstrumentationLevel new_level) const;
 
+  // Returns true if the given code needs an entry / exit stub to call the entry / exit hooks.
+  // JITed code calls the hooks directly and doesn't need the stub.
+  bool CodeNeedsEntryExitStub(const void* code);
+
   // Does the job of installing or removing instrumentation code within methods.
   // In order to support multiple clients using instrumentation at the same time,
   // the caller must pass a unique key (a string) identifying it so we remind which
@@ -751,7 +773,7 @@
 
   friend class InstrumentationTest;  // For GetCurrentInstrumentationLevel and ConfigureStubs.
   friend class InstrumentationStackPopper;  // For popping instrumentation frames.
-  friend void InstrumentationInstallStack(Thread*, void*);
+  friend void InstrumentationInstallStack(Thread*, void*, bool);
 
   DISALLOW_COPY_AND_ASSIGN(Instrumentation);
 };
diff --git a/runtime/oat.h b/runtime/oat.h
index 95eb0e1..ac70a77 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Inline IRT frame push/pop into JNI stubs.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '0', '3', '\0' } };
+  // Last oat version changed reason: Introduced new entry points for method entry / exit hooks.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '0', '4', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/quick_exception_handler.cc b/runtime/quick_exception_handler.cc
index 5f497af..ac5065b 100644
--- a/runtime/quick_exception_handler.cc
+++ b/runtime/quick_exception_handler.cc
@@ -599,7 +599,10 @@
               << GetDeoptimizationKindName(kind);
     DumpFramesWithType(self_, /* details= */ true);
   }
-  if (Runtime::Current()->UseJitCompilation()) {
+  // When deoptimizing for debug support, the optimized code is still valid and
+  // can be reused once debugging support (like breakpoints) is no longer
+  // needed for this method.
+  if (Runtime::Current()->UseJitCompilation() && (kind != DeoptimizationKind::kDebugging)) {
     Runtime::Current()->GetJit()->GetCodeCache()->InvalidateCompiledCodeFor(
         deopt_method, visitor.GetSingleFrameDeoptQuickMethodHeader());
   } else {
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 233106e..eb0fe56 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -800,6 +800,21 @@
   return RuntimeCalleeSaveFrame::GetMethodFrameInfo(CalleeSaveType::kSaveRefsAndArgs);
 }
 
+uint8_t* StackVisitor::GetShouldDeoptimizeFlagAddr() const REQUIRES_SHARED(Locks::mutator_lock_) {
+  DCHECK(GetCurrentOatQuickMethodHeader()->HasShouldDeoptimizeFlag());
+  QuickMethodFrameInfo frame_info = GetCurrentQuickFrameInfo();
+  size_t frame_size = frame_info.FrameSizeInBytes();
+  uint8_t* sp = reinterpret_cast<uint8_t*>(GetCurrentQuickFrame());
+  size_t core_spill_size =
+      POPCOUNT(frame_info.CoreSpillMask()) * GetBytesPerGprSpillLocation(kRuntimeISA);
+  size_t fpu_spill_size =
+      POPCOUNT(frame_info.FpSpillMask()) * GetBytesPerFprSpillLocation(kRuntimeISA);
+  size_t offset = frame_size - core_spill_size - fpu_spill_size - kShouldDeoptimizeFlagSize;
+  uint8_t* should_deoptimize_addr = sp + offset;
+  DCHECK_EQ(*should_deoptimize_addr & ~static_cast<uint8_t>(DeoptimizeFlagValue::kAll), 0);
+  return should_deoptimize_addr;
+}
+
 template <StackVisitor::CountTransitions kCount>
 void StackVisitor::WalkStack(bool include_transitions) {
   if (check_suspended_) {
diff --git a/runtime/stack.h b/runtime/stack.h
index 2a6fdc2..1b00b54 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -17,12 +17,14 @@
 #ifndef ART_RUNTIME_STACK_H_
 #define ART_RUNTIME_STACK_H_
 
-#include <optional>
 #include <stdint.h>
+
+#include <optional>
 #include <string>
 
 #include "base/locks.h"
 #include "base/macros.h"
+#include "deoptimization_kind.h"
 #include "obj_ptr.h"
 #include "quick/quick_method_frame_info.h"
 #include "stack_map.h"
@@ -295,6 +297,15 @@
 
   QuickMethodFrameInfo GetCurrentQuickFrameInfo() const REQUIRES_SHARED(Locks::mutator_lock_);
 
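+  // Accessors for the ShouldDeoptimize flag stored in the current quick frame (see
+  // GetShouldDeoptimizeFlagAddr for its location in the frame).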
+  void SetShouldDeoptimizeFlag(DeoptimizeFlagValue value) REQUIRES_SHARED(Locks::mutator_lock_) {
+    uint8_t* should_deoptimize_addr = GetShouldDeoptimizeFlagAddr();
+    *should_deoptimize_addr = *should_deoptimize_addr | static_cast<uint8_t>(value);
+  }
+
+  uint8_t GetShouldDeoptimizeFlag() const REQUIRES_SHARED(Locks::mutator_lock_) {
+    return *GetShouldDeoptimizeFlagAddr();
+  }
+
  private:
   // Private constructor known in the case that num_frames_ has already been computed.
   StackVisitor(Thread* thread,
@@ -368,6 +379,8 @@
   mutable std::pair<const OatQuickMethodHeader*, CodeInfo> cur_inline_info_;
   mutable std::pair<uintptr_t, StackMap> cur_stack_map_;
 
+  uint8_t* GetShouldDeoptimizeFlagAddr() const REQUIRES_SHARED(Locks::mutator_lock_);
+
  protected:
   Context* const context_;
   const bool check_suspended_;
diff --git a/runtime/trace.cc b/runtime/trace.cc
index 5996a57..4082721 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -421,10 +421,11 @@
                                             "Sampling profiler thread");
         the_trace_->interval_us_ = interval_us;
       } else {
-        runtime->GetInstrumentation()->AddListener(the_trace_,
-                                                   instrumentation::Instrumentation::kMethodEntered |
-                                                   instrumentation::Instrumentation::kMethodExited |
-                                                   instrumentation::Instrumentation::kMethodUnwind);
+        runtime->GetInstrumentation()->AddListener(
+            the_trace_,
+            instrumentation::Instrumentation::kMethodEntered |
+                instrumentation::Instrumentation::kMethodExited |
+                instrumentation::Instrumentation::kMethodUnwind);
         // TODO: In full-PIC mode, we don't need to fully deopt.
         // TODO: We can only use trampoline entrypoints if we are java-debuggable since in that case
         // we know that inlining and other problematic optimizations are disabled. We might just
@@ -480,9 +481,10 @@
         runtime->GetThreadList()->ForEach(ClearThreadStackTraceAndClockBase, nullptr);
       } else {
         runtime->GetInstrumentation()->RemoveListener(
-            the_trace, instrumentation::Instrumentation::kMethodEntered |
-            instrumentation::Instrumentation::kMethodExited |
-            instrumentation::Instrumentation::kMethodUnwind);
+            the_trace,
+            instrumentation::Instrumentation::kMethodEntered |
+                instrumentation::Instrumentation::kMethodExited |
+                instrumentation::Instrumentation::kMethodUnwind);
         runtime->GetInstrumentation()->DisableMethodTracing(kTracerInstrumentationKey);
       }
     }
diff --git a/test/2011-stack-walk-concurrent-instrument/src/Main.java b/test/2011-stack-walk-concurrent-instrument/src/Main.java
index 8f96f93..53a7eea 100644
--- a/test/2011-stack-walk-concurrent-instrument/src/Main.java
+++ b/test/2011-stack-walk-concurrent-instrument/src/Main.java
@@ -33,7 +33,7 @@
   }
 
   public native void resetTest();
-  public native void waitAndDeopt(Thread t);
+  public native void waitAndInstrumentStack(Thread t);
   public native void doSelfStackWalk();
 
   void testConcurrent() throws Exception {
@@ -41,7 +41,7 @@
     final Thread current = Thread.currentThread();
     Thread t = new Thread(() -> {
       try {
-        this.waitAndDeopt(current);
+        this.waitAndInstrumentStack(current);
       } catch (Exception e) {
         throw new Error("Fail!", e);
       }
diff --git a/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc b/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
index a10fe2e..5eaaa05 100644
--- a/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
+++ b/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc
@@ -76,7 +76,9 @@
   CHECK(sswv.found_g_);
   CHECK(sswv.found_h_);
 }
-extern "C" JNIEXPORT void JNICALL Java_Main_waitAndDeopt(JNIEnv*, jobject, jobject target) {
+extern "C" JNIEXPORT void JNICALL Java_Main_waitAndInstrumentStack(JNIEnv*,
+                                                                   jobject,
+                                                                   jobject target) {
   while (!instrument_waiting) {
   }
   bool timed_out = false;
@@ -85,7 +87,8 @@
   CHECK(!timed_out);
   CHECK(other != nullptr);
   ScopedSuspendAll ssa(__FUNCTION__);
-  Runtime::Current()->GetInstrumentation()->InstrumentThreadStack(other);
+  Runtime::Current()->GetInstrumentation()->InstrumentThreadStack(other,
+                                                                  /* deopt_all_frames= */ false);
   MutexLock mu(Thread::Current(), *Locks::thread_suspend_count_lock_);
   bool updated = other->ModifySuspendCount(Thread::Current(), -1, nullptr, SuspendReason::kInternal);
   CHECK(updated);