Simplify hotness count in baseline compiled code.

- Always require the current ArtMethod, which also removes the need to
  handle empty frames.
- Remove the use of some temporary registers.
- Require a ProfilingInfo when compiling baseline.
- Add a slow path for requesting optimized compilation.
- Make the counter decrement instead of increment.

A follow-up CL will make the hotness threshold configurable through --jitthreshold.
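
For context, a minimal C++ model of the new scheme (illustrative only, not
ART code; FakeProfilingInfo and CheckHotness are made-up names). It roughly
follows the arm64 sequence, which tests the counter for zero before
decrementing; x86 decrements first and branches on the result reaching zero.

  #include <cstdint>

  // Hypothetical stand-in for the 16-bit hotness counter stored in
  // ProfilingInfo; the JIT seeds it with the compilation threshold.
  struct FakeProfilingInfo {
    uint16_t baseline_hotness_count;
  };

  // Models one check, as emitted at method entry and at back edges: if the
  // counter is already zero, take the out-of-line slow path that calls the
  // kQuickCompileOptimized entrypoint (no stack map is recorded there);
  // otherwise decrement and fall through.
  void CheckHotness(FakeProfilingInfo* info, void (*compile_optimized)()) {
    if (info->baseline_hotness_count == 0) {
      compile_optimized();  // Slow path: request optimized compilation.
      return;
    }
    info->baseline_hotness_count -= 1;
  }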

Test: test.py
Bug: 146423102
Change-Id: I1485f66401d6ed218456fe2849eb05fa77479668
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index ced94f4..27eabaf 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -1081,6 +1081,11 @@
       }
     }
   }
+  if (GetGraph()->IsCompilingBaseline()) {
+    // We need the current method in case we reach the hotness threshold. As a
+    // side effect this makes the frame non-empty.
+    SetRequiresCurrentMethod();
+  }
 }
 
 CodeGenerator::~CodeGenerator() {}
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index bcb5ac5..933e270 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -848,6 +848,29 @@
   DISALLOW_COPY_AND_ASSIGN(MethodEntryExitHooksSlowPathARM64);
 };
 
+class CompileOptimizedSlowPathARM64 : public SlowPathCodeARM64 {
+ public:
+  CompileOptimizedSlowPathARM64() : SlowPathCodeARM64(/* instruction= */ nullptr) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) override {
+    uint32_t entrypoint_offset =
+        GetThreadOffset<kArm64PointerSize>(kQuickCompileOptimized).Int32Value();
+    __ Bind(GetEntryLabel());
+    __ Ldr(lr, MemOperand(tr, entrypoint_offset));
+    // Note: we don't record the call here (and therefore don't generate a stack
+    // map), as the entrypoint should never be suspended.
+    __ Blr(lr);
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const override {
+    return "CompileOptimizedSlowPath";
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(CompileOptimizedSlowPathARM64);
+};
+
 #undef __
 
 Location InvokeDexCallingConventionVisitorARM64::GetNextLocation(DataType::Type type) {
@@ -1199,46 +1222,22 @@
   }
 
   if (GetGraph()->IsCompilingBaseline() && !Runtime::Current()->IsAotCompiler()) {
-    ScopedProfilingInfoUse spiu(
-        Runtime::Current()->GetJit(), GetGraph()->GetArtMethod(), Thread::Current());
-    ProfilingInfo* info = spiu.GetProfilingInfo();
-    if (info != nullptr) {
-      uint64_t address = reinterpret_cast64<uint64_t>(info);
-      vixl::aarch64::Label done;
-      UseScratchRegisterScope temps(masm);
-      Register temp = temps.AcquireX();
-      Register counter = temps.AcquireW();
-      __ Mov(temp, address);
-      __ Ldrh(counter, MemOperand(temp, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()));
-      __ Add(counter, counter, 1);
-      __ And(counter, counter, interpreter::kTieredHotnessMask);
-      __ Strh(counter, MemOperand(temp, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()));
-      __ Cbnz(counter, &done);
-      if (is_frame_entry) {
-        if (HasEmptyFrame()) {
-          // The entrypoint expects the method at the bottom of the stack. We
-          // claim stack space necessary for alignment.
-          IncreaseFrame(kStackAlignment);
-          __ Stp(kArtMethodRegister, lr, MemOperand(sp, 0));
-        } else if (!RequiresCurrentMethod()) {
-          __ Str(kArtMethodRegister, MemOperand(sp, 0));
-        }
-      } else {
-        CHECK(RequiresCurrentMethod());
-      }
-      uint32_t entrypoint_offset =
-          GetThreadOffset<kArm64PointerSize>(kQuickCompileOptimized).Int32Value();
-      __ Ldr(lr, MemOperand(tr, entrypoint_offset));
-      // Note: we don't record the call here (and therefore don't generate a stack
-      // map), as the entrypoint should never be suspended.
-      __ Blr(lr);
-      if (HasEmptyFrame()) {
-        CHECK(is_frame_entry);
-        __ Ldr(lr, MemOperand(sp, 8));
-        DecreaseFrame(kStackAlignment);
-      }
-      __ Bind(&done);
-    }
+    SlowPathCodeARM64* slow_path = new (GetScopedAllocator()) CompileOptimizedSlowPathARM64();
+    AddSlowPath(slow_path);
+    ProfilingInfo* info = GetGraph()->GetProfilingInfo();
+    DCHECK(info != nullptr);
+    DCHECK(!HasEmptyFrame());
+    uint64_t address = reinterpret_cast64<uint64_t>(info);
+    vixl::aarch64::Label done;
+    UseScratchRegisterScope temps(masm);
+    Register temp = temps.AcquireX();
+    Register counter = temps.AcquireW();
+    __ Ldr(temp, DeduplicateUint64Literal(address));
+    __ Ldrh(counter, MemOperand(temp, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()));
+    __ Cbz(counter, slow_path->GetEntryLabel());
+    __ Add(counter, counter, -1);
+    __ Strh(counter, MemOperand(temp, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()));
+    __ Bind(slow_path->GetExitLabel());
   }
 }
 
@@ -4458,21 +4457,18 @@
       GetGraph()->IsCompilingBaseline() &&
       !Runtime::Current()->IsAotCompiler()) {
     DCHECK(!instruction->GetEnvironment()->IsFromInlinedInvoke());
-    ScopedProfilingInfoUse spiu(
-        Runtime::Current()->GetJit(), GetGraph()->GetArtMethod(), Thread::Current());
-    ProfilingInfo* info = spiu.GetProfilingInfo();
-    if (info != nullptr) {
-      InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
-      uint64_t address = reinterpret_cast64<uint64_t>(cache);
-      vixl::aarch64::Label done;
-      __ Mov(x8, address);
-      __ Ldr(x9, MemOperand(x8, InlineCache::ClassesOffset().Int32Value()));
-      // Fast path for a monomorphic cache.
-      __ Cmp(klass, x9);
-      __ B(eq, &done);
-      InvokeRuntime(kQuickUpdateInlineCache, instruction, instruction->GetDexPc());
-      __ Bind(&done);
-    }
+    ProfilingInfo* info = GetGraph()->GetProfilingInfo();
+    DCHECK(info != nullptr);
+    InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
+    uint64_t address = reinterpret_cast64<uint64_t>(cache);
+    vixl::aarch64::Label done;
+    __ Mov(x8, address);
+    __ Ldr(x9, MemOperand(x8, InlineCache::ClassesOffset().Int32Value()));
+    // Fast path for a monomorphic cache.
+    __ Cmp(klass, x9);
+    __ B(eq, &done);
+    InvokeRuntime(kQuickUpdateInlineCache, instruction, instruction->GetDexPc());
+    __ Bind(&done);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index aa06c5a..c514c22 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -997,6 +997,29 @@
   DISALLOW_COPY_AND_ASSIGN(MethodEntryExitHooksSlowPathARMVIXL);
 };
 
+class CompileOptimizedSlowPathARMVIXL : public SlowPathCodeARMVIXL {
+ public:
+  CompileOptimizedSlowPathARMVIXL() : SlowPathCodeARMVIXL(/* instruction= */ nullptr) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) override {
+    uint32_t entry_point_offset =
+        GetThreadOffset<kArmPointerSize>(kQuickCompileOptimized).Int32Value();
+    __ Bind(GetEntryLabel());
+    __ Ldr(lr, MemOperand(tr, entry_point_offset));
+    // Note: we don't record the call here (and therefore don't generate a stack
+    // map), as the entrypoint should never be suspended.
+    __ Blx(lr);
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const override {
+    return "CompileOptimizedSlowPath";
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(CompileOptimizedSlowPathARMVIXL);
+};
+
 inline vixl32::Condition ARMCondition(IfCondition cond) {
   switch (cond) {
     case kCondEQ: return eq;
@@ -2200,54 +2223,20 @@
   }
 
   if (GetGraph()->IsCompilingBaseline() && !Runtime::Current()->IsAotCompiler()) {
-    ScopedProfilingInfoUse spiu(
-        Runtime::Current()->GetJit(), GetGraph()->GetArtMethod(), Thread::Current());
-    ProfilingInfo* info = spiu.GetProfilingInfo();
-    if (info != nullptr) {
-      uint32_t address = reinterpret_cast32<uint32_t>(info);
-      vixl::aarch32::Label done;
-      UseScratchRegisterScope temps(GetVIXLAssembler());
-      temps.Exclude(ip);
-      if (!is_frame_entry) {
-        __ Push(r4);  // Will be used as temporary. For frame entry, r4 is always available.
-        GetAssembler()->cfi().AdjustCFAOffset(kArmWordSize);
-      }
-      __ Mov(r4, address);
-      __ Ldrh(ip, MemOperand(r4, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()));
-      __ Add(ip, ip, 1);
-      instruction_visitor_.GenerateAndConst(ip, ip, interpreter::kTieredHotnessMask);
-      __ Strh(ip, MemOperand(r4, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()));
-      if (!is_frame_entry) {
-        __ Pop(r4);
-        GetAssembler()->cfi().AdjustCFAOffset(-static_cast<int>(kArmWordSize));
-      }
-      __ Lsls(ip, ip, 16);
-      __ B(ne, &done);
-      uint32_t entry_point_offset =
-          GetThreadOffset<kArmPointerSize>(kQuickCompileOptimized).Int32Value();
-      if (HasEmptyFrame()) {
-        CHECK(is_frame_entry);
-        // For leaf methods, we need to spill lr and r0. Also spill r1 and r2 for
-        // alignment.
-        uint32_t core_spill_mask =
-            (1 << lr.GetCode()) | (1 << r0.GetCode()) | (1 << r1.GetCode()) | (1 << r2.GetCode());
-        __ Push(RegisterList(core_spill_mask));
-        GetAssembler()->cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(core_spill_mask));
-        __ Ldr(lr, MemOperand(tr, entry_point_offset));
-        __ Blx(lr);
-        __ Pop(RegisterList(core_spill_mask));
-        GetAssembler()->cfi().AdjustCFAOffset(
-            -static_cast<int>(kArmWordSize) * POPCOUNT(core_spill_mask));
-      } else {
-        if (!RequiresCurrentMethod()) {
-          CHECK(is_frame_entry);
-          GetAssembler()->StoreToOffset(kStoreWord, kMethodRegister, sp, 0);
-        }
-      __ Ldr(lr, MemOperand(tr, entry_point_offset));
-      __ Blx(lr);
-      }
-      __ Bind(&done);
-    }
+    SlowPathCodeARMVIXL* slow_path = new (GetScopedAllocator()) CompileOptimizedSlowPathARMVIXL();
+    AddSlowPath(slow_path);
+    ProfilingInfo* info = GetGraph()->GetProfilingInfo();
+    DCHECK(info != nullptr);
+    DCHECK(!HasEmptyFrame());
+    uint32_t address = reinterpret_cast32<uint32_t>(info);
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    vixl32::Register tmp = temps.Acquire();
+    __ Mov(lr, address);
+    __ Ldrh(tmp, MemOperand(lr, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()));
+    __ Adds(tmp, tmp, -1);
+    __ B(cc, slow_path->GetEntryLabel());
+    __ Strh(tmp, MemOperand(lr, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()));
+    __ Bind(slow_path->GetExitLabel());
   }
 }
 
@@ -3535,23 +3524,20 @@
       GetGraph()->IsCompilingBaseline() &&
       !Runtime::Current()->IsAotCompiler()) {
     DCHECK(!instruction->GetEnvironment()->IsFromInlinedInvoke());
-    ScopedProfilingInfoUse spiu(
-        Runtime::Current()->GetJit(), GetGraph()->GetArtMethod(), Thread::Current());
-    ProfilingInfo* info = spiu.GetProfilingInfo();
-    if (info != nullptr) {
-      InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
-      uint32_t address = reinterpret_cast32<uint32_t>(cache);
-      vixl32::Label done;
-      UseScratchRegisterScope temps(GetVIXLAssembler());
-      temps.Exclude(ip);
-      __ Mov(r4, address);
-      __ Ldr(ip, MemOperand(r4, InlineCache::ClassesOffset().Int32Value()));
-      // Fast path for a monomorphic cache.
-      __ Cmp(klass, ip);
-      __ B(eq, &done, /* is_far_target= */ false);
-      InvokeRuntime(kQuickUpdateInlineCache, instruction, instruction->GetDexPc());
-      __ Bind(&done);
-    }
+    ProfilingInfo* info = GetGraph()->GetProfilingInfo();
+    DCHECK(info != nullptr);
+    InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
+    uint32_t address = reinterpret_cast32<uint32_t>(cache);
+    vixl32::Label done;
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    temps.Exclude(ip);
+    __ Mov(r4, address);
+    __ Ldr(ip, MemOperand(r4, InlineCache::ClassesOffset().Int32Value()));
+    // Fast path for a monomorphic cache.
+    __ Cmp(klass, ip);
+    __ B(eq, &done, /* is_far_target= */ false);
+    InvokeRuntime(kQuickUpdateInlineCache, instruction, instruction->GetDexPc());
+    __ Bind(&done);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 758a471..f19eaae 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -967,6 +967,26 @@
   DISALLOW_COPY_AND_ASSIGN(MethodEntryExitHooksSlowPathX86);
 };
 
+class CompileOptimizedSlowPathX86 : public SlowPathCode {
+ public:
+  CompileOptimizedSlowPathX86() : SlowPathCode(/* instruction= */ nullptr) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) override {
+    CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
+    __ Bind(GetEntryLabel());
+    x86_codegen->GenerateInvokeRuntime(
+        GetThreadOffset<kX86PointerSize>(kQuickCompileOptimized).Int32Value());
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const override {
+    return "CompileOptimizedSlowPath";
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(CompileOptimizedSlowPathX86);
+};
+
 #undef __
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
 #define __ down_cast<X86Assembler*>(GetAssembler())->  // NOLINT
@@ -1210,52 +1230,19 @@
   }
 
   if (GetGraph()->IsCompilingBaseline() && !Runtime::Current()->IsAotCompiler()) {
-    ScopedProfilingInfoUse spiu(
-        Runtime::Current()->GetJit(), GetGraph()->GetArtMethod(), Thread::Current());
-    ProfilingInfo* info = spiu.GetProfilingInfo();
-    if (info != nullptr) {
-      uint32_t address = reinterpret_cast32<uint32_t>(info);
-      NearLabel done;
-      if (HasEmptyFrame()) {
-        CHECK(is_frame_entry);
-        // Alignment
-        IncreaseFrame(8);
-        // We need a temporary. The stub also expects the method at bottom of stack.
-        __ pushl(EAX);
-        __ cfi().AdjustCFAOffset(4);
-        __ movl(EAX, Immediate(address));
-        __ addw(Address(EAX, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()),
-                Immediate(1));
-        __ andw(Address(EAX, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()),
-                Immediate(interpreter::kTieredHotnessMask));
-        __ j(kNotZero, &done);
-        GenerateInvokeRuntime(
-            GetThreadOffset<kX86PointerSize>(kQuickCompileOptimized).Int32Value());
-        __ Bind(&done);
-        // We don't strictly require to restore EAX, but this makes the generated
-        // code easier to reason about.
-        __ popl(EAX);
-        __ cfi().AdjustCFAOffset(-4);
-        DecreaseFrame(8);
-      } else {
-        if (!RequiresCurrentMethod()) {
-          CHECK(is_frame_entry);
-          __ movl(Address(ESP, kCurrentMethodStackOffset), kMethodRegisterArgument);
-        }
-        // We need a temporary.
-        __ pushl(EAX);
-        __ cfi().AdjustCFAOffset(4);
-        __ movl(EAX, Immediate(address));
-        __ addw(Address(EAX, ProfilingInfo::BaselineHotnessCountOffset().Int32Value()),
-                Immediate(1));
-        __ popl(EAX);  // Put stack as expected before exiting or calling stub.
-        __ cfi().AdjustCFAOffset(-4);
-        __ j(kCarryClear, &done);
-        GenerateInvokeRuntime(
-            GetThreadOffset<kX86PointerSize>(kQuickCompileOptimized).Int32Value());
-        __ Bind(&done);
-      }
-    }
+    SlowPathCode* slow_path = new (GetScopedAllocator()) CompileOptimizedSlowPathX86();
+    AddSlowPath(slow_path);
+    ProfilingInfo* info = GetGraph()->GetProfilingInfo();
+    DCHECK(info != nullptr);
+    uint32_t address = reinterpret_cast32<uint32_t>(info) +
+        ProfilingInfo::BaselineHotnessCountOffset().Int32Value();
+    DCHECK(!HasEmptyFrame());
+    // With multiple threads, this can overflow. This is OK, we will eventually get to see
+    // it reaching 0. Also, at this point we have no register available to look
+    // at the counter directly.
+    __ addw(Address::Absolute(address), Immediate(-1));
+    __ j(kEqual, slow_path->GetEntryLabel());
+    __ Bind(slow_path->GetExitLabel());
   }
 }
 
@@ -2669,25 +2656,22 @@
       GetGraph()->IsCompilingBaseline() &&
       !Runtime::Current()->IsAotCompiler()) {
     DCHECK(!instruction->GetEnvironment()->IsFromInlinedInvoke());
-    ScopedProfilingInfoUse spiu(
-        Runtime::Current()->GetJit(), GetGraph()->GetArtMethod(), Thread::Current());
-    ProfilingInfo* info = spiu.GetProfilingInfo();
-    if (info != nullptr) {
-      InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
-      uint32_t address = reinterpret_cast32<uint32_t>(cache);
-      if (kIsDebugBuild) {
-        uint32_t temp_index = instruction->GetLocations()->GetTempCount() - 1u;
-        CHECK_EQ(EBP, instruction->GetLocations()->GetTemp(temp_index).AsRegister<Register>());
-      }
-      Register temp = EBP;
-      NearLabel done;
-      __ movl(temp, Immediate(address));
-      // Fast path for a monomorphic cache.
-      __ cmpl(klass, Address(temp, InlineCache::ClassesOffset().Int32Value()));
-      __ j(kEqual, &done);
-      GenerateInvokeRuntime(GetThreadOffset<kX86PointerSize>(kQuickUpdateInlineCache).Int32Value());
-      __ Bind(&done);
+    ProfilingInfo* info = GetGraph()->GetProfilingInfo();
+    DCHECK(info != nullptr);
+    InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
+    uint32_t address = reinterpret_cast32<uint32_t>(cache);
+    if (kIsDebugBuild) {
+      uint32_t temp_index = instruction->GetLocations()->GetTempCount() - 1u;
+      CHECK_EQ(EBP, instruction->GetLocations()->GetTemp(temp_index).AsRegister<Register>());
     }
+    Register temp = EBP;
+    NearLabel done;
+    __ movl(temp, Immediate(address));
+    // Fast path for a monomorphic cache.
+    __ cmpl(klass, Address(temp, InlineCache::ClassesOffset().Int32Value()));
+    __ j(kEqual, &done);
+    GenerateInvokeRuntime(GetThreadOffset<kX86PointerSize>(kQuickUpdateInlineCache).Int32Value());
+    __ Bind(&done);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index c402e83..b0bdffe 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -991,6 +991,26 @@
   DISALLOW_COPY_AND_ASSIGN(MethodEntryExitHooksSlowPathX86_64);
 };
 
+class CompileOptimizedSlowPathX86_64 : public SlowPathCode {
+ public:
+  CompileOptimizedSlowPathX86_64() : SlowPathCode(/* instruction= */ nullptr) {}
+
+  void EmitNativeCode(CodeGenerator* codegen) override {
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    __ Bind(GetEntryLabel());
+    x86_64_codegen->GenerateInvokeRuntime(
+        GetThreadOffset<kX86_64PointerSize>(kQuickCompileOptimized).Int32Value());
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const override {
+    return "CompileOptimizedSlowPath";
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(CompileOptimizedSlowPathX86_64);
+};
+
 #undef __
 // NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
 #define __ down_cast<X86_64Assembler*>(GetAssembler())->  // NOLINT
@@ -1602,37 +1622,22 @@
   }
 
   if (GetGraph()->IsCompilingBaseline() && !Runtime::Current()->IsAotCompiler()) {
-    ScopedProfilingInfoUse spiu(
-        Runtime::Current()->GetJit(), GetGraph()->GetArtMethod(), Thread::Current());
-    ProfilingInfo* info = spiu.GetProfilingInfo();
-    if (info != nullptr) {
-      uint64_t address = reinterpret_cast64<uint64_t>(info);
-      NearLabel done;
-      __ movq(CpuRegister(TMP), Immediate(address));
-      __ addw(Address(CpuRegister(TMP), ProfilingInfo::BaselineHotnessCountOffset().Int32Value()),
-              Immediate(1));
-      __ andw(Address(CpuRegister(TMP), ProfilingInfo::BaselineHotnessCountOffset().Int32Value()),
-              Immediate(interpreter::kTieredHotnessMask));
-      __ j(kNotZero, &done);
-      if (HasEmptyFrame()) {
-        CHECK(is_frame_entry);
-        // Frame alignment, and the stub expects the method on the stack.
-        __ pushq(CpuRegister(RDI));
-        __ cfi().AdjustCFAOffset(kX86_64WordSize);
-        __ cfi().RelOffset(DWARFReg(RDI), 0);
-      } else if (!RequiresCurrentMethod()) {
-        CHECK(is_frame_entry);
-        __ movq(Address(CpuRegister(RSP), kCurrentMethodStackOffset), CpuRegister(RDI));
-      }
-      GenerateInvokeRuntime(
-          GetThreadOffset<kX86_64PointerSize>(kQuickCompileOptimized).Int32Value());
-      if (HasEmptyFrame()) {
-        __ popq(CpuRegister(RDI));
-        __ cfi().AdjustCFAOffset(-static_cast<int>(kX86_64WordSize));
-        __ cfi().Restore(DWARFReg(RDI));
-      }
-      __ Bind(&done);
-    }
+    SlowPathCode* slow_path = new (GetScopedAllocator()) CompileOptimizedSlowPathX86_64();
+    AddSlowPath(slow_path);
+    ProfilingInfo* info = GetGraph()->GetProfilingInfo();
+    DCHECK(info != nullptr);
+    CHECK(!HasEmptyFrame());
+    uint64_t address = reinterpret_cast64<uint64_t>(info);
+    // Note: if the address was in the 32bit range, we could use
+    // Address::Absolute and avoid this movq.
+    __ movq(CpuRegister(TMP), Immediate(address));
+    // With multiple threads, this can overflow. This is OK, we will eventually get to see
+    // it reaching 0. Also, at this point we have no register available to look
+    // at the counter directly.
+    __ addw(Address(CpuRegister(TMP), ProfilingInfo::BaselineHotnessCountOffset().Int32Value()),
+            Immediate(-1));
+    __ j(kEqual, slow_path->GetEntryLabel());
+    __ Bind(slow_path->GetExitLabel());
   }
 }
 
@@ -2903,21 +2908,18 @@
   if (!instruction->GetLocations()->Intrinsified() &&
       GetGraph()->IsCompilingBaseline() &&
       !Runtime::Current()->IsAotCompiler()) {
-    ScopedProfilingInfoUse spiu(
-        Runtime::Current()->GetJit(), GetGraph()->GetArtMethod(), Thread::Current());
-    ProfilingInfo* info = spiu.GetProfilingInfo();
-    if (info != nullptr) {
-      InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
-      uint64_t address = reinterpret_cast64<uint64_t>(cache);
-      NearLabel done;
-      __ movq(CpuRegister(TMP), Immediate(address));
-      // Fast path for a monomorphic cache.
-      __ cmpl(Address(CpuRegister(TMP), InlineCache::ClassesOffset().Int32Value()), klass);
-      __ j(kEqual, &done);
-      GenerateInvokeRuntime(
-          GetThreadOffset<kX86_64PointerSize>(kQuickUpdateInlineCache).Int32Value());
-      __ Bind(&done);
-    }
+    ProfilingInfo* info = GetGraph()->GetProfilingInfo();
+    DCHECK(info != nullptr);
+    InlineCache* cache = info->GetInlineCache(instruction->GetDexPc());
+    uint64_t address = reinterpret_cast64<uint64_t>(cache);
+    NearLabel done;
+    __ movq(CpuRegister(TMP), Immediate(address));
+    // Fast path for a monomorphic cache.
+    __ cmpl(Address(CpuRegister(TMP), InlineCache::ClassesOffset().Int32Value()), klass);
+    __ j(kEqual, &done);
+    GenerateInvokeRuntime(
+        GetThreadOffset<kX86_64PointerSize>(kQuickUpdateInlineCache).Int32Value());
+    __ Bind(&done);
   }
 }
 
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index a40218d..ac71ce9 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -621,9 +621,7 @@
   ArtMethod* caller = graph_->GetArtMethod();
   // Under JIT, we should always know the caller.
   DCHECK(caller != nullptr);
-  ScopedProfilingInfoUse spiu(Runtime::Current()->GetJit(), caller, Thread::Current());
-  ProfilingInfo* profiling_info = spiu.GetProfilingInfo();
-
+  ProfilingInfo* profiling_info = graph_->GetProfilingInfo();
   if (profiling_info == nullptr) {
     return kInlineCacheNoData;
   }
@@ -1995,6 +1993,11 @@
       /* start_instruction_id= */ caller_instruction_counter);
   callee_graph->SetArtMethod(resolved_method);
 
+  ScopedProfilingInfoUse spiu(Runtime::Current()->GetJit(), resolved_method, Thread::Current());
+  if (Runtime::Current()->GetJit() != nullptr) {
+    callee_graph->SetProfilingInfo(spiu.GetProfilingInfo());
+  }
+
   // When they are needed, allocate `inline_stats_` on the Arena instead
   // of on the stack, as Clang might produce a stack frame too large
   // for this function, that would not fit the requirements of the
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 16e26dc..c50e047 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -75,6 +75,7 @@
 class FieldInfo;
 class LiveInterval;
 class LocationSummary;
+class ProfilingInfo;
 class SlowPathCode;
 class SsaBuilder;
 
@@ -704,6 +705,9 @@
   ArtMethod* GetArtMethod() const { return art_method_; }
   void SetArtMethod(ArtMethod* method) { art_method_ = method; }
 
+  void SetProfilingInfo(ProfilingInfo* info) { profiling_info_ = info; }
+  ProfilingInfo* GetProfilingInfo() const { return profiling_info_; }
+
   // Returns an instruction with the opposite Boolean value from 'cond'.
   // The instruction has been inserted into the graph, either as a constant, or
   // before cursor.
@@ -870,6 +874,9 @@
   // (such as when the superclass could not be found).
   ArtMethod* art_method_;
 
+  // The `ProfilingInfo` associated with the method being compiled.
+  ProfilingInfo* profiling_info_;
+
   // How we are compiling the graph: either optimized, osr, or baseline.
   // For osr, we will make all loops seen as irreducible and emit special
   // stack maps to mark compiled code entries which the interpreter can
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index c4dd31d..10b59d2 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -812,6 +812,14 @@
     graph->SetArtMethod(method);
   }
 
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  if (jit != nullptr) {
+    ProfilingInfo* info = jit->GetCodeCache()->GetProfilingInfo(method, Thread::Current());
+    DCHECK(compilation_kind != CompilationKind::kBaseline || info != nullptr)
+        << "Compiling a method baseline should always have a ProfilingInfo";
+    graph->SetProfilingInfo(info);
+  }
+
   std::unique_ptr<CodeGenerator> codegen(
       CodeGenerator::Create(graph,
                             compiler_options,