JNI: Inline fast-path for `JniMethodEnd()`.

Golem results for art-opt-cc (higher is better):
linux-ia32                       before after
NativeDowncallStaticNormal       46.766 51.016 (+9.086%)
NativeDowncallStaticNormal6      42.268 45.748 (+8.235%)
NativeDowncallStaticNormalRefs6  41.355 44.776 (+8.272%)
NativeDowncallVirtualNormal      46.361 52.527 (+13.30%)
NativeDowncallVirtualNormal6     41.812 45.206 (+8.118%)
NativeDowncallVirtualNormalRefs6 40.500 44.169 (+9.059%)
(The NativeDowncallVirtualNormal result for x86 is skewed
by one extra good run as Golem reports the best result in
the summary. Using the second best and most frequent
result 50.5, the improvement is only around 8.9%.)
linux-x64                        before after
NativeDowncallStaticNormal       44.169 47.976 (+8.620%)
NativeDowncallStaticNormal6      43.198 46.836 (+8.423%)
NativeDowncallStaticNormalRefs6  38.481 44.687 (+16.13%)
NativeDowncallVirtualNormal      43.672 47.405 (+8.547%)
NativeDowncallVirtualNormal6     42.268 45.726 (+8.182%)
NativeDowncallVirtualNormalRefs6 41.355 44.687 (+8.057%)
(The NativeDowncallStaticNormalRefs6 result for x86-64 is
a bit inflated because recent results jump between ~38.5
and ~40.5. If we take the latter as the baseline, the
improvement is only around 10.3%.)
linux-armv7                      before after
NativeDowncallStaticNormal       10.659 14.620 (+37.16%)
NativeDowncallStaticNormal6      9.8377 13.120 (+33.36%)
NativeDowncallStaticNormalRefs6  8.8714 11.454 (+29.11%)
NativeDowncallVirtualNormal      10.511 14.349 (+36.51%)
NativeDowncallVirtualNormal6     9.9701 13.347 (+33.87%)
NativeDowncallVirtualNormalRefs6 8.9241 11.454 (+28.35%)
linux-armv8                      before after
NativeDowncallStaticNormal       10.608 16.329 (+53.93%)
NativeDowncallStaticNormal6      10.179 15.347 (+50.76%)
NativeDowncallStaticNormalRefs6  9.2457 13.705 (+48.23%)
NativeDowncallVirtualNormal      9.9850 14.903 (+49.25%)
NativeDowncallVirtualNormal6     9.9206 14.757 (+48.75%)
NativeDowncallVirtualNormalRefs6 8.8235 12.789 (+44.94%)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: Ie144bc4f7f82be95790ea7d3123b81a3b6bfa603
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 2c1b4be..418cf57 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -1079,6 +1079,45 @@
   ___ Str(scratch, MemOperand(tr, thread_held_mutex_mutator_lock_offset.Int32Value()));
+void ArmVIXLJNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kArmPointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kArmPointerSize>(kMutatorLock);
+  constexpr ThreadOffset32 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kArmPointerSize>();
+  // There must be at least two scratch registers.
+  DCHECK_GE(scratch_regs.size(), 2u);
+  DCHECK(!scratch_regs[0].AsArm().Overlaps(return_reg.AsArm()));
+  vixl32::Register scratch = AsVIXLRegister(scratch_regs[0].AsArm());
+  DCHECK(!scratch_regs[1].AsArm().Overlaps(return_reg.AsArm()));
+  vixl32::Register scratch2 = AsVIXLRegister(scratch_regs[1].AsArm());
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  vixl32::Label retry;
+  ___ Bind(&retry);
+  ___ Ldrex(scratch, MemOperand(tr, thread_flags_offset.Int32Value()));
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  ___ Eors(scratch2, scratch, kNativeStateValue);
+  ___ B(ne, ArmVIXLJNIMacroLabel::Cast(label)->AsArm());
+  static_assert(kRunnableStateValue == 0u);
+  ___ Strex(scratch, scratch2, MemOperand(tr, thread_flags_offset.Int32Value()));
+  ___ Cmp(scratch, 0);
+  ___ B(ne, &retry);
+  ___ Dmb(DmbOptions::ISH);  // Memory barrier "load-any" for the "acquire" operation.
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  ___ Ldr(scratch, MemOperand(tr, thread_mutator_lock_offset.Int32Value()));
+  ___ Str(scratch, MemOperand(tr, thread_held_mutex_mutator_lock_offset.Int32Value()));
 void ArmVIXLJNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   vixl32::Register scratch = temps.Acquire();
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 7b9d7de..426502d 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -185,9 +185,18 @@
   void CallFromThread(ThreadOffset32 offset) override;
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index e84fe04..df7bb5e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -917,6 +917,42 @@
   ___ Str(xzr, MEM_OP(reg_x(TR), thread_held_mutex_mutator_lock_offset.Int32Value()));
+void Arm64JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs ATTRIBUTE_UNUSED,
+    ManagedRegister return_reg ATTRIBUTE_UNUSED) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kArm64PointerSize>();
+  constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kArm64PointerSize>(kMutatorLock);
+  constexpr ThreadOffset64 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kArm64PointerSize>();
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  Register scratch = temps.AcquireW();
+  Register scratch2 = temps.AcquireW();
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  vixl::aarch64::Label retry;
+  ___ Bind(&retry);
+  static_assert(thread_flags_offset.Int32Value() == 0);  // LDAXR/STXR require exact address.
+  ___ Ldaxr(scratch, MEM_OP(reg_x(TR)));
+  ___ Mov(scratch2, kNativeStateValue);
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  ___ Cmp(scratch, scratch2);
+  ___ B(ne, Arm64JNIMacroLabel::Cast(label)->AsArm64());
+  static_assert(kRunnableStateValue == 0u);
+  ___ Stxr(scratch, wzr, MEM_OP(reg_x(TR)));
+  ___ Cbnz(scratch, &retry);
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  ___ Ldr(scratch.X(), MEM_OP(reg_x(TR), thread_mutator_lock_offset.Int32Value()));
+  ___ Str(scratch.X(), MEM_OP(reg_x(TR), thread_held_mutex_mutator_lock_offset.Int32Value()));
 void Arm64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireW();
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 1c61d96..0fb512e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -169,9 +169,18 @@
   void CallFromThread(ThreadOffset64 offset) override;
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index b35066f..2d1de97 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -1,12 +1,12 @@
 const char* const VixlJniHelpersResults = {
   "       0: 2d e9 e0 4d   push.w {r5, r6, r7, r8, r10, r11, lr}\n"
   "       4: 2d ed 10 8a   vpush {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31}\n"
-  "       8: 85 b0         sub sp, #20\n"
+  "       8: 81 b0         sub sp, #4\n"
   "       a: 00 90         str r0, [sp]\n"
-  "       c: 1d 91         str r1, [sp, #116]\n"
-  "       e: 8d ed 1e 0a   vstr s0, [sp, #120]\n"
-  "      12: 1f 92         str r2, [sp, #124]\n"
-  "      14: 20 93         str r3, [sp, #128]\n"
+  "       c: 19 91         str r1, [sp, #100]\n"
+  "       e: 8d ed 1a 0a   vstr s0, [sp, #104]\n"
+  "      12: 1b 92         str r2, [sp, #108]\n"
+  "      14: 1c 93         str r3, [sp, #112]\n"
   "      16: 88 b0         sub sp, #32\n"
   "      18: ad f5 80 5d   sub.w sp, sp, #4096\n"
   "      1c: 08 98         ldr r0, [sp, #32]\n"
@@ -147,13 +147,13 @@
   "     208: cd f8 ff c7   str.w r12, [sp, #2047]\n"
   "     20c: 0d f5 80 5d   add.w sp, sp, #4096\n"
   "     210: 08 b0         add sp, #32\n"
-  "     212: 05 b0         add sp, #20\n"
+  "     212: 01 b0         add sp, #4\n"
   "     214: bd ec 10 8a   vpop {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31}\n"
   "     218: bd e8 e0 4d   pop.w {r5, r6, r7, r8, r10, r11, lr}\n"
   "     21c: d9 f8 24 80   ldr.w r8, [r9, #36]\n"
   "     220: 70 47         bx lr\n"
   "     222: d9 f8 8c 00   ldr.w r0, [r9, #140]\n"
-  "     226: d9 f8 c4 e2   ldr.w lr, [r9, #708]\n"
+  "     226: d9 f8 c0 e2   ldr.w lr, [r9, #704]\n"
   "     22a: f0 47         blx lr\n"
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 659ff4c..0d82458 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -252,9 +252,18 @@
   virtual void CallFromThread(ThreadOffset<kPointerSize> offset) = 0;
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   virtual void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) = 0;
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  virtual void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                                   ArrayRef<const ManagedRegister> scratch_regs,
+                                                   ManagedRegister return_reg) = 0;
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   virtual void SuspendCheck(JNIMacroLabel* label) = 0;
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 8be2a32..1a0d521 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -618,6 +618,57 @@
+void X86JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kX86PointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86PointerSize>(kMutatorLock);
+  constexpr ThreadOffset32 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kX86PointerSize>();
+  size_t scratch_index = 0u;
+  auto get_scratch_reg = [&]() {
+    while (true) {
+      DCHECK_LT(scratch_index, scratch_regs.size());
+      X86ManagedRegister scratch_reg = scratch_regs[scratch_index].AsX86();
+      ++scratch_index;
+      DCHECK(!scratch_reg.Overlaps(return_reg.AsX86()));
+      if (scratch_reg.AsCpuRegister() != EAX) {
+        return scratch_reg.AsCpuRegister();
+      }
+    }
+  };
+  Register scratch = get_scratch_reg();
+  bool preserve_eax = return_reg.AsX86().Overlaps(X86ManagedRegister::FromCpuRegister(EAX));
+  Register saved_eax = preserve_eax ? get_scratch_reg() : kNoRegister;
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  if (preserve_eax) {
+    __ movl(saved_eax, EAX);  // Save EAX.
+  }
+  __ movl(EAX, Immediate(kNativeStateValue));
+  static_assert(kRunnableStateValue == 0u);
+  __ xorl(scratch, scratch);
+  __ fs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value()), scratch);
+  // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  if (preserve_eax) {
+    __ movl(EAX, saved_eax);  // Restore EAX; MOV does not change flags.
+  }
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  __ j(kNotZero, X86JNIMacroLabel::Cast(label)->AsX86());
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  __ fs()->movl(scratch, Address::Absolute(thread_mutator_lock_offset.Uint32Value()));
+  __ fs()->movl(Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value()),
+                scratch);
 void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()),
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 0af6371..7fe0e42 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -161,9 +161,18 @@
   void CallFromThread(ThreadOffset32 offset) override;
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index b25d5c7..8a90a13 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -698,6 +698,52 @@
+void X86_64JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kX86_64PointerSize>();
+  constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86_64PointerSize>(kMutatorLock);
+  constexpr ThreadOffset64 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kX86_64PointerSize>();
+  DCHECK_GE(scratch_regs.size(), 2u);
+  DCHECK(!scratch_regs[0].AsX86_64().Overlaps(return_reg.AsX86_64()));
+  CpuRegister scratch = scratch_regs[0].AsX86_64().AsCpuRegister();
+  DCHECK(!scratch_regs[1].AsX86_64().Overlaps(return_reg.AsX86_64()));
+  CpuRegister saved_rax = scratch_regs[1].AsX86_64().AsCpuRegister();
+  CpuRegister rax(RAX);
+  bool preserve_rax = return_reg.AsX86_64().Overlaps(X86_64ManagedRegister::FromCpuRegister(RAX));
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  if (preserve_rax) {
+    __ movq(saved_rax, rax);  // Save RAX.
+  }
+  __ movl(rax, Immediate(kNativeStateValue));
+  static_assert(kRunnableStateValue == 0u);
+  __ xorl(scratch, scratch);
+  __ gs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value(), /*no_rip=*/ true),
+                        scratch);
+  // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  if (preserve_rax) {
+    __ movq(rax, saved_rax);  // Restore RAX; MOV does not change flags.
+  }
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  __ j(kNotZero, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  __ gs()->movq(scratch,
+                Address::Absolute(thread_mutator_lock_offset.Uint32Value(), /*no_rip=*/ true));
+  __ gs()->movq(
+      Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value(), /*no_rip=*/ true),
+      scratch);
 void X86_64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ gs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 6eb7873..c46d5c6 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -181,9 +181,18 @@
   void CallFromThread(ThreadOffset64 offset) override;
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;