JNI: Inline fast-path for `JniMethodStart()`.

Golem results for art-opt-cc (higher is better):
linux-ia32                       before after
NativeDowncallStaticNormal       35.306 47.382 (+34.20%)
NativeDowncallStaticNormal6      32.951 42.247 (+28.21%)
NativeDowncallStaticNormalRefs6  17.866 41.355 (+131.5%)
NativeDowncallVirtualNormal      35.341 46.836 (+32.53%)
NativeDowncallVirtualNormal6     32.403 41.791 (+28.97%)
NativeDowncallVirtualNormalRefs6 32.131 40.500 (+26.05%)
linux-x64                        before after
NativeDowncallStaticNormal       33.350 43.716 (+31.08%)
NativeDowncallStaticNormal6      31.096 43.176 (+38.85%)
NativeDowncallStaticNormalRefs6  30.617 38.500 (+25.75%)
NativeDowncallVirtualNormal      33.234 43.672 (+32.41%)
NativeDowncallVirtualNormal6     30.617 42.247 (+37.98%)
NativeDowncallVirtualNormalRefs6 32.131 42.701 (+32.90%)
linux-armv7                      before after
NativeDowncallStaticNormal       7.8701 9.9651 (+26.62%)
NativeDowncallStaticNormal6      7.4147 8.9463 (+20.66%)
NativeDowncallStaticNormalRefs6  6.8830 8.3868 (+21.85%)
NativeDowncallVirtualNormal      7.8316 9.8377 (+25.61%)
NativeDowncallVirtualNormal6     7.4147 9.3596 (+26.23%)
NativeDowncallVirtualNormalRefs6 6.6794 8.4325 (+26.25%)
linux-armv8                      before after
NativeDowncallStaticNormal       7.6372 9.8571 (+29.07%)
NativeDowncallStaticNormal6      7.4147 9.4905 (+28.00%)
NativeDowncallStaticNormalRefs6  6.8527 8.6705 (+26.53%)
NativeDowncallVirtualNormal      7.4147 9.3183 (+25.67%)
NativeDowncallVirtualNormal6     7.0755 9.2593 (+30.86%)
NativeDowncallVirtualNormalRefs6 6.5604 8.2967 (+26.47%)

Note that the NativeDowncallStaticNormalRefs6 result on x86 has
been fluctuating wildly since
    https://android-review.googlesource.com/1905055
between ~17.6 and ~32.4 for completely unrelated changes.
If we take ~32.4 as the baseline instead, the improvement is
only ~27.6%, in line with the other x86 benchmarks.
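
For reference, a minimal C++ sketch of what the new inlined
fast-path does (the real code is emitted as assembly by the
JNIMacroAssembler's TryToTransitionFromRunnableToNative();
the field and helper names below are illustrative only, not
the actual ART declarations):

    #include <atomic>
    #include <cstdint>

    // Placeholder encodings; ART uses Thread::StoredThreadStateValue().
    constexpr uint32_t kRunnableStateValue = 0u;  // kRunnable is 0, so any set
                                                  // flag makes the word non-zero.
    constexpr uint32_t kNativeStateValue = 2u;

    struct ThreadSketch {
      std::atomic<uint32_t> state_and_flags{kRunnableStateValue};
      bool holds_mutator_lock = true;  // Stands in for tlsPtr_.held_mutexes[kMutatorLock].

      void SlowPathJniMethodStart() {}  // Stands in for the pJniMethodStart entrypoint call.

      void TransitionToNativeOnJniEntry() {
        uint32_t expected = kRunnableStateValue;
        // Acquire CAS: fails if the state is not kRunnable or any flag bit is
        // set (suspend request, checkpoint, ...); then we take the slow path.
        if (!state_and_flags.compare_exchange_strong(
                expected, kNativeStateValue, std::memory_order_acquire)) {
          SlowPathJniMethodStart();
          return;
        }
        // Fast path succeeded: record that we no longer hold the mutator lock.
        holds_mutator_lock = false;
      }
    };

The slow path still calls the pJniMethodStart entrypoint, so the
behavior for suspend requests, checkpoints and instrumentation is
unchanged; only the common no-flags case avoids the runtime call.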

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: I771a4765bd3a7c4e58b94be4155515241ea6fa3c
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index a222ff3..8de5c9c 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -27,6 +27,7 @@
 #include "common_compiler_test.h"
 #include "compiler.h"
 #include "dex/dex_file.h"
+#include "entrypoints/entrypoint_utils-inl.h"
 #include "gtest/gtest.h"
 #include "indirect_reference_table.h"
 #include "java_frame_root_info.h"
@@ -337,6 +338,8 @@
   static jobject jobj_;
   static jobject class_loader_;
 
+  static void AssertCallerObjectLocked(JNIEnv* env);
+
   static LockWord GetLockWord(jobject obj);
 
  protected:
@@ -391,53 +394,17 @@
   jmethodID jmethod_;
 
  private:
-  // Helper class that overrides original entrypoints with alternative versions
-  // that check that the object (`this` or class) is locked.
-  class ScopedSynchronizedEntryPointOverrides {
-   public:
-    ScopedSynchronizedEntryPointOverrides() {
-      QuickEntryPoints* qpoints = &Thread::Current()->tlsPtr_.quick_entrypoints;
-      jni_method_start_original_ = qpoints->pJniMethodStart;
-      qpoints->pJniMethodStart = JniMethodStartSynchronizedOverride;
-      jni_method_end_original_ = qpoints->pJniMethodEnd;
-      qpoints->pJniMethodEnd = JniMethodEndSynchronizedOverride;
-      jni_method_end_with_reference_original_ = qpoints->pJniMethodEndWithReference;
-      qpoints->pJniMethodEndWithReference = JniMethodEndWithReferenceSynchronizedOverride;
-    }
-
-    ~ScopedSynchronizedEntryPointOverrides() {
-      QuickEntryPoints* qpoints = &Thread::Current()->tlsPtr_.quick_entrypoints;
-      qpoints->pJniMethodStart = jni_method_start_original_;
-      qpoints->pJniMethodEnd = jni_method_end_original_;
-      qpoints->pJniMethodEndWithReference = jni_method_end_with_reference_original_;
-    }
-  };
-
-  static void AssertCallerObjectLocked(Thread* self) REQUIRES_SHARED(Locks::mutator_lock_);
-  static void JniMethodStartSynchronizedOverride(Thread* self);
-  static void JniMethodEndSynchronizedOverride(Thread* self);
-  static mirror::Object* JniMethodEndWithReferenceSynchronizedOverride(
-      jobject result, Thread* self);
-
-  using JniStartType = void (*)(Thread*);
-  using JniEndType = void (*)(Thread*);
-  using JniEndWithReferenceType = mirror::Object* (*)(jobject, Thread*);
-
-  static JniStartType jni_method_start_original_;
-  static JniEndType jni_method_end_original_;
-  static JniEndWithReferenceType jni_method_end_with_reference_original_;
-
   bool check_generic_jni_;
 };
 
 jclass JniCompilerTest::jklass_;
 jobject JniCompilerTest::jobj_;
 jobject JniCompilerTest::class_loader_;
-JniCompilerTest::JniStartType JniCompilerTest::jni_method_start_original_;
-JniCompilerTest::JniEndType JniCompilerTest::jni_method_end_original_;
-JniCompilerTest::JniEndWithReferenceType JniCompilerTest::jni_method_end_with_reference_original_;
 
-void JniCompilerTest::AssertCallerObjectLocked(Thread* self) {
+void JniCompilerTest::AssertCallerObjectLocked(JNIEnv* env) {
+  Thread* self = down_cast<JNIEnvExt*>(env)->GetSelf();
+  CHECK_EQ(self, Thread::Current());
+  ScopedObjectAccess soa(self);
   ArtMethod** caller_frame = self->GetManagedStack()->GetTopQuickFrame();
   CHECK(caller_frame != nullptr);
   ArtMethod* caller = *caller_frame;
@@ -447,7 +414,10 @@
   CHECK(!caller->IsCriticalNative());
   CHECK(caller->IsSynchronized());
   ObjPtr<mirror::Object> lock;
-  if (caller->IsStatic()) {
+  if (self->GetManagedStack()->GetTopQuickFrameTag()) {
+    // Generic JNI.
+    lock = GetGenericJniSynchronizationObject(self, caller);
+  } else if (caller->IsStatic()) {
     lock = caller->GetDeclaringClass();
   } else {
     uint8_t* sp = reinterpret_cast<uint8_t*>(caller_frame);
@@ -461,23 +431,6 @@
   CHECK_EQ(Monitor::GetLockOwnerThreadId(lock), self->GetThreadId());
 }
 
-void JniCompilerTest::JniMethodStartSynchronizedOverride(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
-  AssertCallerObjectLocked(self);
-  jni_method_start_original_(self);
-}
-
-void JniCompilerTest::JniMethodEndSynchronizedOverride(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
-  jni_method_end_original_(self);
-  AssertCallerObjectLocked(self);
-}
-
-mirror::Object* JniCompilerTest::JniMethodEndWithReferenceSynchronizedOverride(
-    jobject result, Thread* self) NO_THREAD_SAFETY_ANALYSIS {
-  mirror::Object* raw_result = jni_method_end_with_reference_original_(result, self);
-  AssertCallerObjectLocked(self);
-  return raw_result;
-}
-
 LockWord JniCompilerTest::GetLockWord(jobject obj) {
   ScopedObjectAccess soa(Thread::Current());
   return soa.Decode<mirror::Object>(obj)->GetLockWord(/*as_volatile=*/ false);
@@ -886,7 +839,8 @@
 }
 
 int gJava_MyClassNatives_fooJJ_synchronized_calls[kJniKindCount] = {};
-jlong Java_MyClassNatives_fooJJ_synchronized(JNIEnv*, jobject, jlong x, jlong y) {
+jlong Java_MyClassNatives_fooJJ_synchronized(JNIEnv* env, jobject, jlong x, jlong y) {
+  JniCompilerTest::AssertCallerObjectLocked(env);
   gJava_MyClassNatives_fooJJ_synchronized_calls[gCurrentJni]++;
   return x | y;
 }
@@ -894,7 +848,6 @@
 void JniCompilerTest::CompileAndRun_fooJJ_synchronizedImpl() {
   SetUpForTest(false, "fooJJ_synchronized", "(JJ)J",
                CURRENT_JNI_WRAPPER(Java_MyClassNatives_fooJJ_synchronized));
-  ScopedSynchronizedEntryPointOverrides ssepo;
 
   EXPECT_EQ(0, gJava_MyClassNatives_fooJJ_synchronized_calls[gCurrentJni]);
   jlong a = 0x1000000020000000ULL;
@@ -1220,7 +1173,8 @@
 JNI_TEST(CompileAndRunStaticIntObjectObjectMethod)
 
 int gJava_MyClassNatives_fooSSIOO_calls[kJniKindCount] = {};
-jobject Java_MyClassNatives_fooSSIOO(JNIEnv*, jclass klass, jint x, jobject y, jobject z) {
+jobject Java_MyClassNatives_fooSSIOO(JNIEnv* env, jclass klass, jint x, jobject y, jobject z) {
+  JniCompilerTest::AssertCallerObjectLocked(env);
   gJava_MyClassNatives_fooSSIOO_calls[gCurrentJni]++;
   switch (x) {
     case 1:
@@ -1236,7 +1190,6 @@
   SetUpForTest(true, "fooSSIOO",
                "(ILjava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;",
                CURRENT_JNI_WRAPPER(Java_MyClassNatives_fooSSIOO));
-  ScopedSynchronizedEntryPointOverrides ssepo;
 
   EXPECT_EQ(0, gJava_MyClassNatives_fooSSIOO_calls[gCurrentJni]);
   jobject result = env_->CallStaticObjectMethod(jklass_, jmethod_, 0, nullptr, nullptr);
@@ -1505,7 +1458,8 @@
 JNI_TEST(GetText)
 
 int gJava_MyClassNatives_GetSinkProperties_calls[kJniKindCount] = {};
-jarray Java_MyClassNatives_GetSinkProperties(JNIEnv*, jobject thisObj, jstring s) {
+jarray Java_MyClassNatives_GetSinkProperties(JNIEnv* env, jobject thisObj, jstring s) {
+  JniCompilerTest::AssertCallerObjectLocked(env);
   EXPECT_EQ(s, nullptr);
   gJava_MyClassNatives_GetSinkProperties_calls[gCurrentJni]++;
 
@@ -1518,7 +1472,6 @@
 void JniCompilerTest::GetSinkPropertiesNativeImpl() {
   SetUpForTest(false, "getSinkPropertiesNative", "(Ljava/lang/String;)[Ljava/lang/Object;",
                CURRENT_JNI_WRAPPER(Java_MyClassNatives_GetSinkProperties));
-  ScopedSynchronizedEntryPointOverrides ssepo;
 
   EXPECT_EQ(0, gJava_MyClassNatives_GetSinkProperties_calls[gCurrentJni]);
   jarray result = down_cast<jarray>(
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 40110d7..bc1c842 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -74,28 +74,17 @@
   return JNIMacroAssembler<kPointerSize>::Create(allocator, isa, features);
 }
 
-enum class JniEntrypoint {
-  kStart,
-  kEnd
-};
-
 template <PointerSize kPointerSize>
-static ThreadOffset<kPointerSize> GetJniEntrypointThreadOffset(JniEntrypoint which,
-                                                               bool reference_return) {
-  if (which == JniEntrypoint::kStart) {  // JniMethodStart
-    ThreadOffset<kPointerSize> jni_start = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart);
-    return jni_start;
-  } else {  // JniMethodEnd
-    ThreadOffset<kPointerSize> jni_end(-1);
-    if (reference_return) {
-      // Pass result.
-      jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
-    } else {
-      jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd);
-    }
-
-    return jni_end;
+static ThreadOffset<kPointerSize> GetJniMethodEndThreadOffset(bool reference_return) {
+  ThreadOffset<kPointerSize> jni_end(-1);
+  if (reference_return) {
+    // Pass result.
+    jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
+  } else {
+    jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd);
   }
+
+  return jni_end;
 }
 
 
@@ -249,7 +238,7 @@
     __ StoreStackPointerToThread(Thread::TopOfManagedStackOffset<kPointerSize>());
   }
 
-  // 2. Lock the object (if synchronized) and transition out of runnable (if normal native).
+  // 2. Lock the object (if synchronized) and transition out of Runnable (if normal native).
 
   // 2.1. Lock the synchronization object (`this` or class) for synchronized methods.
   if (UNLIKELY(is_synchronized)) {
@@ -273,92 +262,19 @@
     __ CallFromThread(QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniLockObject));
   }
 
-  // 2.2. Move frame down to allow space for out going args.
-  //      This prepares for both the `JniMethodStart()` call as well as the main native call.
-  size_t current_out_arg_size = main_out_arg_size;
-  if (UNLIKELY(is_critical_native)) {
-    DCHECK_EQ(main_out_arg_size, current_frame_size);
-  } else {
-    __ IncreaseFrameSize(main_out_arg_size);
-    current_frame_size += main_out_arg_size;
-  }
-
-  // 2.3. Spill all register arguments to preserve them across the `JniLockObject()`
-  //      call (if synchronized) and `JniMethodStart()` call (if normal native).
-  //      Native stack arguments are spilled directly to their argument stack slots and
-  //      references are converted to `jobject`. Native register arguments are spilled to
-  //      the reserved slots in the caller frame, references are not converted to `jobject`;
-  //      references from registers are actually skipped as they were already spilled above.
-  // TODO: Implement fast-path for transition to Native and avoid this spilling.
-  src_args.clear();
-  dest_args.clear();
-  refs.clear();
+  // 2.2. Transition from Runnable to Suspended.
+  // Managed callee-saves were already saved, so these registers are now available.
+  ArrayRef<const ManagedRegister> callee_save_scratch_regs = UNLIKELY(is_critical_native)
+      ? ArrayRef<const ManagedRegister>()
+      : main_jni_conv->CalleeSaveScratchRegisters();
+  std::unique_ptr<JNIMacroLabel> transition_to_native_slow_path;
+  std::unique_ptr<JNIMacroLabel> transition_to_native_resume;
   if (LIKELY(!is_critical_native && !is_fast_native)) {
-    mr_conv->ResetIterator(FrameOffset(current_frame_size));
-    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    main_jni_conv->Next();    // Skip JNIEnv*.
-    // Add a no-op move for the `jclass` / `this` argument to avoid the
-    // next argument being treated as non-null if it's a reference.
-    // Note: We have already spilled `this` as raw reference above. Since `this`
-    // cannot be null, the argument move before the native call does not need
-    // to reload the reference, and that argument move also needs to see the
-    // `this` argument to avoid treating another reference as non-null.
-    // Note: Using the method register for the no-op move even for `this`.
-    src_args.emplace_back(method_register, kRawPointerSize);
-    dest_args.emplace_back(method_register, kRawPointerSize);
-    refs.push_back(kInvalidReferenceOffset);
-    if (is_static) {
-      main_jni_conv->Next();    // Skip `jclass`.
-    } else {
-      // Skip `this`
-      DCHECK(mr_conv->HasNext());
-      DCHECK(main_jni_conv->HasNext());
-      DCHECK(mr_conv->IsCurrentParamAReference());
-      mr_conv->Next();
-      main_jni_conv->Next();
-    }
-    for (; mr_conv->HasNext(); mr_conv->Next(), main_jni_conv->Next()) {
-      DCHECK(main_jni_conv->HasNext());
-      static_assert(kObjectReferenceSize == 4u);
-      bool is_reference = mr_conv->IsCurrentParamAReference();
-      bool src_in_reg = mr_conv->IsCurrentParamInRegister();
-      bool dest_in_reg = main_jni_conv->IsCurrentParamInRegister();
-      if (is_reference && src_in_reg && dest_in_reg) {
-        // We have already spilled the raw reference above.
-        continue;
-      }
-      bool spill_jobject = is_reference && !dest_in_reg;
-      size_t src_size = (!is_reference && mr_conv->IsCurrentParamALongOrDouble()) ? 8u : 4u;
-      size_t dest_size = spill_jobject ? kRawPointerSize : src_size;
-      src_args.push_back(src_in_reg
-          ? ArgumentLocation(mr_conv->CurrentParamRegister(), src_size)
-          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), src_size));
-      dest_args.push_back(dest_in_reg
-          ? ArgumentLocation(mr_conv->CurrentParamStackOffset(), dest_size)
-          : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), dest_size));
-      refs.push_back(spill_jobject ? mr_conv->CurrentParamStackOffset() : kInvalidReferenceOffset);
-    }
-    __ MoveArguments(ArrayRef<ArgumentLocation>(dest_args),
-                     ArrayRef<ArgumentLocation>(src_args),
-                     ArrayRef<FrameOffset>(refs));
-  }  // if (!is_critical_native)
-
-  // 2.4. Call into `JniMethodStart()` passing Thread* so that transition out of Runnable
-  //      can occur. We abuse the JNI calling convention here, that is guaranteed to support
-  //      passing two pointer arguments, `JNIEnv*` and `jclass`/`jobject`, and we use just one.
-  if (LIKELY(!is_critical_native && !is_fast_native)) {
-    // Skip this for @CriticalNative and @FastNative methods. They do not call JniMethodStart.
-    ThreadOffset<kPointerSize> jni_start =
-        GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kStart, reference_return);
-    main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    if (main_jni_conv->IsCurrentParamInRegister()) {
-      __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
-      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_start));
-    } else {
-      __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset());
-      __ CallFromThread(jni_start);
-    }
-    method_register = ManagedRegister::NoRegister();  // Method register is clobbered by the call.
+    transition_to_native_slow_path = __ CreateLabel();
+    transition_to_native_resume = __ CreateLabel();
+    __ TryToTransitionFromRunnableToNative(transition_to_native_slow_path.get(),
+                                           callee_save_scratch_regs);
+    __ Bind(transition_to_native_resume.get());
   }
 
   // 3. Push local reference frame.
@@ -369,9 +285,6 @@
   if (LIKELY(!is_critical_native)) {
     // To pop the local reference frame later, we shall need the JNI environment pointer
     // as well as the cookie, so we preserve them across calls in callee-save registers.
-    // Managed callee-saves were already saved, so these registers are now available.
-    ArrayRef<const ManagedRegister> callee_save_scratch_regs =
-        main_jni_conv->CalleeSaveScratchRegisters();
     CHECK_GE(callee_save_scratch_regs.size(), 3u);  // At least 3 for each supported architecture.
     jni_env_reg = callee_save_scratch_regs[0];
     saved_cookie_reg = __ CoreRegisterWithSize(callee_save_scratch_regs[1], kIRTCookieSize);
@@ -387,7 +300,19 @@
 
   // 4. Make the main native call.
 
-  // 4.1. Fill arguments except the `JNIEnv*`.
+  // 4.1. Move frame down to allow space for outgoing args.
+  size_t current_out_arg_size = main_out_arg_size;
+  if (UNLIKELY(is_critical_native)) {
+    DCHECK_EQ(main_out_arg_size, current_frame_size);
+  } else {
+    __ IncreaseFrameSize(main_out_arg_size);
+    current_frame_size += main_out_arg_size;
+  }
+
+  // 4.2. Fill arguments except the `JNIEnv*`.
+  // Note: Non-null reference arguments in registers may point to the from-space if we
+  // took the slow-path for locking or transition to Native. However, we only need to
+  // compare them with null to construct `jobject`s, so we can still use them.
   src_args.clear();
   dest_args.clear();
   refs.clear();
@@ -406,16 +331,10 @@
     if (!is_static || main_jni_conv->IsCurrentParamOnStack()) {
       // The method shall not be available in the `jclass` argument register.
       // Make sure it is available in `callee_save_temp` for the call below.
-      // (For @FastNative, the old method register can be clobbered by argument moves.
-      // For normal native, it was already clobbered by the `JniMethodStart*()` call.)
+      // (The old method register can be clobbered by argument moves.)
       ManagedRegister new_method_reg = __ CoreRegisterWithSize(callee_save_temp, kRawPointerSize);
-      if (UNLIKELY(is_fast_native)) {
-        DCHECK(!method_register.IsNoRegister());
-        __ Move(new_method_reg, method_register, kRawPointerSize);
-      } else {
-        DCHECK(method_register.IsNoRegister());
-        __ Load(new_method_reg, method_offset, kRawPointerSize);
-      }
+      DCHECK(!method_register.IsNoRegister());
+      __ Move(new_method_reg, method_register, kRawPointerSize);
       method_register = new_method_reg;
     }
     if (is_static) {
@@ -436,41 +355,19 @@
       }
       refs.push_back(kInvalidReferenceOffset);
       main_jni_conv->Next();
-    } else {
-      // The `this` argument for instance methods is passed first, so that `MoveArguments()`
-      // treats it as non-null. It has not been converted to `jobject` yet, not even for normal
-      // native methods on architectures where this argument is passed on the stack (x86).
-      DCHECK(mr_conv->HasNext());
-      DCHECK(main_jni_conv->HasNext());
-      DCHECK(mr_conv->IsCurrentParamAReference());
-      src_args.push_back(UNLIKELY(is_fast_native) && mr_conv->IsCurrentParamInRegister()
-          ? ArgumentLocation(mr_conv->CurrentParamRegister(), kObjectReferenceSize)
-          : ArgumentLocation(mr_conv->CurrentParamStackOffset(), kObjectReferenceSize));
-      dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
-          ? ArgumentLocation(main_jni_conv->CurrentParamRegister(), kRawPointerSize)
-          : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), kRawPointerSize));
-      refs.push_back(mr_conv->CurrentParamStackOffset());
-      mr_conv->Next();
-      main_jni_conv->Next();
     }
   }
   // Move normal arguments to their locations.
   for (; mr_conv->HasNext(); mr_conv->Next(), main_jni_conv->Next()) {
     DCHECK(main_jni_conv->HasNext());
-    bool dest_in_reg = main_jni_conv->IsCurrentParamInRegister();
-    if (LIKELY(!is_critical_native && !is_fast_native) && !dest_in_reg) {
-      // Stack arguments for normal native have already been filled.
-      continue;
-    }
     static_assert(kObjectReferenceSize == 4u);
     bool is_reference = mr_conv->IsCurrentParamAReference();
     size_t src_size = (!is_reference && mr_conv->IsCurrentParamALongOrDouble()) ? 8u : 4u;
     size_t dest_size = is_reference ? kRawPointerSize : src_size;
-    src_args.push_back(
-        UNLIKELY(is_critical_native || is_fast_native) && mr_conv->IsCurrentParamInRegister()
-            ? ArgumentLocation(mr_conv->CurrentParamRegister(), src_size)
-            : ArgumentLocation(mr_conv->CurrentParamStackOffset(), src_size));
-    dest_args.push_back(dest_in_reg
+    src_args.push_back(mr_conv->IsCurrentParamInRegister()
+        ? ArgumentLocation(mr_conv->CurrentParamRegister(), src_size)
+        : ArgumentLocation(mr_conv->CurrentParamStackOffset(), src_size));
+    dest_args.push_back(main_jni_conv->IsCurrentParamInRegister()
         ? ArgumentLocation(main_jni_conv->CurrentParamRegister(), dest_size)
         : ArgumentLocation(main_jni_conv->CurrentParamStackOffset(), dest_size));
     refs.push_back(is_reference ? mr_conv->CurrentParamStackOffset() : kInvalidReferenceOffset);
@@ -480,7 +377,7 @@
                    ArrayRef<ArgumentLocation>(src_args),
                    ArrayRef<FrameOffset>(refs));
 
-  // 4.2. Create 1st argument, the JNI environment ptr.
+  // 4.3. Create 1st argument, the JNI environment ptr.
   if (LIKELY(!is_critical_native)) {
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
     if (main_jni_conv->IsCurrentParamInRegister()) {
@@ -492,7 +389,7 @@
     }
   }
 
-  // 4.3. Plant call to native code associated with method.
+  // 4.4. Plant call to native code associated with method.
   MemberOffset jni_entrypoint_offset =
       ArtMethod::EntryPointFromJniOffset(InstructionSetPointerSize(instruction_set));
   if (UNLIKELY(is_critical_native)) {
@@ -509,7 +406,7 @@
     method_register = ManagedRegister::NoRegister();
   }
 
-  // 4.4. Fix differences in result widths.
+  // 4.5. Fix differences in result widths.
   if (main_jni_conv->RequiresSmallResultTypeExtension()) {
     DCHECK(main_jni_conv->HasSmallReturnType());
     CHECK(!is_critical_native || !main_jni_conv->UseTailCall());
@@ -591,7 +488,7 @@
     if (LIKELY(!is_fast_native) || reference_return) {
       ThreadOffset<kPointerSize> jni_end = is_fast_native
           ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniDecodeReferenceResult)
-          : GetJniEntrypointThreadOffset<kPointerSize>(JniEntrypoint::kEnd, reference_return);
+          : GetJniMethodEndThreadOffset<kPointerSize>(reference_return);
       if (reference_return) {
         // Pass result.
         SetNativeParameter(jni_asm.get(), main_jni_conv.get(), main_jni_conv->ReturnRegister());
@@ -709,7 +606,14 @@
     __ Jump(jclass_read_barrier_return.get());
   }
 
-  // 8.2. Suspend check slow path.
+  // 8.2. Slow path for transition to Native.
+  if (LIKELY(!is_critical_native && !is_fast_native)) {
+    __ Bind(transition_to_native_slow_path.get());
+    __ CallFromThread(QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart));
+    __ Jump(transition_to_native_resume.get());
+  }
+
+  // 8.3. Suspend check slow path.
   if (UNLIKELY(is_fast_native)) {
     __ Bind(suspend_check_slow_path.get());
     if (reference_return && main_out_arg_size != 0) {
@@ -729,7 +633,7 @@
     __ Jump(suspend_check_resume.get());
   }
 
-  // 8.3. Exception poll slow path(s).
+  // 8.4. Exception poll slow path(s).
   if (LIKELY(!is_critical_native)) {
     __ Bind(exception_slow_path.get());
     if (UNLIKELY(is_fast_native) && reference_return) {
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 3d45abd..b06f428 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -1050,6 +1050,35 @@
   asm_.StoreToOffset(kStoreWord, tr, sp, dest_offset.Int32Value());
 }
 
+void ArmVIXLJNIMacroAssembler::TryToTransitionFromRunnableToNative(
+    JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kArmPointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kArmPointerSize>(kMutatorLock);
+
+  DCHECK_GE(scratch_regs.size(), 2u);
+  vixl32::Register scratch = AsVIXLRegister(scratch_regs[0].AsArm());
+  vixl32::Register scratch2 = AsVIXLRegister(scratch_regs[1].AsArm());
+
+  // CAS acquire, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+  vixl32::Label retry;
+  ___ Bind(&retry);
+  ___ Ldrex(scratch, MemOperand(tr, thread_flags_offset.Int32Value()));
+  ___ Mov(scratch2, kNativeStateValue);
+  // If any flags are set, go to the slow path.
+  ___ Cmp(scratch, kRunnableStateValue);
+  ___ B(ne, ArmVIXLJNIMacroLabel::Cast(label)->AsArm());
+  ___ Strex(scratch, scratch2, MemOperand(tr, thread_flags_offset.Int32Value()));
+  ___ Cmp(scratch, 0);
+  ___ B(ne, &retry);
+  ___ Dmb(DmbOptions::ISH);  // Memory barrier "load-any" for the "acquire" operation.
+
+  // Clear `self->tlsPtr_.held_mutexes[kMutatorLock]`; `scratch` holds 0 at this point.
+  ___ Str(scratch, MemOperand(tr, thread_held_mutex_mutator_lock_offset.Int32Value()));
+}
+
 void ArmVIXLJNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   vixl32::Register scratch = temps.Acquire();
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 49f5e7c..7b9d7de 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -184,6 +184,10 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset32 offset) override;
 
+  // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  void TryToTransitionFromRunnableToNative(
+      JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index a505db0..8ae1d04 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -889,6 +889,34 @@
   ___ Str(scratch, MEM_OP(reg_x(SP), out_off.Int32Value()));
 }
 
+void Arm64JNIMacroAssembler::TryToTransitionFromRunnableToNative(
+    JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs ATTRIBUTE_UNUSED) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kArm64PointerSize>();
+  constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kArm64PointerSize>(kMutatorLock);
+
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  Register scratch = temps.AcquireW();
+  Register scratch2 = temps.AcquireW();
+
+  // CAS acquire, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+  vixl::aarch64::Label retry;
+  ___ Bind(&retry);
+  static_assert(thread_flags_offset.Int32Value() == 0);  // LDAXR/STXR require exact address.
+  ___ Ldaxr(scratch, MEM_OP(reg_x(TR)));
+  ___ Mov(scratch2, kNativeStateValue);
+  // If any flags are set, go to the slow path.
+  static_assert(kRunnableStateValue == 0u);
+  ___ Cbnz(scratch, Arm64JNIMacroLabel::Cast(label)->AsArm64());
+  ___ Stxr(scratch, scratch2, MEM_OP(reg_x(TR)));
+  ___ Cbnz(scratch, &retry);
+
+  // Clear `self->tlsPtr_.held_mutexes[kMutatorLock]`.
+  ___ Str(xzr, MEM_OP(reg_x(TR), thread_held_mutex_mutator_lock_offset.Int32Value()));
+}
+
 void Arm64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireW();
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index b6e31c2..1c61d96 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -168,6 +168,10 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset64 offset) override;
 
+  // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  void TryToTransitionFromRunnableToNative(
+      JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index abb53b7..659ff4c 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -251,6 +251,10 @@
   virtual void Call(FrameOffset base, Offset offset) = 0;
   virtual void CallFromThread(ThreadOffset<kPointerSize> offset) = 0;
 
+  // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  virtual void TryToTransitionFromRunnableToNative(
+      JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) = 0;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   virtual void SuspendCheck(JNIMacroLabel* label) = 0;
 
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 4ba3aa1..fc92c30 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -589,6 +589,35 @@
   __ movl(Address(ESP, offset), scratch);
 }
 
+void X86JNIMacroAssembler::TryToTransitionFromRunnableToNative(
+    JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kX86PointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86PointerSize>(kMutatorLock);
+
+  // We need to preserve managed argument EAX.
+  DCHECK_GE(scratch_regs.size(), 2u);
+  Register saved_eax = scratch_regs[0].AsX86().AsCpuRegister();
+  Register scratch = scratch_regs[1].AsX86().AsCpuRegister();
+
+  // CAS acquire, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+  __ movl(saved_eax, EAX);  // Save EAX.
+  static_assert(kRunnableStateValue == 0u);
+  __ xorl(EAX, EAX);
+  __ movl(scratch, Immediate(kNativeStateValue));
+  __ fs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value()), scratch);
+  // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  __ movl(EAX, saved_eax);  // Restore EAX; MOV does not change flags.
+  // If any flags are set, go to the slow path.
+  __ j(kNotZero, X86JNIMacroLabel::Cast(label)->AsX86());
+
+  // Clear `self->tlsPtr_.held_mutexes[kMutatorLock]`.
+  __ fs()->movl(Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value()),
+                Immediate(0));
+}
+
 void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()),
                  Immediate(Thread::SuspendOrCheckpointRequestFlags()));
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 058e040..0af6371 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -160,6 +160,10 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset32 offset) override;
 
+  // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  void TryToTransitionFromRunnableToNative(
+      JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index de99e74..3ddb689 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -671,6 +671,33 @@
   __ movq(Address(CpuRegister(RSP), offset), scratch);
 }
 
+void X86_64JNIMacroAssembler::TryToTransitionFromRunnableToNative(
+    JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs ATTRIBUTE_UNUSED) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kX86_64PointerSize>();
+  constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86_64PointerSize>(kMutatorLock);
+
+  CpuRegister rax(RAX);  // RAX can be freely clobbered. It does not hold any argument.
+  CpuRegister scratch = GetScratchRegister();
+
+  // CAS acquire, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+  static_assert(kRunnableStateValue == 0u);
+  __ xorl(rax, rax);
+  __ movl(scratch, Immediate(kNativeStateValue));
+  __ gs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value(), /*no_rip=*/ true),
+                        scratch);
+  // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  // If any flags are set, go to the slow path.
+  __ j(kNotZero, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
+
+  // Clear `self->tlsPtr_.held_mutexes[kMutatorLock]`.
+  __ gs()->movq(
+      Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value(), /*no_rip=*/ true),
+      Immediate(0));
+}
+
 void X86_64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ gs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
                  Immediate(Thread::SuspendOrCheckpointRequestFlags()));
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 3e5dfb7..6eb7873 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -180,6 +180,10 @@
   void Call(FrameOffset base, Offset offset) override;
   void CallFromThread(ThreadOffset64 offset) override;
 
+  // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  void TryToTransitionFromRunnableToNative(
+      JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;