JNI: Inline fast-path for `JniMethodStart()`.

Golem results for art-opt-cc (higher is better):
linux-ia32                       before after
NativeDowncallStaticNormal       35.306 47.382 (+34.20%)
NativeDowncallStaticNormal6      32.951 42.247 (+28.21%)
NativeDowncallStaticNormalRefs6  17.866 41.355 (+131.5%)
NativeDowncallVirtualNormal      35.341 46.836 (+32.53%)
NativeDowncallVirtualNormal6     32.403 41.791 (+28.97%)
NativeDowncallVirtualNormalRefs6 32.131 40.500 (+26.05%)
linux-x64                        before after
NativeDowncallStaticNormal       33.350 43.716 (+31.08%)
NativeDowncallStaticNormal6      31.096 43.176 (+38.85%)
NativeDowncallStaticNormalRefs6  30.617 38.500 (+25.75%)
NativeDowncallVirtualNormal      33.234 43.672 (+32.41%)
NativeDowncallVirtualNormal6     30.617 42.247 (+37.98%)
NativeDowncallVirtualNormalRefs6 32.131 42.701 (+32.90%)
linux-armv7                      before after
NativeDowncallStaticNormal       7.8701 9.9651 (+26.62%)
NativeDowncallStaticNormal6      7.4147 8.9463 (+20.66%)
NativeDowncallStaticNormalRefs6  6.8830 8.3868 (+21.85%)
NativeDowncallVirtualNormal      7.8316 9.8377 (+25.61%)
NativeDowncallVirtualNormal6     7.4147 9.3596 (+26.23%)
NativeDowncallVirtualNormalRefs6 6.6794 8.4325 (+26.25%)
linux-armv8                      before after
NativeDowncallStaticNormal       7.6372 9.8571 (+29.07%)
NativeDowncallStaticNormal6      7.4147 9.4905 (+28.00%)
NativeDowncallStaticNormalRefs6  6.8527 8.6705 (+26.53%)
NativeDowncallVirtualNormal      7.4147 9.3183 (+25.67%)
NativeDowncallVirtualNormal6     7.0755 9.2593 (+30.86%)
NativeDowncallVirtualNormalRefs6 6.5604 8.2967 (+26.47%)

Note that NativeDowncallStaticNormalRefs6 on x86 has been
fluctuating wildly since
    https://android-review.googlesource.com/1905055
between ~17.6 and ~32.4 for completely unrelated changes,
so if we take 32.4 as the baseline, the improvement is
only ~27.6%, in line with the other x86 benchmarks.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: I771a4765bd3a7c4e58b94be4155515241ea6fa3c
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 4ba3aa1..fc92c30 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -589,6 +589,35 @@
   __ movl(Address(ESP, offset), scratch);
 }
 
+void X86JNIMacroAssembler::TryToTransitionFromRunnableToNative(
+    JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kX86PointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86PointerSize>(kMutatorLock);
+
+  // CMPXCHG takes its expected value in EAX, so the managed argument in EAX must be preserved.
+  DCHECK_GE(scratch_regs.size(), 2u);
+  Register saved_eax = scratch_regs[0].AsX86().AsCpuRegister();
+  Register scratch = scratch_regs[1].AsX86().AsCpuRegister();
+
+  // Fast path: CAS, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+  __ movl(saved_eax, EAX);  // Save EAX.
+  static_assert(kRunnableStateValue == 0u);  // XOR below relies on the expected value being 0.
+  __ xorl(EAX, EAX);  // EAX = expected old value (kRunnableStateValue).
+  __ movl(scratch, Immediate(kNativeStateValue));  // New value to store on success.
+  __ fs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value()), scratch);
+  // LOCK CMPXCHG is a full barrier: no fence needed. NOTE(review): intent is release? - confirm.
+  __ movl(EAX, saved_eax);  // Restore EAX; MOV does not change flags.
+  // ZF clear: the word was not exactly kRunnableStateValue (e.g. flags set); take the slow path.
+  __ j(kNotZero, X86JNIMacroLabel::Cast(label)->AsX86());  // `label` is the slow path.
+
+  // CAS succeeded: clear `self->tlsPtr_.held_mutexes[kMutatorLock]`.
+  __ fs()->movl(Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value()),
+                Immediate(0));
+}
+
 void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()),
                  Immediate(Thread::SuspendOrCheckpointRequestFlags()));