JNI: Inline fast-path for `JniMethodEnd()`.
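
Replace the `JniMethodEnd()` call on the normal native return path
with an inline fast-path: transition the thread from Native back to
Runnable with a single acquire CAS on the thread flags and mark the
mutator lock as held, falling back to a slow-path call to
`JniMethodEnd()` only if the CAS fails (some flag is set or the
state is not Native). The core argument registers are exposed as
`ArgumentScratchRegisters()` for use by this fast-path, so the
return value no longer needs to be spilled to the stack across the
transition.

Rough sketch of the inlined fast-path logic (illustrative standalone
C++ using std::atomic, not the actual emitted code or ART helpers;
the state encoding comes from `Thread::StoredThreadStateValue()` and
Runnable is encoded as 0):

  #include <atomic>
  #include <cstdint>

  // Returns true if the fast-path transition to Runnable succeeded.
  bool TryFastTransitionToRunnable(std::atomic<uint32_t>& state_and_flags,
                                   uint32_t native_state_value) {
    uint32_t expected = native_state_value;  // State is Native, no flags set.
    // Acquire CAS; it fails if any flag is set or the state is not
    // Native, in which case the generated code branches to the slow
    // path that calls the `JniMethodEnd()` entrypoint.
    return state_and_flags.compare_exchange_strong(
        expected, /*kRunnableStateValue=*/ 0u, std::memory_order_acquire);
  }
  // On success, the generated code also records the mutator lock as held:
  //   self->tlsPtr_.held_mutexes[kMutatorLock] = <mutator lock>;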

Golem results for art-opt-cc (higher is better):
linux-ia32                       before after
NativeDowncallStaticNormal       46.766 51.016 (+9.086%)
NativeDowncallStaticNormal6      42.268 45.748 (+8.235%)
NativeDowncallStaticNormalRefs6  41.355 44.776 (+8.272%)
NativeDowncallVirtualNormal      46.361 52.527 (+13.30%)
NativeDowncallVirtualNormal6     41.812 45.206 (+8.118%)
NativeDowncallVirtualNormalRefs6 40.500 44.169 (+9.059%)
(The NativeDowncallVirtualNormal result for x86 is skewed
by one unusually good run, as Golem reports the best result
in the summary. Using the second-best and most frequent
result, 50.5, the improvement is only around 8.9%.)
linux-x64                        before after
NativeDowncallStaticNormal       44.169 47.976 (+8.620%)
NativeDowncallStaticNormal6      43.198 46.836 (+8.423%)
NativeDowncallStaticNormalRefs6  38.481 44.687 (+16.13%)
NativeDowncallVirtualNormal      43.672 47.405 (+8.547%)
NativeDowncallVirtualNormal6     42.268 45.726 (+8.182%)
NativeDowncallVirtualNormalRefs6 41.355 44.687 (+8.057%)
(The NativeDowncallStaticNormalRefs6 improvement for x86-64
is a bit inflated because recent baseline results jump
between ~38.5 and ~40.5. If we take the latter as the
baseline, the improvement is only around 10.3%.)
linux-armv7                      before after
NativeDowncallStaticNormal       10.659 14.620 (+37.16%)
NativeDowncallStaticNormal6      9.8377 13.120 (+33.36%)
NativeDowncallStaticNormalRefs6  8.8714 11.454 (+29.11%)
NativeDowncallVirtualNormal      10.511 14.349 (+36.51%)
NativeDowncallVirtualNormal6     9.9701 13.347 (+33.87%)
NativeDowncallVirtualNormalRefs6 8.9241 11.454 (+28.35%)
linux-armv8                      before after
NativeDowncallStaticNormal       10.608 16.329 (+53.93%)
NativeDowncallStaticNormal6      10.179 15.347 (+50.76%)
NativeDowncallStaticNormalRefs6  9.2457 13.705 (+48.23%)
NativeDowncallVirtualNormal      9.9850 14.903 (+49.25%)
NativeDowncallVirtualNormal6     9.9206 14.757 (+48.75%)
NativeDowncallVirtualNormalRefs6 8.8235 12.789 (+44.94%)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: Ie144bc4f7f82be95790ea7d3123b81a3b6bfa603
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index 6278f12..3fe8226 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -1,8 +1,8 @@
 // TODO These arrays should be generated automatically or have instructions for re-creation.
 static constexpr uint8_t expected_asm_kThumb2[] = {
-    0x2D, 0xE9, 0xE0, 0x4D, 0x2D, 0xED, 0x10, 0x8A, 0x85, 0xB0, 0x00, 0x90,
-    0x1D, 0x91, 0x8D, 0xED, 0x1E, 0x0A, 0x1F, 0x92, 0x20, 0x93, 0x88, 0xB0,
-    0x08, 0xB0, 0x05, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x4D,
+    0x2D, 0xE9, 0xE0, 0x4D, 0x2D, 0xED, 0x10, 0x8A, 0x81, 0xB0, 0x00, 0x90,
+    0x19, 0x91, 0x8D, 0xED, 0x1A, 0x0A, 0x1B, 0x92, 0x1C, 0x93, 0x88, 0xB0,
+    0x08, 0xB0, 0x01, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x4D,
     0xD9, 0xF8, 0x24, 0x80, 0x70, 0x47,
 };
 static constexpr uint8_t expected_cfi_kThumb2[] = {
@@ -11,12 +11,12 @@
     0x51, 0x16, 0x05, 0x52, 0x15, 0x05, 0x53, 0x14, 0x05, 0x54, 0x13, 0x05,
     0x55, 0x12, 0x05, 0x56, 0x11, 0x05, 0x57, 0x10, 0x05, 0x58, 0x0F, 0x05,
     0x59, 0x0E, 0x05, 0x5A, 0x0D, 0x05, 0x5B, 0x0C, 0x05, 0x5C, 0x0B, 0x05,
-    0x5D, 0x0A, 0x05, 0x5E, 0x09, 0x05, 0x5F, 0x08, 0x42, 0x0E, 0x70, 0x4E,
-    0x0E, 0x90, 0x01, 0x42, 0x0E, 0x70, 0x0A, 0x42, 0x0E, 0x5C, 0x44, 0x0E,
+    0x5D, 0x0A, 0x05, 0x5E, 0x09, 0x05, 0x5F, 0x08, 0x42, 0x0E, 0x60, 0x4E,
+    0x0E, 0x80, 0x01, 0x42, 0x0E, 0x60, 0x0A, 0x42, 0x0E, 0x5C, 0x44, 0x0E,
     0x1C, 0x06, 0x50, 0x06, 0x51, 0x06, 0x52, 0x06, 0x53, 0x06, 0x54, 0x06,
     0x55, 0x06, 0x56, 0x06, 0x57, 0x06, 0x58, 0x06, 0x59, 0x06, 0x5A, 0x06,
     0x5B, 0x06, 0x5C, 0x06, 0x5D, 0x06, 0x5E, 0x06, 0x5F, 0x44, 0x0E, 0x00,
-    0xC5, 0xC6, 0xC7, 0xC8, 0xCA, 0xCB, 0xCE, 0x46, 0x0B, 0x0E, 0x70,
+    0xC5, 0xC6, 0xC7, 0xC8, 0xCA, 0xCB, 0xCE, 0x46, 0x0B, 0x0E, 0x60,
 };
 // 0x00000000: push {r5,r6,r7,r8,r10,r11,lr}
 // 0x00000004: .cfi_def_cfa_offset: 28
@@ -45,19 +45,19 @@
 // 0x00000008: .cfi_offset_extended: r93 at cfa-40
 // 0x00000008: .cfi_offset_extended: r94 at cfa-36
 // 0x00000008: .cfi_offset_extended: r95 at cfa-32
-// 0x00000008: sub sp, #20
-// 0x0000000a: .cfi_def_cfa_offset: 112
+// 0x00000008: sub sp, #4
+// 0x0000000a: .cfi_def_cfa_offset: 96
 // 0x0000000a: str r0, [sp]
-// 0x0000000c: str r1, [sp, #116]
-// 0x0000000e: vstr s0, [sp, #120]
-// 0x00000012: str r2, [sp, #124]
-// 0x00000014: str r3, [sp, #128]
+// 0x0000000c: str r1, [sp, #100]
+// 0x0000000e: vstr s0, [sp, #104]
+// 0x00000012: str r2, [sp, #108]
+// 0x00000014: str r3, [sp, #112]
 // 0x00000016: sub sp, #32
-// 0x00000018: .cfi_def_cfa_offset: 144
+// 0x00000018: .cfi_def_cfa_offset: 128
 // 0x00000018: add sp, #32
-// 0x0000000a: .cfi_def_cfa_offset: 112
+// 0x0000001a: .cfi_def_cfa_offset: 96
 // 0x0000001a: .cfi_remember_state
-// 0x0000001a: add sp, #20
+// 0x0000001a: add sp, #4
 // 0x0000001c: .cfi_def_cfa_offset: 92
 // 0x0000001c: vpop {s16-s31}
 // 0x00000020: .cfi_def_cfa_offset: 28
@@ -86,7 +86,7 @@
 // 0x00000024: .cfi_restore: r10
 // 0x00000024: .cfi_restore: r11
 // 0x00000024: .cfi_restore: r14
-// 0x00000024: ldr r8, [tr, #48] ; is_gc_marking
+// 0x00000024: ldr r8, [tr, #36] ; is_gc_marking
 // 0x00000028: bx lr
 // 0x0000002a: .cfi_restore_state
 // 0x0000002a: .cfi_def_cfa_offset: 112
@@ -246,26 +246,25 @@
 
 static constexpr uint8_t expected_asm_kX86_64[] = {
     0x41, 0x57, 0x41, 0x56, 0x41, 0x55, 0x41, 0x54, 0x55, 0x53, 0x48, 0x83,
-    0xEC, 0x38, 0xF2, 0x44, 0x0F, 0x11, 0x7C, 0x24, 0x30, 0xF2, 0x44, 0x0F,
-    0x11, 0x74, 0x24, 0x28, 0xF2, 0x44, 0x0F, 0x11, 0x6C, 0x24, 0x20, 0xF2,
-    0x44, 0x0F, 0x11, 0x64, 0x24, 0x18, 0x48, 0x89, 0x3C, 0x24, 0x89, 0x74,
-    0x24, 0x78, 0xF3, 0x0F, 0x11, 0x44, 0x24, 0x7C, 0x89, 0x94, 0x24, 0x80,
-    0x00, 0x00, 0x00, 0x89, 0x8C, 0x24, 0x84, 0x00, 0x00, 0x00, 0x48, 0x83,
-    0xC4, 0xE0, 0x48, 0x83, 0xC4, 0x20, 0xF2, 0x44, 0x0F, 0x10, 0x64, 0x24,
-    0x18, 0xF2, 0x44, 0x0F, 0x10, 0x6C, 0x24, 0x20, 0xF2, 0x44, 0x0F, 0x10,
-    0x74, 0x24, 0x28, 0xF2, 0x44, 0x0F, 0x10, 0x7C, 0x24, 0x30, 0x48, 0x83,
-    0xC4, 0x38, 0x5B, 0x5D, 0x41, 0x5C, 0x41, 0x5D, 0x41, 0x5E, 0x41, 0x5F,
-    0xC3,
+    0xEC, 0x28, 0xF2, 0x44, 0x0F, 0x11, 0x7C, 0x24, 0x20, 0xF2, 0x44, 0x0F,
+    0x11, 0x74, 0x24, 0x18, 0xF2, 0x44, 0x0F, 0x11, 0x6C, 0x24, 0x10, 0xF2,
+    0x44, 0x0F, 0x11, 0x64, 0x24, 0x08, 0x48, 0x89, 0x3C, 0x24, 0x89, 0x74,
+    0x24, 0x68, 0xF3, 0x0F, 0x11, 0x44, 0x24, 0x6C, 0x89, 0x54, 0x24, 0x70,
+    0x89, 0x4C, 0x24, 0x74, 0x48, 0x83, 0xC4, 0xE0, 0x48, 0x83, 0xC4, 0x20,
+    0xF2, 0x44, 0x0F, 0x10, 0x64, 0x24, 0x08, 0xF2, 0x44, 0x0F, 0x10, 0x6C,
+    0x24, 0x10, 0xF2, 0x44, 0x0F, 0x10, 0x74, 0x24, 0x18, 0xF2, 0x44, 0x0F,
+    0x10, 0x7C, 0x24, 0x20, 0x48, 0x83, 0xC4, 0x28, 0x5B, 0x5D, 0x41, 0x5C,
+    0x41, 0x5D, 0x41, 0x5E, 0x41, 0x5F, 0xC3,
 };
 static constexpr uint8_t expected_cfi_kX86_64[] = {
     0x42, 0x0E, 0x10, 0x8F, 0x04, 0x42, 0x0E, 0x18, 0x8E, 0x06, 0x42, 0x0E,
     0x20, 0x8D, 0x08, 0x42, 0x0E, 0x28, 0x8C, 0x0A, 0x41, 0x0E, 0x30, 0x86,
-    0x0C, 0x41, 0x0E, 0x38, 0x83, 0x0E, 0x44, 0x0E, 0x70, 0x47, 0xA0, 0x10,
-    0x47, 0x9F, 0x12, 0x47, 0x9E, 0x14, 0x47, 0x9D, 0x16, 0x60, 0x0E, 0x90,
-    0x01, 0x44, 0x0E, 0x70, 0x0A, 0x47, 0xDD, 0x47, 0xDE, 0x47, 0xDF, 0x47,
+    0x0C, 0x41, 0x0E, 0x38, 0x83, 0x0E, 0x44, 0x0E, 0x60, 0x47, 0xA0, 0x10,
+    0x47, 0x9F, 0x12, 0x47, 0x9E, 0x14, 0x47, 0x9D, 0x16, 0x5A, 0x0E, 0x80,
+    0x01, 0x44, 0x0E, 0x60, 0x0A, 0x47, 0xDD, 0x47, 0xDE, 0x47, 0xDF, 0x47,
     0xE0, 0x44, 0x0E, 0x38, 0x41, 0x0E, 0x30, 0xC3, 0x41, 0x0E, 0x28, 0xC6,
     0x42, 0x0E, 0x20, 0xCC, 0x42, 0x0E, 0x18, 0xCD, 0x42, 0x0E, 0x10, 0xCE,
-    0x42, 0x0E, 0x08, 0xCF, 0x41, 0x0B, 0x0E, 0x70,
+    0x42, 0x0E, 0x08, 0xCF, 0x41, 0x0B, 0x0E, 0x60,
 };
 // 0x00000000: push r15
 // 0x00000002: .cfi_def_cfa_offset: 16
@@ -285,55 +284,55 @@
 // 0x00000009: push rbx
 // 0x0000000a: .cfi_def_cfa_offset: 56
 // 0x0000000a: .cfi_offset: r3 at cfa-56
-// 0x0000000a: subq rsp, 56
-// 0x0000000e: .cfi_def_cfa_offset: 112
-// 0x0000000e: movsd [rsp + 48], xmm15
+// 0x0000000a: subq rsp, 40
+// 0x0000000e: .cfi_def_cfa_offset: 96
+// 0x0000000e: movsd [rsp + 32], xmm15
 // 0x00000015: .cfi_offset: r32 at cfa-64
-// 0x00000015: movsd [rsp + 40], xmm14
+// 0x00000015: movsd [rsp + 24], xmm14
 // 0x0000001c: .cfi_offset: r31 at cfa-72
-// 0x0000001c: movsd [rsp + 32], xmm13
+// 0x0000001c: movsd [rsp + 16], xmm13
 // 0x00000023: .cfi_offset: r30 at cfa-80
-// 0x00000023: movsd [rsp + 24], xmm12
+// 0x00000023: movsd [rsp + 8], xmm12
 // 0x0000002a: .cfi_offset: r29 at cfa-88
 // 0x0000002a: movq [rsp], rdi
-// 0x0000002e: mov [rsp + 120], esi
-// 0x00000032: movss [rsp + 124], xmm0
-// 0x00000038: mov [rsp + 128], edx
-// 0x0000003f: mov [rsp + 132], ecx
-// 0x00000046: addq rsp, -32
-// 0x0000004a: .cfi_def_cfa_offset: 144
-// 0x0000004a: addq rsp, 32
-// 0x0000004e: .cfi_def_cfa_offset: 112
-// 0x0000004e: .cfi_remember_state
-// 0x0000004e: movsd xmm12, [rsp + 24]
-// 0x00000055: .cfi_restore: r29
-// 0x00000055: movsd xmm13, [rsp + 32]
-// 0x0000005c: .cfi_restore: r30
-// 0x0000005c: movsd xmm14, [rsp + 40]
-// 0x00000063: .cfi_restore: r31
-// 0x00000063: movsd xmm15, [rsp + 48]
-// 0x0000006a: .cfi_restore: r32
-// 0x0000006a: addq rsp, 56
-// 0x0000006e: .cfi_def_cfa_offset: 56
-// 0x0000006e: pop rbx
-// 0x0000006f: .cfi_def_cfa_offset: 48
-// 0x0000006f: .cfi_restore: r3
-// 0x0000006f: pop rbp
-// 0x00000070: .cfi_def_cfa_offset: 40
-// 0x00000070: .cfi_restore: r6
-// 0x00000070: pop r12
-// 0x00000072: .cfi_def_cfa_offset: 32
-// 0x00000072: .cfi_restore: r12
-// 0x00000072: pop r13
-// 0x00000074: .cfi_def_cfa_offset: 24
-// 0x00000074: .cfi_restore: r13
-// 0x00000074: pop r14
-// 0x00000076: .cfi_def_cfa_offset: 16
-// 0x00000076: .cfi_restore: r14
-// 0x00000076: pop r15
-// 0x00000078: .cfi_def_cfa_offset: 8
-// 0x00000078: .cfi_restore: r15
-// 0x00000078: ret
-// 0x00000079: .cfi_restore_state
-// 0x00000079: .cfi_def_cfa_offset: 112
+// 0x0000002e: mov [rsp + 104], esi
+// 0x00000032: movss [rsp + 108], xmm0
+// 0x00000038: mov [rsp + 112], edx
+// 0x0000003c: mov [rsp + 116], ecx
+// 0x00000040: addq rsp, -32
+// 0x00000044: .cfi_def_cfa_offset: 128
+// 0x00000044: addq rsp, 32
+// 0x00000048: .cfi_def_cfa_offset: 96
+// 0x00000048: .cfi_remember_state
+// 0x00000048: movsd xmm12, [rsp + 8]
+// 0x0000004f: .cfi_restore: r29
+// 0x0000004f: movsd xmm13, [rsp + 16]
+// 0x00000056: .cfi_restore: r30
+// 0x00000056: movsd xmm14, [rsp + 24]
+// 0x0000005d: .cfi_restore: r31
+// 0x0000005d: movsd xmm15, [rsp + 32]
+// 0x00000064: .cfi_restore: r32
+// 0x00000064: addq rsp, 40
+// 0x00000068: .cfi_def_cfa_offset: 56
+// 0x00000068: pop rbx
+// 0x00000069: .cfi_def_cfa_offset: 48
+// 0x00000069: .cfi_restore: r3
+// 0x00000069: pop rbp
+// 0x0000006a: .cfi_def_cfa_offset: 40
+// 0x0000006a: .cfi_restore: r6
+// 0x0000006a: pop r12
+// 0x0000006c: .cfi_def_cfa_offset: 32
+// 0x0000006c: .cfi_restore: r12
+// 0x0000006c: pop r13
+// 0x0000006e: .cfi_def_cfa_offset: 24
+// 0x0000006e: .cfi_restore: r13
+// 0x0000006e: pop r14
+// 0x00000070: .cfi_def_cfa_offset: 16
+// 0x00000070: .cfi_restore: r14
+// 0x00000070: pop r15
+// 0x00000072: .cfi_def_cfa_offset: 8
+// 0x00000072: .cfi_restore: r15
+// 0x00000072: ret
+// 0x00000073: .cfi_restore_state
+// 0x00000073: .cfi_def_cfa_offset: 96
 
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index da438bd..c1afdb8 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -32,8 +32,8 @@
 
 // List of parameters passed via registers for JNI.
 // JNI uses soft-float, so there is only a GPR list.
-static const Register kJniArgumentRegisters[] = {
-  R0, R1, R2, R3
+static constexpr Register kJniArgumentRegisters[] = {
+    R0, R1, R2, R3
 };
 
 static_assert(kJniArgumentRegisterCount == arraysize(kJniArgumentRegisters));
@@ -43,20 +43,23 @@
 //
 
 // Used by hard float. (General purpose registers.)
-static const Register kHFCoreArgumentRegisters[] = {
-  R0, R1, R2, R3
+static constexpr ManagedRegister kHFCoreArgumentRegisters[] = {
+    ArmManagedRegister::FromCoreRegister(R0),
+    ArmManagedRegister::FromCoreRegister(R1),
+    ArmManagedRegister::FromCoreRegister(R2),
+    ArmManagedRegister::FromCoreRegister(R3),
 };
 static constexpr size_t kHFCoreArgumentRegistersCount = arraysize(kHFCoreArgumentRegisters);
 
 // (VFP single-precision registers.)
-static const SRegister kHFSArgumentRegisters[] = {
-  S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
+static constexpr SRegister kHFSArgumentRegisters[] = {
+    S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
 };
 static constexpr size_t kHFSArgumentRegistersCount = arraysize(kHFSArgumentRegisters);
 
 // (VFP double-precision registers.)
-static const DRegister kHFDArgumentRegisters[] = {
-  D0, D1, D2, D3, D4, D5, D6, D7
+static constexpr DRegister kHFDArgumentRegisters[] = {
+    D0, D1, D2, D3, D4, D5, D6, D7
 };
 static constexpr size_t kHFDArgumentRegistersCount = arraysize(kHFDArgumentRegisters);
 
@@ -159,7 +162,7 @@
 
 // Calling convention
 
-ManagedRegister ArmManagedRuntimeCallingConvention::ReturnRegister() {
+ManagedRegister ArmManagedRuntimeCallingConvention::ReturnRegister() const {
   switch (GetShorty()[0]) {
     case 'V':
       return ArmManagedRegister::NoRegister();
@@ -174,7 +177,7 @@
   }
 }
 
-ManagedRegister ArmJniCallingConvention::ReturnRegister() {
+ManagedRegister ArmJniCallingConvention::ReturnRegister() const {
   switch (GetShorty()[0]) {
   case 'V':
     return ArmManagedRegister::NoRegister();
@@ -186,7 +189,7 @@
   }
 }
 
-ManagedRegister ArmJniCallingConvention::IntReturnRegister() {
+ManagedRegister ArmJniCallingConvention::IntReturnRegister() const {
   return ArmManagedRegister::FromCoreRegister(R0);
 }
 
@@ -272,7 +275,7 @@
       CHECK_EQ(RoundUp(gpr_index_, 2u), 2u);
       return ArmManagedRegister::FromRegisterPair(R2_R3);
     } else {
-      return ArmManagedRegister::FromCoreRegister(kHFCoreArgumentRegisters[gpr_index_]);
+      return kHFCoreArgumentRegisters[gpr_index_];
     }
   }
 }
@@ -400,11 +403,27 @@
   return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters).SubArray(kStart, kLength);
 }
 
+ArrayRef<const ManagedRegister> ArmJniCallingConvention::ArgumentScratchRegisters() const {
+  DCHECK(!IsCriticalNative());
+  // Exclude r0 or r0-r1 if they are used as return registers.
+  static_assert(kHFCoreArgumentRegisters[0].Equals(ArmManagedRegister::FromCoreRegister(R0)));
+  static_assert(kHFCoreArgumentRegisters[1].Equals(ArmManagedRegister::FromCoreRegister(R1)));
+  ArrayRef<const ManagedRegister> scratch_regs(kHFCoreArgumentRegisters);
+  ArmManagedRegister return_reg = ReturnRegister().AsArm();
+  auto return_reg_overlaps = [return_reg](ManagedRegister reg) {
+    return return_reg.Overlaps(reg.AsArm());
+  };
+  if (return_reg_overlaps(scratch_regs[0])) {
+    scratch_regs = scratch_regs.SubArray(/*pos=*/ return_reg_overlaps(scratch_regs[1]) ? 2u : 1u);
+  }
+  DCHECK(std::none_of(scratch_regs.begin(), scratch_regs.end(), return_reg_overlaps));
+  return scratch_regs;
+}
+
 size_t ArmJniCallingConvention::FrameSize() const {
   if (UNLIKELY(is_critical_native_)) {
     CHECK(!SpillsMethod());
     CHECK(!HasLocalReferenceSegmentState());
-    CHECK(!SpillsReturnValue());
     return 0u;  // There is no managed frame for @CriticalNative.
   }
 
@@ -417,19 +436,6 @@
   DCHECK(HasLocalReferenceSegmentState());
   // Cookie is saved in one of the spilled registers.
 
-  // Plus return value spill area size
-  if (SpillsReturnValue()) {
-    // For 64-bit return values there shall be a 4B alignment gap between
-    // the method pointer and the saved return value.
-    size_t padding = ReturnValueSaveLocation().SizeValue() - method_ptr_size;
-    DCHECK_EQ(padding,
-              (GetReturnType() == Primitive::kPrimLong || GetReturnType() == Primitive::kPrimDouble)
-                  ? 4u
-                  : 0u);
-    total_size += padding;
-    total_size += SizeOfReturnValue();
-  }
-
   return RoundUp(total_size, kStackAlignment);
 }
 
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 94dacc4..4526d9e 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -35,7 +35,7 @@
         double_index_(0u) {}
   ~ArmManagedRuntimeCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
   void ResetIterator(FrameOffset displacement) override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
@@ -61,14 +61,15 @@
                           const char* shorty);
   ~ArmJniCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
-  ManagedRegister IntReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
+  ManagedRegister IntReturnRegister() const override;
   // JNI calling convention
   void Next() override;  // Override default behavior for AAPCS
   size_t FrameSize() const override;
   size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const override;
+  ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const override;
   uint32_t CoreSpillMask() const override;
   uint32_t FpSpillMask() const override;
   bool IsCurrentParamInRegister() override;
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index f816a69..ec77db3 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -25,16 +25,18 @@
 namespace art {
 namespace arm64 {
 
-static const XRegister kXArgumentRegisters[] = {
-  X0, X1, X2, X3, X4, X5, X6, X7
+static constexpr ManagedRegister kXArgumentRegisters[] = {
+    Arm64ManagedRegister::FromXRegister(X0),
+    Arm64ManagedRegister::FromXRegister(X1),
+    Arm64ManagedRegister::FromXRegister(X2),
+    Arm64ManagedRegister::FromXRegister(X3),
+    Arm64ManagedRegister::FromXRegister(X4),
+    Arm64ManagedRegister::FromXRegister(X5),
+    Arm64ManagedRegister::FromXRegister(X6),
+    Arm64ManagedRegister::FromXRegister(X7),
 };
 static_assert(kMaxIntLikeRegisterArguments == arraysize(kXArgumentRegisters));
 
-static const WRegister kWArgumentRegisters[] = {
-  W0, W1, W2, W3, W4, W5, W6, W7
-};
-static_assert(kMaxIntLikeRegisterArguments == arraysize(kWArgumentRegisters));
-
 static const DRegister kDArgumentRegisters[] = {
   D0, D1, D2, D3, D4, D5, D6, D7
 };
@@ -154,15 +156,15 @@
   }
 }
 
-ManagedRegister Arm64ManagedRuntimeCallingConvention::ReturnRegister() {
+ManagedRegister Arm64ManagedRuntimeCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty());
 }
 
-ManagedRegister Arm64JniCallingConvention::ReturnRegister() {
+ManagedRegister Arm64JniCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty());
 }
 
-ManagedRegister Arm64JniCallingConvention::IntReturnRegister() {
+ManagedRegister Arm64JniCallingConvention::IntReturnRegister() const {
   return Arm64ManagedRegister::FromWRegister(W0);
 }
 
@@ -195,12 +197,11 @@
     }
   } else {
     size_t non_fp_arg_number = itr_args_ - itr_float_and_doubles_;
+    ManagedRegister x_reg = kXArgumentRegisters[/* method */ 1u + non_fp_arg_number];
     if (IsCurrentParamALong()) {
-      XRegister x_reg = kXArgumentRegisters[/* method */ 1u + non_fp_arg_number];
-      return Arm64ManagedRegister::FromXRegister(x_reg);
+      return x_reg;
     } else {
-      WRegister w_reg = kWArgumentRegisters[/* method */ 1u + non_fp_arg_number];
-      return Arm64ManagedRegister::FromWRegister(w_reg);
+      return Arm64ManagedRegister::FromWRegister(x_reg.AsArm64().AsOverlappingWRegister());
     }
   }
 }
@@ -247,11 +248,26 @@
   return ArrayRef<const ManagedRegister>(kAapcs64CalleeSaveRegisters).SubArray(kStart, kLength);
 }
 
+ArrayRef<const ManagedRegister> Arm64JniCallingConvention::ArgumentScratchRegisters() const {
+  DCHECK(!IsCriticalNative());
+  // Exclude x0 if it's used as a return register.
+  static_assert(kXArgumentRegisters[0].Equals(Arm64ManagedRegister::FromXRegister(X0)));
+  ArrayRef<const ManagedRegister> scratch_regs(kXArgumentRegisters);
+  Arm64ManagedRegister return_reg = ReturnRegister().AsArm64();
+  auto return_reg_overlaps = [return_reg](ManagedRegister reg) {
+    return return_reg.Overlaps(reg.AsArm64());
+  };
+  if (return_reg_overlaps(scratch_regs[0])) {
+    scratch_regs = scratch_regs.SubArray(/*pos=*/ 1u);
+  }
+  DCHECK(std::none_of(scratch_regs.begin(), scratch_regs.end(), return_reg_overlaps));
+  return scratch_regs;
+}
+
 size_t Arm64JniCallingConvention::FrameSize() const {
   if (is_critical_native_) {
     CHECK(!SpillsMethod());
     CHECK(!HasLocalReferenceSegmentState());
-    CHECK(!SpillsReturnValue());
     return 0u;  // There is no managed frame for @CriticalNative.
   }
 
@@ -264,13 +280,6 @@
   DCHECK(HasLocalReferenceSegmentState());
   // Cookie is saved in one of the spilled registers.
 
-  // Plus return value spill area size
-  if (SpillsReturnValue()) {
-    // No padding between the method pointer and the return value on arm64.
-    DCHECK_EQ(ReturnValueSaveLocation().SizeValue(), method_ptr_size);
-    total_size += SizeOfReturnValue();
-  }
-
   return RoundUp(total_size, kStackAlignment);
 }
 
@@ -343,10 +352,11 @@
   } else {
     int gp_reg = itr_args_ - itr_float_and_doubles_;
     CHECK_LT(static_cast<unsigned int>(gp_reg), kMaxIntLikeRegisterArguments);
+    ManagedRegister x_reg = kXArgumentRegisters[gp_reg];
     if (IsCurrentParamALong() || IsCurrentParamAReference() || IsCurrentParamJniEnv())  {
-      return Arm64ManagedRegister::FromXRegister(kXArgumentRegisters[gp_reg]);
+      return x_reg;
     } else {
-      return Arm64ManagedRegister::FromWRegister(kWArgumentRegisters[gp_reg]);
+      return Arm64ManagedRegister::FromWRegister(x_reg.AsArm64().AsOverlappingWRegister());
     }
   }
 }
@@ -374,7 +384,7 @@
                       }));
   DCHECK(std::none_of(kXArgumentRegisters,
                       kXArgumentRegisters + std::size(kXArgumentRegisters),
-                      [](XRegister arg) { return arg == X15; }));
+                      [](ManagedRegister arg) { return arg.AsArm64().AsXRegister() == X15; }));
 }
 
 ManagedRegister Arm64JniCallingConvention::LockingArgumentRegister() const {
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 003b0c3..176271e 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -32,7 +32,7 @@
                                         PointerSize::k64) {}
   ~Arm64ManagedRuntimeCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -53,13 +53,14 @@
                             const char* shorty);
   ~Arm64JniCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
-  ManagedRegister IntReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
+  ManagedRegister IntReturnRegister() const override;
   // JNI calling convention
   size_t FrameSize() const override;
   size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const override;
+  ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const override;
   uint32_t CoreSpillMask() const override;
   uint32_t FpSpillMask() const override;
   bool IsCurrentParamInRegister() override;
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index e7a84fd..eb4d372 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -174,19 +174,6 @@
   return NumReferenceArgs() + (IsStatic() ? 1 : 0);
 }
 
-FrameOffset JniCallingConvention::ReturnValueSaveLocation() const {
-  // The saved return value goes at a properly aligned slot after the method pointer.
-  DCHECK(SpillsReturnValue());
-  size_t return_value_offset = static_cast<size_t>(frame_pointer_size_);
-  const size_t return_value_size = SizeOfReturnValue();
-  DCHECK(return_value_size == 4u || return_value_size == 8u) << return_value_size;
-  DCHECK_ALIGNED(return_value_offset, 4u);
-  if (return_value_size == 8u) {
-    return_value_offset = RoundUp(return_value_offset, 8u);
-  }
-  return FrameOffset(displacement_.SizeValue() + return_value_offset);
-}
-
 bool JniCallingConvention::HasNext() {
   if (IsCurrentArgExtraForJni()) {
     return true;
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 0be5233..e2f3bfb 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -46,7 +46,7 @@
   }
 
   // Register that holds result of this method invocation.
-  virtual ManagedRegister ReturnRegister() = 0;
+  virtual ManagedRegister ReturnRegister() const = 0;
 
   // Iterator interface
 
@@ -305,11 +305,8 @@
   virtual size_t OutFrameSize() const = 0;
   // Number of references in stack indirect reference table
   size_t ReferenceCount() const;
-  // Location where the return value of a call can be squirreled if another
-  // call is made following the native call
-  FrameOffset ReturnValueSaveLocation() const;
   // Register that holds result if it is integer.
-  virtual ManagedRegister IntReturnRegister() = 0;
+  virtual ManagedRegister IntReturnRegister() const = 0;
   // Whether the compiler needs to ensure zero-/sign-extension of a small result type
   virtual bool RequiresSmallResultTypeExtension() const = 0;
 
@@ -322,6 +319,10 @@
   // JNI compiler currently requires at least 3 callee save scratch registers.
   virtual ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const = 0;
 
+  // Subset of core argument registers that can be used for arbitrary purposes after
+  // calling the native function. These should exclude the return register(s).
+  virtual ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const = 0;
+
   // Spill mask values
   virtual uint32_t CoreSpillMask() const = 0;
   virtual uint32_t FpSpillMask() const = 0;
@@ -383,14 +384,6 @@
            return_type == Primitive::kPrimChar;
   }
 
-  // Does the transition back spill the return value in the stack frame?
-  bool SpillsReturnValue() const {
-    // Exclude return value for @FastNative and @CriticalNative methods for optimization speed.
-    // References are passed directly to the "end method" and there is nothing to save for `void`.
-    return (!IsFastNative() && !IsCriticalNative()) &&
-           (!IsReturnAReference() && SizeOfReturnValue() != 0u);
-  }
-
  protected:
   // Named iterator positions
   enum IteratorPos {
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 8bb6cc5..be519c1 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -74,19 +74,6 @@
   return JNIMacroAssembler<kPointerSize>::Create(allocator, isa, features);
 }
 
-template <PointerSize kPointerSize>
-static ThreadOffset<kPointerSize> GetJniMethodEndThreadOffset(bool reference_return) {
-  ThreadOffset<kPointerSize> jni_end(-1);
-  if (reference_return) {
-    // Pass result.
-    jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
-  } else {
-    jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd);
-  }
-
-  return jni_end;
-}
-
 
 // Generate the JNI bridge for the given method, general contract:
 // - Arguments are in the managed runtime format, either on stack or in
@@ -422,25 +409,8 @@
     }
   }
 
-  // 5. Transition to Runnable (if normal native).
-
-  // 5.1. Spill or move the return value if needed.
-  // TODO: Use `callee_save_temp` instead of stack slot when possible.
-  bool spill_return_value = main_jni_conv->SpillsReturnValue();
-  FrameOffset return_save_location =
-      spill_return_value ? main_jni_conv->ReturnValueSaveLocation() : FrameOffset(0);
-  if (spill_return_value) {
-    DCHECK(!is_critical_native);
-    // For normal JNI, store the return value on the stack because the call to
-    // JniMethodEnd will clobber the return value. It will be restored in (13).
-    CHECK_LT(return_save_location.Uint32Value(), current_frame_size);
-    __ Store(return_save_location,
-             main_jni_conv->ReturnRegister(),
-             main_jni_conv->SizeOfReturnValue());
-  } else if (UNLIKELY(is_fast_native || is_critical_native) &&
-             main_jni_conv->SizeOfReturnValue() != 0) {
-    // For @FastNative and @CriticalNative only,
-    // move the JNI return register into the managed return register (if they don't match).
+  // 4.6. Move the JNI return register into the managed return register (if they don't match).
+  if (main_jni_conv->SizeOfReturnValue() != 0) {
     ManagedRegister jni_return_reg = main_jni_conv->ReturnRegister();
     ManagedRegister mr_return_reg = mr_conv->ReturnRegister();
 
@@ -460,11 +430,27 @@
     }
   }
 
-  // 5.2. For @FastNative that returns a reference, do an early exception check so that the
+  // 5. Transition to Runnable (if normal native).
+
+  // 5.1. Try transitioning to Runnable with a fast-path implementation.
+  //      If fast-path fails, make a slow-path call to `JniMethodEnd()`.
+  std::unique_ptr<JNIMacroLabel> transition_to_runnable_slow_path;
+  std::unique_ptr<JNIMacroLabel> transition_to_runnable_resume;
+  if (LIKELY(!is_critical_native && !is_fast_native)) {
+    transition_to_runnable_slow_path = __ CreateLabel();
+    transition_to_runnable_resume = __ CreateLabel();
+    __ TryToTransitionFromNativeToRunnable(transition_to_runnable_slow_path.get(),
+                                           main_jni_conv->ArgumentScratchRegisters(),
+                                           mr_conv->ReturnRegister());
+    __ Bind(transition_to_runnable_resume.get());
+  }
+
+  // 5.2. For methods that return a reference, do an early exception check so that the
   //      `JniDecodeReferenceResult()` in the main path does not need to check for exceptions.
   std::unique_ptr<JNIMacroLabel> exception_slow_path =
       LIKELY(!is_critical_native) ? __ CreateLabel() : nullptr;
-  if (UNLIKELY(is_fast_native) && reference_return) {
+  if (reference_return) {
+    DCHECK(!is_critical_native);
     __ ExceptionPoll(exception_slow_path.get());
   }
 
@@ -479,33 +465,23 @@
     __ Bind(suspend_check_resume.get());
   }
 
-  if (LIKELY(!is_critical_native)) {
-    // 5.4. Call JniMethodEnd for normal native.
-    //      For @FastNative with reference return, decode the `jobject`.
-    //      We abuse the JNI calling convention here, that is guaranteed to support passing
-    //      two pointer arguments, `JNIEnv*` and `jclass`/`jobject`, enough for all cases.
+  // 5.4 For methods with reference return, decode the `jobject` with `JniDecodeReferenceResult()`.
+  if (reference_return) {
+    DCHECK(!is_critical_native);
+    // We abuse the JNI calling convention here, that is guaranteed to support passing
+    // two pointer arguments, `JNIEnv*` and `jclass`/`jobject`.
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    if (LIKELY(!is_fast_native) || reference_return) {
-      ThreadOffset<kPointerSize> jni_end = is_fast_native
-          ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniDecodeReferenceResult)
-          : GetJniMethodEndThreadOffset<kPointerSize>(reference_return);
-      if (reference_return) {
-        // Pass result.
-        SetNativeParameter(jni_asm.get(), main_jni_conv.get(), main_jni_conv->ReturnRegister());
-        main_jni_conv->Next();
-      }
-      if (main_jni_conv->IsCurrentParamInRegister()) {
-        __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
-        __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_end));
-      } else {
-        __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset());
-        __ CallFromThread(jni_end);
-      }
-    }
-
-    // 5.5. Reload return value if it was spilled.
-    if (spill_return_value) {
-      __ Load(mr_conv->ReturnRegister(), return_save_location, mr_conv->SizeOfReturnValue());
+    ThreadOffset<kPointerSize> jni_decode_reference_result =
+        QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniDecodeReferenceResult);
+    // Pass result.
+    SetNativeParameter(jni_asm.get(), main_jni_conv.get(), mr_conv->ReturnRegister());
+    main_jni_conv->Next();
+    if (main_jni_conv->IsCurrentParamInRegister()) {
+      __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
+      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_decode_reference_result));
+    } else {
+      __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset());
+      __ CallFromThread(jni_decode_reference_result);
     }
   }  // if (!is_critical_native)
 
@@ -546,8 +522,8 @@
 
   // 7.3. Process pending exceptions from JNI call or monitor exit.
   //      @CriticalNative methods do not need exception poll in the stub.
-  //      @FastNative methods with reference return emit the exception poll earlier.
-  if (LIKELY(!is_critical_native) && (LIKELY(!is_fast_native) || !reference_return)) {
+  //      Methods with reference return emit the exception poll earlier.
+  if (LIKELY(!is_critical_native) && !reference_return) {
     __ ExceptionPoll(exception_slow_path.get());
   }
 
@@ -614,7 +590,14 @@
     __ Jump(transition_to_native_resume.get());
   }
 
-  // 8.3. Suspend check slow path.
+  // 8.3. Slow path for transition to Runnable.
+  if (LIKELY(!is_critical_native && !is_fast_native)) {
+    __ Bind(transition_to_runnable_slow_path.get());
+    __ CallFromThread(QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd));
+    __ Jump(transition_to_runnable_resume.get());
+  }
+
+  // 8.4. Suspend check slow path.
   if (UNLIKELY(is_fast_native)) {
     __ Bind(suspend_check_slow_path.get());
     if (reference_return && main_out_arg_size != 0) {
@@ -634,10 +617,10 @@
     __ Jump(suspend_check_resume.get());
   }
 
-  // 8.4. Exception poll slow path(s).
+  // 8.5. Exception poll slow path(s).
   if (LIKELY(!is_critical_native)) {
     __ Bind(exception_slow_path.get());
-    if (UNLIKELY(is_fast_native) && reference_return) {
+    if (reference_return) {
       // We performed the exception check early, so we need to adjust SP and pop IRT frame.
       if (main_out_arg_size != 0) {
         jni_asm->cfi().AdjustCFAOffset(main_out_arg_size);
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 2fb063f..65be92c 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -25,8 +25,11 @@
 namespace art {
 namespace x86 {
 
-static constexpr Register kManagedCoreArgumentRegisters[] = {
-    EAX, ECX, EDX, EBX
+static constexpr ManagedRegister kManagedCoreArgumentRegisters[] = {
+    X86ManagedRegister::FromCpuRegister(EAX),
+    X86ManagedRegister::FromCpuRegister(ECX),
+    X86ManagedRegister::FromCpuRegister(EDX),
+    X86ManagedRegister::FromCpuRegister(EBX),
 };
 static constexpr size_t kManagedCoreArgumentRegistersCount =
     arraysize(kManagedCoreArgumentRegisters);
@@ -79,6 +82,33 @@
   return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
 }
 
+ArrayRef<const ManagedRegister> X86JniCallingConvention::ArgumentScratchRegisters() const {
+  DCHECK(!IsCriticalNative());
+  // Exclude EAX or EAX/EDX if they are used as return registers.
+  // Due to the odd ordering of argument registers, use a re-ordered array (pull EDX forward).
+  static constexpr ManagedRegister kArgumentRegisters[] = {
+      X86ManagedRegister::FromCpuRegister(EAX),
+      X86ManagedRegister::FromCpuRegister(EDX),
+      X86ManagedRegister::FromCpuRegister(ECX),
+      X86ManagedRegister::FromCpuRegister(EBX),
+  };
+  static_assert(arraysize(kArgumentRegisters) == kManagedCoreArgumentRegistersCount);
+  static_assert(kManagedCoreArgumentRegisters[0].Equals(kArgumentRegisters[0]));
+  static_assert(kManagedCoreArgumentRegisters[1].Equals(kArgumentRegisters[2]));
+  static_assert(kManagedCoreArgumentRegisters[2].Equals(kArgumentRegisters[1]));
+  static_assert(kManagedCoreArgumentRegisters[3].Equals(kArgumentRegisters[3]));
+  ArrayRef<const ManagedRegister> scratch_regs(kArgumentRegisters);
+  X86ManagedRegister return_reg = ReturnRegister().AsX86();
+  auto return_reg_overlaps = [return_reg](ManagedRegister reg) {
+    return return_reg.Overlaps(reg.AsX86());
+  };
+  if (return_reg_overlaps(scratch_regs[0])) {
+    scratch_regs = scratch_regs.SubArray(/*pos=*/ return_reg_overlaps(scratch_regs[1]) ? 2u : 1u);
+  }
+  DCHECK(std::none_of(scratch_regs.begin(), scratch_regs.end(), return_reg_overlaps));
+  return scratch_regs;
+}
+
 static ManagedRegister ReturnRegisterForShorty(const char* shorty, bool jni) {
   if (shorty[0] == 'F' || shorty[0] == 'D') {
     if (jni) {
@@ -95,15 +125,15 @@
   }
 }
 
-ManagedRegister X86ManagedRuntimeCallingConvention::ReturnRegister() {
+ManagedRegister X86ManagedRuntimeCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty(), false);
 }
 
-ManagedRegister X86JniCallingConvention::ReturnRegister() {
+ManagedRegister X86JniCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty(), true);
 }
 
-ManagedRegister X86JniCallingConvention::IntReturnRegister() {
+ManagedRegister X86JniCallingConvention::IntReturnRegister() const {
   return X86ManagedRegister::FromCpuRegister(EAX);
 }
 
@@ -149,20 +179,19 @@
     if (IsCurrentParamALong()) {
       switch (gpr_arg_count_) {
         case 1:
-          static_assert(kManagedCoreArgumentRegisters[1] == ECX);
-          static_assert(kManagedCoreArgumentRegisters[2] == EDX);
+          static_assert(kManagedCoreArgumentRegisters[1].AsX86().AsCpuRegister() == ECX);
+          static_assert(kManagedCoreArgumentRegisters[2].AsX86().AsCpuRegister() == EDX);
           return X86ManagedRegister::FromRegisterPair(ECX_EDX);
         case 2:
-          static_assert(kManagedCoreArgumentRegisters[2] == EDX);
-          static_assert(kManagedCoreArgumentRegisters[3] == EBX);
+          static_assert(kManagedCoreArgumentRegisters[2].AsX86().AsCpuRegister() == EDX);
+          static_assert(kManagedCoreArgumentRegisters[3].AsX86().AsCpuRegister() == EBX);
           return X86ManagedRegister::FromRegisterPair(EDX_EBX);
         default:
           LOG(FATAL) << "UNREACHABLE";
           UNREACHABLE();
       }
     } else {
-      Register core_reg = kManagedCoreArgumentRegisters[gpr_arg_count_];
-      return X86ManagedRegister::FromCpuRegister(core_reg);
+      return kManagedCoreArgumentRegisters[gpr_arg_count_];
     }
   }
 }
@@ -200,7 +229,6 @@
   if (is_critical_native_) {
     CHECK(!SpillsMethod());
     CHECK(!HasLocalReferenceSegmentState());
-    CHECK(!SpillsReturnValue());
     return 0u;  // There is no managed frame for @CriticalNative.
   }
 
@@ -214,19 +242,6 @@
   DCHECK(HasLocalReferenceSegmentState());
   // Cookie is saved in one of the spilled registers.
 
-  // Plus return value spill area size
-  if (SpillsReturnValue()) {
-    // For 64-bit return values there shall be a 4B alignment gap between
-    // the method pointer and the saved return value.
-    size_t padding = ReturnValueSaveLocation().SizeValue() - method_ptr_size;
-    DCHECK_EQ(padding,
-              (GetReturnType() == Primitive::kPrimLong || GetReturnType() == Primitive::kPrimDouble)
-                  ? 4u
-                  : 0u);
-    total_size += padding;
-    total_size += SizeOfReturnValue();
-  }
-
   return RoundUp(total_size, kStackAlignment);
 }
 
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index f028090..cd7ef5b 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -33,7 +33,7 @@
         gpr_arg_count_(1u) {}  // Skip EAX for ArtMethod*
   ~X86ManagedRuntimeCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
   void ResetIterator(FrameOffset displacement) override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
@@ -58,13 +58,14 @@
                           const char* shorty);
   ~X86JniCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
-  ManagedRegister IntReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
+  ManagedRegister IntReturnRegister() const override;
   // JNI calling convention
   size_t FrameSize() const override;
   size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const override;
+  ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const override;
   uint32_t CoreSpillMask() const override;
   uint32_t FpSpillMask() const override;
   bool IsCurrentParamInRegister() override;
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 469de42..862ee5e 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -26,8 +26,13 @@
 namespace art {
 namespace x86_64 {
 
-static constexpr Register kCoreArgumentRegisters[] = {
-    RDI, RSI, RDX, RCX, R8, R9
+static constexpr ManagedRegister kCoreArgumentRegisters[] = {
+    X86_64ManagedRegister::FromCpuRegister(RDI),
+    X86_64ManagedRegister::FromCpuRegister(RSI),
+    X86_64ManagedRegister::FromCpuRegister(RDX),
+    X86_64ManagedRegister::FromCpuRegister(RCX),
+    X86_64ManagedRegister::FromCpuRegister(R8),
+    X86_64ManagedRegister::FromCpuRegister(R9),
 };
 static_assert(kMaxIntLikeRegisterArguments == arraysize(kCoreArgumentRegisters));
 
@@ -99,6 +104,19 @@
   return ArrayRef<const ManagedRegister>(kNativeCalleeSaveRegisters);
 }
 
+ArrayRef<const ManagedRegister> X86_64JniCallingConvention::ArgumentScratchRegisters() const {
+  DCHECK(!IsCriticalNative());
+  ArrayRef<const ManagedRegister> scratch_regs(kCoreArgumentRegisters);
+  if (kIsDebugBuild) {
+    X86_64ManagedRegister return_reg = ReturnRegister().AsX86_64();
+    auto return_reg_overlaps = [return_reg](ManagedRegister reg) {
+      return return_reg.Overlaps(reg.AsX86_64());
+    };
+    CHECK(std::none_of(scratch_regs.begin(), scratch_regs.end(), return_reg_overlaps));
+  }
+  return scratch_regs;
+}
+
 static ManagedRegister ReturnRegisterForShorty(const char* shorty, bool jni ATTRIBUTE_UNUSED) {
   if (shorty[0] == 'F' || shorty[0] == 'D') {
     return X86_64ManagedRegister::FromXmmRegister(XMM0);
@@ -111,15 +129,15 @@
   }
 }
 
-ManagedRegister X86_64ManagedRuntimeCallingConvention::ReturnRegister() {
+ManagedRegister X86_64ManagedRuntimeCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty(), false);
 }
 
-ManagedRegister X86_64JniCallingConvention::ReturnRegister() {
+ManagedRegister X86_64JniCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty(), true);
 }
 
-ManagedRegister X86_64JniCallingConvention::IntReturnRegister() {
+ManagedRegister X86_64JniCallingConvention::IntReturnRegister() const {
   return X86_64ManagedRegister::FromCpuRegister(RAX);
 }
 
@@ -150,8 +168,7 @@
     return X86_64ManagedRegister::FromXmmRegister(fp_reg);
   } else {
     size_t non_fp_arg_number = itr_args_ - itr_float_and_doubles_;
-    Register core_reg = kCoreArgumentRegisters[/* method */ 1u + non_fp_arg_number];
-    return X86_64ManagedRegister::FromCpuRegister(core_reg);
+    return kCoreArgumentRegisters[/* method */ 1u + non_fp_arg_number];
   }
 }
 
@@ -188,7 +205,6 @@
   if (is_critical_native_) {
     CHECK(!SpillsMethod());
     CHECK(!HasLocalReferenceSegmentState());
-    CHECK(!SpillsReturnValue());
     return 0u;  // There is no managed frame for @CriticalNative.
   }
 
@@ -202,13 +218,6 @@
   DCHECK(HasLocalReferenceSegmentState());
   // Cookie is saved in one of the spilled registers.
 
-  // Plus return value spill area size
-  if (SpillsReturnValue()) {
-    // No padding between the method pointer and the return value on arm64.
-    DCHECK_EQ(ReturnValueSaveLocation().SizeValue(), method_ptr_size);
-    total_size += SizeOfReturnValue();
-  }
-
   return RoundUp(total_size, kStackAlignment);
 }
 
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index fda5c0e..483f1f5 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -32,7 +32,7 @@
                                         PointerSize::k64) {}
   ~X86_64ManagedRuntimeCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -53,13 +53,14 @@
                              const char* shorty);
   ~X86_64JniCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
-  ManagedRegister IntReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
+  ManagedRegister IntReturnRegister() const override;
   // JNI calling convention
   size_t FrameSize() const override;
   size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const override;
+  ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const override;
   uint32_t CoreSpillMask() const override;
   uint32_t FpSpillMask() const override;
   bool IsCurrentParamInRegister() override;
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 2c1b4be..418cf57 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -1079,6 +1079,45 @@
   ___ Str(scratch, MemOperand(tr, thread_held_mutex_mutator_lock_offset.Int32Value()));
 }
 
+void ArmVIXLJNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kArmPointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kArmPointerSize>(kMutatorLock);
+  constexpr ThreadOffset32 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kArmPointerSize>();
+
+  // There must be at least two scratch registers.
+  DCHECK_GE(scratch_regs.size(), 2u);
+  DCHECK(!scratch_regs[0].AsArm().Overlaps(return_reg.AsArm()));
+  vixl32::Register scratch = AsVIXLRegister(scratch_regs[0].AsArm());
+  DCHECK(!scratch_regs[1].AsArm().Overlaps(return_reg.AsArm()));
+  vixl32::Register scratch2 = AsVIXLRegister(scratch_regs[1].AsArm());
+
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  vixl32::Label retry;
+  ___ Bind(&retry);
+  ___ Ldrex(scratch, MemOperand(tr, thread_flags_offset.Int32Value()));
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  ___ Eors(scratch2, scratch, kNativeStateValue);
+  ___ B(ne, ArmVIXLJNIMacroLabel::Cast(label)->AsArm());
+  static_assert(kRunnableStateValue == 0u);
+  ___ Strex(scratch, scratch2, MemOperand(tr, thread_flags_offset.Int32Value()));
+  ___ Cmp(scratch, 0);
+  ___ B(ne, &retry);
+  ___ Dmb(DmbOptions::ISH);  // Memory barrier "load-any" for the "acquire" operation.
+
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  ___ Ldr(scratch, MemOperand(tr, thread_mutator_lock_offset.Int32Value()));
+  ___ Str(scratch, MemOperand(tr, thread_held_mutex_mutator_lock_offset.Int32Value()));
+}
+
 void ArmVIXLJNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   vixl32::Register scratch = temps.Acquire();
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 7b9d7de..426502d 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -185,9 +185,18 @@
   void CallFromThread(ThreadOffset32 offset) override;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index e84fe04..df7bb5e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -917,6 +917,42 @@
   ___ Str(xzr, MEM_OP(reg_x(TR), thread_held_mutex_mutator_lock_offset.Int32Value()));
 }
 
+void Arm64JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs ATTRIBUTE_UNUSED,
+    ManagedRegister return_reg ATTRIBUTE_UNUSED) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kArm64PointerSize>();
+  constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kArm64PointerSize>(kMutatorLock);
+  constexpr ThreadOffset64 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kArm64PointerSize>();
+
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  Register scratch = temps.AcquireW();
+  Register scratch2 = temps.AcquireW();
+
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  vixl::aarch64::Label retry;
+  ___ Bind(&retry);
+  static_assert(thread_flags_offset.Int32Value() == 0);  // LDAXR/STXR require exact address.
+  ___ Ldaxr(scratch, MEM_OP(reg_x(TR)));
+  ___ Mov(scratch2, kNativeStateValue);
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  ___ Cmp(scratch, scratch2);
+  ___ B(ne, Arm64JNIMacroLabel::Cast(label)->AsArm64());
+  static_assert(kRunnableStateValue == 0u);
+  ___ Stxr(scratch, wzr, MEM_OP(reg_x(TR)));
+  ___ Cbnz(scratch, &retry);
+
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  ___ Ldr(scratch.X(), MEM_OP(reg_x(TR), thread_mutator_lock_offset.Int32Value()));
+  ___ Str(scratch.X(), MEM_OP(reg_x(TR), thread_held_mutex_mutator_lock_offset.Int32Value()));
+}
+
 void Arm64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireW();
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 1c61d96..0fb512e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -169,9 +169,18 @@
   void CallFromThread(ThreadOffset64 offset) override;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index b35066f..2d1de97 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -1,12 +1,12 @@
 const char* const VixlJniHelpersResults = {
   "       0: 2d e9 e0 4d   push.w {r5, r6, r7, r8, r10, r11, lr}\n"
   "       4: 2d ed 10 8a   vpush {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31}\n"
-  "       8: 85 b0         sub sp, #20\n"
+  "       8: 81 b0         sub sp, #4\n"
   "       a: 00 90         str r0, [sp]\n"
-  "       c: 1d 91         str r1, [sp, #116]\n"
-  "       e: 8d ed 1e 0a   vstr s0, [sp, #120]\n"
-  "      12: 1f 92         str r2, [sp, #124]\n"
-  "      14: 20 93         str r3, [sp, #128]\n"
+  "       c: 19 91         str r1, [sp, #100]\n"
+  "       e: 8d ed 1a 0a   vstr s0, [sp, #104]\n"
+  "      12: 1b 92         str r2, [sp, #108]\n"
+  "      14: 1c 93         str r3, [sp, #112]\n"
   "      16: 88 b0         sub sp, #32\n"
   "      18: ad f5 80 5d   sub.w sp, sp, #4096\n"
   "      1c: 08 98         ldr r0, [sp, #32]\n"
@@ -147,13 +147,13 @@
   "     208: cd f8 ff c7   str.w r12, [sp, #2047]\n"
   "     20c: 0d f5 80 5d   add.w sp, sp, #4096\n"
   "     210: 08 b0         add sp, #32\n"
-  "     212: 05 b0         add sp, #20\n"
+  "     212: 01 b0         add sp, #4\n"
   "     214: bd ec 10 8a   vpop {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31}\n"
   "     218: bd e8 e0 4d   pop.w {r5, r6, r7, r8, r10, r11, lr}\n"
   "     21c: d9 f8 24 80   ldr.w r8, [r9, #36]\n"
   "     220: 70 47         bx lr\n"
   "     222: d9 f8 8c 00   ldr.w r0, [r9, #140]\n"
-  "     226: d9 f8 c4 e2   ldr.w lr, [r9, #708]\n"
+  "     226: d9 f8 c0 e2   ldr.w lr, [r9, #704]\n"
   "     22a: f0 47         blx lr\n"
 };
 
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 659ff4c..0d82458 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -252,9 +252,18 @@
   virtual void CallFromThread(ThreadOffset<kPointerSize> offset) = 0;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be callee-save core registers
+  // (already saved before this call); it must preserve all argument registers.
   virtual void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) = 0;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be core argument registers
+  // that are not used as return registers; it must preserve `return_reg`, if any.
+  virtual void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                                   ArrayRef<const ManagedRegister> scratch_regs,
+                                                   ManagedRegister return_reg) = 0;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   virtual void SuspendCheck(JNIMacroLabel* label) = 0;
 
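
For orientation, the pair of hooks declared above corresponds to the following standalone sketch of the Runnable-transition fast path that each backend emits below (std::atomic stands in for the emitted lock cmpxchg / CAS; the state value and field names are placeholders, not ART's actual layout):

#include <atomic>
#include <cstdint>

// Illustrative model: the thread state and flags are packed into one 32-bit word,
// so a single CAS checks "state == kNative and no flags set" and transitions to
// Runnable (whose packed value is 0, cf. the static_assert in the .cc files).
struct ThreadStateModel {
  static constexpr uint32_t kNativeStateValue = 0x00010000u;  // placeholder value
  static constexpr uint32_t kRunnableStateValue = 0u;

  std::atomic<uint32_t> state_and_flags{kNativeStateValue};
  const void* mutator_lock = nullptr;             // cached global lock pointer
  const void* held_mutex_mutator_lock = nullptr;  // held_mutexes[kMutatorLock]

  // Returns false when the JniMethodEnd slow path (the `label` above) must run.
  bool TryToTransitionFromNativeToRunnable() {
    uint32_t expected = kNativeStateValue;
    // CAS acquire: fails if any flag is set or the state is not Native.
    if (!state_and_flags.compare_exchange_strong(
            expected, kRunnableStateValue, std::memory_order_acquire)) {
      return false;
    }
    // On success, record that the mutator lock is held.
    held_mutex_mutator_lock = mutator_lock;
    return true;
  }
};
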
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 8be2a32..1a0d521 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -618,6 +618,57 @@
                 Immediate(0));
 }
 
+void X86JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kX86PointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86PointerSize>(kMutatorLock);
+  constexpr ThreadOffset32 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kX86PointerSize>();
+
+  size_t scratch_index = 0u;
+  auto get_scratch_reg = [&]() {
+    while (true) {
+      DCHECK_LT(scratch_index, scratch_regs.size());
+      X86ManagedRegister scratch_reg = scratch_regs[scratch_index].AsX86();
+      ++scratch_index;
+      DCHECK(!scratch_reg.Overlaps(return_reg.AsX86()));
+      if (scratch_reg.AsCpuRegister() != EAX) {
+        return scratch_reg.AsCpuRegister();
+      }
+    }
+  };
+  Register scratch = get_scratch_reg();
+  bool preserve_eax = return_reg.AsX86().Overlaps(X86ManagedRegister::FromCpuRegister(EAX));
+  Register saved_eax = preserve_eax ? get_scratch_reg() : kNoRegister;
+
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  if (preserve_eax) {
+    __ movl(saved_eax, EAX);  // Save EAX.
+  }
+  __ movl(EAX, Immediate(kNativeStateValue));
+  static_assert(kRunnableStateValue == 0u);
+  __ xorl(scratch, scratch);
+  __ fs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value()), scratch);
+  // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  if (preserve_eax) {
+    __ movl(EAX, saved_eax);  // Restore EAX; MOV does not change flags.
+  }
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  __ j(kNotZero, X86JNIMacroLabel::Cast(label)->AsX86());
+
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  __ fs()->movl(scratch, Address::Absolute(thread_mutator_lock_offset.Uint32Value()));
+  __ fs()->movl(Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value()),
+                scratch);
+}
+
 void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()),
                  Immediate(Thread::SuspendOrCheckpointRequestFlags()));
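
A note on the x86 sequence above: `lock cmpxchgl` implicitly compares EAX against the memory operand and reports success in the zero flag. That is why EAX must hold kNativeStateValue, why a JNI result living in EAX has to be parked in a scratch register across the CAS and restored with MOV (which leaves EFLAGS intact for the subsequent `j(kNotZero, ...)`), and why the slow path is taken on a non-zero flag. A scalar model of the instruction's register/flag contract (the real instruction is atomic; this sketch only shows the dataflow):

#include <cstdint>

struct CmpxchgResult {
  uint32_t eax;     // On failure, receives the current memory value.
  bool zero_flag;   // Set iff the exchange happened.
};

// Dataflow of "lock cmpxchgl [mem], desired" with EAX = expected.
inline CmpxchgResult LockCmpxchgModel(uint32_t* mem, uint32_t eax_expected, uint32_t desired) {
  if (*mem == eax_expected) {
    *mem = desired;
    return {eax_expected, /*zero_flag=*/ true};
  }
  return {*mem, /*zero_flag=*/ false};
}
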
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 0af6371..7fe0e42 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -161,9 +161,18 @@
   void CallFromThread(ThreadOffset32 offset) override;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be callee-save core registers
+  // (already saved before this call); it must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be core argument registers
+  // that are not used as return registers; it must preserve `return_reg`, if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index b25d5c7..8a90a13 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -698,6 +698,52 @@
       Immediate(0));
 }
 
+void X86_64JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kX86_64PointerSize>();
+  constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86_64PointerSize>(kMutatorLock);
+  constexpr ThreadOffset64 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kX86_64PointerSize>();
+
+  DCHECK_GE(scratch_regs.size(), 2u);
+  DCHECK(!scratch_regs[0].AsX86_64().Overlaps(return_reg.AsX86_64()));
+  CpuRegister scratch = scratch_regs[0].AsX86_64().AsCpuRegister();
+  DCHECK(!scratch_regs[1].AsX86_64().Overlaps(return_reg.AsX86_64()));
+  CpuRegister saved_rax = scratch_regs[1].AsX86_64().AsCpuRegister();
+  CpuRegister rax(RAX);
+  bool preserve_rax = return_reg.AsX86_64().Overlaps(X86_64ManagedRegister::FromCpuRegister(RAX));
+
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  if (preserve_rax) {
+    __ movq(saved_rax, rax);  // Save RAX.
+  }
+  __ movl(rax, Immediate(kNativeStateValue));
+  static_assert(kRunnableStateValue == 0u);
+  __ xorl(scratch, scratch);
+  __ gs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value(), /*no_rip=*/ true),
+                        scratch);
+  // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  if (preserve_rax) {
+    __ movq(rax, saved_rax);  // Restore RAX; MOV does not change flags.
+  }
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  __ j(kNotZero, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
+
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  __ gs()->movq(scratch,
+                Address::Absolute(thread_mutator_lock_offset.Uint32Value(), /*no_rip=*/ true));
+  __ gs()->movq(
+      Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value(), /*no_rip=*/ true),
+      scratch);
+}
+
 void X86_64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ gs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
                  Immediate(Thread::SuspendOrCheckpointRequestFlags()));
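
The x86-64 variant differs from x86 mainly in the register bookkeeping: the scratch registers come from argument registers that are not used for the return value, they must not alias the return register, and RAX needs an explicit save/restore only because CMPXCHG uses it implicitly while integer and reference JNI results are also returned in RAX (float/double results come back in XMM0 and need no save). A small sketch of those constraints, with a plain enum standing in for ManagedRegister (illustrative only, not ART code):

#include <cassert>

enum class Reg { kNone, kRax, kRsi, kRdx, kXmm0 };  // tiny stand-in register set

inline bool Overlaps(Reg a, Reg b) { return a != Reg::kNone && a == b; }

// Mirrors the DCHECKs above: scratch registers must not alias the return register.
inline void CheckScratchRegs(Reg scratch0, Reg scratch1, Reg return_reg) {
  assert(!Overlaps(scratch0, return_reg));
  assert(!Overlaps(scratch1, return_reg));
}

// Mirrors `preserve_rax`: only an integer/reference return value occupies RAX.
inline bool NeedsRaxPreservation(Reg return_reg) {
  return Overlaps(return_reg, Reg::kRax);
}
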
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 6eb7873..c46d5c6 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -181,9 +181,18 @@
   void CallFromThread(ThreadOffset64 offset) override;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be callee-save core registers
+  // (already saved before this call); it must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be core argument registers
+  // that are not used as return registers; it must preserve `return_reg`, if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;