JNI: Inline fast-path for `JniMethodEnd()`.

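The transition back to Runnable is now emitted inline: a fast-path acquire CAS
on the thread's state-and-flags word, with a slow-path call to the
`JniMethodEnd()` entrypoint only if a thread flag is set or the state is not
Native. Since the fast path preserves the return registers, the return value
no longer needs to be spilled to the stack around the call; the
`SpillsReturnValue()`/`ReturnValueSaveLocation()` machinery is removed and the
new `ArgumentScratchRegisters()` supplies scratch registers instead.

A rough C++ model of what the generated fast path does (not ART code; the real
implementation is emitted as assembly by the per-architecture JNI macro
assemblers in this change, and `native_state_value` merely stands in for
`Thread::StoredThreadStateValue(ThreadState::kNative)`):

  #include <atomic>
  #include <cstdint>

  // Returns true if the fast path succeeded; on failure the caller must take
  // the slow path, i.e. call the pJniMethodEnd entrypoint.
  inline bool TryToTransitionFromNativeToRunnable(std::atomic<uint32_t>& state_and_flags,
                                                  uint32_t native_state_value) {
    // The stored value for Runnable is 0 (see the static_asserts below), so a
    // successful CAS requires all flags to be clear and the state to be Native.
    uint32_t expected = native_state_value;
    bool success = state_and_flags.compare_exchange_strong(
        expected, /*desired=*/ 0u, std::memory_order_acquire);
    // On success, the real code additionally records the mutator lock in
    // self->tlsPtr_.held_mutexes[kMutatorLock].
    return success;
  }
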
Golem results for art-opt-cc (higher is better):
linux-ia32                       before after
NativeDowncallStaticNormal       46.766 51.016 (+9.086%)
NativeDowncallStaticNormal6      42.268 45.748 (+8.235%)
NativeDowncallStaticNormalRefs6  41.355 44.776 (+8.272%)
NativeDowncallVirtualNormal      46.361 52.527 (+13.30%)
NativeDowncallVirtualNormal6     41.812 45.206 (+8.118%)
NativeDowncallVirtualNormalRefs6 40.500 44.169 (+9.059%)
(The NativeDowncallVirtualNormal result for x86 is skewed
by one unusually good run, as Golem reports the best result
in the summary. Using the second-best and most frequent
result, 50.5, the improvement is only around 8.9%.)
linux-x64                        before after
NativeDowncallStaticNormal       44.169 47.976 (+8.620%)
NativeDowncallStaticNormal6      43.198 46.836 (+8.423%)
NativeDowncallStaticNormalRefs6  38.481 44.687 (+16.13%)
NativeDowncallVirtualNormal      43.672 47.405 (+8.547%)
NativeDowncallVirtualNormal6     42.268 45.726 (+8.182%)
NativeDowncallVirtualNormalRefs6 41.355 44.687 (+8.057%)
(The NativeDowncallStaticNormalRefs6 improvement for x86-64
is a bit inflated because recent baseline results jump
between ~38.5 and ~40.5. If we take the latter as the
baseline, the improvement is only around 10.3%.)
linux-armv7                      before after
NativeDowncallStaticNormal       10.659 14.620 (+37.16%)
NativeDowncallStaticNormal6      9.8377 13.120 (+33.36%)
NativeDowncallStaticNormalRefs6  8.8714 11.454 (+29.11%)
NativeDowncallVirtualNormal      10.511 14.349 (+36.51%)
NativeDowncallVirtualNormal6     9.9701 13.347 (+33.87%)
NativeDowncallVirtualNormalRefs6 8.9241 11.454 (+28.35%)
linux-armv8                      before after
NativeDowncallStaticNormal       10.608 16.329 (+53.93%)
NativeDowncallStaticNormal6      10.179 15.347 (+50.76%)
NativeDowncallStaticNormalRefs6  9.2457 13.705 (+48.23%)
NativeDowncallVirtualNormal      9.9850 14.903 (+49.25%)
NativeDowncallVirtualNormal6     9.9206 14.757 (+48.75%)
NativeDowncallVirtualNormalRefs6 8.8235 12.789 (+44.94%)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: Ie144bc4f7f82be95790ea7d3123b81a3b6bfa603
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index 6278f12..3fe8226 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -1,8 +1,8 @@
 // TODO These arrays should be generated automatically or have instructions for re-creation.
 static constexpr uint8_t expected_asm_kThumb2[] = {
-    0x2D, 0xE9, 0xE0, 0x4D, 0x2D, 0xED, 0x10, 0x8A, 0x85, 0xB0, 0x00, 0x90,
-    0x1D, 0x91, 0x8D, 0xED, 0x1E, 0x0A, 0x1F, 0x92, 0x20, 0x93, 0x88, 0xB0,
-    0x08, 0xB0, 0x05, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x4D,
+    0x2D, 0xE9, 0xE0, 0x4D, 0x2D, 0xED, 0x10, 0x8A, 0x81, 0xB0, 0x00, 0x90,
+    0x19, 0x91, 0x8D, 0xED, 0x1A, 0x0A, 0x1B, 0x92, 0x1C, 0x93, 0x88, 0xB0,
+    0x08, 0xB0, 0x01, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x4D,
     0xD9, 0xF8, 0x24, 0x80, 0x70, 0x47,
 };
 static constexpr uint8_t expected_cfi_kThumb2[] = {
@@ -11,12 +11,12 @@
     0x51, 0x16, 0x05, 0x52, 0x15, 0x05, 0x53, 0x14, 0x05, 0x54, 0x13, 0x05,
     0x55, 0x12, 0x05, 0x56, 0x11, 0x05, 0x57, 0x10, 0x05, 0x58, 0x0F, 0x05,
     0x59, 0x0E, 0x05, 0x5A, 0x0D, 0x05, 0x5B, 0x0C, 0x05, 0x5C, 0x0B, 0x05,
-    0x5D, 0x0A, 0x05, 0x5E, 0x09, 0x05, 0x5F, 0x08, 0x42, 0x0E, 0x70, 0x4E,
-    0x0E, 0x90, 0x01, 0x42, 0x0E, 0x70, 0x0A, 0x42, 0x0E, 0x5C, 0x44, 0x0E,
+    0x5D, 0x0A, 0x05, 0x5E, 0x09, 0x05, 0x5F, 0x08, 0x42, 0x0E, 0x60, 0x4E,
+    0x0E, 0x80, 0x01, 0x42, 0x0E, 0x60, 0x0A, 0x42, 0x0E, 0x5C, 0x44, 0x0E,
     0x1C, 0x06, 0x50, 0x06, 0x51, 0x06, 0x52, 0x06, 0x53, 0x06, 0x54, 0x06,
     0x55, 0x06, 0x56, 0x06, 0x57, 0x06, 0x58, 0x06, 0x59, 0x06, 0x5A, 0x06,
     0x5B, 0x06, 0x5C, 0x06, 0x5D, 0x06, 0x5E, 0x06, 0x5F, 0x44, 0x0E, 0x00,
-    0xC5, 0xC6, 0xC7, 0xC8, 0xCA, 0xCB, 0xCE, 0x46, 0x0B, 0x0E, 0x70,
+    0xC5, 0xC6, 0xC7, 0xC8, 0xCA, 0xCB, 0xCE, 0x46, 0x0B, 0x0E, 0x60,
 };
 // 0x00000000: push {r5,r6,r7,r8,r10,r11,lr}
 // 0x00000004: .cfi_def_cfa_offset: 28
@@ -45,19 +45,19 @@
 // 0x00000008: .cfi_offset_extended: r93 at cfa-40
 // 0x00000008: .cfi_offset_extended: r94 at cfa-36
 // 0x00000008: .cfi_offset_extended: r95 at cfa-32
-// 0x00000008: sub sp, #20
-// 0x0000000a: .cfi_def_cfa_offset: 112
+// 0x00000008: sub sp, #4
+// 0x0000000a: .cfi_def_cfa_offset: 96
 // 0x0000000a: str r0, [sp]
-// 0x0000000c: str r1, [sp, #116]
-// 0x0000000e: vstr s0, [sp, #120]
-// 0x00000012: str r2, [sp, #124]
-// 0x00000014: str r3, [sp, #128]
+// 0x0000000c: str r1, [sp, #100]
+// 0x0000000e: vstr s0, [sp, #104]
+// 0x00000012: str r2, [sp, #108]
+// 0x00000014: str r3, [sp, #112]
 // 0x00000016: sub sp, #32
-// 0x00000018: .cfi_def_cfa_offset: 144
+// 0x00000018: .cfi_def_cfa_offset: 128
 // 0x00000018: add sp, #32
-// 0x0000000a: .cfi_def_cfa_offset: 112
+// 0x0000001a: .cfi_def_cfa_offset: 96
 // 0x0000001a: .cfi_remember_state
-// 0x0000001a: add sp, #20
+// 0x0000001a: add sp, #4
 // 0x0000001c: .cfi_def_cfa_offset: 92
 // 0x0000001c: vpop {s16-s31}
 // 0x00000020: .cfi_def_cfa_offset: 28
@@ -86,7 +86,7 @@
 // 0x00000024: .cfi_restore: r10
 // 0x00000024: .cfi_restore: r11
 // 0x00000024: .cfi_restore: r14
-// 0x00000024: ldr r8, [tr, #48] ; is_gc_marking
+// 0x00000024: ldr r8, [tr, #36] ; is_gc_marking
 // 0x00000028: bx lr
 // 0x0000002a: .cfi_restore_state
 // 0x0000002a: .cfi_def_cfa_offset: 112
@@ -246,26 +246,25 @@
 
 static constexpr uint8_t expected_asm_kX86_64[] = {
     0x41, 0x57, 0x41, 0x56, 0x41, 0x55, 0x41, 0x54, 0x55, 0x53, 0x48, 0x83,
-    0xEC, 0x38, 0xF2, 0x44, 0x0F, 0x11, 0x7C, 0x24, 0x30, 0xF2, 0x44, 0x0F,
-    0x11, 0x74, 0x24, 0x28, 0xF2, 0x44, 0x0F, 0x11, 0x6C, 0x24, 0x20, 0xF2,
-    0x44, 0x0F, 0x11, 0x64, 0x24, 0x18, 0x48, 0x89, 0x3C, 0x24, 0x89, 0x74,
-    0x24, 0x78, 0xF3, 0x0F, 0x11, 0x44, 0x24, 0x7C, 0x89, 0x94, 0x24, 0x80,
-    0x00, 0x00, 0x00, 0x89, 0x8C, 0x24, 0x84, 0x00, 0x00, 0x00, 0x48, 0x83,
-    0xC4, 0xE0, 0x48, 0x83, 0xC4, 0x20, 0xF2, 0x44, 0x0F, 0x10, 0x64, 0x24,
-    0x18, 0xF2, 0x44, 0x0F, 0x10, 0x6C, 0x24, 0x20, 0xF2, 0x44, 0x0F, 0x10,
-    0x74, 0x24, 0x28, 0xF2, 0x44, 0x0F, 0x10, 0x7C, 0x24, 0x30, 0x48, 0x83,
-    0xC4, 0x38, 0x5B, 0x5D, 0x41, 0x5C, 0x41, 0x5D, 0x41, 0x5E, 0x41, 0x5F,
-    0xC3,
+    0xEC, 0x28, 0xF2, 0x44, 0x0F, 0x11, 0x7C, 0x24, 0x20, 0xF2, 0x44, 0x0F,
+    0x11, 0x74, 0x24, 0x18, 0xF2, 0x44, 0x0F, 0x11, 0x6C, 0x24, 0x10, 0xF2,
+    0x44, 0x0F, 0x11, 0x64, 0x24, 0x08, 0x48, 0x89, 0x3C, 0x24, 0x89, 0x74,
+    0x24, 0x68, 0xF3, 0x0F, 0x11, 0x44, 0x24, 0x6C, 0x89, 0x54, 0x24, 0x70,
+    0x89, 0x4C, 0x24, 0x74, 0x48, 0x83, 0xC4, 0xE0, 0x48, 0x83, 0xC4, 0x20,
+    0xF2, 0x44, 0x0F, 0x10, 0x64, 0x24, 0x08, 0xF2, 0x44, 0x0F, 0x10, 0x6C,
+    0x24, 0x10, 0xF2, 0x44, 0x0F, 0x10, 0x74, 0x24, 0x18, 0xF2, 0x44, 0x0F,
+    0x10, 0x7C, 0x24, 0x20, 0x48, 0x83, 0xC4, 0x28, 0x5B, 0x5D, 0x41, 0x5C,
+    0x41, 0x5D, 0x41, 0x5E, 0x41, 0x5F, 0xC3,
 };
 static constexpr uint8_t expected_cfi_kX86_64[] = {
     0x42, 0x0E, 0x10, 0x8F, 0x04, 0x42, 0x0E, 0x18, 0x8E, 0x06, 0x42, 0x0E,
     0x20, 0x8D, 0x08, 0x42, 0x0E, 0x28, 0x8C, 0x0A, 0x41, 0x0E, 0x30, 0x86,
-    0x0C, 0x41, 0x0E, 0x38, 0x83, 0x0E, 0x44, 0x0E, 0x70, 0x47, 0xA0, 0x10,
-    0x47, 0x9F, 0x12, 0x47, 0x9E, 0x14, 0x47, 0x9D, 0x16, 0x60, 0x0E, 0x90,
-    0x01, 0x44, 0x0E, 0x70, 0x0A, 0x47, 0xDD, 0x47, 0xDE, 0x47, 0xDF, 0x47,
+    0x0C, 0x41, 0x0E, 0x38, 0x83, 0x0E, 0x44, 0x0E, 0x60, 0x47, 0xA0, 0x10,
+    0x47, 0x9F, 0x12, 0x47, 0x9E, 0x14, 0x47, 0x9D, 0x16, 0x5A, 0x0E, 0x80,
+    0x01, 0x44, 0x0E, 0x60, 0x0A, 0x47, 0xDD, 0x47, 0xDE, 0x47, 0xDF, 0x47,
     0xE0, 0x44, 0x0E, 0x38, 0x41, 0x0E, 0x30, 0xC3, 0x41, 0x0E, 0x28, 0xC6,
     0x42, 0x0E, 0x20, 0xCC, 0x42, 0x0E, 0x18, 0xCD, 0x42, 0x0E, 0x10, 0xCE,
-    0x42, 0x0E, 0x08, 0xCF, 0x41, 0x0B, 0x0E, 0x70,
+    0x42, 0x0E, 0x08, 0xCF, 0x41, 0x0B, 0x0E, 0x60,
 };
 // 0x00000000: push r15
 // 0x00000002: .cfi_def_cfa_offset: 16
@@ -285,55 +284,55 @@
 // 0x00000009: push rbx
 // 0x0000000a: .cfi_def_cfa_offset: 56
 // 0x0000000a: .cfi_offset: r3 at cfa-56
-// 0x0000000a: subq rsp, 56
-// 0x0000000e: .cfi_def_cfa_offset: 112
-// 0x0000000e: movsd [rsp + 48], xmm15
+// 0x0000000a: subq rsp, 40
+// 0x0000000e: .cfi_def_cfa_offset: 96
+// 0x0000000e: movsd [rsp + 32], xmm15
 // 0x00000015: .cfi_offset: r32 at cfa-64
-// 0x00000015: movsd [rsp + 40], xmm14
+// 0x00000015: movsd [rsp + 24], xmm14
 // 0x0000001c: .cfi_offset: r31 at cfa-72
-// 0x0000001c: movsd [rsp + 32], xmm13
+// 0x0000001c: movsd [rsp + 16], xmm13
 // 0x00000023: .cfi_offset: r30 at cfa-80
-// 0x00000023: movsd [rsp + 24], xmm12
+// 0x00000023: movsd [rsp + 8], xmm12
 // 0x0000002a: .cfi_offset: r29 at cfa-88
 // 0x0000002a: movq [rsp], rdi
-// 0x0000002e: mov [rsp + 120], esi
-// 0x00000032: movss [rsp + 124], xmm0
-// 0x00000038: mov [rsp + 128], edx
-// 0x0000003f: mov [rsp + 132], ecx
-// 0x00000046: addq rsp, -32
-// 0x0000004a: .cfi_def_cfa_offset: 144
-// 0x0000004a: addq rsp, 32
-// 0x0000004e: .cfi_def_cfa_offset: 112
-// 0x0000004e: .cfi_remember_state
-// 0x0000004e: movsd xmm12, [rsp + 24]
-// 0x00000055: .cfi_restore: r29
-// 0x00000055: movsd xmm13, [rsp + 32]
-// 0x0000005c: .cfi_restore: r30
-// 0x0000005c: movsd xmm14, [rsp + 40]
-// 0x00000063: .cfi_restore: r31
-// 0x00000063: movsd xmm15, [rsp + 48]
-// 0x0000006a: .cfi_restore: r32
-// 0x0000006a: addq rsp, 56
-// 0x0000006e: .cfi_def_cfa_offset: 56
-// 0x0000006e: pop rbx
-// 0x0000006f: .cfi_def_cfa_offset: 48
-// 0x0000006f: .cfi_restore: r3
-// 0x0000006f: pop rbp
-// 0x00000070: .cfi_def_cfa_offset: 40
-// 0x00000070: .cfi_restore: r6
-// 0x00000070: pop r12
-// 0x00000072: .cfi_def_cfa_offset: 32
-// 0x00000072: .cfi_restore: r12
-// 0x00000072: pop r13
-// 0x00000074: .cfi_def_cfa_offset: 24
-// 0x00000074: .cfi_restore: r13
-// 0x00000074: pop r14
-// 0x00000076: .cfi_def_cfa_offset: 16
-// 0x00000076: .cfi_restore: r14
-// 0x00000076: pop r15
-// 0x00000078: .cfi_def_cfa_offset: 8
-// 0x00000078: .cfi_restore: r15
-// 0x00000078: ret
-// 0x00000079: .cfi_restore_state
-// 0x00000079: .cfi_def_cfa_offset: 112
+// 0x0000002e: mov [rsp + 104], esi
+// 0x00000032: movss [rsp + 108], xmm0
+// 0x00000038: mov [rsp + 112], edx
+// 0x0000003c: mov [rsp + 116], ecx
+// 0x00000040: addq rsp, -32
+// 0x00000044: .cfi_def_cfa_offset: 128
+// 0x00000044: addq rsp, 32
+// 0x00000048: .cfi_def_cfa_offset: 96
+// 0x00000048: .cfi_remember_state
+// 0x00000048: movsd xmm12, [rsp + 8]
+// 0x0000004f: .cfi_restore: r29
+// 0x0000004f: movsd xmm13, [rsp + 16]
+// 0x00000056: .cfi_restore: r30
+// 0x00000056: movsd xmm14, [rsp + 24]
+// 0x0000005d: .cfi_restore: r31
+// 0x0000005d: movsd xmm15, [rsp + 32]
+// 0x00000064: .cfi_restore: r32
+// 0x00000064: addq rsp, 40
+// 0x00000068: .cfi_def_cfa_offset: 56
+// 0x00000068: pop rbx
+// 0x00000069: .cfi_def_cfa_offset: 48
+// 0x00000069: .cfi_restore: r3
+// 0x00000069: pop rbp
+// 0x0000006a: .cfi_def_cfa_offset: 40
+// 0x0000006a: .cfi_restore: r6
+// 0x0000006a: pop r12
+// 0x0000006c: .cfi_def_cfa_offset: 32
+// 0x0000006c: .cfi_restore: r12
+// 0x0000006c: pop r13
+// 0x0000006e: .cfi_def_cfa_offset: 24
+// 0x0000006e: .cfi_restore: r13
+// 0x0000006e: pop r14
+// 0x00000070: .cfi_def_cfa_offset: 16
+// 0x00000070: .cfi_restore: r14
+// 0x00000070: pop r15
+// 0x00000072: .cfi_def_cfa_offset: 8
+// 0x00000072: .cfi_restore: r15
+// 0x00000072: ret
+// 0x00000073: .cfi_restore_state
+// 0x00000073: .cfi_def_cfa_offset: 96
 
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index da438bd..c1afdb8 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -32,8 +32,8 @@
 
 // List of parameters passed via registers for JNI.
 // JNI uses soft-float, so there is only a GPR list.
-static const Register kJniArgumentRegisters[] = {
-  R0, R1, R2, R3
+static constexpr Register kJniArgumentRegisters[] = {
+    R0, R1, R2, R3
 };
 
 static_assert(kJniArgumentRegisterCount == arraysize(kJniArgumentRegisters));
@@ -43,20 +43,23 @@
 //
 
 // Used by hard float. (General purpose registers.)
-static const Register kHFCoreArgumentRegisters[] = {
-  R0, R1, R2, R3
+static constexpr ManagedRegister kHFCoreArgumentRegisters[] = {
+    ArmManagedRegister::FromCoreRegister(R0),
+    ArmManagedRegister::FromCoreRegister(R1),
+    ArmManagedRegister::FromCoreRegister(R2),
+    ArmManagedRegister::FromCoreRegister(R3),
 };
 static constexpr size_t kHFCoreArgumentRegistersCount = arraysize(kHFCoreArgumentRegisters);
 
 // (VFP single-precision registers.)
-static const SRegister kHFSArgumentRegisters[] = {
-  S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
+static constexpr SRegister kHFSArgumentRegisters[] = {
+    S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
 };
 static constexpr size_t kHFSArgumentRegistersCount = arraysize(kHFSArgumentRegisters);
 
 // (VFP double-precision registers.)
-static const DRegister kHFDArgumentRegisters[] = {
-  D0, D1, D2, D3, D4, D5, D6, D7
+static constexpr DRegister kHFDArgumentRegisters[] = {
+    D0, D1, D2, D3, D4, D5, D6, D7
 };
 static constexpr size_t kHFDArgumentRegistersCount = arraysize(kHFDArgumentRegisters);
 
@@ -159,7 +162,7 @@
 
 // Calling convention
 
-ManagedRegister ArmManagedRuntimeCallingConvention::ReturnRegister() {
+ManagedRegister ArmManagedRuntimeCallingConvention::ReturnRegister() const {
   switch (GetShorty()[0]) {
     case 'V':
       return ArmManagedRegister::NoRegister();
@@ -174,7 +177,7 @@
   }
 }
 
-ManagedRegister ArmJniCallingConvention::ReturnRegister() {
+ManagedRegister ArmJniCallingConvention::ReturnRegister() const {
   switch (GetShorty()[0]) {
   case 'V':
     return ArmManagedRegister::NoRegister();
@@ -186,7 +189,7 @@
   }
 }
 
-ManagedRegister ArmJniCallingConvention::IntReturnRegister() {
+ManagedRegister ArmJniCallingConvention::IntReturnRegister() const {
   return ArmManagedRegister::FromCoreRegister(R0);
 }
 
@@ -272,7 +275,7 @@
       CHECK_EQ(RoundUp(gpr_index_, 2u), 2u);
       return ArmManagedRegister::FromRegisterPair(R2_R3);
     } else {
-      return ArmManagedRegister::FromCoreRegister(kHFCoreArgumentRegisters[gpr_index_]);
+      return kHFCoreArgumentRegisters[gpr_index_];
     }
   }
 }
@@ -400,11 +403,27 @@
   return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters).SubArray(kStart, kLength);
 }
 
+ArrayRef<const ManagedRegister> ArmJniCallingConvention::ArgumentScratchRegisters() const {
+  DCHECK(!IsCriticalNative());
+  // Exclude r0 or r0-r1 if they are used as return registers.
+  static_assert(kHFCoreArgumentRegisters[0].Equals(ArmManagedRegister::FromCoreRegister(R0)));
+  static_assert(kHFCoreArgumentRegisters[1].Equals(ArmManagedRegister::FromCoreRegister(R1)));
+  ArrayRef<const ManagedRegister> scratch_regs(kHFCoreArgumentRegisters);
+  ArmManagedRegister return_reg = ReturnRegister().AsArm();
+  auto return_reg_overlaps = [return_reg](ManagedRegister reg) {
+    return return_reg.Overlaps(reg.AsArm());
+  };
+  if (return_reg_overlaps(scratch_regs[0])) {
+    scratch_regs = scratch_regs.SubArray(/*pos=*/ return_reg_overlaps(scratch_regs[1]) ? 2u : 1u);
+  }
+  DCHECK(std::none_of(scratch_regs.begin(), scratch_regs.end(), return_reg_overlaps));
+  return scratch_regs;
+}
+
 size_t ArmJniCallingConvention::FrameSize() const {
   if (UNLIKELY(is_critical_native_)) {
     CHECK(!SpillsMethod());
     CHECK(!HasLocalReferenceSegmentState());
-    CHECK(!SpillsReturnValue());
     return 0u;  // There is no managed frame for @CriticalNative.
   }
 
@@ -417,19 +436,6 @@
   DCHECK(HasLocalReferenceSegmentState());
   // Cookie is saved in one of the spilled registers.
 
-  // Plus return value spill area size
-  if (SpillsReturnValue()) {
-    // For 64-bit return values there shall be a 4B alignment gap between
-    // the method pointer and the saved return value.
-    size_t padding = ReturnValueSaveLocation().SizeValue() - method_ptr_size;
-    DCHECK_EQ(padding,
-              (GetReturnType() == Primitive::kPrimLong || GetReturnType() == Primitive::kPrimDouble)
-                  ? 4u
-                  : 0u);
-    total_size += padding;
-    total_size += SizeOfReturnValue();
-  }
-
   return RoundUp(total_size, kStackAlignment);
 }
 
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 94dacc4..4526d9e 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -35,7 +35,7 @@
         double_index_(0u) {}
   ~ArmManagedRuntimeCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
   void ResetIterator(FrameOffset displacement) override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
@@ -61,14 +61,15 @@
                           const char* shorty);
   ~ArmJniCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
-  ManagedRegister IntReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
+  ManagedRegister IntReturnRegister() const override;
   // JNI calling convention
   void Next() override;  // Override default behavior for AAPCS
   size_t FrameSize() const override;
   size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const override;
+  ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const override;
   uint32_t CoreSpillMask() const override;
   uint32_t FpSpillMask() const override;
   bool IsCurrentParamInRegister() override;
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index f816a69..ec77db3 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -25,16 +25,18 @@
 namespace art {
 namespace arm64 {
 
-static const XRegister kXArgumentRegisters[] = {
-  X0, X1, X2, X3, X4, X5, X6, X7
+static constexpr ManagedRegister kXArgumentRegisters[] = {
+    Arm64ManagedRegister::FromXRegister(X0),
+    Arm64ManagedRegister::FromXRegister(X1),
+    Arm64ManagedRegister::FromXRegister(X2),
+    Arm64ManagedRegister::FromXRegister(X3),
+    Arm64ManagedRegister::FromXRegister(X4),
+    Arm64ManagedRegister::FromXRegister(X5),
+    Arm64ManagedRegister::FromXRegister(X6),
+    Arm64ManagedRegister::FromXRegister(X7),
 };
 static_assert(kMaxIntLikeRegisterArguments == arraysize(kXArgumentRegisters));
 
-static const WRegister kWArgumentRegisters[] = {
-  W0, W1, W2, W3, W4, W5, W6, W7
-};
-static_assert(kMaxIntLikeRegisterArguments == arraysize(kWArgumentRegisters));
-
 static const DRegister kDArgumentRegisters[] = {
   D0, D1, D2, D3, D4, D5, D6, D7
 };
@@ -154,15 +156,15 @@
   }
 }
 
-ManagedRegister Arm64ManagedRuntimeCallingConvention::ReturnRegister() {
+ManagedRegister Arm64ManagedRuntimeCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty());
 }
 
-ManagedRegister Arm64JniCallingConvention::ReturnRegister() {
+ManagedRegister Arm64JniCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty());
 }
 
-ManagedRegister Arm64JniCallingConvention::IntReturnRegister() {
+ManagedRegister Arm64JniCallingConvention::IntReturnRegister() const {
   return Arm64ManagedRegister::FromWRegister(W0);
 }
 
@@ -195,12 +197,11 @@
     }
   } else {
     size_t non_fp_arg_number = itr_args_ - itr_float_and_doubles_;
+    ManagedRegister x_reg = kXArgumentRegisters[/* method */ 1u + non_fp_arg_number];
     if (IsCurrentParamALong()) {
-      XRegister x_reg = kXArgumentRegisters[/* method */ 1u + non_fp_arg_number];
-      return Arm64ManagedRegister::FromXRegister(x_reg);
+      return x_reg;
     } else {
-      WRegister w_reg = kWArgumentRegisters[/* method */ 1u + non_fp_arg_number];
-      return Arm64ManagedRegister::FromWRegister(w_reg);
+      return Arm64ManagedRegister::FromWRegister(x_reg.AsArm64().AsOverlappingWRegister());
     }
   }
 }
@@ -247,11 +248,26 @@
   return ArrayRef<const ManagedRegister>(kAapcs64CalleeSaveRegisters).SubArray(kStart, kLength);
 }
 
+ArrayRef<const ManagedRegister> Arm64JniCallingConvention::ArgumentScratchRegisters() const {
+  DCHECK(!IsCriticalNative());
+  // Exclude x0 if it's used as a return register.
+  static_assert(kXArgumentRegisters[0].Equals(Arm64ManagedRegister::FromXRegister(X0)));
+  ArrayRef<const ManagedRegister> scratch_regs(kXArgumentRegisters);
+  Arm64ManagedRegister return_reg = ReturnRegister().AsArm64();
+  auto return_reg_overlaps = [return_reg](ManagedRegister reg) {
+    return return_reg.Overlaps(reg.AsArm64());
+  };
+  if (return_reg_overlaps(scratch_regs[0])) {
+    scratch_regs = scratch_regs.SubArray(/*pos=*/ 1u);
+  }
+  DCHECK(std::none_of(scratch_regs.begin(), scratch_regs.end(), return_reg_overlaps));
+  return scratch_regs;
+}
+
 size_t Arm64JniCallingConvention::FrameSize() const {
   if (is_critical_native_) {
     CHECK(!SpillsMethod());
     CHECK(!HasLocalReferenceSegmentState());
-    CHECK(!SpillsReturnValue());
     return 0u;  // There is no managed frame for @CriticalNative.
   }
 
@@ -264,13 +280,6 @@
   DCHECK(HasLocalReferenceSegmentState());
   // Cookie is saved in one of the spilled registers.
 
-  // Plus return value spill area size
-  if (SpillsReturnValue()) {
-    // No padding between the method pointer and the return value on arm64.
-    DCHECK_EQ(ReturnValueSaveLocation().SizeValue(), method_ptr_size);
-    total_size += SizeOfReturnValue();
-  }
-
   return RoundUp(total_size, kStackAlignment);
 }
 
@@ -343,10 +352,11 @@
   } else {
     int gp_reg = itr_args_ - itr_float_and_doubles_;
     CHECK_LT(static_cast<unsigned int>(gp_reg), kMaxIntLikeRegisterArguments);
+    ManagedRegister x_reg = kXArgumentRegisters[gp_reg];
     if (IsCurrentParamALong() || IsCurrentParamAReference() || IsCurrentParamJniEnv())  {
-      return Arm64ManagedRegister::FromXRegister(kXArgumentRegisters[gp_reg]);
+      return x_reg;
     } else {
-      return Arm64ManagedRegister::FromWRegister(kWArgumentRegisters[gp_reg]);
+      return Arm64ManagedRegister::FromWRegister(x_reg.AsArm64().AsOverlappingWRegister());
     }
   }
 }
@@ -374,7 +384,7 @@
                       }));
   DCHECK(std::none_of(kXArgumentRegisters,
                       kXArgumentRegisters + std::size(kXArgumentRegisters),
-                      [](XRegister arg) { return arg == X15; }));
+                      [](ManagedRegister arg) { return arg.AsArm64().AsXRegister() == X15; }));
 }
 
 ManagedRegister Arm64JniCallingConvention::LockingArgumentRegister() const {
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 003b0c3..176271e 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -32,7 +32,7 @@
                                         PointerSize::k64) {}
   ~Arm64ManagedRuntimeCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -53,13 +53,14 @@
                             const char* shorty);
   ~Arm64JniCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
-  ManagedRegister IntReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
+  ManagedRegister IntReturnRegister() const override;
   // JNI calling convention
   size_t FrameSize() const override;
   size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const override;
+  ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const override;
   uint32_t CoreSpillMask() const override;
   uint32_t FpSpillMask() const override;
   bool IsCurrentParamInRegister() override;
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index e7a84fd..eb4d372 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -174,19 +174,6 @@
   return NumReferenceArgs() + (IsStatic() ? 1 : 0);
 }
 
-FrameOffset JniCallingConvention::ReturnValueSaveLocation() const {
-  // The saved return value goes at a properly aligned slot after the method pointer.
-  DCHECK(SpillsReturnValue());
-  size_t return_value_offset = static_cast<size_t>(frame_pointer_size_);
-  const size_t return_value_size = SizeOfReturnValue();
-  DCHECK(return_value_size == 4u || return_value_size == 8u) << return_value_size;
-  DCHECK_ALIGNED(return_value_offset, 4u);
-  if (return_value_size == 8u) {
-    return_value_offset = RoundUp(return_value_offset, 8u);
-  }
-  return FrameOffset(displacement_.SizeValue() + return_value_offset);
-}
-
 bool JniCallingConvention::HasNext() {
   if (IsCurrentArgExtraForJni()) {
     return true;
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 0be5233..e2f3bfb 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -46,7 +46,7 @@
   }
 
   // Register that holds result of this method invocation.
-  virtual ManagedRegister ReturnRegister() = 0;
+  virtual ManagedRegister ReturnRegister() const = 0;
 
   // Iterator interface
 
@@ -305,11 +305,8 @@
   virtual size_t OutFrameSize() const = 0;
   // Number of references in stack indirect reference table
   size_t ReferenceCount() const;
-  // Location where the return value of a call can be squirreled if another
-  // call is made following the native call
-  FrameOffset ReturnValueSaveLocation() const;
   // Register that holds result if it is integer.
-  virtual ManagedRegister IntReturnRegister() = 0;
+  virtual ManagedRegister IntReturnRegister() const = 0;
   // Whether the compiler needs to ensure zero-/sign-extension of a small result type
   virtual bool RequiresSmallResultTypeExtension() const = 0;
 
@@ -322,6 +319,10 @@
   // JNI compiler currently requires at least 3 callee save scratch registers.
   virtual ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const = 0;
 
+  // Subset of core argument registers that can be used for arbitrary purposes after
+  // calling the native function. These should exclude the return register(s).
+  virtual ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const = 0;
+
   // Spill mask values
   virtual uint32_t CoreSpillMask() const = 0;
   virtual uint32_t FpSpillMask() const = 0;
@@ -383,14 +384,6 @@
            return_type == Primitive::kPrimChar;
   }
 
-  // Does the transition back spill the return value in the stack frame?
-  bool SpillsReturnValue() const {
-    // Exclude return value for @FastNative and @CriticalNative methods for optimization speed.
-    // References are passed directly to the "end method" and there is nothing to save for `void`.
-    return (!IsFastNative() && !IsCriticalNative()) &&
-           (!IsReturnAReference() && SizeOfReturnValue() != 0u);
-  }
-
  protected:
   // Named iterator positions
   enum IteratorPos {
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 8bb6cc5..be519c1 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -74,19 +74,6 @@
   return JNIMacroAssembler<kPointerSize>::Create(allocator, isa, features);
 }
 
-template <PointerSize kPointerSize>
-static ThreadOffset<kPointerSize> GetJniMethodEndThreadOffset(bool reference_return) {
-  ThreadOffset<kPointerSize> jni_end(-1);
-  if (reference_return) {
-    // Pass result.
-    jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
-  } else {
-    jni_end = QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd);
-  }
-
-  return jni_end;
-}
-
 
 // Generate the JNI bridge for the given method, general contract:
 // - Arguments are in the managed runtime format, either on stack or in
@@ -422,25 +409,8 @@
     }
   }
 
-  // 5. Transition to Runnable (if normal native).
-
-  // 5.1. Spill or move the return value if needed.
-  // TODO: Use `callee_save_temp` instead of stack slot when possible.
-  bool spill_return_value = main_jni_conv->SpillsReturnValue();
-  FrameOffset return_save_location =
-      spill_return_value ? main_jni_conv->ReturnValueSaveLocation() : FrameOffset(0);
-  if (spill_return_value) {
-    DCHECK(!is_critical_native);
-    // For normal JNI, store the return value on the stack because the call to
-    // JniMethodEnd will clobber the return value. It will be restored in (13).
-    CHECK_LT(return_save_location.Uint32Value(), current_frame_size);
-    __ Store(return_save_location,
-             main_jni_conv->ReturnRegister(),
-             main_jni_conv->SizeOfReturnValue());
-  } else if (UNLIKELY(is_fast_native || is_critical_native) &&
-             main_jni_conv->SizeOfReturnValue() != 0) {
-    // For @FastNative and @CriticalNative only,
-    // move the JNI return register into the managed return register (if they don't match).
+  // 4.6. Move the JNI return register into the managed return register (if they don't match).
+  if (main_jni_conv->SizeOfReturnValue() != 0) {
     ManagedRegister jni_return_reg = main_jni_conv->ReturnRegister();
     ManagedRegister mr_return_reg = mr_conv->ReturnRegister();
 
@@ -460,11 +430,27 @@
     }
   }
 
-  // 5.2. For @FastNative that returns a reference, do an early exception check so that the
+  // 5. Transition to Runnable (if normal native).
+
+  // 5.1. Try transitioning to Runnable with a fast-path implementation.
+  //      If fast-path fails, make a slow-path call to `JniMethodEnd()`.
+  std::unique_ptr<JNIMacroLabel> transition_to_runnable_slow_path;
+  std::unique_ptr<JNIMacroLabel> transition_to_runnable_resume;
+  if (LIKELY(!is_critical_native && !is_fast_native)) {
+    transition_to_runnable_slow_path = __ CreateLabel();
+    transition_to_runnable_resume = __ CreateLabel();
+    __ TryToTransitionFromNativeToRunnable(transition_to_runnable_slow_path.get(),
+                                           main_jni_conv->ArgumentScratchRegisters(),
+                                           mr_conv->ReturnRegister());
+    __ Bind(transition_to_runnable_resume.get());
+  }
+
+  // 5.2. For methods that return a reference, do an early exception check so that the
   //      `JniDecodeReferenceResult()` in the main path does not need to check for exceptions.
   std::unique_ptr<JNIMacroLabel> exception_slow_path =
       LIKELY(!is_critical_native) ? __ CreateLabel() : nullptr;
-  if (UNLIKELY(is_fast_native) && reference_return) {
+  if (reference_return) {
+    DCHECK(!is_critical_native);
     __ ExceptionPoll(exception_slow_path.get());
   }
 
@@ -479,33 +465,23 @@
     __ Bind(suspend_check_resume.get());
   }
 
-  if (LIKELY(!is_critical_native)) {
-    // 5.4. Call JniMethodEnd for normal native.
-    //      For @FastNative with reference return, decode the `jobject`.
-    //      We abuse the JNI calling convention here, that is guaranteed to support passing
-    //      two pointer arguments, `JNIEnv*` and `jclass`/`jobject`, enough for all cases.
+  // 5.4 For methods with reference return, decode the `jobject` with `JniDecodeReferenceResult()`.
+  if (reference_return) {
+    DCHECK(!is_critical_native);
+    // We abuse the JNI calling convention here, that is guaranteed to support passing
+    // two pointer arguments, `JNIEnv*` and `jclass`/`jobject`.
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    if (LIKELY(!is_fast_native) || reference_return) {
-      ThreadOffset<kPointerSize> jni_end = is_fast_native
-          ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniDecodeReferenceResult)
-          : GetJniMethodEndThreadOffset<kPointerSize>(reference_return);
-      if (reference_return) {
-        // Pass result.
-        SetNativeParameter(jni_asm.get(), main_jni_conv.get(), main_jni_conv->ReturnRegister());
-        main_jni_conv->Next();
-      }
-      if (main_jni_conv->IsCurrentParamInRegister()) {
-        __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
-        __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_end));
-      } else {
-        __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset());
-        __ CallFromThread(jni_end);
-      }
-    }
-
-    // 5.5. Reload return value if it was spilled.
-    if (spill_return_value) {
-      __ Load(mr_conv->ReturnRegister(), return_save_location, mr_conv->SizeOfReturnValue());
+    ThreadOffset<kPointerSize> jni_decode_reference_result =
+        QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniDecodeReferenceResult);
+    // Pass result.
+    SetNativeParameter(jni_asm.get(), main_jni_conv.get(), mr_conv->ReturnRegister());
+    main_jni_conv->Next();
+    if (main_jni_conv->IsCurrentParamInRegister()) {
+      __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
+      __ Call(main_jni_conv->CurrentParamRegister(), Offset(jni_decode_reference_result));
+    } else {
+      __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset());
+      __ CallFromThread(jni_decode_reference_result);
     }
   }  // if (!is_critical_native)
 
@@ -546,8 +522,8 @@
 
   // 7.3. Process pending exceptions from JNI call or monitor exit.
   //      @CriticalNative methods do not need exception poll in the stub.
-  //      @FastNative methods with reference return emit the exception poll earlier.
-  if (LIKELY(!is_critical_native) && (LIKELY(!is_fast_native) || !reference_return)) {
+  //      Methods with reference return emit the exception poll earlier.
+  if (LIKELY(!is_critical_native) && !reference_return) {
     __ ExceptionPoll(exception_slow_path.get());
   }
 
@@ -614,7 +590,14 @@
     __ Jump(transition_to_native_resume.get());
   }
 
-  // 8.3. Suspend check slow path.
+  // 8.3. Slow path for transition to Runnable.
+  if (LIKELY(!is_critical_native && !is_fast_native)) {
+    __ Bind(transition_to_runnable_slow_path.get());
+    __ CallFromThread(QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd));
+    __ Jump(transition_to_runnable_resume.get());
+  }
+
+  // 8.4. Suspend check slow path.
   if (UNLIKELY(is_fast_native)) {
     __ Bind(suspend_check_slow_path.get());
     if (reference_return && main_out_arg_size != 0) {
@@ -634,10 +617,10 @@
     __ Jump(suspend_check_resume.get());
   }
 
-  // 8.4. Exception poll slow path(s).
+  // 8.5. Exception poll slow path(s).
   if (LIKELY(!is_critical_native)) {
     __ Bind(exception_slow_path.get());
-    if (UNLIKELY(is_fast_native) && reference_return) {
+    if (reference_return) {
       // We performed the exception check early, so we need to adjust SP and pop IRT frame.
       if (main_out_arg_size != 0) {
         jni_asm->cfi().AdjustCFAOffset(main_out_arg_size);
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 2fb063f..65be92c 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -25,8 +25,11 @@
 namespace art {
 namespace x86 {
 
-static constexpr Register kManagedCoreArgumentRegisters[] = {
-    EAX, ECX, EDX, EBX
+static constexpr ManagedRegister kManagedCoreArgumentRegisters[] = {
+    X86ManagedRegister::FromCpuRegister(EAX),
+    X86ManagedRegister::FromCpuRegister(ECX),
+    X86ManagedRegister::FromCpuRegister(EDX),
+    X86ManagedRegister::FromCpuRegister(EBX),
 };
 static constexpr size_t kManagedCoreArgumentRegistersCount =
     arraysize(kManagedCoreArgumentRegisters);
@@ -79,6 +82,33 @@
   return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
 }
 
+ArrayRef<const ManagedRegister> X86JniCallingConvention::ArgumentScratchRegisters() const {
+  DCHECK(!IsCriticalNative());
+  // Exclude EAX or EAX/EDX if they are used as return registers.
+  // Due to the odd ordering of argument registers, use a re-ordered array (pull EDX forward).
+  static constexpr ManagedRegister kArgumentRegisters[] = {
+      X86ManagedRegister::FromCpuRegister(EAX),
+      X86ManagedRegister::FromCpuRegister(EDX),
+      X86ManagedRegister::FromCpuRegister(ECX),
+      X86ManagedRegister::FromCpuRegister(EBX),
+  };
+  static_assert(arraysize(kArgumentRegisters) == kManagedCoreArgumentRegistersCount);
+  static_assert(kManagedCoreArgumentRegisters[0].Equals(kArgumentRegisters[0]));
+  static_assert(kManagedCoreArgumentRegisters[1].Equals(kArgumentRegisters[2]));
+  static_assert(kManagedCoreArgumentRegisters[2].Equals(kArgumentRegisters[1]));
+  static_assert(kManagedCoreArgumentRegisters[3].Equals(kArgumentRegisters[3]));
+  ArrayRef<const ManagedRegister> scratch_regs(kArgumentRegisters);
+  X86ManagedRegister return_reg = ReturnRegister().AsX86();
+  auto return_reg_overlaps = [return_reg](ManagedRegister reg) {
+    return return_reg.Overlaps(reg.AsX86());
+  };
+  if (return_reg_overlaps(scratch_regs[0])) {
+    scratch_regs = scratch_regs.SubArray(/*pos=*/ return_reg_overlaps(scratch_regs[1]) ? 2u : 1u);
+  }
+  DCHECK(std::none_of(scratch_regs.begin(), scratch_regs.end(), return_reg_overlaps));
+  return scratch_regs;
+}
+
 static ManagedRegister ReturnRegisterForShorty(const char* shorty, bool jni) {
   if (shorty[0] == 'F' || shorty[0] == 'D') {
     if (jni) {
@@ -95,15 +125,15 @@
   }
 }
 
-ManagedRegister X86ManagedRuntimeCallingConvention::ReturnRegister() {
+ManagedRegister X86ManagedRuntimeCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty(), false);
 }
 
-ManagedRegister X86JniCallingConvention::ReturnRegister() {
+ManagedRegister X86JniCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty(), true);
 }
 
-ManagedRegister X86JniCallingConvention::IntReturnRegister() {
+ManagedRegister X86JniCallingConvention::IntReturnRegister() const {
   return X86ManagedRegister::FromCpuRegister(EAX);
 }
 
@@ -149,20 +179,19 @@
     if (IsCurrentParamALong()) {
       switch (gpr_arg_count_) {
         case 1:
-          static_assert(kManagedCoreArgumentRegisters[1] == ECX);
-          static_assert(kManagedCoreArgumentRegisters[2] == EDX);
+          static_assert(kManagedCoreArgumentRegisters[1].AsX86().AsCpuRegister() == ECX);
+          static_assert(kManagedCoreArgumentRegisters[2].AsX86().AsCpuRegister() == EDX);
           return X86ManagedRegister::FromRegisterPair(ECX_EDX);
         case 2:
-          static_assert(kManagedCoreArgumentRegisters[2] == EDX);
-          static_assert(kManagedCoreArgumentRegisters[3] == EBX);
+          static_assert(kManagedCoreArgumentRegisters[2].AsX86().AsCpuRegister() == EDX);
+          static_assert(kManagedCoreArgumentRegisters[3].AsX86().AsCpuRegister() == EBX);
           return X86ManagedRegister::FromRegisterPair(EDX_EBX);
         default:
           LOG(FATAL) << "UNREACHABLE";
           UNREACHABLE();
       }
     } else {
-      Register core_reg = kManagedCoreArgumentRegisters[gpr_arg_count_];
-      return X86ManagedRegister::FromCpuRegister(core_reg);
+      return kManagedCoreArgumentRegisters[gpr_arg_count_];
     }
   }
 }
@@ -200,7 +229,6 @@
   if (is_critical_native_) {
     CHECK(!SpillsMethod());
     CHECK(!HasLocalReferenceSegmentState());
-    CHECK(!SpillsReturnValue());
     return 0u;  // There is no managed frame for @CriticalNative.
   }
 
@@ -214,19 +242,6 @@
   DCHECK(HasLocalReferenceSegmentState());
   // Cookie is saved in one of the spilled registers.
 
-  // Plus return value spill area size
-  if (SpillsReturnValue()) {
-    // For 64-bit return values there shall be a 4B alignment gap between
-    // the method pointer and the saved return value.
-    size_t padding = ReturnValueSaveLocation().SizeValue() - method_ptr_size;
-    DCHECK_EQ(padding,
-              (GetReturnType() == Primitive::kPrimLong || GetReturnType() == Primitive::kPrimDouble)
-                  ? 4u
-                  : 0u);
-    total_size += padding;
-    total_size += SizeOfReturnValue();
-  }
-
   return RoundUp(total_size, kStackAlignment);
 }
 
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index f028090..cd7ef5b 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -33,7 +33,7 @@
         gpr_arg_count_(1u) {}  // Skip EAX for ArtMethod*
   ~X86ManagedRuntimeCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
   void ResetIterator(FrameOffset displacement) override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
@@ -58,13 +58,14 @@
                           const char* shorty);
   ~X86JniCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
-  ManagedRegister IntReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
+  ManagedRegister IntReturnRegister() const override;
   // JNI calling convention
   size_t FrameSize() const override;
   size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const override;
+  ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const override;
   uint32_t CoreSpillMask() const override;
   uint32_t FpSpillMask() const override;
   bool IsCurrentParamInRegister() override;
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 469de42..862ee5e 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -26,8 +26,13 @@
 namespace art {
 namespace x86_64 {
 
-static constexpr Register kCoreArgumentRegisters[] = {
-    RDI, RSI, RDX, RCX, R8, R9
+static constexpr ManagedRegister kCoreArgumentRegisters[] = {
+    X86_64ManagedRegister::FromCpuRegister(RDI),
+    X86_64ManagedRegister::FromCpuRegister(RSI),
+    X86_64ManagedRegister::FromCpuRegister(RDX),
+    X86_64ManagedRegister::FromCpuRegister(RCX),
+    X86_64ManagedRegister::FromCpuRegister(R8),
+    X86_64ManagedRegister::FromCpuRegister(R9),
 };
 static_assert(kMaxIntLikeRegisterArguments == arraysize(kCoreArgumentRegisters));
 
@@ -99,6 +104,19 @@
   return ArrayRef<const ManagedRegister>(kNativeCalleeSaveRegisters);
 }
 
+ArrayRef<const ManagedRegister> X86_64JniCallingConvention::ArgumentScratchRegisters() const {
+  DCHECK(!IsCriticalNative());
+  ArrayRef<const ManagedRegister> scratch_regs(kCoreArgumentRegisters);
+  if (kIsDebugBuild) {
+    X86_64ManagedRegister return_reg = ReturnRegister().AsX86_64();
+    auto return_reg_overlaps = [return_reg](ManagedRegister reg) {
+      return return_reg.Overlaps(reg.AsX86_64());
+    };
+    CHECK(std::none_of(scratch_regs.begin(), scratch_regs.end(), return_reg_overlaps));
+  }
+  return scratch_regs;
+}
+
 static ManagedRegister ReturnRegisterForShorty(const char* shorty, bool jni ATTRIBUTE_UNUSED) {
   if (shorty[0] == 'F' || shorty[0] == 'D') {
     return X86_64ManagedRegister::FromXmmRegister(XMM0);
@@ -111,15 +129,15 @@
   }
 }
 
-ManagedRegister X86_64ManagedRuntimeCallingConvention::ReturnRegister() {
+ManagedRegister X86_64ManagedRuntimeCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty(), false);
 }
 
-ManagedRegister X86_64JniCallingConvention::ReturnRegister() {
+ManagedRegister X86_64JniCallingConvention::ReturnRegister() const {
   return ReturnRegisterForShorty(GetShorty(), true);
 }
 
-ManagedRegister X86_64JniCallingConvention::IntReturnRegister() {
+ManagedRegister X86_64JniCallingConvention::IntReturnRegister() const {
   return X86_64ManagedRegister::FromCpuRegister(RAX);
 }
 
@@ -150,8 +168,7 @@
     return X86_64ManagedRegister::FromXmmRegister(fp_reg);
   } else {
     size_t non_fp_arg_number = itr_args_ - itr_float_and_doubles_;
-    Register core_reg = kCoreArgumentRegisters[/* method */ 1u + non_fp_arg_number];
-    return X86_64ManagedRegister::FromCpuRegister(core_reg);
+    return kCoreArgumentRegisters[/* method */ 1u + non_fp_arg_number];
   }
 }
 
@@ -188,7 +205,6 @@
   if (is_critical_native_) {
     CHECK(!SpillsMethod());
     CHECK(!HasLocalReferenceSegmentState());
-    CHECK(!SpillsReturnValue());
     return 0u;  // There is no managed frame for @CriticalNative.
   }
 
@@ -202,13 +218,6 @@
   DCHECK(HasLocalReferenceSegmentState());
   // Cookie is saved in one of the spilled registers.
 
-  // Plus return value spill area size
-  if (SpillsReturnValue()) {
-    // No padding between the method pointer and the return value on arm64.
-    DCHECK_EQ(ReturnValueSaveLocation().SizeValue(), method_ptr_size);
-    total_size += SizeOfReturnValue();
-  }
-
   return RoundUp(total_size, kStackAlignment);
 }
 
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index fda5c0e..483f1f5 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -32,7 +32,7 @@
                                         PointerSize::k64) {}
   ~X86_64ManagedRuntimeCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
   // Managed runtime calling convention
   ManagedRegister MethodRegister() override;
   bool IsCurrentParamInRegister() override;
@@ -53,13 +53,14 @@
                              const char* shorty);
   ~X86_64JniCallingConvention() override {}
   // Calling convention
-  ManagedRegister ReturnRegister() override;
-  ManagedRegister IntReturnRegister() override;
+  ManagedRegister ReturnRegister() const override;
+  ManagedRegister IntReturnRegister() const override;
   // JNI calling convention
   size_t FrameSize() const override;
   size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ArrayRef<const ManagedRegister> CalleeSaveScratchRegisters() const override;
+  ArrayRef<const ManagedRegister> ArgumentScratchRegisters() const override;
   uint32_t CoreSpillMask() const override;
   uint32_t FpSpillMask() const override;
   bool IsCurrentParamInRegister() override;
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 2c1b4be..418cf57 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -1079,6 +1079,45 @@
   ___ Str(scratch, MemOperand(tr, thread_held_mutex_mutator_lock_offset.Int32Value()));
 }
 
+void ArmVIXLJNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kArmPointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kArmPointerSize>(kMutatorLock);
+  constexpr ThreadOffset32 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kArmPointerSize>();
+
+  // There must be at least two scratch registers.
+  DCHECK_GE(scratch_regs.size(), 2u);
+  DCHECK(!scratch_regs[0].AsArm().Overlaps(return_reg.AsArm()));
+  vixl32::Register scratch = AsVIXLRegister(scratch_regs[0].AsArm());
+  DCHECK(!scratch_regs[1].AsArm().Overlaps(return_reg.AsArm()));
+  vixl32::Register scratch2 = AsVIXLRegister(scratch_regs[1].AsArm());
+
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  vixl32::Label retry;
+  ___ Bind(&retry);
+  ___ Ldrex(scratch, MemOperand(tr, thread_flags_offset.Int32Value()));
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  ___ Eors(scratch2, scratch, kNativeStateValue);
+  ___ B(ne, ArmVIXLJNIMacroLabel::Cast(label)->AsArm());
+  static_assert(kRunnableStateValue == 0u);
+  ___ Strex(scratch, scratch2, MemOperand(tr, thread_flags_offset.Int32Value()));
+  ___ Cmp(scratch, 0);
+  ___ B(ne, &retry);
+  ___ Dmb(DmbOptions::ISH);  // Memory barrier "load-any" for the "acquire" operation.
+
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  ___ Ldr(scratch, MemOperand(tr, thread_mutator_lock_offset.Int32Value()));
+  ___ Str(scratch, MemOperand(tr, thread_held_mutex_mutator_lock_offset.Int32Value()));
+}
+
 void ArmVIXLJNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   vixl32::Register scratch = temps.Acquire();
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 7b9d7de..426502d 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -185,9 +185,18 @@
   void CallFromThread(ThreadOffset32 offset) override;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index e84fe04..df7bb5e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -917,6 +917,42 @@
   ___ Str(xzr, MEM_OP(reg_x(TR), thread_held_mutex_mutator_lock_offset.Int32Value()));
 }
 
+void Arm64JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs ATTRIBUTE_UNUSED,
+    ManagedRegister return_reg ATTRIBUTE_UNUSED) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kArm64PointerSize>();
+  constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kArm64PointerSize>(kMutatorLock);
+  constexpr ThreadOffset64 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kArm64PointerSize>();
+
+  UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+  Register scratch = temps.AcquireW();
+  Register scratch2 = temps.AcquireW();
+
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  vixl::aarch64::Label retry;
+  ___ Bind(&retry);
+  static_assert(thread_flags_offset.Int32Value() == 0);  // LDAXR/STXR require exact address.
+  ___ Ldaxr(scratch, MEM_OP(reg_x(TR)));
+  ___ Mov(scratch2, kNativeStateValue);
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  ___ Cmp(scratch, scratch2);
+  ___ B(ne, Arm64JNIMacroLabel::Cast(label)->AsArm64());
+  static_assert(kRunnableStateValue == 0u);
+  ___ Stxr(scratch, wzr, MEM_OP(reg_x(TR)));
+  ___ Cbnz(scratch, &retry);
+
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  ___ Ldr(scratch.X(), MEM_OP(reg_x(TR), thread_mutator_lock_offset.Int32Value()));
+  ___ Str(scratch.X(), MEM_OP(reg_x(TR), thread_held_mutex_mutator_lock_offset.Int32Value()));
+}
+
 void Arm64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
   Register scratch = temps.AcquireW();
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index 1c61d96..0fb512e 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -169,9 +169,18 @@
   void CallFromThread(ThreadOffset64 offset) override;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be callee save core registers
+  // (already saved before this call) and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs` which should be core argument registers
+  // not used as return registers and it must preserve the `return_reg` if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index b35066f..2d1de97 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -1,12 +1,12 @@
 const char* const VixlJniHelpersResults = {
   "       0: 2d e9 e0 4d   push.w {r5, r6, r7, r8, r10, r11, lr}\n"
   "       4: 2d ed 10 8a   vpush {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31}\n"
-  "       8: 85 b0         sub sp, #20\n"
+  "       8: 81 b0         sub sp, #4\n"
   "       a: 00 90         str r0, [sp]\n"
-  "       c: 1d 91         str r1, [sp, #116]\n"
-  "       e: 8d ed 1e 0a   vstr s0, [sp, #120]\n"
-  "      12: 1f 92         str r2, [sp, #124]\n"
-  "      14: 20 93         str r3, [sp, #128]\n"
+  "       c: 19 91         str r1, [sp, #100]\n"
+  "       e: 8d ed 1a 0a   vstr s0, [sp, #104]\n"
+  "      12: 1b 92         str r2, [sp, #108]\n"
+  "      14: 1c 93         str r3, [sp, #112]\n"
   "      16: 88 b0         sub sp, #32\n"
   "      18: ad f5 80 5d   sub.w sp, sp, #4096\n"
   "      1c: 08 98         ldr r0, [sp, #32]\n"
@@ -147,13 +147,13 @@
   "     208: cd f8 ff c7   str.w r12, [sp, #2047]\n"
   "     20c: 0d f5 80 5d   add.w sp, sp, #4096\n"
   "     210: 08 b0         add sp, #32\n"
-  "     212: 05 b0         add sp, #20\n"
+  "     212: 01 b0         add sp, #4\n"
   "     214: bd ec 10 8a   vpop {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31}\n"
   "     218: bd e8 e0 4d   pop.w {r5, r6, r7, r8, r10, r11, lr}\n"
   "     21c: d9 f8 24 80   ldr.w r8, [r9, #36]\n"
   "     220: 70 47         bx lr\n"
   "     222: d9 f8 8c 00   ldr.w r0, [r9, #140]\n"
-  "     226: d9 f8 c4 e2   ldr.w lr, [r9, #708]\n"
+  "     226: d9 f8 c0 e2   ldr.w lr, [r9, #704]\n"
   "     22a: f0 47         blx lr\n"
 };
 
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index 659ff4c..0d82458 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -252,9 +252,18 @@
   virtual void CallFromThread(ThreadOffset<kPointerSize> offset) = 0;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be callee-save core registers
+  // (already saved before this call), and must preserve all argument registers.
   virtual void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) = 0;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be core argument registers
+  // not used as return registers, and must preserve `return_reg`, if any.
+  virtual void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                                   ArrayRef<const ManagedRegister> scratch_regs,
+                                                   ManagedRegister return_reg) = 0;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   virtual void SuspendCheck(JNIMacroLabel* label) = 0;
 
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 8be2a32..1a0d521 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -618,6 +618,57 @@
                 Immediate(0));
 }
 
+void X86JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kX86PointerSize>();
+  constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86PointerSize>(kMutatorLock);
+  constexpr ThreadOffset32 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kX86PointerSize>();
+
+  size_t scratch_index = 0u;
+  auto get_scratch_reg = [&]() {
+    while (true) {
+      DCHECK_LT(scratch_index, scratch_regs.size());
+      X86ManagedRegister scratch_reg = scratch_regs[scratch_index].AsX86();
+      ++scratch_index;
+      DCHECK(!scratch_reg.Overlaps(return_reg.AsX86()));
+      if (scratch_reg.AsCpuRegister() != EAX) {
+        return scratch_reg.AsCpuRegister();
+      }
+    }
+  };
+  Register scratch = get_scratch_reg();
+  bool preserve_eax = return_reg.AsX86().Overlaps(X86ManagedRegister::FromCpuRegister(EAX));
+  Register saved_eax = preserve_eax ? get_scratch_reg() : kNoRegister;
+
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  if (preserve_eax) {
+    __ movl(saved_eax, EAX);  // Save EAX.
+  }
+  __ movl(EAX, Immediate(kNativeStateValue));
+  static_assert(kRunnableStateValue == 0u);
+  __ xorl(scratch, scratch);
+  __ fs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value()), scratch);
+  // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  if (preserve_eax) {
+    __ movl(EAX, saved_eax);  // Restore EAX; MOV does not change flags.
+  }
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  __ j(kNotZero, X86JNIMacroLabel::Cast(label)->AsX86());
+
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  __ fs()->movl(scratch, Address::Absolute(thread_mutator_lock_offset.Uint32Value()));
+  __ fs()->movl(Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value()),
+                scratch);
+}
+
 void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()),
                  Immediate(Thread::SuspendOrCheckpointRequestFlags()));
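
The fast path emitted above is, conceptually, an acquire CAS of the thread's packed state-and-flags
word from the Native state (with no flag bits set) to Runnable, followed by recording the mutator
lock as held. The following self-contained C++ sketch mirrors that logic; the toy types, the
placeholder state encoding and the single `held_mutator_lock` field are simplifications for
illustration, not ART's real data structures.

    #include <atomic>
    #include <cstdint>

    // Simplified stand-ins: in ART the state shares one 32-bit word with the flags
    // and held mutexes live in an array indexed by lock level.
    constexpr uint32_t kNativeStateValue = 2u;    // placeholder encoding
    constexpr uint32_t kRunnableStateValue = 0u;  // matches the static_assert above

    struct ToyThread {
      std::atomic<uint32_t> state_and_flags{kNativeStateValue};
      const void* held_mutator_lock = nullptr;
    };

    // Returns true on the fast path; false means "take the slow path" because a
    // flag is set or the state is not Native (mirrors the `j(kNotZero, ...)` above).
    bool TryToTransitionFromNativeToRunnable(ToyThread* self, const void* mutator_lock) {
      uint32_t expected = kNativeStateValue;  // any set flag bit makes the CAS fail
      if (!self->state_and_flags.compare_exchange_strong(
              expected, kRunnableStateValue, std::memory_order_acquire)) {
        return false;
      }
      // Mirrors the store to `self->tlsPtr_.held_mutexes[kMutatorLock]`.
      self->held_mutator_lock = mutator_lock;
      return true;
    }
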
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 0af6371..7fe0e42 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -161,9 +161,18 @@
   void CallFromThread(ThreadOffset32 offset) override;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be callee-save core registers
+  // (already saved before this call), and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be core argument registers
+  // not used as return registers, and must preserve `return_reg`, if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index b25d5c7..8a90a13 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -698,6 +698,52 @@
       Immediate(0));
 }
 
+void X86_64JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+    JNIMacroLabel* label,
+    ArrayRef<const ManagedRegister> scratch_regs,
+    ManagedRegister return_reg) {
+  constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+  constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+  constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kX86_64PointerSize>();
+  constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+      Thread::HeldMutexOffset<kX86_64PointerSize>(kMutatorLock);
+  constexpr ThreadOffset64 thread_mutator_lock_offset =
+      Thread::MutatorLockOffset<kX86_64PointerSize>();
+
+  DCHECK_GE(scratch_regs.size(), 2u);
+  DCHECK(!scratch_regs[0].AsX86_64().Overlaps(return_reg.AsX86_64()));
+  CpuRegister scratch = scratch_regs[0].AsX86_64().AsCpuRegister();
+  DCHECK(!scratch_regs[1].AsX86_64().Overlaps(return_reg.AsX86_64()));
+  CpuRegister saved_rax = scratch_regs[1].AsX86_64().AsCpuRegister();
+  CpuRegister rax(RAX);
+  bool preserve_rax = return_reg.AsX86_64().Overlaps(X86_64ManagedRegister::FromCpuRegister(RAX));
+
+  // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+  if (preserve_rax) {
+    __ movq(saved_rax, rax);  // Save RAX.
+  }
+  __ movl(rax, Immediate(kNativeStateValue));
+  static_assert(kRunnableStateValue == 0u);
+  __ xorl(scratch, scratch);
+  __ gs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value(), /*no_rip=*/ true),
+                        scratch);
+  // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+  if (preserve_rax) {
+    __ movq(rax, saved_rax);  // Restore RAX; MOV does not change flags.
+  }
+  // If any flags are set, or the state is not Native, go to the slow path.
+  // (While the thread can theoretically transition between different Suspended states,
+  // it would be very unexpected to see a state other than Native at this point.)
+  __ j(kNotZero, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
+
+  // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+  __ gs()->movq(scratch,
+                Address::Absolute(thread_mutator_lock_offset.Uint32Value(), /*no_rip=*/ true));
+  __ gs()->movq(
+      Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value(), /*no_rip=*/ true),
+      scratch);
+}
+
 void X86_64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
   __ gs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
                  Immediate(Thread::SuspendOrCheckpointRequestFlags()));
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 6eb7873..c46d5c6 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -181,9 +181,18 @@
   void CallFromThread(ThreadOffset64 offset) override;
 
   // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be callee-save core registers
+  // (already saved before this call), and must preserve all argument registers.
   void TryToTransitionFromRunnableToNative(
       JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
 
+  // Generate fast-path for transition to Runnable. Go to `label` if any thread flag is set.
+  // The implementation can use `scratch_regs`, which should be core argument registers
+  // not used as return registers, and must preserve `return_reg`, if any.
+  void TryToTransitionFromNativeToRunnable(JNIMacroLabel* label,
+                                           ArrayRef<const ManagedRegister> scratch_regs,
+                                           ManagedRegister return_reg) override;
+
   // Generate suspend check and branch to `label` if there is a pending suspend request.
   void SuspendCheck(JNIMacroLabel* label) override;
 
diff --git a/dex2oat/linker/oat_writer_test.cc b/dex2oat/linker/oat_writer_test.cc
index cca5bc2..8663d8b 100644
--- a/dex2oat/linker/oat_writer_test.cc
+++ b/dex2oat/linker/oat_writer_test.cc
@@ -505,7 +505,7 @@
   EXPECT_EQ(64U, sizeof(OatHeader));
   EXPECT_EQ(4U, sizeof(OatMethodOffsets));
   EXPECT_EQ(4U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(168 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
+  EXPECT_EQ(167 * static_cast<size_t>(GetInstructionSetPointerSize(kRuntimeISA)),
             sizeof(QuickEntryPoints));
 }
 
diff --git a/runtime/arch/arm/jni_entrypoints_arm.S b/runtime/arch/arm/jni_entrypoints_arm.S
index d1431cf..b3d89f9 100644
--- a/runtime/arch/arm/jni_entrypoints_arm.S
+++ b/runtime/arch/arm/jni_entrypoints_arm.S
@@ -50,12 +50,39 @@
     .ifnc \arg1, none
         mov r0, \arg1                     @ Pass arg1.
     .endif
-    bl    \cxx_name                       @ Call cxx_name(...).
+    bl     \cxx_name                      @ Call cxx_name(...).
     // Restore args and R4 and return.
     RESTORE_MANAGED_ARGS_R4_AND_RETURN /*restore_cfa*/ 0
 END \name
 .endm
 
+.macro JNI_SAVE_RETURN_VALUE_TRAMPOLINE name, cxx_name, arg1, arg2 = "none", label = "none"
+    .extern \cxx_name
+ENTRY \name
+    .ifnc \label, none
+        \label:
+    .endif
+    // Save GPR return registers and return address. Also save r4 for stack alignment.
+    push   {r0-r1, r4, lr}
+    .cfi_adjust_cfa_offset 16
+    .cfi_rel_offset lr, 12
+    // Save FPR return registers.
+    vpush  {s0-s1}
+    .cfi_adjust_cfa_offset 8
+    // Call `cxx_name()`.
+    mov r0, \arg1                         @ Pass arg1.
+    .ifnc \arg2, none
+        mov r1, \arg2                     @ Pass arg2.
+    .endif
+    bl     \cxx_name                      @ Call cxx_name(...).
+    // Restore FPR return registers.
+    vpop   {s0-s1}
+    .cfi_adjust_cfa_offset -8
+    // Restore GPR return registers and r4 and return.
+    pop    {r0-r1, r4, pc}
+END \name
+.endm
+
     /*
      * Jni dlsym lookup stub.
      */
@@ -298,13 +325,22 @@
      */
 JNI_SAVE_MANAGED_ARGS_TRAMPOLINE art_jni_method_start, artJniMethodStart, rSELF
 
-
     /*
      * Trampoline to `artJniMonitoredMethodStart()` that preserves all managed arguments.
      */
 JNI_SAVE_MANAGED_ARGS_TRAMPOLINE art_jni_monitored_method_start, artJniMonitoredMethodStart, rSELF
 
     /*
+     * Trampoline to `artJniMethodEnd()` that preserves all return registers.
+     */
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE art_jni_method_end, artJniMethodEnd, rSELF
+
+    /*
+     * Trampoline to `artJniMonitoredMethodEnd()` that preserves all return registers.
+     */
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE art_jni_monitored_method_end, artJniMonitoredMethodEnd, rSELF
+
+    /*
      * Entry from JNI stub that tries to lock the object in a fast path and
      * calls `artLockObjectFromCode()` (the same as for managed code) for the
      * difficult cases, may block for GC.
@@ -376,26 +412,8 @@
      *     Callee-save registers have been saved and can be used as temporaries.
      *     Return registers r0-r1 and s0-s1 need to be preserved.
      */
-    .extern artJniUnlockObject
-ENTRY art_jni_unlock_object_no_inline
     // This is also the slow path for art_jni_unlock_object.
-    // Note that we need a local label as the assembler emits bad instructions
-    // for CBZ/CBNZ if we try to jump to `art_jni_unlock_object_no_inline`.
-.Lunlock_object_jni_slow:
-    // Save GPR return registers and return address. Also save r4 for stack alignment.
-    push   {r0-r1, r4, lr}
-    .cfi_adjust_cfa_offset 16
-    .cfi_rel_offset lr, 12
-    // Save FPR return registers.
-    vpush  {s0-s1}
-    .cfi_adjust_cfa_offset 8
-    // Call `artJniUnlockObject()`.
-    mov    r0, r4                       @ Pass the object to unlock.
-    mov    r1, rSELF                    @ Pass Thread::Current().
-    bl     artJniUnlockObject           @ (Object* obj, Thread*)
-    // Restore FPR return registers.
-    vpop   {s0-s1}
-    .cfi_adjust_cfa_offset -8
-    // Restore GPR return registers and r4 and return.
-    pop    {r0-r1, r4, pc}
-END art_jni_unlock_object_no_inline
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE art_jni_unlock_object_no_inline, artJniUnlockObject, r4, rSELF, \
+    /* Note that we need a local label as the assembler emits bad instructions                */ \
+    /* for CBZ/CBNZ if we try to jump to `art_jni_unlock_object_no_inline`.                   */ \
+    .Lunlock_object_jni_slow
diff --git a/runtime/arch/arm64/jni_entrypoints_arm64.S b/runtime/arch/arm64/jni_entrypoints_arm64.S
index fa071fd..8872362 100644
--- a/runtime/arch/arm64/jni_entrypoints_arm64.S
+++ b/runtime/arch/arm64/jni_entrypoints_arm64.S
@@ -44,25 +44,48 @@
 .endm
 
 .macro JNI_SAVE_MANAGED_ARGS_TRAMPOLINE name, cxx_name, arg1 = "none"
-    .extern cxx_name
+    .extern \cxx_name
 ENTRY \name
     // Save args and LR.
     SAVE_ALL_ARGS_INCREASE_FRAME /*padding*/ 8 + /*LR*/ 8
-    str   lr, [sp, #(ALL_ARGS_SIZE + /*padding*/ 8)]
+    str    lr, [sp, #(ALL_ARGS_SIZE + /*padding*/ 8)]
     .cfi_rel_offset lr, ALL_ARGS_SIZE + /*padding*/ 8
     // Call `cxx_name()`.
     .ifnc \arg1, none
         mov x0, \arg1                          // Pass arg1.
     .endif
-    bl    \cxx_name                            // Call cxx_name(...).
+    bl     \cxx_name                           // Call cxx_name(...).
     // Restore LR and args and return.
-    ldr   lr, [sp, #(ALL_ARGS_SIZE + /*padding*/ 8)]
+    ldr    lr, [sp, #(ALL_ARGS_SIZE + /*padding*/ 8)]
     .cfi_restore lr
     RESTORE_ALL_ARGS_DECREASE_FRAME /*padding*/ 8 + /*LR*/ 8
     ret
 END \name
 .endm
 
+.macro JNI_SAVE_RETURN_VALUE_TRAMPOLINE name, cxx_name, arg1, arg2 = "none"
+    .extern \cxx_name
+ENTRY \name
+    // Save return registers and return address.
+    stp    x0, lr, [sp, #-32]!
+    .cfi_adjust_cfa_offset 32
+    .cfi_rel_offset lr, 8
+    str    d0, [sp, #16]
+    // Call `cxx_name()`.
+    mov    x0, \arg1                           // Pass arg1.
+    .ifnc \arg2, none
+        mov x1, \arg2                          // Pass arg2.
+    .endif
+    bl     \cxx_name                           // Call cxx_name(...).
+    // Restore return registers and return.
+    ldr    d0, [sp, #16]
+    ldp    x0, lr, [sp], #32
+    .cfi_adjust_cfa_offset -32
+    .cfi_restore lr
+    ret
+END \name
+.endm
+
     /*
      * Jni dlsym lookup stub.
      */
@@ -349,6 +372,16 @@
 JNI_SAVE_MANAGED_ARGS_TRAMPOLINE art_jni_monitored_method_start, artJniMonitoredMethodStart, xSELF
 
     /*
+     * Trampoline to `artJniMethodEnd()` that preserves all return registers.
+     */
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE art_jni_method_end, artJniMethodEnd, xSELF
+
+    /*
+     * Trampoline to `artJniMonitoredMethodEnd()` that preserves all return registers.
+     */
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE art_jni_monitored_method_end, artJniMonitoredMethodEnd, xSELF
+
+    /*
      * Entry from JNI stub that tries to lock the object in a fast path and
      * calls `artLockObjectFromCode()` (the same as for managed code) for the
      * difficult cases, may block for GC.
@@ -419,22 +452,5 @@
      *     Callee-save registers have been saved and can be used as temporaries.
      *     Return registers r0 and d0 need to be preserved.
      */
-    .extern artJniUnlockObject
-ENTRY art_jni_unlock_object_no_inline
     // This is also the slow path for art_jni_unlock_object.
-    // Save return registers and return address.
-    stp    x0, lr, [sp, #-32]!
-    .cfi_adjust_cfa_offset 32
-    .cfi_rel_offset lr, 8
-    str    d0, [sp, #16]
-    // Call `artJniUnlockObject()`.
-    mov    x0, x15                    // Pass the object to unlock.
-    mov    x1, xSELF                  // Pass Thread::Current().
-    bl     artJniUnlockObject         // (Object* obj, Thread*)
-    // Restore return registers and return.
-    ldr    d0, [sp, #16]
-    ldp    x0, lr, [sp], #32
-    .cfi_adjust_cfa_offset -32
-    .cfi_restore lr
-    ret
-END art_jni_unlock_object_no_inline
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE art_jni_unlock_object_no_inline, artJniUnlockObject, x15, xSELF
diff --git a/runtime/arch/x86/jni_entrypoints_x86.S b/runtime/arch/x86/jni_entrypoints_x86.S
index d1d0f41..d827509 100644
--- a/runtime/arch/x86/jni_entrypoints_x86.S
+++ b/runtime/arch/x86/jni_entrypoints_x86.S
@@ -62,6 +62,31 @@
 END_FUNCTION \name
 END_MACRO
 
+MACRO4(JNI_SAVE_RETURN_VALUE_TRAMPOLINE, name, cxx_name, arg1, arg2)
+DEFINE_FUNCTION \name
+    // Save return registers.
+    PUSH_ARG edx
+    PUSH_ARG eax
+    .ifnc \arg2, none
+        INCREASE_FRAME /*mmx0*/ 8 + /*padding*/ 4
+        movsd %xmm0, 0(%esp)
+        PUSH_ARG RAW_VAR(arg2)    // Pass arg2.
+    .else
+        INCREASE_FRAME /*padding*/ 4 + /*mmx0*/ 8 + /*padding*/ 4
+        movsd %xmm0, 4(%esp)
+    .endif
+    // Call `cxx_name()`.
+    PUSH_ARG RAW_VAR(arg1)        // Pass arg1.
+    call CALLVAR(cxx_name)        // Call cxx_name(...).
+    // Restore return registers and return.
+    movsd 8(%esp), %xmm0
+    DECREASE_FRAME /*call args*/ 8 + /*xmm0*/ 8 + /*padding*/ 4
+    POP_ARG eax
+    POP_ARG edx
+    ret
+END_FUNCTION \name
+END_MACRO
+
     /*
      * Jni dlsym lookup stub.
      */
@@ -267,6 +292,17 @@
     art_jni_monitored_method_start, artJniMonitoredMethodStart, fs:THREAD_SELF_OFFSET
 
     /*
+     * Trampoline to `artJniMethodEnd()` that preserves all return registers.
+     */
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE art_jni_method_end, artJniMethodEnd, fs:THREAD_SELF_OFFSET, none
+
+    /*
+     * Trampoline to `artJniMonitoredMethodEnd()` that preserves all return registers.
+     */
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE \
+    art_jni_monitored_method_end, artJniMonitoredMethodEnd, fs:THREAD_SELF_OFFSET, none
+
+    /*
      * Entry from JNI stub that tries to lock the object in a fast path and
      * calls `artLockObjectFromCode()` (the same as for managed code) for the
      * difficult cases, may block for GC.
@@ -346,23 +382,6 @@
      *     Callee-save registers have been saved and can be used as temporaries (except EBP).
      *     Return registers EAX, EDX and mmx0 need to be preserved.
      */
-DEFINE_FUNCTION art_jni_unlock_object_no_inline
     // This is also the slow path for art_jni_unlock_object.
-    // Save return registers.
-    PUSH_ARG edx
-    PUSH_ARG eax
-    INCREASE_FRAME /*mmx0*/ 8 + /*padding*/ 4
-    movsd %xmm0, 0(%esp)
-    // Note: The stack is not 16-byte aligned here but it shall be after pushing args for the call.
-    // Call `artJniUnlockObject()`.
-    pushl %fs:THREAD_SELF_OFFSET          // Pass Thread::Current().
-    CFI_ADJUST_CFA_OFFSET(4)
-    PUSH_ARG ebp                          // Pass the object to unlock.
-    call SYMBOL(artJniUnlockObject)       // (object, Thread*)
-    // Restore return registers and return.
-    movsd 8(%esp), %xmm0
-    DECREASE_FRAME /*call args*/ 8 + /*xmm0*/ 8 + /*padding*/ 4
-    POP_ARG eax
-    POP_ARG edx
-    ret
-END_FUNCTION art_jni_unlock_object_no_inline
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE \
+    art_jni_unlock_object_no_inline, artJniUnlockObject, ebp, fs:THREAD_SELF_OFFSET
diff --git a/runtime/arch/x86_64/jni_entrypoints_x86_64.S b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
index b5d3bd1..0d5fa3f 100644
--- a/runtime/arch/x86_64/jni_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
@@ -71,6 +71,26 @@
 END_FUNCTION \name
 END_MACRO
 
+MACRO4(JNI_SAVE_RETURN_VALUE_TRAMPOLINE, name, cxx_name, arg1, arg2)
+DEFINE_FUNCTION \name
+    // Save return registers.
+    PUSH_ARG rax
+    INCREASE_FRAME /*mmx0*/ 8 + /*padding*/ 8
+    movsd %xmm0, 0(%rsp)
+    // Call `cxx_name()`.
+    mov REG_VAR(arg1), %rdi         // Pass arg1.
+    .ifnc \arg2, none
+        mov REG_VAR(arg2), %rsi     // Pass arg2.
+    .endif
+    call CALLVAR(cxx_name)          // Call cxx_name(...).
+    // Restore return registers and return.
+    movsd 0(%rsp), %xmm0
+    DECREASE_FRAME /*mmx0*/ 8 + /*padding*/ 8
+    POP_ARG rax
+    ret
+END_FUNCTION \name
+END_MACRO
+
     /*
      * Jni dlsym lookup stub.
      */
@@ -386,6 +406,17 @@
     art_jni_monitored_method_start, artJniMonitoredMethodStart, gs:THREAD_SELF_OFFSET
 
     /*
+     * Trampoline to `artJniMethodEnd()` that preserves all return registers.
+     */
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE art_jni_method_end, artJniMethodEnd, gs:THREAD_SELF_OFFSET, none
+
+    /*
+     * Trampoline to `artJniMonitoredMethodEnd()` that preserves all return registers.
+     */
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE \
+    art_jni_monitored_method_end, artJniMonitoredMethodEnd, gs:THREAD_SELF_OFFSET, none
+
+    /*
      * Entry from JNI stub that tries to lock the object in a fast path and
      * calls `artLockObjectFromCode()` (the same as for managed code) for the
      * difficult cases, may block for GC.
@@ -455,19 +486,6 @@
      *     Callee-save registers have been saved and can be used as temporaries (except RBX).
      *     Return registers RAX and mmx0 need to be preserved.
      */
-DEFINE_FUNCTION art_jni_unlock_object_no_inline
     // This is also the slow path for art_jni_unlock_object.
-    // Save return registers and return address.
-    PUSH_ARG rax
-    INCREASE_FRAME /*mmx0*/ 8 + /*padding*/ 8
-    movsd %xmm0, 0(%rsp)
-    // Call `artJniUnlockObject()`.
-    movq %rbx, %rdi                       // Pass the object to unlock.
-    movq %gs:THREAD_SELF_OFFSET, %rsi     // Pass Thread::Current().
-    call SYMBOL(artJniUnlockObject)       // (object, Thread*)
-    // Restore return registers and return.
-    movsd 0(%rsp), %xmm0
-    DECREASE_FRAME /*mmx0*/ 8 + /*padding*/ 8
-    POP_ARG rax
-    ret
-END_FUNCTION art_jni_unlock_object_no_inline
+JNI_SAVE_RETURN_VALUE_TRAMPOLINE \
+    art_jni_unlock_object_no_inline, artJniUnlockObject, rbx, gs:THREAD_SELF_OFFSET
diff --git a/runtime/entrypoints/quick/quick_default_externs.h b/runtime/entrypoints/quick/quick_default_externs.h
index b3c6c02..f8856d8 100644
--- a/runtime/entrypoints/quick/quick_default_externs.h
+++ b/runtime/entrypoints/quick/quick_default_externs.h
@@ -120,6 +120,8 @@
 // JNI method start entrypoint. Note: Custom calling convention.
 extern "C" void art_jni_method_start();
 extern "C" void art_jni_monitored_method_start();
+extern "C" void art_jni_method_end();
+extern "C" void art_jni_monitored_method_end();
 
 // JNI lock/unlock entrypoints. Note: Custom calling convention.
 extern "C" void art_jni_lock_object(art::mirror::Object*);
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index 6f3c8d0..eec7ca9 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -75,8 +75,7 @@
 
   // JNI
   qpoints->pJniMethodStart = art_jni_method_start;
-  qpoints->pJniMethodEnd = JniMethodEnd;
-  qpoints->pJniMethodEndWithReference = JniMethodEndWithReference;
+  qpoints->pJniMethodEnd = art_jni_method_end;
   qpoints->pQuickGenericJniTrampoline = art_quick_generic_jni_trampoline;
   qpoints->pJniDecodeReferenceResult = JniDecodeReferenceResult;
   qpoints->pJniReadBarrier = art_jni_read_barrier;
@@ -138,8 +137,7 @@
 
   if (monitor_jni_entry_exit) {
     qpoints->pJniMethodStart = art_jni_monitored_method_start;
-    qpoints->pJniMethodEnd = JniMonitoredMethodEnd;
-    qpoints->pJniMethodEndWithReference = JniMonitoredMethodEndWithReference;
+    qpoints->pJniMethodEnd = art_jni_monitored_method_end;
   }
 }
 
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index 4580cfb..2b9f2f3 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -53,33 +53,25 @@
 
 
 // JNI entrypoints.
-// TODO: NO_THREAD_SAFETY_ANALYSIS due to different control paths depending on fast JNI.
-extern "C" void artJniMethodStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-extern void JniMethodEnd(Thread* self)
-    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-extern mirror::Object* JniMethodEndWithReference(jobject result, Thread* self)
-    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
+extern "C" void artJniMethodStart(Thread* self) UNLOCK_FUNCTION(Locks::mutator_lock_) HOT_ATTR;
+extern "C" void artJniMethodEnd(Thread* self) SHARED_LOCK_FUNCTION(Locks::mutator_lock_) HOT_ATTR;
 extern mirror::Object* JniDecodeReferenceResult(jobject result, Thread* self)
-    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-
-// JNI entrypoints when monitoring entry/exit.
-extern "C" void artJniMonitoredMethodStart(Thread* self) NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-extern void JniMonitoredMethodEnd(Thread* self)
-    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-extern mirror::Object* JniMonitoredMethodEndWithReference(jobject result, Thread* self)
-    NO_THREAD_SAFETY_ANALYSIS HOT_ATTR;
-
-
-extern "C" mirror::String* artStringBuilderAppend(uint32_t format,
-                                                  const uint32_t* args,
-                                                  Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) HOT_ATTR;
-
 extern "C" void artJniReadBarrier(ArtMethod* method)
     REQUIRES_SHARED(Locks::mutator_lock_) HOT_ATTR;
 extern "C" void artJniUnlockObject(mirror::Object* locked, Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) HOT_ATTR;
 
+// JNI entrypoints when monitoring entry/exit.
+extern "C" void artJniMonitoredMethodStart(Thread* self) UNLOCK_FUNCTION(Locks::mutator_lock_);
+extern "C" void artJniMonitoredMethodEnd(Thread* self) SHARED_LOCK_FUNCTION(Locks::mutator_lock_);
+
+// StringAppend pattern entrypoint.
+extern "C" mirror::String* artStringBuilderAppend(uint32_t format,
+                                                  const uint32_t* args,
+                                                  Thread* self)
+    REQUIRES_SHARED(Locks::mutator_lock_) HOT_ATTR;
+
 // Read barrier entrypoints.
 //
 // Compilers for ARM, ARM64 can insert a call to these
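
The annotation change above replaces NO_THREAD_SAFETY_ANALYSIS with real Clang thread-safety
attributes: `artJniMethodStart()` releases the mutator lock and `artJniMethodEnd()` re-acquires it
shared, so the analysis can see that the lock is dropped across the native call and re-held
afterwards. A minimal standalone sketch of that contract, written with raw Clang attributes rather
than ART's macro spellings purely to keep it self-contained:

    // Compile with -Wthread-safety for Clang to check the pairing.
    struct __attribute__((capability("mutex"))) ToyMutatorLock {};
    ToyMutatorLock gToyMutatorLock;

    // Leaving managed code: the shared mutator lock is released.
    void ToyJniMethodStart() __attribute__((release_shared_capability(gToyMutatorLock)));
    // Returning to managed code: the mutator lock is re-acquired shared.
    void ToyJniMethodEnd() __attribute__((acquire_shared_capability(gToyMutatorLock)));
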
diff --git a/runtime/entrypoints/quick/quick_entrypoints_list.h b/runtime/entrypoints/quick/quick_entrypoints_list.h
index b89ff2c..dffaa4b 100644
--- a/runtime/entrypoints/quick/quick_entrypoints_list.h
+++ b/runtime/entrypoints/quick/quick_entrypoints_list.h
@@ -73,8 +73,7 @@
   V(AputObject, void, mirror::Array*, int32_t, mirror::Object*) \
 \
   V(JniMethodStart, void) \
-  V(JniMethodEnd, void, Thread*) \
-  V(JniMethodEndWithReference, mirror::Object*, jobject, Thread*) \
+  V(JniMethodEnd, void) \
   V(JniDecodeReferenceResult, mirror::Object*, jobject, Thread*) \
   V(JniLockObject, void, mirror::Object*) \
   V(JniUnlockObject, void, mirror::Object*) \
diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
index 205dd22..ab13bd9 100644
--- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
@@ -63,22 +63,13 @@
   if (kIsDebugBuild) {
     ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
     CHECK(!native_method->IsFastNative()) << native_method->PrettyMethod();
+    CHECK(!native_method->IsCriticalNative()) << native_method->PrettyMethod();
   }
 
   // Transition out of runnable.
   self->TransitionFromRunnableToSuspended(ThreadState::kNative);
 }
 
-// TODO: NO_THREAD_SAFETY_ANALYSIS due to different control paths depending on fast JNI.
-static void GoToRunnable(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
-  if (kIsDebugBuild) {
-    ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
-    CHECK(!native_method->IsFastNative()) << native_method->PrettyMethod();
-  }
-
-  self->TransitionFromSuspendedToRunnable();
-}
-
 static void PopLocalReferences(uint32_t saved_local_ref_cookie, Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   JNIEnvExt* env = self->GetJniEnv();
@@ -123,8 +114,14 @@
 // TODO: These should probably be templatized or macro-ized.
 // Otherwise there's just too much repetitive boilerplate.
 
-extern void JniMethodEnd(Thread* self) {
-  GoToRunnable(self);
+extern "C" void artJniMethodEnd(Thread* self) {
+  self->TransitionFromSuspendedToRunnable();
+
+  if (kIsDebugBuild) {
+    ArtMethod* native_method = *self->GetManagedStack()->GetTopQuickFrame();
+    CHECK(!native_method->IsFastNative()) << native_method->PrettyMethod();
+    CHECK(!native_method->IsCriticalNative()) << native_method->PrettyMethod();
+  }
 }
 
 extern mirror::Object* JniDecodeReferenceResult(jobject result, Thread* self)
@@ -142,36 +139,13 @@
   return o.Ptr();
 }
 
-// Common result handling for EndWithReference.
-static mirror::Object* JniMethodEndWithReferenceHandleResult(jobject result, Thread* self)
-    NO_THREAD_SAFETY_ANALYSIS {
-  // Must decode before pop. The 'result' may not be valid in case of an exception, though.
-  ObjPtr<mirror::Object> o;
-  if (!self->IsExceptionPending()) {
-    o = self->DecodeJObject(result);
-  }
-  // Process result.
-  if (UNLIKELY(self->GetJniEnv()->IsCheckJniEnabled())) {
-    // CheckReferenceResult can resolve types.
-    StackHandleScope<1> hs(self);
-    HandleWrapperObjPtr<mirror::Object> h_obj(hs.NewHandleWrapper(&o));
-    CheckReferenceResult(h_obj, self);
-  }
-  VerifyObject(o);
-  return o.Ptr();
-}
-
-extern mirror::Object* JniMethodEndWithReference(jobject result, Thread* self) {
-  GoToRunnable(self);
-  return JniMethodEndWithReferenceHandleResult(result, self);
-}
-
 extern uint64_t GenericJniMethodEnd(Thread* self,
                                     uint32_t saved_local_ref_cookie,
                                     jvalue result,
                                     uint64_t result_f,
                                     ArtMethod* called)
-    // TODO: NO_THREAD_SAFETY_ANALYSIS as GoToRunnable() is NO_THREAD_SAFETY_ANALYSIS
+    // NO_THREAD_SAFETY_ANALYSIS because we can enter this function with the mutator lock
+    // unlocked for normal JNI, or locked for @FastNative and @CriticalNative.
     NO_THREAD_SAFETY_ANALYSIS {
   bool critical_native = called->IsCriticalNative();
   bool fast_native = called->IsFastNative();
@@ -180,16 +154,19 @@
   // @CriticalNative does not do a state transition. @FastNative usually does not do a state
   // transition either but it performs a suspend check that may do state transitions.
   if (LIKELY(normal_native)) {
-    MONITOR_JNI(PaletteNotifyEndJniInvocation);
-    GoToRunnable(self);
+    if (UNLIKELY(self->ReadFlag(ThreadFlag::kMonitorJniEntryExit))) {
+      artJniMonitoredMethodEnd(self);
+    } else {
+      artJniMethodEnd(self);
+    }
   } else if (fast_native) {
     // When we are in @FastNative, we are already Runnable.
     DCHECK(Locks::mutator_lock_->IsSharedHeld(self));
     // Only do a suspend check on the way out of JNI just like compiled stubs.
     self->CheckSuspend();
   }
-  // We need the mutator lock (i.e., calling GoToRunnable()) before accessing the shorty or the
-  // locked object.
+  // We need the mutator lock (i.e., calling `artJniMethodEnd()`) before accessing
+  // the shorty or the locked object.
   if (called->IsSynchronized()) {
     DCHECK(normal_native) << "@FastNative/@CriticalNative and synchronize is not supported";
     ObjPtr<mirror::Object> lock = GetGenericJniSynchronizationObject(self, called);
@@ -198,8 +175,8 @@
   }
   char return_shorty_char = called->GetShorty()[0];
   if (return_shorty_char == 'L') {
-    uint64_t ret =
-        reinterpret_cast<uint64_t>(JniMethodEndWithReferenceHandleResult(result.l, self));
+    uint64_t ret = reinterpret_cast<uint64_t>(
+        UNLIKELY(self->IsExceptionPending()) ? nullptr : JniDecodeReferenceResult(result.l, self));
     PopLocalReferences(saved_local_ref_cookie, self);
     return ret;
   } else {
@@ -244,14 +221,9 @@
   MONITOR_JNI(PaletteNotifyBeginJniInvocation);
 }
 
-extern void JniMonitoredMethodEnd(Thread* self) {
+extern "C" void artJniMonitoredMethodEnd(Thread* self) {
   MONITOR_JNI(PaletteNotifyEndJniInvocation);
-  JniMethodEnd(self);
-}
-
-extern mirror::Object* JniMonitoredMethodEndWithReference(jobject result, Thread* self) {
-  MONITOR_JNI(PaletteNotifyEndJniInvocation);
-  return JniMethodEndWithReference(result, self);
+  artJniMethodEnd(self);
 }
 
 }  // namespace art
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 00d5523..240ecbd 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -218,10 +218,7 @@
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pGetObjStatic, pAputObject, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pAputObject, pJniMethodStart, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodStart, pJniMethodEnd, sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEnd, pJniMethodEndWithReference,
-                         sizeof(void*));
-    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEndWithReference,
-                         pJniDecodeReferenceResult, sizeof(void*));
+    EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniMethodEnd, pJniDecodeReferenceResult, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniDecodeReferenceResult,
                          pJniLockObject, sizeof(void*));
     EXPECT_OFFSET_DIFFNP(QuickEntryPoints, pJniLockObject,
diff --git a/runtime/oat.h b/runtime/oat.h
index 04972eb..c2ad8c0 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: JNI: Inline fast-path for `JniMethodStart()`.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '1', '6', '\0' } };
+  // Last oat version changed reason: JNI: Inline fast-path for `JniMethodEnd()`.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '1', '7', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 960a870..fc8e6cb 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -251,6 +251,12 @@
 }
 
 inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
+  // Note: JNI stubs inline a fast path of this method that transitions to Runnable if
+  // there are no flags set and then stores the mutator lock to `held_mutexes[kMutatorLock]`
+  // (this comes from a specialized `BaseMutex::RegisterAsLockedImpl(., kMutatorLock)`
+  // inlined from the `GetMutatorLock()->TransitionFromSuspendedToRunnable(this)` below).
+  // Therefore any code added here (other than debug build assertions) should be gated
+  // on some flag being set, so that the JNI stub can take the slow path to get here.
   StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed);
   ThreadState old_state = old_state_and_flags.GetState();
   DCHECK_NE(old_state, ThreadState::kRunnable);
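
To make the gating requirement above concrete: the inlined JNI fast path only succeeds when the
entire state-and-flags word equals the Native state with no flag bits set, so any set flag diverts
the stub to the `art_jni_method_end` slow path, which ends up in this function where newly added
work can run. A toy model (assumed names and values, not ART's real layout):

    #include <atomic>
    #include <cstdint>

    enum : uint32_t {
      kToyStateNative   = 2u,        // placeholder state encoding
      kToyStateRunnable = 0u,
      kToyFlagNewWork   = 1u << 16,  // hypothetical flag guarding newly added work
    };

    // The JNI stub's inlined fast path: the CAS compares the whole word, so it
    // fails as soon as any flag bit (e.g. kToyFlagNewWork) is set.
    bool ToyStubFastPath(std::atomic<uint32_t>& state_and_flags) {
      uint32_t expected = kToyStateNative;
      return state_and_flags.compare_exchange_strong(expected, kToyStateRunnable,
                                                     std::memory_order_acquire);
    }

    // The runtime slow path (reached via the art_jni_method_end entrypoint):
    // the flag-gated work runs here, invisible to the inlined fast path.
    void ToySlowPathTransition(std::atomic<uint32_t>& state_and_flags) {
      if (state_and_flags.load(std::memory_order_relaxed) & kToyFlagNewWork) {
        // ... newly added work ...
      }
      // ... then perform the real transition with the usual retry loop ...
    }
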
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 184d2c1..7988f88 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -3531,7 +3531,6 @@
   QUICK_ENTRY_POINT_INFO(pAputObject)
   QUICK_ENTRY_POINT_INFO(pJniMethodStart)
   QUICK_ENTRY_POINT_INFO(pJniMethodEnd)
-  QUICK_ENTRY_POINT_INFO(pJniMethodEndWithReference)
   QUICK_ENTRY_POINT_INFO(pJniDecodeReferenceResult)
   QUICK_ENTRY_POINT_INFO(pJniLockObject)
   QUICK_ENTRY_POINT_INFO(pJniUnlockObject)
diff --git a/runtime/thread.h b/runtime/thread.h
index 7d76956..3c358d8 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -958,6 +958,12 @@
   }
 
   template<PointerSize pointer_size>
+  static constexpr ThreadOffset<pointer_size> MutatorLockOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
+                                                                mutator_lock));
+  }
+
+  template<PointerSize pointer_size>
   static constexpr ThreadOffset<pointer_size> HeldMutexOffset(LockLevel level) {
     DCHECK_LT(enum_cast<size_t>(level), arraysize(tlsPtr_.held_mutexes));
     return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,