Allow late lookup for @CriticalNative methods.

Test: Add and enable tests in 178-app-image-native-method
Test: Add and enable tests in jni_compiler_test
Test: Manually step through the new stub in GDB and check
      that backtrace works at various points.
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 112189621
Change-Id: If094e5062acbb99eefa88f2afb4815f93730cb82
diff --git a/runtime/arch/arm/asm_support_arm.S b/runtime/arch/arm/asm_support_arm.S
index 633591d..5b51e51 100644
--- a/runtime/arch/arm/asm_support_arm.S
+++ b/runtime/arch/arm/asm_support_arm.S
@@ -149,4 +149,124 @@
 #endif  // USE_HEAP_POISONING
 .endm
 
+// Macro to refresh the Marking Register (R8).
+//
+// This macro must be called at the end of functions implementing
+// entrypoints that possibly (directly or indirectly) perform a
+// suspend check (before they return).
+.macro REFRESH_MARKING_REGISTER
+#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
+    ldr rMR, [rSELF, #THREAD_IS_GC_MARKING_OFFSET]
+#endif
+.endm
+
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs), except for storing the method.
+     */
+.macro SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
+    // Note: We could avoid saving R8 in the case of Baker read
+    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
+    push {r1-r3, r5-r8, r10-r11, lr}   @ 10 words of callee saves and args.
+    .cfi_adjust_cfa_offset 40
+    .cfi_rel_offset r1, 0
+    .cfi_rel_offset r2, 4
+    .cfi_rel_offset r3, 8
+    .cfi_rel_offset r5, 12
+    .cfi_rel_offset r6, 16
+    .cfi_rel_offset r7, 20
+    .cfi_rel_offset r8, 24
+    .cfi_rel_offset r10, 28
+    .cfi_rel_offset r11, 32
+    .cfi_rel_offset lr, 36
+    vpush {s0-s15}                     @ 16 words of float args.
+    .cfi_adjust_cfa_offset 64
+    sub sp, #8                         @ 2 words of space, alignment padding and Method*
+    .cfi_adjust_cfa_offset 8
+    // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 40 + 64 + 8)
+#error "FRAME_SIZE_SAVE_REFS_AND_ARGS(ARM) size not as expected."
+#endif
+.endm
+
+.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    add  sp, #8                      @ rewind sp
+    .cfi_adjust_cfa_offset -8
+    vpop {s0-s15}
+    .cfi_adjust_cfa_offset -64
+    // Note: Likewise, we could avoid restoring R8 in the case of Baker
+    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
+    pop {r1-r3, r5-r8, r10-r11, lr}  @ 10 words of callee saves and args.
+    .cfi_restore r1
+    .cfi_restore r2
+    .cfi_restore r3
+    .cfi_restore r5
+    .cfi_restore r6
+    .cfi_restore r7
+    .cfi_restore r8
+    .cfi_restore r10
+    .cfi_restore r11
+    .cfi_restore lr
+    .cfi_adjust_cfa_offset -40
+.endm
+
+    /*
+     * Macro to spill the GPRs.
+     */
+.macro SPILL_ALL_CALLEE_SAVE_GPRS
+    push {r4-r11, lr}                             @ 9 words (36 bytes) of callee saves.
+    .cfi_adjust_cfa_offset 36
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r5, 4
+    .cfi_rel_offset r6, 8
+    .cfi_rel_offset r7, 12
+    .cfi_rel_offset r8, 16
+    .cfi_rel_offset r9, 20
+    .cfi_rel_offset r10, 24
+    .cfi_rel_offset r11, 28
+    .cfi_rel_offset lr, 32
+.endm
+
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveAllCalleeSaves)
+     */
+.macro SETUP_SAVE_ALL_CALLEE_SAVES_FRAME rTemp
+    SPILL_ALL_CALLEE_SAVE_GPRS                    @ 9 words (36 bytes) of callee saves.
+    vpush {s16-s31}                               @ 16 words (64 bytes) of floats.
+    .cfi_adjust_cfa_offset 64
+    sub sp, #12                                   @ 3 words of space, bottom word will hold Method*
+    .cfi_adjust_cfa_offset 12
+    RUNTIME_CURRENT1 \rTemp                       @ Load Runtime::Current into rTemp.
+    @ Load kSaveAllCalleeSaves Method* into rTemp.
+    ldr \rTemp, [\rTemp, #RUNTIME_SAVE_ALL_CALLEE_SAVES_METHOD_OFFSET]
+    str \rTemp, [sp, #0]                          @ Place Method* at bottom of stack.
+    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
+
+     // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVES != 36 + 64 + 12)
+#error "FRAME_SIZE_SAVE_ALL_CALLEE_SAVES(ARM) size not as expected."
+#endif
+.endm
+
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
+     */
+.macro DELIVER_PENDING_EXCEPTION_FRAME_READY
+    mov    r0, rSELF                           @ pass Thread::Current
+    bl     artDeliverPendingExceptionFromCode  @ artDeliverPendingExceptionFromCode(Thread*)
+.endm
+
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_.
+     */
+.macro DELIVER_PENDING_EXCEPTION
+    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r0       @ save callee saves for throw
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+.endm
+
 #endif  // ART_RUNTIME_ARCH_ARM_ASM_SUPPORT_ARM_S_
diff --git a/runtime/arch/arm/jni_entrypoints_arm.S b/runtime/arch/arm/jni_entrypoints_arm.S
index a0f93cc..ceef772 100644
--- a/runtime/arch/arm/jni_entrypoints_arm.S
+++ b/runtime/arch/arm/jni_entrypoints_arm.S
@@ -33,12 +33,13 @@
     .cfi_adjust_cfa_offset 12
 
     mov    r0, rSELF                      @ pass Thread::Current()
-    // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable for @FastNative.
+    // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable()
+    // for @FastNative or @CriticalNative.
     ldr    ip, [r0, #THREAD_TOP_QUICK_FRAME_OFFSET]   // uintptr_t tagged_quick_frame
     bic    ip, #1                                     // ArtMethod** sp
     ldr    ip, [ip]                                   // ArtMethod* method
     ldr    ip, [ip, #ART_METHOD_ACCESS_FLAGS_OFFSET]  // uint32_t access_flags
-    tst    ip, #ACCESS_FLAGS_METHOD_IS_FAST_NATIVE
+    tst    ip, #(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE | ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE)
     bne    .Llookup_stub_fast_native
     blx    artFindNativeMethod
     b      .Llookup_stub_continue
@@ -61,3 +62,195 @@
 1:
     pop    {r0, r1, r2, r3, pc}           @ restore regs and return to caller to handle exception
 END art_jni_dlsym_lookup_stub
+
+ENTRY art_jni_dlsym_lookup_critical_stub
+    // The hidden arg holding the tagged method (bit 0 set means GenericJNI) is r4.
+    // For Generic JNI we already have a managed frame, so we reuse the art_jni_dlsym_lookup_stub.
+    tst    r4, #1
+    bne art_jni_dlsym_lookup_stub
+
+    // We need to create a GenericJNI managed frame above the stack args.
+
+    // GenericJNI frame is similar to SaveRefsAndArgs frame with the native method
+    // instead of runtime method saved at the bottom. Note that the runtime shall
+    // not examine the args here, otherwise we would have to move them in registers
+    // and stack to account for the difference between managed and native ABIs.
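+    // For orientation, the frame built below has this layout (offsets are
+    // relative to the new SP, as set up by the macro and strd that follow):
+    //   [sp, #108]       caller's return PC (LR)
+    //   [sp, #72..#104]  GPR args r1-r3 and callee saves r5-r8, r10-r11
+    //   [sp, #8..#68]    FP args s0-s15
+    //   [sp, #4]         r0 (native-ABI arg, saved in the padding)
+    //   [sp, #0]         the native method (the hidden arg from r4)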
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
+    // Save the hidden arg as method pointer, r0 in the padding.
+    // (r0 is an arg in native ABI but not considered an arg in managed ABI.)
+    strd   r4, r0, [sp]
+
+    // Call artCriticalNativeOutArgsSize(method)
+    mov    r0, r4  // r0 := method (from hidden arg)
+    bl     artCriticalNativeOutArgsSize
+
+    // Check if we have any stack args.
+    cbnz   r0, .Lcritical_has_stack_args
+
+    // Without stack args, the frame is fully constructed.
+    // Place tagged managed sp in Thread::Current()->top_quick_frame.
+    mov    ip, sp
+    orr    ip, #1  // Tag as GenericJNI frame.
+    str    ip, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+
+    // Call artFindNativeMethodRunnable()
+    mov    r0, rSELF   // pass Thread::Current()
+    bl     artFindNativeMethodRunnable
+
+    // Store result in scratch reg.
+    mov    ip, r0
+
+    // Restore frame.
+    .cfi_remember_state
+    ldrd   r4, r0, [sp]
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
+
+    // Check for exception.
+    cmp    ip, #0
+    beq    .Lcritical_deliver_exception
+
+    // Do the tail call.
+    bx     ip
+    .cfi_restore_state
+    .cfi_def_cfa_offset FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+.Lcritical_has_stack_args:
+    // Move the out args size to a scratch register.
+    mov    ip, r0
+
+    // Restore register args as we're about to move stack args.
+    ldrd   r4, r0, [sp]
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME
+
+    // Reserve space for SaveRefsAndArgs frame.
+    sub sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS
+    .cfi_adjust_cfa_offset FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+    // Save arg regs so that we can use them as temporaries.
+    push   {r0-r3}
+    .cfi_adjust_cfa_offset 16
+
+    // Move out args. For simplicity include the return address at the end.
+    add    r0, sp, #16   // Destination.
+    add    ip, r0, ip    // Destination end.
+1:
+    ldrd   r2, r3, [r0, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
+    strd   r2, r3, [r0], #8
+    cmp    r0, ip
+    bne    1b
+
+    // Save our LR, load caller's LR and redefine CFI to take ownership of the JNI stub frame.
+    str    lr, [ip, #-__SIZEOF_POINTER__]
+    mov    lr, r3  // The last moved value from the loop above.
+    .cfi_def_cfa ip, FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+    // Restore arg regs.
+    pop    {r0-r3}  // No `.cfi_adjust_cfa_offset`, CFA register is currently ip, not sp.
+
+    // Re-create the SaveRefsAndArgs frame above the args.
+    strd   r4, r0, [ip]  // r0 in the padding as before.
+    add    r4, ip, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - 40)
+    stmia  r4, {r1-r3, r5-r8, r10-r11, lr}   @ 10 words of callee saves and args.
+    .cfi_rel_offset r1, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 0
+    .cfi_rel_offset r2, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 4
+    .cfi_rel_offset r3, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 8
+    .cfi_rel_offset r5, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 12
+    .cfi_rel_offset r6, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 16
+    .cfi_rel_offset r7, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 20
+    .cfi_rel_offset r8, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 24
+    .cfi_rel_offset r10, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 28
+    .cfi_rel_offset r11, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 32
+    .cfi_rel_offset lr, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 36
+    vstmdb r4!, {s0-s15}                     @ 16 words of float args.
+
+    // Move the frame register to a callee-save register.
+    mov    r11, ip
+    .cfi_def_cfa_register r11
+
+    // Place tagged managed sp in Thread::Current()->top_quick_frame.
+    orr    ip, r11, #1  // Tag as GenericJNI frame.
+    str    ip, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+
+    // Call artFindNativeMethodRunnable()
+    mov    r0, rSELF   // pass Thread::Current()
+    bl     artFindNativeMethodRunnable
+
+    // Store result in scratch reg.
+    mov    ip, r0
+
+    // Restore the frame. We shall not need the method anymore, so use r4 as scratch register.
+    mov    r4, r11
+    .cfi_def_cfa_register r4
+    ldr    r0, [r4, #4]
+    add    r11, r4, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 - 64)
+    vldmia r11!, {s0-s15}                    @ 16 words of float args.
+    ldmia  r11, {r1-r3, r5-r8, r10-r11, lr}  @ 10 words of callee saves and args.
+    .cfi_restore r1
+    .cfi_restore r2
+    .cfi_restore r3
+    .cfi_restore r5
+    .cfi_restore r6
+    .cfi_restore r7
+    .cfi_restore r8
+    .cfi_restore r10
+    .cfi_restore r11
+    .cfi_restore lr
+    REFRESH_MARKING_REGISTER
+
+    // Check for exception.
+    cmp    ip, #0
+    beq    3f
+
+    // Save arg regs so that we can use them as temporaries.
+    push   {r0-r3}  // No `.cfi_adjust_cfa_offset`, CFA register is currently r4, not sp.
+
+    // Move stack args to their original place.
+    mov    r0, r4
+    add    r1, sp, #16
+2:
+    ldrd   r2, r3, [r0, #-8]!
+    strd   r2, r3, [r0, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
+    cmp    r1, r0
+    bne    2b
+
+    // Replace original return address with caller's return address.
+    ldr    r1, [r4, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
+    str    lr, [r4, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
+
+    // Restore LR and redefine CFI to release ownership of the JNI stub frame.
+    .cfi_remember_state
+    mov    lr, r1
+    .cfi_def_cfa sp, FRAME_SIZE_SAVE_REFS_AND_ARGS + 16
+
+    // Restore args
+    pop    {r0-r3}
+    .cfi_adjust_cfa_offset -16
+
+    // Remove the frame reservation.
+    add    sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS
+    .cfi_adjust_cfa_offset -FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+    // Do the tail call.
+    bx     ip
+    .cfi_restore_state
+    .cfi_def_cfa r4, FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+3:
+    // Drop stack args and the SaveRefsAndArgs reservation.
+    mov    sp, r4
+    add    sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS
+    .cfi_def_cfa sp, 0
+
+.Lcritical_deliver_exception:
+    // The exception delivery checks that rSELF was saved, but the SaveRefsAndArgs frame
+    // does not save it, so we cannot use DELIVER_PENDING_EXCEPTION_FRAME_READY here.
+    DELIVER_PENDING_EXCEPTION
+END art_jni_dlsym_lookup_critical_stub
diff --git a/runtime/arch/arm/jni_frame_arm.h b/runtime/arch/arm/jni_frame_arm.h
new file mode 100644
index 0000000..5203eaf
--- /dev/null
+++ b/runtime/arch/arm/jni_frame_arm.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_ARCH_ARM_JNI_FRAME_ARM_H_
+#define ART_RUNTIME_ARCH_ARM_JNI_FRAME_ARM_H_
+
+#include <string.h>
+
+#include <algorithm>
+
+#include "arch/instruction_set.h"
+#include "base/bit_utils.h"
+#include "base/globals.h"
+#include "base/logging.h"
+
+namespace art {
+namespace arm {
+
+constexpr size_t kFramePointerSize = static_cast<size_t>(PointerSize::k32);
+static_assert(kArmPointerSize == PointerSize::k32, "Unexpected ARM pointer size");
+
+// The AAPCS requires 8-byte alignment. This is not as strict as the Managed ABI stack alignment.
+static constexpr size_t kAapcsStackAlignment = 8u;
+static_assert(kAapcsStackAlignment < kStackAlignment);
+
+// How many registers can be used for passing arguments.
+// Note: AAPCS is soft-float, so these are all core registers.
+constexpr size_t kJniArgumentRegisterCount = 4u;
+
+// Get the size of "out args" for @CriticalNative method stub.
+// This must match the size of the frame emitted by the JNI compiler at the native call site.
+inline size_t GetCriticalNativeOutArgsSize(const char* shorty, uint32_t shorty_len) {
+  DCHECK_EQ(shorty_len, strlen(shorty));
+
+  size_t reg = 0;  // Register for the current argument; if reg >= 4, we shall use stack.
+  for (size_t i = 1; i != shorty_len; ++i) {
+    if (shorty[i] == 'J' || shorty[i] == 'D') {
+      // 8-byte args need to start in even-numbered register or at aligned stack position.
+      reg += (reg & 1);
+      // Count first word and let the common path count the second.
+      reg += 1u;
+    }
+    reg += 1u;
+  }
+  size_t stack_args = std::max(reg, kJniArgumentRegisterCount) - kJniArgumentRegisterCount;
+  size_t size = kFramePointerSize * stack_args;
+
+  // Check if this is a tail call, i.e. there are no stack args and the return type
+  // is not an FP type (otherwise we would need to move the result to an FP register).
+  // No need to sign/zero extend small return types thanks to AAPCS.
+  if (size != 0u || shorty[0] == 'F' || shorty[0] == 'D') {
+    size += kFramePointerSize;  // We need to spill LR with the args.
+  }
+  return RoundUp(size, kAapcsStackAlignment);
+}
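+
+// A worked example (illustrative only; this shorty is hypothetical): for
+// shorty "VIJI", i.e. void f(int, long, int), the int takes r0, the long is
+// aligned to (r2, r3) and the trailing int overflows to the stack, so reg = 5,
+// stack_args = 1 and size = 4; the LR spill adds another 4 bytes and rounding
+// up to the 8-byte AAPCS alignment yields 8.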
+
+}  // namespace arm
+}  // namespace art
+
+#endif  // ART_RUNTIME_ARCH_ARM_JNI_FRAME_ARM_H_
+
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 9eee345..f94694d 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -25,45 +25,6 @@
     .extern artDeliverPendingException
 
     /*
-     * Macro to spill the GPRs.
-     */
-.macro SPILL_ALL_CALLEE_SAVE_GPRS
-    push {r4-r11, lr}                             @ 9 words (36 bytes) of callee saves.
-    .cfi_adjust_cfa_offset 36
-    .cfi_rel_offset r4, 0
-    .cfi_rel_offset r5, 4
-    .cfi_rel_offset r6, 8
-    .cfi_rel_offset r7, 12
-    .cfi_rel_offset r8, 16
-    .cfi_rel_offset r9, 20
-    .cfi_rel_offset r10, 24
-    .cfi_rel_offset r11, 28
-    .cfi_rel_offset lr, 32
-.endm
-
-    /*
-     * Macro that sets up the callee save frame to conform with
-     * Runtime::CreateCalleeSaveMethod(kSaveAllCalleeSaves)
-     */
-.macro SETUP_SAVE_ALL_CALLEE_SAVES_FRAME rTemp
-    SPILL_ALL_CALLEE_SAVE_GPRS                    @ 9 words (36 bytes) of callee saves.
-    vpush {s16-s31}                               @ 16 words (64 bytes) of floats.
-    .cfi_adjust_cfa_offset 64
-    sub sp, #12                                   @ 3 words of space, bottom word will hold Method*
-    .cfi_adjust_cfa_offset 12
-    RUNTIME_CURRENT1 \rTemp                       @ Load Runtime::Current into rTemp.
-    @ Load kSaveAllCalleeSaves Method* into rTemp.
-    ldr \rTemp, [\rTemp, #RUNTIME_SAVE_ALL_CALLEE_SAVES_METHOD_OFFSET]
-    str \rTemp, [sp, #0]                          @ Place Method* at bottom of stack.
-    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
-
-     // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVES != 36 + 64 + 12)
-#error "FRAME_SIZE_SAVE_ALL_CALLEE_SAVES(ARM) size not as expected."
-#endif
-.endm
-
-    /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsOnly).
      */
@@ -111,36 +72,6 @@
     .cfi_adjust_cfa_offset -28
 .endm
 
-    /*
-     * Macro that sets up the callee save frame to conform with
-     * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs).
-     */
-.macro SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
-    // Note: We could avoid saving R8 in the case of Baker read
-    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
-    // later; but it's not worth handling this special case.
-    push {r1-r3, r5-r8, r10-r11, lr}   @ 10 words of callee saves and args.
-    .cfi_adjust_cfa_offset 40
-    .cfi_rel_offset r1, 0
-    .cfi_rel_offset r2, 4
-    .cfi_rel_offset r3, 8
-    .cfi_rel_offset r5, 12
-    .cfi_rel_offset r6, 16
-    .cfi_rel_offset r7, 20
-    .cfi_rel_offset r8, 24
-    .cfi_rel_offset r10, 28
-    .cfi_rel_offset r11, 32
-    .cfi_rel_offset lr, 36
-    vpush {s0-s15}                     @ 16 words of float args.
-    .cfi_adjust_cfa_offset 64
-    sub sp, #8                         @ 2 words of space, alignment padding and Method*
-    .cfi_adjust_cfa_offset 8
-    // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 40 + 64 + 8)
-#error "FRAME_SIZE_SAVE_REFS_AND_ARGS(ARM) size not as expected."
-#endif
-.endm
-
 .macro SETUP_SAVE_REFS_AND_ARGS_FRAME rTemp
     SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
     RUNTIME_CURRENT3 \rTemp                       @ Load Runtime::Current into rTemp.
@@ -156,28 +87,6 @@
     str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
 .endm
 
-.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    add  sp, #8                      @ rewind sp
-    .cfi_adjust_cfa_offset -8
-    vpop {s0-s15}
-    .cfi_adjust_cfa_offset -64
-    // Note: Likewise, we could avoid restoring X20 in the case of Baker
-    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
-    // later; but it's not worth handling this special case.
-    pop {r1-r3, r5-r8, r10-r11, lr}  @ 10 words of callee saves
-    .cfi_restore r1
-    .cfi_restore r2
-    .cfi_restore r3
-    .cfi_restore r5
-    .cfi_restore r6
-    .cfi_restore r7
-    .cfi_restore r8
-    .cfi_restore r10
-    .cfi_restore r11
-    .cfi_restore lr
-    .cfi_adjust_cfa_offset -40
-.endm
-
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveEverything)
@@ -273,17 +182,6 @@
     .cfi_adjust_cfa_offset -52
 .endm
 
-// Macro to refresh the Marking Register (R8).
-//
-// This macro must be called at the end of functions implementing
-// entrypoints that possibly (directly or indirectly) perform a
-// suspend check (before they return).
-.macro REFRESH_MARKING_REGISTER
-#if defined(USE_READ_BARRIER) && defined(USE_BAKER_READ_BARRIER)
-    ldr rMR, [rSELF, #THREAD_IS_GC_MARKING_OFFSET]
-#endif
-.endm
-
 .macro RETURN_IF_RESULT_IS_ZERO
     cbnz   r0, 1f              @ result non-zero branch over
     bx     lr                  @ return
@@ -296,24 +194,6 @@
 1:
 .endm
 
-    /*
-     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
-     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
-     */
-.macro DELIVER_PENDING_EXCEPTION_FRAME_READY
-    mov    r0, rSELF                           @ pass Thread::Current
-    bl     artDeliverPendingExceptionFromCode  @ artDeliverPendingExceptionFromCode(Thread*)
-.endm
-
-    /*
-     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
-     * exception is Thread::Current()->exception_.
-     */
-.macro DELIVER_PENDING_EXCEPTION
-    SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r0       @ save callee saves for throw
-    DELIVER_PENDING_EXCEPTION_FRAME_READY
-.endm
-
 .macro NO_ARG_RUNTIME_EXCEPTION c_name, cxx_name
     .extern \cxx_name
 ENTRY \c_name
diff --git a/runtime/arch/arm64/asm_support_arm64.S b/runtime/arch/arm64/asm_support_arm64.S
index a3cf6f0..b1e5c86 100644
--- a/runtime/arch/arm64/asm_support_arm64.S
+++ b/runtime/arch/arm64/asm_support_arm64.S
@@ -103,18 +103,26 @@
     .cfi_restore \reg
 .endm
 
-.macro SAVE_TWO_REGS reg1, reg2, offset
-    stp \reg1, \reg2, [sp, #(\offset)]
+.macro SAVE_TWO_REGS_BASE base, reg1, reg2, offset
+    stp \reg1, \reg2, [\base, #(\offset)]
     .cfi_rel_offset \reg1, (\offset)
     .cfi_rel_offset \reg2, (\offset) + 8
 .endm
 
-.macro RESTORE_TWO_REGS reg1, reg2, offset
-    ldp \reg1, \reg2, [sp, #(\offset)]
+.macro SAVE_TWO_REGS reg1, reg2, offset
+    SAVE_TWO_REGS_BASE sp, \reg1, \reg2, \offset
+.endm
+
+.macro RESTORE_TWO_REGS_BASE base, reg1, reg2, offset
+    ldp \reg1, \reg2, [\base, #(\offset)]
     .cfi_restore \reg1
     .cfi_restore \reg2
 .endm
 
+.macro RESTORE_TWO_REGS reg1, reg2, offset
+    RESTORE_TWO_REGS_BASE sp, \reg1, \reg2, \offset
+.endm
+
 .macro LOAD_RUNTIME_INSTANCE reg
 #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
     adrp xIP0, :pg_hi21_nc:_ZN3art7Runtime9instance_E
@@ -190,6 +198,71 @@
     DECREASE_FRAME 96
 .endm
 
+.macro SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL base
+    // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 224)
+#error "FRAME_SIZE_SAVE_REFS_AND_ARGS(ARM64) size not as expected."
+#endif
+
+    // Stack alignment filler [\base, #8].
+    // FP args.
+    stp d0, d1, [\base, #16]
+    stp d2, d3, [\base, #32]
+    stp d4, d5, [\base, #48]
+    stp d6, d7, [\base, #64]
+
+    // Core args.
+    SAVE_TWO_REGS_BASE \base, x1, x2, 80
+    SAVE_TWO_REGS_BASE \base, x3, x4, 96
+    SAVE_TWO_REGS_BASE \base, x5, x6, 112
+
+    // x7, Callee-saves.
+    // Note: We could avoid saving X20 in the case of Baker read
+    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
+    SAVE_TWO_REGS_BASE \base, x7, x20, 128
+    SAVE_TWO_REGS_BASE \base, x21, x22, 144
+    SAVE_TWO_REGS_BASE \base, x23, x24, 160
+    SAVE_TWO_REGS_BASE \base, x25, x26, 176
+    SAVE_TWO_REGS_BASE \base, x27, x28, 192
+
+    // x29(callee-save) and LR.
+    SAVE_TWO_REGS_BASE \base, x29, xLR, 208
+.endm
+
+// TODO: Probably no need to restore registers preserved by aapcs64. (That would require
+// auditing all users to make sure they restore aapcs64 callee-save registers they clobber.)
+.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME_INTERNAL base
+    // FP args.
+    ldp d0, d1, [\base, #16]
+    ldp d2, d3, [\base, #32]
+    ldp d4, d5, [\base, #48]
+    ldp d6, d7, [\base, #64]
+
+    // Core args.
+    RESTORE_TWO_REGS_BASE \base, x1, x2, 80
+    RESTORE_TWO_REGS_BASE \base, x3, x4, 96
+    RESTORE_TWO_REGS_BASE \base, x5, x6, 112
+
+    // x7, Callee-saves.
+    // Note: Likewise, we could avoid restoring X20 in the case of Baker
+    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
+    RESTORE_TWO_REGS_BASE \base, x7, x20, 128
+    RESTORE_TWO_REGS_BASE \base, x21, x22, 144
+    RESTORE_TWO_REGS_BASE \base, x23, x24, 160
+    RESTORE_TWO_REGS_BASE \base, x25, x26, 176
+    RESTORE_TWO_REGS_BASE \base, x27, x28, 192
+
+    // x29(callee-save) and LR.
+    RESTORE_TWO_REGS_BASE \base, x29, xLR, 208
+.endm
+
+.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME_INTERNAL sp
+    DECREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
+.endm
+
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveAllCalleeSaves)
diff --git a/runtime/arch/arm64/jni_entrypoints_arm64.S b/runtime/arch/arm64/jni_entrypoints_arm64.S
index e645799..8a34662 100644
--- a/runtime/arch/arm64/jni_entrypoints_arm64.S
+++ b/runtime/arch/arm64/jni_entrypoints_arm64.S
@@ -23,69 +23,210 @@
     .extern artFindNativeMethodRunnable
 
 ENTRY art_jni_dlsym_lookup_stub
-  // spill regs.
-  stp   x29, x30, [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
-  .cfi_rel_offset x29, 0
-  .cfi_rel_offset x30, 8
-  mov   x29, sp
-  stp   d6, d7,   [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
-  stp   d4, d5,   [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
-  stp   d2, d3,   [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
-  stp   d0, d1,   [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
-  stp   x6, x7,   [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
-  stp   x4, x5,   [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
-  stp   x2, x3,   [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
-  stp   x0, x1,   [sp, #-16]!
-  .cfi_adjust_cfa_offset 16
+    // spill regs.
+    stp   x29, x30, [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
+    .cfi_rel_offset x29, 0
+    .cfi_rel_offset x30, 8
+    mov   x29, sp
+    stp   d6, d7,   [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
+    stp   d4, d5,   [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
+    stp   d2, d3,   [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
+    stp   d0, d1,   [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
+    stp   x6, x7,   [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
+    stp   x4, x5,   [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
+    stp   x2, x3,   [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
+    stp   x0, x1,   [sp, #-16]!
+    .cfi_adjust_cfa_offset 16
 
-  mov x0, xSELF   // pass Thread::Current()
-  // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable for @FastNative.
-  ldr   xIP0, [x0, #THREAD_TOP_QUICK_FRAME_OFFSET]      // uintptr_t tagged_quick_frame
-  bic   xIP0, xIP0, #1                                  // ArtMethod** sp
-  ldr   xIP0, [xIP0]                                    // ArtMethod* method
-  ldr   xIP0, [xIP0, #ART_METHOD_ACCESS_FLAGS_OFFSET]   // uint32_t access_flags
-  tst   xIP0, #ACCESS_FLAGS_METHOD_IS_FAST_NATIVE
-  b.ne  .Llookup_stub_fast_native
-  bl    artFindNativeMethod
-  b     .Llookup_stub_continue
-.Llookup_stub_fast_native:
-  bl    artFindNativeMethodRunnable
+    mov x0, xSELF   // pass Thread::Current()
+    // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable()
+    // for @FastNative or @CriticalNative.
+    ldr   xIP0, [x0, #THREAD_TOP_QUICK_FRAME_OFFSET]      // uintptr_t tagged_quick_frame
+    bic   xIP0, xIP0, #1                                  // ArtMethod** sp
+    ldr   xIP0, [xIP0]                                    // ArtMethod* method
+    ldr   xIP0, [xIP0, #ART_METHOD_ACCESS_FLAGS_OFFSET]   // uint32_t access_flags
+    mov   xIP1, #(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE | ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE)
+    tst   xIP0, xIP1
+    b.ne  .Llookup_stub_fast_native
+    bl    artFindNativeMethod
+    b     .Llookup_stub_continue
+.Llookup_stub_fast_native:
+    bl    artFindNativeMethodRunnable
 .Llookup_stub_continue:
-  mov  x17, x0    // store result in scratch reg.
+    mov   x17, x0    // store result in scratch reg.
 
-  // load spill regs.
-  ldp   x0, x1,   [sp], #16
-  .cfi_adjust_cfa_offset -16
-  ldp   x2, x3,   [sp], #16
-  .cfi_adjust_cfa_offset -16
-  ldp   x4, x5,   [sp], #16
-  .cfi_adjust_cfa_offset -16
-  ldp   x6, x7,   [sp], #16
-  .cfi_adjust_cfa_offset -16
-  ldp   d0, d1,   [sp], #16
-  .cfi_adjust_cfa_offset -16
-  ldp   d2, d3,   [sp], #16
-  .cfi_adjust_cfa_offset -16
-  ldp   d4, d5,   [sp], #16
-  .cfi_adjust_cfa_offset -16
-  ldp   d6, d7,   [sp], #16
-  .cfi_adjust_cfa_offset -16
-  ldp   x29, x30, [sp], #16
-  .cfi_adjust_cfa_offset -16
-  .cfi_restore x29
-  .cfi_restore x30
+    // load spill regs.
+    ldp   x0, x1,   [sp], #16
+    .cfi_adjust_cfa_offset -16
+    ldp   x2, x3,   [sp], #16
+    .cfi_adjust_cfa_offset -16
+    ldp   x4, x5,   [sp], #16
+    .cfi_adjust_cfa_offset -16
+    ldp   x6, x7,   [sp], #16
+    .cfi_adjust_cfa_offset -16
+    ldp   d0, d1,   [sp], #16
+    .cfi_adjust_cfa_offset -16
+    ldp   d2, d3,   [sp], #16
+    .cfi_adjust_cfa_offset -16
+    ldp   d4, d5,   [sp], #16
+    .cfi_adjust_cfa_offset -16
+    ldp   d6, d7,   [sp], #16
+    .cfi_adjust_cfa_offset -16
+    ldp   x29, x30, [sp], #16
+    .cfi_adjust_cfa_offset -16
+    .cfi_restore x29
+    .cfi_restore x30
 
-  cbz   x17, 1f   // is method code null ?
-  br    x17       // if non-null, tail call to method's code.
+    cbz   x17, 1f   // is method code null ?
+    br    x17       // if non-null, tail call to method's code.
 
 1:
-  ret             // restore regs and return to caller to handle exception.
+    ret             // restore regs and return to caller to handle exception.
 END art_jni_dlsym_lookup_stub
+
+ENTRY art_jni_dlsym_lookup_critical_stub
+    // The hidden arg holding the tagged method (bit 0 set means GenericJNI) is x15.
+    // For Generic JNI we already have a managed frame, so we reuse the art_jni_dlsym_lookup_stub.
+    tbnz  x15, #0, art_jni_dlsym_lookup_stub
+
+    // We need to create a GenericJNI managed frame above the stack args.
+
+    // GenericJNI frame is similar to SaveRefsAndArgs frame with the native method
+    // instead of runtime method saved at the bottom. Note that the runtime shall
+    // not examine the args here, otherwise we would have to move them in registers
+    // and stack to account for the difference between managed and native ABIs.
+    INCREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL sp
+    // Save the hidden arg as method pointer, x0 in the padding.
+    // (x0 is an arg in native ABI but not considered an arg in managed ABI.)
+    SAVE_TWO_REGS x15, x0, 0
+
+    // Call artCriticalNativeOutArgsSize(method)
+    mov   x0, x15  // x0 := method (from hidden arg)
+    bl    artCriticalNativeOutArgsSize
+
+    // Check if we have any stack args.
+    cbnz  x0, .Lcritical_has_stack_args
+
+    // Without stack args, the frame is fully constructed.
+    // Place tagged managed sp in Thread::Current()->top_quick_frame.
+    mov   xIP0, sp
+    orr   xIP0, xIP0, #1  // Tag as GenericJNI frame.
+    str   xIP0, [xSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+
+    // Call artFindNativeMethodRunnable()
+    mov   x0, xSELF   // pass Thread::Current()
+    bl    artFindNativeMethodRunnable
+
+    // Store result in scratch reg.
+    mov   xIP0, x0
+
+    // Restore frame.
+    .cfi_remember_state
+    RESTORE_TWO_REGS x15, x0, 0
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    REFRESH_MARKING_REGISTER
+
+    // Check for exception.
+    cbz   xIP0, .Lcritical_deliver_exception
+
+    // Do the tail call
+    br    xIP0
+    .cfi_restore_state
+    .cfi_def_cfa_offset FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+.Lcritical_has_stack_args:
+    // Move the out args size to a scratch register.
+    mov   xIP0, x0
+
+    // Restore register args as we're about to move stack args.
+    RESTORE_TWO_REGS x15, x0, 0
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME_INTERNAL sp
+
+    // Move out args. For simplicity include the return address at the end.
+    mov   x8, sp        // Destination.
+    add   x9, sp, xIP0  // Destination end.
+1:
+    ldp   x10, x11, [x8, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
+    stp   x10, x11, [x8], #16
+    cmp   x8, x9
+    bne   1b
+
+    // Save our LR, load caller's LR and redefine CFI to take ownership of the JNI stub frame.
+    str   xLR, [x9, #-__SIZEOF_POINTER__]
+    mov   xLR, x11  // The last moved value from the loop above.
+    .cfi_def_cfa x9, FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+    // Re-create the SaveRefsAndArgs frame above the args.
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL x9
+    SAVE_TWO_REGS_BASE x9, x15, x0, 0
+
+    // Move the frame register to a callee-save register.
+    mov   x29, x9
+    .cfi_def_cfa_register x29
+
+    // Place tagged managed sp in Thread::Current()->top_quick_frame.
+    orr   xIP0, x29, #1  // Tag as GenericJNI frame.
+    str   xIP0, [xSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+
+    // Call artFindNativeMethodRunnable()
+    mov   x0, xSELF   // pass Thread::Current()
+    bl    artFindNativeMethodRunnable
+
+    // Store result in scratch reg.
+    mov   xIP0, x0
+
+    // Restore the frame.
+    mov   x9, x29
+    .cfi_def_cfa_register x9
+    RESTORE_TWO_REGS_BASE x9, x15, x0, 0
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME_INTERNAL x9
+    REFRESH_MARKING_REGISTER
+
+    // Check for exception.
+    cbz   xIP0, 3f
+
+    // Move stack args to their original place.
+    mov   x8, x9
+2:
+    ldp   x10, x11, [x8, #-16]!
+    stp   x10, x11, [x8, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
+    cmp   sp, x8
+    bne   2b
+
+    // Replace original return address with caller's return address.
+    ldr   xIP1, [x9, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
+    str   xLR, [x9, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
+
+    // Restore LR and redefine CFI to release ownership of the JNI stub frame.
+    .cfi_remember_state
+    mov   xLR, xIP1
+    .cfi_def_cfa sp, FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+    // Remove the frame reservation.
+    DECREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+    // Do the tail call.
+    br    xIP0
+    .cfi_restore_state
+    .cfi_def_cfa x9, FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+3:
+    // Drop stack args and the SaveRefsAndArgs reservation.
+    mov   sp, x9
+    add   sp, sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS
+    .cfi_def_cfa sp, 0
+
+.Lcritical_deliver_exception:
+    // The exception delivery checks that xSELF was saved, but the SaveRefsAndArgs frame
+    // does not save it, so we cannot use DELIVER_PENDING_EXCEPTION_FRAME_READY here.
+    DELIVER_PENDING_EXCEPTION
+END art_jni_dlsym_lookup_critical_stub
diff --git a/runtime/arch/arm64/jni_frame_arm64.h b/runtime/arch/arm64/jni_frame_arm64.h
new file mode 100644
index 0000000..fa4d43c
--- /dev/null
+++ b/runtime/arch/arm64/jni_frame_arm64.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_ARCH_ARM64_JNI_FRAME_ARM64_H_
+#define ART_RUNTIME_ARCH_ARM64_JNI_FRAME_ARM64_H_
+
+#include <string.h>
+
+#include <algorithm>
+
+#include "arch/instruction_set.h"
+#include "base/bit_utils.h"
+#include "base/globals.h"
+#include "base/logging.h"
+
+namespace art {
+namespace arm64 {
+
+constexpr size_t kFramePointerSize = static_cast<size_t>(PointerSize::k64);
+static_assert(kArm64PointerSize == PointerSize::k64, "Unexpected ARM64 pointer size");
+
+// The AAPCS64 requires 16-byte alignment. This is the same as the Managed ABI stack alignment.
+static constexpr size_t kAapcs64StackAlignment = 16u;
+static_assert(kAapcs64StackAlignment == kStackAlignment);
+
+// Up to how many float-like (float, double) args can be in registers.
+// The rest of the args must go on the stack.
+constexpr size_t kMaxFloatOrDoubleRegisterArguments = 8u;
+// Up to how many integer-like (pointers, objects, longs, int, short, bool, etc) args can be
+// in registers. The rest of the args must go on the stack.
+constexpr size_t kMaxIntLikeRegisterArguments = 8u;
+
+// Get the size of "out args" for @CriticalNative method stub.
+// This must match the size of the frame emitted by the JNI compiler at the native call site.
+inline size_t GetCriticalNativeOutArgsSize(const char* shorty, uint32_t shorty_len) {
+  DCHECK_EQ(shorty_len, strlen(shorty));
+
+  size_t num_fp_args = 0u;
+  for (size_t i = 1; i != shorty_len; ++i) {
+    if (shorty[i] == 'F' || shorty[i] == 'D') {
+      num_fp_args += 1u;
+    }
+  }
+  size_t num_non_fp_args = shorty_len - 1u - num_fp_args;
+
+  // Account for FP arguments passed through v0-v7.
+  size_t num_stack_fp_args =
+      num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
+  // Account for other (integer and pointer) arguments passed through GPR (x0-x7).
+  size_t num_stack_non_fp_args =
+      num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
+  // The size of outgoing arguments.
+  size_t size =
+      (num_stack_fp_args + num_stack_non_fp_args) * static_cast<size_t>(kArm64PointerSize);
+
+  // We can make a tail call if there are no stack args and we do not need
+  // to extend the result. Otherwise, add space for return PC.
+  if (size != 0u || shorty[0] == 'B' || shorty[0] == 'C' || shorty[0] == 'S' || shorty[0] == 'Z') {
+    size += kFramePointerSize;  // We need to spill LR with the args.
+  }
+  return RoundUp(size, kAapcs64StackAlignment);
+}
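+
+// A worked example (illustrative only; this shorty is hypothetical): for
+// shorty "VFFFFFFFFF", i.e. void f(<nine floats>), eight FP args travel in
+// v0-v7 and the ninth spills to the stack, so size = 8; the LR spill adds
+// another 8 bytes and rounding up to the 16-byte AAPCS64 alignment yields 16.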
+
+}  // namespace arm64
+}  // namespace art
+
+#endif  // ART_RUNTIME_ARCH_ARM64_JNI_FRAME_ARM64_H_
+
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 7260700..634c762 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -49,42 +49,6 @@
     DECREASE_FRAME 96
 .endm
 
-
-.macro SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL
-    INCREASE_FRAME 224
-
-    // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 224)
-#error "FRAME_SIZE_SAVE_REFS_AND_ARGS(ARM64) size not as expected."
-#endif
-
-    // Stack alignment filler [sp, #8].
-    // FP args.
-    stp d0, d1, [sp, #16]
-    stp d2, d3, [sp, #32]
-    stp d4, d5, [sp, #48]
-    stp d6, d7, [sp, #64]
-
-    // Core args.
-    SAVE_TWO_REGS x1, x2, 80
-    SAVE_TWO_REGS x3, x4, 96
-    SAVE_TWO_REGS x5, x6, 112
-
-    // x7, Callee-saves.
-    // Note: We could avoid saving X20 in the case of Baker read
-    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
-    // later; but it's not worth handling this special case.
-    SAVE_TWO_REGS x7, x20, 128
-    SAVE_TWO_REGS x21, x22, 144
-    SAVE_TWO_REGS x23, x24, 160
-    SAVE_TWO_REGS x25, x26, 176
-    SAVE_TWO_REGS x27, x28, 192
-
-    // x29(callee-save) and LR.
-    SAVE_TWO_REGS x29, xLR, 208
-
-.endm
-
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs).
@@ -99,7 +63,8 @@
     // ArtMethod* xIP0 = Runtime::instance_->callee_save_methods_[kSaveRefAndArgs];
     ldr xIP0, [xIP0, RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET]
 
-    SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL
+    INCREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL sp
 
     str xIP0, [sp]    // Store ArtMethod* Runtime::callee_save_methods_[kSaveRefsAndArgs].
     // Place sp in Thread::Current()->top_quick_frame.
@@ -108,42 +73,14 @@
 .endm
 
 .macro SETUP_SAVE_REFS_AND_ARGS_FRAME_WITH_METHOD_IN_X0
-    SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL
+    INCREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL sp
     str x0, [sp, #0]  // Store ArtMethod* to bottom of stack.
     // Place sp in Thread::Current()->top_quick_frame.
     mov xIP0, sp
     str xIP0, [xSELF, # THREAD_TOP_QUICK_FRAME_OFFSET]
 .endm
 
-// TODO: Probably no need to restore registers preserved by aapcs64.
-.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    // FP args.
-    ldp d0, d1, [sp, #16]
-    ldp d2, d3, [sp, #32]
-    ldp d4, d5, [sp, #48]
-    ldp d6, d7, [sp, #64]
-
-    // Core args.
-    RESTORE_TWO_REGS x1, x2, 80
-    RESTORE_TWO_REGS x3, x4, 96
-    RESTORE_TWO_REGS x5, x6, 112
-
-    // x7, Callee-saves.
-    // Note: Likewise, we could avoid restoring X20 in the case of Baker
-    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
-    // later; but it's not worth handling this special case.
-    RESTORE_TWO_REGS x7, x20, 128
-    RESTORE_TWO_REGS x21, x22, 144
-    RESTORE_TWO_REGS x23, x24, 160
-    RESTORE_TWO_REGS x25, x26, 176
-    RESTORE_TWO_REGS x27, x28, 192
-
-    // x29(callee-save) and LR.
-    RESTORE_TWO_REGS x29, xLR, 208
-
-    DECREASE_FRAME 224
-.endm
-
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveEverything)
diff --git a/runtime/arch/x86/asm_support_x86.S b/runtime/arch/x86/asm_support_x86.S
index 8f43cc8..8938d8b 100644
--- a/runtime/arch/x86/asm_support_x86.S
+++ b/runtime/arch/x86/asm_support_x86.S
@@ -94,7 +94,7 @@
     #define CFI_RESTORE(reg)
     #define CFI_REL_OFFSET(reg,size)
     #define CFI_REMEMBER_STATE
-    #define CFI_RESTORE_STATE_AND_DEF_CFA(off)
+    #define CFI_RESTORE_STATE_AND_DEF_CFA(reg,off)
     #define CFI_ESCAPE(...)
 #endif
 
@@ -156,6 +156,18 @@
     CFI_RESTORE(REG_VAR(reg))
 END_MACRO
 
+// Arguments do not need .cfi_rel_offset as they are caller-saved and
+// therefore cannot hold caller's variables or unwinding data.
+MACRO1(PUSH_ARG, reg)
+    pushl REG_VAR(reg)
+    CFI_ADJUST_CFA_OFFSET(4)
+END_MACRO
+
+MACRO1(POP_ARG, reg)
+    popl REG_VAR(reg)
+    CFI_ADJUST_CFA_OFFSET(-4)
+END_MACRO
+
 MACRO1(CFI_RESTORE_REG, reg)
     CFI_RESTORE(REG_VAR(reg))
 END_MACRO
@@ -199,5 +211,64 @@
 #endif  // USE_HEAP_POISONING
 END_MACRO
 
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs), except for pushing the method
+     */
+MACRO0(SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY)
+    PUSH edi      // Save callee saves
+    PUSH esi
+    PUSH ebp
+    PUSH_ARG ebx  // Save args.
+    PUSH_ARG edx
+    PUSH_ARG ecx
+    // Create space for FPR args.
+    subl MACRO_LITERAL(4 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(4 * 8)
+    // Save FPRs.
+    movsd %xmm0, 0(%esp)
+    movsd %xmm1, 8(%esp)
+    movsd %xmm2, 16(%esp)
+    movsd %xmm3, 24(%esp)
+
+    // Ugly compile-time check, but we only have the preprocessor.
+    // First +4: implicit return address pushed on stack when caller made call.
+    // Last +4: we're not pushing the method on the stack here.
+#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 4 + 6*4 + 4*8 + 4)
+#error "FRAME_SIZE_SAVE_REFS_AND_ARGS(X86) size not as expected."
+#endif
+END_MACRO
+
+MACRO0(RESTORE_SAVE_REFS_AND_ARGS_FRAME)
+    // Restore FPRs. EAX is still on the stack.
+    movsd 4(%esp), %xmm0
+    movsd 12(%esp), %xmm1
+    movsd 20(%esp), %xmm2
+    movsd 28(%esp), %xmm3
+
+    addl MACRO_LITERAL(36), %esp  // Remove FPRs and method pointer.
+    CFI_ADJUST_CFA_OFFSET(-36)
+
+    POP_ARG ecx                   // Restore args
+    POP_ARG edx
+    POP_ARG ebx
+    POP ebp                       // Restore callee saves
+    POP esi
+    POP edi
+END_MACRO
+
+    /*
+     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
+     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
+     */
+MACRO0(DELIVER_PENDING_EXCEPTION_FRAME_READY)
+    // Outgoing argument set up
+    subl MACRO_LITERAL(12), %esp               // alignment padding
+    CFI_ADJUST_CFA_OFFSET(12)
+    pushl %fs:THREAD_SELF_OFFSET               // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
+    call SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*)
+    UNREACHABLE
+END_MACRO
 
 #endif  // ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_S_
diff --git a/runtime/arch/x86/jni_entrypoints_x86.S b/runtime/arch/x86/jni_entrypoints_x86.S
index 4862e99..086e96f 100644
--- a/runtime/arch/x86/jni_entrypoints_x86.S
+++ b/runtime/arch/x86/jni_entrypoints_x86.S
@@ -24,12 +24,14 @@
     CFI_ADJUST_CFA_OFFSET(8)
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
     CFI_ADJUST_CFA_OFFSET(4)
-    // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable for @FastNative.
+    // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable()
+    // for @FastNative or @CriticalNative.
     movl (%esp), %eax                                // Thread* self
     movl THREAD_TOP_QUICK_FRAME_OFFSET(%eax), %eax   // uintptr_t tagged_quick_frame
     andl LITERAL(0xfffffffe), %eax                   // ArtMethod** sp
     movl (%eax), %eax                                // ArtMethod* method
-    testl LITERAL(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE), ART_METHOD_ACCESS_FLAGS_OFFSET(%eax)
+    testl LITERAL(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE | ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE), \
+          ART_METHOD_ACCESS_FLAGS_OFFSET(%eax)
     jne .Llookup_stub_fast_native
     call SYMBOL(artFindNativeMethod)  // (Thread*)
     jmp .Llookup_stub_continue
@@ -44,3 +46,178 @@
 .Lno_native_code_found:
     ret
 END_FUNCTION art_jni_dlsym_lookup_stub
+
+DEFINE_FUNCTION art_jni_dlsym_lookup_critical_stub
+    // The hidden arg holding the tagged method (bit 0 set means GenericJNI) is eax.
+    // For Generic JNI we already have a managed frame, so we reuse the art_jni_dlsym_lookup_stub.
+    testl LITERAL(1), %eax
+    jnz art_jni_dlsym_lookup_stub
+
+    // We need to create a GenericJNI managed frame above the stack args.
+
+    // GenericJNI frame is similar to SaveRefsAndArgs frame with the native method
+    // instead of runtime method saved at the bottom. Note that the runtime shall
+    // not examine the args here, otherwise we would have to reload them from stack
+    // to account for the difference between managed and native ABIs.
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
+    pushl %eax  // Save the hidden arg as method pointer at the bottom of the stack.
+    CFI_ADJUST_CFA_OFFSET(4)
+
+    // Call artCriticalNativeOutArgsSize(method); method is conveniently at the bottom of the stack.
+    call SYMBOL(artCriticalNativeOutArgsSize)
+
+    // Check if we have any stack args other than return PC.
+    cmp LITERAL(__SIZEOF_POINTER__), %eax
+    jnz .Lcritical_has_stack_args
+
+    // Without stack args, the frame is fully constructed.
+    // Place tagged managed sp in Thread::Current()->top_quick_frame.
+    leal 1(%esp), %eax  // Tag as GenericJNI frame.
+    movl %eax, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
+
+    // Call artFindNativeMethodRunnable()
+    subl LITERAL(12), %esp         // align stack
+    CFI_ADJUST_CFA_OFFSET(12)
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
+    call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
+    addl LITERAL(16), %esp
+    CFI_ADJUST_CFA_OFFSET(-16)
+
+    // Check for exception.
+    test %eax, %eax
+    jz 1f
+
+    // Restore frame and do the tail call.
+    CFI_REMEMBER_STATE
+    RESTORE_SAVE_REFS_AND_ARGS_FRAME
+    jmp *%eax
+    CFI_RESTORE_STATE_AND_DEF_CFA(%esp, FRAME_SIZE_SAVE_REFS_AND_ARGS)
+
+1:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+
+.Lcritical_has_stack_args:
+    // As mentioned above, the runtime shall not examine the args in the managed frame
+    // and since all args for the native call are on the stack, we can use the managed
+    // args registers as scratch registers. So, EBX, EDX and ECX are available and we
+    // do not need to restore xmm0-xmm3 either.
+
+    // Restore registers as we're about to move stack args over the current SaveRefsAndArgs frame.
+    movl (%esp), %edx   // Remember the method in EDX.
+    movl 48(%esp), %ebp
+    CFI_RESTORE(%ebp)
+    movl 52(%esp), %esi
+    CFI_RESTORE(%esi)
+    movl 56(%esp), %edi
+    CFI_RESTORE(%edi)
+
+    // Calculate the address of the end of the move destination and redefine CFI to take
+    // ownership of the JNI stub frame. EBX is conveniently callee-save in native ABI.
+    leal 0(%esp, %eax, 1), %ebx
+    CFI_DEF_CFA(%ebx, FRAME_SIZE_SAVE_REFS_AND_ARGS)
+
+    // Calculate the number of DWORDs to move.
+    shrl LITERAL(2), %eax
+    leal -1(%eax), %ecx  // Do not move the return PC.
+
+    // Load our return PC to EAX.
+    movl FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__(%esp), %eax
+
+    // Save EDI, ESI so that we can use them for moving stack args.
+    pushl %edi  // No `CFI_ADJUST_CFA_OFFSET`, CFA register is currently EBX, not ESP.
+    pushl %esi  // ditto
+
+    // Move the stack args.
+    leal 2 * __SIZEOF_POINTER__(%esp), %edi
+    leal FRAME_SIZE_SAVE_REFS_AND_ARGS(%edi), %esi
+    rep movsd
+
+    // Save our return PC.
+    movl %eax, (%edi)
+
+    // Restore EDI, ESI.
+    popl %esi   // No `CFI_ADJUST_CFA_OFFSET`, CFA register is currently EBX, not ESP.
+    popl %edi   // ditto
+
+    // Re-create the SaveRefsAndArgs frame above the args.
+    movl %edi, 56(%ebx)
+    CFI_REL_OFFSET(%edi, 56)
+    movl %esi, 52(%ebx)
+    CFI_REL_OFFSET(%esi, 52)
+    movl %ebp, 48(%ebx)
+    CFI_REL_OFFSET(%ebp, 48)
+    // Skip managed ABI args EBX, EDX, ECX and FPRs, see above.
+    // (We have already clobbered EBX, EDX, ECX anyway).
+    movl %edx, (%ebx)    // Save method pointer.
+
+    // Place tagged managed sp in Thread::Current()->top_quick_frame.
+    leal 1(%ebx), %eax  // Tag as GenericJNI frame.
+    movl %eax, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
+
+    // Call artFindNativeMethodRunnable()
+    subl LITERAL(12), %esp        // align stack, no `CFI_ADJUST_CFA_OFFSET`.
+    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
+    addl LITERAL(16), %esp        // Pop args, no `CFI_ADJUST_CFA_OFFSET`.
+
+    // Check for exception.
+    test %eax, %eax
+    jz 2f
+
+    // Restore the frame. We shall not need the method anymore.
+    CFI_REMEMBER_STATE
+    movl 48(%ebx), %ebp
+    CFI_RESTORE(%ebp)
+    movl 52(%ebx), %esi
+    CFI_RESTORE(%esi)
+    movl 56(%ebx), %edi
+    CFI_RESTORE(%edi)
+
+    // Remember our return PC in EDX.
+    movl -__SIZEOF_POINTER__(%ebx), %edx
+
+    // Calculate the number of DWORDs to move.
+    leal -__SIZEOF_POINTER__(%ebx), %ecx  // Do not move return PC.
+    subl %esp, %ecx
+    shrl LITERAL(2), %ecx
+
+    // Save EDI, ESI so that we can use them for moving stack args.
+    pushl %edi  // No `CFI_ADJUST_CFA_OFFSET`, CFA register is currently EBX, not ESP.
+    pushl %esi  // ditto
+
+    // Move stack args to their original place.
+    leal -2 * __SIZEOF_POINTER__(%ebx), %esi
+    leal FRAME_SIZE_SAVE_REFS_AND_ARGS - 2 * __SIZEOF_POINTER__(%ebx), %edi
+    std
+    rep movsd
+    cld
+
+    // Store our return PC.
+    movl %edx, (%edi)
+
+    // Restore EDI, ESI.
+    popl %esi   // No `CFI_ADJUST_CFA_OFFSET`, CFA register is currently EBX, not ESP.
+    popl %edi   // ditto
+
+    // Redefine CFI to release ownership of the JNI stub frame.
+    CFI_DEF_CFA(%esp, FRAME_SIZE_SAVE_REFS_AND_ARGS)
+
+    // Remove the frame reservation.
+    addl LITERAL(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__), %esp
+    CFI_ADJUST_CFA_OFFSET(-(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__))
+
+    // Do the tail call.
+    jmp *%eax
+    CFI_RESTORE_STATE_AND_DEF_CFA(%ebx, FRAME_SIZE_SAVE_REFS_AND_ARGS)
+
+2:
+    // Replicate DELIVER_PENDING_EXCEPTION_FRAME_READY without CFI_ADJUST_CFA_OFFSET,
+    // CFA register is currently EBX, not ESP.
+
+    // Outgoing argument set up
+    subl MACRO_LITERAL(12), %esp               // alignment padding
+    pushl %fs:THREAD_SELF_OFFSET               // pass Thread::Current()
+    call SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*)
+    UNREACHABLE
+END_FUNCTION art_jni_dlsym_lookup_critical_stub
diff --git a/runtime/arch/x86/jni_frame_x86.h b/runtime/arch/x86/jni_frame_x86.h
new file mode 100644
index 0000000..e710179
--- /dev/null
+++ b/runtime/arch/x86/jni_frame_x86.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_ARCH_X86_JNI_FRAME_X86_H_
+#define ART_RUNTIME_ARCH_X86_JNI_FRAME_X86_H_
+
+#include <string.h>
+
+#include "arch/instruction_set.h"
+#include "base/bit_utils.h"
+#include "base/globals.h"
+#include "base/logging.h"
+
+namespace art {
+namespace x86 {
+
+constexpr size_t kFramePointerSize = static_cast<size_t>(PointerSize::k32);
+static_assert(kX86PointerSize == PointerSize::k32, "Unexpected x86 pointer size");
+
+static constexpr size_t kNativeStackAlignment = 16;  // IA-32 cdecl requires 16 byte alignment.
+static_assert(kNativeStackAlignment == kStackAlignment);
+
+// Get the size of "out args" for @CriticalNative method stub.
+// This must match the size of the frame emitted by the JNI compiler at the native call site.
+inline size_t GetCriticalNativeOutArgsSize(const char* shorty, uint32_t shorty_len) {
+  DCHECK_EQ(shorty_len, strlen(shorty));
+
+  size_t num_long_or_double_args = 0u;
+  for (size_t i = 1; i != shorty_len; ++i) {
+    if (shorty[i] == 'J' || shorty[i] == 'D') {
+      num_long_or_double_args += 1u;
+    }
+  }
+  size_t num_arg_words = shorty_len - 1u + num_long_or_double_args;
+
+  // The size of outgoing arguments.
+  size_t size = num_arg_words * static_cast<size_t>(kX86PointerSize);
+
+  // Add return address size.
+  size += kFramePointerSize;
+  // We can make a tail call if there are no stack args and the return type is not
+  // an FP type (needs moving from ST0 to XMM0) and we do not need to extend the result.
+  bool return_type_ok = shorty[0] == 'I' || shorty[0] == 'J' || shorty[0] == 'V';
+  if (return_type_ok && size == kFramePointerSize) {
+    return kFramePointerSize;
+  }
+
+  return RoundUp(size, kNativeStackAlignment);
+}
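+
+// A worked example (illustrative only; this shorty is hypothetical): for
+// shorty "III", i.e. int f(int, int), both args go on the native stack, so
+// num_arg_words = 2 and size = 8 + 4 for the return address; 12 != 4 rules out
+// the tail call, and rounding up to the 16-byte alignment yields 16.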
+
+}  // namespace x86
+}  // namespace art
+
+#endif  // ART_RUNTIME_ARCH_X86_JNI_FRAME_X86_H_
+
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 7d2a7e6..4abdf70 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -120,20 +120,7 @@
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs)
      */
 MACRO2(SETUP_SAVE_REFS_AND_ARGS_FRAME, got_reg, temp_reg)
-    PUSH edi  // Save callee saves
-    PUSH esi
-    PUSH ebp
-    PUSH ebx  // Save args
-    PUSH edx
-    PUSH ecx
-    // Create space for FPR args.
-    subl MACRO_LITERAL(4 * 8), %esp
-    CFI_ADJUST_CFA_OFFSET(4 * 8)
-    // Save FPRs.
-    movsd %xmm0, 0(%esp)
-    movsd %xmm1, 8(%esp)
-    movsd %xmm2, 16(%esp)
-    movsd %xmm3, 24(%esp)
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
 
     SETUP_GOT_NOSAVE RAW_VAR(got_reg)
     // Load Runtime::instance_ from GOT.
@@ -144,12 +131,6 @@
     CFI_ADJUST_CFA_OFFSET(4)
     // Store esp as the top quick frame.
     movl %esp, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
-
-    // Ugly compile-time check, but we only have the preprocessor.
-    // Last +4: implicit return address pushed on stack when caller made call.
-#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 7*4 + 4*8 + 4)
-#error "FRAME_SIZE_SAVE_REFS_AND_ARGS(X86) size not as expected."
-#endif
 END_MACRO
 
     /*
@@ -157,47 +138,14 @@
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs) where the method is passed in EAX.
      */
 MACRO0(SETUP_SAVE_REFS_AND_ARGS_FRAME_WITH_METHOD_IN_EAX)
-    // Save callee and GPR args, mixed together to agree with core spills bitmap.
-    PUSH edi  // Save callee saves
-    PUSH esi
-    PUSH ebp
-    PUSH ebx  // Save args
-    PUSH edx
-    PUSH ecx
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
 
-    // Create space for FPR args.
-    subl MACRO_LITERAL(32), %esp
-    CFI_ADJUST_CFA_OFFSET(32)
-
-    // Save FPRs.
-    movsd %xmm0, 0(%esp)
-    movsd %xmm1, 8(%esp)
-    movsd %xmm2, 16(%esp)
-    movsd %xmm3, 24(%esp)
-
-    PUSH eax  // Store the ArtMethod reference at the bottom of the stack.
+    pushl %eax  // Store the ArtMethod reference at the bottom of the stack.
+    CFI_ADJUST_CFA_OFFSET(4)
     // Store esp as the top quick frame.
     movl %esp, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
 END_MACRO
 
-MACRO0(RESTORE_SAVE_REFS_AND_ARGS_FRAME)
-    // Restore FPRs. EAX is still on the stack.
-    movsd 4(%esp), %xmm0
-    movsd 12(%esp), %xmm1
-    movsd 20(%esp), %xmm2
-    movsd 28(%esp), %xmm3
-
-    addl MACRO_LITERAL(36), %esp  // Remove FPRs and EAX.
-    CFI_ADJUST_CFA_OFFSET(-36)
-
-    POP ecx                       // Restore args except eax
-    POP edx
-    POP ebx
-    POP ebp                       // Restore callee saves
-    POP esi
-    POP edi
-END_MACRO
-
 // Restore register and jump to routine
 // Inputs:  EDI contains pointer to code.
 // Notes: Need to pop EAX too (restores Method*)
@@ -331,20 +279,6 @@
 
     /*
      * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
-     * exception is Thread::Current()->exception_ when the runtime method frame is ready.
-     */
-MACRO0(DELIVER_PENDING_EXCEPTION_FRAME_READY)
-    // Outgoing argument set up
-    subl MACRO_LITERAL(12), %esp               // alignment padding
-    CFI_ADJUST_CFA_OFFSET(12)
-    pushl %fs:THREAD_SELF_OFFSET               // pass Thread::Current()
-    CFI_ADJUST_CFA_OFFSET(4)
-    call SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*)
-    UNREACHABLE
-END_MACRO
-
-    /*
-     * Macro that calls through to artDeliverPendingExceptionFromCode, where the pending
      * exception is Thread::Current()->exception_.
      */
 MACRO0(DELIVER_PENDING_EXCEPTION)
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index 2b50cdb..6a60a98 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -156,6 +156,28 @@
     CFI_RESTORE(REG_VAR(reg))
 END_MACRO
 
+// Arguments do not need .cfi_rel_offset as they are caller-saved and therefore
+// cannot be relied upon to hold the caller's variables or unwinding data.
+MACRO1(PUSH_ARG, reg)
+    pushq REG_VAR(reg)
+    CFI_ADJUST_CFA_OFFSET(8)
+END_MACRO
+
+MACRO1(POP_ARG, reg)
+    popq REG_VAR(reg)
+    CFI_ADJUST_CFA_OFFSET(-8)
+END_MACRO
+
+MACRO3(SAVE_REG_BASE, base, reg, offset)
+    movq REG_VAR(reg), RAW_VAR(offset)(REG_VAR(base))
+    CFI_REL_OFFSET(REG_VAR(reg), RAW_VAR(offset))
+END_MACRO
+
+MACRO3(RESTORE_REG_BASE, base, reg, offset)
+    movq RAW_VAR(offset)(REG_VAR(base)), REG_VAR(reg)
+    CFI_RESTORE(REG_VAR(reg))
+END_MACRO
+
 MACRO1(UNIMPLEMENTED,name)
     FUNCTION_TYPE(SYMBOL(\name))
     ASM_HIDDEN VAR(name)
@@ -251,6 +273,77 @@
 
     /*
      * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs), except for storing the method.
+     */
+MACRO0(SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY)
+    // Save callee and GPR args, mixed together to agree with core spills bitmap.
+    PUSH r15      // Callee save.
+    PUSH r14      // Callee save.
+    PUSH r13      // Callee save.
+    PUSH r12      // Callee save.
+    PUSH_ARG r9   // Quick arg 5.
+    PUSH_ARG r8   // Quick arg 4.
+    PUSH_ARG rsi  // Quick arg 1.
+    PUSH rbp      // Callee save.
+    PUSH rbx      // Callee save.
+    PUSH_ARG rdx  // Quick arg 2.
+    PUSH_ARG rcx  // Quick arg 3.
+    // Create space for FPR args and create 2 slots for ArtMethod*.
+    subq MACRO_LITERAL(16 + 12 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(16 + 12 * 8)
+    // Save FPRs.
+    movq %xmm0, 16(%rsp)
+    movq %xmm1, 24(%rsp)
+    movq %xmm2, 32(%rsp)
+    movq %xmm3, 40(%rsp)
+    movq %xmm4, 48(%rsp)
+    movq %xmm5, 56(%rsp)
+    movq %xmm6, 64(%rsp)
+    movq %xmm7, 72(%rsp)
+    movq %xmm12, 80(%rsp)
+    movq %xmm13, 88(%rsp)
+    movq %xmm14, 96(%rsp)
+    movq %xmm15, 104(%rsp)
+
+    // Ugly compile-time check, but we only have the preprocessor.
+    // Last +8: implicit return address pushed on stack when caller made call.
+#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 11 * 8 + 12 * 8 + 16 + 8)
+#error "FRAME_SIZE_SAVE_REFS_AND_ARGS(X86_64) size not as expected."
+#endif
+END_MACRO
+
+MACRO0(RESTORE_SAVE_REFS_AND_ARGS_FRAME)
+    // Restore FPRs.
+    movq 16(%rsp), %xmm0
+    movq 24(%rsp), %xmm1
+    movq 32(%rsp), %xmm2
+    movq 40(%rsp), %xmm3
+    movq 48(%rsp), %xmm4
+    movq 56(%rsp), %xmm5
+    movq 64(%rsp), %xmm6
+    movq 72(%rsp), %xmm7
+    movq 80(%rsp), %xmm12
+    movq 88(%rsp), %xmm13
+    movq 96(%rsp), %xmm14
+    movq 104(%rsp), %xmm15
+    addq MACRO_LITERAL(80 + 4 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(80 + 4 * 8))
+    // Restore callee and GPR args, mixed together to agree with core spills bitmap.
+    POP_ARG rcx
+    POP_ARG rdx
+    POP rbx
+    POP rbp
+    POP_ARG rsi
+    POP_ARG r8
+    POP_ARG r9
+    POP r12
+    POP r13
+    POP r14
+    POP r15
+END_MACRO
+
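
For readers following the offsets used later in this change (`SAVE_REG_BASE r10, r15, 192` and friends), here is a sketch of the SaveRefsAndArgs layout these macros build, as byte offsets from the final RSP. The constants are derived from the pushes and stores above and are illustrative, not ART source:

```cpp
// Reference sketch of the x86-64 SaveRefsAndArgs frame layout (illustrative only).
#include <cstddef>

constexpr size_t kMethodAndPadding = 0;    // 2 slots, 16 bytes: ArtMethod* + padding.
constexpr size_t kXmm0Offset       = 16;   // xmm0..xmm7 at 16..72.
constexpr size_t kXmm12Offset      = 80;   // xmm12..xmm15 at 80..104.
constexpr size_t kRcxOffset        = 112;  // Then rdx, rbx, rbp, rsi, r8, r9, ...
constexpr size_t kRbxOffset        = 128;
constexpr size_t kRbpOffset        = 136;
constexpr size_t kR12Offset        = 168;  // ... r13, r14, r15 at 176, 184, 192.
constexpr size_t kReturnPcOffset   = 200;
constexpr size_t kFrameSize        = 208;  // 11*8 + 12*8 + 16 + 8, as asserted above.
static_assert(kReturnPcOffset + 8 == kFrameSize, "return PC is the last slot");
```

These values match the SAVE_REG_BASE/RESTORE_REG_BASE offsets used by the critical stub below.
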
+    /*
+     * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveAllCalleeSaves)
      */
 MACRO0(SETUP_SAVE_ALL_CALLEE_SAVES_FRAME)
diff --git a/runtime/arch/x86_64/jni_entrypoints_x86_64.S b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
index 3860c37..e1b8e52 100644
--- a/runtime/arch/x86_64/jni_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
@@ -20,16 +20,16 @@
      * Jni dlsym lookup stub.
      */
 DEFINE_FUNCTION art_jni_dlsym_lookup_stub
-    // Save callee and GPR args, mixed together to agree with core spills bitmap.
-    PUSH r9   // Arg.
-    PUSH r8   // Arg.
-    PUSH rdi  // JniEnv.
-    PUSH rsi  // Arg.
-    PUSH rdx  // Arg.
-    PUSH rcx  // Arg.
+    // Save GPR args.
+    PUSH_ARG r9   // Arg.
+    PUSH_ARG r8   // Arg.
+    PUSH_ARG rdi  // Arg. (JniEnv for normal and @FastNative)
+    PUSH_ARG rsi  // Arg.
+    PUSH_ARG rdx  // Arg.
+    PUSH_ARG rcx  // Arg.
     // Create space for FPR args, plus padding for alignment
-    subq LITERAL(72 + 4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(72 + 4 * 8)
+    subq LITERAL(72), %rsp
+    CFI_ADJUST_CFA_OFFSET(72)
     // Save FPRs.
     movq %xmm0, 0(%rsp)
     movq %xmm1, 8(%rsp)
@@ -39,17 +39,15 @@
     movq %xmm5, 40(%rsp)
     movq %xmm6, 48(%rsp)
     movq %xmm7, 56(%rsp)
-    movq %xmm12, 64(%rsp)
-    movq %xmm13, 72(%rsp)
-    movq %xmm14, 80(%rsp)
-    movq %xmm15, 88(%rsp)
     // prepare call
     movq %gs:THREAD_SELF_OFFSET, %rdi      // RDI := Thread::Current()
-    // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable for @FastNative.
+    // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable()
+    // for @FastNative or @CriticalNative.
     movq THREAD_TOP_QUICK_FRAME_OFFSET(%rdi), %rax   // uintptr_t tagged_quick_frame
     andq LITERAL(0xfffffffffffffffe), %rax           // ArtMethod** sp
     movq (%rax), %rax                                // ArtMethod* method
-    testl LITERAL(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE), ART_METHOD_ACCESS_FLAGS_OFFSET(%rax)
+    testl LITERAL(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE | ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE), \
+          ART_METHOD_ACCESS_FLAGS_OFFSET(%rax)
     jne .Llookup_stub_fast_native
     call SYMBOL(artFindNativeMethod)  // (Thread*)
     jmp .Llookup_stub_continue
@@ -65,21 +63,200 @@
     movq 40(%rsp), %xmm5
     movq 48(%rsp), %xmm6
     movq 56(%rsp), %xmm7
-    movq 64(%rsp), %xmm12
-    movq 72(%rsp), %xmm13
-    movq 80(%rsp), %xmm14
-    movq 88(%rsp), %xmm15
-    addq LITERAL(72 + 4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(-72 - 4 * 8)
-    POP rcx  // Arg.
-    POP rdx  // Arg.
-    POP rsi  // Arg.
-    POP rdi  // JniEnv.
-    POP r8   // Arg.
-    POP r9   // Arg.
-    testq %rax, %rax         // check if returned method code is null
+    addq LITERAL(72), %rsp
+    CFI_ADJUST_CFA_OFFSET(-72)
+    POP_ARG rcx  // Arg.
+    POP_ARG rdx  // Arg.
+    POP_ARG rsi  // Arg.
+    POP_ARG rdi  // Arg. (JniEnv for normal and @FastNative)
+    POP_ARG r8   // Arg.
+    POP_ARG r9   // Arg.
+    testq %rax, %rax              // check if returned method code is null
     jz .Lno_native_code_found     // if null, jump to return to handle
     jmp *%rax                     // otherwise, tail call to intended method
 .Lno_native_code_found:
     ret
 END_FUNCTION art_jni_dlsym_lookup_stub
+
+DEFINE_FUNCTION art_jni_dlsym_lookup_critical_stub
+    // The hidden arg holding the tagged method (bit 0 set means GenericJNI) is R11.
+    // For Generic JNI we already have a managed frame, so we reuse art_jni_dlsym_lookup_stub.
+    testq LITERAL(1), %r11
+    jnz art_jni_dlsym_lookup_stub
+
+    // We need to create a GenericJNI managed frame above the stack args.
+
+    // The GenericJNI frame is similar to the SaveRefsAndArgs frame, with the native
+    // method instead of the runtime method saved at the bottom.
+
+    // As we always have "stack args" on x86-64 (xmm12-xmm15 are callee-save in the managed
+    // ABI but caller-save in the native ABI), we do not create a proper frame yet, unlike
+    // other architectures where doing so is useful when there are no stack args.
+
+    // Reserve space for the frame (return PC is on stack).
+    subq MACRO_LITERAL(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__), %rsp
+    CFI_ADJUST_CFA_OFFSET(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)
+
+    // Save GPR args.
+    PUSH_ARG r9
+    PUSH_ARG r8
+    PUSH_ARG rdi
+    PUSH_ARG rsi
+    PUSH_ARG rdx
+    PUSH_ARG rcx
+    // Create space for FPR args.
+    subq LITERAL(64), %rsp
+    CFI_ADJUST_CFA_OFFSET(64)
+    // Save FPRs.
+    movq %xmm0, 0(%rsp)
+    movq %xmm1, 8(%rsp)
+    movq %xmm2, 16(%rsp)
+    movq %xmm3, 24(%rsp)
+    movq %xmm4, 32(%rsp)
+    movq %xmm5, 40(%rsp)
+    movq %xmm6, 48(%rsp)
+    movq %xmm7, 56(%rsp)
+
+    // Add alignment padding.
+    subq MACRO_LITERAL(__SIZEOF_POINTER__), %rsp
+    CFI_ADJUST_CFA_OFFSET(__SIZEOF_POINTER__)
+    // Save hidden arg.
+    PUSH_ARG r11
+
+    // Call artCriticalNativeOutArgsSize(method).
+    movq %r11, %rdi  // Pass the method from hidden arg.
+    call SYMBOL(artCriticalNativeOutArgsSize)
+
+    // Calculate the address of the end of the move destination and redefine CFI to take
+    // ownership of the JNI stub frame.
+    leaq 16 * __SIZEOF_POINTER__(%rsp, %rax, 1), %r10  // 16 QWORDs of registers saved above.
+    CFI_DEF_CFA(%r10, FRAME_SIZE_SAVE_REFS_AND_ARGS)
+
+    // Calculate the number of QWORDs to move.
+    shrq LITERAL(3), %rax
+    leaq -1(%rax), %rcx  // Do not move the return PC.
+
+    // Load our return PC to RAX.
+    movq FRAME_SIZE_SAVE_REFS_AND_ARGS + (16 - 1) * __SIZEOF_POINTER__(%rsp), %rax
+
+    // Move the stack args.
+    leaq 16 * __SIZEOF_POINTER__(%rsp), %rdi
+    leaq FRAME_SIZE_SAVE_REFS_AND_ARGS(%rdi), %rsi
+    rep movsq
+
+    // Save our return PC.
+    movq %rax, (%rdi)
+
+    // Pop the hidden arg and alignment padding.
+    popq %r11    // No `.cfi_adjust_cfa_offset`, CFA register is currently R10, not RSP.
+    addq MACRO_LITERAL(__SIZEOF_POINTER__), %rsp  // ditto
+
+    // Fill the SaveRefsAndArgs frame above the args, without the actual args. Note that
+    // the runtime shall not examine the args here; otherwise we would have to move them in
+    // registers and on the stack to account for the difference between managed and native ABIs.
+    SAVE_REG_BASE r10, r15, 192
+    SAVE_REG_BASE r10, r14, 184
+    SAVE_REG_BASE r10, r13, 176
+    SAVE_REG_BASE r10, r12, 168
+    // Skip args r9, r8, rsi.
+    SAVE_REG_BASE r10, rbp, 136
+    SAVE_REG_BASE r10, rbx, 128
+    // Skip args rdx, rcx.
+    // Skip args xmm0-xmm7.
+    // Copy managed callee-saves xmm12-xmm15 from out args to the managed frame as they
+    // may theoretically store variables or unwinding data. (The compiled stub preserves
+    // them but the artCriticalNativeOutArgsSize() call above may clobber them.)
+    movq -5 * __SIZEOF_POINTER__(%r10), %xmm12
+    movq -4 * __SIZEOF_POINTER__(%r10), %xmm13
+    movq -3 * __SIZEOF_POINTER__(%r10), %xmm14
+    movq -2 * __SIZEOF_POINTER__(%r10), %xmm15
+    movq %xmm12, 80(%r10)
+    movq %xmm13, 88(%r10)
+    movq %xmm14, 96(%r10)
+    movq %xmm15, 104(%r10)
+    // Save the hidden arg as the method pointer at the bottom of the stack.
+    movq %r11, (%r10)
+
+    // Move the frame register to a callee-save register.
+    movq %r10, %rbp
+    CFI_DEF_CFA_REGISTER(%rbp)
+
+    // Place tagged managed sp in Thread::Current()->top_quick_frame.
+    leaq 1(%rbp), %rax  // Tag as GenericJNI frame.
+    movq %rax, %gs:THREAD_TOP_QUICK_FRAME_OFFSET
+
+    // Call artFindNativeMethodRunnable()
+    movq %gs:THREAD_SELF_OFFSET, %rdi  // pass Thread::Current()
+    call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
+
+    // Check for exception: a null code pointer means a pending exception.
+    test %rax, %rax
+    jz 2f
+
+    // Restore the frame. We shall not need the method anymore.
+    .cfi_remember_state
+    movq %rbp, %r10
+    CFI_DEF_CFA_REGISTER(%r10)
+    // Skip args xmm0-xmm7 and managed callee-saves xmm12-xmm15 (not needed for native call).
+    // Skip args rdx, rcx.
+    RESTORE_REG_BASE r10, rbx, 128
+    RESTORE_REG_BASE r10, rbp, 136
+    // Skip args r9, r8, rsi.
+    RESTORE_REG_BASE r10, r12, 168
+    RESTORE_REG_BASE r10, r13, 176
+    RESTORE_REG_BASE r10, r14, 184
+    RESTORE_REG_BASE r10, r15, 192
+
+    // Remember our return PC in R11.
+    movq -__SIZEOF_POINTER__(%r10), %r11
+
+    // Calculate the number of QWORDs to move.
+    leaq -(1 + 14) * __SIZEOF_POINTER__(%r10), %rcx  // Do not move return PC, 14 arg regs saved.
+    subq %rsp, %rcx
+    shrq LITERAL(3), %rcx
+
+    // Move the stack args back to their original place.
+    leaq -2 * __SIZEOF_POINTER__(%r10), %rsi
+    leaq FRAME_SIZE_SAVE_REFS_AND_ARGS - 2 * __SIZEOF_POINTER__(%r10), %rdi
+    std
+    rep movsq
+    cld
+
+    // Store our return PC.
+    movq %r11, (%rdi)
+
+    // Redefine CFI to release ownership of the JNI stub frame.
+    CFI_DEF_CFA(%rsp, FRAME_SIZE_SAVE_REFS_AND_ARGS + 14 * __SIZEOF_POINTER__)
+
+    // Restore args.
+    movq 0(%rsp), %xmm0
+    movq 8(%rsp), %xmm1
+    movq 16(%rsp), %xmm2
+    movq 24(%rsp), %xmm3
+    movq 32(%rsp), %xmm4
+    movq 40(%rsp), %xmm5
+    movq 48(%rsp), %xmm6
+    movq 56(%rsp), %xmm7
+    addq LITERAL(64), %rsp
+    CFI_ADJUST_CFA_OFFSET(-64)
+    POP_ARG rcx
+    POP_ARG rdx
+    POP_ARG rsi
+    POP_ARG rdi
+    POP_ARG r8
+    POP_ARG r9
+
+    // Remove the frame reservation.
+    addq LITERAL(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__))
+
+    // Do the tail call.
+    jmp *%rax
+    CFI_RESTORE_STATE_AND_DEF_CFA(%rbp, FRAME_SIZE_SAVE_REFS_AND_ARGS)
+
+2:
+    // Drop the args from the stack (R11 and the alignment padding were already removed).
+    addq LITERAL(14 * __SIZEOF_POINTER__), %rsp
+
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
+END_FUNCTION art_jni_dlsym_lookup_critical_stub
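
The stub's first test hinges on the hidden-arg tagging convention: bit 0 of the method pointer in R11 distinguishes a GenericJNI caller, and the same trick tags the managed SP stored in `top_quick_frame` (`leaq 1(%rbp), %rax` above). A minimal sketch of that convention, with hypothetical names, not ART's:

```cpp
// Illustrative sketch of the bit-0 tagging checked by `testq LITERAL(1), %r11`
// and applied by `leaq 1(%rbp), %rax` in the stub above.
#include <cstdint>

struct ArtMethod;  // Opaque here; only the (at least 2-byte aligned) pointer value matters.

uintptr_t TagForGenericJni(ArtMethod* method) {
  return reinterpret_cast<uintptr_t>(method) | 1u;  // Set bit 0: GenericJNI.
}

bool IsGenericJniTagged(uintptr_t tagged) {
  return (tagged & 1u) != 0u;
}

ArtMethod* ClearGenericJniTag(uintptr_t tagged) {
  return reinterpret_cast<ArtMethod*>(tagged & ~uintptr_t{1u});
}
```
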
diff --git a/runtime/arch/x86_64/jni_frame_x86_64.h b/runtime/arch/x86_64/jni_frame_x86_64.h
new file mode 100644
index 0000000..65736fe
--- /dev/null
+++ b/runtime/arch/x86_64/jni_frame_x86_64.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_ARCH_X86_64_JNI_FRAME_X86_64_H_
+#define ART_RUNTIME_ARCH_X86_64_JNI_FRAME_X86_64_H_
+
+#include <string.h>
+
+#include <algorithm>
+
+#include "arch/instruction_set.h"
+#include "base/bit_utils.h"
+#include "base/globals.h"
+#include "base/logging.h"
+
+namespace art {
+namespace x86_64 {
+
+constexpr size_t kFramePointerSize = static_cast<size_t>(PointerSize::k64);
+static_assert(kX86_64PointerSize == PointerSize::k64, "Unexpected x86_64 pointer size");
+
+static constexpr size_t kNativeStackAlignment = 16;
+static_assert(kNativeStackAlignment == kStackAlignment);
+
+// We always have to spill registers xmm12-xmm15, which are callee-save
+// in the managed ABI but caller-save in the native ABI.
+constexpr size_t kMmxSpillSize = 8u;
+constexpr size_t kAlwaysSpilledMmxRegisters = 4;
+
+// XMM0..XMM7 can be used to pass the first 8 floating-point args. The rest must go on the stack.
+// -- Managed and JNI calling conventions.
+constexpr size_t kMaxFloatOrDoubleRegisterArguments = 8u;
+// Up to how many integer-like (pointers, objects, longs, int, short, bool, etc.) args can be
+// enregistered. The rest of the args must go on the stack.
+// -- JNI calling convention only (Managed excludes RDI, so it's actually 5).
+constexpr size_t kMaxIntLikeRegisterArguments = 6u;
+
+// Get the size of "out args" for a @CriticalNative method stub.
+// This must match the size of the frame emitted by the JNI compiler at the native call site.
+inline size_t GetCriticalNativeOutArgsSize(const char* shorty, uint32_t shorty_len) {
+  DCHECK_EQ(shorty_len, strlen(shorty));
+
+  size_t num_fp_args = 0u;
+  for (size_t i = 1; i != shorty_len; ++i) {
+    if (shorty[i] == 'F' || shorty[i] == 'D') {
+      num_fp_args += 1u;
+    }
+  }
+  size_t num_non_fp_args = shorty_len - 1u - num_fp_args;
+
+  // Account for FP arguments passed through XMM0..XMM7.
+  size_t num_stack_fp_args =
+      num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
+  // Account for other (integer) arguments passed through GPRs (RDI, RSI, RDX, RCX, R8, R9).
+  size_t num_stack_non_fp_args =
+      num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
+  // The size of outgoing arguments.
+  static_assert(kFramePointerSize == kMmxSpillSize);
+  size_t size = (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
+
+  // We always need to spill xmm12-xmm15 as they are managed callee-saves
+  // but not native callee-saves.
+  size += kAlwaysSpilledMmxRegisters * kMmxSpillSize;
+  // Add return address size.
+  size += kFramePointerSize;
+
+  return RoundUp(size, kNativeStackAlignment);
+}
+
+}  // namespace x86_64
+}  // namespace art
+
+#endif  // ART_RUNTIME_ARCH_X86_64_JNI_FRAME_X86_64_H_
+
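
As with the x86 header, a standalone sketch (illustrative, not ART source) makes the x86-64 arithmetic concrete: only FP args beyond XMM0..XMM7 and integer-like args beyond the six GPRs spill to the stack, and xmm12-xmm15 plus the return address are always accounted for:

```cpp
// Standalone sketch of the x86-64 out-args computation (illustrative only).
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstring>

size_t CriticalNativeOutArgsSizeX86_64(const char* shorty) {
  size_t len = strlen(shorty);
  size_t fp_args = 0u;
  for (size_t i = 1u; i != len; ++i) {
    if (shorty[i] == 'F' || shorty[i] == 'D') {
      fp_args += 1u;
    }
  }
  size_t gpr_args = len - 1u - fp_args;
  // Only args beyond 8 XMM and 6 GPR registers go on the stack.
  size_t stack_words = (fp_args - std::min<size_t>(8u, fp_args)) +
                       (gpr_args - std::min<size_t>(6u, gpr_args));
  size_t size = stack_words * 8u  // Stack args.
              + 4u * 8u           // Always-spilled xmm12-xmm15.
              + 8u;               // Return address.
  return (size + 15u) & ~size_t{15u};  // Round up to the 16-byte native alignment.
}

int main() {
  assert(CriticalNativeOutArgsSizeX86_64("IDD") == 48u);       // All args in registers: 40 -> 48.
  assert(CriticalNativeOutArgsSizeX86_64("VIIIIIII") == 48u);  // The 7th int spills: still 48.
  return 0;
}
```
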
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index c2f87b2..abc3a8a 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -38,117 +38,27 @@
     int3
     int3
 #else
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
     // R10 := Runtime::Current()
     movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
     movq (%r10), %r10
-    // Save callee and GPR args, mixed together to agree with core spills bitmap.
-    PUSH r15  // Callee save.
-    PUSH r14  // Callee save.
-    PUSH r13  // Callee save.
-    PUSH r12  // Callee save.
-    PUSH r9   // Quick arg 5.
-    PUSH r8   // Quick arg 4.
-    PUSH rsi  // Quick arg 1.
-    PUSH rbp  // Callee save.
-    PUSH rbx  // Callee save.
-    PUSH rdx  // Quick arg 2.
-    PUSH rcx  // Quick arg 3.
-    // Create space for FPR args and create 2 slots for ArtMethod*.
-    subq MACRO_LITERAL(16 + 12 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(16 + 12 * 8)
     // R10 := ArtMethod* for ref and args callee save frame method.
     movq RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET(%r10), %r10
-    // Save FPRs.
-    movq %xmm0, 16(%rsp)
-    movq %xmm1, 24(%rsp)
-    movq %xmm2, 32(%rsp)
-    movq %xmm3, 40(%rsp)
-    movq %xmm4, 48(%rsp)
-    movq %xmm5, 56(%rsp)
-    movq %xmm6, 64(%rsp)
-    movq %xmm7, 72(%rsp)
-    movq %xmm12, 80(%rsp)
-    movq %xmm13, 88(%rsp)
-    movq %xmm14, 96(%rsp)
-    movq %xmm15, 104(%rsp)
     // Store ArtMethod* to bottom of stack.
     movq %r10, 0(%rsp)
     // Store rsp as the top quick frame.
     movq %rsp, %gs:THREAD_TOP_QUICK_FRAME_OFFSET
-
-    // Ugly compile-time check, but we only have the preprocessor.
-    // Last +8: implicit return address pushed on stack when caller made call.
-#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 11 * 8 + 12 * 8 + 16 + 8)
-#error "FRAME_SIZE_SAVE_REFS_AND_ARGS(X86_64) size not as expected."
-#endif
 #endif  // __APPLE__
 END_MACRO
 
 MACRO0(SETUP_SAVE_REFS_AND_ARGS_FRAME_WITH_METHOD_IN_RDI)
-    // Save callee and GPR args, mixed together to agree with core spills bitmap.
-    PUSH r15  // Callee save.
-    PUSH r14  // Callee save.
-    PUSH r13  // Callee save.
-    PUSH r12  // Callee save.
-    PUSH r9   // Quick arg 5.
-    PUSH r8   // Quick arg 4.
-    PUSH rsi  // Quick arg 1.
-    PUSH rbp  // Callee save.
-    PUSH rbx  // Callee save.
-    PUSH rdx  // Quick arg 2.
-    PUSH rcx  // Quick arg 3.
-    // Create space for FPR args and create 2 slots for ArtMethod*.
-    subq LITERAL(80 + 4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(80 + 4 * 8)
-    // Save FPRs.
-    movq %xmm0, 16(%rsp)
-    movq %xmm1, 24(%rsp)
-    movq %xmm2, 32(%rsp)
-    movq %xmm3, 40(%rsp)
-    movq %xmm4, 48(%rsp)
-    movq %xmm5, 56(%rsp)
-    movq %xmm6, 64(%rsp)
-    movq %xmm7, 72(%rsp)
-    movq %xmm12, 80(%rsp)
-    movq %xmm13, 88(%rsp)
-    movq %xmm14, 96(%rsp)
-    movq %xmm15, 104(%rsp)
+    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
     // Store ArtMethod to bottom of stack.
     movq %rdi, 0(%rsp)
     // Store rsp as the top quick frame.
     movq %rsp, %gs:THREAD_TOP_QUICK_FRAME_OFFSET
 END_MACRO
 
-MACRO0(RESTORE_SAVE_REFS_AND_ARGS_FRAME)
-    // Restore FPRs.
-    movq 16(%rsp), %xmm0
-    movq 24(%rsp), %xmm1
-    movq 32(%rsp), %xmm2
-    movq 40(%rsp), %xmm3
-    movq 48(%rsp), %xmm4
-    movq 56(%rsp), %xmm5
-    movq 64(%rsp), %xmm6
-    movq 72(%rsp), %xmm7
-    movq 80(%rsp), %xmm12
-    movq 88(%rsp), %xmm13
-    movq 96(%rsp), %xmm14
-    movq 104(%rsp), %xmm15
-    addq MACRO_LITERAL(80 + 4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(-(80 + 4 * 8))
-    // Restore callee and GPR args, mixed together to agree with core spills bitmap.
-    POP rcx
-    POP rdx
-    POP rbx
-    POP rbp
-    POP rsi
-    POP r8
-    POP r9
-    POP r12
-    POP r13
-    POP r14
-    POP r15
-END_MACRO
-
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveEverything)
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index a74d1ad..d0b6fde 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -406,7 +406,8 @@
 void ArtMethod::UnregisterNative() {
   CHECK(IsNative()) << PrettyMethod();
   // restore stub to lookup native pointer via dlsym
-  SetEntryPointFromJni(GetJniDlsymLookupStub());
+  SetEntryPointFromJni(
+      IsCriticalNative() ? GetJniDlsymLookupCriticalStub() : GetJniDlsymLookupStub());
 }
 
 bool ArtMethod::IsOverridableByDefaultMethod() {
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 4a4171e..8f3e1cb 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -627,6 +627,7 @@
       intern_table_(intern_table),
       fast_class_not_found_exceptions_(fast_class_not_found_exceptions),
       jni_dlsym_lookup_trampoline_(nullptr),
+      jni_dlsym_lookup_critical_trampoline_(nullptr),
       quick_resolution_trampoline_(nullptr),
       quick_imt_conflict_trampoline_(nullptr),
       quick_generic_jni_trampoline_(nullptr),
@@ -851,6 +852,7 @@
   if (!runtime->IsAotCompiler()) {
     // We need to set up the generic trampolines since we don't have an image.
     jni_dlsym_lookup_trampoline_ = GetJniDlsymLookupStub();
+    jni_dlsym_lookup_critical_trampoline_ = GetJniDlsymLookupCriticalStub();
     quick_resolution_trampoline_ = GetQuickResolutionStub();
     quick_imt_conflict_trampoline_ = GetQuickImtConflictStub();
     quick_generic_jni_trampoline_ = GetQuickGenericJniStub();
@@ -1202,6 +1204,7 @@
   DCHECK(!oat_files.empty());
   const OatHeader& default_oat_header = oat_files[0]->GetOatHeader();
   jni_dlsym_lookup_trampoline_ = default_oat_header.GetJniDlsymLookupTrampoline();
+  jni_dlsym_lookup_critical_trampoline_ = default_oat_header.GetJniDlsymLookupCriticalTrampoline();
   quick_resolution_trampoline_ = default_oat_header.GetQuickResolutionTrampoline();
   quick_imt_conflict_trampoline_ = default_oat_header.GetQuickImtConflictTrampoline();
   quick_generic_jni_trampoline_ = default_oat_header.GetQuickGenericJniTrampoline();
@@ -1212,6 +1215,8 @@
       const OatHeader& ith_oat_header = oat_files[i]->GetOatHeader();
       const void* ith_jni_dlsym_lookup_trampoline_ =
           ith_oat_header.GetJniDlsymLookupTrampoline();
+      const void* ith_jni_dlsym_lookup_critical_trampoline_ =
+          ith_oat_header.GetJniDlsymLookupCriticalTrampoline();
       const void* ith_quick_resolution_trampoline =
           ith_oat_header.GetQuickResolutionTrampoline();
       const void* ith_quick_imt_conflict_trampoline =
@@ -1221,6 +1226,7 @@
       const void* ith_quick_to_interpreter_bridge_trampoline =
           ith_oat_header.GetQuickToInterpreterBridge();
       if (ith_jni_dlsym_lookup_trampoline_ != jni_dlsym_lookup_trampoline_ ||
+          ith_jni_dlsym_lookup_critical_trampoline_ != jni_dlsym_lookup_critical_trampoline_ ||
           ith_quick_resolution_trampoline != quick_resolution_trampoline_ ||
           ith_quick_imt_conflict_trampoline != quick_imt_conflict_trampoline_ ||
           ith_quick_generic_jni_trampoline != quick_generic_jni_trampoline_ ||
@@ -9349,6 +9355,11 @@
       (jni_dlsym_lookup_trampoline_ == entry_point);
 }
 
+bool ClassLinker::IsJniDlsymLookupCriticalStub(const void* entry_point) const {
+  return entry_point == GetJniDlsymLookupCriticalStub() ||
+      (jni_dlsym_lookup_critical_trampoline_ == entry_point);
+}
+
 const void* ClassLinker::GetRuntimeQuickGenericJniStub() const {
   return GetQuickGenericJniStub();
 }
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index 26621af..f82a7c7 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -594,6 +594,9 @@
   // Is the given entry point the JNI dlsym lookup stub?
   bool IsJniDlsymLookupStub(const void* entry_point) const;
 
+  // Is the given entry point the JNI dlsym lookup critical stub?
+  bool IsJniDlsymLookupCriticalStub(const void* entry_point) const;
+
   const void* GetQuickToInterpreterBridgeTrampoline() const {
     return quick_to_interpreter_bridge_trampoline_;
   }
@@ -1426,6 +1429,7 @@
   // Trampolines within the image the bounce to runtime entrypoints. Done so that there is a single
   // patch point within the image. TODO: make these proper relocations.
   const void* jni_dlsym_lookup_trampoline_;
+  const void* jni_dlsym_lookup_critical_trampoline_;
   const void* quick_resolution_trampoline_;
   const void* quick_imt_conflict_trampoline_;
   const void* quick_generic_jni_trampoline_;
diff --git a/runtime/entrypoints/jni/jni_entrypoints.cc b/runtime/entrypoints/jni/jni_entrypoints.cc
index d008e1a..f1e5772 100644
--- a/runtime/entrypoints/jni/jni_entrypoints.cc
+++ b/runtime/entrypoints/jni/jni_entrypoints.cc
@@ -16,6 +16,11 @@
 
 #include <android-base/logging.h>
 
+#include "arch/arm/jni_frame_arm.h"
+#include "arch/arm64/jni_frame_arm64.h"
+#include "arch/instruction_set.h"
+#include "arch/x86/jni_frame_x86.h"
+#include "arch/x86_64/jni_frame_x86_64.h"
 #include "art_method-inl.h"
 #include "entrypoints/entrypoint_utils.h"
 #include "jni/java_vm_ext.h"
@@ -52,4 +57,24 @@
   return artFindNativeMethodRunnable(self);
 }
 
+extern "C" size_t artCriticalNativeOutArgsSize(ArtMethod* method)
+    REQUIRES_SHARED(Locks::mutator_lock_)  {
+  uint32_t shorty_len;
+  const char* shorty = method->GetShorty(&shorty_len);
+  switch (kRuntimeISA) {
+    case InstructionSet::kArm:
+    case InstructionSet::kThumb2:
+      return arm::GetCriticalNativeOutArgsSize(shorty, shorty_len);
+    case InstructionSet::kArm64:
+      return arm64::GetCriticalNativeOutArgsSize(shorty, shorty_len);
+    case InstructionSet::kX86:
+      return x86::GetCriticalNativeOutArgsSize(shorty, shorty_len);
+    case InstructionSet::kX86_64:
+      return x86_64::GetCriticalNativeOutArgsSize(shorty, shorty_len);
+    default:
+      UNIMPLEMENTED(FATAL) << kRuntimeISA;
+      UNREACHABLE();
+  }
+}
+
 }  // namespace art
diff --git a/runtime/entrypoints/jni/jni_entrypoints.h b/runtime/entrypoints/jni/jni_entrypoints.h
index 9c1b0dc..0aabed0 100644
--- a/runtime/entrypoints/jni/jni_entrypoints.h
+++ b/runtime/entrypoints/jni/jni_entrypoints.h
@@ -29,8 +29,10 @@
 
 // Pointers to functions that are called by JNI trampolines via thread-local storage.
 struct PACKED(4) JniEntryPoints {
-  // Called when the JNI method isn't registered.
+  // Called when the JNI method isn't registered; used for normal native and @FastNative methods.
   void* (*pDlsymLookup)(JNIEnv* env, jobject);
+  // Called when the JNI method isn't registered; used for @CriticalNative methods.
+  void* (*pDlsymLookupCritical)(JNIEnv* env, jobject);
 };
 
 }  // namespace art
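
A simplified sketch of what the second slot buys: the runtime keeps both lookup entrypoints side by side in thread-local storage, and the per-method stubs pick one by method kind. Types here are stand-ins for ART's JniEntryPoints, not the real declarations:

```cpp
// Illustrative sketch of selecting between the two dlsym lookup entrypoints.
using LookupFn = void* (*)(void* env, void* jmethod);  // Simplified signature.

struct JniEntryPointsSketch {
  LookupFn pDlsymLookup;          // Normal native and @FastNative methods.
  LookupFn pDlsymLookupCritical;  // @CriticalNative methods.
};

LookupFn SelectLookup(const JniEntryPointsSketch& ep, bool is_critical_native) {
  return is_critical_native ? ep.pDlsymLookupCritical : ep.pDlsymLookup;
}
```
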
diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
index 048deb4..a77bb85 100644
--- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h
@@ -29,6 +29,7 @@
 static void DefaultInitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) {
   // JNI
   jpoints->pDlsymLookup = art_jni_dlsym_lookup_stub;
+  jpoints->pDlsymLookupCritical = art_jni_dlsym_lookup_critical_stub;
 
   // Alloc
   ResetQuickAllocEntryPoints(qpoints, /* is_marking= */ true);
diff --git a/runtime/entrypoints/runtime_asm_entrypoints.h b/runtime/entrypoints/runtime_asm_entrypoints.h
index a43358f..9f47034 100644
--- a/runtime/entrypoints/runtime_asm_entrypoints.h
+++ b/runtime/entrypoints/runtime_asm_entrypoints.h
@@ -35,6 +35,11 @@
   return reinterpret_cast<const void*>(art_jni_dlsym_lookup_stub);
 }
 
+extern "C" void* art_jni_dlsym_lookup_critical_stub(JNIEnv*, jobject);
+static inline const void* GetJniDlsymLookupCriticalStub() {
+  return reinterpret_cast<const void*>(art_jni_dlsym_lookup_critical_stub);
+}
+
 // Return the address of quick stub code for handling IMT conflicts.
 extern "C" void art_quick_imt_conflict_trampoline(ArtMethod*);
 static inline const void* GetQuickImtConflictStub() {
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index d88584d..52c4142 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -147,8 +147,12 @@
   void CheckJniEntryPoints() {
     CHECKED(OFFSETOF_MEMBER(JniEntryPoints, pDlsymLookup) == 0,
             JniEntryPoints_start_with_dlsymlookup);
-    CHECKED(OFFSETOF_MEMBER(JniEntryPoints, pDlsymLookup)
-            + sizeof(void*) == sizeof(JniEntryPoints), JniEntryPoints_all);
+    CHECKED(OFFSETOF_MEMBER(JniEntryPoints, pDlsymLookup) + sizeof(void*) ==
+                OFFSETOF_MEMBER(JniEntryPoints, pDlsymLookupCritical),
+            JniEntryPoints_dlsymlookup_critical);
+    CHECKED(OFFSETOF_MEMBER(JniEntryPoints, pDlsymLookupCritical) + sizeof(void*) ==
+                sizeof(JniEntryPoints),
+            JniEntryPoints_all);
   }
 
   void CheckQuickEntryPoints() {
diff --git a/runtime/oat.cc b/runtime/oat.cc
index 7b13430..17c797a 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -73,6 +73,7 @@
       oat_dex_files_offset_(0),
       executable_offset_(0),
       jni_dlsym_lookup_trampoline_offset_(0),
+      jni_dlsym_lookup_critical_trampoline_offset_(0),
       quick_generic_jni_trampoline_offset_(0),
       quick_imt_conflict_trampoline_offset_(0),
       quick_resolution_trampoline_offset_(0),
@@ -217,6 +218,22 @@
   jni_dlsym_lookup_trampoline_offset_ = offset;
 }
 
+const void* OatHeader::GetJniDlsymLookupCriticalTrampoline() const {
+  return GetTrampoline(*this, GetJniDlsymLookupCriticalTrampolineOffset());
+}
+
+uint32_t OatHeader::GetJniDlsymLookupCriticalTrampolineOffset() const {
+  DCHECK(IsValid());
+  return jni_dlsym_lookup_critical_trampoline_offset_;
+}
+
+void OatHeader::SetJniDlsymLookupCriticalTrampolineOffset(uint32_t offset) {
+  DCHECK(IsValid());
+  DCHECK_EQ(jni_dlsym_lookup_critical_trampoline_offset_, 0U) << offset;
+
+  jni_dlsym_lookup_critical_trampoline_offset_ = offset;
+}
+
 const void* OatHeader::GetQuickGenericJniTrampoline() const {
   return GetTrampoline(*this, GetQuickGenericJniTrampolineOffset());
 }
diff --git a/runtime/oat.h b/runtime/oat.h
index 206f8af..8c81844 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: invokeinterface on j.l.Object do a vtable call.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '9', '\0' } };
+  // Last oat version changed reason: Allow late lookup for @CriticalNative.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '8', '0', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
@@ -72,6 +72,9 @@
   const void* GetJniDlsymLookupTrampoline() const;
   uint32_t GetJniDlsymLookupTrampolineOffset() const;
   void SetJniDlsymLookupTrampolineOffset(uint32_t offset);
+  const void* GetJniDlsymLookupCriticalTrampoline() const;
+  uint32_t GetJniDlsymLookupCriticalTrampolineOffset() const;
+  void SetJniDlsymLookupCriticalTrampolineOffset(uint32_t offset);
 
   const void* GetQuickGenericJniTrampoline() const;
   uint32_t GetQuickGenericJniTrampolineOffset() const;
@@ -123,6 +126,7 @@
   uint32_t oat_dex_files_offset_;
   uint32_t executable_offset_;
   uint32_t jni_dlsym_lookup_trampoline_offset_;
+  uint32_t jni_dlsym_lookup_critical_trampoline_offset_;
   uint32_t quick_generic_jni_trampoline_offset_;
   uint32_t quick_imt_conflict_trampoline_offset_;
   uint32_t quick_resolution_trampoline_offset_;
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 8916618..58a73cc 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -805,10 +805,12 @@
     return RuntimeCalleeSaveFrame::GetMethodFrameInfo(CalleeSaveType::kSaveRefsAndArgs);
   }
 
-  // The only remaining case is if the method is native and uses the generic JNI stub,
-  // called either directly or through some (resolution, instrumentation) trampoline.
+  // The only remaining cases are for native methods that either
+  //   - use the Generic JNI stub, called directly or through some
+  //     (resolution, instrumentation) trampoline; or
+  //   - fake a Generic JNI frame in art_jni_dlsym_lookup_critical_stub.
   DCHECK(method->IsNative());
-  if (kIsDebugBuild) {
+  if (kIsDebugBuild && !method->IsCriticalNative()) {
     ClassLinker* class_linker = runtime->GetClassLinker();
     const void* entry_point = runtime->GetInstrumentation()->GetQuickCodeFor(method,
                                                                              kRuntimePointerSize);
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 3add372..77b9f4f 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -3371,6 +3371,7 @@
       return; \
     }
   JNI_ENTRY_POINT_INFO(pDlsymLookup)
+  JNI_ENTRY_POINT_INFO(pDlsymLookupCritical)
 #undef JNI_ENTRY_POINT_INFO
 
 #define QUICK_ENTRY_POINT_INFO(x) \