Direct calls to @CriticalNative methods.

Emit direct calls from compiled managed code to the native
code registered with the method, avoiding the JNI stub.
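
For reference, a @CriticalNative method is a static native method with only
primitive arguments and return type, so its registered native function takes
neither JNIEnv* nor jclass. Minimal illustrative sketch (class and method
names below are made up, not taken from this change):

  #include <jni.h>

  // Managed side (Java):
  //   @dalvik.annotation.optimization.CriticalNative
  //   public static native int add(int a, int b);

  // Native side: no JNIEnv*, no jclass, just the primitive arguments.
  static jint add(jint a, jint b) {
    return a + b;
  }

  static void RegisterNativeMathMethods(JNIEnv* env, jclass klass) {
    JNINativeMethod methods[] = {{"add", "(II)I", reinterpret_cast<void*>(add)}};
    env->RegisterNatives(klass, methods, /*nMethods=*/ 1);
  }

With this change, optimized code calls the registered native function directly
through the ArtMethod's JNI entrypoint instead of going through the JNI stub.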

Golem results:
art-opt-cc                       x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +12.5% +62.5% +75.9% +41.7%
NativeDowncallStaticCritical6 +55.6% +87.5% +72.1% +35.3%
art-opt                          x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +28.6% +85.6% +76.4% +38.4%
NativeDowncallStaticCritical6 +44.6% +44.6% +74.6% +32.2%

Test: Covered by 178-app-image-native-method.
Test: m test-art-host-gtest
Test: testrunner.py --host --debuggable --ndebuggable \
          --optimizing --jit --jit-on-first-use
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Test: testrunner.py --target --debuggable --ndebuggable \
          --optimizing --jit --jit-on-first-use -t 178
Test: aosp_cf_x86_phone-userdebug boots.
Test: aosp_cf_x86_phone-userdebug/jitzygote boots.
Bug: 112189621
Change-Id: I8b37da51e8fe0b7bc513bb81b127fe0416068866
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index 2db1390..685e1e2 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -314,6 +314,12 @@
     }
     ASSERT_TRUE(jmethod_ != nullptr) << method_name << " " << method_sig;
 
+    // Make sure the test class is visibly initialized so that the RegisterNatives() below
+    // sets the JNI entrypoint rather than leaving it as null (this test pretends to be an
+    // AOT compiler and therefore the ClassLinker skips entrypoint initialization). Even
+    // if the ClassLinker initialized it with a stub, we would not want to test that here.
+    class_linker_->MakeInitializedClassesVisiblyInitialized(Thread::Current(), /*wait=*/ true);
+
     if (native_fnptr != nullptr) {
       JNINativeMethod methods[] = { { method_name, method_sig, native_fnptr } };
       ASSERT_EQ(JNI_OK, env_->RegisterNatives(jklass_, methods, 1))
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index d07ab98..7afa8b1 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -420,7 +420,7 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t ArmJniCallingConvention::OutArgSize() const {
+size_t ArmJniCallingConvention::OutFrameSize() const {
   // Count param args, including JNIEnv* and jclass*; count 8-byte args twice.
   size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs() + NumLongOrDoubleArgs();
   // Account for arguments passed through r0-r3. (No FP args, AAPCS32 is soft-float.)
@@ -440,7 +440,7 @@
   }
   size_t out_args_size = RoundUp(size, kAapcsStackAlignment);
   if (UNLIKELY(IsCriticalNative())) {
-    DCHECK_EQ(out_args_size, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+    DCHECK_EQ(out_args_size, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
   }
   return out_args_size;
 }
@@ -512,9 +512,9 @@
   CHECK_GE(itr_slots_, kJniArgumentRegisterCount);
   size_t offset =
       displacement_.Int32Value()
-          - OutArgSize()
+          - OutFrameSize()
           + ((itr_slots_ - kJniArgumentRegisterCount) * kFramePointerSize);
-  CHECK_LT(offset, OutArgSize());
+  CHECK_LT(offset, OutFrameSize());
   return FrameOffset(offset);
 }
 
@@ -537,7 +537,7 @@
 // Whether to use tail call (used only for @CriticalNative).
 bool ArmJniCallingConvention::UseTailCall() const {
   CHECK(IsCriticalNative());
-  return OutArgSize() == 0u;
+  return OutFrameSize() == 0u;
 }
 
 }  // namespace arm
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 7896d64..38f7184 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -65,7 +65,7 @@
   // JNI calling convention
   void Next() override;  // Override default behavior for AAPCS
   size_t FrameSize() const override;
-  size_t OutArgSize() const override;
+  size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index 32da141..06796c1 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -265,20 +265,14 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t Arm64JniCallingConvention::OutArgSize() const {
+size_t Arm64JniCallingConvention::OutFrameSize() const {
   // Count param args, including JNIEnv* and jclass*.
   size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs();
   size_t num_fp_args = NumFloatOrDoubleArgs();
   DCHECK_GE(all_args, num_fp_args);
   size_t num_non_fp_args = all_args - num_fp_args;
-  // Account for FP arguments passed through v0-v7.
-  size_t num_stack_fp_args =
-      num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
-  // Account for other (integer and pointer) arguments passed through GPR (x0-x7).
-  size_t num_stack_non_fp_args =
-      num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
   // The size of outgoing arguments.
-  size_t size = (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
+  size_t size = GetNativeOutArgsSize(num_fp_args, num_non_fp_args);
 
   // @CriticalNative can use tail call as all managed callee saves are preserved by AAPCS64.
   static_assert((kCoreCalleeSpillMask & ~kAapcs64CoreCalleeSpillMask) == 0u);
@@ -291,7 +285,7 @@
   }
   size_t out_args_size = RoundUp(size, kAapcs64StackAlignment);
   if (UNLIKELY(IsCriticalNative())) {
-    DCHECK_EQ(out_args_size, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+    DCHECK_EQ(out_args_size, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
   }
   return out_args_size;
 }
@@ -355,8 +349,8 @@
                              static_cast<size_t>(itr_float_and_doubles_))
                   - std::min(kMaxIntLikeRegisterArguments,
                              static_cast<size_t>(itr_args_ - itr_float_and_doubles_));
-  size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
-  CHECK_LT(offset, OutArgSize());
+  size_t offset = displacement_.Int32Value() - OutFrameSize() + (args_on_stack * kFramePointerSize);
+  CHECK_LT(offset, OutFrameSize());
   return FrameOffset(offset);
 }
 
@@ -378,7 +372,7 @@
 // Whether to use tail call (used only for @CriticalNative).
 bool Arm64JniCallingConvention::UseTailCall() const {
   CHECK(IsCriticalNative());
-  return OutArgSize() == 0u;
+  return OutFrameSize() == 0u;
 }
 
 }  // namespace arm64
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 7beca08..d381d9d 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -56,7 +56,7 @@
   ManagedRegister IntReturnRegister() override;
   // JNI calling convention
   size_t FrameSize() const override;
-  size_t OutArgSize() const override;
+  size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index b4396f0..005ae91 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -303,9 +303,9 @@
   // always at the bottom of a frame, but this doesn't work for outgoing
   // native args). Includes alignment.
   virtual size_t FrameSize() const = 0;
-  // Size of outgoing arguments (stack portion), including alignment.
+  // Size of outgoing frame, i.e. stack arguments, @CriticalNative return PC if needed, alignment.
   // -- Arguments that are passed via registers are excluded from this size.
-  virtual size_t OutArgSize() const = 0;
+  virtual size_t OutFrameSize() const = 0;
   // Number of references in stack indirect reference table
   size_t ReferenceCount() const;
   // Location where the segment state of the local indirect reference table is saved
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 036cdbb..913a3ba 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -220,7 +220,7 @@
   // 1. Build the frame saving all callee saves, Method*, and PC return address.
   //    For @CriticalNative, this includes space for out args, otherwise just the managed frame.
   const size_t managed_frame_size = main_jni_conv->FrameSize();
-  const size_t main_out_arg_size = main_jni_conv->OutArgSize();
+  const size_t main_out_arg_size = main_jni_conv->OutFrameSize();
   size_t current_frame_size = is_critical_native ? main_out_arg_size : managed_frame_size;
   ManagedRegister method_register =
       is_critical_native ? ManagedRegister::NoRegister() : mr_conv->MethodRegister();
@@ -582,7 +582,7 @@
 
   if (LIKELY(!is_critical_native)) {
     // Increase frame size for out args if needed by the end_jni_conv.
-    const size_t end_out_arg_size = end_jni_conv->OutArgSize();
+    const size_t end_out_arg_size = end_jni_conv->OutFrameSize();
     if (end_out_arg_size > current_out_arg_size) {
       size_t out_arg_size_diff = end_out_arg_size - current_out_arg_size;
       current_out_arg_size = end_out_arg_size;
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 6776f12..df45627 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -220,11 +220,10 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t X86JniCallingConvention::OutArgSize() const {
-  // Count param args, including JNIEnv* and jclass*; count 8-byte args twice.
-  size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs() + NumLongOrDoubleArgs();
-  // The size of outgoiong arguments.
-  size_t size = all_args * kFramePointerSize;
+size_t X86JniCallingConvention::OutFrameSize() const {
+  // The size of outgoing arguments.
+  size_t size = GetNativeOutArgsSize(/*num_args=*/ NumberOfExtraArgumentsForJni() + NumArgs(),
+                                     NumLongOrDoubleArgs());
 
   // @CriticalNative can use tail call as all managed callee saves are preserved by AAPCS.
   static_assert((kCoreCalleeSpillMask & ~kNativeCoreCalleeSpillMask) == 0u);
@@ -244,14 +243,16 @@
     if (return_type_ok && size == kFramePointerSize) {
       // Note: This is not aligned to kNativeStackAlignment but that's OK for tail call.
       static_assert(kFramePointerSize < kNativeStackAlignment);
-      DCHECK_EQ(kFramePointerSize, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+      // The stub frame size is considered 0 in the callee where the return PC is a part of
+      // the callee frame but it is kPointerSize in the compiled stub before the tail call.
+      DCHECK_EQ(0u, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
       return kFramePointerSize;
     }
   }
 
   size_t out_args_size = RoundUp(size, kNativeStackAlignment);
   if (UNLIKELY(IsCriticalNative())) {
-    DCHECK_EQ(out_args_size, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+    DCHECK_EQ(out_args_size, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
   }
   return out_args_size;
 }
@@ -279,7 +280,8 @@
 }
 
 FrameOffset X86JniCallingConvention::CurrentParamStackOffset() {
-  return FrameOffset(displacement_.Int32Value() - OutArgSize() + (itr_slots_ * kFramePointerSize));
+  return
+      FrameOffset(displacement_.Int32Value() - OutFrameSize() + (itr_slots_ * kFramePointerSize));
 }
 
 ManagedRegister X86JniCallingConvention::HiddenArgumentRegister() const {
@@ -295,7 +297,7 @@
 
 bool X86JniCallingConvention::UseTailCall() const {
   CHECK(IsCriticalNative());
-  return OutArgSize() == kFramePointerSize;
+  return OutFrameSize() == kFramePointerSize;
 }
 
 }  // namespace x86
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index 6f22c2b..81f617d 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -61,7 +61,7 @@
   ManagedRegister IntReturnRegister() override;
   // JNI calling convention
   size_t FrameSize() const override;
-  size_t OutArgSize() const override;
+  size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index e97cab8..44ae8be 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -208,21 +208,14 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t X86_64JniCallingConvention::OutArgSize() const {
+size_t X86_64JniCallingConvention::OutFrameSize() const {
   // Count param args, including JNIEnv* and jclass*.
   size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs();
   size_t num_fp_args = NumFloatOrDoubleArgs();
   DCHECK_GE(all_args, num_fp_args);
   size_t num_non_fp_args = all_args - num_fp_args;
-  // Account for FP arguments passed through Xmm0..Xmm7.
-  size_t num_stack_fp_args =
-      num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
-  // Account for other (integer) arguments passed through GPR (RDI, RSI, RDX, RCX, R8, R9).
-  size_t num_stack_non_fp_args =
-      num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
   // The size of outgoing arguments.
-  static_assert(kFramePointerSize == kMmxSpillSize);
-  size_t size = (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
+  size_t size = GetNativeOutArgsSize(num_fp_args, num_non_fp_args);
 
   if (UNLIKELY(IsCriticalNative())) {
     // We always need to spill xmm12-xmm15 as they are managed callee-saves
@@ -239,7 +232,7 @@
 
   size_t out_args_size = RoundUp(size, kNativeStackAlignment);
   if (UNLIKELY(IsCriticalNative())) {
-    DCHECK_EQ(out_args_size, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+    DCHECK_EQ(out_args_size, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
   }
   return out_args_size;
 }
@@ -297,8 +290,8 @@
       - std::min(kMaxIntLikeRegisterArguments,
                  static_cast<size_t>(itr_args_ - itr_float_and_doubles_));
           // Integer arguments passed through GPR
-  size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
-  CHECK_LT(offset, OutArgSize());
+  size_t offset = displacement_.Int32Value() - OutFrameSize() + (args_on_stack * kFramePointerSize);
+  CHECK_LT(offset, OutFrameSize());
   return FrameOffset(offset);
 }
 
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index d043a3e..5bde766 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -56,7 +56,7 @@
   ManagedRegister IntReturnRegister() override;
   // JNI calling convention
   size_t FrameSize() const override;
-  size_t OutArgSize() const override;
+  size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index cfd9ea6..f74a938 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -32,6 +32,7 @@
 #include "code_generator_x86_64.h"
 #endif
 
+#include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
 #include "base/casts.h"
@@ -503,23 +504,69 @@
 
   if (invoke->IsInvokeStaticOrDirect()) {
     HInvokeStaticOrDirect* call = invoke->AsInvokeStaticOrDirect();
-    switch (call->GetMethodLoadKind()) {
-      case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-        locations->SetInAt(call->GetSpecialInputIndex(), visitor->GetMethodLocation());
-        break;
-      case HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall:
-        locations->AddTemp(visitor->GetMethodLocation());
-        locations->SetInAt(call->GetSpecialInputIndex(), Location::RequiresRegister());
-        break;
-      default:
-        locations->AddTemp(visitor->GetMethodLocation());
-        break;
+    HInvokeStaticOrDirect::MethodLoadKind method_load_kind = call->GetMethodLoadKind();
+    HInvokeStaticOrDirect::CodePtrLocation code_ptr_location = call->GetCodePtrLocation();
+    if (code_ptr_location == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+      locations->AddTemp(Location::RequiresRegister());  // For target method.
+    }
+    if (code_ptr_location == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative ||
+        method_load_kind == HInvokeStaticOrDirect::MethodLoadKind::kRecursive) {
+      // For `kCallCriticalNative` we need the current method as the hidden argument
+      // if we reach the dlsym lookup stub for @CriticalNative.
+      locations->SetInAt(call->GetCurrentMethodIndex(), visitor->GetMethodLocation());
+    } else {
+      locations->AddTemp(visitor->GetMethodLocation());
+      if (method_load_kind == HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall) {
+        locations->SetInAt(call->GetCurrentMethodIndex(), Location::RequiresRegister());
+      }
     }
   } else if (!invoke->IsInvokePolymorphic()) {
     locations->AddTemp(visitor->GetMethodLocation());
   }
 }
 
+void CodeGenerator::PrepareCriticalNativeArgumentMoves(
+    HInvokeStaticOrDirect* invoke,
+    /*inout*/InvokeDexCallingConventionVisitor* visitor,
+    /*out*/HParallelMove* parallel_move) {
+  LocationSummary* locations = invoke->GetLocations();
+  for (size_t i = 0, num = invoke->GetNumberOfArguments(); i != num; ++i) {
+    Location in_location = locations->InAt(i);
+    DataType::Type type = invoke->InputAt(i)->GetType();
+    DCHECK_NE(type, DataType::Type::kReference);
+    Location out_location = visitor->GetNextLocation(type);
+    if (out_location.IsStackSlot() || out_location.IsDoubleStackSlot()) {
+      // Stack arguments will need to be moved after adjusting the SP.
+      parallel_move->AddMove(in_location, out_location, type, /*instruction=*/ nullptr);
+    } else {
+      // Register arguments should have been assigned their final locations for register allocation.
+      DCHECK(out_location.Equals(in_location)) << in_location << " -> " << out_location;
+    }
+  }
+}
+
+void CodeGenerator::AdjustCriticalNativeArgumentMoves(size_t out_frame_size,
+                                                      /*inout*/HParallelMove* parallel_move) {
+  // Adjust the source stack offsets by `out_frame_size`, i.e. the additional
+  // frame size needed for outgoing stack arguments.
+  for (size_t i = 0, num = parallel_move->NumMoves(); i != num; ++i) {
+    MoveOperands* operands = parallel_move->MoveOperandsAt(i);
+    Location source = operands->GetSource();
+    if (operands->GetSource().IsStackSlot()) {
+      operands->SetSource(Location::StackSlot(source.GetStackIndex() + out_frame_size));
+    } else if (operands->GetSource().IsDoubleStackSlot()) {
+      operands->SetSource(Location::DoubleStackSlot(source.GetStackIndex() + out_frame_size));
+    }
+  }
+}
+
+const char* CodeGenerator::GetCriticalNativeShorty(HInvokeStaticOrDirect* invoke,
+                                                   uint32_t* shorty_len) {
+  ScopedObjectAccess soa(Thread::Current());
+  DCHECK(invoke->GetResolvedMethod()->IsCriticalNative());
+  return invoke->GetResolvedMethod()->GetShorty(shorty_len);
+}
+
 void CodeGenerator::GenerateInvokeStaticOrDirectRuntimeCall(
     HInvokeStaticOrDirect* invoke, Location temp, SlowPathCode* slow_path) {
   MoveConstant(temp, invoke->GetDexMethodIndex());
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index ff2be47..4bfc14a 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -570,6 +570,28 @@
   static void CreateCommonInvokeLocationSummary(
       HInvoke* invoke, InvokeDexCallingConventionVisitor* visitor);
 
+  template <typename CriticalNativeCallingConventionVisitor,
+            size_t kNativeStackAlignment,
+            size_t GetCriticalNativeDirectCallFrameSize(const char* shorty, uint32_t shorty_len)>
+  static size_t PrepareCriticalNativeCall(HInvokeStaticOrDirect* invoke,
+                                          /*out*/HParallelMove* parallel_move) {
+      DCHECK(!invoke->GetLocations()->Intrinsified());
+      CriticalNativeCallingConventionVisitor calling_convention_visitor(
+          /*for_register_allocation=*/ false);
+      PrepareCriticalNativeArgumentMoves(invoke, &calling_convention_visitor, parallel_move);
+      size_t out_frame_size =
+          RoundUp(calling_convention_visitor.GetStackOffset(), kNativeStackAlignment);
+      if (kIsDebugBuild) {
+        uint32_t shorty_len;
+        const char* shorty = GetCriticalNativeShorty(invoke, &shorty_len);
+        DCHECK_EQ(GetCriticalNativeDirectCallFrameSize(shorty, shorty_len), out_frame_size);
+      }
+      if (out_frame_size != 0u) {
+        AdjustCriticalNativeArgumentMoves(out_frame_size, parallel_move);
+      }
+      return out_frame_size;
+  }
+
   void GenerateInvokeStaticOrDirectRuntimeCall(
       HInvokeStaticOrDirect* invoke, Location temp, SlowPathCode* slow_path);
 
@@ -799,6 +821,16 @@
                        bool needs_vreg_info = true);
   void EmitVRegInfo(HEnvironment* environment, SlowPathCode* slow_path);
 
+  static void PrepareCriticalNativeArgumentMoves(
+      HInvokeStaticOrDirect* invoke,
+      /*inout*/InvokeDexCallingConventionVisitor* visitor,
+      /*out*/HParallelMove* parallel_move);
+
+  static void AdjustCriticalNativeArgumentMoves(size_t out_frame_size,
+                                                /*inout*/HParallelMove* parallel_move);
+
+  static const char* GetCriticalNativeShorty(HInvokeStaticOrDirect* invoke, uint32_t* shorty_len);
+
   OptimizingCompilerStats* stats_;
 
   HGraph* const graph_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 4a618de..d108623 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -18,6 +18,7 @@
 
 #include "arch/arm64/asm_support_arm64.h"
 #include "arch/arm64/instruction_set_features_arm64.h"
+#include "arch/arm64/jni_frame_arm64.h"
 #include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
@@ -870,6 +871,49 @@
   return LocationFrom(kArtMethodRegister);
 }
 
+Location CriticalNativeCallingConventionVisitorARM64::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  Location location = Location::NoLocation();
+  if (DataType::IsFloatingPointType(type)) {
+    if (fpr_index_ < kParameterFPRegistersLength) {
+      location = LocationFrom(kParameterFPRegisters[fpr_index_]);
+      ++fpr_index_;
+    }
+  } else {
+    // Native ABI uses the same registers as managed, except that the method register x0
+    // is a normal argument.
+    if (gpr_index_ < 1u + kParameterCoreRegistersLength) {
+      location = LocationFrom(gpr_index_ == 0u ? x0 : kParameterCoreRegisters[gpr_index_ - 1u]);
+      ++gpr_index_;
+    }
+  }
+  if (location.IsInvalid()) {
+    if (DataType::Is64BitType(type)) {
+      location = Location::DoubleStackSlot(stack_offset_);
+    } else {
+      location = Location::StackSlot(stack_offset_);
+    }
+    stack_offset_ += kFramePointerSize;
+
+    if (for_register_allocation_) {
+      location = Location::Any();
+    }
+  }
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorARM64::GetReturnLocation(DataType::Type type) const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorARM64 dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorARM64::GetMethodLocation() const {
+  // Pass the method in the hidden argument x15.
+  return Location::RegisterLocation(x15.GetCode());
+}
+
 CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph,
                                        const CompilerOptions& compiler_options,
                                        OptimizingCompilerStats* stats)
@@ -4295,7 +4339,13 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorARM64 calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+  } else {
+    HandleInvoke(invoke);
+  }
 }
 
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorARM64* codegen) {
@@ -4327,7 +4377,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -4373,6 +4423,19 @@
     }
   }
 
+  auto call_code_pointer_member = [&](MemberOffset offset) {
+    // LR = callee_method->member;
+    __ Ldr(lr, MemOperand(XRegisterFrom(callee_method), offset.Int32Value()));
+    {
+      // Use a scope to help guarantee that `RecordPcInfo()` records the correct pc.
+      ExactAssemblyScope eas(GetVIXLAssembler(),
+                             kInstructionSize,
+                             CodeBufferCheckScope::kExactSize);
+      // lr()
+      __ blr(lr);
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+    }
+  };
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       {
@@ -4384,20 +4447,50 @@
         RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       }
       break;
-    case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
-      // LR = callee_method->entry_point_from_quick_compiled_code_;
-      __ Ldr(lr, MemOperand(
-          XRegisterFrom(callee_method),
-          ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize).Int32Value()));
-      {
-        // Use a scope to help guarantee that `RecordPcInfo()` records the correct pc.
-        ExactAssemblyScope eas(GetVIXLAssembler(),
-                               kInstructionSize,
-                               CodeBufferCheckScope::kExactSize);
-        // lr()
-        __ blr(lr);
-        RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorARM64,
+                                    kAapcs64StackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+      if (out_frame_size != 0u) {
+        __ Claim(out_frame_size);
+        GetAssembler()->cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
       }
+      call_code_pointer_member(ArtMethod::EntryPointFromJniOffset(kArm64PointerSize));
+      // Zero-/sign-extend the result when needed due to native and managed ABI mismatch.
+      switch (invoke->GetType()) {
+        case DataType::Type::kBool:
+          __ Ubfx(w0, w0, 0, 8);
+          break;
+        case DataType::Type::kInt8:
+          __ Sbfx(w0, w0, 0, 8);
+          break;
+        case DataType::Type::kUint16:
+          __ Ubfx(w0, w0, 0, 16);
+          break;
+        case DataType::Type::kInt16:
+          __ Sbfx(w0, w0, 0, 16);
+          break;
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kFloat32:
+        case DataType::Type::kFloat64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ Drop(out_frame_size);
+        GetAssembler()->cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
+      call_code_pointer_member(ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize));
       break;
   }
 
@@ -4819,14 +4912,9 @@
     return;
   }
 
-  {
-    // Ensure that between the BLR (emitted by GenerateStaticOrDirectCall) and RecordPcInfo there
-    // are no pools emitted.
-    EmissionCheckScope guard(GetVIXLAssembler(), kInvokeCodeMarginSizeInBytes);
-    LocationSummary* locations = invoke->GetLocations();
-    codegen_->GenerateStaticOrDirectCall(
-        invoke, locations->HasTemps() ? locations->GetTemp(0) : Location::NoLocation());
-  }
+  LocationSummary* locations = invoke->GetLocations();
+  codegen_->GenerateStaticOrDirectCall(
+      invoke, locations->HasTemps() ? locations->GetTemp(0) : Location::NoLocation());
 
   codegen_->MaybeGenerateMarkingRegisterCheck(/* code= */ __LINE__);
 }
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 487d091..bebf43d 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -231,6 +231,31 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorARM64);
 };
 
+class CriticalNativeCallingConventionVisitorARM64 : public InvokeDexCallingConventionVisitor {
+ public:
+  explicit CriticalNativeCallingConventionVisitorARM64(bool for_register_allocation)
+      : for_register_allocation_(for_register_allocation) {}
+
+  virtual ~CriticalNativeCallingConventionVisitorARM64() {}
+
+  Location GetNextLocation(DataType::Type type) override;
+  Location GetReturnLocation(DataType::Type type) const override;
+  Location GetMethodLocation() const override;
+
+  size_t GetStackOffset() const { return stack_offset_; }
+
+ private:
+  // Register allocator does not support adjusting frame size, so we cannot provide final locations
+  // of stack arguments for register allocation. We ask the register allocator for any location and
+  // move these arguments to the right place after adjusting the SP when generating the call.
+  const bool for_register_allocation_;
+  size_t gpr_index_ = 0u;
+  size_t fpr_index_ = 0u;
+  size_t stack_offset_ = 0u;
+
+  DISALLOW_COPY_AND_ASSIGN(CriticalNativeCallingConventionVisitorARM64);
+};
+
 class FieldAccessCallingConventionARM64 : public FieldAccessCallingConvention {
  public:
   FieldAccessCallingConventionARM64() {}
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 1d8fd6c..9916257 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -18,6 +18,7 @@
 
 #include "arch/arm/asm_support_arm.h"
 #include "arch/arm/instruction_set_features_arm.h"
+#include "arch/arm/jni_frame_arm.h"
 #include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
@@ -2435,6 +2436,54 @@
   return LocationFrom(kMethodRegister);
 }
 
+Location CriticalNativeCallingConventionVisitorARMVIXL::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  // Native ABI uses the same registers as managed, except that the method register r0
+  // is a normal argument.
+  Location location = Location::NoLocation();
+  if (DataType::Is64BitType(type)) {
+    gpr_index_ = RoundUp(gpr_index_, 2u);
+    stack_offset_ = RoundUp(stack_offset_, 2 * kFramePointerSize);
+    if (gpr_index_ < 1u + kParameterCoreRegistersLengthVIXL) {
+      location = LocationFrom(gpr_index_ == 0u ? r0 : kParameterCoreRegistersVIXL[gpr_index_ - 1u],
+                              kParameterCoreRegistersVIXL[gpr_index_]);
+      gpr_index_ += 2u;
+    }
+  } else {
+    if (gpr_index_ < 1u + kParameterCoreRegistersLengthVIXL) {
+      location = LocationFrom(gpr_index_ == 0u ? r0 : kParameterCoreRegistersVIXL[gpr_index_ - 1u]);
+      ++gpr_index_;
+    }
+  }
+  if (location.IsInvalid()) {
+    if (DataType::Is64BitType(type)) {
+      location = Location::DoubleStackSlot(stack_offset_);
+      stack_offset_ += 2 * kFramePointerSize;
+    } else {
+      location = Location::StackSlot(stack_offset_);
+      stack_offset_ += kFramePointerSize;
+    }
+
+    if (for_register_allocation_) {
+      location = Location::Any();
+    }
+  }
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorARMVIXL::GetReturnLocation(DataType::Type type)
+    const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorARMVIXL dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorARMVIXL::GetMethodLocation() const {
+  // Pass the method in the hidden argument R4.
+  return Location::RegisterLocation(R4);
+}
+
 void CodeGeneratorARMVIXL::Move32(Location destination, Location source) {
   if (source.Equals(destination)) {
     return;
@@ -3294,7 +3343,13 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorARMVIXL calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+  } else {
+    HandleInvoke(invoke);
+  }
 }
 
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorARMVIXL* codegen) {
@@ -8856,35 +8911,35 @@
 // otherwise return a fall-back info that should be used instead.
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorARMVIXL::GetSupportedInvokeStaticOrDirectDispatch(
     const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
-    ArtMethod* method ATTRIBUTE_UNUSED) {
+    ArtMethod* method) {
+  if (desired_dispatch_info.code_ptr_location ==
+          HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    // TODO: Work around CheckTypeConsistency() in code_generator.cc that does not allow
+    // putting FP values in core registers as we need to do for the soft-float native ABI.
+    ScopedObjectAccess soa(Thread::Current());
+    uint32_t shorty_len;
+    const char* shorty = method->GetShorty(&shorty_len);
+    size_t reg = 0u;
+    for (uint32_t i = 1; i != shorty_len; ++i) {
+      size_t next_reg = reg + 1u;
+      if (shorty[i] == 'D' || shorty[i] == 'J') {
+        reg = RoundUp(reg, 2u);
+        next_reg = reg + 2u;
+      }
+      if (reg == 4u) {
+        break;
+      }
+      if (shorty[i] == 'D' || shorty[i] == 'F') {
+        HInvokeStaticOrDirect::DispatchInfo dispatch_info = desired_dispatch_info;
+        dispatch_info.code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
+        return dispatch_info;
+      }
+      reg = next_reg;
+    }
+  }
   return desired_dispatch_info;
 }
 
-vixl32::Register CodeGeneratorARMVIXL::GetInvokeStaticOrDirectExtraParameter(
-    HInvokeStaticOrDirect* invoke, vixl32::Register temp) {
-  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
-  Location location = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
-  if (!invoke->GetLocations()->Intrinsified()) {
-    return RegisterFrom(location);
-  }
-  // For intrinsics we allow any location, so it may be on the stack.
-  if (!location.IsRegister()) {
-    GetAssembler()->LoadFromOffset(kLoadWord, temp, sp, location.GetStackIndex());
-    return temp;
-  }
-  // For register locations, check if the register was saved. If so, get it from the stack.
-  // Note: There is a chance that the register was saved but not overwritten, so we could
-  // save one load. However, since this is just an intrinsic slow path we prefer this
-  // simple and more robust approach rather that trying to determine if that's the case.
-  SlowPathCode* slow_path = GetCurrentSlowPath();
-  if (slow_path != nullptr && slow_path->IsCoreRegisterSaved(RegisterFrom(location).GetCode())) {
-    int stack_offset = slow_path->GetStackOffsetOfCoreRegister(RegisterFrom(location).GetCode());
-    GetAssembler()->LoadFromOffset(kLoadWord, temp, sp, stack_offset);
-    return temp;
-  }
-  return RegisterFrom(location);
-}
-
 void CodeGeneratorARMVIXL::GenerateStaticOrDirectCall(
     HInvokeStaticOrDirect* invoke, Location temp, SlowPathCode* slow_path) {
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
@@ -8897,7 +8952,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -8932,6 +8987,20 @@
     }
   }
 
+  auto call_code_pointer_member = [&](MemberOffset offset) {
+    // LR = callee_method->member;
+    GetAssembler()->LoadFromOffset(kLoadWord, lr, RegisterFrom(callee_method), offset.Int32Value());
+    {
+      // Use a scope to help guarantee that `RecordPcInfo()` records the correct pc.
+      // blx in T32 has only 16bit encoding that's why a stricter check for the scope is used.
+      ExactAssemblyScope aas(GetVIXLAssembler(),
+                             vixl32::k16BitT32InstructionSizeInBytes,
+                             CodeBufferCheckScope::kExactSize);
+      // LR()
+      __ blx(lr);
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+    }
+  };
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       {
@@ -8943,23 +9012,46 @@
         RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       }
       break;
-    case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
-      // LR = callee_method->entry_point_from_quick_compiled_code_
-      GetAssembler()->LoadFromOffset(
-            kLoadWord,
-            lr,
-            RegisterFrom(callee_method),
-            ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value());
-      {
-        // Use a scope to help guarantee that `RecordPcInfo()` records the correct pc.
-        // blx in T32 has only 16bit encoding that's why a stricter check for the scope is used.
-        ExactAssemblyScope aas(GetVIXLAssembler(),
-                               vixl32::k16BitT32InstructionSizeInBytes,
-                               CodeBufferCheckScope::kExactSize);
-        // LR()
-        __ blx(lr);
-        RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorARMVIXL,
+                                    kAapcsStackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+      if (out_frame_size != 0u) {
+        __ Claim(out_frame_size);
+        GetAssembler()->cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
       }
+      call_code_pointer_member(ArtMethod::EntryPointFromJniOffset(kArmPointerSize));
+      // Move the result when needed due to native and managed ABI mismatch.
+      switch (invoke->GetType()) {
+        case DataType::Type::kFloat32:
+          __ Vmov(s0, r0);
+          break;
+        case DataType::Type::kFloat64:
+          __ Vmov(d0, r0, r1);
+          break;
+        case DataType::Type::kBool:
+        case DataType::Type::kInt8:
+        case DataType::Type::kUint16:
+        case DataType::Type::kInt16:
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ Drop(out_frame_size);
+        GetAssembler()->cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
+      call_code_pointer_member(ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize));
       break;
   }
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 3eed730..d6300c7 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -187,6 +187,30 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorARMVIXL);
 };
 
+class CriticalNativeCallingConventionVisitorARMVIXL : public InvokeDexCallingConventionVisitor {
+ public:
+  explicit CriticalNativeCallingConventionVisitorARMVIXL(bool for_register_allocation)
+      : for_register_allocation_(for_register_allocation) {}
+
+  virtual ~CriticalNativeCallingConventionVisitorARMVIXL() {}
+
+  Location GetNextLocation(DataType::Type type) override;
+  Location GetReturnLocation(DataType::Type type) const override;
+  Location GetMethodLocation() const override;
+
+  size_t GetStackOffset() const { return stack_offset_; }
+
+ private:
+  // Register allocator does not support adjusting frame size, so we cannot provide final locations
+  // of stack arguments for register allocation. We ask the register allocator for any location and
+  // move these arguments to the right place after adjusting the SP when generating the call.
+  const bool for_register_allocation_;
+  size_t gpr_index_ = 0u;
+  size_t stack_offset_ = 0u;
+
+  DISALLOW_COPY_AND_ASSIGN(CriticalNativeCallingConventionVisitorARMVIXL);
+};
+
 class FieldAccessCallingConventionARMVIXL : public FieldAccessCallingConvention {
  public:
   FieldAccessCallingConventionARMVIXL() {}
@@ -853,9 +877,6 @@
                                     uint32_t encoded_data,
                                     /*out*/ std::string* debug_name);
 
-  vixl::aarch32::Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
-                                                                vixl::aarch32::Register temp);
-
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, VIXLUInt32Literal*>;
   using StringToLiteralMap = ArenaSafeMap<StringReference,
                                           VIXLUInt32Literal*,
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index e9ef21a..595b31e 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_x86.h"
 
+#include "arch/x86/jni_frame_x86.h"
 #include "art_method-inl.h"
 #include "class_table.h"
 #include "code_generator_utils.h"
@@ -1300,6 +1301,34 @@
   return Location::NoLocation();
 }
 
+Location CriticalNativeCallingConventionVisitorX86::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  Location location;
+  if (DataType::Is64BitType(type)) {
+    location = Location::DoubleStackSlot(stack_offset_);
+    stack_offset_ += 2 * kFramePointerSize;
+  } else {
+    location = Location::StackSlot(stack_offset_);
+    stack_offset_ += kFramePointerSize;
+  }
+  if (for_register_allocation_) {
+    location = Location::Any();
+  }
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorX86::GetReturnLocation(DataType::Type type) const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorX86 dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorX86::GetMethodLocation() const {
+  // Pass the method in the hidden argument EAX.
+  return Location::RegisterLocation(EAX);
+}
+
 void CodeGeneratorX86::Move32(Location destination, Location source) {
   if (source.Equals(destination)) {
     return;
@@ -1374,11 +1403,13 @@
       size_t elem_size = DataType::Size(DataType::Type::kInt32);
       // Create stack space for 2 elements.
       __ subl(ESP, Immediate(2 * elem_size));
+      __ cfi().AdjustCFAOffset(2 * elem_size);
       __ movl(Address(ESP, 0), source.AsRegisterPairLow<Register>());
       __ movl(Address(ESP, elem_size), source.AsRegisterPairHigh<Register>());
       __ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
       // And remove the temporary stack space we allocated.
       __ addl(ESP, Immediate(2 * elem_size));
+      __ cfi().AdjustCFAOffset(-(2 * elem_size));
     } else {
       LOG(FATAL) << "Unimplemented";
     }
@@ -2286,9 +2317,15 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorX86 calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+  } else {
+    HandleInvoke(invoke);
+  }
 
-  // For PC-relative dex cache the invoke has an extra input, the PC-relative address base.
+  // For PC-relative load kinds the invoke has an extra input, the PC-relative address base.
   if (invoke->HasPcRelativeMethodLoadKind()) {
     invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::RequiresRegister());
   }
@@ -2989,6 +3026,7 @@
           if (!in.IsDoubleStackSlot() || !out.IsStackSlot()) {
             adjustment = DataType::Size(DataType::Type::kInt64);
             __ subl(ESP, Immediate(adjustment));
+            __ cfi().AdjustCFAOffset(adjustment);
           }
 
           // Load the value to the FP stack, using temporaries if needed.
@@ -3005,6 +3043,7 @@
           // Remove the temporary stack space we allocated.
           if (adjustment != 0) {
             __ addl(ESP, Immediate(adjustment));
+            __ cfi().AdjustCFAOffset(-adjustment);
           }
           break;
         }
@@ -3039,6 +3078,7 @@
           if (!in.IsDoubleStackSlot() || !out.IsDoubleStackSlot()) {
             adjustment = DataType::Size(DataType::Type::kInt64);
             __ subl(ESP, Immediate(adjustment));
+            __ cfi().AdjustCFAOffset(adjustment);
           }
 
           // Load the value to the FP stack, using temporaries if needed.
@@ -3055,6 +3095,7 @@
           // Remove the temporary stack space we allocated.
           if (adjustment != 0) {
             __ addl(ESP, Immediate(adjustment));
+            __ cfi().AdjustCFAOffset(-adjustment);
           }
           break;
         }
@@ -3551,6 +3592,7 @@
   // Create stack space for 2 elements.
   // TODO: enhance register allocator to ask for stack temporaries.
   __ subl(ESP, Immediate(2 * elem_size));
+  __ cfi().AdjustCFAOffset(2 * elem_size);
 
   // Load the values to the FP stack in reverse order, using temporaries if needed.
   const bool is_wide = !is_float;
@@ -3591,6 +3633,7 @@
 
   // And remove the temporary stack space we allocated.
   __ addl(ESP, Immediate(2 * elem_size));
+  __ cfi().AdjustCFAOffset(-(2 * elem_size));
 }
 
 
@@ -4934,7 +4977,6 @@
 
 Register CodeGeneratorX86::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
                                                                  Register temp) {
-  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   Location location = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
   if (!invoke->GetLocations()->Intrinsified()) {
     return location.AsRegister<Register>();
@@ -4970,7 +5012,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -5009,15 +5051,73 @@
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       __ call(GetFrameEntryLabel());
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorX86,
+                                    kNativeStackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+      if (out_frame_size != 0u) {
+        __ subl(ESP, Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
+      }
+      // (callee_method + offset_of_jni_entry_point)()
+      __ call(Address(callee_method.AsRegister<Register>(),
+                      ArtMethod::EntryPointFromJniOffset(kX86PointerSize).Int32Value()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+      if (out_frame_size == 0u && DataType::IsFloatingPointType(invoke->GetType())) {
+        // Create space for conversion.
+        out_frame_size = 8u;
+        __ subl(ESP, Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(out_frame_size);
+      }
+      // Zero-/sign-extend or move the result when needed due to native and managed ABI mismatch.
+      switch (invoke->GetType()) {
+        case DataType::Type::kBool:
+          __ movzxb(EAX, AL);
+          break;
+        case DataType::Type::kInt8:
+          __ movsxb(EAX, AL);
+          break;
+        case DataType::Type::kUint16:
+          __ movzxw(EAX, EAX);
+          break;
+        case DataType::Type::kInt16:
+          __ movsxw(EAX, EAX);
+          break;
+        case DataType::Type::kFloat32:
+          __ fstps(Address(ESP, 0));
+          __ movss(XMM0, Address(ESP, 0));
+          break;
+        case DataType::Type::kFloat64:
+          __ fstpl(Address(ESP, 0));
+          __ movsd(XMM0, Address(ESP, 0));
+          break;
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ addl(ESP, Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
     case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
       // (callee_method + offset_of_quick_compiled_code)()
       __ call(Address(callee_method.AsRegister<Register>(),
                       ArtMethod::EntryPointFromQuickCompiledCodeOffset(
                           kX86PointerSize).Int32Value()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
   }
-  RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
 
   DCHECK(!IsLeafMethod());
 }
@@ -5072,7 +5172,6 @@
 }
 
 void CodeGeneratorX86::RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke) {
-  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   HX86ComputeBaseMethodAddress* method_address =
       invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
   boot_image_method_patches_.emplace_back(
@@ -5081,7 +5180,6 @@
 }
 
 void CodeGeneratorX86::RecordMethodBssEntryPatch(HInvokeStaticOrDirect* invoke) {
-  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   HX86ComputeBaseMethodAddress* method_address =
       invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
   // Add the patch entry and bind its label at the end of the instruction.
@@ -5126,7 +5224,6 @@
                                             uint32_t boot_image_reference,
                                             HInvokeStaticOrDirect* invoke) {
   if (GetCompilerOptions().IsBootImage()) {
-    DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
     HX86ComputeBaseMethodAddress* method_address =
         invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
     DCHECK(method_address != nullptr);
@@ -5135,7 +5232,6 @@
     __ leal(reg, Address(method_address_reg, CodeGeneratorX86::kDummy32BitOffset));
     RecordBootImageIntrinsicPatch(method_address, boot_image_reference);
   } else if (GetCompilerOptions().GetCompilePic()) {
-    DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
     HX86ComputeBaseMethodAddress* method_address =
         invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
     DCHECK(method_address != nullptr);
@@ -5160,7 +5256,6 @@
   if (GetCompilerOptions().IsBootImage()) {
     DCHECK_EQ(boot_image_offset, IntrinsicVisitor::IntegerValueOfInfo::kInvalidReference);
     // Load the class the same way as for HLoadClass::LoadKind::kBootImageLinkTimePcRelative.
-    DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
     HX86ComputeBaseMethodAddress* method_address =
         invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
     DCHECK(method_address != nullptr);
@@ -6365,24 +6460,45 @@
       __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegister<Register>());
     }
   } else if (source.IsRegisterPair()) {
+    if (destination.IsRegisterPair()) {
+      __ movl(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairLow<Register>());
+      DCHECK_NE(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairHigh<Register>());
+      __ movl(destination.AsRegisterPairHigh<Register>(), source.AsRegisterPairHigh<Register>());
+    } else if (destination.IsFpuRegister()) {
       size_t elem_size = DataType::Size(DataType::Type::kInt32);
-      // Create stack space for 2 elements.
-      __ subl(ESP, Immediate(2 * elem_size));
-      __ movl(Address(ESP, 0), source.AsRegisterPairLow<Register>());
-      __ movl(Address(ESP, elem_size), source.AsRegisterPairHigh<Register>());
+      // Push the 2 source registers to stack.
+      __ pushl(source.AsRegisterPairHigh<Register>());
+      __ cfi().AdjustCFAOffset(elem_size);
+      __ pushl(source.AsRegisterPairLow<Register>());
+      __ cfi().AdjustCFAOffset(elem_size);
+      // Load the destination register.
       __ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
       // And remove the temporary stack space we allocated.
       __ addl(ESP, Immediate(2 * elem_size));
+      __ cfi().AdjustCFAOffset(-(2 * elem_size));
+    } else {
+      DCHECK(destination.IsDoubleStackSlot());
+      __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegisterPairLow<Register>());
+      __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)),
+              source.AsRegisterPairHigh<Register>());
+    }
   } else if (source.IsFpuRegister()) {
     if (destination.IsRegister()) {
       __ movd(destination.AsRegister<Register>(), source.AsFpuRegister<XmmRegister>());
     } else if (destination.IsFpuRegister()) {
       __ movaps(destination.AsFpuRegister<XmmRegister>(), source.AsFpuRegister<XmmRegister>());
     } else if (destination.IsRegisterPair()) {
-      XmmRegister src_reg = source.AsFpuRegister<XmmRegister>();
-      __ movd(destination.AsRegisterPairLow<Register>(), src_reg);
-      __ psrlq(src_reg, Immediate(32));
-      __ movd(destination.AsRegisterPairHigh<Register>(), src_reg);
+      size_t elem_size = DataType::Size(DataType::Type::kInt32);
+      // Create stack space for 2 elements.
+      __ subl(ESP, Immediate(2 * elem_size));
+      __ cfi().AdjustCFAOffset(2 * elem_size);
+      // Store the source register.
+      __ movsd(Address(ESP, 0), source.AsFpuRegister<XmmRegister>());
+      // And pop the values into destination registers.
+      __ popl(destination.AsRegisterPairLow<Register>());
+      __ cfi().AdjustCFAOffset(-elem_size);
+      __ popl(destination.AsRegisterPairHigh<Register>());
+      __ cfi().AdjustCFAOffset(-elem_size);
     } else if (destination.IsStackSlot()) {
       __ movss(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
     } else if (destination.IsDoubleStackSlot()) {
@@ -6480,9 +6596,12 @@
           __ xorpd(dest, dest);
         } else {
           __ pushl(high);
+          __ cfi().AdjustCFAOffset(4);
           __ pushl(low);
+          __ cfi().AdjustCFAOffset(4);
           __ movsd(dest, Address(ESP, 0));
           __ addl(ESP, Immediate(8));
+          __ cfi().AdjustCFAOffset(-8);
         }
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
@@ -6520,10 +6639,12 @@
 void ParallelMoveResolverX86::Exchange128(XmmRegister reg, int mem) {
   size_t extra_slot = 4 * kX86WordSize;
   __ subl(ESP, Immediate(extra_slot));
+  __ cfi().AdjustCFAOffset(extra_slot);
   __ movups(Address(ESP, 0), XmmRegister(reg));
   ExchangeMemory(0, mem + extra_slot, 4);
   __ movups(XmmRegister(reg), Address(ESP, 0));
   __ addl(ESP, Immediate(extra_slot));
+  __ cfi().AdjustCFAOffset(-extra_slot);
 }
 
 void ParallelMoveResolverX86::ExchangeMemory(int mem1, int mem2, int number_of_words) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 43f5acd..22d8778 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -93,6 +93,29 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorX86);
 };
 
+class CriticalNativeCallingConventionVisitorX86 : public InvokeDexCallingConventionVisitor {
+ public:
+  explicit CriticalNativeCallingConventionVisitorX86(bool for_register_allocation)
+      : for_register_allocation_(for_register_allocation) {}
+
+  virtual ~CriticalNativeCallingConventionVisitorX86() {}
+
+  Location GetNextLocation(DataType::Type type) override;
+  Location GetReturnLocation(DataType::Type type) const override;
+  Location GetMethodLocation() const override;
+
+  size_t GetStackOffset() const { return stack_offset_; }
+
+ private:
+  // Register allocator does not support adjusting frame size, so we cannot provide final locations
+  // of stack arguments for register allocation. We ask the register allocator for any location and
+  // move these arguments to the right place after adjusting the SP when generating the call.
+  const bool for_register_allocation_;
+  size_t stack_offset_ = 0u;
+
+  DISALLOW_COPY_AND_ASSIGN(CriticalNativeCallingConventionVisitorX86);
+};
+
 class FieldAccessCallingConventionX86 : public FieldAccessCallingConvention {
  public:
   FieldAccessCallingConventionX86() {}
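
The for_register_allocation_ flag above drives a two-pass scheme: during register allocation the visitor hands out Location::Any() for arguments that will land on the native stack (the allocator cannot grow the frame), and when the call is generated the visitor is re-run to obtain the concrete out-arg slots, which a parallel move fills after SP has been adjusted. A minimal sketch of that idea with stand-in types (not ART's Location/DataType; the register count is illustrative, and FP registers and 64-bit types are ignored):

  // Two-pass location assignment sketch (stand-in types, not ART code).
  #include <cstddef>
  #include <cstdio>

  enum class Loc { kRegister, kStackSlot, kAny };

  struct Location {
    Loc kind;
    size_t index;  // Register number or stack offset; unused for kAny.
  };

  Location NextLocation(bool for_register_allocation, size_t* gpr_index, size_t* stack_offset) {
    constexpr size_t kArgRegisters = 6u;  // Illustrative count only.
    constexpr size_t kSlotSize = 8u;
    if (*gpr_index < kArgRegisters) {
      return Location{Loc::kRegister, (*gpr_index)++};
    }
    size_t offset = *stack_offset;
    *stack_offset += kSlotSize;
    // Pass 1 (register allocation): report "any"; the allocator cannot grow the frame.
    // Pass 2 (code generation): report the concrete slot in the out-args area that is
    // created by adjusting SP just before the call; a parallel move fills it.
    return for_register_allocation ? Location{Loc::kAny, 0u} : Location{Loc::kStackSlot, offset};
  }

  int main() {
    const bool passes[] = {true, false};
    for (bool ra_pass : passes) {
      size_t gpr = 0u, stack = 0u;
      std::printf("%s:\n", ra_pass ? "register allocation pass" : "code generation pass");
      for (int i = 0; i < 8; ++i) {
        Location loc = NextLocation(ra_pass, &gpr, &stack);
        const char* what = (loc.kind == Loc::kRegister) ? "register"
                         : (loc.kind == Loc::kStackSlot) ? "out-arg slot at" : "any location";
        std::printf("  arg%d -> %s %zu\n", i, what, loc.index);
      }
    }
    return 0;
  }
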
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ec54376..4a0cc78 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_x86_64.h"
 
+#include "arch/x86_64/jni_frame_x86_64.h"
 #include "art_method-inl.h"
 #include "class_table.h"
 #include "code_generator_utils.h"
@@ -978,6 +979,16 @@
   UNREACHABLE();
 }
 
+void CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(LocationSummary* locations) {
+  // We have to ensure that the native code we call directly (such as @CriticalNative
+  // or some intrinsic helpers, say Math.sin()) doesn't clobber the XMM registers
+  // which are non-volatile for ART, but volatile for native calls. This will ensure
+  // that they are saved in the prologue and properly restored.
+  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
+    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
+  }
+}
+
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorX86_64::GetSupportedInvokeStaticOrDirectDispatch(
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       ArtMethod* method ATTRIBUTE_UNUSED) {
@@ -998,7 +1009,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative:
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -1032,15 +1043,61 @@
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       __ call(&frame_entry_label_);
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorX86_64,
+                                    kNativeStackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+      if (out_frame_size != 0u) {
+        __ subq(CpuRegister(RSP), Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
+      }
+      // (callee_method + offset_of_jni_entry_point)()
+      __ call(Address(callee_method.AsRegister<CpuRegister>(),
+                      ArtMethod::EntryPointFromJniOffset(kX86_64PointerSize).SizeValue()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+      // Zero-/sign-extend the result when needed due to native and managed ABI mismatch.
+      switch (invoke->GetType()) {
+        case DataType::Type::kBool:
+          __ movzxb(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt8:
+          __ movsxb(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kUint16:
+          __ movzxw(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt16:
+          __ movsxw(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kFloat32:
+        case DataType::Type::kFloat64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ addq(CpuRegister(RSP), Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
     case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
       // (callee_method + offset_of_quick_compiled_code)()
       __ call(Address(callee_method.AsRegister<CpuRegister>(),
                       ArtMethod::EntryPointFromQuickCompiledCodeOffset(
                           kX86_64PointerSize).SizeValue()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
   }
-  RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
 
   DCHECK(!IsLeafMethod());
 }
@@ -2493,6 +2550,51 @@
   return Location::NoLocation();
 }
 
+Location CriticalNativeCallingConventionVisitorX86_64::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  Location location = Location::NoLocation();
+  if (DataType::IsFloatingPointType(type)) {
+    if (fpr_index_ < kParameterFloatRegistersLength) {
+      location = Location::FpuRegisterLocation(kParameterFloatRegisters[fpr_index_]);
+      ++fpr_index_;
+    }
+  } else {
+    // Native ABI uses the same registers as managed, except that the method register RDI
+    // is a normal argument.
+    if (gpr_index_ < 1u + kParameterCoreRegistersLength) {
+      location = Location::RegisterLocation(
+          gpr_index_ == 0u ? RDI : kParameterCoreRegisters[gpr_index_ - 1u]);
+      ++gpr_index_;
+    }
+  }
+  if (location.IsInvalid()) {
+    if (DataType::Is64BitType(type)) {
+      location = Location::DoubleStackSlot(stack_offset_);
+    } else {
+      location = Location::StackSlot(stack_offset_);
+    }
+    stack_offset_ += kFramePointerSize;
+
+    if (for_register_allocation_) {
+      location = Location::Any();
+    }
+  }
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorX86_64::GetReturnLocation(DataType::Type type)
+    const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorX86_64 dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorX86_64::GetMethodLocation() const {
+  // Pass the method in the hidden argument RAX.
+  return Location::RegisterLocation(RAX);
+}
+
 void LocationsBuilderX86_64::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
   // The trampoline uses the same calling convention as dex calling conventions,
   // except instead of loading arg0/r0 with the target Method*, arg0/r0 will contain
@@ -2514,7 +2616,14 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorX86_64 calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+    CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(invoke->GetLocations());
+  } else {
+    HandleInvoke(invoke);
+  }
 }
 
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
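
GetNextLocation() above encodes how the native x86-64 ABI differs from the managed one: RDI, which carries the ArtMethod* in the managed ABI, becomes an ordinary first integer argument, followed by the managed parameter registers RSI, RDX, RCX, R8 and R9; FP arguments take XMM0-XMM7; anything left over gets an 8-byte slot in the out-args area. A rough sketch of the resulting assignment for a hypothetical @CriticalNative argument list (plain strings, not the real Location machinery):

  // Illustration of the argument order used by CriticalNativeCallingConventionVisitorX86_64
  // above; simplified stand-in code, not ART's Location machinery.
  #include <cstddef>
  #include <cstdio>

  int main() {
    const char* gprs[] = {"RDI", "RSI", "RDX", "RCX", "R8", "R9"};  // Native integer args.
    const char* fprs[] = {"XMM0", "XMM1", "XMM2", "XMM3", "XMM4", "XMM5", "XMM6", "XMM7"};
    // Hypothetical @CriticalNative argument list: long, int, double, then six more longs.
    const char args[] = "JIDJJJJJJ";
    size_t gpr = 0u, fpr = 0u, stack_offset = 0u;
    for (const char* p = args; *p != '\0'; ++p) {
      bool is_fp = (*p == 'F' || *p == 'D');
      if (is_fp && fpr < 8u) {
        std::printf("%c -> %s\n", *p, fprs[fpr++]);
      } else if (!is_fp && gpr < 6u) {
        std::printf("%c -> %s\n", *p, gprs[gpr++]);
      } else {
        // One 8-byte slot per stack argument, like stack_offset_ += kFramePointerSize.
        std::printf("%c -> [RSP + %zu] after the out-frame adjustment\n", *p, stack_offset);
        stack_offset += 8u;
      }
    }
    return 0;
  }

Running it places the first six integer-like arguments in registers, the double in XMM0, and the last two longs at [RSP + 0] and [RSP + 8] of the adjusted frame.
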
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 01810f4..dcdd632 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -79,6 +79,31 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
 };
 
+class CriticalNativeCallingConventionVisitorX86_64 : public InvokeDexCallingConventionVisitor {
+ public:
+  explicit CriticalNativeCallingConventionVisitorX86_64(bool for_register_allocation)
+      : for_register_allocation_(for_register_allocation) {}
+
+  virtual ~CriticalNativeCallingConventionVisitorX86_64() {}
+
+  Location GetNextLocation(DataType::Type type) override;
+  Location GetReturnLocation(DataType::Type type) const override;
+  Location GetMethodLocation() const override;
+
+  size_t GetStackOffset() const { return stack_offset_; }
+
+ private:
+  // Register allocator does not support adjusting frame size, so we cannot provide final locations
+  // of stack arguments for register allocation. We ask the register allocator for any location and
+  // move these arguments to the right place after adjusting the SP when generating the call.
+  const bool for_register_allocation_;
+  size_t gpr_index_ = 0u;
+  size_t fpr_index_ = 0u;
+  size_t stack_offset_ = 0u;
+
+  DISALLOW_COPY_AND_ASSIGN(CriticalNativeCallingConventionVisitorX86_64);
+};
+
 class FieldAccessCallingConventionX86_64 : public FieldAccessCallingConvention {
  public:
   FieldAccessCallingConventionX86_64() {}
@@ -609,6 +634,8 @@
 
   void MaybeIncrementHotness(bool is_frame_entry);
 
+  static void BlockNonVolatileXmmRegisters(LocationSummary* locations);
+
   // When we don't know the proper offset for the value, we use kDummy32BitOffset.
   // We will fix this up in the linker later to have the right value.
   static constexpr int32_t kDummy32BitOffset = 256;
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index cd68b2a..60e1279 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -1530,8 +1530,8 @@
 
   if (invoke->IsInvokeStaticOrDirect() &&
       HInvokeStaticOrDirect::NeedsCurrentMethodInput(
-          invoke->AsInvokeStaticOrDirect()->GetMethodLoadKind())) {
-    DCHECK_EQ(argument_index, invoke->AsInvokeStaticOrDirect()->GetSpecialInputIndex());
+          invoke->AsInvokeStaticOrDirect()->GetDispatchInfo())) {
+    DCHECK_EQ(argument_index, invoke->AsInvokeStaticOrDirect()->GetCurrentMethodIndex());
     DCHECK(invoke->InputAt(argument_index) == nullptr);
     invoke->SetRawInputAt(argument_index, graph_->GetCurrentMethod());
   }
diff --git a/compiler/optimizing/intrinsics_utils.h b/compiler/optimizing/intrinsics_utils.h
index e24d541..29f815c 100644
--- a/compiler/optimizing/intrinsics_utils.h
+++ b/compiler/optimizing/intrinsics_utils.h
@@ -59,7 +59,12 @@
     Location method_loc = MoveArguments(codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
-      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), method_loc, this);
+      HInvokeStaticOrDirect* invoke_static_or_direct = invoke_->AsInvokeStaticOrDirect();
+      DCHECK_NE(invoke_static_or_direct->GetMethodLoadKind(),
+                HInvokeStaticOrDirect::MethodLoadKind::kRecursive);
+      DCHECK_NE(invoke_static_or_direct->GetCodePtrLocation(),
+                HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative);
+      codegen->GenerateStaticOrDirectCall(invoke_static_or_direct, method_loc, this);
     } else {
       codegen->GenerateVirtualCall(invoke_->AsInvokeVirtual(), method_loc, this);
     }
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 7a0f131..af3fd76 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -398,12 +398,7 @@
   locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
   locations->SetOut(Location::FpuRegisterLocation(XMM0));
 
-  // We have to ensure that the native code doesn't clobber the XMM registers which are
-  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
-  // saved in the prologue and properly restored.
-  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
-    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
-  }
+  CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(locations);
 }
 
 static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
@@ -535,12 +530,7 @@
   locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
   locations->SetOut(Location::FpuRegisterLocation(XMM0));
 
-  // We have to ensure that the native code doesn't clobber the XMM registers which are
-  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
-  // saved in the prologue and properly restored.
-  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
-    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
-  }
+  CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(locations);
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index e562b87..0eece84 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -4604,6 +4604,11 @@
     // Recursive call, use local PC-relative call instruction.
     kCallSelf,
 
+    // Use native pointer from the ArtMethod*.
+    // Used for @CriticalNative to avoid going through the compiled JNI stub. This call goes through
+    // a special resolution stub if the class is not initialized or no native code is registered.
+    kCallCriticalNative,
+
     // Use code pointer from the ArtMethod*.
     // Used when we don't know the target code. This is also the last-resort-kind used when
     // other kinds are unimplemented or impractical (i.e. slow) on a particular architecture.
@@ -4633,9 +4638,9 @@
       : HInvoke(kInvokeStaticOrDirect,
                 allocator,
                 number_of_arguments,
-                // There is potentially one extra argument for the HCurrentMethod node, and
-                // potentially one other if the clinit check is explicit.
-                (NeedsCurrentMethodInput(dispatch_info.method_load_kind) ? 1u : 0u) +
+                // There is potentially one extra argument for the HCurrentMethod input,
+                // and one other if the clinit check is explicit. These can be removed later.
+                (NeedsCurrentMethodInput(dispatch_info) ? 1u : 0u) +
                     (clinit_check_requirement == ClinitCheckRequirement::kExplicit ? 1u : 0u),
                 return_type,
                 dex_pc,
@@ -4649,17 +4654,17 @@
 
   bool IsClonable() const override { return true; }
 
-  void SetDispatchInfo(const DispatchInfo& dispatch_info) {
+  void SetDispatchInfo(DispatchInfo dispatch_info) {
     bool had_current_method_input = HasCurrentMethodInput();
-    bool needs_current_method_input = NeedsCurrentMethodInput(dispatch_info.method_load_kind);
+    bool needs_current_method_input = NeedsCurrentMethodInput(dispatch_info);
 
     // Using the current method is the default and once we find a better
     // method load kind, we should not go back to using the current method.
     DCHECK(had_current_method_input || !needs_current_method_input);
 
     if (had_current_method_input && !needs_current_method_input) {
-      DCHECK_EQ(InputAt(GetSpecialInputIndex()), GetBlock()->GetGraph()->GetCurrentMethod());
-      RemoveInputAt(GetSpecialInputIndex());
+      DCHECK_EQ(InputAt(GetCurrentMethodIndex()), GetBlock()->GetGraph()->GetCurrentMethod());
+      RemoveInputAt(GetCurrentMethodIndex());
     }
     dispatch_info_ = dispatch_info;
   }
@@ -4668,14 +4673,6 @@
     return dispatch_info_;
   }
 
-  void AddSpecialInput(HInstruction* input) {
-    // We allow only one special input.
-    DCHECK(!IsStringInit() && !HasCurrentMethodInput());
-    DCHECK(InputCount() == GetSpecialInputIndex() ||
-           (InputCount() == GetSpecialInputIndex() + 1 && IsStaticWithExplicitClinitCheck()));
-    InsertInputAt(GetSpecialInputIndex(), input);
-  }
-
   using HInstruction::GetInputRecords;  // Keep the const version visible.
   ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() override {
     ArrayRef<HUserRecord<HInstruction*>> input_records = HInvoke::GetInputRecords();
@@ -4696,7 +4693,7 @@
   }
 
   bool CanDoImplicitNullCheckOn(HInstruction* obj ATTRIBUTE_UNUSED) const override {
-    // We access the method via the dex cache so we can't do an implicit null check.
+    // We do not access the method via an object reference, so we cannot do an implicit null check.
     // TODO: for intrinsics we can generate implicit null checks.
     return false;
   }
@@ -4705,14 +4702,6 @@
     return GetType() == DataType::Type::kReference && !IsStringInit();
   }
 
-  // Get the index of the special input, if any.
-  //
-  // If the invoke HasCurrentMethodInput(), the "special input" is the current
-  // method pointer; otherwise there may be one platform-specific special input,
-  // such as PC-relative addressing base.
-  uint32_t GetSpecialInputIndex() const { return GetNumberOfArguments(); }
-  bool HasSpecialInput() const { return GetNumberOfArguments() != InputCount(); }
-
   MethodLoadKind GetMethodLoadKind() const { return dispatch_info_.method_load_kind; }
   CodePtrLocation GetCodePtrLocation() const { return dispatch_info_.code_ptr_location; }
   bool IsRecursive() const { return GetMethodLoadKind() == MethodLoadKind::kRecursive; }
@@ -4724,17 +4713,6 @@
            GetMethodLoadKind() == MethodLoadKind::kBootImageRelRo ||
            GetMethodLoadKind() == MethodLoadKind::kBssEntry;
   }
-  bool HasCurrentMethodInput() const {
-    // This function can be called only after the invoke has been fully initialized by the builder.
-    if (NeedsCurrentMethodInput(GetMethodLoadKind())) {
-      DCHECK(InputAt(GetSpecialInputIndex())->IsCurrentMethod());
-      return true;
-    } else {
-      DCHECK(InputCount() == GetSpecialInputIndex() ||
-             !InputAt(GetSpecialInputIndex())->IsCurrentMethod());
-      return false;
-    }
-  }
 
   QuickEntrypointEnum GetStringInitEntryPoint() const {
     DCHECK(IsStringInit());
@@ -4761,6 +4739,60 @@
     return target_method_;
   }
 
+  // Does this dispatch info need the current method as an input?
+  static bool NeedsCurrentMethodInput(DispatchInfo dispatch_info) {
+    return dispatch_info.method_load_kind == MethodLoadKind::kRecursive ||
+           dispatch_info.method_load_kind == MethodLoadKind::kRuntimeCall ||
+           dispatch_info.code_ptr_location == CodePtrLocation::kCallCriticalNative;
+  }
+
+  // Get the index of the current method input.
+  size_t GetCurrentMethodIndex() const {
+    DCHECK(HasCurrentMethodInput());
+    return GetCurrentMethodIndexUnchecked();
+  }
+  size_t GetCurrentMethodIndexUnchecked() const {
+    return GetNumberOfArguments();
+  }
+
+  // Check if the invoke has a current method input.
+  bool HasCurrentMethodInput() const {
+    if (NeedsCurrentMethodInput(GetDispatchInfo())) {
+      DCHECK(InputAt(GetCurrentMethodIndexUnchecked()) == nullptr ||  // During argument setup.
+             InputAt(GetCurrentMethodIndexUnchecked())->IsCurrentMethod());
+      return true;
+    } else {
+      DCHECK(InputCount() == GetCurrentMethodIndexUnchecked() ||
+             InputAt(GetCurrentMethodIndexUnchecked()) == nullptr ||  // During argument setup.
+             !InputAt(GetCurrentMethodIndexUnchecked())->IsCurrentMethod());
+      return false;
+    }
+  }
+
+  // Get the index of the special input.
+  size_t GetSpecialInputIndex() const {
+    DCHECK(HasSpecialInput());
+    return GetSpecialInputIndexUnchecked();
+  }
+  size_t GetSpecialInputIndexUnchecked() const {
+    return GetNumberOfArguments() + (HasCurrentMethodInput() ? 1u : 0u);
+  }
+
+  // Check if the invoke has a special input.
+  bool HasSpecialInput() const {
+    size_t other_inputs =
+        GetSpecialInputIndexUnchecked() + (IsStaticWithExplicitClinitCheck() ? 1u : 0u);
+    size_t input_count = InputCount();
+    DCHECK_LE(input_count - other_inputs, 1u) << other_inputs << " " << input_count;
+    return other_inputs != input_count;
+  }
+
+  void AddSpecialInput(HInstruction* input) {
+    // We allow only one special input.
+    DCHECK(!HasSpecialInput());
+    InsertInputAt(GetSpecialInputIndexUnchecked(), input);
+  }
+
   // Remove the HClinitCheck or the replacement HLoadClass (set as last input by
   // PrepareForRegisterAllocation::VisitClinitCheck() in lieu of the initial HClinitCheck)
   // instruction; only relevant for static calls with explicit clinit check.
@@ -4788,11 +4820,6 @@
     return IsStatic() && (GetClinitCheckRequirement() == ClinitCheckRequirement::kImplicit);
   }
 
-  // Does this method load kind need the current method as an input?
-  static bool NeedsCurrentMethodInput(MethodLoadKind kind) {
-    return kind == MethodLoadKind::kRecursive || kind == MethodLoadKind::kRuntimeCall;
-  }
-
   DECLARE_INSTRUCTION(InvokeStaticOrDirect);
 
  protected:
@@ -4815,6 +4842,7 @@
   DispatchInfo dispatch_info_;
 };
 std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::MethodLoadKind rhs);
+std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::CodePtrLocation rhs);
 std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::ClinitCheckRequirement rhs);
 
 class HInvokeVirtual final : public HInvoke {
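
The reworked helpers above pin down the input order of HInvokeStaticOrDirect: the regular arguments first, then the optional HCurrentMethod input (now also required for kCallCriticalNative), then at most one platform-specific special input such as the PC-relative base, and finally the optional explicit HClinitCheck, which stays last. A small sketch of how the unchecked index helpers fall out of that layout, with plain integers instead of the HIR classes:

  // Input layout sketch for HInvokeStaticOrDirect (illustrative, not ART code):
  //   [ arg0 .. argN-1 ][ current method? ][ special input? ][ explicit clinit check? ]
  #include <cstddef>
  #include <cstdio>

  struct InvokeLayoutSketch {
    size_t number_of_arguments;
    bool has_current_method_input;
    bool has_special_input;
    bool has_explicit_clinit_check;

    size_t CurrentMethodIndex() const { return number_of_arguments; }
    size_t SpecialInputIndex() const {
      return number_of_arguments + (has_current_method_input ? 1u : 0u);
    }
    size_t InputCount() const {
      return SpecialInputIndex() + (has_special_input ? 1u : 0u) +
             (has_explicit_clinit_check ? 1u : 0u);
    }
  };

  int main() {
    // E.g. a hypothetical @CriticalNative call on x86 with three arguments: the current
    // method input is needed for kCallCriticalNative and the PC-relative base is added
    // by pc_relative_fixups_x86 as the special input.
    InvokeLayoutSketch invoke{/*number_of_arguments=*/ 3u,
                              /*has_current_method_input=*/ true,
                              /*has_special_input=*/ true,
                              /*has_explicit_clinit_check=*/ false};
    std::printf("current method at %zu, special input at %zu, %zu inputs total\n",
                invoke.CurrentMethodIndex(), invoke.SpecialInputIndex(), invoke.InputCount());
    return 0;
  }
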
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index 4ff293c..3ea1918 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -195,15 +195,6 @@
   void HandleInvoke(HInvoke* invoke) {
     HInvokeStaticOrDirect* invoke_static_or_direct = invoke->AsInvokeStaticOrDirect();
 
-    // We can't add the method address if we already have a current method pointer.
-    // This may arise when sharpening doesn't remove the current method pointer from the invoke.
-    if (invoke_static_or_direct != nullptr && invoke_static_or_direct->HasCurrentMethodInput()) {
-      // Note: This happens only for recursive calls (including compiling an intrinsic
-      // by faking a call to itself; we use kRuntimeCall for this case).
-      DCHECK(!invoke_static_or_direct->HasPcRelativeMethodLoadKind());
-      return;
-    }
-
     // If this is an invoke-static/-direct with PC-relative addressing (within boot image
     // or using .bss or .data.bimg.rel.ro), we need the PC-relative address base.
     bool base_added = false;
@@ -246,7 +237,6 @@
         // This intrinsic needs the constant area.
         if (!base_added) {
           DCHECK(invoke_static_or_direct != nullptr);
-          DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());
           HX86ComputeBaseMethodAddress* method_address = GetPCRelativeBasePointer(invoke);
           invoke_static_or_direct->AddSpecialInput(method_address);
         }
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index 1539421..04a8eab 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -124,6 +124,13 @@
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
   }
 
+  if (method_load_kind != HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall &&
+      callee->IsCriticalNative()) {
+    DCHECK_NE(method_load_kind, HInvokeStaticOrDirect::MethodLoadKind::kRecursive);
+    DCHECK(callee->IsStatic());
+    code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative;
+  }
+
   if (codegen->GetGraph()->IsDebuggable()) {
     // For debuggable apps always use the code pointer from ArtMethod
     // so that we don't circumvent instrumentation stubs if installed.
diff --git a/dex2oat/linker/image_writer.cc b/dex2oat/linker/image_writer.cc
index c8f36cc..31d5e99 100644
--- a/dex2oat/linker/image_writer.cc
+++ b/dex2oat/linker/image_writer.cc
@@ -3432,6 +3432,9 @@
       CopyAndFixupPointer(copy, ArtMethod::DataOffset(target_ptr_size_), orig_table);
     } else if (UNLIKELY(orig == runtime->GetResolutionMethod())) {
       quick_code = GetOatAddress(StubType::kQuickResolutionTrampoline);
+      // Set the JNI entrypoint for resolving @CriticalNative methods called from compiled code.
+      const void* jni_code = GetOatAddress(StubType::kJNIDlsymLookupCriticalTrampoline);
+      copy->SetEntryPointFromJniPtrSize(jni_code, target_ptr_size_);
     } else {
       bool found_one = false;
       for (size_t i = 0; i < static_cast<size_t>(CalleeSaveType::kLastCalleeSaveType); ++i) {
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 1f843b3..103f60f 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -2375,12 +2375,22 @@
     } else if (method->IsAbstract() || method->IsClassInitializer()) {
       // Don't print information for these.
     } else if (method->IsRuntimeMethod()) {
-      ImtConflictTable* table = method->GetImtConflictTable(image_header_.GetPointerSize());
-      if (table != nullptr) {
-        indent_os << "IMT conflict table " << table << " method: ";
-        for (size_t i = 0, count = table->NumEntries(pointer_size); i < count; ++i) {
-          indent_os << ArtMethod::PrettyMethod(table->GetImplementationMethod(i, pointer_size))
-                    << " ";
+      if (method == Runtime::Current()->GetResolutionMethod()) {
+        const void* resolution_trampoline =
+            method->GetEntryPointFromQuickCompiledCodePtrSize(image_header_.GetPointerSize());
+        indent_os << StringPrintf("Resolution trampoline: %p\n", resolution_trampoline);
+        const void* critical_native_resolution_trampoline =
+            method->GetEntryPointFromJniPtrSize(image_header_.GetPointerSize());
+        indent_os << StringPrintf("Resolution trampoline for @CriticalNative: %p\n",
+                                  critical_native_resolution_trampoline);
+      } else {
+        ImtConflictTable* table = method->GetImtConflictTable(image_header_.GetPointerSize());
+        if (table != nullptr) {
+          indent_os << "IMT conflict table " << table << " method: ";
+          for (size_t i = 0, count = table->NumEntries(pointer_size); i < count; ++i) {
+            indent_os << ArtMethod::PrettyMethod(table->GetImplementationMethod(i, pointer_size))
+                      << " ";
+          }
         }
       }
     } else {
diff --git a/runtime/arch/arm/asm_support_arm.S b/runtime/arch/arm/asm_support_arm.S
index 5b51e51..7ffdf18 100644
--- a/runtime/arch/arm/asm_support_arm.S
+++ b/runtime/arch/arm/asm_support_arm.S
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_ARCH_ARM_ASM_SUPPORT_ARM_S_
 
 #include "asm_support_arm.h"
+#include "interpreter/cfi_asm_support.h"
 
 // Define special registers.
 
@@ -37,6 +38,16 @@
 .arch armv7-a
 .thumb
 
+.macro CFI_EXPRESSION_BREG n, b, offset
+    .if (-0x40 <= (\offset)) && ((\offset) < 0x40)
+        CFI_EXPRESSION_BREG_1(\n, \b, \offset)
+    .elseif (-0x2000 <= (\offset)) && ((\offset) < 0x2000)
+        CFI_EXPRESSION_BREG_2(\n, \b, \offset)
+    .else
+        .error "Unsupported offset"
+    .endif
+.endm
+
 // Macro to generate the value of Runtime::Current into rDest. As it uses labels
 // then the labels need to be unique. We bind these to the function name in the ENTRY macros.
 .macro RUNTIME_CURRENT name, num, rDest
@@ -149,6 +160,16 @@
 #endif  // USE_HEAP_POISONING
 .endm
 
+.macro INCREASE_FRAME frame_adjustment
+    sub sp, sp, #(\frame_adjustment)
+    .cfi_adjust_cfa_offset (\frame_adjustment)
+.endm
+
+.macro DECREASE_FRAME frame_adjustment
+    add sp, sp, #(\frame_adjustment)
+    .cfi_adjust_cfa_offset -(\frame_adjustment)
+.endm
+
 // Macro to refresh the Marking Register (R8).
 //
 // This macro must be called at the end of functions implementing
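
CFI_EXPRESSION_BREG above (and its arm64 twin below) describes a saved register with a DWARF "base register + offset" expression rather than a CFA-relative rule; the critical lookup stub needs this because its spills are addressed off R4/X29 while the CFA stays tied to SP. The _1/_2 suffixes presumably pick the one-byte vs. two-byte SLEB128 offset encoding, which matches the [-0x40, 0x40) and [-0x2000, 0x2000) guards; the underlying macros live in interpreter/cfi_asm_support.h, which this diff includes but does not show. A hedged sketch of the standard DWARF byte sequence such a macro would have to emit (not the actual cfi_asm_support.h implementation):

  // Sketch: encode "register <reg> is saved at [breg<base> + offset]" as the DWARF
  // CFI instruction DW_CFA_expression. Standard DWARF encoding, not ART's macros.
  #include <cstdint>
  #include <cstdio>
  #include <vector>

  void AppendUleb128(std::vector<uint8_t>* out, uint32_t value) {
    do {
      uint8_t byte = value & 0x7f;
      value >>= 7;
      if (value != 0u) byte |= 0x80;
      out->push_back(byte);
    } while (value != 0u);
  }

  void AppendSleb128(std::vector<uint8_t>* out, int32_t value) {
    bool more = true;
    while (more) {
      uint8_t byte = value & 0x7f;
      value >>= 7;
      if ((value == 0 && (byte & 0x40) == 0) || (value == -1 && (byte & 0x40) != 0)) {
        more = false;
      } else {
        byte |= 0x80;
      }
      out->push_back(byte);
    }
  }

  std::vector<uint8_t> CfaExpressionBreg(uint32_t reg, uint32_t base, int32_t offset) {
    std::vector<uint8_t> insn;
    insn.push_back(0x10);               // DW_CFA_expression
    AppendUleb128(&insn, reg);          // the register being described
    std::vector<uint8_t> expr;
    expr.push_back(0x70 + base);        // DW_OP_breg0 + base register number
    AppendSleb128(&expr, offset);       // signed offset from the base register
    AppendUleb128(&insn, expr.size());  // expression length
    insn.insert(insn.end(), expr.begin(), expr.end());
    return insn;
  }

  int main() {
    // Offsets in [-0x40, 0x40) fit the one-byte SLEB128 form; the larger
    // FRAME_SIZE_SAVE_REFS_AND_ARGS-relative offsets above need the two-byte form.
    const int32_t offsets[] = {8, 104};
    for (int32_t offset : offsets) {
      std::vector<uint8_t> insn = CfaExpressionBreg(/*reg=*/ 11u, /*base=*/ 4u, offset);
      std::printf("offset %d -> %zu-byte CFI instruction\n", offset, insn.size());
    }
    return 0;
  }
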
diff --git a/runtime/arch/arm/jni_entrypoints_arm.S b/runtime/arch/arm/jni_entrypoints_arm.S
index ceef772..3c506b0 100644
--- a/runtime/arch/arm/jni_entrypoints_arm.S
+++ b/runtime/arch/arm/jni_entrypoints_arm.S
@@ -24,10 +24,6 @@
 ENTRY art_jni_dlsym_lookup_stub
     push   {r0, r1, r2, r3, lr}           @ spill regs
     .cfi_adjust_cfa_offset 20
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset r1, 4
-    .cfi_rel_offset r2, 8
-    .cfi_rel_offset r3, 12
     .cfi_rel_offset lr, 16
     sub    sp, #12                        @ pad stack pointer to align frame
     .cfi_adjust_cfa_offset 12
@@ -40,10 +36,10 @@
     ldr    ip, [ip]                                   // ArtMethod* method
     ldr    ip, [ip, #ART_METHOD_ACCESS_FLAGS_OFFSET]  // uint32_t access_flags
     tst    ip, #(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE | ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE)
-    bne    .Llookup_stub_fast_native
+    bne    .Llookup_stub_fast_or_critical_native
     blx    artFindNativeMethod
     b      .Llookup_stub_continue
-.Llookup_stub_fast_native:
+.Llookup_stub_fast_or_critical_native:
     blx    artFindNativeMethodRunnable
 .Llookup_stub_continue:
     mov    r12, r0                        @ save result in r12
@@ -53,10 +49,6 @@
     cbz    r0, 1f                         @ is method code null?
     pop    {r0, r1, r2, r3, lr}           @ restore regs
     .cfi_adjust_cfa_offset -20
-    .cfi_restore r0
-    .cfi_restore r1
-    .cfi_restore r2
-    .cfi_restore r3
     .cfi_restore lr
     bx     r12                            @ if non-null, tail call to method's code
 1:
@@ -69,29 +61,94 @@
     tst    r4, #1
     bne art_jni_dlsym_lookup_stub
 
-    // We need to create a GenericJNI managed frame above the stack args.
+    // Reserve space for a SaveRefsAndArgs managed frame, either for the actual runtime
+    // method or for a GenericJNI frame which is similar but has a native method and a tag.
+    // Do this eagerly, so that we can use these registers as temps without the need to
+    // save and restore them multiple times.
+    INCREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
 
-    // GenericJNI frame is similar to SaveRegsAndArgs frame with the native method
-    // instead of runtime method saved at the bottom. Note that the runtime shall
-    // not examine the args here, otherwise we would have to move them in registers
-    // and stack to account for the difference between managed and native ABIs.
-    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
-    // Save the hidden arg as method pointer, r0 in the padding.
-    // (x0 is an arg in native ABI but not considered an arg in managed ABI.)
-    strd   r4, r0, [sp]
+    // Save args, the hidden arg and caller PC. No CFI needed for args and the hidden arg.
+    push   {r0, r1, r2, r3, r4, lr}
+    .cfi_adjust_cfa_offset 24
+    .cfi_rel_offset lr, 20
 
-    // Call artCriticalNativeOutArgsSize(method)
+    // Call artCriticalNativeFrameSize(method, caller_pc)
     mov    r0, r4  // r0 := method (from hidden arg)
-    bl     artCriticalNativeOutArgsSize
+    mov    r1, lr  // r1 := caller_pc
+    bl     artCriticalNativeFrameSize
 
-    // Check if we have any stack args.
-    cbnz   r0, .Lcritical_has_stack_args
+    // Prepare the return address for managed stack walk of the SaveRefsAndArgs frame.
+    // If we're coming from JNI stub with tail call, it is LR. If we're coming from
+    // JNI stub that saved the return address, it will be the last value we copy below.
+    // If we're coming directly from compiled code, it is LR, set further down.
+    ldr    lr, [sp, #20]
 
-    // Without stack args, the frame is fully constructed.
-    // Place tagged managed sp in Thread::Current()->top_quick_frame.
-    mov    ip, sp
-    orr    ip, #1  // Tag as GenericJNI frame.
-    str    ip, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+    // Move the stack args if any.
+    add    r4, sp, #24
+    cbz    r0, .Lcritical_skip_copy_args
+.Lcritical_copy_args_loop:
+    ldrd   ip, lr, [r4, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
+    subs   r0, r0, #8
+    strd   ip, lr, [r4], #8
+    bne    .Lcritical_copy_args_loop
+.Lcritical_skip_copy_args:
+    // The managed frame address is now in R4. This is conveniently a callee-save register in the native ABI.
+
+    // Restore args.
+    pop    {r0, r1, r2, r3}
+    .cfi_adjust_cfa_offset -16
+
+    // Spill registers for the SaveRefsAndArgs frame above the stack args.
+    // Note that the runtime shall not examine the args here, otherwise we would have to
+    // move them in registers and stack to account for the difference between managed and
+    // native ABIs.
+    add    ip, r4, #FRAME_SIZE_SAVE_REFS_AND_ARGS - 40
+    stmia  ip, {r1-r3, r5-r8, r10-r11, lr}  // LR: Save return address for tail call from JNI stub.
+    // (If there were any stack args, we're storing the value that's already there.
+    // For direct calls from compiled managed code, we shall overwrite this below.)
+    // Skip args r1-r3.
+    CFI_EXPRESSION_BREG 5, 4, FRAME_SIZE_SAVE_REFS_AND_ARGS - 28
+    CFI_EXPRESSION_BREG 6, 4, FRAME_SIZE_SAVE_REFS_AND_ARGS - 24
+    CFI_EXPRESSION_BREG 7, 4, FRAME_SIZE_SAVE_REFS_AND_ARGS - 20
+    CFI_EXPRESSION_BREG 8, 4, FRAME_SIZE_SAVE_REFS_AND_ARGS - 16
+    CFI_EXPRESSION_BREG 10, 4, FRAME_SIZE_SAVE_REFS_AND_ARGS - 12
+    CFI_EXPRESSION_BREG 11, 4, FRAME_SIZE_SAVE_REFS_AND_ARGS - 8
+    // The saved return PC for managed stack walk is not necessarily our LR.
+    // Skip managed FP args as these are native ABI caller-saves and not args.
+
+    // Restore the hidden arg to r1 and caller PC.
+    pop    {r1, lr}
+    .cfi_adjust_cfa_offset -8
+    .cfi_restore lr
+
+    // Save our return PC in the padding.
+    str   lr, [r4, #__SIZEOF_POINTER__]
+    CFI_EXPRESSION_BREG 14, 4, __SIZEOF_POINTER__
+
+    ldr    ip, [r1, #ART_METHOD_ACCESS_FLAGS_OFFSET]  // Load access flags.
+    add    r2, r4, #1             // Prepare managed SP tagged for a GenericJNI frame.
+    tst    ip, #ACCESS_FLAGS_METHOD_IS_NATIVE
+    bne    .Lcritical_skip_prepare_runtime_method
+
+    // When coming from a compiled method, the return PC for managed stack walk is LR.
+    // (When coming from a compiled stub, the correct return PC is already stored above.)
+    str    lr, [r4, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
+
+    // Replace the target method with the SaveRefsAndArgs runtime method.
+    RUNTIME_CURRENT1 r1
+    ldr    r1, [r1, #RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET]
+
+    mov    r2, r4                 // Prepare untagged managed SP for the runtime method.
+
+.Lcritical_skip_prepare_runtime_method:
+    // Store the method on the bottom of the managed frame.
+    str    r1, [r4]
+
+    // Place (maybe tagged) managed SP in Thread::Current()->top_quick_frame.
+    str    r2, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+
+    // Preserve the native arg register r0 in callee-save register r10 which was saved above.
+    mov    r10, r0
 
     // Call artFindNativeMethodRunnable()
     mov    r0, rSELF   // pass Thread::Current()
@@ -100,150 +157,88 @@
     // Store result in scratch reg.
     mov    ip, r0
 
-    // Restore frame.
-    .cfi_remember_state
-    ldrd   r4, r0, [sp]
-    RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    REFRESH_MARKING_REGISTER
+    // Restore the native arg register r0.
+    mov    r0, r10
 
-    // Check for exception.
-    cmp    ip, #0
-    beq    .Lcritical_deliver_exception
-
-    // Do the tail call.
-    bx     ip
-    .cfi_restore_state
-    .cfi_def_cfa_offset FRAME_SIZE_SAVE_REFS_AND_ARGS
-
-.Lcritical_has_stack_args:
-    // Move the out args size to a scratch register.
-    mov    ip, r0
-
-    // Restore register args as we're about to move stack args.
-    ldrd   r4, r0, [sp]
-    RESTORE_SAVE_REFS_AND_ARGS_FRAME
-
-    // Reserve space for SaveRefsAndArgs frame.
-    sub sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS
-    .cfi_adjust_cfa_offset FRAME_SIZE_SAVE_REFS_AND_ARGS
-
-    // Save arg regs so that we can use them as temporaries.
-    push   {r0-r3}
-    .cfi_adjust_cfa_offset 16
-
-    // Move out args. For simplicity include the return address at the end.
-    add    r0, sp, #16   // Destination.
-    add    ip, r0, ip    // Destination end.
-1:
-    ldrd   r2, r3, [r0, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
-    strd   r2, r3, [r0], #8
-    cmp    r0, ip
-    bne    1b
-
-    // Save our LR, load caller's LR and redefine CFI to take ownership of the JNI stub frame.
-    str    lr, [ip, #-__SIZEOF_POINTER__]
-    mov    lr, r3  // The last moved value from the loop above.
-    .cfi_def_cfa ip, FRAME_SIZE_SAVE_REFS_AND_ARGS
-
-    // Restore arg regs.
-    pop    {r0-r3}  // No `.cfi_adjust_cfa_offset`, CFA register is currently ip, not sp.
-
-    // Re-create the SaveRefsAndArgs frame above the args.
-    strd   r4, r0, [ip]  // r0 in the padding as before.
-    add    r4, ip, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40
-    stmia  r4, {r1-r3, r5-r8, r10-r11, lr}   @ 10 words of callee saves and args.
-    .cfi_rel_offset r1, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 0
-    .cfi_rel_offset r2, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 4
-    .cfi_rel_offset r3, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 8
-    .cfi_rel_offset r5, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 12
-    .cfi_rel_offset r6, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 16
-    .cfi_rel_offset r7, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 20
-    .cfi_rel_offset r8, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 24
-    .cfi_rel_offset r10, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 28
-    .cfi_rel_offset r11, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 32
-    .cfi_rel_offset lr, FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 + 36
-    vstmdb r4!, {s0-s15}                     @ 16 words of float args.
-
-    // Move the frame register to a callee-save register.
-    mov    r11, ip
-    .cfi_def_cfa_register r11
-
-    // Place tagged managed sp in Thread::Current()->top_quick_frame.
-    orr    ip, r11, #1  // Tag as GenericJNI frame.
-    str    ip, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
-
-    // Call artFindNativeMethodRunnable()
-    mov    r0, rSELF   // pass Thread::Current()
-    bl     artFindNativeMethodRunnable
-
-    // Store result in scratch reg.
-    mov    ip, r0
-
-    // Restore the frame. We shall not need the method anymore, so use r4 as scratch register.
-    mov    r4, r11
-    .cfi_def_cfa_register r4
-    ldr    r0, [r4, #4]
-    add    r11, r4, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - 40 - 64)
-    vldmia r11!, {s0-s15}                    @ 16 words of float args.
-    ldmia  r11, {r1-r3, r5-r8, r10-r11, lr}  @ 10 words of callee saves and args.
-    .cfi_restore r1
-    .cfi_restore r2
-    .cfi_restore r3
+    // Restore the frame. We shall not need the method anymore.
+    add    r1, r4, #FRAME_SIZE_SAVE_REFS_AND_ARGS - 40
+    ldmia  r1, {r1-r3, r5-r8, r10-r11}
     .cfi_restore r5
     .cfi_restore r6
     .cfi_restore r7
     .cfi_restore r8
     .cfi_restore r10
     .cfi_restore r11
-    .cfi_restore lr
+
     REFRESH_MARKING_REGISTER
 
-    // Check for exception.
+    // Check for exception before moving args back to keep the return PC for managed stack walk.
     cmp    ip, #0
-    beq    3f
+    beq    .Lcritical_deliver_exception
 
-    // Save arg regs so that we can use them as temporaries.
-    push   {r0-r3}  // No `.cfi_adjust_cfa_offset`, CFA register is currently r4, not sp.
+    .cfi_remember_state
+
+    // Restore our return PC.
+    ldr    lr, [r4, #__SIZEOF_POINTER__]
+    .cfi_restore lr
 
     // Move stack args to their original place.
-    mov    r0, r4
-    add    r1, sp, #16
-2:
-    ldrd   r2, r3, [r0, #-8]!
-    strd   r2, r3, [r0, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
-    cmp    r1, r0
-    bne    2b
-
-    // Replace original return address with caller's return address.
-    ldr    r1, [r4, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
-    str    lr, [r4, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
-
-    // Restore LR and redefine CFI to release ownership of the JNI stub frame.
-    .cfi_remember_state
-    mov    lr, r1
-    .cfi_def_cfa sp, FRAME_SIZE_SAVE_REFS_AND_ARGS + 16
-
-    // Restore args
-    pop    {r0-r3}
+    cmp    sp, r4
+    beq    .Lcritical_skip_copy_args_back
+    push   {r0, r1, r2, r3}
+    .cfi_adjust_cfa_offset 16
+    add    r0, sp, #16
+    sub    r0, r4, r0
+.Lcritical_copy_args_loop_back:
+    ldrd   r2, r3, [r4, #-8]!
+    subs   r0, r0, #8
+    strd   r2, r3, [r4, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
+    bne    .Lcritical_copy_args_loop_back
+    pop    {r0, r1, r2, r3}
     .cfi_adjust_cfa_offset -16
+.Lcritical_skip_copy_args_back:
 
     // Remove the frame reservation.
-    add    sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS
-    .cfi_adjust_cfa_offset -FRAME_SIZE_SAVE_REFS_AND_ARGS
+    DECREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
 
     // Do the tail call.
     bx     ip
     .cfi_restore_state
-    .cfi_def_cfa x4, FRAME_SIZE_SAVE_REFS_AND_ARGS
-
-3:
-    // Drop stack args and the SaveRefsAndArgs reservation.
-    mov    sp, r4
-    add    sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS
-    .cfi_def_cfa sp, 0
+    .cfi_def_cfa sp, FRAME_SIZE_SAVE_REFS_AND_ARGS
 
 .Lcritical_deliver_exception:
-    // When delivering exception, we check that rSELF was saved but the SaveRefsAndArgs frame does
-    // not save it, so we cannot use DELIVER_PENDING_EXCEPTION_FRAME_READY with the above frames.
-    DELIVER_PENDING_EXCEPTION
+    // The exception delivery checks that rSELF was saved but the SaveRefsAndArgs
+    // frame does not save it, so we cannot use the existing SaveRefsAndArgs frame.
+    // That's why we checked for exception after restoring registers from it.
+    // We need to build a SaveAllCalleeSaves frame instead. Args are irrelevant at this
+    // point but keep the area allocated for stack args to keep CFA definition simple.
+#if FRAME_SIZE_SAVE_REFS_AND_ARGS != FRAME_SIZE_SAVE_ALL_CALLEE_SAVES
+#  error "Expected FRAME_SIZE_SAVE_REFS_AND_ARGS == FRAME_SIZE_SAVE_ALL_CALLEE_SAVES"
+    // Otherwise we would need to adjust SP and R4 and move our return PC which is at [R4, #4].
+    // (Luckily, both SaveRefsAndArgs and SaveAllCalleeSaves frames have padding there.)
+#endif
+
+    // Spill registers for the SaveAllCalleeSaves frame above the stack args area.
+    add    ip, r4, #FRAME_SIZE_SAVE_ALL_CALLEE_SAVES - 32
+    stmia  ip, {r5-r11}  // Keep the caller PC for managed stack walk.
+    CFI_EXPRESSION_BREG 5, 4, FRAME_SIZE_SAVE_ALL_CALLEE_SAVES - 32
+    CFI_EXPRESSION_BREG 6, 4, FRAME_SIZE_SAVE_ALL_CALLEE_SAVES - 28
+    CFI_EXPRESSION_BREG 7, 4, FRAME_SIZE_SAVE_ALL_CALLEE_SAVES - 24
+    CFI_EXPRESSION_BREG 8, 4, FRAME_SIZE_SAVE_ALL_CALLEE_SAVES - 20
+    CFI_EXPRESSION_BREG 9, 4, FRAME_SIZE_SAVE_ALL_CALLEE_SAVES - 16
+    CFI_EXPRESSION_BREG 10, 4, FRAME_SIZE_SAVE_ALL_CALLEE_SAVES - 12
+    CFI_EXPRESSION_BREG 11, 4, FRAME_SIZE_SAVE_ALL_CALLEE_SAVES - 8
+    // Skip R4, it is callee-save in managed ABI.
+    add    ip, r4, #12
+    vstmia ip, {s16-s31}
+
+    // Store ArtMethod* Runtime::callee_save_methods_[kSaveAllCalleeSaves] to the managed frame.
+    RUNTIME_CURRENT2 ip
+    ldr   ip, [ip, #RUNTIME_SAVE_ALL_CALLEE_SAVES_METHOD_OFFSET]
+    str   ip, [r4]
+
+    // Place the managed frame SP in Thread::Current()->top_quick_frame.
+    str   r4, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END art_jni_dlsym_lookup_critical_stub
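
The rewritten art_jni_dlsym_lookup_critical_stub above no longer splits into "no stack args" and "has stack args" paths: it always reserves a SaveRefsAndArgs-sized frame, asks artCriticalNativeFrameSize(method, caller_pc) for the caller's out-args size, copies those args down against the new frame, and publishes either a tagged GenericJNI frame (hidden arg is a native method) or an untagged frame with the SaveRefsAndArgs runtime method before calling artFindNativeMethodRunnable(). A heavily simplified C++ sketch of that control flow; everything here is a placeholder stand-in, the real logic is the assembly above, and none of the register, CFI or frame-layout details appear:

  // Step-by-step sketch of the critical lookup stub; illustrative pseudocode only.
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  struct ArtMethodSketch {
    bool is_native;     // Stands in for ACCESS_FLAGS_METHOD_IS_NATIVE.
    size_t stack_args;  // Bytes of outgoing stack args at the call site.
  };

  // Placeholder stand-ins for the runtime entrypoints used by the stub.
  size_t artCriticalNativeFrameSizeSketch(const ArtMethodSketch& m, uintptr_t /*caller_pc*/) {
    return m.stack_args;
  }
  const void* artFindNativeMethodRunnableSketch() {
    static const char code[] = "native code";
    return code;  // nullptr would mean "pending exception".
  }

  const void* CriticalLookupStubSketch(const ArtMethodSketch& hidden_arg, uintptr_t caller_pc) {
    // 1. Reserve a SaveRefsAndArgs-sized frame, then ask how many bytes of outgoing
    //    stack args the caller placed above the return address.
    size_t stack_args_size = artCriticalNativeFrameSizeSketch(hidden_arg, caller_pc);

    // 2. Copy those stack args down against the reserved frame and spill the managed
    //    callee-saves (and the register args) into it.
    std::printf("moving %zu bytes of stack args\n", stack_args_size);

    // 3. Publish the frame in the thread's top_quick_frame: a tagged GenericJNI frame
    //    with the native method at the bottom, or an untagged frame holding the
    //    SaveRefsAndArgs runtime method when the hidden arg is not a native method.
    std::printf("frame type: %s\n", hidden_arg.is_native ? "GenericJNI (tagged SP)"
                                                         : "SaveRefsAndArgs runtime method");

    // 4. Resolve the native code.
    const void* code = artFindNativeMethodRunnableSketch();

    // 5. On success: restore registers, move the stack args back, drop the reservation
    //    and tail-call `code`. On nullptr: build a SaveAllCalleeSaves frame and deliver
    //    the pending exception.
    return code;
  }

  int main() {
    ArtMethodSketch method{/*is_native=*/ true, /*stack_args=*/ 16u};
    CriticalLookupStubSketch(method, /*caller_pc=*/ 0u);
    return 0;
  }
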
diff --git a/runtime/arch/arm/jni_frame_arm.h b/runtime/arch/arm/jni_frame_arm.h
index 5203eaf..2263873 100644
--- a/runtime/arch/arm/jni_frame_arm.h
+++ b/runtime/arch/arm/jni_frame_arm.h
@@ -38,9 +38,8 @@
 // Note: AAPCS is soft-float, so these are all core registers.
 constexpr size_t kJniArgumentRegisterCount = 4u;
 
-// Get the size of "out args" for @CriticalNative method stub.
-// This must match the size of the frame emitted by the JNI compiler at the native call site.
-inline size_t GetCriticalNativeOutArgsSize(const char* shorty, uint32_t shorty_len) {
+// Get the stack args size for @CriticalNative method calls.
+inline size_t GetCriticalNativeCallArgsSize(const char* shorty, uint32_t shorty_len) {
   DCHECK_EQ(shorty_len, strlen(shorty));
 
   size_t reg = 0;  // Register for the current argument; if reg >= 4, we shall use stack.
@@ -54,7 +53,14 @@
     reg += 1u;
   }
   size_t stack_args = std::max(reg, kJniArgumentRegisterCount) - kJniArgumentRegisterCount;
-  size_t size = kFramePointerSize * stack_args;
+  return kFramePointerSize * stack_args;
+}
+
+// Get the frame size for @CriticalNative method stub.
+// This must match the size of the frame emitted by the JNI compiler at the native call site.
+inline size_t GetCriticalNativeStubFrameSize(const char* shorty, uint32_t shorty_len) {
+  // The size of outgoing arguments.
+  size_t size = GetCriticalNativeCallArgsSize(shorty, shorty_len);
 
   // Check if this is a tail call, i.e. there are no stack args and the return type
   // is not  an FP type (otherwise we need to move the result to FP register).
@@ -65,6 +71,16 @@
   return RoundUp(size, kAapcsStackAlignment);
 }
 
+// Get the frame size for direct call to a @CriticalNative method.
+// This must match the size of the extra frame emitted by the compiler at the native call site.
+inline size_t GetCriticalNativeDirectCallFrameSize(const char* shorty, uint32_t shorty_len) {
+  // The size of outgoing arguments.
+  size_t size = GetCriticalNativeCallArgsSize(shorty, shorty_len);
+
+  // No return PC to save; zero- and sign-extension and FP value moves are handled by the caller.
+  return RoundUp(size, kAapcsStackAlignment);
+}
+
 }  // namespace arm
 }  // namespace art
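
GetCriticalNativeDirectCallFrameSize() above is the arm counterpart of the helper the x86-64 code generator passes to PrepareCriticalNativeCall earlier in this change: the extra frame is just the outgoing stack args rounded up to AAPCS alignment, with no return PC slot and no result-move handling. A small sketch of the computation for a hypothetical shorty; the per-argument register accounting is outside the hunks shown, so the sketch assumes the AAPCS rule that a 64-bit argument starts at an even register slot:

  // Sketch of GetCriticalNativeCallArgsSize / GetCriticalNativeDirectCallFrameSize
  // for ARM (AAPCS32, soft-float). Illustrative only; the 64-bit register accounting
  // below is an assumption, since the real loop body is outside the hunks shown.
  #include <algorithm>
  #include <cstddef>
  #include <cstdio>

  constexpr size_t kFramePointerSize = 4u;          // 32-bit ARM.
  constexpr size_t kJniArgumentRegisterCount = 4u;  // r0-r3.
  constexpr size_t kAapcsStackAlignment = 8u;

  size_t RoundUpTo(size_t x, size_t n) { return (x + n - 1u) / n * n; }

  size_t CallArgsSize(const char* shorty) {
    size_t reg = 0u;  // Register slot for the current argument; slots >= 4 go to the stack.
    for (const char* p = shorty + 1; *p != '\0'; ++p) {  // Skip the return type.
      if (*p == 'J' || *p == 'D') {
        reg = RoundUpTo(reg, 2u);  // Assumed AAPCS alignment for 64-bit arguments.
        reg += 2u;
      } else {
        reg += 1u;
      }
    }
    size_t stack_args = std::max(reg, kJniArgumentRegisterCount) - kJniArgumentRegisterCount;
    return kFramePointerSize * stack_args;
  }

  // For a direct call from compiled managed code the extra frame is just the stack
  // args rounded to AAPCS alignment (no return PC, no result moves).
  size_t DirectCallFrameSize(const char* shorty) {
    return RoundUpTo(CallArgsSize(shorty), kAapcsStackAlignment);
  }

  int main() {
    // Hypothetical static native int f(long, long, int): shorty "IJJI"
    // -> r0/r1, r2/r3, one 4-byte stack slot, frame rounded up to 8 bytes.
    std::printf("call args = %zu bytes, direct-call frame = %zu bytes\n",
                CallArgsSize("IJJI"), DirectCallFrameSize("IJJI"));
    return 0;
  }
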
 
diff --git a/runtime/arch/arm64/asm_support_arm64.S b/runtime/arch/arm64/asm_support_arm64.S
index fd5c852..f7fa7df 100644
--- a/runtime/arch/arm64/asm_support_arm64.S
+++ b/runtime/arch/arm64/asm_support_arm64.S
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_ARCH_ARM64_ASM_SUPPORT_ARM64_S_
 
 #include "asm_support_arm64.h"
+#include "interpreter/cfi_asm_support.h"
 
 // Define special registers.
 
@@ -40,6 +41,16 @@
 #define wMR w20
 #endif
 
+.macro CFI_EXPRESSION_BREG n, b, offset
+    .if (-0x40 <= (\offset)) && ((\offset) < 0x40)
+        CFI_EXPRESSION_BREG_1(\n, \b, \offset)
+    .elseif (-0x2000 <= (\offset)) && ((\offset) < 0x2000)
+        CFI_EXPRESSION_BREG_2(\n, \b, \offset)
+    .else
+        .error "Unsupported offset"
+    .endif
+.endm
+
 .macro ENTRY_ALIGNED name, alignment
     .type \name, #function
     .hidden \name  // Hide this as a global symbol, so we do not incur plt calls.
@@ -98,11 +109,15 @@
     .cfi_rel_offset \reg, (\offset)
 .endm
 
-.macro RESTORE_REG reg, offset
-    ldr \reg, [sp, #(\offset)]
+.macro RESTORE_REG_BASE base, reg, offset
+    ldr \reg, [\base, #(\offset)]
     .cfi_restore \reg
 .endm
 
+.macro RESTORE_REG reg, offset
+    RESTORE_REG_BASE sp, \reg, \offset
+.endm
+
 .macro SAVE_TWO_REGS_BASE base, reg1, reg2, offset
     stp \reg1, \reg2, [\base, #(\offset)]
     .cfi_rel_offset \reg1, (\offset)
@@ -125,11 +140,11 @@
 
 .macro LOAD_RUNTIME_INSTANCE reg
 #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
-    adrp xIP0, :pg_hi21_nc:_ZN3art7Runtime9instance_E
+    adrp \reg, :pg_hi21_nc:_ZN3art7Runtime9instance_E
 #else
-    adrp xIP0, _ZN3art7Runtime9instance_E
+    adrp \reg, _ZN3art7Runtime9instance_E
 #endif
-    ldr xIP0, [xIP0, #:lo12:_ZN3art7Runtime9instance_E]
+    ldr \reg, [\reg, #:lo12:_ZN3art7Runtime9instance_E]
 .endm
 
 // Macro to refresh the Marking Register (W20).
diff --git a/runtime/arch/arm64/jni_entrypoints_arm64.S b/runtime/arch/arm64/jni_entrypoints_arm64.S
index 8a34662..f72bc55 100644
--- a/runtime/arch/arm64/jni_entrypoints_arm64.S
+++ b/runtime/arch/arm64/jni_entrypoints_arm64.S
@@ -55,10 +55,10 @@
     ldr   xIP0, [xIP0, #ART_METHOD_ACCESS_FLAGS_OFFSET]   // uint32_t access_flags
     mov   xIP1, #(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE | ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE)
     tst   xIP0, xIP1
-    b.ne  .Llookup_stub_fast_native
+    b.ne  .Llookup_stub_fast_or_critical_native
     bl    artFindNativeMethod
     b     .Llookup_stub_continue
-    .Llookup_stub_fast_native:
+    .Llookup_stub_fast_or_critical_native:
     bl    artFindNativeMethodRunnable
 .Llookup_stub_continue:
     mov   x17, x0    // store result in scratch reg.
@@ -97,136 +97,235 @@
     // For Generic JNI we already have a managed frame, so we reuse the art_jni_dlsym_lookup_stub.
     tbnz  x15, #0, art_jni_dlsym_lookup_stub
 
-    // We need to create a GenericJNI managed frame above the stack args.
+    // Save args, the hidden arg and caller PC. No CFI needed for args and the hidden arg.
+    stp   x0, x1, [sp, #-(8 * 8 + 8 * 8 + 2 * 8)]!
+    .cfi_adjust_cfa_offset (8 * 8 + 8 * 8 + 2 * 8)
+    stp   x2, x3, [sp, #16]
+    stp   x4, x5, [sp, #32]
+    stp   x6, x7, [sp, #48]
+    stp   d0, d1, [sp, #64]
+    stp   d2, d3, [sp, #80]
+    stp   d4, d5, [sp, #96]
+    stp   d6, d7, [sp, #112]
+    stp   x15, lr, [sp, #128]
+    .cfi_rel_offset lr, 136
 
-    // GenericJNI frame is similar to SaveRegsAndArgs frame with the native method
-    // instead of runtime method saved at the bottom. Note that the runtime shall
-    // not examine the args here, otherwise we would have to move them in registers
-    // and stack to account for the difference between managed and native ABIs.
-    INCREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
-    SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL sp
-    // Save the hidden arg as method pointer, x0 in the padding.
-    // (x0 is an arg in native ABI but not considered an arg in managed ABI.)
-    SAVE_TWO_REGS x15, x0, 0
-
-    // Call artCriticalNativeOutArgsSize(method)
+    // Call artCriticalNativeFrameSize(method, caller_pc)
     mov   x0, x15  // x0 := method (from hidden arg)
-    bl    artCriticalNativeOutArgsSize
+    mov   x1, lr   // x1 := caller_pc
+    bl    artCriticalNativeFrameSize
 
-    // Check if we have any stack args.
-    cbnz  x0, .Lcritical_has_stack_args
+    // Move frame size to x14.
+    mov   x14, x0
 
-    // Without stack args, the frame is fully constructed.
-    // Place tagged managed sp in Thread::Current()->top_quick_frame.
-    mov   xIP0, sp
-    orr   xIP0, xIP0, #1  // Tag as GenericJNI frame.
-    str   xIP0, [xSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+    // Restore args, the hidden arg and caller PC.
+    ldp   x2, x3, [sp, #16]
+    ldp   x4, x5, [sp, #32]
+    ldp   x6, x7, [sp, #48]
+    ldp   d0, d1, [sp, #64]
+    ldp   d2, d3, [sp, #80]
+    ldp   d4, d5, [sp, #96]
+    ldp   d6, d7, [sp, #112]
+    ldp   x15, lr, [sp, #128]
+    .cfi_restore lr
+    ldp   x0, x1, [sp], #(8 * 8 + 8 * 8 + 2 * 8)
+    .cfi_adjust_cfa_offset -(8 * 8 + 8 * 8 + 2 * 8)
+
+    // Reserve space for a SaveRefsAndArgs managed frame, either for the actual runtime
+    // method or for a GenericJNI frame which is similar but has a native method and a tag.
+    INCREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
+
+    // Calculate the base address of the managed frame.
+    add   x13, sp, x14
+
+    // Prepare the return address for managed stack walk of the SaveRefsAndArgs frame.
+    // If we're coming from JNI stub with tail call, it is LR. If we're coming from
+    // JNI stub that saved the return address, it will be the last value we copy below.
+    // If we're coming directly from compiled code, it is LR, set further down.
+    mov   xIP1, lr
+
+    // Move the stack args if any.
+    cbz   x14, .Lcritical_skip_copy_args
+    mov   x12, sp
+.Lcritical_copy_args_loop:
+    ldp   xIP0, xIP1, [x12, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
+    subs  x14, x14, #16
+    stp   xIP0, xIP1, [x12], #16
+    bne   .Lcritical_copy_args_loop
+.Lcritical_skip_copy_args:
+
+    // Spill registers for the SaveRefsAndArgs frame above the stack args.
+    // Note that the runtime shall not examine the args here, otherwise we would have to
+    // move them in registers and stack to account for the difference between managed and
+    // native ABIs. Do not update CFI while we hold the frame address in x13 and the values
+    // in registers are unchanged.
+    stp   d0, d1, [x13, #16]
+    stp   d2, d3, [x13, #32]
+    stp   d4, d5, [x13, #48]
+    stp   d6, d7, [x13, #64]
+    stp   x1, x2, [x13, #80]
+    stp   x3, x4, [x13, #96]
+    stp   x5, x6, [x13, #112]
+    stp   x7, x20, [x13, #128]
+    stp   x21, x22, [x13, #144]
+    stp   x23, x24, [x13, #160]
+    stp   x25, x26, [x13, #176]
+    stp   x27, x28, [x13, #192]
+    stp   x29, xIP1, [x13, #208]  // xIP1: Save return address for tail call from JNI stub.
+    // (If there were any stack args, we're storing the value that's already there.
+    // For direct calls from compiled managed code, we shall overwrite this below.)
+
+    // Move the managed frame address to native callee-save register x29 and update CFI.
+    mov   x29, x13
+    // Skip args d0-d7, x1-x7
+    CFI_EXPRESSION_BREG 20, 29, 136
+    CFI_EXPRESSION_BREG 21, 29, 144
+    CFI_EXPRESSION_BREG 22, 29, 152
+    CFI_EXPRESSION_BREG 23, 29, 160
+    CFI_EXPRESSION_BREG 24, 29, 168
+    CFI_EXPRESSION_BREG 25, 29, 176
+    CFI_EXPRESSION_BREG 26, 29, 184
+    CFI_EXPRESSION_BREG 27, 29, 192
+    CFI_EXPRESSION_BREG 28, 29, 200
+    CFI_EXPRESSION_BREG 29, 29, 208
+    // The saved return PC for managed stack walk is not necessarily our LR.
+
+    // Save our return PC in the padding.
+    str   lr, [x29, #__SIZEOF_POINTER__]
+    CFI_EXPRESSION_BREG 30, 29, __SIZEOF_POINTER__
+
+    ldr   wIP0, [x15, #ART_METHOD_ACCESS_FLAGS_OFFSET]  // Load access flags.
+    add   x14, x29, #1            // Prepare managed SP tagged for a GenericJNI frame.
+    tbnz  wIP0, #ACCESS_FLAGS_METHOD_IS_NATIVE_BIT, .Lcritical_skip_prepare_runtime_method
+
+    // When coming from a compiled method, the return PC for managed stack walk is LR.
+    // (When coming from a compiled stub, the correct return PC is already stored above.)
+    str   lr, [x29, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
+
+    // Replace the target method with the SaveRefsAndArgs runtime method.
+    LOAD_RUNTIME_INSTANCE x15
+    ldr   x15, [x15, #RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET]
+
+    mov   x14, x29                // Prepare untagged managed SP for the runtime method.
+
+.Lcritical_skip_prepare_runtime_method:
+    // Store the method on the bottom of the managed frame.
+    str   x15, [x29]
+
+    // Place (maybe tagged) managed SP in Thread::Current()->top_quick_frame.
+    str   x14, [xSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+
+    // Preserve the native arg register x0 in callee-save register x28 which was saved above.
+    mov   x28, x0
 
     // Call artFindNativeMethodRunnable()
     mov   x0, xSELF   // pass Thread::Current()
     bl    artFindNativeMethodRunnable
 
     // Store result in scratch reg.
-    mov   xIP0, x0
+    mov   x13, x0
 
-    // Restore frame.
+    // Restore the native arg register x0.
+    mov   x0, x28
+
+    // Restore our return PC.
+    RESTORE_REG_BASE x29, lr, __SIZEOF_POINTER__
+
+    // Remember the stack args size, negated because SP cannot be on the right-hand side in SUB.
+    sub   x14, sp, x29
+
+    // Restore the frame. We shall not need the method anymore.
+    ldp   d0, d1, [x29, #16]
+    ldp   d2, d3, [x29, #32]
+    ldp   d4, d5, [x29, #48]
+    ldp   d6, d7, [x29, #64]
+    ldp   x1, x2, [x29, #80]
+    ldp   x3, x4, [x29, #96]
+    ldp   x5, x6, [x29, #112]
+    ldp   x7, x20, [x29, #128]
+    .cfi_restore x20
+    RESTORE_TWO_REGS_BASE x29, x21, x22, 144
+    RESTORE_TWO_REGS_BASE x29, x23, x24, 160
+    RESTORE_TWO_REGS_BASE x29, x25, x26, 176
+    RESTORE_TWO_REGS_BASE x29, x27, x28, 192
+    RESTORE_REG_BASE x29, x29, 208
+
+    REFRESH_MARKING_REGISTER
+
+    // Check for exception before moving args back to keep the return PC for managed stack walk.
+    cbz   x13, .Lcritical_deliver_exception
+
     .cfi_remember_state
-    RESTORE_TWO_REGS x15, x0, 0
-    RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    REFRESH_MARKING_REGISTER
-
-    // Check for exception.
-    cbz   xIP0, .Lcritical_deliver_exception
-
-    // Do the tail call
-    br    xIP0
-    .cfi_restore_state
-    .cfi_def_cfa_offset FRAME_SIZE_SAVE_REFS_AND_ARGS
-
-.Lcritical_has_stack_args:
-    // Move the out args size to a scratch register.
-    mov   xIP0, x0
-
-    // Restore register args as we're about to move stack args.
-    RESTORE_TWO_REGS x15, x0, 0
-    RESTORE_SAVE_REFS_AND_ARGS_FRAME_INTERNAL sp
-
-    // Move out args. For simplicity include the return address at the end.
-    mov   x8, sp        // Destination.
-    add   x9, sp, xIP0  // Destination end.
-1:
-    ldp   x10, x11, [x8, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
-    stp   x10, x11, [x8], #16
-    cmp   x8, x9
-    bne   1b
-
-    // Save our LR, load caller's LR and redefine CFI to take ownership of the JNI stub frame.
-    str   xLR, [x9, #-__SIZEOF_POINTER__]
-    mov   xLR, x11  // The last moved value from the loop above.
-    .cfi_def_cfa x9, FRAME_SIZE_SAVE_REFS_AND_ARGS
-
-    // Re-create the SaveRefsAndArgs frame above the args.
-    SETUP_SAVE_REFS_AND_ARGS_FRAME_INTERNAL x9
-    SAVE_TWO_REGS_BASE x9, x15, x0, 0
-
-    // Move the frame register to a callee-save register.
-    mov   x29, x9
-    .cfi_def_cfa_register x29
-
-    // Place tagged managed sp in Thread::Current()->top_quick_frame.
-    orr   xIP0, x29, #1  // Tag as GenericJNI frame.
-    str   xIP0, [xSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
-
-    // Call artFindNativeMethodRunnable()
-    mov   x0, xSELF   // pass Thread::Current()
-    bl    artFindNativeMethodRunnable
-
-    // Store result in scratch reg.
-    mov   xIP0, x0
-
-    // Restore the frame.
-    mov   x9, x29
-    .cfi_def_cfa_register x9
-    RESTORE_TWO_REGS_BASE x9, x15, x0, 0
-    RESTORE_SAVE_REFS_AND_ARGS_FRAME_INTERNAL x9
-    REFRESH_MARKING_REGISTER
-
-    // Check for exception.
-    cbz   xIP0, 3f
 
     // Move stack args to their original place.
-    mov   x8, x9
-2:
-    ldp   x10, x11, [x8, #-16]!
-    stp   x10, x11, [x8, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
-    cmp   sp, x8
-    bne   2b
-
-    // Replace original return address with caller's return address.
-    ldr   xIP1, [x9, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
-    str   xLR, [x9, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)]
-
-    // Restore LR and redefine CFI to release ownership of the JNI stub frame.
-    .cfi_remember_state
-    mov   xLR, xIP1
-    .cfi_def_cfa sp, FRAME_SIZE_SAVE_REFS_AND_ARGS
+    cbz   x14, .Lcritical_skip_copy_args_back
+    sub   x12, sp, x14
+.Lcritical_copy_args_back_loop:
+    ldp   xIP0, xIP1, [x12, #-16]!
+    adds  x14, x14, #16
+    stp   xIP0, xIP1, [x12, #FRAME_SIZE_SAVE_REFS_AND_ARGS]
+    bne   .Lcritical_copy_args_back_loop
+.Lcritical_skip_copy_args_back:
 
     // Remove the frame reservation.
     DECREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS
 
     // Do the tail call.
-    br    xIP0
+    br    x13
     .cfi_restore_state
-    .cfi_def_cfa x9, FRAME_SIZE_SAVE_REFS_AND_ARGS
-
-3:
-    // Drop stack args and the SaveRefsAndArgs reservation.
-    mov   sp, x9
-    add   sp, sp, #FRAME_SIZE_SAVE_REFS_AND_ARGS
-    .cfi_def_cfa sp, 0
+    .cfi_def_cfa sp, FRAME_SIZE_SAVE_REFS_AND_ARGS
 
 .Lcritical_deliver_exception:
-    // When delivering exception, we check that xSELF was saved but the SaveRefsAndArgs frame does
-    // not save it, so we cannot use DELIVER_PENDING_EXCEPTION_FRAME_READY with the above frames.
-    DELIVER_PENDING_EXCEPTION
+    // The exception delivery checks that xSELF was saved but the SaveRefsAndArgs
+    // frame does not save it, so we cannot use the existing SaveRefsAndArgs frame.
+    // That's why we checked for exception after restoring registers from it.
+    // We need to build a SaveAllCalleeSaves frame instead. Args are irrelevant at this
+    // point but keep the area allocated for stack args to keep CFA definition simple.
+    DECREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_ALL_CALLEE_SAVES
+
+    // Calculate the base address of the managed frame.
+    sub   x13, sp, x14
+
+    // Spill registers for the SaveAllCalleeSaves frame above the stack args area. Do not update
+    // CFI while we hold the frame address in x13 and the values in registers are unchanged.
+    stp   d8, d9, [x13, #16]
+    stp   d10, d11, [x13, #32]
+    stp   d12, d13, [x13, #48]
+    stp   d14, d15, [x13, #64]
+    stp   x19, x20, [x13, #80]
+    stp   x21, x22, [x13, #96]
+    stp   x23, x24, [x13, #112]
+    stp   x25, x26, [x13, #128]
+    stp   x27, x28, [x13, #144]
+    str   x29, [x13, #160]
+    // Keep the caller PC for managed stack walk.
+
+    // Move the managed frame address to native callee-save register x29 and update CFI.
+    mov   x29, x13
+    CFI_EXPRESSION_BREG 19, 29, 80
+    CFI_EXPRESSION_BREG 20, 29, 88
+    CFI_EXPRESSION_BREG 21, 29, 96
+    CFI_EXPRESSION_BREG 22, 29, 104
+    CFI_EXPRESSION_BREG 23, 29, 112
+    CFI_EXPRESSION_BREG 24, 29, 120
+    CFI_EXPRESSION_BREG 25, 29, 128
+    CFI_EXPRESSION_BREG 26, 29, 136
+    CFI_EXPRESSION_BREG 27, 29, 144
+    CFI_EXPRESSION_BREG 28, 29, 152
+    CFI_EXPRESSION_BREG 29, 29, 160
+    // The saved return PC for managed stack walk is not necessarily our LR.
+
+    // Save our return PC in the padding.
+    str   lr, [x29, #__SIZEOF_POINTER__]
+    CFI_EXPRESSION_BREG 30, 29, __SIZEOF_POINTER__
+
+    // Store ArtMethod* Runtime::callee_save_methods_[kSaveAllCalleeSaves] to the managed frame.
+    LOAD_RUNTIME_INSTANCE xIP0
+    ldr xIP0, [xIP0, #RUNTIME_SAVE_ALL_CALLEE_SAVES_METHOD_OFFSET]
+    str xIP0, [x29]
+
+    // Place the managed frame SP in Thread::Current()->top_quick_frame.
+    str x29, [xSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END art_jni_dlsym_lookup_critical_stub
diff --git a/runtime/arch/arm64/jni_frame_arm64.h b/runtime/arch/arm64/jni_frame_arm64.h
index fa4d43c..17e7434 100644
--- a/runtime/arch/arm64/jni_frame_arm64.h
+++ b/runtime/arch/arm64/jni_frame_arm64.h
@@ -41,28 +41,34 @@
 // in registers. The rest of the args must go on the stack.
 constexpr size_t kMaxIntLikeRegisterArguments = 8u;
 
-// Get the size of "out args" for @CriticalNative method stub.
-// This must match the size of the frame emitted by the JNI compiler at the native call site.
-inline size_t GetCriticalNativeOutArgsSize(const char* shorty, uint32_t shorty_len) {
-  DCHECK_EQ(shorty_len, strlen(shorty));
-
-  size_t num_fp_args = 0u;
-  for (size_t i = 1; i != shorty_len; ++i) {
-    if (shorty[i] == 'F' || shorty[i] == 'D') {
-      num_fp_args += 1u;
-    }
-  }
-  size_t num_non_fp_args = shorty_len - 1u - num_fp_args;
-
+// Get the size of the arguments for a native call.
+inline size_t GetNativeOutArgsSize(size_t num_fp_args, size_t num_non_fp_args) {
   // Account for FP arguments passed through v0-v7.
   size_t num_stack_fp_args =
       num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
   // Account for other (integer and pointer) arguments passed through GPR (x0-x7).
   size_t num_stack_non_fp_args =
       num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
+  // Each stack argument takes 8 bytes.
+  return (num_stack_fp_args + num_stack_non_fp_args) * static_cast<size_t>(kArm64PointerSize);
+}
+
+// Get stack args size for @CriticalNative method calls.
+inline size_t GetCriticalNativeCallArgsSize(const char* shorty, uint32_t shorty_len) {
+  DCHECK_EQ(shorty_len, strlen(shorty));
+
+  size_t num_fp_args =
+      std::count_if(shorty + 1, shorty + shorty_len, [](char c) { return c == 'F' || c == 'D'; });
+  size_t num_non_fp_args = shorty_len - 1u - num_fp_args;
+
+  return GetNativeOutArgsSize(num_fp_args, num_non_fp_args);
+}
+
+// Get the frame size for @CriticalNative method stub.
+// This must match the size of the extra frame emitted by the compiler at the native call site.
+inline size_t GetCriticalNativeStubFrameSize(const char* shorty, uint32_t shorty_len) {
   // The size of outgoing arguments.
-  size_t size =
-      (num_stack_fp_args + num_stack_non_fp_args) * static_cast<size_t>(kArm64PointerSize);
+  size_t size = GetCriticalNativeCallArgsSize(shorty, shorty_len);
 
   // We can make a tail call if there are no stack args and we do not need
   // to extend the result. Otherwise, add space for return PC.
@@ -72,6 +78,16 @@
   return RoundUp(size, kAapcs64StackAlignment);
 }
 
+// Get the frame size for direct call to a @CriticalNative method.
+// This must match the size of the frame emitted by the JNI compiler at the native call site.
+inline size_t GetCriticalNativeDirectCallFrameSize(const char* shorty, uint32_t shorty_len) {
+  // The size of outgoing arguments.
+  size_t size = GetCriticalNativeCallArgsSize(shorty, shorty_len);
+
+  // No return PC to save, zero- and sign-extension are handled by the caller.
+  return RoundUp(size, kAapcs64StackAlignment);
+}
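+
+// Worked example (hypothetical shorty, illustration only): for shorty "IIIIIIIIIII"
+// (int return, ten int args), two arguments spill to the stack, so
+// GetCriticalNativeCallArgsSize() == 16, GetCriticalNativeDirectCallFrameSize() ==
+// RoundUp(16, kAapcs64StackAlignment) == 16, and GetCriticalNativeStubFrameSize()
+// additionally reserves space for the return PC, yielding 32.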
+
 }  // namespace arm64
 }  // namespace art
 
diff --git a/runtime/arch/x86/asm_support_x86.S b/runtime/arch/x86/asm_support_x86.S
index 8938d8b..1a75cbc 100644
--- a/runtime/arch/x86/asm_support_x86.S
+++ b/runtime/arch/x86/asm_support_x86.S
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_S_
 
 #include "asm_support_x86.h"
+#include "interpreter/cfi_asm_support.h"
 
 // Regular gas(1) & current clang/llvm assembler support named macro parameters.
 #define MACRO0(macro_name) .macro macro_name
@@ -77,6 +78,7 @@
     #define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg
     #define CFI_RESTORE(reg) .cfi_restore reg
     #define CFI_REL_OFFSET(reg,size) .cfi_rel_offset reg,size
+    #define CFI_REGISTER(orig_reg, current_reg) .cfi_register orig_reg, current_reg
     #define CFI_REMEMBER_STATE .cfi_remember_state
     // The spec is not clear whether the CFA is part of the saved state and tools
     // differ in the behaviour, so explicitly set the CFA to avoid any ambiguity.
@@ -93,11 +95,34 @@
     #define CFI_DEF_CFA_REGISTER(reg)
     #define CFI_RESTORE(reg)
     #define CFI_REL_OFFSET(reg,size)
+    #define CFI_REGISTER(orig_reg, current_reg)
     #define CFI_REMEMBER_STATE
     #define CFI_RESTORE_STATE_AND_DEF_CFA(reg,off)
     #define CFI_ESCAPE(...)
 #endif
 
+#define CFI_REG_eax 0
+#define CFI_REG_ecx 1
+#define CFI_REG_edx 2
+#define CFI_REG_ebx 3
+#define CFI_REG_esp 4
+#define CFI_REG_ebp 5
+#define CFI_REG_esi 6
+#define CFI_REG_edi 7
+#define CFI_REG_eip 8
+
+#define CFI_REG(reg) CFI_REG_##reg
+
+MACRO3(CFI_EXPRESSION_BREG, n, b, offset)
+    .if (-0x40 <= (\offset)) && ((\offset) < 0x40)
+        CFI_EXPRESSION_BREG_1(\n, \b, \offset)
+    .elseif (-0x2000 <= (\offset)) && ((\offset) < 0x2000)
+        CFI_EXPRESSION_BREG_2(\n, \b, \offset)
+    .else
+        .error "Unsupported offset"
+    .endif
+END_MACRO
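+// Example usage (illustrative): `CFI_EXPRESSION_BREG CFI_REG(esi), CFI_REG(ebx), 52`
+// emits a DW_CFA_expression rule stating that ESI is saved at [EBX + 52]; the .if above
+// selects the 1-byte or 2-byte offset encoding from interpreter/cfi_asm_support.h.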
+
     // Symbols. On a Mac, we need a leading underscore.
 #if !defined(__APPLE__)
     #define SYMBOL(name) name
@@ -172,6 +197,16 @@
     CFI_RESTORE(REG_VAR(reg))
 END_MACRO
 
+MACRO1(INCREASE_FRAME, frame_adjustment)
+    subl MACRO_LITERAL(RAW_VAR(frame_adjustment)), %esp
+    CFI_ADJUST_CFA_OFFSET((RAW_VAR(frame_adjustment)))
+END_MACRO
+
+MACRO1(DECREASE_FRAME, frame_adjustment)
+    addl MACRO_LITERAL(RAW_VAR(frame_adjustment)), %esp
+    CFI_ADJUST_CFA_OFFSET(-(RAW_VAR(frame_adjustment)))
+END_MACRO
+
 #define UNREACHABLE int3
 
 MACRO1(UNIMPLEMENTED,name)
@@ -197,6 +232,13 @@
 #endif
 END_MACRO
 
+MACRO2(LOAD_RUNTIME_INSTANCE, reg, got_reg)
+    SETUP_GOT_NOSAVE \got_reg
+    // Load Runtime::instance_ from GOT.
+    movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(reg)
+    movl (REG_VAR(reg)), REG_VAR(reg)
+END_MACRO
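+// Example usage (illustrative): `LOAD_RUNTIME_INSTANCE ecx, ebx` leaves Runtime::instance_
+// in ECX; EBX is clobbered by SETUP_GOT_NOSAVE to hold the GOT base for the PIC load.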
+
 // Macros to poison (negate) the reference for heap poisoning.
 MACRO1(POISON_HEAP_REF, rRef)
 #ifdef USE_HEAP_POISONING
@@ -223,8 +265,7 @@
     PUSH_ARG edx
     PUSH_ARG ecx
     // Create space for FPR args.
-    subl MACRO_LITERAL(4 * 8), %esp
-    CFI_ADJUST_CFA_OFFSET(4 * 8)
+    INCREASE_FRAME 4 * 8
     // Save FPRs.
     movsd %xmm0, 0(%esp)
     movsd %xmm1, 8(%esp)
@@ -246,8 +287,7 @@
     movsd 20(%esp), %xmm2
     movsd 28(%esp), %xmm3
 
-    addl MACRO_LITERAL(36), %esp  // Remove FPRs and method pointer.
-    CFI_ADJUST_CFA_OFFSET(-36)
+    DECREASE_FRAME 36             // Remove FPRs and method pointer.
 
     POP_ARG ecx                   // Restore args
     POP_ARG edx
@@ -263,12 +303,12 @@
      */
 MACRO0(DELIVER_PENDING_EXCEPTION_FRAME_READY)
     // Outgoing argument set up
-    subl MACRO_LITERAL(12), %esp               // alignment padding
-    CFI_ADJUST_CFA_OFFSET(12)
+    INCREASE_FRAME 12                          // alignment padding
     pushl %fs:THREAD_SELF_OFFSET               // pass Thread::Current()
     CFI_ADJUST_CFA_OFFSET(4)
     call SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*)
     UNREACHABLE
+    CFI_ADJUST_CFA_OFFSET(-16)                 // Reset CFA in case there is more code afterwards.
 END_MACRO
 
 #endif  // ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_S_
diff --git a/runtime/arch/x86/jni_entrypoints_x86.S b/runtime/arch/x86/jni_entrypoints_x86.S
index 086e96f..a1a371c 100644
--- a/runtime/arch/x86/jni_entrypoints_x86.S
+++ b/runtime/arch/x86/jni_entrypoints_x86.S
@@ -20,9 +20,8 @@
      * Jni dlsym lookup stub.
      */
 DEFINE_FUNCTION art_jni_dlsym_lookup_stub
-    subl LITERAL(8), %esp         // align stack
-    CFI_ADJUST_CFA_OFFSET(8)
-    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
+    INCREASE_FRAME 8              // Align stack.
+    pushl %fs:THREAD_SELF_OFFSET  // Pass Thread::Current().
     CFI_ADJUST_CFA_OFFSET(4)
     // Call artFindNativeMethod() for normal native and artFindNativeMethodRunnable()
     // for @FastNative or @CriticalNative.
@@ -32,17 +31,16 @@
     movl (%eax), %eax                                // ArtMethod* method
     testl LITERAL(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE | ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE), \
           ART_METHOD_ACCESS_FLAGS_OFFSET(%eax)
-    jne .Llookup_stub_fast_native
+    jne .Llookup_stub_fast_or_critical_native
     call SYMBOL(artFindNativeMethod)  // (Thread*)
     jmp .Llookup_stub_continue
-.Llookup_stub_fast_native:
+.Llookup_stub_fast_or_critical_native:
     call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
 .Llookup_stub_continue:
-    addl LITERAL(12), %esp        // remove argument & padding
-    CFI_ADJUST_CFA_OFFSET(-12)
-    testl %eax, %eax              // check if returned method code is null
-    jz .Lno_native_code_found     // if null, jump to return to handle
-    jmp *%eax                     // otherwise, tail call to intended method
+    DECREASE_FRAME 12             // Remove argument & padding.
+    testl %eax, %eax              // Check if returned method code is null.
+    jz .Lno_native_code_found     // If null, jump to return to handle.
+    jmp *%eax                     // Otherwise, tail call to intended method.
 .Lno_native_code_found:
     ret
 END_FUNCTION art_jni_dlsym_lookup_stub
@@ -53,31 +51,94 @@
     testl LITERAL(1), %eax
     jnz art_jni_dlsym_lookup_stub
 
-    // We need to create a GenericJNI managed frame above the stack args.
+    // Since the native call args are all on the stack, we can use the managed args
+    // registers as scratch registers. So, EBX, EDX and ECX are available.
 
-    // GenericJNI frame is similar to SaveRegsAndArgs frame with the native method
-    // instead of runtime method saved at the bottom. Note that the runtime shall
-    // not examine the args here, otherwise we would have to reload them from stack
-    // to account for the difference between managed and native ABIs.
-    SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
-    pushl %eax  // Save the hidden arg as method pointer at the bottom of the stack.
-    CFI_ADJUST_CFA_OFFSET(4)
+    // Load caller PC.
+    movl (%esp), %ecx
 
-    // Call artCriticalNativeOutArgsSize(method); method is conveniently at the bottom of the stack.
-    call SYMBOL(artCriticalNativeOutArgsSize)
+    // Save the caller method from the hidden arg.
+    PUSH_ARG eax
 
-    // Check if we have any stack args other than return PC.
-    cmp LITERAL(__SIZEOF_POINTER__), %eax
-    jnz .Lcritical_has_stack_args
+    // Call artCriticalNativeFrameSize(method, caller_pc).
+    PUSH_ARG ecx                  // Pass caller PC.
+    PUSH_ARG eax                  // Pass method.
+    call SYMBOL(artCriticalNativeFrameSize)  // (method, caller_pc)
+    DECREASE_FRAME 8              // Remove args.
 
-    // Without stack args, the frame is fully constructed.
-    // Place tagged managed sp in Thread::Current()->top_quick_frame.
-    leal 1(%esp), %eax  // Tag as GenericJNI frame.
-    mov %eax, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
+    // Restore method register to EBX.
+    POP_ARG ebx
+
+    // Load caller PC to EDX and redefine return PC for CFI.
+    movl (%esp), %edx
+    CFI_REGISTER(%eip, %edx)
+
+    // Reserve space for a SaveRefsAndArgs managed frame, either for the actual runtime
+    // method or for a GenericJNI frame which is similar but has a native method and a tag.
+    INCREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__
+
+    // Calculate the number of DWORDs to move.
+    movl %eax, %ecx
+    shrl LITERAL(2), %ecx
+    jecxz .Lcritical_skip_copy_args
+
+    // Save EDI, ESI so that we can use them for moving stack args.
+    PUSH edi
+    PUSH esi
+
+    // Move the stack args.
+    leal 2 * __SIZEOF_POINTER__(%esp), %edi
+    leal FRAME_SIZE_SAVE_REFS_AND_ARGS(%edi), %esi
+    rep movsd
+
+    // Restore EDI, ESI.
+    POP esi
+    POP edi
+
+.Lcritical_skip_copy_args:
+    // Calculate the base address of the managed frame.
+    leal (%esp, %eax, 1), %eax
+
+    leal 1(%eax), %ecx            // Prepare managed SP tagged for a GenericJNI frame.
+    testl LITERAL(ACCESS_FLAGS_METHOD_IS_NATIVE), ART_METHOD_ACCESS_FLAGS_OFFSET(%ebx)
+    jnz .Lcritical_skip_prepare_runtime_method
+
+    // Save the return PC for managed stack walk.
+    // (When coming from a compiled stub, the correct return PC is already there.)
+    movl %edx, FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__(%eax)
+
+    // Replace the target method with the SaveRefsAndArgs runtime method.
+    LOAD_RUNTIME_INSTANCE ecx, ebx
+    movl RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET(%ecx), %ebx
+
+    movl %eax, %ecx               // Prepare untagged managed SP for the runtime method.
+
+.Lcritical_skip_prepare_runtime_method:
+    // Store the method on the bottom of the managed frame.
+    movl %ebx, (%eax)
+
+    // Move the managed frame address to native callee-save register EBX.
+    movl %eax, %ebx
+
+    // Spill registers for the SaveRefsAndArgs frame above the stack args.
+    movl %edi, 56(%ebx)
+    CFI_EXPRESSION_BREG CFI_REG(edi), CFI_REG(ebx), 56
+    movl %esi, 52(%ebx)
+    CFI_EXPRESSION_BREG CFI_REG(esi), CFI_REG(ebx), 52
+    movl %ebp, 48(%ebx)
+    CFI_EXPRESSION_BREG CFI_REG(ebp), CFI_REG(ebx), 48
+    // Skip managed ABI args EBX, EDX, ECX and FPRs. The runtime shall not examine the
+    // args in the managed frame. (We have already clobbered EBX, EDX, ECX anyway.)
+
+    // Place (maybe tagged) managed SP in Thread::Current()->top_quick_frame.
+    movl %ecx, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
+
+    // Save our return PC in a slot reserved for first FP arg in managed ABI.
+    movl %edx, __SIZEOF_POINTER__(%ebx)
+    CFI_EXPRESSION_BREG CFI_REG(eip), CFI_REG(ebx), __SIZEOF_POINTER__
 
     // Call artFindNativeMethodRunnable()
-    subl LITERAL(12), %esp         // align stack
-    CFI_ADJUST_CFA_OFFSET(12)
+    INCREASE_FRAME 12             // Align stack.
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
     CFI_ADJUST_CFA_OFFSET(4)
     call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
@@ -86,87 +147,15 @@
 
     // Check for exception.
     test %eax, %eax
-    jz 1f
+    jz .Lcritical_deliver_exception
 
-    // Restore frame and do the tail call.
     CFI_REMEMBER_STATE
-    RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    jmp *%eax
-    CFI_RESTORE_STATE_AND_DEF_CFA(%esp, FRAME_SIZE_SAVE_REFS_AND_ARGS)
 
-1:
-    DELIVER_PENDING_EXCEPTION_FRAME_READY
+    // Remember our return PC in EDX.
+    movl __SIZEOF_POINTER__(%ebx), %edx
+    CFI_REGISTER(%eip, %edx)
 
-.Lcritical_has_stack_args:
-    // As mentioned above, the runtime shall not examine the args in the managed frame
-    // and since all args for the native call are on the stack, we can use the managed
-    // args registers as scratch registers. So, EBX, EDX and ECX are available and we
-    // do not need to restore xmm0-xmm3 either.
-
-    // Restore registers as we're about to move stack args over the current SaveRefsAndArgs frame.
-    movl (%esp), %edx   // Remember the method in EDX.
-    movl 48(%esp), %ebp
-    CFI_RESTORE(%ebp)
-    movl 52(%esp), %esi
-    CFI_RESTORE(%esi)
-    movl 56(%esp), %edi
-    CFI_RESTORE(%edi)
-
-    // Calculate the address of the end of the move destination and redefine CFI to take
-    // ownership of the JNI stub frame. EBX is conveniently callee-save in native ABI.
-    leal 0(%esp, %eax, 1), %ebx
-    CFI_DEF_CFA(%ebx, FRAME_SIZE_SAVE_REFS_AND_ARGS)
-
-    // Calculate the number of DWORDs to move.
-    shrl LITERAL(2), %eax
-    leal -1(%eax), %ecx  // Do not move the return PC.
-
-    // Load our return PC to EAX.
-    movl FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__(%esp), %eax
-
-    // Save EDI, ESI so that we can use them for moving stack args.
-    pushl %edi  // No `CFI_ADJUST_CFA_OFFSET`, CFA register is currently EBX, not ESP.
-    pushl %esi  // ditto
-
-    // Mov the stack args.
-    leal 2 * __SIZEOF_POINTER__(%esp), %edi
-    leal FRAME_SIZE_SAVE_REFS_AND_ARGS(%edi), %esi
-    rep movsd
-
-    // Save our return PC.
-    movl %eax, (%edi)
-
-    // Restore EDI, ESI.
-    popl %esi   // No `CFI_ADJUST_CFA_OFFSET`, CFA register is currently EBX, not ESP.
-    popl %edi   // ditto
-
-    // Re-create the SaveRefsAndArgs frame above the args.
-    movl %edi, 56(%ebx)
-    CFI_REL_OFFSET(%edi, 56)
-    movl %esi, 52(%ebx)
-    CFI_REL_OFFSET(%esi, 52)
-    movl %ebp, 48(%ebx)
-    CFI_REL_OFFSET(%ebp, 48)
-    // Skip managed ABI args EBX, EDX, ECX and FPRs, see above.
-    // (We have already clobbered EBX, EDX, ECX anyway).
-    movl %edx, (%ebx)    // Save method pointer.
-
-    // Place tagged managed sp in Thread::Current()->top_quick_frame.
-    leal 1(%ebx), %eax  // Tag as GenericJNI frame.
-    movl %eax, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
-
-    // Call artFindNativeMethodRunnable()
-    subl LITERAL(12), %esp        // align stack, no `CFI_ADJUST_CFA_OFFSET`.
-    pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
-    call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
-    addl LITERAL(16), %esp        // Pop args, no `CFI_ADJUST_CFA_OFFSET`.
-
-    // Check for exception.
-    test %eax, %eax
-    jz 2f
-
-    // Restore the frame. We shall not need the method anymore.
-    CFI_REMEMBER_STATE
+    // Restore callee-save registers from the frame. We shall not need the method anymore.
     movl 48(%ebx), %ebp
     CFI_RESTORE(%ebp)
     movl 52(%ebx), %esi
@@ -174,50 +163,39 @@
     movl 56(%ebx), %edi
     CFI_RESTORE(%edi)
 
-    // Remember our return PC in EDX.
-    movl -__SIZEOF_POINTER__(%ebx), %edx
-
     // Calculate the number of DWORDs to move.
-    leal -__SIZEOF_POINTER__(%ebx), %ecx  // Do not move return PC.
+    movl %ebx, %ecx
     subl %esp, %ecx
     shrl LITERAL(2), %ecx
+    jecxz .Lcritical_skip_copy_args_back
 
     // Save EDI, ESI so that we can use them for moving stack args.
-    pushl %edi  // No `CFI_ADJUST_CFA_OFFSET`, CFA register is currently EBX, not ESP.
-    pushl %esi  // ditto
+    PUSH edi
+    PUSH esi
 
-    // Mov stack args to their original place.
-    leal -2 * __SIZEOF_POINTER__(%ebx), %esi
-    leal FRAME_SIZE_SAVE_REFS_AND_ARGS - 2 * __SIZEOF_POINTER__(%ebx), %edi
+    // Move stack args to their original place.
+    leal -__SIZEOF_POINTER__(%ebx), %esi
+    leal FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__(%ebx), %edi
     std
     rep movsd
     cld
 
-    // Store our return PC.
-    movl %edx, (%edi)
-
     // Restore EDI, ESI.
-    popl %esi   // No `CFI_ADJUST_CFA_OFFSET`, CFA register is currently EBX, not ESP.
-    popl %edi   // ditto
+    POP esi
+    POP edi
 
-    // Redefine CFI to release ownership of the JNI stub frame.
-    CFI_DEF_CFA(%esp, FRAME_SIZE_SAVE_REFS_AND_ARGS)
-
+.Lcritical_skip_copy_args_back:
     // Remove the frame reservation.
-    addl LITERAL(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__), %esp
-    CFI_ADJUST_CFA_OFFSET(-FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)
+    DECREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__
+
+    // Store our return PC.
+    movl %edx, (%esp)
+    CFI_REL_OFFSET(%eip, 0)
 
     // Do the tail call.
     jmp *%eax
-    CFI_RESTORE_STATE_AND_DEF_CFA(%ebx, FRAME_SIZE_SAVE_REFS_AND_ARGS)
+    CFI_RESTORE_STATE_AND_DEF_CFA(%esp, FRAME_SIZE_SAVE_REFS_AND_ARGS)
 
-2:
-    // Replicate DELIVER_PENDING_EXCEPTION_FRAME_READY without CFI_ADJUST_CFA_OFFSET,
-    // CFA register is currently EBX, not ESP.
-
-    // Outgoing argument set up
-    subl MACRO_LITERAL(12), %esp               // alignment padding
-    pushl %fs:THREAD_SELF_OFFSET               // pass Thread::Current()
-    call SYMBOL(artDeliverPendingExceptionFromCode)  // artDeliverPendingExceptionFromCode(Thread*)
-    UNREACHABLE
+.Lcritical_deliver_exception:
+    DELIVER_PENDING_EXCEPTION_FRAME_READY
 END_FUNCTION art_jni_dlsym_lookup_critical_stub
diff --git a/runtime/arch/x86/jni_frame_x86.h b/runtime/arch/x86/jni_frame_x86.h
index e710179..15ccff8 100644
--- a/runtime/arch/x86/jni_frame_x86.h
+++ b/runtime/arch/x86/jni_frame_x86.h
@@ -33,31 +33,47 @@
 static constexpr size_t kNativeStackAlignment = 16;  // IA-32 cdecl requires 16 byte alignment.
 static_assert(kNativeStackAlignment == kStackAlignment);
 
-// Get the size of "out args" for @CriticalNative method stub.
-// This must match the size of the frame emitted by the JNI compiler at the native call site.
-inline size_t GetCriticalNativeOutArgsSize(const char* shorty, uint32_t shorty_len) {
+// Get the size of the arguments for a native call.
+inline size_t GetNativeOutArgsSize(size_t num_args, size_t num_long_or_double_args) {
+  size_t num_arg_words = num_args + num_long_or_double_args;
+  return num_arg_words * static_cast<size_t>(kX86PointerSize);
+}
+
+// Get stack args size for @CriticalNative method calls.
+inline size_t GetCriticalNativeCallArgsSize(const char* shorty, uint32_t shorty_len) {
   DCHECK_EQ(shorty_len, strlen(shorty));
 
-  size_t num_long_or_double_args = 0u;
-  for (size_t i = 1; i != shorty_len; ++i) {
-    if (shorty[i] == 'J' || shorty[i] == 'D') {
-      num_long_or_double_args += 1u;
-    }
-  }
-  size_t num_arg_words = shorty_len - 1u + num_long_or_double_args;
+  size_t num_long_or_double_args =
+      std::count_if(shorty + 1, shorty + shorty_len, [](char c) { return c == 'J' || c == 'D'; });
 
+  return GetNativeOutArgsSize(/*num_args=*/ shorty_len - 1u, num_long_or_double_args);
+}
+
+// Get the frame size for @CriticalNative method stub.
+// This must match the size of the frame emitted by the JNI compiler at the native call site.
+inline size_t GetCriticalNativeStubFrameSize(const char* shorty, uint32_t shorty_len) {
   // The size of outgoing arguments.
-  size_t size = num_arg_words * static_cast<size_t>(kX86PointerSize);
+  size_t size = GetCriticalNativeCallArgsSize(shorty, shorty_len);
 
-  // Add return address size.
-  size += kFramePointerSize;
   // We can make a tail call if there are no stack args and the return type is not
   // FP type (needs moving from ST0 to MMX0) and we do not need to extend the result.
   bool return_type_ok = shorty[0] == 'I' || shorty[0] == 'J' || shorty[0] == 'V';
-  if (return_type_ok && size == kFramePointerSize) {
-    return kFramePointerSize;
+  if (return_type_ok && size == 0u) {
+    return 0u;
   }
 
+  // Add return address size.
+  size += kFramePointerSize;
+  return RoundUp(size, kNativeStackAlignment);
+}
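+
+// Worked example (hypothetical shorties, illustration only): for shorty "III" (int
+// return, two int args) all args go on the stack, so the stub frame size is
+// RoundUp(2 * 4 + /*return PC*/ 4, kNativeStackAlignment) == 16; for shorty "V"
+// there are no stack args and the tail-call fast path returns 0.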
+
+// Get the frame size for direct call to a @CriticalNative method.
+// This must match the size of the extra frame emitted by the compiler at the native call site.
+inline size_t GetCriticalNativeDirectCallFrameSize(const char* shorty, uint32_t shorty_len) {
+  // The size of outgoing arguments.
+  size_t size = GetCriticalNativeCallArgsSize(shorty, shorty_len);
+
+  // No return PC to save, zero- and sign-extension and FP value moves are handled by the caller.
   return RoundUp(size, kNativeStackAlignment);
 }
 
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 4abdf70..b4155e0 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -31,10 +31,7 @@
     PUSH ebp
     subl MACRO_LITERAL(12), %esp  // Grow stack by 3 words.
     CFI_ADJUST_CFA_OFFSET(12)
-    SETUP_GOT_NOSAVE RAW_VAR(got_reg)
-    // Load Runtime::instance_ from GOT.
-    movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
-    movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
+    LOAD_RUNTIME_INSTANCE \temp_reg, \got_reg
     // Push save all callee-save method.
     pushl RUNTIME_SAVE_ALL_CALLEE_SAVES_METHOD_OFFSET(REG_VAR(temp_reg))
     CFI_ADJUST_CFA_OFFSET(4)
@@ -57,10 +54,7 @@
     PUSH ebp
     subl MACRO_LITERAL(12), %esp  // Grow stack by 3 words.
     CFI_ADJUST_CFA_OFFSET(12)
-    SETUP_GOT_NOSAVE RAW_VAR(got_reg)
-    // Load Runtime::instance_ from GOT.
-    movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
-    movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
+    LOAD_RUNTIME_INSTANCE \temp_reg, \got_reg
     // Push save all callee-save method.
     pushl RUNTIME_SAVE_REFS_ONLY_METHOD_OFFSET(REG_VAR(temp_reg))
     CFI_ADJUST_CFA_OFFSET(4)
@@ -87,10 +81,7 @@
     subl MACRO_LITERAL(8), %esp  // Grow stack by 2 words.
     CFI_ADJUST_CFA_OFFSET(8)
 
-    SETUP_GOT_NOSAVE RAW_VAR(got_reg)
-    // Load Runtime::instance_ from GOT.
-    movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
-    movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
+    LOAD_RUNTIME_INSTANCE \temp_reg, \got_reg
     // Push save all callee-save method.
     pushl RUNTIME_SAVE_REFS_ONLY_METHOD_OFFSET(REG_VAR(temp_reg))
     CFI_ADJUST_CFA_OFFSET(4)
@@ -122,10 +113,7 @@
 MACRO2(SETUP_SAVE_REFS_AND_ARGS_FRAME, got_reg, temp_reg)
     SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
 
-    SETUP_GOT_NOSAVE RAW_VAR(got_reg)
-    // Load Runtime::instance_ from GOT.
-    movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
-    movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
+    LOAD_RUNTIME_INSTANCE \temp_reg, \got_reg
     // Push save all callee-save method.
     pushl RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET(REG_VAR(temp_reg))
     CFI_ADJUST_CFA_OFFSET(4)
@@ -196,10 +184,7 @@
     movsd %xmm6, 60(%esp)
     movsd %xmm7, 68(%esp)
 
-    SETUP_GOT_NOSAVE RAW_VAR(got_reg)
-    // Load Runtime::instance_ from GOT.
-    movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
-    movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
+    LOAD_RUNTIME_INSTANCE \temp_reg, \got_reg
     // Push save everything callee-save method.
     pushl \runtime_method_offset(REG_VAR(temp_reg))
     CFI_ADJUST_CFA_OFFSET(4)
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index 6a60a98..be9f59a 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_ARCH_X86_64_ASM_SUPPORT_X86_64_S_
 
 #include "asm_support_x86_64.h"
+#include "interpreter/cfi_asm_support.h"
 
 // Regular gas(1) & current clang/llvm assembler support named macro parameters.
 #define MACRO0(macro_name) .macro macro_name
@@ -76,6 +77,7 @@
     #define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg
     #define CFI_RESTORE(reg) .cfi_restore reg
     #define CFI_REL_OFFSET(reg,size) .cfi_rel_offset reg,size
+    #define CFI_REGISTER(orig_reg, current_reg) .cfi_register orig_reg, current_reg
     #define CFI_REMEMBER_STATE .cfi_remember_state
     // The spec is not clear whether the CFA is part of the saved state and tools
     // differ in the behaviour, so explicitly set the CFA to avoid any ambiguity.
@@ -92,11 +94,43 @@
     #define CFI_DEF_CFA_REGISTER(reg)
     #define CFI_RESTORE(reg)
     #define CFI_REL_OFFSET(reg,size)
+    #define CFI_REGISTER(orig_reg, current_reg)
     #define CFI_REMEMBER_STATE
     #define CFI_RESTORE_STATE_AND_DEF_CFA(off)
     #define CFI_RESTORE_STATE
 #endif
 
+// The register numbers are a bit mixed up for x86-64.
+#define CFI_REG_rax 0
+#define CFI_REG_rcx 2
+#define CFI_REG_rdx 1
+#define CFI_REG_rbx 3
+#define CFI_REG_rsp 7
+#define CFI_REG_rbp 6
+#define CFI_REG_rsi 4
+#define CFI_REG_rdi 5
+#define CFI_REG_r8 8
+#define CFI_REG_r9 9
+#define CFI_REG_r10 10
+#define CFI_REG_r11 11
+#define CFI_REG_r12 12
+#define CFI_REG_r13 13
+#define CFI_REG_r14 14
+#define CFI_REG_r15 15
+#define CFI_REG_rip 16
+
+#define CFI_REG(reg) CFI_REG_##reg
+
+MACRO3(CFI_EXPRESSION_BREG, n, b, offset)
+    .if (-0x40 <= (\offset)) && ((\offset) < 0x40)
+        CFI_EXPRESSION_BREG_1(\n, \b, \offset)
+    .elseif (-0x2000 <= (\offset)) && ((\offset) < 0x2000)
+        CFI_EXPRESSION_BREG_2(\n, \b, \offset)
+    .else
+        .error "Unsupported offset"
+    .endif
+END_MACRO
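+// Example usage (illustrative): `CFI_EXPRESSION_BREG CFI_REG(r15), CFI_REG(rbp), 192`
+// emits a DW_CFA_expression rule stating that R15 is saved at [RBP + 192]; since 192 is
+// outside the short range, the 2-byte offset encoding from interpreter/cfi_asm_support.h
+// is chosen by the .elseif branch above.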
+
     // Symbols.
 #if !defined(__APPLE__)
     #define SYMBOL(name) name
@@ -178,6 +212,16 @@
     CFI_RESTORE(REG_VAR(reg))
 END_MACRO
 
+MACRO1(INCREASE_FRAME, frame_adjustment)
+    subq MACRO_LITERAL(RAW_VAR(frame_adjustment)), %rsp
+    CFI_ADJUST_CFA_OFFSET((RAW_VAR(frame_adjustment)))
+END_MACRO
+
+MACRO1(DECREASE_FRAME, frame_adjustment)
+    addq MACRO_LITERAL(RAW_VAR(frame_adjustment)), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(RAW_VAR(frame_adjustment)))
+END_MACRO
+
 MACRO1(UNIMPLEMENTED,name)
     FUNCTION_TYPE(SYMBOL(\name))
     ASM_HIDDEN VAR(name)
@@ -199,6 +243,11 @@
     int3
 END_MACRO
 
+MACRO1(LOAD_RUNTIME_INSTANCE, reg)
+    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), REG_VAR(reg)
+    movq (REG_VAR(reg)), REG_VAR(reg)
+END_MACRO
+
 // Macros to poison (negate) the reference for heap poisoning.
 MACRO1(POISON_HEAP_REF, rRef)
 #ifdef USE_HEAP_POISONING
@@ -223,8 +272,7 @@
     int3
 #else
     // R10 := Runtime::Current()
-    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
-    movq (%r10), %r10
+    LOAD_RUNTIME_INSTANCE r10
     // Save callee and GPR args, mixed together to agree with core spills bitmap.
     PUSH r15  // Callee save.
     PUSH r14  // Callee save.
@@ -233,8 +281,7 @@
     PUSH rbp  // Callee save.
     PUSH rbx  // Callee save.
     // Create space for FPR args, plus space for ArtMethod*.
-    subq LITERAL(8 + 4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(8 + 4 * 8)
+    INCREASE_FRAME 8 + 4 * 8
     // Save FPRs.
     movq %xmm12, 8(%rsp)
     movq %xmm13, 16(%rsp)
@@ -260,8 +307,7 @@
     movq 16(%rsp), %xmm13
     movq 24(%rsp), %xmm14
     movq 32(%rsp), %xmm15
-    addq LITERAL(8 + 4*8), %rsp
-    CFI_ADJUST_CFA_OFFSET(-8 - 4*8)
+    DECREASE_FRAME 8 + 4*8
     // TODO: optimize by not restoring callee-saves restored by the ABI
     POP rbx
     POP rbp
@@ -289,8 +335,7 @@
     PUSH_ARG rdx  // Quick arg 2.
     PUSH_ARG rcx  // Quick arg 3.
     // Create space for FPR args and create 2 slots for ArtMethod*.
-    subq MACRO_LITERAL(16 + 12 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(16 + 12 * 8)
+    INCREASE_FRAME 16 + 12 * 8
     // Save FPRs.
     movq %xmm0, 16(%rsp)
     movq %xmm1, 24(%rsp)
@@ -326,8 +371,7 @@
     movq 88(%rsp), %xmm13
     movq 96(%rsp), %xmm14
     movq 104(%rsp), %xmm15
-    addq MACRO_LITERAL(80 + 4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(-(80 + 4 * 8))
+    DECREASE_FRAME 80 + 4 * 8
     // Restore callee and GPR args, mixed together to agree with core spills bitmap.
     POP_ARG rcx
     POP_ARG rdx
@@ -352,8 +396,7 @@
     int3
 #else
     // R10 := Runtime::Current()
-    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
-    movq (%r10), %r10
+    LOAD_RUNTIME_INSTANCE r10
     // Save callee save registers to agree with core spills bitmap.
     PUSH r15  // Callee save.
     PUSH r14  // Callee save.
@@ -362,8 +405,7 @@
     PUSH rbp  // Callee save.
     PUSH rbx  // Callee save.
     // Create space for FPR args, plus space for ArtMethod*.
-    subq MACRO_LITERAL(4 * 8 + 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(4 * 8 + 8)
+    INCREASE_FRAME 4 * 8 + 8
     // Save FPRs.
     movq %xmm12, 8(%rsp)
     movq %xmm13, 16(%rsp)
@@ -386,8 +428,7 @@
 
 MACRO0(SETUP_FP_CALLEE_SAVE_FRAME)
     // Create space for ART FP callee-saved registers
-    subq MACRO_LITERAL(4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(4 * 8)
+    INCREASE_FRAME 4 * 8
     movq %xmm12, 0(%rsp)
     movq %xmm13, 8(%rsp)
     movq %xmm14, 16(%rsp)
@@ -400,8 +441,7 @@
     movq 8(%rsp), %xmm13
     movq 16(%rsp), %xmm14
     movq 24(%rsp), %xmm15
-    addq MACRO_LITERAL(4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(- 4 * 8)
+    DECREASE_FRAME 4 * 8
 END_MACRO
 
     /*
diff --git a/runtime/arch/x86_64/jni_entrypoints_x86_64.S b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
index 5c80589..a1b8c7b 100644
--- a/runtime/arch/x86_64/jni_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
@@ -28,8 +28,7 @@
     PUSH_ARG rdx  // Arg.
     PUSH_ARG rcx  // Arg.
     // Create space for FPR args, plus padding for alignment
-    subq LITERAL(72), %rsp
-    CFI_ADJUST_CFA_OFFSET(72)
+    INCREASE_FRAME 72
     // Save FPRs.
     movq %xmm0, 0(%rsp)
     movq %xmm1, 8(%rsp)
@@ -48,10 +47,10 @@
     movq (%rax), %rax                                // ArtMethod* method
     testl LITERAL(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE | ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE), \
           ART_METHOD_ACCESS_FLAGS_OFFSET(%rax)
-    jne .Llookup_stub_fast_native
+    jne .Llookup_stub_fast_or_critical_native
     call SYMBOL(artFindNativeMethod)  // (Thread*)
     jmp .Llookup_stub_continue
-.Llookup_stub_fast_native:
+.Llookup_stub_fast_or_critical_native:
     call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
 .Llookup_stub_continue:
     // restore arguments
@@ -63,8 +62,7 @@
     movq 40(%rsp), %xmm5
     movq 48(%rsp), %xmm6
     movq 56(%rsp), %xmm7
-    addq LITERAL(72), %rsp
-    CFI_ADJUST_CFA_OFFSET(-72)
+    DECREASE_FRAME 72
     POP_ARG rcx  // Arg.
     POP_ARG rdx  // Arg.
     POP_ARG rsi  // Arg.
@@ -84,29 +82,16 @@
     testq LITERAL(1), %rax
     jnz art_jni_dlsym_lookup_stub
 
-    // We need to create a GenericJNI managed frame above the stack args.
-
-    // GenericJNI frame is similar to SaveRegsAndArgs frame with the native method
-    // instead of runtime method saved at the bottom.
-
-    // As we always have "stack args" on x86-64 (due to xmm12-xmm15 being callee-save
-    // in managed ABI but caller-save in native ABI), do not create a proper frame yet
-    // as we do on other architectures where it's useful for no stack args case.
-
-    // Reserve space for the frame (return PC is on stack).
-    subq MACRO_LITERAL(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__), %rsp
-    CFI_ADJUST_CFA_OFFSET(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__)
-
-    // Save GPR args.
+    // Save GPR args and method.
     PUSH_ARG r9
     PUSH_ARG r8
     PUSH_ARG rdi
     PUSH_ARG rsi
     PUSH_ARG rdx
     PUSH_ARG rcx
+    PUSH_ARG rax
     // Create space for FPR args.
-    subq LITERAL(64), %rsp
-    CFI_ADJUST_CFA_OFFSET(64)
+    INCREASE_FRAME 8 * 8
     // Save FPRs.
     movq %xmm0, 0(%rsp)
     movq %xmm1, 8(%rsp)
@@ -116,119 +101,15 @@
     movq %xmm5, 40(%rsp)
     movq %xmm6, 48(%rsp)
     movq %xmm7, 56(%rsp)
+    // Note: It's the caller's responsibility to preserve xmm12-xmm15, as the tail call
+    // to native code may clobber them.
 
-    // Add alignment padding.
-    subq MACRO_LITERAL(__SIZEOF_POINTER__), %rsp
-    CFI_ADJUST_CFA_OFFSET(__SIZEOF_POINTER__)
-    // Save hidden arg.
-    PUSH_ARG rax
+    // Call artCriticalNativeFrameSize(method, caller_pc).
+    movq %rax, %rdi       // Pass the method from hidden arg.
+    movq 120(%rsp), %rsi  // Pass caller PC.
+    call SYMBOL(artCriticalNativeFrameSize)
 
-    // Call artCriticalNativeOutArgsSize(method).
-    movq %rax, %rdi  // Pass the method from hidden arg.
-    call SYMBOL(artCriticalNativeOutArgsSize)
-
-    // Calculate the address of the end of the move destination and redefine CFI to take
-    // ownership of the JNI stub frame.
-    leaq 16 * __SIZEOF_POINTER__(%rsp, %rax, 1), %r10  // 16 QWORDs of registers saved above.
-    CFI_DEF_CFA(%r10, FRAME_SIZE_SAVE_REFS_AND_ARGS)
-
-    // Calculate the number of QWORDs to move.
-    shrq LITERAL(3), %rax
-    leaq -1(%rax), %rcx  // Do not move the return PC.
-
-    // Load our return PC to EAX.
-    movq FRAME_SIZE_SAVE_REFS_AND_ARGS + (16 - 1) * __SIZEOF_POINTER__(%rsp), %rax
-
-    // Mov the stack args.
-    leaq 16 * __SIZEOF_POINTER__(%rsp), %rdi
-    leaq FRAME_SIZE_SAVE_REFS_AND_ARGS(%rdi), %rsi
-    rep movsq
-
-    // Save our return PC.
-    movq %rax, (%rdi)
-
-    // Pop the hidden arg and alignment padding.
-    popq %rax    // No `.cfi_adjust_cfa_offset`, CFA register is currently R10, not RSP.
-    addq MACRO_LITERAL(__SIZEOF_POINTER__), %rsp  // ditto
-
-    // Fill the SaveRefsAndArgs frame above the args, without actual args. Note that
-    // the runtime shall not examine the args here, otherwise we would have to move them in
-    // registers and stack to account for the difference between managed and native ABIs.
-    SAVE_REG_BASE r10, r15, 192
-    SAVE_REG_BASE r10, r14, 184
-    SAVE_REG_BASE r10, r13, 176
-    SAVE_REG_BASE r10, r12, 168
-    // Skip args r9, r8, rsi.
-    SAVE_REG_BASE r10, rbp, 136
-    SAVE_REG_BASE r10, rbx, 128
-    // Skip args rdx, rcx.
-    // Skip args xmm0-xmm7.
-    // Copy managed callee-saves xmm12-xmm15 from out args to the managed frame as they
-    // may theoretically store variables or unwinding data. (The compiled stub preserves
-    // them but the artCriticalNativeOutArgsSize() call above may clobber them.)
-    movq -5 * __SIZEOF_POINTER__(%r10), %xmm12
-    movq -4 * __SIZEOF_POINTER__(%r10), %xmm13
-    movq -3 * __SIZEOF_POINTER__(%r10), %xmm14
-    movq -2 * __SIZEOF_POINTER__(%r10), %xmm15
-    movq %xmm12, 80(%r10)
-    movq %xmm13, 88(%r10)
-    movq %xmm14, 96(%r10)
-    movq %xmm15, 104(%r10)
-    // Save the hidden arg as method pointer at the bottom of the stack.
-    movq %rax, (%r10)
-
-    // Move the frame register to a callee-save register.
-    movq %r10, %rbp
-    CFI_DEF_CFA_REGISTER(%rbp)
-
-    // Place tagged managed sp in Thread::Current()->top_quick_frame.
-    leaq 1(%rbp), %rax  // Tag as GenericJNI frame.
-    movq %rax, %gs:THREAD_TOP_QUICK_FRAME_OFFSET
-
-    // Call artFindNativeMethodRunnable()
-    movq %gs:THREAD_SELF_OFFSET, %rdi  // pass Thread::Current()
-    call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
-
-    // Check for exception.
-    test %rax, %rax
-    jz 2f
-
-    // Restore the frame. We shall not need the method anymore.
-    .cfi_remember_state
-    movq %rbp, %r10
-    CFI_DEF_CFA_REGISTER(%r10)
-    // Skip args xmm0-xmm7 and managed callee-saves xmm12-xmm15 (not needed for native call).
-    // Skip args rdx, rcx.
-    RESTORE_REG_BASE r10, rbx, 128
-    RESTORE_REG_BASE r10, rbp, 136
-    // Skip args r9, r8, rsi.
-    RESTORE_REG_BASE r10, r12, 168
-    RESTORE_REG_BASE r10, r13, 176
-    RESTORE_REG_BASE r10, r14, 184
-    RESTORE_REG_BASE r10, r15, 192
-
-    // Remember our return PC in R11.
-    movq -__SIZEOF_POINTER__(%r10), %r11
-
-    // Calculate the number of DWORDs to move.
-    leaq -(1 + 14) * __SIZEOF_POINTER__(%r10), %rcx  // Do not move return PC, 14 arg regs saved.
-    subq %rsp, %rcx
-    shrq LITERAL(3), %rcx
-
-    // Mov stack args to their original place.
-    leaq -2 * __SIZEOF_POINTER__(%r10), %rsi
-    leaq FRAME_SIZE_SAVE_REFS_AND_ARGS - 2 * __SIZEOF_POINTER__(%r10), %rdi
-    std
-    rep movsq
-    cld
-
-    // Store our return PC.
-    movq %r11, (%rdi)
-
-    // Redefine CFI to release ownership of the JNI stub frame.
-    CFI_DEF_CFA(%rsp, FRAME_SIZE_SAVE_REFS_AND_ARGS + 14 * __SIZEOF_POINTER__)
-
-    // Restore args.
+    // Restore registers.
     movq 0(%rsp), %xmm0
     movq 8(%rsp), %xmm1
     movq 16(%rsp), %xmm2
@@ -237,8 +118,8 @@
     movq 40(%rsp), %xmm5
     movq 48(%rsp), %xmm6
     movq 56(%rsp), %xmm7
-    addq LITERAL(64), %rsp
-    CFI_ADJUST_CFA_OFFSET(-64)
+    DECREASE_FRAME 8 * 8
+    POP_ARG r10  // Restore method to R10.
     POP_ARG rcx
     POP_ARG rdx
     POP_ARG rsi
@@ -246,17 +127,185 @@
     POP_ARG r8
     POP_ARG r9
 
+    // Load caller PC to R11 and redefine return PC for CFI.
+    movq (%rsp), %r11
+    CFI_REGISTER(%rip, %r11)
+
+    // Reserve space for a SaveRefsAndArgs managed frame, either for the actual runtime
+    // method or for a GenericJNI frame which is similar but has a native method and a tag.
+    INCREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__
+
+    // Calculate the number of QWORDs to move.
+    shrq LITERAL(3), %rax
+    jz .Lcritical_skip_copy_args
+
+    // Save RDI, RSI, RCX so that we can use them for moving stack args.
+    PUSH_ARG rdi
+    PUSH_ARG rsi
+    PUSH_ARG rcx
+
+    // Move the stack args.
+    movq %rax, %rcx
+    leaq 3 * __SIZEOF_POINTER__(%rsp), %rdi
+    leaq FRAME_SIZE_SAVE_REFS_AND_ARGS(%rdi), %rsi
+    rep movsq
+
+    // Restore RDI, RSI, RCX.
+    POP_ARG rcx
+    POP_ARG rsi
+    POP_ARG rdi
+
+.Lcritical_skip_copy_args:
+    // Calculate the base address of the managed frame.
+    leaq (%rsp, %rax, 8), %rax
+
+    // Spill registers for the SaveRefsAndArgs frame above the stack args.
+    // Note that the runtime shall not examine the args here, otherwise we would have to
+    // move them in registers and stack to account for the difference between managed and
+    // native ABIs. Do not update CFI while we hold the frame address in RAX and the values
+    // in registers are unchanged.
+    movq %r15, 192(%rax)
+    movq %r14, 184(%rax)
+    movq %r13, 176(%rax)
+    movq %r12, 168(%rax)
+    movq %r9, 160(%rax)
+    movq %r8, 152(%rax)
+    movq %rsi, 144(%rax)
+    movq %rbp, 136(%rax)
+    movq %rbx, 128(%rax)
+    movq %rdx, 120(%rax)
+    movq %rcx, 112(%rax)
+    movq %xmm0, 16(%rax)
+    movq %xmm1, 24(%rax)
+    movq %xmm2, 32(%rax)
+    movq %xmm3, 40(%rax)
+    movq %xmm4, 48(%rax)
+    movq %xmm5, 56(%rax)
+    movq %xmm6, 64(%rax)
+    movq %xmm7, 72(%rax)
+    // Skip managed ABI callee-saves xmm12-xmm15.
+
+    // Move the managed frame address to native callee-save register RBP and update CFI.
+    movq %rax, %rbp
+    CFI_EXPRESSION_BREG CFI_REG(r15), CFI_REG(rbp), 192
+    CFI_EXPRESSION_BREG CFI_REG(r14), CFI_REG(rbp), 184
+    CFI_EXPRESSION_BREG CFI_REG(r13), CFI_REG(rbp), 176
+    CFI_EXPRESSION_BREG CFI_REG(r12), CFI_REG(rbp), 168
+    // Skip args r9, r8, rsi.
+    CFI_EXPRESSION_BREG CFI_REG(rbp), CFI_REG(rbp), 136
+    CFI_EXPRESSION_BREG CFI_REG(rbx), CFI_REG(rbp), 128
+    // Skip args rdx, rcx.
+    // Skip args xmm0-xmm7.
+
+    leaq 1(%rbp), %rax            // Prepare managed SP tagged for a GenericJNI frame.
+    testl LITERAL(ACCESS_FLAGS_METHOD_IS_NATIVE), ART_METHOD_ACCESS_FLAGS_OFFSET(%r10)
+    jnz .Lcritical_skip_prepare_runtime_method
+
+    // Save the return PC for managed stack walk.
+    // (When coming from a compiled stub, the correct return PC is already there.)
+    movq %r11, FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__(%rbp)
+
+    // Replace the target method with the SaveRefsAndArgs runtime method.
+    LOAD_RUNTIME_INSTANCE r10
+    movq RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET(%r10), %r10
+
+    movq %rbp, %rax               // Prepare untagged managed SP for the runtime method.
+
+.Lcritical_skip_prepare_runtime_method:
+    // Store the method on the bottom of the managed frame.
+    movq %r10, (%rbp)
+
+    // Place (maybe tagged) managed SP in Thread::Current()->top_quick_frame.
+    movq %rax, %gs:THREAD_TOP_QUICK_FRAME_OFFSET
+
+    // Save our return PC in the padding.
+    movq %r11, __SIZEOF_POINTER__(%rbp)
+    CFI_EXPRESSION_BREG CFI_REG(rip), CFI_REG(rbp), __SIZEOF_POINTER__
+
+    // Preserve the native arg register RDI in callee-save register RBX which was saved above.
+    movq %rdi, %rbx
+
+    // Call artFindNativeMethodRunnable()
+    movq %gs:THREAD_SELF_OFFSET, %rdi  // pass Thread::Current()
+    call SYMBOL(artFindNativeMethodRunnable)  // (Thread*)
+
+    // Check for exception.
+    test %rax, %rax
+    jz .Lcritical_deliver_exception
+
+    CFI_REMEMBER_STATE
+
+    // Restore the native arg register RDI.
+    movq %rbx, %rdi
+
+    // Remember our return PC in R11.
+    movq __SIZEOF_POINTER__(%rbp), %r11
+    CFI_REGISTER(%rip, %r11)
+
+    // Remember the frame base address in r10 but do not redefine CFI.
+    movq %rbp, %r10
+
+    // Restore the frame. We shall not need the method anymore.
+    movq 16(%rbp), %xmm0
+    movq 24(%rbp), %xmm1
+    movq 32(%rbp), %xmm2
+    movq 40(%rbp), %xmm3
+    movq 48(%rbp), %xmm4
+    movq 56(%rbp), %xmm5
+    movq 64(%rbp), %xmm6
+    movq 72(%rbp), %xmm7
+    // Skip managed callee-saves xmm12-xmm15.
+    movq 112(%rbp), %rcx
+    movq 120(%rbp), %rdx
+    RESTORE_REG_BASE rbp, rbx, 128
+    // Delay restoring RBP as it's the managed frame base.
+    movq 144(%rbp), %rsi
+    movq 152(%rbp), %r8
+    movq 160(%rbp), %r9
+    RESTORE_REG_BASE rbp, r12, 168
+    RESTORE_REG_BASE rbp, r13, 176
+    RESTORE_REG_BASE rbp, r14, 184
+    RESTORE_REG_BASE rbp, r15, 192
+    // Restore RBP last.
+    RESTORE_REG_BASE rbp, rbp, 136
+
+    cmp %r10, %rsp
+    je .Lcritical_skip_copy_args_back
+
+    // Save RDI, RSI, RCX so that we can use them for moving stack args.
+    PUSH_ARG rdi
+    PUSH_ARG rsi
+    PUSH_ARG rcx
+
+    // Calculate the number of QWORDs to move.
+    leaq -3 * __SIZEOF_POINTER__(%r10), %rcx
+    subq %rsp, %rcx
+    shrq LITERAL(3), %rcx
+
+    // Move the stack args.
+    leaq -__SIZEOF_POINTER__(%r10), %rsi
+    leaq FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__(%r10), %rdi
+    std
+    rep movsq
+    cld
+
+    // Restore RDI, RSI, RCX.
+    POP_ARG rcx
+    POP_ARG rsi
+    POP_ARG rdi
+
+.Lcritical_skip_copy_args_back:
     // Remove the frame reservation.
-    addq LITERAL(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__), %rsp
-    CFI_ADJUST_CFA_OFFSET(-(FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__))
+    DECREASE_FRAME FRAME_SIZE_SAVE_REFS_AND_ARGS - __SIZEOF_POINTER__
+
+    // Store our return PC.
+    movq %r11, (%rsp)
+    CFI_REL_OFFSET(%rip, 0)
 
     // Do the tail call.
     jmp *%rax
     CFI_RESTORE_STATE_AND_DEF_CFA(%rbp, FRAME_SIZE_SAVE_REFS_AND_ARGS)
 
-2:
-    // Drop the args from the stack (the RAX and padding was already removed).
-    addq LITERAL(14 * __SIZEOF_POINTER__), %rsp
-
+.Lcritical_deliver_exception:
     DELIVER_PENDING_EXCEPTION_FRAME_READY
 END_FUNCTION art_jni_dlsym_lookup_critical_stub
diff --git a/runtime/arch/x86_64/jni_frame_x86_64.h b/runtime/arch/x86_64/jni_frame_x86_64.h
index 65736fe..959e266 100644
--- a/runtime/arch/x86_64/jni_frame_x86_64.h
+++ b/runtime/arch/x86_64/jni_frame_x86_64.h
@@ -46,28 +46,34 @@
 // -- JNI calling convention only (Managed excludes RDI, so it's actually 5).
 constexpr size_t kMaxIntLikeRegisterArguments = 6u;
 
-// Get the size of "out args" for @CriticalNative method stub.
-// This must match the size of the frame emitted by the JNI compiler at the native call site.
-inline size_t GetCriticalNativeOutArgsSize(const char* shorty, uint32_t shorty_len) {
-  DCHECK_EQ(shorty_len, strlen(shorty));
-
-  size_t num_fp_args = 0u;
-  for (size_t i = 1; i != shorty_len; ++i) {
-    if (shorty[i] == 'F' || shorty[i] == 'D') {
-      num_fp_args += 1u;
-    }
-  }
-  size_t num_non_fp_args = shorty_len - 1u - num_fp_args;
-
+// Get the size of the arguments for a native call.
+inline size_t GetNativeOutArgsSize(size_t num_fp_args, size_t num_non_fp_args) {
   // Account for FP arguments passed through Xmm0..Xmm7.
   size_t num_stack_fp_args =
       num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
   // Account for other (integer) arguments passed through GPR (RDI, RSI, RDX, RCX, R8, R9).
   size_t num_stack_non_fp_args =
       num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
-  // The size of outgoing arguments.
   static_assert(kFramePointerSize == kMmxSpillSize);
-  size_t size = (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
+  return (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
+}
+
+// Get stack args size for @CriticalNative method calls.
+inline size_t GetCriticalNativeCallArgsSize(const char* shorty, uint32_t shorty_len) {
+  DCHECK_EQ(shorty_len, strlen(shorty));
+
+  size_t num_fp_args =
+      std::count_if(shorty + 1, shorty + shorty_len, [](char c) { return c == 'F' || c == 'D'; });
+  size_t num_non_fp_args = shorty_len - 1u - num_fp_args;
+
+  return GetNativeOutArgsSize(num_fp_args, num_non_fp_args);
+}
+
+// Get the frame size for @CriticalNative method stub.
+// This must match the size of the frame emitted by the JNI compiler at the native call site.
+inline size_t GetCriticalNativeStubFrameSize(const char* shorty, uint32_t shorty_len) {
+  // The size of outgoing arguments.
+  size_t size = GetCriticalNativeCallArgsSize(shorty, shorty_len);
 
   // We always need to spill xmm12-xmm15 as they are managed callee-saves
   // but not native callee-saves.
@@ -78,6 +84,16 @@
   return RoundUp(size, kNativeStackAlignment);
 }
 
+// Get the frame size for direct call to a @CriticalNative method.
+// This must match the size of the extra frame emitted by the compiler at the native call site.
+inline size_t GetCriticalNativeDirectCallFrameSize(const char* shorty, uint32_t shorty_len) {
+  // The size of outgoing arguments.
+  size_t size = GetCriticalNativeCallArgsSize(shorty, shorty_len);
+
+  // No return PC to save, zero- and sign-extension are handled by the caller.
+  return RoundUp(size, kNativeStackAlignment);
+}
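+
+// Worked example (hypothetical shorty, illustration only): for shorty "IIIIIIIII"
+// (int return, eight int args), six args go in GPRs and two spill to the stack, so
+// GetCriticalNativeCallArgsSize() == 16 and GetCriticalNativeDirectCallFrameSize() ==
+// RoundUp(16, kNativeStackAlignment) == 16. (The stub frame is larger because it must
+// also spill xmm12-xmm15, as noted above.)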
+
 }  // namespace x86_64
 }  // namespace art
 
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 6a19bbb..e25045d 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -40,8 +40,7 @@
 #else
     SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
     // R10 := Runtime::Current()
-    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
-    movq (%r10), %r10
+    LOAD_RUNTIME_INSTANCE r10
     // R10 := ArtMethod* for ref and args callee save frame method.
     movq RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET(%r10), %r10
     // Store ArtMethod* to bottom of stack.
@@ -88,8 +87,7 @@
     subq MACRO_LITERAL(8 + 16 * 8), %rsp
     CFI_ADJUST_CFA_OFFSET(8 + 16 * 8)
     // R10 := Runtime::Current()
-    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
-    movq (%r10), %r10
+    LOAD_RUNTIME_INSTANCE r10
     // Save FPRs.
     movq %xmm0, 8(%rsp)
     movq %xmm1, 16(%rsp)
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index a2a45ce..2db2faa 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -392,24 +392,6 @@
   self->PopManagedStackFragment(fragment);
 }
 
-const void* ArtMethod::RegisterNative(const void* native_method) {
-  CHECK(IsNative()) << PrettyMethod();
-  CHECK(native_method != nullptr) << PrettyMethod();
-  void* new_native_method = nullptr;
-  Runtime::Current()->GetRuntimeCallbacks()->RegisterNativeMethod(this,
-                                                                  native_method,
-                                                                  /*out*/&new_native_method);
-  SetEntryPointFromJni(new_native_method);
-  return new_native_method;
-}
-
-void ArtMethod::UnregisterNative() {
-  CHECK(IsNative()) << PrettyMethod();
-  // restore stub to lookup native pointer via dlsym
-  SetEntryPointFromJni(
-      IsCriticalNative() ? GetJniDlsymLookupCriticalStub() : GetJniDlsymLookupStub());
-}
-
 bool ArtMethod::IsOverridableByDefaultMethod() {
   return GetDeclaringClass()->IsInterface();
 }
diff --git a/runtime/art_method.h b/runtime/art_method.h
index 70d8d15..16b4648 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -489,13 +489,6 @@
     ClearFastInterpreterToInterpreterInvokeFlag();
   }
 
-  // Registers the native method and returns the new entry point. NB The returned entry point might
-  // be different from the native_method argument if some MethodCallback modifies it.
-  const void* RegisterNative(const void* native_method)
-      REQUIRES_SHARED(Locks::mutator_lock_) WARN_UNUSED;
-
-  void UnregisterNative() REQUIRES_SHARED(Locks::mutator_lock_);
-
   static constexpr MemberOffset DataOffset(PointerSize pointer_size) {
     return MemberOffset(PtrSizedFieldsOffset(pointer_size) + OFFSETOF_MEMBER(
         PtrSizedFields, data_) / sizeof(void*) * static_cast<size_t>(pointer_size));
@@ -587,7 +580,9 @@
 
   void SetEntryPointFromJni(const void* entrypoint)
       REQUIRES_SHARED(Locks::mutator_lock_) {
-    DCHECK(IsNative());
+    // The resolution method also has a JNI entrypoint for direct calls from
+    // compiled code to the JNI dlsym lookup stub for @CriticalNative.
+    DCHECK(IsNative() || IsRuntimeMethod());
     SetEntryPointFromJniPtrSize(entrypoint, kRuntimePointerSize);
   }
 
@@ -837,6 +832,8 @@
     // Depending on the method type, the data is
     //   - native method: pointer to the JNI function registered to this method
     //                    or a function to resolve the JNI function,
+    //   - resolution method: pointer to a function to resolve the method and
+    //                        the JNI function for @CriticalNative,
     //   - conflict method: ImtConflictTable,
     //   - abstract/interface method: the single-implementation if any,
     //   - proxy method: the original interface method or constructor,
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index f24c5f4..f7fe27d 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -322,7 +322,7 @@
           vm->DeleteWeakGlobalRef(self, classes_[i]);
           if (klass != nullptr) {
             mirror::Class::SetStatus(klass, ClassStatus::kVisiblyInitialized, self);
-            class_linker_->FixupStaticTrampolines(klass.Get());
+            class_linker_->FixupStaticTrampolines(self, klass.Get());
           }
         }
         num_classes_ = 0u;
@@ -422,14 +422,14 @@
     // Thanks to the x86 memory model, we do not need any memory fences and
     // we can immediately mark the class as visibly initialized.
     mirror::Class::SetStatus(klass, ClassStatus::kVisiblyInitialized, self);
-    FixupStaticTrampolines(klass.Get());
+    FixupStaticTrampolines(self, klass.Get());
     return nullptr;
   }
   if (Runtime::Current()->IsActiveTransaction()) {
    // Transactions are single-threaded, so we can mark the class as visibly initialized.
     // (Otherwise we'd need to track the callback's entry in the transaction for rollback.)
     mirror::Class::SetStatus(klass, ClassStatus::kVisiblyInitialized, self);
-    FixupStaticTrampolines(klass.Get());
+    FixupStaticTrampolines(self, klass.Get());
     return nullptr;
   }
   mirror::Class::SetStatus(klass, ClassStatus::kInitialized, self);
@@ -449,6 +449,65 @@
   }
 }
 
+const void* ClassLinker::RegisterNative(
+    Thread* self, ArtMethod* method, const void* native_method) {
+  CHECK(method->IsNative()) << method->PrettyMethod();
+  CHECK(native_method != nullptr) << method->PrettyMethod();
+  void* new_native_method = nullptr;
+  Runtime* runtime = Runtime::Current();
+  runtime->GetRuntimeCallbacks()->RegisterNativeMethod(method,
+                                                       native_method,
+                                                       /*out*/&new_native_method);
+  if (method->IsCriticalNative()) {
+    MutexLock lock(self, critical_native_code_with_clinit_check_lock_);
+    // Remove old registered method if any.
+    auto it = critical_native_code_with_clinit_check_.find(method);
+    if (it != critical_native_code_with_clinit_check_.end()) {
+      critical_native_code_with_clinit_check_.erase(it);
+    }
+    // To ensure correct memory visibility, we need the class to be visibly
+    // initialized before we can set the JNI entrypoint.
+    if (method->GetDeclaringClass()->IsVisiblyInitialized()) {
+      method->SetEntryPointFromJni(new_native_method);
+    } else {
+      critical_native_code_with_clinit_check_.emplace(method, new_native_method);
+    }
+  } else {
+    method->SetEntryPointFromJni(new_native_method);
+  }
+  return new_native_method;
+}
+
+void ClassLinker::UnregisterNative(Thread* self, ArtMethod* method) {
+  CHECK(method->IsNative()) << method->PrettyMethod();
+  // Restore stub to lookup native pointer via dlsym.
+  if (method->IsCriticalNative()) {
+    MutexLock lock(self, critical_native_code_with_clinit_check_lock_);
+    auto it = critical_native_code_with_clinit_check_.find(method);
+    if (it != critical_native_code_with_clinit_check_.end()) {
+      critical_native_code_with_clinit_check_.erase(it);
+    }
+    method->SetEntryPointFromJni(GetJniDlsymLookupCriticalStub());
+  } else {
+    method->SetEntryPointFromJni(GetJniDlsymLookupStub());
+  }
+}
+
+const void* ClassLinker::GetRegisteredNative(Thread* self, ArtMethod* method) {
+  if (method->IsCriticalNative()) {
+    MutexLock lock(self, critical_native_code_with_clinit_check_lock_);
+    auto it = critical_native_code_with_clinit_check_.find(method);
+    if (it != critical_native_code_with_clinit_check_.end()) {
+      return it->second;
+    }
+    const void* native_code = method->GetEntryPointFromJni();
+    return IsJniDlsymLookupCriticalStub(native_code) ? nullptr : native_code;
+  } else {
+    const void* native_code = method->GetEntryPointFromJni();
+    return IsJniDlsymLookupStub(native_code) ? nullptr : native_code;
+  }
+}
+
 void ClassLinker::ThrowEarlierClassFailure(ObjPtr<mirror::Class> c,
                                            bool wrap_in_no_class_def,
                                            bool log) {
@@ -638,6 +697,8 @@
       image_pointer_size_(kRuntimePointerSize),
       visibly_initialized_callback_lock_("visibly initialized callback lock"),
       visibly_initialized_callback_(nullptr),
+      critical_native_code_with_clinit_check_lock_("critical native code with clinit check lock"),
+      critical_native_code_with_clinit_check_(),
       cha_(Runtime::Current()->IsAotCompiler() ? nullptr : new ClassHierarchyAnalysis()) {
   // For CHA disabled during Aot, see b/34193647.
 
@@ -2498,6 +2559,17 @@
     CHAOnDeleteUpdateClassVisitor visitor(data.allocator);
     data.class_table->Visit<CHAOnDeleteUpdateClassVisitor, kWithoutReadBarrier>(visitor);
   }
+  {
+    MutexLock lock(self, critical_native_code_with_clinit_check_lock_);
+    auto end = critical_native_code_with_clinit_check_.end();
+    for (auto it = critical_native_code_with_clinit_check_.begin(); it != end; ) {
+      if (data.allocator->ContainsUnsafe(it->first)) {
+        it = critical_native_code_with_clinit_check_.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
 
   delete data.allocator;
   delete data.class_table;
@@ -3531,15 +3603,31 @@
   return false;
 }
 
-void ClassLinker::FixupStaticTrampolines(ObjPtr<mirror::Class> klass) {
+void ClassLinker::FixupStaticTrampolines(Thread* self, ObjPtr<mirror::Class> klass) {
   ScopedAssertNoThreadSuspension sants(__FUNCTION__);
   DCHECK(klass->IsVisiblyInitialized()) << klass->PrettyDescriptor();
-  if (klass->NumDirectMethods() == 0) {
+  size_t num_direct_methods = klass->NumDirectMethods();
+  if (num_direct_methods == 0) {
     return;  // No direct methods => no static methods.
   }
   if (UNLIKELY(klass->IsProxyClass())) {
     return;
   }
+  PointerSize pointer_size = image_pointer_size_;
+  if (std::any_of(klass->GetDirectMethods(pointer_size).begin(),
+                  klass->GetDirectMethods(pointer_size).end(),
+                  [](const ArtMethod& m) { return m.IsCriticalNative(); })) {
+    // Store registered @CriticalNative methods, if any, to JNI entrypoints.
+    // Direct methods are a contiguous chunk of memory, so use the ordering of the map.
+    ArtMethod* first_method = klass->GetDirectMethod(0u, pointer_size);
+    ArtMethod* last_method = klass->GetDirectMethod(num_direct_methods - 1u, pointer_size);
+    MutexLock lock(self, critical_native_code_with_clinit_check_lock_);
+    auto lb = critical_native_code_with_clinit_check_.lower_bound(first_method);
+    while (lb != critical_native_code_with_clinit_check_.end() && lb->first <= last_method) {
+      lb->first->SetEntryPointFromJni(lb->second);
+      lb = critical_native_code_with_clinit_check_.erase(lb);
+    }
+  }
   Runtime* runtime = Runtime::Current();
   if (!runtime->IsStarted()) {
     if (runtime->IsAotCompiler() || runtime->GetHeap()->HasBootImageSpace()) {
@@ -3548,18 +3636,13 @@
   }
 
   const DexFile& dex_file = klass->GetDexFile();
-  const uint16_t class_def_idx = klass->GetDexClassDefIndex();
-  CHECK_NE(class_def_idx, DexFile::kDexNoIndex16);
-  ClassAccessor accessor(dex_file, class_def_idx);
-  // There should always be class data if there were direct methods.
-  CHECK(accessor.HasClassData()) << klass->PrettyDescriptor();
   bool has_oat_class;
   OatFile::OatClass oat_class = OatFile::FindOatClass(dex_file,
                                                       klass->GetDexClassDefIndex(),
                                                       &has_oat_class);
   // Link the code of methods skipped by LinkCode.
-  for (size_t method_index = 0; method_index < accessor.NumDirectMethods(); ++method_index) {
-    ArtMethod* method = klass->GetDirectMethod(method_index, image_pointer_size_);
+  for (size_t method_index = 0; method_index < num_direct_methods; ++method_index) {
+    ArtMethod* method = klass->GetDirectMethod(method_index, pointer_size);
     if (!method->IsStatic()) {
       // Only update static methods.
       continue;
@@ -3664,8 +3747,10 @@
   }
 
   if (method->IsNative()) {
-    // Unregistering restores the dlsym lookup stub.
-    method->UnregisterNative();
+    // Set up the dlsym lookup stub. Do not go through `UnregisterNative()`
+    // as the extra processing for @CriticalNative is not needed yet.
+    method->SetEntryPointFromJni(
+        method->IsCriticalNative() ? GetJniDlsymLookupCriticalStub() : GetJniDlsymLookupStub());
 
     if (enter_interpreter || quick_code == nullptr) {
       // We have a native method here without code. Then it should have the generic JNI
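
The RegisterNative() / FixupStaticTrampolines() interplay added above amounts to publish-after-initialization bookkeeping: a registered @CriticalNative entrypoint is parked in a map until the declaring class becomes visibly initialized and only then written into the method. A simplified sketch of that pattern, using plain standard-library types instead of ART's ArtMethod and Mutex (the names are illustrative only, and the methods in [first, last] are assumed to come from one contiguous array, as in ART):

    #include <map>
    #include <mutex>

    // Hypothetical stand-ins for ART types, for illustration only.
    struct Method {
      bool visibly_initialized = false;  // stands in for the declaring class state
      const void* jni_entrypoint = nullptr;
    };

    class Registry {
     public:
      // Like ClassLinker::RegisterNative() for @CriticalNative: publish the
      // entrypoint only if the class is visibly initialized, otherwise park it.
      void Register(Method* m, const void* code) {
        std::lock_guard<std::mutex> lock(lock_);
        pending_.erase(m);  // drop any previously parked registration
        if (m->visibly_initialized) {
          m->jni_entrypoint = code;
        } else {
          pending_.emplace(m, code);
        }
      }

      // Like the new block in FixupStaticTrampolines(): once the class becomes
      // visibly initialized, flush any parked entrypoints into the methods.
      // Assumes [first, last] is one contiguous chunk, so the map's pointer
      // ordering gives exactly the methods of that class.
      void OnVisiblyInitialized(Method* first, Method* last) {
        std::lock_guard<std::mutex> lock(lock_);
        for (auto it = pending_.lower_bound(first);
             it != pending_.end() && it->first <= last; ) {
          it->first->jni_entrypoint = it->second;
          it = pending_.erase(it);
        }
      }

     private:
      std::mutex lock_;
      std::map<Method*, const void*> pending_;
    };
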
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index 4731203..33cd2f9 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -18,6 +18,7 @@
 #define ART_RUNTIME_CLASS_LINKER_H_
 
 #include <list>
+#include <map>
 #include <set>
 #include <string>
 #include <type_traits>
@@ -41,6 +42,27 @@
 
 namespace art {
 
+class ArtField;
+class ArtMethod;
+class ClassHierarchyAnalysis;
+enum class ClassRoot : uint32_t;
+class ClassTable;
+class DexFile;
+template<class T> class Handle;
+class ImtConflictTable;
+template<typename T> class LengthPrefixedArray;
+template<class T> class MutableHandle;
+class InternTable;
+class LinearAlloc;
+class OatFile;
+template<class T> class ObjectLock;
+class Runtime;
+class ScopedObjectAccessAlreadyRunnable;
+template<size_t kNumReferences> class PACKED(4) StackHandleScope;
+class Thread;
+
+enum VisitRootFlags : uint8_t;
+
 namespace dex {
 struct ClassDef;
 struct MethodHandleItem;
@@ -75,27 +97,6 @@
 using MethodDexCacheType = std::atomic<MethodDexCachePair>;
 }  // namespace mirror
 
-class ArtField;
-class ArtMethod;
-class ClassHierarchyAnalysis;
-enum class ClassRoot : uint32_t;
-class ClassTable;
-class DexFile;
-template<class T> class Handle;
-class ImtConflictTable;
-template<typename T> class LengthPrefixedArray;
-template<class T> class MutableHandle;
-class InternTable;
-class LinearAlloc;
-class OatFile;
-template<class T> class ObjectLock;
-class Runtime;
-class ScopedObjectAccessAlreadyRunnable;
-template<size_t kNumReferences> class PACKED(4) StackHandleScope;
-class Thread;
-
-enum VisitRootFlags : uint8_t;
-
 class ClassVisitor {
  public:
   virtual ~ClassVisitor() {}
@@ -780,6 +781,19 @@
 
   void MakeInitializedClassesVisiblyInitialized(Thread* self, bool wait);
 
+  // Registers the native method and returns the new entry point. NB The returned entry point
+  // might be different from the native_method argument if some MethodCallback modifies it.
+  const void* RegisterNative(Thread* self, ArtMethod* method, const void* native_method)
+      REQUIRES_SHARED(Locks::mutator_lock_) WARN_UNUSED;
+
+  // Unregister native code for a method.
+  void UnregisterNative(Thread* self, ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Get the registered native method entrypoint, if any, otherwise null.
+  const void* GetRegisteredNative(Thread* self, ArtMethod* method)
+      REQUIRES_SHARED(Locks::mutator_lock_)
+      REQUIRES(!critical_native_code_with_clinit_check_lock_);
+
   struct DexCacheData {
     // Construct an invalid data object.
     DexCacheData()
@@ -956,7 +970,8 @@
                   ArtMethod* dst)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  void FixupStaticTrampolines(ObjPtr<mirror::Class> klass) REQUIRES_SHARED(Locks::mutator_lock_);
+  void FixupStaticTrampolines(Thread* self, ObjPtr<mirror::Class> klass)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Finds a class in a Path- or DexClassLoader, loading it if necessary without using JNI. Hash
   // function is supposed to be ComputeModifiedUtf8Hash(descriptor). Returns true if the
@@ -1443,6 +1458,13 @@
   IntrusiveForwardList<VisiblyInitializedCallback> running_visibly_initialized_callbacks_
       GUARDED_BY(visibly_initialized_callback_lock_);
 
+  // Registered native code for @CriticalNative methods of classes that are not visibly
+  // initialized. These code pointers cannot be stored in ArtMethod as that would risk
+  // skipping the class initialization check for direct calls from compiled code.
+  Mutex critical_native_code_with_clinit_check_lock_;
+  std::map<ArtMethod*, void*> critical_native_code_with_clinit_check_
+      GUARDED_BY(critical_native_code_with_clinit_check_lock_);
+
   std::unique_ptr<ClassHierarchyAnalysis> cha_;
 
   class FindVirtualMethodHolderVisitor;
diff --git a/runtime/entrypoints/entrypoint_utils.cc b/runtime/entrypoints/entrypoint_utils.cc
index 849a967..ef0c474 100644
--- a/runtime/entrypoints/entrypoint_utils.cc
+++ b/runtime/entrypoints/entrypoint_utils.cc
@@ -23,16 +23,19 @@
 #include "base/sdk_version.h"
 #include "class_linker-inl.h"
 #include "dex/dex_file-inl.h"
+#include "dex/method_reference.h"
 #include "entrypoints/entrypoint_utils-inl.h"
 #include "entrypoints/quick/callee_save_frame.h"
 #include "entrypoints/runtime_asm_entrypoints.h"
 #include "gc/accounting/card_table-inl.h"
+#include "index_bss_mapping.h"
 #include "jni/java_vm_ext.h"
 #include "mirror/class-inl.h"
 #include "mirror/method.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
 #include "nth_caller_visitor.h"
+#include "oat_file.h"
 #include "oat_quick_method_header.h"
 #include "reflection.h"
 #include "scoped_thread_state_change-inl.h"
@@ -281,4 +284,28 @@
   return method_type;
 }
 
+void MaybeUpdateBssMethodEntry(ArtMethod* callee, MethodReference callee_reference) {
+  DCHECK(callee != nullptr);
+  if (callee_reference.dex_file->GetOatDexFile() != nullptr) {
+    size_t bss_offset = IndexBssMappingLookup::GetBssOffset(
+        callee_reference.dex_file->GetOatDexFile()->GetMethodBssMapping(),
+        callee_reference.index,
+        callee_reference.dex_file->NumMethodIds(),
+        static_cast<size_t>(kRuntimePointerSize));
+    if (bss_offset != IndexBssMappingLookup::npos) {
+      DCHECK_ALIGNED(bss_offset, static_cast<size_t>(kRuntimePointerSize));
+      const OatFile* oat_file = callee_reference.dex_file->GetOatDexFile()->GetOatFile();
+      ArtMethod** method_entry = reinterpret_cast<ArtMethod**>(const_cast<uint8_t*>(
+          oat_file->BssBegin() + bss_offset));
+      DCHECK_GE(method_entry, oat_file->GetBssMethods().data());
+      DCHECK_LT(method_entry,
+                oat_file->GetBssMethods().data() + oat_file->GetBssMethods().size());
+      std::atomic<ArtMethod*>* atomic_entry =
+          reinterpret_cast<std::atomic<ArtMethod*>*>(method_entry);
+      static_assert(sizeof(*method_entry) == sizeof(*atomic_entry), "Size check.");
+      atomic_entry->store(callee, std::memory_order_release);
+    }
+  }
+}
+
 }  // namespace art
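
MaybeUpdateBssMethodEntry() publishes the resolved ArtMethod* into the oat file's .bss slot with a release store, so a reader that observes the non-null pointer also sees the method's fields. A small generic sketch of that publication pattern under those assumptions (the reader side here is illustrative only; ART's compiled code emits its own load sequence rather than calling a helper like this):

    #include <atomic>

    struct ArtMethodStub {  // illustrative stand-in for ArtMethod
      const void* entry_point;
    };

    // Writer side, as in MaybeUpdateBssMethodEntry(): publish the resolved method
    // into the per-oat-file .bss slot with release semantics.
    void PublishResolvedMethod(std::atomic<ArtMethodStub*>* bss_slot, ArtMethodStub* resolved) {
      bss_slot->store(resolved, std::memory_order_release);
    }

    // Reader side (sketch only): a null slot means "not resolved yet", so take
    // the slow resolution path instead of a direct call.
    const void* LoadDirectCallTarget(std::atomic<ArtMethodStub*>* bss_slot,
                                     const void* resolution_stub) {
      ArtMethodStub* m = bss_slot->load(std::memory_order_acquire);
      return (m != nullptr) ? m->entry_point : resolution_stub;
    }
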
diff --git a/runtime/entrypoints/entrypoint_utils.h b/runtime/entrypoints/entrypoint_utils.h
index 85082d3..dfc1edd 100644
--- a/runtime/entrypoints/entrypoint_utils.h
+++ b/runtime/entrypoints/entrypoint_utils.h
@@ -44,6 +44,7 @@
 class ArtMethod;
 class HandleScope;
 enum InvokeType : uint32_t;
+class MethodReference;
 class OatQuickMethodHeader;
 class ScopedObjectAccessAlreadyRunnable;
 class Thread;
@@ -218,6 +219,10 @@
 inline HandleScope* GetGenericJniHandleScope(ArtMethod** managed_sp,
                                              size_t num_handle_scope_references);
 
+// Update the .bss method entry if the `callee_reference` has an associated oat file
+// and that oat file has a .bss entry for the `callee_reference`.
+void MaybeUpdateBssMethodEntry(ArtMethod* callee, MethodReference callee_reference);
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_ENTRYPOINTS_ENTRYPOINT_UTILS_H_
diff --git a/runtime/entrypoints/jni/jni_entrypoints.cc b/runtime/entrypoints/jni/jni_entrypoints.cc
index f1e5772..98192c2 100644
--- a/runtime/entrypoints/jni/jni_entrypoints.cc
+++ b/runtime/entrypoints/jni/jni_entrypoints.cc
@@ -22,31 +22,99 @@
 #include "arch/x86/jni_frame_x86.h"
 #include "arch/x86_64/jni_frame_x86_64.h"
 #include "art_method-inl.h"
-#include "entrypoints/entrypoint_utils.h"
+#include "dex/dex_instruction-inl.h"
+#include "dex/method_reference.h"
+#include "entrypoints/entrypoint_utils-inl.h"
 #include "jni/java_vm_ext.h"
 #include "mirror/object-inl.h"
+#include "oat_quick_method_header.h"
 #include "scoped_thread_state_change-inl.h"
+#include "stack_map.h"
 #include "thread.h"
 
 namespace art {
 
+static inline uint32_t GetInvokeStaticMethodIndex(ArtMethod* caller, uint32_t dex_pc)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  // Get the DexFile and method index.
+  const Instruction& instruction = caller->DexInstructions().InstructionAt(dex_pc);
+  DCHECK(instruction.Opcode() == Instruction::INVOKE_STATIC ||
+         instruction.Opcode() == Instruction::INVOKE_STATIC_RANGE);
+  uint32_t method_idx = (instruction.Opcode() == Instruction::INVOKE_STATIC)
+      ? instruction.VRegB_35c()
+      : instruction.VRegB_3rc();
+  return method_idx;
+}
+
 // Used by the JNI dlsym stub to find the native method to invoke if none is registered.
 extern "C" const void* artFindNativeMethodRunnable(Thread* self)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   Locks::mutator_lock_->AssertSharedHeld(self);  // We come here as Runnable.
-  ArtMethod* method = self->GetCurrentMethod(nullptr);
+  uint32_t dex_pc;
+  ArtMethod* method = self->GetCurrentMethod(&dex_pc);
   DCHECK(method != nullptr);
+  ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+
+  if (!method->IsNative()) {
+    // We're coming from compiled managed code and the `method` we see here is the caller.
+    // Resolve target @CriticalNative method for a direct call from compiled managed code.
+    uint32_t method_idx = GetInvokeStaticMethodIndex(method, dex_pc);
+    ArtMethod* target_method = class_linker->ResolveMethod<ClassLinker::ResolveMode::kNoChecks>(
+        self, method_idx, method, kStatic);
+    if (target_method == nullptr) {
+      self->AssertPendingException();
+      return nullptr;
+    }
+    DCHECK(target_method->IsCriticalNative());
+    MaybeUpdateBssMethodEntry(target_method, MethodReference(method->GetDexFile(), method_idx));
+
+    // These calls do not have an explicit class initialization check, so do the check now.
+    // (When going through the stub or GenericJNI, the check was already done.)
+    DCHECK(NeedsClinitCheckBeforeCall(target_method));
+    ObjPtr<mirror::Class> declaring_class = target_method->GetDeclaringClass();
+    if (UNLIKELY(!declaring_class->IsVisiblyInitialized())) {
+      StackHandleScope<1> hs(self);
+      Handle<mirror::Class> h_class(hs.NewHandle(declaring_class));
+      if (!class_linker->EnsureInitialized(self, h_class, true, true)) {
+        DCHECK(self->IsExceptionPending()) << method->PrettyMethod();
+        return nullptr;
+      }
+    }
+
+    // Replace the runtime method on the stack with the target method.
+    DCHECK(!self->GetManagedStack()->GetTopQuickFrameTag());
+    ArtMethod** sp = self->GetManagedStack()->GetTopQuickFrameKnownNotTagged();
+    DCHECK(*sp == Runtime::Current()->GetCalleeSaveMethod(CalleeSaveType::kSaveRefsAndArgs));
+    *sp = target_method;
+    self->SetTopOfStackTagged(sp);  // Fake GenericJNI frame.
+
+    // Continue with the target method.
+    method = target_method;
+  }
+  DCHECK(method == self->GetCurrentMethod(/*dex_pc=*/ nullptr));
+
+  // Check whether we already have registered native code.
+  // For @CriticalNative it may not be stored in the ArtMethod as a JNI entrypoint if the class
+  // was not visibly initialized yet. Do this check also for @FastNative and normal native for
+  // consistency, though success would mean that another thread raced to do this lookup.
+  const void* native_code = class_linker->GetRegisteredNative(self, method);
+  if (native_code != nullptr) {
+    return native_code;
+  }
 
   // Lookup symbol address for method, on failure we'll return null with an exception set,
   // otherwise we return the address of the method we found.
   JavaVMExt* vm = down_cast<JNIEnvExt*>(self->GetJniEnv())->GetVm();
-  void* native_code = vm->FindCodeForNativeMethod(method);
+  native_code = vm->FindCodeForNativeMethod(method);
   if (native_code == nullptr) {
     self->AssertPendingException();
     return nullptr;
   }
-  // Register so that future calls don't come here
-  return method->RegisterNative(native_code);
+
+  // Register the code. This usually prevents future calls from coming to this function again.
+  // We can still come here if the ClassLinker cannot set the entrypoint in the ArtMethod,
+  // i.e. for @CriticalNative methods with the declaring class not visibly initialized.
+  return class_linker->RegisterNative(self, method, native_code);
 }
 
 // Used by the JNI dlsym stub to find the native method to invoke if none is registered.
@@ -57,23 +125,65 @@
   return artFindNativeMethodRunnable(self);
 }
 
-extern "C" size_t artCriticalNativeOutArgsSize(ArtMethod* method)
+extern "C" size_t artCriticalNativeFrameSize(ArtMethod* method, uintptr_t caller_pc)
     REQUIRES_SHARED(Locks::mutator_lock_)  {
-  uint32_t shorty_len;
-  const char* shorty = method->GetShorty(&shorty_len);
-  switch (kRuntimeISA) {
-    case InstructionSet::kArm:
-    case InstructionSet::kThumb2:
-      return arm::GetCriticalNativeOutArgsSize(shorty, shorty_len);
-    case InstructionSet::kArm64:
-      return arm64::GetCriticalNativeOutArgsSize(shorty, shorty_len);
-    case InstructionSet::kX86:
-      return x86::GetCriticalNativeOutArgsSize(shorty, shorty_len);
-    case InstructionSet::kX86_64:
-      return x86_64::GetCriticalNativeOutArgsSize(shorty, shorty_len);
-    default:
-      UNIMPLEMENTED(FATAL) << kRuntimeISA;
-      UNREACHABLE();
+  if (method->IsNative()) {
+    // Get the method's shorty.
+    DCHECK(method->IsCriticalNative());
+    uint32_t shorty_len;
+    const char* shorty = method->GetShorty(&shorty_len);
+
+    // Return the platform-dependent stub frame size.
+    switch (kRuntimeISA) {
+      case InstructionSet::kArm:
+      case InstructionSet::kThumb2:
+        return arm::GetCriticalNativeStubFrameSize(shorty, shorty_len);
+      case InstructionSet::kArm64:
+        return arm64::GetCriticalNativeStubFrameSize(shorty, shorty_len);
+      case InstructionSet::kX86:
+        return x86::GetCriticalNativeStubFrameSize(shorty, shorty_len);
+      case InstructionSet::kX86_64:
+        return x86_64::GetCriticalNativeStubFrameSize(shorty, shorty_len);
+      default:
+        UNIMPLEMENTED(FATAL) << kRuntimeISA;
+        UNREACHABLE();
+    }
+  } else {
+    // We're coming from compiled managed code and the `method` we see here is the compiled
+    // method that made the call. Get the actual caller (may be inlined) and dex pc.
+    const OatQuickMethodHeader* current_code = method->GetOatQuickMethodHeader(caller_pc);
+    DCHECK(current_code != nullptr);
+    DCHECK(current_code->IsOptimized());
+    uintptr_t native_pc_offset = current_code->NativeQuickPcOffset(caller_pc);
+    CodeInfo code_info = CodeInfo::DecodeInlineInfoOnly(current_code);
+    StackMap stack_map = code_info.GetStackMapForNativePcOffset(native_pc_offset);
+    DCHECK(stack_map.IsValid());
+    BitTableRange<InlineInfo> inline_infos = code_info.GetInlineInfosOf(stack_map);
+    ArtMethod* caller =
+        inline_infos.empty() ? method : GetResolvedMethod(method, code_info, inline_infos);
+    uint32_t dex_pc = inline_infos.empty() ? stack_map.GetDexPc() : inline_infos.back().GetDexPc();
+
+    // Get the callee shorty.
+    const DexFile* dex_file = method->GetDexFile();
+    uint32_t method_idx = GetInvokeStaticMethodIndex(caller, dex_pc);
+    uint32_t shorty_len;
+    const char* shorty = dex_file->GetMethodShorty(dex_file->GetMethodId(method_idx), &shorty_len);
+
+    // Return the platform-dependent direct call frame size.
+    switch (kRuntimeISA) {
+      case InstructionSet::kArm:
+      case InstructionSet::kThumb2:
+        return arm::GetCriticalNativeDirectCallFrameSize(shorty, shorty_len);
+      case InstructionSet::kArm64:
+        return arm64::GetCriticalNativeDirectCallFrameSize(shorty, shorty_len);
+      case InstructionSet::kX86:
+        return x86::GetCriticalNativeDirectCallFrameSize(shorty, shorty_len);
+      case InstructionSet::kX86_64:
+        return x86_64::GetCriticalNativeDirectCallFrameSize(shorty, shorty_len);
+      default:
+        UNIMPLEMENTED(FATAL) << kRuntimeISA;
+        UNREACHABLE();
+    }
   }
 }
 
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 8508086..77a9cfa 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -32,7 +32,6 @@
 #include "gc/accounting/card_table-inl.h"
 #include "imt_conflict_table.h"
 #include "imtable-inl.h"
-#include "index_bss_mapping.h"
 #include "instrumentation.h"
 #include "interpreter/interpreter.h"
 #include "interpreter/interpreter_common.h"
@@ -1319,26 +1318,9 @@
     called = linker->ResolveMethod<ClassLinker::ResolveMode::kCheckICCEAndIAE>(
         self, called_method.index, caller, invoke_type);
 
-    // Update .bss entry in oat file if any.
-    if (called != nullptr && called_method.dex_file->GetOatDexFile() != nullptr) {
-      size_t bss_offset = IndexBssMappingLookup::GetBssOffset(
-          called_method.dex_file->GetOatDexFile()->GetMethodBssMapping(),
-          called_method.index,
-          called_method.dex_file->NumMethodIds(),
-          static_cast<size_t>(kRuntimePointerSize));
-      if (bss_offset != IndexBssMappingLookup::npos) {
-        DCHECK_ALIGNED(bss_offset, static_cast<size_t>(kRuntimePointerSize));
-        const OatFile* oat_file = called_method.dex_file->GetOatDexFile()->GetOatFile();
-        ArtMethod** method_entry = reinterpret_cast<ArtMethod**>(const_cast<uint8_t*>(
-            oat_file->BssBegin() + bss_offset));
-        DCHECK_GE(method_entry, oat_file->GetBssMethods().data());
-        DCHECK_LT(method_entry,
-                  oat_file->GetBssMethods().data() + oat_file->GetBssMethods().size());
-        std::atomic<ArtMethod*>* atomic_entry =
-            reinterpret_cast<std::atomic<ArtMethod*>*>(method_entry);
-        static_assert(sizeof(*method_entry) == sizeof(*atomic_entry), "Size check.");
-        atomic_entry->store(called, std::memory_order_release);
-      }
+    // If successful, update .bss entry in oat file if any.
+    if (called != nullptr) {
+      MaybeUpdateBssMethodEntry(called, called_method);
     }
   }
   const void* code = nullptr;
diff --git a/runtime/interpreter/cfi_asm_support.h b/runtime/interpreter/cfi_asm_support.h
index 04812e1..713bcf8 100644
--- a/runtime/interpreter/cfi_asm_support.h
+++ b/runtime/interpreter/cfi_asm_support.h
@@ -50,10 +50,28 @@
     0x92 /* bregx */, reg, (offset & 0x7F),                                    \
     0x06 /* DW_OP_DEREF */,                                                    \
     0x23 /* DW_OP_plus_uconst */, size
+
+  #define CFI_EXPRESSION_BREG_1(n, b, offset) .cfi_escape       \
+      0x10,                       /* DW_CFA_expression */       \
+      n,                          /* rule for register n */     \
+      2,                          /* expression length */       \
+      0x70+b,                     /* DW_OP_BREG<b>() */         \
+      (offset) & 0x7f             /* SLEB128 offset */
+
+  #define CFI_EXPRESSION_BREG_2(n, b, offset) .cfi_escape       \
+      0x10,                       /* DW_CFA_expression */       \
+      n,                          /* rule for register n */     \
+      3,                          /* expression length */       \
+      0x70+b,                     /* DW_OP_BREG<b>() */         \
+      ((offset) & 0x7f) | 0x80,   /* SLEB128 offset, byte 1 */  \
+      ((offset) >> 7) & 0x7f      /* SLEB128 offset, byte 2 */
+
 #else
   // Mac OS doesn't like cfi_* directives.
   #define CFI_DEFINE_DEX_PC_WITH_OFFSET(tmpReg, dexReg, dexOffset)
   #define CFI_DEFINE_CFA_DEREF(reg, offset)
+  #define CFI_EXPRESSION_BREG_1(n, b, offset)
+  #define CFI_EXPRESSION_BREG_2(n, b, offset)
 #endif
 
 #endif  // ART_RUNTIME_INTERPRETER_CFI_ASM_SUPPORT_H_
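
The two macro variants above differ only in how many SLEB128 bytes they emit for the DW_OP_breg offset: one byte covers small non-negative offsets (0..63, since bit 6 doubles as the sign bit), two bytes cover offsets up to 8191. A quick self-check of that encoding, assuming non-negative offsets as used by the callers:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Minimal SLEB128 encoder (sufficient for the non-negative offsets used here).
    std::vector<uint8_t> EncodeSleb128(int32_t value) {
      std::vector<uint8_t> out;
      bool more = true;
      while (more) {
        uint8_t byte = value & 0x7f;
        value >>= 7;
        // Done when the remaining bits are just sign extension of bit 6.
        more = !((value == 0 && (byte & 0x40) == 0) || (value == -1 && (byte & 0x40) != 0));
        if (more) byte |= 0x80;
        out.push_back(byte);
      }
      return out;
    }

    int main() {
      // CFI_EXPRESSION_BREG_1 emits one offset byte: (offset) & 0x7f.
      assert(EncodeSleb128(40) == (std::vector<uint8_t>{40 & 0x7f}));
      // CFI_EXPRESSION_BREG_2 emits two: ((offset) & 0x7f) | 0x80, then ((offset) >> 7) & 0x7f.
      assert(EncodeSleb128(200) == (std::vector<uint8_t>{(200 & 0x7f) | 0x80, (200 >> 7) & 0x7f}));
      return 0;
    }
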
diff --git a/runtime/jni/jni_internal.cc b/runtime/jni/jni_internal.cc
index c178c38..8e69157 100644
--- a/runtime/jni/jni_internal.cc
+++ b/runtime/jni/jni_internal.cc
@@ -2304,6 +2304,7 @@
       return JNI_ERR;  // Not reached except in unit tests.
     }
     CHECK_NON_NULL_ARGUMENT_FN_NAME("RegisterNatives", java_class, JNI_ERR);
+    ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
     ScopedObjectAccess soa(env);
     StackHandleScope<1> hs(soa.Self());
     Handle<mirror::Class> c = hs.NewHandle(soa.Decode<mirror::Class>(java_class));
@@ -2420,7 +2421,7 @@
         // TODO: make this a hard register error in the future.
       }
 
-      const void* final_function_ptr = m->RegisterNative(fnPtr);
+      const void* final_function_ptr = class_linker->RegisterNative(soa.Self(), m, fnPtr);
       UNUSED(final_function_ptr);
     }
     return JNI_OK;
@@ -2434,10 +2435,11 @@
     VLOG(jni) << "[Unregistering JNI native methods for " << mirror::Class::PrettyClass(c) << "]";
 
     size_t unregistered_count = 0;
-    auto pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
+    ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+    auto pointer_size = class_linker->GetImagePointerSize();
     for (auto& m : c->GetMethods(pointer_size)) {
       if (m.IsNative()) {
-        m.UnregisterNative();
+        class_linker->UnregisterNative(soa.Self(), &m);
         unregistered_count++;
       }
     }
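
For context, this is what the RegisterNatives() path above looks like from the application side when the target is a @CriticalNative method (annotated in Java with dalvik.annotation.optimization.CriticalNative); note that the native function receives no JNIEnv* and no jclass, only the primitive arguments. The class and method names below are made up for illustration:

    #include <jni.h>

    // @CriticalNative functions take no JNIEnv*/jclass, just primitive args.
    static jint AddExact(jint a, jint b) {
      return a + b;
    }

    extern "C" JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* /*reserved*/) {
      JNIEnv* env = nullptr;
      if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6) != JNI_OK) {
        return JNI_ERR;
      }
      // Hypothetical class with "@CriticalNative static native int add(int, int);".
      jclass klass = env->FindClass("com/example/CriticalMath");
      if (klass == nullptr) {
        return JNI_ERR;
      }
      const JNINativeMethod methods[] = {
          {"add", "(II)I", reinterpret_cast<void*>(AddExact)},
      };
      if (env->RegisterNatives(klass, methods, 1) != JNI_OK) {
        return JNI_ERR;
      }
      return JNI_VERSION_1_6;
    }
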
diff --git a/runtime/oat.h b/runtime/oat.h
index d37927d..44b61a7 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Change ClassStatus bits with kVerifiedNeedsAccessChecks.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '8', '3', '\0' } };
+  // Last oat version changed reason: Direct calls to @CriticalNative code.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '8', '4', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index ad4d7a7..1c1159a 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -2307,8 +2307,10 @@
   if (IsAotCompiler()) {
     PointerSize pointer_size = GetInstructionSetPointerSize(instruction_set_);
     method->SetEntryPointFromQuickCompiledCodePtrSize(nullptr, pointer_size);
+    method->SetEntryPointFromJniPtrSize(nullptr, pointer_size);
   } else {
     method->SetEntryPointFromQuickCompiledCode(GetQuickResolutionStub());
+    method->SetEntryPointFromJni(GetJniDlsymLookupCriticalStub());
   }
   return method;
 }
diff --git a/tools/cpp-define-generator/globals.def b/tools/cpp-define-generator/globals.def
index 6c9b2b0..6706fed 100644
--- a/tools/cpp-define-generator/globals.def
+++ b/tools/cpp-define-generator/globals.def
@@ -30,6 +30,10 @@
 #include "stack.h"
 #endif
 
+ASM_DEFINE(ACCESS_FLAGS_METHOD_IS_NATIVE,
+           art::kAccNative)
+ASM_DEFINE(ACCESS_FLAGS_METHOD_IS_NATIVE_BIT,
+           art::MostSignificantBit(art::kAccNative))
 ASM_DEFINE(ACCESS_FLAGS_METHOD_IS_FAST_NATIVE,
            art::kAccFastNative)
 ASM_DEFINE(ACCESS_FLAGS_METHOD_IS_CRITICAL_NATIVE,