Direct calls to @CriticalNative methods.

Emit direct calls from compiled managed code to the native
code registered with the method, avoiding the JNI stub.

Golem results:
art-opt-cc                       x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +12.5% +62.5% +75.9% +41.7%
NativeDowncallStaticCritical6 +55.6% +87.5% +72.1% +35.3%
art-opt                          x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +28.6% +85.6% +76.4% +38.4%
NativeDowncallStaticCritical6 +44.6% +44.6% +74.6% +32.2%
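
A @CriticalNative method, for context, is a static native method that uses
only primitive types; its native implementation receives neither JNIEnv* nor
jclass. A minimal sketch (illustrative names, not part of this CL):

    // Java side:
    //   public final class Test {
    //     @CriticalNative
    //     static native int add(int a, int b);
    //   }
    //
    // Native side, bound via RegisterNatives(); note the absence of the
    // JNIEnv* and jclass parameters that a regular JNI method receives.
    #include <jni.h>

    extern "C" jint add(jint a, jint b) {
      return a + b;
    }

With this change, compiled managed code loads the registered code from the
ArtMethod's JNI entrypoint (see ArtMethod::EntryPointFromJniOffset below) and
calls it directly instead of going through the JNI stub.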

Test: Covered by 178-app-image-native-method.
Test: m test-art-host-gtest
Test: testrunner.py --host --debuggable --ndebuggable \
          --optimizing --jit --jit-on-first-use
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Test: testrunner.py --target --debuggable --ndebuggable \
          --optimizing --jit --jit-on-first-use -t 178
Test: aosp_cf_x86_phone-userdebug boots.
Test: aosp_cf_x86_phone-userdebug/jitzygote boots.
Bug: 112189621
Change-Id: I8b37da51e8fe0b7bc513bb81b127fe0416068866
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc
index 2db1390..685e1e2 100644
--- a/compiler/jni/jni_compiler_test.cc
+++ b/compiler/jni/jni_compiler_test.cc
@@ -314,6 +314,12 @@
     }
     ASSERT_TRUE(jmethod_ != nullptr) << method_name << " " << method_sig;
 
+    // Make sure the test class is visibly initialized so that the RegisterNatives() below
+    // sets the JNI entrypoint rather than leaving it as null (this test pretends to be an
+    // AOT compiler and therefore the ClassLinker skips entrypoint initialization). Even
+    // if the ClassLinker initialized it with a stub, we would not want to test that here.
+    class_linker_->MakeInitializedClassesVisiblyInitialized(Thread::Current(), /*wait=*/ true);
+
     if (native_fnptr != nullptr) {
       JNINativeMethod methods[] = { { method_name, method_sig, native_fnptr } };
       ASSERT_EQ(JNI_OK, env_->RegisterNatives(jklass_, methods, 1))
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index d07ab98..7afa8b1 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -420,7 +420,7 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t ArmJniCallingConvention::OutArgSize() const {
+size_t ArmJniCallingConvention::OutFrameSize() const {
   // Count param args, including JNIEnv* and jclass*; count 8-byte args twice.
   size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs() + NumLongOrDoubleArgs();
   // Account for arguments passed through r0-r3. (No FP args, AAPCS32 is soft-float.)
@@ -440,7 +440,7 @@
   }
   size_t out_args_size = RoundUp(size, kAapcsStackAlignment);
   if (UNLIKELY(IsCriticalNative())) {
-    DCHECK_EQ(out_args_size, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+    DCHECK_EQ(out_args_size, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
   }
   return out_args_size;
 }
@@ -512,9 +512,9 @@
   CHECK_GE(itr_slots_, kJniArgumentRegisterCount);
   size_t offset =
       displacement_.Int32Value()
-          - OutArgSize()
+          - OutFrameSize()
           + ((itr_slots_ - kJniArgumentRegisterCount) * kFramePointerSize);
-  CHECK_LT(offset, OutArgSize());
+  CHECK_LT(offset, OutFrameSize());
   return FrameOffset(offset);
 }
 
@@ -537,7 +537,7 @@
 // Whether to use tail call (used only for @CriticalNative).
 bool ArmJniCallingConvention::UseTailCall() const {
   CHECK(IsCriticalNative());
-  return OutArgSize() == 0u;
+  return OutFrameSize() == 0u;
 }
 
 }  // namespace arm
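
The renamed OutFrameSize() is cross-checked against
GetCriticalNativeStubFrameSize() from arch/arm/jni_frame_arm.h. A standalone
sketch of the AAPCS32 math for @CriticalNative arguments (assumed semantics,
reconstructed from the conventions above: soft-float, argument slots in r0-r3,
no JNIEnv*/jclass extras):

    #include <cstddef>

    constexpr size_t RoundUp(size_t x, size_t n) { return (x + n - 1u) / n * n; }

    // `shorty` is the method shorty, return type first, e.g. "VIJF".
    size_t CriticalNativeStubFrameSizeSketch(const char* shorty) {
      size_t reg = 0u;          // Next free argument slot in r0-r3.
      size_t stack_bytes = 0u;  // Bytes of outgoing stack arguments.
      for (const char* p = shorty + 1; *p != '\0'; ++p) {
        bool wide = (*p == 'J' || *p == 'D');
        if (wide) {
          reg = RoundUp(reg, 2u);  // 64-bit args start in an even register.
        }
        size_t slots = wide ? 2u : 1u;
        if (reg + slots <= 4u) {
          reg += slots;  // Passed in r0-r3, no stack space needed.
        } else {
          reg = 4u;  // AAPCS32 does not back-fill core registers.
          if (wide) {
            stack_bytes = RoundUp(stack_bytes, 8u);  // 8-byte types are 8-byte aligned.
          }
          stack_bytes += slots * 4u;
        }
      }
      return RoundUp(stack_bytes, 8u);  // kAapcsStackAlignment.
    }

For example, "VIIIIJ" (void f(int, int, int, int, long)) yields 8: the four
ints fill r0-r3 and the long spills to the stack. A zero result is what
enables the UseTailCall() case above.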
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 7896d64..38f7184 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -65,7 +65,7 @@
   // JNI calling convention
   void Next() override;  // Override default behavior for AAPCS
   size_t FrameSize() const override;
-  size_t OutArgSize() const override;
+  size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index 32da141..06796c1 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -265,20 +265,14 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t Arm64JniCallingConvention::OutArgSize() const {
+size_t Arm64JniCallingConvention::OutFrameSize() const {
   // Count param args, including JNIEnv* and jclass*.
   size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs();
   size_t num_fp_args = NumFloatOrDoubleArgs();
   DCHECK_GE(all_args, num_fp_args);
   size_t num_non_fp_args = all_args - num_fp_args;
-  // Account for FP arguments passed through v0-v7.
-  size_t num_stack_fp_args =
-      num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
-  // Account for other (integer and pointer) arguments passed through GPR (x0-x7).
-  size_t num_stack_non_fp_args =
-      num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
   // The size of outgoing arguments.
-  size_t size = (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
+  size_t size = GetNativeOutArgsSize(num_fp_args, num_non_fp_args);
 
   // @CriticalNative can use tail call as all managed callee saves are preserved by AAPCS64.
   static_assert((kCoreCalleeSpillMask & ~kAapcs64CoreCalleeSpillMask) == 0u);
@@ -291,7 +285,7 @@
   }
   size_t out_args_size = RoundUp(size, kAapcs64StackAlignment);
   if (UNLIKELY(IsCriticalNative())) {
-    DCHECK_EQ(out_args_size, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+    DCHECK_EQ(out_args_size, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
   }
   return out_args_size;
 }
@@ -355,8 +349,8 @@
                              static_cast<size_t>(itr_float_and_doubles_))
                   - std::min(kMaxIntLikeRegisterArguments,
                              static_cast<size_t>(itr_args_ - itr_float_and_doubles_));
-  size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
-  CHECK_LT(offset, OutArgSize());
+  size_t offset = displacement_.Int32Value() - OutFrameSize() + (args_on_stack * kFramePointerSize);
+  CHECK_LT(offset, OutFrameSize());
   return FrameOffset(offset);
 }
 
@@ -378,7 +372,7 @@
 // Whether to use tail call (used only for @CriticalNative).
 bool Arm64JniCallingConvention::UseTailCall() const {
   CHECK(IsCriticalNative());
-  return OutArgSize() == 0u;
+  return OutFrameSize() == 0u;
 }
 
 }  // namespace arm64
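
The inline AAPCS64 computation deleted above moved into GetNativeOutArgsSize()
in arch/arm64/jni_frame_arm64.h (included by the arm64 code generator below).
A sketch of the assumed helper semantics, mirroring the removed lines:

    #include <algorithm>
    #include <cstddef>

    constexpr size_t kArm64FramePointerSize = 8u;
    constexpr size_t kMaxFloatOrDoubleRegisterArguments = 8u;  // v0-v7.
    constexpr size_t kMaxIntLikeRegisterArguments = 8u;        // x0-x7.

    size_t NativeOutArgsSizeSketch(size_t num_fp_args, size_t num_non_fp_args) {
      // FP args beyond v0-v7 go on the stack.
      size_t stack_fp =
          num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
      // Integer and pointer args beyond x0-x7 go on the stack.
      size_t stack_core =
          num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
      // Each stack argument occupies one 8-byte slot.
      return (stack_fp + stack_core) * kArm64FramePointerSize;
    }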
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 7beca08..d381d9d 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -56,7 +56,7 @@
   ManagedRegister IntReturnRegister() override;
   // JNI calling convention
   size_t FrameSize() const override;
-  size_t OutArgSize() const override;
+  size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index b4396f0..005ae91 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -303,9 +303,9 @@
   // always at the bottom of a frame, but this doesn't work for outgoing
   // native args). Includes alignment.
   virtual size_t FrameSize() const = 0;
-  // Size of outgoing arguments (stack portion), including alignment.
+  // Size of outgoing frame, i.e. stack arguments, @CriticalNative return PC if needed, alignment.
   // -- Arguments that are passed via registers are excluded from this size.
-  virtual size_t OutArgSize() const = 0;
+  virtual size_t OutFrameSize() const = 0;
   // Number of references in stack indirect reference table
   size_t ReferenceCount() const;
   // Location where the segment state of the local indirect reference table is saved
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index 036cdbb..913a3ba 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -220,7 +220,7 @@
   // 1. Build the frame saving all callee saves, Method*, and PC return address.
   //    For @CriticalNative, this includes space for out args, otherwise just the managed frame.
   const size_t managed_frame_size = main_jni_conv->FrameSize();
-  const size_t main_out_arg_size = main_jni_conv->OutArgSize();
+  const size_t main_out_arg_size = main_jni_conv->OutFrameSize();
   size_t current_frame_size = is_critical_native ? main_out_arg_size : managed_frame_size;
   ManagedRegister method_register =
       is_critical_native ? ManagedRegister::NoRegister() : mr_conv->MethodRegister();
@@ -582,7 +582,7 @@
 
   if (LIKELY(!is_critical_native)) {
     // Increase frame size for out args if needed by the end_jni_conv.
-    const size_t end_out_arg_size = end_jni_conv->OutArgSize();
+    const size_t end_out_arg_size = end_jni_conv->OutFrameSize();
     if (end_out_arg_size > current_out_arg_size) {
       size_t out_arg_size_diff = end_out_arg_size - current_out_arg_size;
       current_out_arg_size = end_out_arg_size;
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 6776f12..df45627 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -220,11 +220,10 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t X86JniCallingConvention::OutArgSize() const {
-  // Count param args, including JNIEnv* and jclass*; count 8-byte args twice.
-  size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs() + NumLongOrDoubleArgs();
-  // The size of outgoiong arguments.
-  size_t size = all_args * kFramePointerSize;
+size_t X86JniCallingConvention::OutFrameSize() const {
+  // The size of outgoing arguments.
+  size_t size = GetNativeOutArgsSize(/*num_args=*/ NumberOfExtraArgumentsForJni() + NumArgs(),
+                                     NumLongOrDoubleArgs());
 
   // @CriticalNative can use tail call as all managed callee saves are preserved by AAPCS.
   static_assert((kCoreCalleeSpillMask & ~kNativeCoreCalleeSpillMask) == 0u);
@@ -244,14 +243,16 @@
     if (return_type_ok && size == kFramePointerSize) {
       // Note: This is not aligned to kNativeStackAlignment but that's OK for tail call.
       static_assert(kFramePointerSize < kNativeStackAlignment);
-      DCHECK_EQ(kFramePointerSize, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+      // The stub frame size is considered 0 in the callee where the return PC is a part of
+      // the callee frame but it is kPointerSize in the compiled stub before the tail call.
+      DCHECK_EQ(0u, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
       return kFramePointerSize;
     }
   }
 
   size_t out_args_size = RoundUp(size, kNativeStackAlignment);
   if (UNLIKELY(IsCriticalNative())) {
-    DCHECK_EQ(out_args_size, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+    DCHECK_EQ(out_args_size, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
   }
   return out_args_size;
 }
@@ -279,7 +280,8 @@
 }
 
 FrameOffset X86JniCallingConvention::CurrentParamStackOffset() {
-  return FrameOffset(displacement_.Int32Value() - OutArgSize() + (itr_slots_ * kFramePointerSize));
+  return
+      FrameOffset(displacement_.Int32Value() - OutFrameSize() + (itr_slots_ * kFramePointerSize));
 }
 
 ManagedRegister X86JniCallingConvention::HiddenArgumentRegister() const {
@@ -295,7 +297,7 @@
 
 bool X86JniCallingConvention::UseTailCall() const {
   CHECK(IsCriticalNative());
-  return OutArgSize() == kFramePointerSize;
+  return OutFrameSize() == kFramePointerSize;
 }
 
 }  // namespace x86
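
On x86 every native argument is passed on the stack, so the helper that
replaces the deleted inline math reduces to slot counting. A sketch of the
assumed GetNativeOutArgsSize() semantics for x86 (the real helper lives in
arch/x86/jni_frame_x86.h):

    #include <cstddef>

    constexpr size_t kX86FramePointerSize = 4u;

    size_t X86NativeOutArgsSizeSketch(size_t num_args, size_t num_long_or_double_args) {
      // All arguments go on the stack; 8-byte arguments occupy two 4-byte slots.
      return (num_args + num_long_or_double_args) * kX86FramePointerSize;
    }

For example, a (jint, jlong) @CriticalNative call needs 3 slots = 12 bytes,
which the RoundUp(size, kNativeStackAlignment) above rounds up to the native
stack alignment (16 bytes, assuming the usual x86 SysV requirement).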
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index 6f22c2b..81f617d 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -61,7 +61,7 @@
   ManagedRegister IntReturnRegister() override;
   // JNI calling convention
   size_t FrameSize() const override;
-  size_t OutArgSize() const override;
+  size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index e97cab8..44ae8be 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -208,21 +208,14 @@
   return RoundUp(total_size, kStackAlignment);
 }
 
-size_t X86_64JniCallingConvention::OutArgSize() const {
+size_t X86_64JniCallingConvention::OutFrameSize() const {
   // Count param args, including JNIEnv* and jclass*.
   size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs();
   size_t num_fp_args = NumFloatOrDoubleArgs();
   DCHECK_GE(all_args, num_fp_args);
   size_t num_non_fp_args = all_args - num_fp_args;
-  // Account for FP arguments passed through Xmm0..Xmm7.
-  size_t num_stack_fp_args =
-      num_fp_args - std::min(kMaxFloatOrDoubleRegisterArguments, num_fp_args);
-  // Account for other (integer) arguments passed through GPR (RDI, RSI, RDX, RCX, R8, R9).
-  size_t num_stack_non_fp_args =
-      num_non_fp_args - std::min(kMaxIntLikeRegisterArguments, num_non_fp_args);
   // The size of outgoing arguments.
-  static_assert(kFramePointerSize == kMmxSpillSize);
-  size_t size = (num_stack_fp_args + num_stack_non_fp_args) * kFramePointerSize;
+  size_t size = GetNativeOutArgsSize(num_fp_args, num_non_fp_args);
 
   if (UNLIKELY(IsCriticalNative())) {
     // We always need to spill xmm12-xmm15 as they are managed callee-saves
@@ -239,7 +232,7 @@
 
   size_t out_args_size = RoundUp(size, kNativeStackAlignment);
   if (UNLIKELY(IsCriticalNative())) {
-    DCHECK_EQ(out_args_size, GetCriticalNativeOutArgsSize(GetShorty(), NumArgs() + 1u));
+    DCHECK_EQ(out_args_size, GetCriticalNativeStubFrameSize(GetShorty(), NumArgs() + 1u));
   }
   return out_args_size;
 }
@@ -297,8 +290,8 @@
       - std::min(kMaxIntLikeRegisterArguments,
                  static_cast<size_t>(itr_args_ - itr_float_and_doubles_));
           // Integer arguments passed through GPR
-  size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
-  CHECK_LT(offset, OutArgSize());
+  size_t offset = displacement_.Int32Value() - OutFrameSize() + (args_on_stack * kFramePointerSize);
+  CHECK_LT(offset, OutFrameSize());
   return FrameOffset(offset);
 }
 
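
As on arm64, the deleted x86-64 inline math moved into GetNativeOutArgsSize()
in arch/x86_64/jni_frame_x86_64.h. A sketch of the assumed semantics,
mirroring the removed lines (8 XMM argument registers, 6 GPRs):

    #include <algorithm>
    #include <cstddef>

    constexpr size_t kX86_64FramePointerSize = 8u;
    constexpr size_t kMaxXmmRegisterArguments = 8u;  // XMM0-XMM7.
    constexpr size_t kMaxGprRegisterArguments = 6u;  // RDI, RSI, RDX, RCX, R8, R9.

    size_t X86_64NativeOutArgsSizeSketch(size_t num_fp_args, size_t num_non_fp_args) {
      size_t stack_fp = num_fp_args - std::min(kMaxXmmRegisterArguments, num_fp_args);
      size_t stack_gp = num_non_fp_args - std::min(kMaxGprRegisterArguments, num_non_fp_args);
      // Each stack argument occupies one 8-byte slot.
      return (stack_fp + stack_gp) * kX86_64FramePointerSize;
    }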
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index d043a3e..5bde766 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -56,7 +56,7 @@
   ManagedRegister IntReturnRegister() override;
   // JNI calling convention
   size_t FrameSize() const override;
-  size_t OutArgSize() const override;
+  size_t OutFrameSize() const override;
   ArrayRef<const ManagedRegister> CalleeSaveRegisters() const override;
   ManagedRegister ReturnScratchRegister() const override;
   uint32_t CoreSpillMask() const override;
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index cfd9ea6..f74a938 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -32,6 +32,7 @@
 #include "code_generator_x86_64.h"
 #endif
 
+#include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
 #include "base/casts.h"
@@ -503,23 +504,69 @@
 
   if (invoke->IsInvokeStaticOrDirect()) {
     HInvokeStaticOrDirect* call = invoke->AsInvokeStaticOrDirect();
-    switch (call->GetMethodLoadKind()) {
-      case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-        locations->SetInAt(call->GetSpecialInputIndex(), visitor->GetMethodLocation());
-        break;
-      case HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall:
-        locations->AddTemp(visitor->GetMethodLocation());
-        locations->SetInAt(call->GetSpecialInputIndex(), Location::RequiresRegister());
-        break;
-      default:
-        locations->AddTemp(visitor->GetMethodLocation());
-        break;
+    HInvokeStaticOrDirect::MethodLoadKind method_load_kind = call->GetMethodLoadKind();
+    HInvokeStaticOrDirect::CodePtrLocation code_ptr_location = call->GetCodePtrLocation();
+    if (code_ptr_location == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+      locations->AddTemp(Location::RequiresRegister());  // For target method.
+    }
+    if (code_ptr_location == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative ||
+        method_load_kind == HInvokeStaticOrDirect::MethodLoadKind::kRecursive) {
+      // For `kCallCriticalNative` we need the current method as the hidden argument
+      // if we reach the dlsym lookup stub for @CriticalNative.
+      locations->SetInAt(call->GetCurrentMethodIndex(), visitor->GetMethodLocation());
+    } else {
+      locations->AddTemp(visitor->GetMethodLocation());
+      if (method_load_kind == HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall) {
+        locations->SetInAt(call->GetCurrentMethodIndex(), Location::RequiresRegister());
+      }
     }
   } else if (!invoke->IsInvokePolymorphic()) {
     locations->AddTemp(visitor->GetMethodLocation());
   }
 }
 
+void CodeGenerator::PrepareCriticalNativeArgumentMoves(
+    HInvokeStaticOrDirect* invoke,
+    /*inout*/InvokeDexCallingConventionVisitor* visitor,
+    /*out*/HParallelMove* parallel_move) {
+  LocationSummary* locations = invoke->GetLocations();
+  for (size_t i = 0, num = invoke->GetNumberOfArguments(); i != num; ++i) {
+    Location in_location = locations->InAt(i);
+    DataType::Type type = invoke->InputAt(i)->GetType();
+    DCHECK_NE(type, DataType::Type::kReference);
+    Location out_location = visitor->GetNextLocation(type);
+    if (out_location.IsStackSlot() || out_location.IsDoubleStackSlot()) {
+      // Stack arguments will need to be moved after adjusting the SP.
+      parallel_move->AddMove(in_location, out_location, type, /*instruction=*/ nullptr);
+    } else {
+      // Register arguments should have been assigned their final locations for register allocation.
+      DCHECK(out_location.Equals(in_location)) << in_location << " -> " << out_location;
+    }
+  }
+}
+
+void CodeGenerator::AdjustCriticalNativeArgumentMoves(size_t out_frame_size,
+                                                      /*inout*/HParallelMove* parallel_move) {
+  // Adjust the source stack offsets by `out_frame_size`, i.e. the additional
+  // frame size needed for outgoing stack arguments.
+  for (size_t i = 0, num = parallel_move->NumMoves(); i != num; ++i) {
+    MoveOperands* operands = parallel_move->MoveOperandsAt(i);
+    Location source = operands->GetSource();
+    if (source.IsStackSlot()) {
+      operands->SetSource(Location::StackSlot(source.GetStackIndex() + out_frame_size));
+    } else if (source.IsDoubleStackSlot()) {
+      operands->SetSource(Location::DoubleStackSlot(source.GetStackIndex() + out_frame_size));
+    }
+  }
+}
+
+const char* CodeGenerator::GetCriticalNativeShorty(HInvokeStaticOrDirect* invoke,
+                                                   uint32_t* shorty_len) {
+  ScopedObjectAccess soa(Thread::Current());
+  DCHECK(invoke->GetResolvedMethod()->IsCriticalNative());
+  return invoke->GetResolvedMethod()->GetShorty(shorty_len);
+}
+
 void CodeGenerator::GenerateInvokeStaticOrDirectRuntimeCall(
     HInvokeStaticOrDirect* invoke, Location temp, SlowPathCode* slow_path) {
   MoveConstant(temp, invoke->GetDexMethodIndex());
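
The offset rebasing in AdjustCriticalNativeArgumentMoves() exists because
stack-slot locations are SP-relative and the SP is lowered by out_frame_size
before the parallel move runs. A tiny statement of the invariant:

    #include <cstddef>

    // After `sub sp, #out_frame_size`, a value that used to live at
    // [sp + offset] is found at [sp + offset + out_frame_size].
    constexpr size_t RebaseSpRelativeOffset(size_t old_offset, size_t out_frame_size) {
      return old_offset + out_frame_size;
    }
    static_assert(RebaseSpRelativeOffset(/*old_offset=*/ 8u, /*out_frame_size=*/ 16u) == 24u);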
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index ff2be47..4bfc14a 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -570,6 +570,28 @@
   static void CreateCommonInvokeLocationSummary(
       HInvoke* invoke, InvokeDexCallingConventionVisitor* visitor);
 
+  template <typename CriticalNativeCallingConventionVisitor,
+            size_t kNativeStackAlignment,
+            size_t GetCriticalNativeDirectCallFrameSize(const char* shorty, uint32_t shorty_len)>
+  static size_t PrepareCriticalNativeCall(HInvokeStaticOrDirect* invoke,
+                                          /*out*/HParallelMove* parallel_move) {
+      DCHECK(!invoke->GetLocations()->Intrinsified());
+      CriticalNativeCallingConventionVisitor calling_convention_visitor(
+          /*for_register_allocation=*/ false);
+      PrepareCriticalNativeArgumentMoves(invoke, &calling_convention_visitor, parallel_move);
+      size_t out_frame_size =
+          RoundUp(calling_convention_visitor.GetStackOffset(), kNativeStackAlignment);
+      if (kIsDebugBuild) {
+        uint32_t shorty_len;
+        const char* shorty = GetCriticalNativeShorty(invoke, &shorty_len);
+        DCHECK_EQ(GetCriticalNativeDirectCallFrameSize(shorty, shorty_len), out_frame_size);
+      }
+      if (out_frame_size != 0u) {
+        AdjustCriticalNativeArgumentMoves(out_frame_size, parallel_move);
+      }
+      return out_frame_size;
+  }
+
   void GenerateInvokeStaticOrDirectRuntimeCall(
       HInvokeStaticOrDirect* invoke, Location temp, SlowPathCode* slow_path);
 
@@ -799,6 +821,16 @@
                        bool needs_vreg_info = true);
   void EmitVRegInfo(HEnvironment* environment, SlowPathCode* slow_path);
 
+  static void PrepareCriticalNativeArgumentMoves(
+      HInvokeStaticOrDirect* invoke,
+      /*inout*/InvokeDexCallingConventionVisitor* visitor,
+      /*out*/HParallelMove* parallel_move);
+
+  static void AdjustCriticalNativeArgumentMoves(size_t out_frame_size,
+                                                /*inout*/HParallelMove* parallel_move);
+
+  static const char* GetCriticalNativeShorty(HInvokeStaticOrDirect* invoke, uint32_t* shorty_len);
+
   OptimizingCompilerStats* stats_;
 
   HGraph* const graph_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 4a618de..d108623 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -18,6 +18,7 @@
 
 #include "arch/arm64/asm_support_arm64.h"
 #include "arch/arm64/instruction_set_features_arm64.h"
+#include "arch/arm64/jni_frame_arm64.h"
 #include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
@@ -870,6 +871,49 @@
   return LocationFrom(kArtMethodRegister);
 }
 
+Location CriticalNativeCallingConventionVisitorARM64::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  Location location = Location::NoLocation();
+  if (DataType::IsFloatingPointType(type)) {
+    if (fpr_index_ < kParameterFPRegistersLength) {
+      location = LocationFrom(kParameterFPRegisters[fpr_index_]);
+      ++fpr_index_;
+    }
+  } else {
+    // Native ABI uses the same registers as managed, except that the method register x0
+    // is a normal argument.
+    if (gpr_index_ < 1u + kParameterCoreRegistersLength) {
+      location = LocationFrom(gpr_index_ == 0u ? x0 : kParameterCoreRegisters[gpr_index_ - 1u]);
+      ++gpr_index_;
+    }
+  }
+  if (location.IsInvalid()) {
+    if (DataType::Is64BitType(type)) {
+      location = Location::DoubleStackSlot(stack_offset_);
+    } else {
+      location = Location::StackSlot(stack_offset_);
+    }
+    stack_offset_ += kFramePointerSize;
+
+    if (for_register_allocation_) {
+      location = Location::Any();
+    }
+  }
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorARM64::GetReturnLocation(DataType::Type type) const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorARM64 dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorARM64::GetMethodLocation() const {
+  // Pass the method in the hidden argument x15.
+  return Location::RegisterLocation(x15.GetCode());
+}
+
 CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph,
                                        const CompilerOptions& compiler_options,
                                        OptimizingCompilerStats* stats)
@@ -4295,7 +4339,13 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorARM64 calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+  } else {
+    HandleInvoke(invoke);
+  }
 }
 
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorARM64* codegen) {
@@ -4327,7 +4377,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -4373,6 +4423,19 @@
     }
   }
 
+  auto call_code_pointer_member = [&](MemberOffset offset) {
+    // LR = callee_method->member;
+    __ Ldr(lr, MemOperand(XRegisterFrom(callee_method), offset.Int32Value()));
+    {
+      // Use a scope to help guarantee that `RecordPcInfo()` records the correct pc.
+      ExactAssemblyScope eas(GetVIXLAssembler(),
+                             kInstructionSize,
+                             CodeBufferCheckScope::kExactSize);
+      // lr()
+      __ blr(lr);
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+    }
+  };
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       {
@@ -4384,20 +4447,50 @@
         RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       }
       break;
-    case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
-      // LR = callee_method->entry_point_from_quick_compiled_code_;
-      __ Ldr(lr, MemOperand(
-          XRegisterFrom(callee_method),
-          ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize).Int32Value()));
-      {
-        // Use a scope to help guarantee that `RecordPcInfo()` records the correct pc.
-        ExactAssemblyScope eas(GetVIXLAssembler(),
-                               kInstructionSize,
-                               CodeBufferCheckScope::kExactSize);
-        // lr()
-        __ blr(lr);
-        RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorARM64,
+                                    kAapcs64StackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+      if (out_frame_size != 0u) {
+        __ Claim(out_frame_size);
+        GetAssembler()->cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
       }
+      call_code_pointer_member(ArtMethod::EntryPointFromJniOffset(kArm64PointerSize));
+      // Zero-/sign-extend the result when needed due to native and managed ABI mismatch.
+      switch (invoke->GetType()) {
+        case DataType::Type::kBool:
+          __ Ubfx(w0, w0, 0, 8);
+          break;
+        case DataType::Type::kInt8:
+          __ Sbfx(w0, w0, 0, 8);
+          break;
+        case DataType::Type::kUint16:
+          __ Ubfx(w0, w0, 0, 16);
+          break;
+        case DataType::Type::kInt16:
+          __ Sbfx(w0, w0, 0, 16);
+          break;
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kFloat32:
+        case DataType::Type::kFloat64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ Drop(out_frame_size);
+        GetAssembler()->cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
+      call_code_pointer_member(ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64PointerSize));
       break;
   }
 
@@ -4819,14 +4912,9 @@
     return;
   }
 
-  {
-    // Ensure that between the BLR (emitted by GenerateStaticOrDirectCall) and RecordPcInfo there
-    // are no pools emitted.
-    EmissionCheckScope guard(GetVIXLAssembler(), kInvokeCodeMarginSizeInBytes);
-    LocationSummary* locations = invoke->GetLocations();
-    codegen_->GenerateStaticOrDirectCall(
-        invoke, locations->HasTemps() ? locations->GetTemp(0) : Location::NoLocation());
-  }
+  LocationSummary* locations = invoke->GetLocations();
+  codegen_->GenerateStaticOrDirectCall(
+      invoke, locations->HasTemps() ? locations->GetTemp(0) : Location::NoLocation());
 
   codegen_->MaybeGenerateMarkingRegisterCheck(/* code= */ __LINE__);
 }
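
The Ubfx/Sbfx fix-ups after the @CriticalNative call exist because, as the
added comment notes, the native and managed ABIs disagree on sub-word results:
the native side may leave the upper bits of w0 unspecified, while managed code
expects the value zero-/sign-extended to 32 bits. C++ equivalents of the
emitted instructions (a sketch of the semantics, not ART code):

    #include <cstdint>

    uint32_t ExtendBool(uint32_t w0) { return w0 & 0xffu; }      // Ubfx w0, w0, #0, #8
    int32_t ExtendInt8(uint32_t w0) {                            // Sbfx w0, w0, #0, #8
      return static_cast<int8_t>(w0 & 0xffu);
    }
    uint32_t ExtendUint16(uint32_t w0) { return w0 & 0xffffu; }  // Ubfx w0, w0, #0, #16
    int32_t ExtendInt16(uint32_t w0) {                           // Sbfx w0, w0, #0, #16
      return static_cast<int16_t>(w0 & 0xffffu);
    }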
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 487d091..bebf43d 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -231,6 +231,31 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorARM64);
 };
 
+class CriticalNativeCallingConventionVisitorARM64 : public InvokeDexCallingConventionVisitor {
+ public:
+  explicit CriticalNativeCallingConventionVisitorARM64(bool for_register_allocation)
+      : for_register_allocation_(for_register_allocation) {}
+
+  virtual ~CriticalNativeCallingConventionVisitorARM64() {}
+
+  Location GetNextLocation(DataType::Type type) override;
+  Location GetReturnLocation(DataType::Type type) const override;
+  Location GetMethodLocation() const override;
+
+  size_t GetStackOffset() const { return stack_offset_; }
+
+ private:
+  // Register allocator does not support adjusting frame size, so we cannot provide final locations
+  // of stack arguments for register allocation. We ask the register allocator for any location and
+  // move these arguments to the right place after adjusting the SP when generating the call.
+  const bool for_register_allocation_;
+  size_t gpr_index_ = 0u;
+  size_t fpr_index_ = 0u;
+  size_t stack_offset_ = 0u;
+
+  DISALLOW_COPY_AND_ASSIGN(CriticalNativeCallingConventionVisitorARM64);
+};
+
 class FieldAccessCallingConventionARM64 : public FieldAccessCallingConvention {
  public:
   FieldAccessCallingConventionARM64() {}
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 1d8fd6c..9916257 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -18,6 +18,7 @@
 
 #include "arch/arm/asm_support_arm.h"
 #include "arch/arm/instruction_set_features_arm.h"
+#include "arch/arm/jni_frame_arm.h"
 #include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
@@ -2435,6 +2436,54 @@
   return LocationFrom(kMethodRegister);
 }
 
+Location CriticalNativeCallingConventionVisitorARMVIXL::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  // Native ABI uses the same registers as managed, except that the method register r0
+  // is a normal argument.
+  Location location = Location::NoLocation();
+  if (DataType::Is64BitType(type)) {
+    gpr_index_ = RoundUp(gpr_index_, 2u);
+    stack_offset_ = RoundUp(stack_offset_, 2 * kFramePointerSize);
+    if (gpr_index_ < 1u + kParameterCoreRegistersLengthVIXL) {
+      location = LocationFrom(gpr_index_ == 0u ? r0 : kParameterCoreRegistersVIXL[gpr_index_ - 1u],
+                              kParameterCoreRegistersVIXL[gpr_index_]);
+      gpr_index_ += 2u;
+    }
+  } else {
+    if (gpr_index_ < 1u + kParameterCoreRegistersLengthVIXL) {
+      location = LocationFrom(gpr_index_ == 0u ? r0 : kParameterCoreRegistersVIXL[gpr_index_ - 1u]);
+      ++gpr_index_;
+    }
+  }
+  if (location.IsInvalid()) {
+    if (DataType::Is64BitType(type)) {
+      location = Location::DoubleStackSlot(stack_offset_);
+      stack_offset_ += 2 * kFramePointerSize;
+    } else {
+      location = Location::StackSlot(stack_offset_);
+      stack_offset_ += kFramePointerSize;
+    }
+
+    if (for_register_allocation_) {
+      location = Location::Any();
+    }
+  }
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorARMVIXL::GetReturnLocation(DataType::Type type)
+    const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorARMVIXL dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorARMVIXL::GetMethodLocation() const {
+  // Pass the method in the hidden argument R4.
+  return Location::RegisterLocation(R4);
+}
+
 void CodeGeneratorARMVIXL::Move32(Location destination, Location source) {
   if (source.Equals(destination)) {
     return;
@@ -3294,7 +3343,13 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorARMVIXL calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+  } else {
+    HandleInvoke(invoke);
+  }
 }
 
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorARMVIXL* codegen) {
@@ -8856,35 +8911,35 @@
 // otherwise return a fall-back info that should be used instead.
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorARMVIXL::GetSupportedInvokeStaticOrDirectDispatch(
     const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
-    ArtMethod* method ATTRIBUTE_UNUSED) {
+    ArtMethod* method) {
+  if (desired_dispatch_info.code_ptr_location ==
+          HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    // TODO: Work around CheckTypeConsistency() in code_generator.cc that does not allow
+    // putting FP values in core registers as we need to do for the soft-float native ABI.
+    ScopedObjectAccess soa(Thread::Current());
+    uint32_t shorty_len;
+    const char* shorty = method->GetShorty(&shorty_len);
+    size_t reg = 0u;
+    for (uint32_t i = 1; i != shorty_len; ++i) {
+      size_t next_reg = reg + 1u;
+      if (shorty[i] == 'D' || shorty[i] == 'J') {
+        reg = RoundUp(reg, 2u);
+        next_reg = reg + 2u;
+      }
+      if (reg == 4u) {
+        break;
+      }
+      if (shorty[i] == 'D' || shorty[i] == 'F') {
+        HInvokeStaticOrDirect::DispatchInfo dispatch_info = desired_dispatch_info;
+        dispatch_info.code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
+        return dispatch_info;
+      }
+      reg = next_reg;
+    }
+  }
   return desired_dispatch_info;
 }
 
-vixl32::Register CodeGeneratorARMVIXL::GetInvokeStaticOrDirectExtraParameter(
-    HInvokeStaticOrDirect* invoke, vixl32::Register temp) {
-  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
-  Location location = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
-  if (!invoke->GetLocations()->Intrinsified()) {
-    return RegisterFrom(location);
-  }
-  // For intrinsics we allow any location, so it may be on the stack.
-  if (!location.IsRegister()) {
-    GetAssembler()->LoadFromOffset(kLoadWord, temp, sp, location.GetStackIndex());
-    return temp;
-  }
-  // For register locations, check if the register was saved. If so, get it from the stack.
-  // Note: There is a chance that the register was saved but not overwritten, so we could
-  // save one load. However, since this is just an intrinsic slow path we prefer this
-  // simple and more robust approach rather that trying to determine if that's the case.
-  SlowPathCode* slow_path = GetCurrentSlowPath();
-  if (slow_path != nullptr && slow_path->IsCoreRegisterSaved(RegisterFrom(location).GetCode())) {
-    int stack_offset = slow_path->GetStackOffsetOfCoreRegister(RegisterFrom(location).GetCode());
-    GetAssembler()->LoadFromOffset(kLoadWord, temp, sp, stack_offset);
-    return temp;
-  }
-  return RegisterFrom(location);
-}
-
 void CodeGeneratorARMVIXL::GenerateStaticOrDirectCall(
     HInvokeStaticOrDirect* invoke, Location temp, SlowPathCode* slow_path) {
   Location callee_method = temp;  // For all kinds except kRecursive, callee will be in temp.
@@ -8897,7 +8952,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -8932,6 +8987,20 @@
     }
   }
 
+  auto call_code_pointer_member = [&](MemberOffset offset) {
+    // LR = callee_method->member;
+    GetAssembler()->LoadFromOffset(kLoadWord, lr, RegisterFrom(callee_method), offset.Int32Value());
+    {
+      // Use a scope to help guarantee that `RecordPcInfo()` records the correct pc.
+      // blx in T32 has only 16bit encoding that's why a stricter check for the scope is used.
+      ExactAssemblyScope aas(GetVIXLAssembler(),
+                             vixl32::k16BitT32InstructionSizeInBytes,
+                             CodeBufferCheckScope::kExactSize);
+      // LR()
+      __ blx(lr);
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+    }
+  };
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       {
@@ -8943,23 +9012,46 @@
         RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       }
       break;
-    case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
-      // LR = callee_method->entry_point_from_quick_compiled_code_
-      GetAssembler()->LoadFromOffset(
-            kLoadWord,
-            lr,
-            RegisterFrom(callee_method),
-            ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize).Int32Value());
-      {
-        // Use a scope to help guarantee that `RecordPcInfo()` records the correct pc.
-        // blx in T32 has only 16bit encoding that's why a stricter check for the scope is used.
-        ExactAssemblyScope aas(GetVIXLAssembler(),
-                               vixl32::k16BitT32InstructionSizeInBytes,
-                               CodeBufferCheckScope::kExactSize);
-        // LR()
-        __ blx(lr);
-        RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorARMVIXL,
+                                    kAapcsStackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+      if (out_frame_size != 0u) {
+        __ Claim(out_frame_size);
+        GetAssembler()->cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
       }
+      call_code_pointer_member(ArtMethod::EntryPointFromJniOffset(kArmPointerSize));
+      // Move the result when needed due to native and managed ABI mismatch.
+      switch (invoke->GetType()) {
+        case DataType::Type::kFloat32:
+          __ Vmov(s0, r0);
+          break;
+        case DataType::Type::kFloat64:
+          __ Vmov(d0, r0, r1);
+          break;
+        case DataType::Type::kBool:
+        case DataType::Type::kInt8:
+        case DataType::Type::kUint16:
+        case DataType::Type::kInt16:
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ Drop(out_frame_size);
+        GetAssembler()->cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
+      call_code_pointer_member(ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArmPointerSize));
       break;
   }
 
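
The shorty scan in GetSupportedInvokeStaticOrDirectDispatch() above decides
whether any float/double argument would have to travel in r0-r3 under the
soft-float native ABI; if so, the dispatch falls back to kCallArtMethod. A
self-contained restatement of that scan (assumed semantics):

    #include <cstddef>

    constexpr size_t RoundUpToEven(size_t x) { return (x + 1u) & ~size_t{1u}; }

    bool WouldPassFpArgInCoreRegister(const char* shorty) {
      size_t reg = 0u;  // Next argument slot in r0-r3; shorty[0] is the return type.
      for (const char* p = shorty + 1; *p != '\0'; ++p) {
        size_t next_reg = reg + 1u;
        if (*p == 'D' || *p == 'J') {
          reg = RoundUpToEven(reg);  // 64-bit args start at an even register.
          next_reg = reg + 2u;
        }
        if (reg == 4u) {
          break;  // Remaining args go on the stack; FP values there are fine.
        }
        if (*p == 'D' || *p == 'F') {
          return true;
        }
        reg = next_reg;
      }
      return false;
    }

    // Examples: "VIIF" -> true (the float would land in r2);
    //           "VJJF" -> false (the float is the fifth slot, on the stack).

A float/double return value is fine either way: it comes back in r0 (or
r0/r1) and is moved to s0/d0 after the call, as shown above.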
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 3eed730..d6300c7 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -187,6 +187,30 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorARMVIXL);
 };
 
+class CriticalNativeCallingConventionVisitorARMVIXL : public InvokeDexCallingConventionVisitor {
+ public:
+  explicit CriticalNativeCallingConventionVisitorARMVIXL(bool for_register_allocation)
+      : for_register_allocation_(for_register_allocation) {}
+
+  virtual ~CriticalNativeCallingConventionVisitorARMVIXL() {}
+
+  Location GetNextLocation(DataType::Type type) override;
+  Location GetReturnLocation(DataType::Type type) const override;
+  Location GetMethodLocation() const override;
+
+  size_t GetStackOffset() const { return stack_offset_; }
+
+ private:
+  // Register allocator does not support adjusting frame size, so we cannot provide final locations
+  // of stack arguments for register allocation. We ask the register allocator for any location and
+  // move these arguments to the right place after adjusting the SP when generating the call.
+  const bool for_register_allocation_;
+  size_t gpr_index_ = 0u;
+  size_t stack_offset_ = 0u;
+
+  DISALLOW_COPY_AND_ASSIGN(CriticalNativeCallingConventionVisitorARMVIXL);
+};
+
 class FieldAccessCallingConventionARMVIXL : public FieldAccessCallingConvention {
  public:
   FieldAccessCallingConventionARMVIXL() {}
@@ -853,9 +877,6 @@
                                     uint32_t encoded_data,
                                     /*out*/ std::string* debug_name);
 
-  vixl::aarch32::Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
-                                                                vixl::aarch32::Register temp);
-
   using Uint32ToLiteralMap = ArenaSafeMap<uint32_t, VIXLUInt32Literal*>;
   using StringToLiteralMap = ArenaSafeMap<StringReference,
                                           VIXLUInt32Literal*,
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index e9ef21a..595b31e 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_x86.h"
 
+#include "arch/x86/jni_frame_x86.h"
 #include "art_method-inl.h"
 #include "class_table.h"
 #include "code_generator_utils.h"
@@ -1300,6 +1301,34 @@
   return Location::NoLocation();
 }
 
+Location CriticalNativeCallingConventionVisitorX86::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  Location location;
+  if (DataType::Is64BitType(type)) {
+    location = Location::DoubleStackSlot(stack_offset_);
+    stack_offset_ += 2 * kFramePointerSize;
+  } else {
+    location = Location::StackSlot(stack_offset_);
+    stack_offset_ += kFramePointerSize;
+  }
+  if (for_register_allocation_) {
+    location = Location::Any();
+  }
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorX86::GetReturnLocation(DataType::Type type) const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorX86 dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorX86::GetMethodLocation() const {
+  // Pass the method in the hidden argument EAX.
+  return Location::RegisterLocation(EAX);
+}
+
 void CodeGeneratorX86::Move32(Location destination, Location source) {
   if (source.Equals(destination)) {
     return;
@@ -1374,11 +1403,13 @@
       size_t elem_size = DataType::Size(DataType::Type::kInt32);
       // Create stack space for 2 elements.
       __ subl(ESP, Immediate(2 * elem_size));
+      __ cfi().AdjustCFAOffset(2 * elem_size);
       __ movl(Address(ESP, 0), source.AsRegisterPairLow<Register>());
       __ movl(Address(ESP, elem_size), source.AsRegisterPairHigh<Register>());
       __ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
       // And remove the temporary stack space we allocated.
       __ addl(ESP, Immediate(2 * elem_size));
+      __ cfi().AdjustCFAOffset(-(2 * elem_size));
     } else {
       LOG(FATAL) << "Unimplemented";
     }
@@ -2286,9 +2317,15 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorX86 calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+  } else {
+    HandleInvoke(invoke);
+  }
 
-  // For PC-relative dex cache the invoke has an extra input, the PC-relative address base.
+  // For PC-relative load kinds the invoke has an extra input, the PC-relative address base.
   if (invoke->HasPcRelativeMethodLoadKind()) {
     invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::RequiresRegister());
   }
@@ -2989,6 +3026,7 @@
           if (!in.IsDoubleStackSlot() || !out.IsStackSlot()) {
             adjustment = DataType::Size(DataType::Type::kInt64);
             __ subl(ESP, Immediate(adjustment));
+            __ cfi().AdjustCFAOffset(adjustment);
           }
 
           // Load the value to the FP stack, using temporaries if needed.
@@ -3005,6 +3043,7 @@
           // Remove the temporary stack space we allocated.
           if (adjustment != 0) {
             __ addl(ESP, Immediate(adjustment));
+            __ cfi().AdjustCFAOffset(-adjustment);
           }
           break;
         }
@@ -3039,6 +3078,7 @@
           if (!in.IsDoubleStackSlot() || !out.IsDoubleStackSlot()) {
             adjustment = DataType::Size(DataType::Type::kInt64);
             __ subl(ESP, Immediate(adjustment));
+            __ cfi().AdjustCFAOffset(adjustment);
           }
 
           // Load the value to the FP stack, using temporaries if needed.
@@ -3055,6 +3095,7 @@
           // Remove the temporary stack space we allocated.
           if (adjustment != 0) {
             __ addl(ESP, Immediate(adjustment));
+            __ cfi().AdjustCFAOffset(-adjustment);
           }
           break;
         }
@@ -3551,6 +3592,7 @@
   // Create stack space for 2 elements.
   // TODO: enhance register allocator to ask for stack temporaries.
   __ subl(ESP, Immediate(2 * elem_size));
+  __ cfi().AdjustCFAOffset(2 * elem_size);
 
   // Load the values to the FP stack in reverse order, using temporaries if needed.
   const bool is_wide = !is_float;
@@ -3591,6 +3633,7 @@
 
   // And remove the temporary stack space we allocated.
   __ addl(ESP, Immediate(2 * elem_size));
+  __ cfi().AdjustCFAOffset(-(2 * elem_size));
 }
 
 
@@ -4934,7 +4977,6 @@
 
 Register CodeGeneratorX86::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
                                                                  Register temp) {
-  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   Location location = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
   if (!invoke->GetLocations()->Intrinsified()) {
     return location.AsRegister<Register>();
@@ -4970,7 +5012,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -5009,15 +5051,73 @@
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       __ call(GetFrameEntryLabel());
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorX86,
+                                    kNativeStackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+      if (out_frame_size != 0u) {
+        __ subl(ESP, Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
+      }
+      // (callee_method + offset_of_jni_entry_point)()
+      __ call(Address(callee_method.AsRegister<Register>(),
+                      ArtMethod::EntryPointFromJniOffset(kX86PointerSize).Int32Value()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+      if (out_frame_size == 0u && DataType::IsFloatingPointType(invoke->GetType())) {
+        // Create space for conversion.
+        out_frame_size = 8u;
+        __ subl(ESP, Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(out_frame_size);
+      }
+      // Zero-/sign-extend or move the result when needed due to native and managed ABI mismatch.
+      switch (invoke->GetType()) {
+        case DataType::Type::kBool:
+          __ movzxb(EAX, AL);
+          break;
+        case DataType::Type::kInt8:
+          __ movsxb(EAX, AL);
+          break;
+        case DataType::Type::kUint16:
+          __ movzxw(EAX, EAX);
+          break;
+        case DataType::Type::kInt16:
+          __ movsxw(EAX, EAX);
+          break;
+        case DataType::Type::kFloat32:
+          __ fstps(Address(ESP, 0));
+          __ movss(XMM0, Address(ESP, 0));
+          break;
+        case DataType::Type::kFloat64:
+          __ fstpl(Address(ESP, 0));
+          __ movsd(XMM0, Address(ESP, 0));
+          break;
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ addl(ESP, Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
     case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
       // (callee_method + offset_of_quick_compiled_code)()
       __ call(Address(callee_method.AsRegister<Register>(),
                       ArtMethod::EntryPointFromQuickCompiledCodeOffset(
                           kX86PointerSize).Int32Value()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
   }
-  RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
 
   DCHECK(!IsLeafMethod());
 }
@@ -5072,7 +5172,6 @@
 }
 
 void CodeGeneratorX86::RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke) {
-  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   HX86ComputeBaseMethodAddress* method_address =
       invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
   boot_image_method_patches_.emplace_back(
@@ -5081,7 +5180,6 @@
 }
 
 void CodeGeneratorX86::RecordMethodBssEntryPatch(HInvokeStaticOrDirect* invoke) {
-  DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
   HX86ComputeBaseMethodAddress* method_address =
       invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
   // Add the patch entry and bind its label at the end of the instruction.
@@ -5126,7 +5224,6 @@
                                             uint32_t boot_image_reference,
                                             HInvokeStaticOrDirect* invoke) {
   if (GetCompilerOptions().IsBootImage()) {
-    DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
     HX86ComputeBaseMethodAddress* method_address =
         invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
     DCHECK(method_address != nullptr);
@@ -5135,7 +5232,6 @@
     __ leal(reg, Address(method_address_reg, CodeGeneratorX86::kDummy32BitOffset));
     RecordBootImageIntrinsicPatch(method_address, boot_image_reference);
   } else if (GetCompilerOptions().GetCompilePic()) {
-    DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
     HX86ComputeBaseMethodAddress* method_address =
         invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
     DCHECK(method_address != nullptr);
@@ -5160,7 +5256,6 @@
   if (GetCompilerOptions().IsBootImage()) {
     DCHECK_EQ(boot_image_offset, IntrinsicVisitor::IntegerValueOfInfo::kInvalidReference);
     // Load the class the same way as for HLoadClass::LoadKind::kBootImageLinkTimePcRelative.
-    DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
     HX86ComputeBaseMethodAddress* method_address =
         invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
     DCHECK(method_address != nullptr);
@@ -6365,24 +6460,45 @@
       __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegister<Register>());
     }
   } else if (source.IsRegisterPair()) {
+    if (destination.IsRegisterPair()) {
+      __ movl(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairLow<Register>());
+      DCHECK_NE(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairHigh<Register>());
+      __ movl(destination.AsRegisterPairHigh<Register>(), source.AsRegisterPairHigh<Register>());
+    } else if (destination.IsFpuRegister()) {
       size_t elem_size = DataType::Size(DataType::Type::kInt32);
-      // Create stack space for 2 elements.
-      __ subl(ESP, Immediate(2 * elem_size));
-      __ movl(Address(ESP, 0), source.AsRegisterPairLow<Register>());
-      __ movl(Address(ESP, elem_size), source.AsRegisterPairHigh<Register>());
+      // Push the two source registers onto the stack.
+      __ pushl(source.AsRegisterPairHigh<Register>());
+      __ cfi().AdjustCFAOffset(elem_size);
+      __ pushl(source.AsRegisterPairLow<Register>());
+      __ cfi().AdjustCFAOffset(elem_size);
+      // Load the destination register.
       __ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
       // And remove the temporary stack space we allocated.
       __ addl(ESP, Immediate(2 * elem_size));
+      __ cfi().AdjustCFAOffset(-(2 * elem_size));
+    } else {
+      DCHECK(destination.IsDoubleStackSlot());
+      __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegisterPairLow<Register>());
+      __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)),
+              source.AsRegisterPairHigh<Register>());
+    }
   } else if (source.IsFpuRegister()) {
     if (destination.IsRegister()) {
       __ movd(destination.AsRegister<Register>(), source.AsFpuRegister<XmmRegister>());
     } else if (destination.IsFpuRegister()) {
       __ movaps(destination.AsFpuRegister<XmmRegister>(), source.AsFpuRegister<XmmRegister>());
     } else if (destination.IsRegisterPair()) {
-      XmmRegister src_reg = source.AsFpuRegister<XmmRegister>();
-      __ movd(destination.AsRegisterPairLow<Register>(), src_reg);
-      __ psrlq(src_reg, Immediate(32));
-      __ movd(destination.AsRegisterPairHigh<Register>(), src_reg);
+      size_t elem_size = DataType::Size(DataType::Type::kInt32);
+      // Create stack space for 2 elements.
+      __ subl(ESP, Immediate(2 * elem_size));
+      __ cfi().AdjustCFAOffset(2 * elem_size);
+      // Store the source register to the stack.
+      __ movsd(Address(ESP, 0), source.AsFpuRegister<XmmRegister>());
+      // And pop the values into the destination registers.
+      __ popl(destination.AsRegisterPairLow<Register>());
+      __ cfi().AdjustCFAOffset(-elem_size);
+      __ popl(destination.AsRegisterPairHigh<Register>());
+      __ cfi().AdjustCFAOffset(-elem_size);
     } else if (destination.IsStackSlot()) {
       __ movss(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
     } else if (destination.IsDoubleStackSlot()) {
@@ -6480,9 +6596,12 @@
           __ xorpd(dest, dest);
         } else {
           __ pushl(high);
+          __ cfi().AdjustCFAOffset(4);
           __ pushl(low);
+          __ cfi().AdjustCFAOffset(4);
           __ movsd(dest, Address(ESP, 0));
           __ addl(ESP, Immediate(8));
+          __ cfi().AdjustCFAOffset(-8);
         }
       } else {
         DCHECK(destination.IsDoubleStackSlot()) << destination;
@@ -6520,10 +6639,12 @@
 void ParallelMoveResolverX86::Exchange128(XmmRegister reg, int mem) {
   size_t extra_slot = 4 * kX86WordSize;
   __ subl(ESP, Immediate(extra_slot));
+  __ cfi().AdjustCFAOffset(extra_slot);
   __ movups(Address(ESP, 0), XmmRegister(reg));
   ExchangeMemory(0, mem + extra_slot, 4);
   __ movups(XmmRegister(reg), Address(ESP, 0));
   __ addl(ESP, Immediate(extra_slot));
+  __ cfi().AdjustCFAOffset(-extra_slot);
 }
 
 void ParallelMoveResolverX86::ExchangeMemory(int mem1, int mem2, int number_of_words) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 43f5acd..22d8778 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -93,6 +93,29 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitorX86);
 };
 
+class CriticalNativeCallingConventionVisitorX86 : public InvokeDexCallingConventionVisitor {
+ public:
+  explicit CriticalNativeCallingConventionVisitorX86(bool for_register_allocation)
+      : for_register_allocation_(for_register_allocation) {}
+
+  virtual ~CriticalNativeCallingConventionVisitorX86() {}
+
+  Location GetNextLocation(DataType::Type type) override;
+  Location GetReturnLocation(DataType::Type type) const override;
+  Location GetMethodLocation() const override;
+
+  size_t GetStackOffset() const { return stack_offset_; }
+
+ private:
+  // The register allocator does not support adjusting the frame size, so we cannot
+  // provide the final locations of stack arguments during register allocation. We ask
+  // the register allocator for any location and move these arguments to their correct
+  // stack positions after adjusting the SP when generating the call.
+  const bool for_register_allocation_;
+  size_t stack_offset_ = 0u;
+
+  DISALLOW_COPY_AND_ASSIGN(CriticalNativeCallingConventionVisitorX86);
+};
+
 class FieldAccessCallingConventionX86 : public FieldAccessCallingConvention {
  public:
   FieldAccessCallingConventionX86() {}
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ec54376..4a0cc78 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_x86_64.h"
 
+#include "arch/x86_64/jni_frame_x86_64.h"
 #include "art_method-inl.h"
 #include "class_table.h"
 #include "code_generator_utils.h"
@@ -978,6 +979,16 @@
   UNREACHABLE();
 }
 
+void CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(LocationSummary* locations) {
+  // We have to ensure that the native code we call directly (such as @CriticalNative
+  // methods or intrinsic helpers like Math.sin()) doesn't clobber the XMM registers
+  // that are non-volatile for ART but volatile for native calls. Adding them as
+  // temporaries ensures that they are saved in the prologue and properly restored.
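+  // (Illustrative note: on x86-64 the native ABI treats all XMM registers as
+  // caller-saved, while the managed ABI keeps the non_volatile_xmm_regs set
+  // callee-saved across calls.)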
+  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
+    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
+  }
+}
+
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorX86_64::GetSupportedInvokeStaticOrDirectDispatch(
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       ArtMethod* method ATTRIBUTE_UNUSED) {
@@ -998,7 +1009,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative:
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -1032,15 +1043,61 @@
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       __ call(&frame_entry_label_);
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
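+    // @CriticalNative: reserve the native out-frame if needed, move stack arguments
+    // into place, call through the ArtMethod's JNI entrypoint, and widen the return
+    // value to the managed ABI afterwards.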
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorX86_64,
+                                    kNativeStackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+      if (out_frame_size != 0u) {
+        __ subq(CpuRegister(RSP), Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
+      }
+      // (callee_method + offset_of_jni_entry_point)()
+      __ call(Address(callee_method.AsRegister<CpuRegister>(),
+                      ArtMethod::EntryPointFromJniOffset(kX86_64PointerSize).SizeValue()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+      // Zero-/sign-extend the result when needed due to native and managed ABI mismatch.
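+      // (E.g. a jboolean result may leave garbage in the upper bits of EAX under the
+      // native ABI, while managed code expects a zero-extended 32-bit value.)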
+      switch (invoke->GetType()) {
+        case DataType::Type::kBool:
+          __ movzxb(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt8:
+          __ movsxb(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kUint16:
+          __ movzxw(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt16:
+          __ movsxw(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kFloat32:
+        case DataType::Type::kFloat64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ addq(CpuRegister(RSP), Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
     case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
       // (callee_method + offset_of_quick_compiled_code)()
       __ call(Address(callee_method.AsRegister<CpuRegister>(),
                       ArtMethod::EntryPointFromQuickCompiledCodeOffset(
                           kX86_64PointerSize).SizeValue()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
   }
-  RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
 
   DCHECK(!IsLeafMethod());
 }
@@ -2493,6 +2550,51 @@
   return Location::NoLocation();
 }
 
+Location CriticalNativeCallingConventionVisitorX86_64::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  Location location = Location::NoLocation();
+  if (DataType::IsFloatingPointType(type)) {
+    if (fpr_index_ < kParameterFloatRegistersLength) {
+      location = Location::FpuRegisterLocation(kParameterFloatRegisters[fpr_index_]);
+      ++fpr_index_;
+    }
+  } else {
+    // Native ABI uses the same registers as managed, except that the method register RDI
+    // is a normal argument.
+    if (gpr_index_ < 1u + kParameterCoreRegistersLength) {
+      location = Location::RegisterLocation(
+          gpr_index_ == 0u ? RDI : kParameterCoreRegisters[gpr_index_ - 1u]);
+      ++gpr_index_;
+    }
+  }
+  if (location.IsInvalid()) {
+    if (DataType::Is64BitType(type)) {
+      location = Location::DoubleStackSlot(stack_offset_);
+    } else {
+      location = Location::StackSlot(stack_offset_);
+    }
+    stack_offset_ += kFramePointerSize;
+
+    if (for_register_allocation_) {
+      location = Location::Any();
+    }
+  }
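+  // Example (illustrative): for a @CriticalNative method taking (int, long, float),
+  // the assignments above yield RDI, RSI and XMM0, matching the native ABI; further
+  // integer arguments would use RDX, RCX, R8 and R9 before spilling to the stack.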
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorX86_64::GetReturnLocation(DataType::Type type)
+    const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorX86_64 dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorX86_64::GetMethodLocation() const {
+  // Pass the method in the hidden argument RAX.
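+  // (Illustrative note: RAX is caller-saved and not used for argument passing in the
+  // native ABI, so it can carry the ArtMethod* without displacing any argument.)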
+  return Location::RegisterLocation(RAX);
+}
+
 void LocationsBuilderX86_64::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
   // The trampoline uses the same calling convention as dex calling conventions,
   // except instead of loading arg0/r0 with the target Method*, arg0/r0 will contain
@@ -2514,7 +2616,14 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorX86_64 calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+    CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(invoke->GetLocations());
+  } else {
+    HandleInvoke(invoke);
+  }
 }
 
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 01810f4..dcdd632 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -79,6 +79,31 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
 };
 
+class CriticalNativeCallingConventionVisitorX86_64 : public InvokeDexCallingConventionVisitor {
+ public:
+  explicit CriticalNativeCallingConventionVisitorX86_64(bool for_register_allocation)
+      : for_register_allocation_(for_register_allocation) {}
+
+  virtual ~CriticalNativeCallingConventionVisitorX86_64() {}
+
+  Location GetNextLocation(DataType::Type type) override;
+  Location GetReturnLocation(DataType::Type type) const override;
+  Location GetMethodLocation() const override;
+
+  size_t GetStackOffset() const { return stack_offset_; }
+
+ private:
+  // The register allocator does not support adjusting the frame size, so we cannot
+  // provide the final locations of stack arguments during register allocation. We ask
+  // the register allocator for any location and move these arguments to their correct
+  // stack positions after adjusting the SP when generating the call.
+  const bool for_register_allocation_;
+  size_t gpr_index_ = 0u;
+  size_t fpr_index_ = 0u;
+  size_t stack_offset_ = 0u;
+
+  DISALLOW_COPY_AND_ASSIGN(CriticalNativeCallingConventionVisitorX86_64);
+};
+
 class FieldAccessCallingConventionX86_64 : public FieldAccessCallingConvention {
  public:
   FieldAccessCallingConventionX86_64() {}
@@ -609,6 +634,8 @@
 
   void MaybeIncrementHotness(bool is_frame_entry);
 
+  static void BlockNonVolatileXmmRegisters(LocationSummary* locations);
+
   // When we don't know the proper offset for the value, we use kDummy32BitOffset.
   // We will fix this up in the linker later to have the right value.
   static constexpr int32_t kDummy32BitOffset = 256;
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index cd68b2a..60e1279 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -1530,8 +1530,8 @@
 
   if (invoke->IsInvokeStaticOrDirect() &&
       HInvokeStaticOrDirect::NeedsCurrentMethodInput(
-          invoke->AsInvokeStaticOrDirect()->GetMethodLoadKind())) {
-    DCHECK_EQ(argument_index, invoke->AsInvokeStaticOrDirect()->GetSpecialInputIndex());
+          invoke->AsInvokeStaticOrDirect()->GetDispatchInfo())) {
+    DCHECK_EQ(argument_index, invoke->AsInvokeStaticOrDirect()->GetCurrentMethodIndex());
     DCHECK(invoke->InputAt(argument_index) == nullptr);
     invoke->SetRawInputAt(argument_index, graph_->GetCurrentMethod());
   }
diff --git a/compiler/optimizing/intrinsics_utils.h b/compiler/optimizing/intrinsics_utils.h
index e24d541..29f815c 100644
--- a/compiler/optimizing/intrinsics_utils.h
+++ b/compiler/optimizing/intrinsics_utils.h
@@ -59,7 +59,12 @@
     Location method_loc = MoveArguments(codegen);
 
     if (invoke_->IsInvokeStaticOrDirect()) {
-      codegen->GenerateStaticOrDirectCall(invoke_->AsInvokeStaticOrDirect(), method_loc, this);
+      HInvokeStaticOrDirect* invoke_static_or_direct = invoke_->AsInvokeStaticOrDirect();
+      DCHECK_NE(invoke_static_or_direct->GetMethodLoadKind(),
+                HInvokeStaticOrDirect::MethodLoadKind::kRecursive);
+      DCHECK_NE(invoke_static_or_direct->GetCodePtrLocation(),
+                HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative);
+      codegen->GenerateStaticOrDirectCall(invoke_static_or_direct, method_loc, this);
     } else {
       codegen->GenerateVirtualCall(invoke_->AsInvokeVirtual(), method_loc, this);
     }
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 7a0f131..af3fd76 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -398,12 +398,7 @@
   locations->SetInAt(0, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(0)));
   locations->SetOut(Location::FpuRegisterLocation(XMM0));
 
-  // We have to ensure that the native code doesn't clobber the XMM registers which are
-  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
-  // saved in the prologue and properly restored.
-  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
-    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
-  }
+  CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(locations);
 }
 
 static void GenFPToFPCall(HInvoke* invoke, CodeGeneratorX86_64* codegen,
@@ -535,12 +530,7 @@
   locations->SetInAt(1, Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(1)));
   locations->SetOut(Location::FpuRegisterLocation(XMM0));
 
-  // We have to ensure that the native code doesn't clobber the XMM registers which are
-  // non-volatile for ART, but volatile for Native calls.  This will ensure that they are
-  // saved in the prologue and properly restored.
-  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
-    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
-  }
+  CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(locations);
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index e562b87..0eece84 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -4604,6 +4604,11 @@
     // Recursive call, use local PC-relative call instruction.
     kCallSelf,
 
+    // Use the native pointer from the ArtMethod*.
+    // Used for @CriticalNative to avoid going through the compiled stub. This call goes through
+    // a special resolution stub if the class is not initialized or no native code is registered.
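+    // Example (illustrative only): a method declared in Java as
+    //   @CriticalNative static native int nativeMultiply(int a, int b);
+    // can be dispatched with this kind.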
+    kCallCriticalNative,
+
     // Use code pointer from the ArtMethod*.
     // Used when we don't know the target code. This is also the last-resort-kind used when
     // other kinds are unimplemented or impractical (i.e. slow) on a particular architecture.
@@ -4633,9 +4638,9 @@
       : HInvoke(kInvokeStaticOrDirect,
                 allocator,
                 number_of_arguments,
-                // There is potentially one extra argument for the HCurrentMethod node, and
-                // potentially one other if the clinit check is explicit.
-                (NeedsCurrentMethodInput(dispatch_info.method_load_kind) ? 1u : 0u) +
+                // There is potentially one extra argument for the HCurrentMethod input,
+                // and one other if the clinit check is explicit. These can be removed later.
+                (NeedsCurrentMethodInput(dispatch_info) ? 1u : 0u) +
                     (clinit_check_requirement == ClinitCheckRequirement::kExplicit ? 1u : 0u),
                 return_type,
                 dex_pc,
@@ -4649,17 +4654,17 @@
 
   bool IsClonable() const override { return true; }
 
-  void SetDispatchInfo(const DispatchInfo& dispatch_info) {
+  void SetDispatchInfo(DispatchInfo dispatch_info) {
     bool had_current_method_input = HasCurrentMethodInput();
-    bool needs_current_method_input = NeedsCurrentMethodInput(dispatch_info.method_load_kind);
+    bool needs_current_method_input = NeedsCurrentMethodInput(dispatch_info);
 
     // Using the current method is the default and once we find a better
     // method load kind, we should not go back to using the current method.
     DCHECK(had_current_method_input || !needs_current_method_input);
 
     if (had_current_method_input && !needs_current_method_input) {
-      DCHECK_EQ(InputAt(GetSpecialInputIndex()), GetBlock()->GetGraph()->GetCurrentMethod());
-      RemoveInputAt(GetSpecialInputIndex());
+      DCHECK_EQ(InputAt(GetCurrentMethodIndex()), GetBlock()->GetGraph()->GetCurrentMethod());
+      RemoveInputAt(GetCurrentMethodIndex());
     }
     dispatch_info_ = dispatch_info;
   }
@@ -4668,14 +4673,6 @@
     return dispatch_info_;
   }
 
-  void AddSpecialInput(HInstruction* input) {
-    // We allow only one special input.
-    DCHECK(!IsStringInit() && !HasCurrentMethodInput());
-    DCHECK(InputCount() == GetSpecialInputIndex() ||
-           (InputCount() == GetSpecialInputIndex() + 1 && IsStaticWithExplicitClinitCheck()));
-    InsertInputAt(GetSpecialInputIndex(), input);
-  }
-
   using HInstruction::GetInputRecords;  // Keep the const version visible.
   ArrayRef<HUserRecord<HInstruction*>> GetInputRecords() override {
     ArrayRef<HUserRecord<HInstruction*>> input_records = HInvoke::GetInputRecords();
@@ -4696,7 +4693,7 @@
   }
 
   bool CanDoImplicitNullCheckOn(HInstruction* obj ATTRIBUTE_UNUSED) const override {
-    // We access the method via the dex cache so we can't do an implicit null check.
+    // We do not access the method via an object reference, so we cannot do an implicit null check.
     // TODO: for intrinsics we can generate implicit null checks.
     return false;
   }
@@ -4705,14 +4702,6 @@
     return GetType() == DataType::Type::kReference && !IsStringInit();
   }
 
-  // Get the index of the special input, if any.
-  //
-  // If the invoke HasCurrentMethodInput(), the "special input" is the current
-  // method pointer; otherwise there may be one platform-specific special input,
-  // such as PC-relative addressing base.
-  uint32_t GetSpecialInputIndex() const { return GetNumberOfArguments(); }
-  bool HasSpecialInput() const { return GetNumberOfArguments() != InputCount(); }
-
   MethodLoadKind GetMethodLoadKind() const { return dispatch_info_.method_load_kind; }
   CodePtrLocation GetCodePtrLocation() const { return dispatch_info_.code_ptr_location; }
   bool IsRecursive() const { return GetMethodLoadKind() == MethodLoadKind::kRecursive; }
@@ -4724,17 +4713,6 @@
            GetMethodLoadKind() == MethodLoadKind::kBootImageRelRo ||
            GetMethodLoadKind() == MethodLoadKind::kBssEntry;
   }
-  bool HasCurrentMethodInput() const {
-    // This function can be called only after the invoke has been fully initialized by the builder.
-    if (NeedsCurrentMethodInput(GetMethodLoadKind())) {
-      DCHECK(InputAt(GetSpecialInputIndex())->IsCurrentMethod());
-      return true;
-    } else {
-      DCHECK(InputCount() == GetSpecialInputIndex() ||
-             !InputAt(GetSpecialInputIndex())->IsCurrentMethod());
-      return false;
-    }
-  }
 
   QuickEntrypointEnum GetStringInitEntryPoint() const {
     DCHECK(IsStringInit());
@@ -4761,6 +4739,60 @@
     return target_method_;
   }
 
+  // Does this method load kind need the current method as an input?
+  static bool NeedsCurrentMethodInput(DispatchInfo dispatch_info) {
+    return dispatch_info.method_load_kind == MethodLoadKind::kRecursive ||
+           dispatch_info.method_load_kind == MethodLoadKind::kRuntimeCall ||
+           dispatch_info.code_ptr_location == CodePtrLocation::kCallCriticalNative;
+  }
+
+  // Get the index of the current method input.
+  size_t GetCurrentMethodIndex() const {
+    DCHECK(HasCurrentMethodInput());
+    return GetCurrentMethodIndexUnchecked();
+  }
+  size_t GetCurrentMethodIndexUnchecked() const {
+    return GetNumberOfArguments();
+  }
+
+  // Check if the method has a current method input.
+  bool HasCurrentMethodInput() const {
+    if (NeedsCurrentMethodInput(GetDispatchInfo())) {
+      DCHECK(InputAt(GetCurrentMethodIndexUnchecked()) == nullptr ||  // During argument setup.
+             InputAt(GetCurrentMethodIndexUnchecked())->IsCurrentMethod());
+      return true;
+    } else {
+      DCHECK(InputCount() == GetCurrentMethodIndexUnchecked() ||
+             InputAt(GetCurrentMethodIndexUnchecked()) == nullptr ||  // During argument setup.
+             !InputAt(GetCurrentMethodIndexUnchecked())->IsCurrentMethod());
+      return false;
+    }
+  }
+
+  // Get the index of the special input.
+  size_t GetSpecialInputIndex() const {
+    DCHECK(HasSpecialInput());
+    return GetSpecialInputIndexUnchecked();
+  }
+  size_t GetSpecialInputIndexUnchecked() const {
+    return GetNumberOfArguments() + (HasCurrentMethodInput() ? 1u : 0u);
+  }
+
+  // Check if the method has a special input.
+  bool HasSpecialInput() const {
+    size_t other_inputs =
+        GetSpecialInputIndexUnchecked() + (IsStaticWithExplicitClinitCheck() ? 1u : 0u);
+    size_t input_count = InputCount();
+    DCHECK_LE(input_count - other_inputs, 1u) << other_inputs << " " << input_count;
+    return other_inputs != input_count;
+  }
+
+  void AddSpecialInput(HInstruction* input) {
+    // We allow only one special input.
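+    // The resulting input order is: [ arguments..., current method (if needed),
+    // special input (if any), explicit clinit check (if required) ].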
+    DCHECK(!HasSpecialInput());
+    InsertInputAt(GetSpecialInputIndexUnchecked(), input);
+  }
+
   // Remove the HClinitCheck or the replacement HLoadClass (set as last input by
   // PrepareForRegisterAllocation::VisitClinitCheck() in lieu of the initial HClinitCheck)
   // instruction; only relevant for static calls with explicit clinit check.
@@ -4788,11 +4820,6 @@
     return IsStatic() && (GetClinitCheckRequirement() == ClinitCheckRequirement::kImplicit);
   }
 
-  // Does this method load kind need the current method as an input?
-  static bool NeedsCurrentMethodInput(MethodLoadKind kind) {
-    return kind == MethodLoadKind::kRecursive || kind == MethodLoadKind::kRuntimeCall;
-  }
-
   DECLARE_INSTRUCTION(InvokeStaticOrDirect);
 
  protected:
@@ -4815,6 +4842,7 @@
   DispatchInfo dispatch_info_;
 };
 std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::MethodLoadKind rhs);
+std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::CodePtrLocation rhs);
 std::ostream& operator<<(std::ostream& os, HInvokeStaticOrDirect::ClinitCheckRequirement rhs);
 
 class HInvokeVirtual final : public HInvoke {
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index 4ff293c..3ea1918 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -195,15 +195,6 @@
   void HandleInvoke(HInvoke* invoke) {
     HInvokeStaticOrDirect* invoke_static_or_direct = invoke->AsInvokeStaticOrDirect();
 
-    // We can't add the method address if we already have a current method pointer.
-    // This may arise when sharpening doesn't remove the current method pointer from the invoke.
-    if (invoke_static_or_direct != nullptr && invoke_static_or_direct->HasCurrentMethodInput()) {
-      // Note: This happens only for recursive calls (including compiling an intrinsic
-      // by faking a call to itself; we use kRuntimeCall for this case).
-      DCHECK(!invoke_static_or_direct->HasPcRelativeMethodLoadKind());
-      return;
-    }
-
     // If this is an invoke-static/-direct with PC-relative addressing (within boot image
     // or using .bss or .data.bimg.rel.ro), we need the PC-relative address base.
     bool base_added = false;
@@ -246,7 +237,6 @@
         // This intrinsic needs the constant area.
         if (!base_added) {
           DCHECK(invoke_static_or_direct != nullptr);
-          DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());
           HX86ComputeBaseMethodAddress* method_address = GetPCRelativeBasePointer(invoke);
           invoke_static_or_direct->AddSpecialInput(method_address);
         }
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index 1539421..04a8eab 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -124,6 +124,13 @@
     code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod;
   }
 
+  if (method_load_kind != HInvokeStaticOrDirect::MethodLoadKind::kRuntimeCall &&
+      callee->IsCriticalNative()) {
+    DCHECK_NE(method_load_kind, HInvokeStaticOrDirect::MethodLoadKind::kRecursive);
+    DCHECK(callee->IsStatic());
+    code_ptr_location = HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative;
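+    // (Illustrative note: kRuntimeCall is excluded above because it does not
+    // materialize the target ArtMethod* in a register, which the direct
+    // @CriticalNative call needs for loading the JNI entrypoint.)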
+  }
+
   if (codegen->GetGraph()->IsDebuggable()) {
     // For debuggable apps always use the code pointer from ArtMethod
     // so that we don't circumvent instrumentation stubs if installed.