jni: Add @CriticalNative optimization to speed up JNI transitions

Change-Id: I963059ac3a72dd8e6a867596c356d7062deb6da7
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index 0d16260..3f29ae5 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -24,15 +24,33 @@
 
 static_assert(kArmPointerSize == PointerSize::k32, "Unexpected ARM pointer size");
 
-// Used by hard float.
+//
+// JNI calling convention constants.
+//
+
+// List of parameters passed via registers for JNI.
+// JNI uses soft-float, so there is only a GPR list.
+static const Register kJniArgumentRegisters[] = {
+  R0, R1, R2, R3
+};
+
+static const size_t kJniArgumentRegisterCount = arraysize(kJniArgumentRegisters);
+
+//
+// Managed calling convention constants.
+//
+
+// Used by hard float. (General purpose registers.)
 static const Register kHFCoreArgumentRegisters[] = {
   R0, R1, R2, R3
 };
 
+// (VFP single-precision registers.)
 static const SRegister kHFSArgumentRegisters[] = {
   S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
 };
 
+// (VFP double-precision registers.)
 static const DRegister kHFDArgumentRegisters[] = {
   D0, D1, D2, D3, D4, D5, D6, D7
 };
@@ -40,6 +58,10 @@
 static_assert(arraysize(kHFDArgumentRegisters) * 2 == arraysize(kHFSArgumentRegisters),
     "ks d argument registers mismatch");
 
+//
+// Shared managed+JNI calling convention constants.
+//
+
 static constexpr ManagedRegister kCalleeSaveRegisters[] = {
     // Core registers.
     ArmManagedRegister::FromCoreRegister(R5),
@@ -255,23 +277,95 @@
 }
 // JNI calling convention
 
-ArmJniCallingConvention::ArmJniCallingConvention(bool is_static, bool is_synchronized,
+ArmJniCallingConvention::ArmJniCallingConvention(bool is_static,
+                                                 bool is_synchronized,
+                                                 bool is_critical_native,
                                                  const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty, kArmPointerSize) {
-  // Compute padding to ensure longs and doubles are not split in AAPCS. Ignore the 'this' jobject
-  // or jclass for static methods and the JNIEnv. We start at the aligned register r2.
-  size_t padding = 0;
-  for (size_t cur_arg = IsStatic() ? 0 : 1, cur_reg = 2; cur_arg < NumArgs(); cur_arg++) {
+    : JniCallingConvention(is_static,
+                           is_synchronized,
+                           is_critical_native,
+                           shorty,
+                           kArmPointerSize) {
+  // AAPCS 4.1 specifies fundamental alignments for each type. All of our stack arguments are
+  // usually 4-byte aligned, however longs and doubles must be 8 bytes aligned. Add padding to
+  // maintain 8-byte alignment invariant.
+  //
+  // Compute padding to ensure longs and doubles are not split in AAPCS.
+  size_t shift = 0;
+
+  size_t cur_arg, cur_reg;
+  if (LIKELY(HasExtraArgumentsForJni())) {
+    // Ignore the 'this' jobject or jclass for static methods and the JNIEnv.
+    // We start at the aligned register r2.
+    //
+    // Ignore the first 2 parameters because they are guaranteed to be aligned.
+    cur_arg = NumImplicitArgs();  // skip the "this" arg.
+    cur_reg = 2;  // skip {r0=JNIEnv, r1=jobject} / {r0=JNIEnv, r1=jclass} parameters (start at r2).
+  } else {
+    // Check every parameter.
+    cur_arg = 0;
+    cur_reg = 0;
+  }
+
+  // TODO: Maybe should just use IsCurrentParamALongOrDouble instead to be cleaner?
+  // (this just seems like an unnecessary micro-optimization).
+
+  // Shift across a logical register mapping that looks like:
+  //
+  //   | r0 | r1 | r2 | r3 | SP | SP+4| SP+8 | SP+12 | ... | SP+n | SP+n+4 |
+  //
+  //   (where SP is some arbitrary stack pointer that our 0th stack arg would go into).
+  //
+  // Any time there would normally be a long/double in an odd logical register,
+  // we have to push out the rest of the mappings by 4 bytes to maintain an 8-byte alignment.
+  //
+  // This works for both physical register pairs {r0, r1}, {r2, r3} and for when
+  // the value is on the stack.
+  //
+  // For example:
+  // (a) long would normally go into r1, but we shift it into r2
+  //  | INT | (PAD) | LONG      |
+  //  | r0  |  r1   |  r2  | r3 |
+  //
+  // (b) long would normally go into r3, but we shift it into SP
+  //  | INT | INT | INT | (PAD) | LONG     |
+  //  | r0  |  r1 |  r2 |  r3   | SP+4 SP+8|
+  //
+  // where INT is any <=4 byte arg, and LONG is any 8-byte arg.
+  for (; cur_arg < NumArgs(); cur_arg++) {
     if (IsParamALongOrDouble(cur_arg)) {
-      if ((cur_reg & 1) != 0) {
-        padding += 4;
+      if ((cur_reg & 1) != 0) {  // check that it's in a logical contiguous register pair
+        shift += 4;
         cur_reg++;  // additional bump to ensure alignment
       }
-      cur_reg++;  // additional bump to skip extra long word
+      cur_reg += 2;  // bump the iterator twice for every long argument
+    } else {
+      cur_reg++;  // bump the iterator for every non-long argument
     }
-    cur_reg++;  // bump the iterator for every argument
   }
-  padding_ = padding;
+
+  if (cur_reg < kJniArgumentRegisterCount) {
+    // As a special case when, as a result of shifting (or not) there are no arguments on the stack,
+    // we actually have 0 stack padding.
+    //
+    // For example with @CriticalNative and:
+    // (int, long) -> shifts the long but doesn't need to pad the stack
+    //
+    //          shift
+    //           \/
+    //  | INT | (PAD) | LONG      | (EMPTY) ...
+    //  | r0  |  r1   |  r2  | r3 |   SP    ...
+    //                                /\
+    //                          no stack padding
+    padding_ = 0;
+  } else {
+    padding_ = shift;
+  }
+
+  // TODO: add some new JNI tests for @CriticalNative that introduced new edge cases
+  // (a) Using r0,r1 pair = f(long,...)
+  // (b) Shifting r1 long into r2,r3 pair = f(int, long, int, ...);
+  // (c) Shifting but not introducing a stack padding = f(int, long);
 }
 
 uint32_t ArmJniCallingConvention::CoreSpillMask() const {
@@ -289,15 +383,34 @@
 
 size_t ArmJniCallingConvention::FrameSize() {
   // Method*, LR and callee save area size, local reference segment state
-  size_t frame_data_size = static_cast<size_t>(kArmPointerSize)
-      + (2 + CalleeSaveRegisters().size()) * kFramePointerSize;
-  // References plus 2 words for HandleScope header
-  size_t handle_scope_size = HandleScope::SizeOf(kArmPointerSize, ReferenceCount());
+  const size_t method_ptr_size = static_cast<size_t>(kArmPointerSize);
+  const size_t lr_return_addr_size = kFramePointerSize;
+  const size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
+  size_t frame_data_size = method_ptr_size + lr_return_addr_size + callee_save_area_size;
+
+  if (LIKELY(HasLocalReferenceSegmentState())) {
+    // local reference segment state
+    frame_data_size += kFramePointerSize;
+    // TODO: Probably better to use sizeof(IRTSegmentState) here...
+  }
+
+  // References plus link_ (pointer) and number_of_references_ (uint32_t) for HandleScope header
+  const size_t handle_scope_size = HandleScope::SizeOf(kArmPointerSize, ReferenceCount());
+
+  size_t total_size = frame_data_size;
+  if (LIKELY(HasHandleScope())) {
+    // HandleScope is sometimes excluded.
+    total_size += handle_scope_size;                                 // handle scope size
+  }
+
   // Plus return value spill area size
-  return RoundUp(frame_data_size + handle_scope_size + SizeOfReturnValue(), kStackAlignment);
+  total_size += SizeOfReturnValue();
+
+  return RoundUp(total_size, kStackAlignment);
 }
 
 size_t ArmJniCallingConvention::OutArgSize() {
+  // TODO: Identical to x86_64 except for also adding additional padding.
   return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize + padding_,
                  kStackAlignment);
 }
@@ -309,55 +422,70 @@
 // JniCallingConvention ABI follows AAPCS where longs and doubles must occur
 // in even register numbers and stack slots
 void ArmJniCallingConvention::Next() {
+  // Update the iterator by usual JNI rules.
   JniCallingConvention::Next();
-  size_t arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-  if ((itr_args_ >= 2) &&
-      (arg_pos < NumArgs()) &&
-      IsParamALongOrDouble(arg_pos)) {
-    // itr_slots_ needs to be an even number, according to AAPCS.
-    if ((itr_slots_ & 0x1u) != 0) {
+
+  if (LIKELY(HasNext())) {  // Avoid CHECK failure for IsCurrentParam
+    // Ensure slot is 8-byte aligned for longs/doubles (AAPCS).
+    if (IsCurrentParamALongOrDouble() && ((itr_slots_ & 0x1u) != 0)) {
+      // itr_slots_ needs to be an even number, according to AAPCS.
       itr_slots_++;
     }
   }
 }
 
 bool ArmJniCallingConvention::IsCurrentParamInRegister() {
-  return itr_slots_ < 4;
+  return itr_slots_ < kJniArgumentRegisterCount;
 }
 
 bool ArmJniCallingConvention::IsCurrentParamOnStack() {
   return !IsCurrentParamInRegister();
 }
 
-static const Register kJniArgumentRegisters[] = {
-  R0, R1, R2, R3
-};
 ManagedRegister ArmJniCallingConvention::CurrentParamRegister() {
-  CHECK_LT(itr_slots_, 4u);
-  int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-  if ((itr_args_ >= 2) && IsParamALongOrDouble(arg_pos)) {
-    CHECK_EQ(itr_slots_, 2u);
-    return ArmManagedRegister::FromRegisterPair(R2_R3);
+  CHECK_LT(itr_slots_, kJniArgumentRegisterCount);
+  if (IsCurrentParamALongOrDouble()) {
+    // AAPCS 5.1.1 requires 64-bit values to be in a consecutive register pair:
+    // "A double-word sized type is passed in two consecutive registers (e.g., r0 and r1, or r2 and
+    // r3). The content of the registers is as if the value had been loaded from memory
+    // representation with a single LDM instruction."
+    if (itr_slots_ == 0u) {
+      return ArmManagedRegister::FromRegisterPair(R0_R1);
+    } else if (itr_slots_ == 2u) {
+      return ArmManagedRegister::FromRegisterPair(R2_R3);
+    } else {
+      // The register can either be R0 (+R1) or R2 (+R3). Cannot be other values.
+      LOG(FATAL) << "Invalid iterator register position for a long/double " << itr_args_;
+      UNREACHABLE();
+    }
   } else {
-    return
-      ArmManagedRegister::FromCoreRegister(kJniArgumentRegisters[itr_slots_]);
+    // All other types can fit into one register.
+    return ArmManagedRegister::FromCoreRegister(kJniArgumentRegisters[itr_slots_]);
   }
 }
 
 FrameOffset ArmJniCallingConvention::CurrentParamStackOffset() {
-  CHECK_GE(itr_slots_, 4u);
+  CHECK_GE(itr_slots_, kJniArgumentRegisterCount);
   size_t offset =
-      displacement_.Int32Value() - OutArgSize() + ((itr_slots_ - 4) * kFramePointerSize);
+      displacement_.Int32Value()
+          - OutArgSize()
+          + ((itr_slots_ - kJniArgumentRegisterCount) * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
 }
 
 size_t ArmJniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = IsStatic() ? 1 : 0;  // count jclass
+  size_t static_args = HasSelfClass() ? 1 : 0;  // count jclass
   // regular argument parameters and this
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();
+  size_t param_args = NumArgs() + NumLongOrDoubleArgs();  // twice count 8-byte args
+  // XX: Why is the long/ordouble counted twice but not JNIEnv* ???
   // count JNIEnv* less arguments in registers
-  return static_args + param_args + 1 - 4;
+  size_t internal_args = (HasJniEnv() ? 1 : 0 /* jni env */);
+  size_t total_args = static_args + param_args + internal_args;
+
+  return total_args - std::min(kJniArgumentRegisterCount, static_cast<size_t>(total_args));
+
+  // TODO: Very similar to x86_64 except for the return pc.
 }
 
 }  // namespace arm
diff --git a/compiler/jni/quick/arm/calling_convention_arm.h b/compiler/jni/quick/arm/calling_convention_arm.h
index 7c717cc..249f202 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.h
+++ b/compiler/jni/quick/arm/calling_convention_arm.h
@@ -52,7 +52,10 @@
 
 class ArmJniCallingConvention FINAL : public JniCallingConvention {
  public:
-  ArmJniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  ArmJniCallingConvention(bool is_static,
+                          bool is_synchronized,
+                          bool is_critical_native,
+                          const char* shorty);
   ~ArmJniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.cc b/compiler/jni/quick/arm64/calling_convention_arm64.cc
index afa707d..3fb7b56 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.cc
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.cc
@@ -24,6 +24,13 @@
 
 static_assert(kArm64PointerSize == PointerSize::k64, "Unexpected ARM64 pointer size");
 
+// Up to how many float-like (float, double) args can be enregistered.
+// The rest of the args must go on the stack.
+constexpr size_t kMaxFloatOrDoubleRegisterArguments = 8u;
+// Up to how many integer-like (pointers, objects, longs, int, short, bool, etc) args can be
+// enregistered. The rest of the args must go on the stack.
+constexpr size_t kMaxIntLikeRegisterArguments = 8u;
+
 static const XRegister kXArgumentRegisters[] = {
   X0, X1, X2, X3, X4, X5, X6, X7
 };
@@ -211,9 +218,11 @@
 }
 
 // JNI calling convention
-Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static, bool is_synchronized,
+Arm64JniCallingConvention::Arm64JniCallingConvention(bool is_static,
+                                                     bool is_synchronized,
+                                                     bool is_critical_native,
                                                      const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty, kArm64PointerSize) {
+    : JniCallingConvention(is_static, is_synchronized, is_critical_native, shorty, kArm64PointerSize) {
 }
 
 uint32_t Arm64JniCallingConvention::CoreSpillMask() const {
@@ -230,38 +239,59 @@
 
 size_t Arm64JniCallingConvention::FrameSize() {
   // Method*, callee save area size, local reference segment state
-  size_t frame_data_size = kFramePointerSize +
-      CalleeSaveRegisters().size() * kFramePointerSize + sizeof(uint32_t);
+  //
+  // (Unlike x86_64, do not include return address, and the segment state is uint32
+  // instead of pointer).
+  size_t method_ptr_size = static_cast<size_t>(kFramePointerSize);
+  size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
+
+  size_t frame_data_size = method_ptr_size + callee_save_area_size;
+  if (LIKELY(HasLocalReferenceSegmentState())) {
+    frame_data_size += sizeof(uint32_t);
+  }
   // References plus 2 words for HandleScope header
   size_t handle_scope_size = HandleScope::SizeOf(kArm64PointerSize, ReferenceCount());
+
+  size_t total_size = frame_data_size;
+  if (LIKELY(HasHandleScope())) {
+    // HandleScope is sometimes excluded.
+    total_size += handle_scope_size;                                 // handle scope size
+  }
+
   // Plus return value spill area size
-  return RoundUp(frame_data_size + handle_scope_size + SizeOfReturnValue(), kStackAlignment);
+  total_size += SizeOfReturnValue();
+
+  return RoundUp(total_size, kStackAlignment);
 }
 
 size_t Arm64JniCallingConvention::OutArgSize() {
+  // Same as X86_64
   return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
 }
 
 ArrayRef<const ManagedRegister> Arm64JniCallingConvention::CalleeSaveRegisters() const {
+  // Same as X86_64
   return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamInRegister() {
   if (IsCurrentParamAFloatOrDouble()) {
-    return (itr_float_and_doubles_ < 8);
+    return (itr_float_and_doubles_ < kMaxFloatOrDoubleRegisterArguments);
   } else {
-    return ((itr_args_ - itr_float_and_doubles_) < 8);
+    return ((itr_args_ - itr_float_and_doubles_) < kMaxIntLikeRegisterArguments);
   }
+  // TODO: Can we just call CurrentParamRegister to figure this out?
 }
 
 bool Arm64JniCallingConvention::IsCurrentParamOnStack() {
+  // Is this ever not the same for all the architectures?
   return !IsCurrentParamInRegister();
 }
 
 ManagedRegister Arm64JniCallingConvention::CurrentParamRegister() {
   CHECK(IsCurrentParamInRegister());
   if (IsCurrentParamAFloatOrDouble()) {
-    CHECK_LT(itr_float_and_doubles_, 8u);
+    CHECK_LT(itr_float_and_doubles_, kMaxFloatOrDoubleRegisterArguments);
     if (IsCurrentParamADouble()) {
       return Arm64ManagedRegister::FromDRegister(kDArgumentRegisters[itr_float_and_doubles_]);
     } else {
@@ -269,7 +299,7 @@
     }
   } else {
     int gp_reg = itr_args_ - itr_float_and_doubles_;
-    CHECK_LT(static_cast<unsigned int>(gp_reg), 8u);
+    CHECK_LT(static_cast<unsigned int>(gp_reg), kMaxIntLikeRegisterArguments);
     if (IsCurrentParamALong() || IsCurrentParamAReference() || IsCurrentParamJniEnv())  {
       return Arm64ManagedRegister::FromXRegister(kXArgumentRegisters[gp_reg]);
     } else {
@@ -281,20 +311,30 @@
 FrameOffset Arm64JniCallingConvention::CurrentParamStackOffset() {
   CHECK(IsCurrentParamOnStack());
   size_t args_on_stack = itr_args_
-                  - std::min(8u, itr_float_and_doubles_)
-                  - std::min(8u, (itr_args_ - itr_float_and_doubles_));
+                  - std::min(kMaxFloatOrDoubleRegisterArguments,
+                             static_cast<size_t>(itr_float_and_doubles_))
+                  - std::min(kMaxIntLikeRegisterArguments,
+                             static_cast<size_t>(itr_args_ - itr_float_and_doubles_));
   size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
   CHECK_LT(offset, OutArgSize());
   return FrameOffset(offset);
+  // TODO: Seems identical to X86_64 code.
 }
 
 size_t Arm64JniCallingConvention::NumberOfOutgoingStackArgs() {
   // all arguments including JNI args
   size_t all_args = NumArgs() + NumberOfExtraArgumentsForJni();
 
-  size_t all_stack_args = all_args -
-            std::min(8u, static_cast<unsigned int>(NumFloatOrDoubleArgs())) -
-            std::min(8u, static_cast<unsigned int>((all_args - NumFloatOrDoubleArgs())));
+  DCHECK_GE(all_args, NumFloatOrDoubleArgs());
+
+  size_t all_stack_args =
+      all_args
+      - std::min(kMaxFloatOrDoubleRegisterArguments,
+                 static_cast<size_t>(NumFloatOrDoubleArgs()))
+      - std::min(kMaxIntLikeRegisterArguments,
+                 static_cast<size_t>((all_args - NumFloatOrDoubleArgs())));
+
+  // TODO: Seems similar to X86_64 code except it doesn't count return pc.
 
   return all_stack_args;
 }
diff --git a/compiler/jni/quick/arm64/calling_convention_arm64.h b/compiler/jni/quick/arm64/calling_convention_arm64.h
index 90b12e5..5618942 100644
--- a/compiler/jni/quick/arm64/calling_convention_arm64.h
+++ b/compiler/jni/quick/arm64/calling_convention_arm64.h
@@ -52,7 +52,10 @@
 
 class Arm64JniCallingConvention FINAL : public JniCallingConvention {
  public:
-  Arm64JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  Arm64JniCallingConvention(bool is_static,
+                            bool is_synchronized,
+                            bool is_critical_native,
+                            const char* shorty);
   ~Arm64JniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/calling_convention.cc b/compiler/jni/quick/calling_convention.cc
index c7ed9c9..9859b5d 100644
--- a/compiler/jni/quick/calling_convention.cc
+++ b/compiler/jni/quick/calling_convention.cc
@@ -149,19 +149,44 @@
 std::unique_ptr<JniCallingConvention> JniCallingConvention::Create(ArenaAllocator* arena,
                                                                    bool is_static,
                                                                    bool is_synchronized,
+                                                                   bool is_critical_native,
                                                                    const char* shorty,
                                                                    InstructionSet instruction_set) {
+  if (UNLIKELY(is_critical_native)) {
+    // Sanity check that the requested JNI instruction set
+    // is supported for critical natives. Not every one is.
+    switch (instruction_set) {
+      case kX86_64:
+      case kX86:
+      case kArm64:
+      case kArm:
+      case kThumb2:
+        break;
+      default:
+        is_critical_native = false;
+        LOG(WARNING) << "@CriticalNative support not implemented for " << instruction_set
+                     << "; will crash at runtime if trying to invoke such a method.";
+        // TODO: implement for MIPS/MIPS64
+    }
+  }
+
   switch (instruction_set) {
 #ifdef ART_ENABLE_CODEGEN_arm
     case kArm:
     case kThumb2:
       return std::unique_ptr<JniCallingConvention>(
-          new (arena) arm::ArmJniCallingConvention(is_static, is_synchronized, shorty));
+          new (arena) arm::ArmJniCallingConvention(is_static,
+                                                   is_synchronized,
+                                                   is_critical_native,
+                                                   shorty));
 #endif
 #ifdef ART_ENABLE_CODEGEN_arm64
     case kArm64:
       return std::unique_ptr<JniCallingConvention>(
-          new (arena) arm64::Arm64JniCallingConvention(is_static, is_synchronized, shorty));
+          new (arena) arm64::Arm64JniCallingConvention(is_static,
+                                                       is_synchronized,
+                                                       is_critical_native,
+                                                       shorty));
 #endif
 #ifdef ART_ENABLE_CODEGEN_mips
     case kMips:
@@ -176,12 +201,18 @@
 #ifdef ART_ENABLE_CODEGEN_x86
     case kX86:
       return std::unique_ptr<JniCallingConvention>(
-          new (arena) x86::X86JniCallingConvention(is_static, is_synchronized, shorty));
+          new (arena) x86::X86JniCallingConvention(is_static,
+                                                   is_synchronized,
+                                                   is_critical_native,
+                                                   shorty));
 #endif
 #ifdef ART_ENABLE_CODEGEN_x86_64
     case kX86_64:
       return std::unique_ptr<JniCallingConvention>(
-          new (arena) x86_64::X86_64JniCallingConvention(is_static, is_synchronized, shorty));
+          new (arena) x86_64::X86_64JniCallingConvention(is_static,
+                                                         is_synchronized,
+                                                         is_critical_native,
+                                                         shorty));
 #endif
     default:
       LOG(FATAL) << "Unknown InstructionSet: " << instruction_set;
@@ -199,27 +230,36 @@
 }
 
 FrameOffset JniCallingConvention::ReturnValueSaveLocation() const {
-  // Segment state is 4 bytes long
-  return FrameOffset(SavedLocalReferenceCookieOffset().Int32Value() + 4);
+  if (LIKELY(HasHandleScope())) {
+    // Initial offset already includes the displacement.
+    // -- Remove the additional local reference cookie offset if we don't have a handle scope.
+    const size_t saved_local_reference_cookie_offset =
+        SavedLocalReferenceCookieOffset().Int32Value();
+    // Segment state is 4 bytes long
+    const size_t segment_state_size = 4;
+    return FrameOffset(saved_local_reference_cookie_offset + segment_state_size);
+  } else {
+    // Include only the initial Method* as part of the offset.
+    CHECK_LT(displacement_.SizeValue(),
+             static_cast<size_t>(std::numeric_limits<int32_t>::max()));
+    return FrameOffset(displacement_.Int32Value() + static_cast<size_t>(frame_pointer_size_));
+  }
 }
 
 bool JniCallingConvention::HasNext() {
-  if (itr_args_ <= kObjectOrClass) {
+  if (IsCurrentArgExtraForJni()) {
     return true;
   } else {
-    unsigned int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
+    unsigned int arg_pos = GetIteratorPositionWithinShorty();
     return arg_pos < NumArgs();
   }
 }
 
 void JniCallingConvention::Next() {
   CHECK(HasNext());
-  if (itr_args_ > kObjectOrClass) {
-    int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-    if (IsParamALongOrDouble(arg_pos)) {
-      itr_longs_and_doubles_++;
-      itr_slots_++;
-    }
+  if (IsCurrentParamALong() || IsCurrentParamADouble()) {
+    itr_longs_and_doubles_++;
+    itr_slots_++;
   }
   if (IsCurrentParamAFloatOrDouble()) {
     itr_float_and_doubles_++;
@@ -227,63 +267,73 @@
   if (IsCurrentParamAReference()) {
     itr_refs_++;
   }
+  // This default/fallthrough case also covers the extra JNIEnv* argument,
+  // as well as any other single-slot primitives.
   itr_args_++;
   itr_slots_++;
 }
 
 bool JniCallingConvention::IsCurrentParamAReference() {
-  switch (itr_args_) {
-    case kJniEnv:
-      return false;  // JNIEnv*
-    case kObjectOrClass:
-      return true;   // jobject or jclass
-    default: {
-      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-      return IsParamAReference(arg_pos);
-    }
+  bool return_value;
+  if (SwitchExtraJniArguments(itr_args_,
+                              false,  // JNIEnv*
+                              true,   // jobject or jclass
+                              /* out parameters */
+                              &return_value)) {
+    return return_value;
+  } else {
+    int arg_pos = GetIteratorPositionWithinShorty();
+    return IsParamAReference(arg_pos);
   }
 }
 
+
 bool JniCallingConvention::IsCurrentParamJniEnv() {
+  if (UNLIKELY(!HasJniEnv())) {
+    return false;
+  }
   return (itr_args_ == kJniEnv);
 }
 
 bool JniCallingConvention::IsCurrentParamAFloatOrDouble() {
-  switch (itr_args_) {
-    case kJniEnv:
-      return false;  // JNIEnv*
-    case kObjectOrClass:
-      return false;   // jobject or jclass
-    default: {
-      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-      return IsParamAFloatOrDouble(arg_pos);
-    }
+  bool return_value;
+  if (SwitchExtraJniArguments(itr_args_,
+                              false,  // jnienv*
+                              false,  // jobject or jclass
+                              /* out parameters */
+                              &return_value)) {
+    return return_value;
+  } else {
+    int arg_pos = GetIteratorPositionWithinShorty();
+    return IsParamAFloatOrDouble(arg_pos);
   }
 }
 
 bool JniCallingConvention::IsCurrentParamADouble() {
-  switch (itr_args_) {
-    case kJniEnv:
-      return false;  // JNIEnv*
-    case kObjectOrClass:
-      return false;   // jobject or jclass
-    default: {
-      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-      return IsParamADouble(arg_pos);
-    }
+  bool return_value;
+  if (SwitchExtraJniArguments(itr_args_,
+                              false,  // jnienv*
+                              false,  // jobject or jclass
+                              /* out parameters */
+                              &return_value)) {
+    return return_value;
+  } else {
+    int arg_pos = GetIteratorPositionWithinShorty();
+    return IsParamADouble(arg_pos);
   }
 }
 
 bool JniCallingConvention::IsCurrentParamALong() {
-  switch (itr_args_) {
-    case kJniEnv:
-      return false;  // JNIEnv*
-    case kObjectOrClass:
-      return false;   // jobject or jclass
-    default: {
-      int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
-      return IsParamALong(arg_pos);
-    }
+  bool return_value;
+  if (SwitchExtraJniArguments(itr_args_,
+                              false,  // jnienv*
+                              false,  // jobject or jclass
+                              /* out parameters */
+                              &return_value)) {
+    return return_value;
+  } else {
+    int arg_pos = GetIteratorPositionWithinShorty();
+    return IsParamALong(arg_pos);
   }
 }
 
@@ -297,19 +347,93 @@
   return FrameOffset(result);
 }
 
-size_t JniCallingConvention::CurrentParamSize() {
-  if (itr_args_ <= kObjectOrClass) {
+size_t JniCallingConvention::CurrentParamSize() const {
+  if (IsCurrentArgExtraForJni()) {
     return static_cast<size_t>(frame_pointer_size_);  // JNIEnv or jobject/jclass
   } else {
-    int arg_pos = itr_args_ - NumberOfExtraArgumentsForJni();
+    int arg_pos = GetIteratorPositionWithinShorty();
     return ParamSize(arg_pos);
   }
 }
 
-size_t JniCallingConvention::NumberOfExtraArgumentsForJni() {
-  // The first argument is the JNIEnv*.
-  // Static methods have an extra argument which is the jclass.
-  return IsStatic() ? 2 : 1;
+size_t JniCallingConvention::NumberOfExtraArgumentsForJni() const {
+  if (LIKELY(HasExtraArgumentsForJni())) {
+    // The first argument is the JNIEnv*.
+    // Static methods have an extra argument which is the jclass.
+    return IsStatic() ? 2 : 1;
+  } else {
+    // Critical natives exclude the JNIEnv and the jclass/this parameters.
+    return 0;
+  }
 }
 
+bool JniCallingConvention::HasHandleScope() const {
+  // Exclude HandleScope for @CriticalNative methods for optimization speed.
+  return is_critical_native_ == false;
+}
+
+bool JniCallingConvention::HasLocalReferenceSegmentState() const {
+  // Exclude local reference segment states for @CriticalNative methods for optimization speed.
+  return is_critical_native_ == false;
+}
+
+bool JniCallingConvention::HasJniEnv() const {
+  // Exclude "JNIEnv*" parameter for @CriticalNative methods.
+  return HasExtraArgumentsForJni();
+}
+
+bool JniCallingConvention::HasSelfClass() const {
+  if (!IsStatic()) {
+    // Virtual functions: There is never an implicit jclass parameter.
+    return false;
+  } else {
+    // Static functions: There is an implicit jclass parameter unless it's @CriticalNative.
+    return HasExtraArgumentsForJni();
+  }
+}
+
+bool JniCallingConvention::HasExtraArgumentsForJni() const {
+  // @CriticalNative jni implementations exclude both JNIEnv* and the jclass/jobject parameters.
+  return is_critical_native_ == false;
+}
+
+unsigned int JniCallingConvention::GetIteratorPositionWithinShorty() const {
+  // We need to subtract out the extra JNI arguments if we want to use this iterator position
+  // with the inherited CallingConvention member functions, which rely on scanning the shorty.
+  // Note that our shorty does *not* include the JNIEnv, jclass/jobject parameters.
+  DCHECK_GE(itr_args_, NumberOfExtraArgumentsForJni());
+  return itr_args_ - NumberOfExtraArgumentsForJni();
+}
+
+bool JniCallingConvention::IsCurrentArgExtraForJni() const {
+  if (UNLIKELY(!HasExtraArgumentsForJni())) {
+    return false;  // If there are no extra args, we can never be an extra.
+  }
+  // Only parameters kJniEnv and kObjectOrClass are considered extra.
+  return itr_args_ <= kObjectOrClass;
+}
+
+bool JniCallingConvention::SwitchExtraJniArguments(size_t switch_value,
+                                                   bool case_jni_env,
+                                                   bool case_object_or_class,
+                                                   /* out parameters */
+                                                   bool* return_value) const {
+  DCHECK(return_value != nullptr);
+  if (UNLIKELY(!HasExtraArgumentsForJni())) {
+    return false;
+  }
+
+  switch (switch_value) {
+    case kJniEnv:
+      *return_value = case_jni_env;
+      return true;
+    case kObjectOrClass:
+      *return_value = case_object_or_class;
+      return true;
+    default:
+      return false;
+  }
+}
+
+
 }  // namespace art
diff --git a/compiler/jni/quick/calling_convention.h b/compiler/jni/quick/calling_convention.h
index 995fa51..3d89146 100644
--- a/compiler/jni/quick/calling_convention.h
+++ b/compiler/jni/quick/calling_convention.h
@@ -161,6 +161,12 @@
   size_t NumArgs() const {
     return num_args_;
   }
+  // Implicit argument count: 1 for instance functions, 0 for static functions.
+  // (The implicit argument is only relevant to the shorty, i.e.
+  // the 0th arg is not in the shorty if it's implicit).
+  size_t NumImplicitArgs() const {
+    return IsStatic() ? 0 : 1;
+  }
   size_t NumLongOrDoubleArgs() const {
     return num_long_or_double_args_;
   }
@@ -281,6 +287,7 @@
   static std::unique_ptr<JniCallingConvention> Create(ArenaAllocator* arena,
                                                       bool is_static,
                                                       bool is_synchronized,
+                                                      bool is_critical_native,
                                                       const char* shorty,
                                                       InstructionSet instruction_set);
 
@@ -288,7 +295,8 @@
   // always at the bottom of a frame, but this doesn't work for outgoing
   // native args). Includes alignment.
   virtual size_t FrameSize() = 0;
-  // Size of outgoing arguments, including alignment
+  // Size of outgoing arguments (stack portion), including alignment.
+  // -- Arguments that are passed via registers are excluded from this size.
   virtual size_t OutArgSize() = 0;
   // Number of references in stack indirect reference table
   size_t ReferenceCount() const;
@@ -319,8 +327,11 @@
   bool IsCurrentParamAFloatOrDouble();
   bool IsCurrentParamADouble();
   bool IsCurrentParamALong();
+  bool IsCurrentParamALongOrDouble() {
+    return IsCurrentParamALong() || IsCurrentParamADouble();
+  }
   bool IsCurrentParamJniEnv();
-  size_t CurrentParamSize();
+  size_t CurrentParamSize() const;
   virtual bool IsCurrentParamInRegister() = 0;
   virtual bool IsCurrentParamOnStack() = 0;
   virtual ManagedRegister CurrentParamRegister() = 0;
@@ -359,18 +370,62 @@
     kObjectOrClass = 1
   };
 
+  // TODO: remove this constructor once all are changed to the below one.
   JniCallingConvention(bool is_static,
                        bool is_synchronized,
                        const char* shorty,
                        PointerSize frame_pointer_size)
-      : CallingConvention(is_static, is_synchronized, shorty, frame_pointer_size) {}
+      : CallingConvention(is_static, is_synchronized, shorty, frame_pointer_size),
+        is_critical_native_(false) {}
+
+  JniCallingConvention(bool is_static,
+                       bool is_synchronized,
+                       bool is_critical_native,
+                       const char* shorty,
+                       PointerSize frame_pointer_size)
+      : CallingConvention(is_static, is_synchronized, shorty, frame_pointer_size),
+        is_critical_native_(is_critical_native) {}
 
   // Number of stack slots for outgoing arguments, above which the handle scope is
   // located
   virtual size_t NumberOfOutgoingStackArgs() = 0;
 
  protected:
-  size_t NumberOfExtraArgumentsForJni();
+  size_t NumberOfExtraArgumentsForJni() const;
+
+  // Does the transition have a StackHandleScope?
+  bool HasHandleScope() const;
+  // Does the transition have a local reference segment state?
+  bool HasLocalReferenceSegmentState() const;
+  // Has a JNIEnv* parameter implicitly?
+  bool HasJniEnv() const;
+  // Has a 'jclass' parameter implicitly?
+  bool HasSelfClass() const;
+
+  // Are there extra JNI arguments (JNIEnv* and maybe jclass)?
+  bool HasExtraArgumentsForJni() const;
+
+  // Returns the position of itr_args_, fixed up by removing the offset of extra JNI arguments.
+  unsigned int GetIteratorPositionWithinShorty() const;
+
+  // Is the current argument (at the iterator) an extra argument for JNI?
+  bool IsCurrentArgExtraForJni() const;
+
+  const bool is_critical_native_;
+
+ private:
+  // Shorthand for switching on the switch value but only IF there are extra JNI arguments.
+  //
+  // Puts the case value into return_value.
+  // * (switch_value == kJniEnv) => case_jni_env
+  // * (switch_value == kObjectOrClass) => case_object_or_class
+  //
+  // Returns false otherwise (or if there are no extra JNI arguments).
+  bool SwitchExtraJniArguments(size_t switch_value,
+                               bool case_jni_env,
+                               bool case_object_or_class,
+                               /* out parameters */
+                               bool* return_value) const;
 };
 
 }  // namespace art
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index d092c3f..7e58d78 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -90,8 +90,10 @@
   const InstructionSetFeatures* instruction_set_features = driver->GetInstructionSetFeatures();
 
   // i.e. if the method was annotated with @FastNative
-  const bool is_fast_native =
-      (static_cast<uint32_t>(optimization_flags) & Compiler::kFastNative) != 0;
+  const bool is_fast_native = (optimization_flags == Compiler::kFastNative);
+
+  // i.e. if the method was annotated with @CriticalNative
+  bool is_critical_native = (optimization_flags == Compiler::kCriticalNative);
 
   VLOG(jni) << "JniCompile: Method :: "
               << art::PrettyMethod(method_idx, dex_file, /* with signature */ true)
@@ -102,12 +104,50 @@
               << art::PrettyMethod(method_idx, dex_file, /* with signature */ true);
   }
 
+  if (UNLIKELY(is_critical_native)) {
+    VLOG(jni) << "JniCompile: Critical native method detected :: "
+              << art::PrettyMethod(method_idx, dex_file, /* with signature */ true);
+  }
+
+  if (kIsDebugBuild) {
+    // Don't allow both @FastNative and @CriticalNative. They are mutually exclusive.
+    if (UNLIKELY(is_fast_native && is_critical_native)) {
+      LOG(FATAL) << "JniCompile: Method cannot be both @CriticalNative and @FastNative"
+                 << art::PrettyMethod(method_idx, dex_file, /* with_signature */ true);
+    }
+
+    // @CriticalNative - extra checks:
+    // -- Don't allow virtual criticals
+    // -- Don't allow synchronized criticals
+    // -- Don't allow any objects as parameter or return value
+    if (UNLIKELY(is_critical_native)) {
+      CHECK(is_static)
+          << "@CriticalNative functions cannot be virtual since that would"
+          << "require passing a reference parameter (this), which is illegal "
+          << art::PrettyMethod(method_idx, dex_file, /* with_signature */ true);
+      CHECK(!is_synchronized)
+          << "@CriticalNative functions cannot be synchronized since that would"
+          << "require passing a (class and/or this) reference parameter, which is illegal "
+          << art::PrettyMethod(method_idx, dex_file, /* with_signature */ true);
+      for (size_t i = 0; i < strlen(shorty); ++i) {
+        CHECK_NE(Primitive::kPrimNot, Primitive::GetType(shorty[i]))
+            << "@CriticalNative methods' shorty types must not have illegal references "
+            << art::PrettyMethod(method_idx, dex_file, /* with_signature */ true);
+      }
+    }
+  }
+
   ArenaPool pool;
   ArenaAllocator arena(&pool);
 
   // Calling conventions used to iterate over parameters to method
-  std::unique_ptr<JniCallingConvention> main_jni_conv(
-      JniCallingConvention::Create(&arena, is_static, is_synchronized, shorty, instruction_set));
+  std::unique_ptr<JniCallingConvention> main_jni_conv =
+      JniCallingConvention::Create(&arena,
+                                   is_static,
+                                   is_synchronized,
+                                   is_critical_native,
+                                   shorty,
+                                   instruction_set);
   bool reference_return = main_jni_conv->IsReturnAReference();
 
   std::unique_ptr<ManagedRuntimeCallingConvention> mr_conv(
@@ -127,8 +167,13 @@
     jni_end_shorty = "V";
   }
 
-  std::unique_ptr<JniCallingConvention> end_jni_conv(JniCallingConvention::Create(
-      &arena, is_static, is_synchronized, jni_end_shorty, instruction_set));
+  std::unique_ptr<JniCallingConvention> end_jni_conv(
+      JniCallingConvention::Create(&arena,
+                                   is_static,
+                                   is_synchronized,
+                                   is_critical_native,
+                                   jni_end_shorty,
+                                   instruction_set));
 
   // Assembler that holds generated instructions
   std::unique_ptr<JNIMacroAssembler<kPointerSize>> jni_asm =
@@ -141,75 +186,89 @@
   const Offset monitor_enter(OFFSETOF_MEMBER(JNINativeInterface, MonitorEnter));
   const Offset monitor_exit(OFFSETOF_MEMBER(JNINativeInterface, MonitorExit));
 
-  // 1. Build the frame saving all callee saves
-  const size_t frame_size(main_jni_conv->FrameSize());
+  // 1. Build the frame saving all callee saves, Method*, and PC return address.
+  const size_t frame_size(main_jni_conv->FrameSize());  // Excludes outgoing args.
   ArrayRef<const ManagedRegister> callee_save_regs = main_jni_conv->CalleeSaveRegisters();
   __ BuildFrame(frame_size, mr_conv->MethodRegister(), callee_save_regs, mr_conv->EntrySpills());
   DCHECK_EQ(jni_asm->cfi().GetCurrentCFAOffset(), static_cast<int>(frame_size));
 
-  // 2. Set up the HandleScope
-  mr_conv->ResetIterator(FrameOffset(frame_size));
-  main_jni_conv->ResetIterator(FrameOffset(0));
-  __ StoreImmediateToFrame(main_jni_conv->HandleScopeNumRefsOffset(),
-                           main_jni_conv->ReferenceCount(),
-                           mr_conv->InterproceduralScratchRegister());
+  if (LIKELY(!is_critical_native)) {
+    // NOTE: @CriticalNative methods don't have a HandleScope
+    //       because they can't have any reference parameters or return values.
 
-  __ CopyRawPtrFromThread(main_jni_conv->HandleScopeLinkOffset(),
-                          Thread::TopHandleScopeOffset<kPointerSize>(),
-                          mr_conv->InterproceduralScratchRegister());
-  __ StoreStackOffsetToThread(Thread::TopHandleScopeOffset<kPointerSize>(),
-                              main_jni_conv->HandleScopeOffset(),
-                              mr_conv->InterproceduralScratchRegister());
+    // 2. Set up the HandleScope
+    mr_conv->ResetIterator(FrameOffset(frame_size));
+    main_jni_conv->ResetIterator(FrameOffset(0));
+    __ StoreImmediateToFrame(main_jni_conv->HandleScopeNumRefsOffset(),
+                             main_jni_conv->ReferenceCount(),
+                             mr_conv->InterproceduralScratchRegister());
 
-  // 3. Place incoming reference arguments into handle scope
-  main_jni_conv->Next();  // Skip JNIEnv*
-  // 3.5. Create Class argument for static methods out of passed method
-  if (is_static) {
-    FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
-    // Check handle scope offset is within frame
-    CHECK_LT(handle_scope_offset.Uint32Value(), frame_size);
-    // Note this LoadRef() doesn't need heap unpoisoning since it's from the ArtMethod.
-    // Note this LoadRef() does not include read barrier. It will be handled below.
-    __ LoadRef(main_jni_conv->InterproceduralScratchRegister(),
-               mr_conv->MethodRegister(), ArtMethod::DeclaringClassOffset(), false);
-    __ VerifyObject(main_jni_conv->InterproceduralScratchRegister(), false);
-    __ StoreRef(handle_scope_offset, main_jni_conv->InterproceduralScratchRegister());
-    main_jni_conv->Next();  // in handle scope so move to next argument
-  }
-  while (mr_conv->HasNext()) {
-    CHECK(main_jni_conv->HasNext());
-    bool ref_param = main_jni_conv->IsCurrentParamAReference();
-    CHECK(!ref_param || mr_conv->IsCurrentParamAReference());
-    // References need placing in handle scope and the entry value passing
-    if (ref_param) {
-      // Compute handle scope entry, note null is placed in the handle scope but its boxed value
-      // must be null.
+    __ CopyRawPtrFromThread(main_jni_conv->HandleScopeLinkOffset(),
+                            Thread::TopHandleScopeOffset<kPointerSize>(),
+                            mr_conv->InterproceduralScratchRegister());
+    __ StoreStackOffsetToThread(Thread::TopHandleScopeOffset<kPointerSize>(),
+                                main_jni_conv->HandleScopeOffset(),
+                                mr_conv->InterproceduralScratchRegister());
+
+    // 3. Place incoming reference arguments into handle scope
+    main_jni_conv->Next();  // Skip JNIEnv*
+    // 3.5. Create Class argument for static methods out of passed method
+    if (is_static) {
       FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
-      // Check handle scope offset is within frame and doesn't run into the saved segment state.
+      // Check handle scope offset is within frame
       CHECK_LT(handle_scope_offset.Uint32Value(), frame_size);
-      CHECK_NE(handle_scope_offset.Uint32Value(),
-               main_jni_conv->SavedLocalReferenceCookieOffset().Uint32Value());
-      bool input_in_reg = mr_conv->IsCurrentParamInRegister();
-      bool input_on_stack = mr_conv->IsCurrentParamOnStack();
-      CHECK(input_in_reg || input_on_stack);
-
-      if (input_in_reg) {
-        ManagedRegister in_reg  =  mr_conv->CurrentParamRegister();
-        __ VerifyObject(in_reg, mr_conv->IsCurrentArgPossiblyNull());
-        __ StoreRef(handle_scope_offset, in_reg);
-      } else if (input_on_stack) {
-        FrameOffset in_off  = mr_conv->CurrentParamStackOffset();
-        __ VerifyObject(in_off, mr_conv->IsCurrentArgPossiblyNull());
-        __ CopyRef(handle_scope_offset, in_off,
-                   mr_conv->InterproceduralScratchRegister());
-      }
+      // Note this LoadRef() doesn't need heap unpoisoning since it's from the ArtMethod.
+      // Note this LoadRef() does not include read barrier. It will be handled below.
+      //
+      // scratchRegister = *method[DeclaringClassOffset()];
+      __ LoadRef(main_jni_conv->InterproceduralScratchRegister(),
+                 mr_conv->MethodRegister(), ArtMethod::DeclaringClassOffset(), false);
+      __ VerifyObject(main_jni_conv->InterproceduralScratchRegister(), false);
+      // *handleScopeOffset = scratchRegister
+      __ StoreRef(handle_scope_offset, main_jni_conv->InterproceduralScratchRegister());
+      main_jni_conv->Next();  // in handle scope so move to next argument
     }
-    mr_conv->Next();
-    main_jni_conv->Next();
-  }
+    // Place every reference into the handle scope (ignore other parameters).
+    while (mr_conv->HasNext()) {
+      CHECK(main_jni_conv->HasNext());
+      bool ref_param = main_jni_conv->IsCurrentParamAReference();
+      CHECK(!ref_param || mr_conv->IsCurrentParamAReference());
+      // References need placing in handle scope and the entry value passing
+      if (ref_param) {
+        // Compute handle scope entry, note null is placed in the handle scope but its boxed value
+        // must be null.
+        FrameOffset handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
+        // Check handle scope offset is within frame and doesn't run into the saved segment state.
+        CHECK_LT(handle_scope_offset.Uint32Value(), frame_size);
+        CHECK_NE(handle_scope_offset.Uint32Value(),
+                 main_jni_conv->SavedLocalReferenceCookieOffset().Uint32Value());
+        bool input_in_reg = mr_conv->IsCurrentParamInRegister();
+        bool input_on_stack = mr_conv->IsCurrentParamOnStack();
+        CHECK(input_in_reg || input_on_stack);
 
-  // 4. Write out the end of the quick frames.
-  __ StoreStackPointerToThread(Thread::TopOfManagedStackOffset<kPointerSize>());
+        if (input_in_reg) {
+          ManagedRegister in_reg  =  mr_conv->CurrentParamRegister();
+          __ VerifyObject(in_reg, mr_conv->IsCurrentArgPossiblyNull());
+          __ StoreRef(handle_scope_offset, in_reg);
+        } else if (input_on_stack) {
+          FrameOffset in_off  = mr_conv->CurrentParamStackOffset();
+          __ VerifyObject(in_off, mr_conv->IsCurrentArgPossiblyNull());
+          __ CopyRef(handle_scope_offset, in_off,
+                     mr_conv->InterproceduralScratchRegister());
+        }
+      }
+      mr_conv->Next();
+      main_jni_conv->Next();
+    }
+
+    // 4. Write out the end of the quick frames.
+    __ StoreStackPointerToThread(Thread::TopOfManagedStackOffset<kPointerSize>());
+
+    // NOTE: @CriticalNative does not need to store the stack pointer to the thread
+    //       because garbage collections are disabled within the execution of a
+    //       @CriticalNative method.
+    //       (TODO: We could probably disable it for @FastNative too).
+  }  // if (!is_critical_native)
 
   // 5. Move frame down to allow space for out going args.
   const size_t main_out_arg_size = main_jni_conv->OutArgSize();
@@ -218,7 +277,9 @@
 
   // Call the read barrier for the declaring class loaded from the method for a static call.
   // Note that we always have outgoing param space available for at least two params.
-  if (kUseReadBarrier && is_static) {
+  if (kUseReadBarrier && is_static && !is_critical_native) {
+    // XX: Why is this necessary only for the jclass? Why not for every single object ref?
+    // Skip this for @CriticalNative because we didn't build a HandleScope to begin with.
     ThreadOffset<kPointerSize> read_barrier = QUICK_ENTRYPOINT_OFFSET(kPointerSize,
                                                                       pReadBarrierJni);
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
@@ -255,46 +316,56 @@
   //    can occur. The result is the saved JNI local state that is restored by the exit call. We
   //    abuse the JNI calling convention here, that is guaranteed to support passing 2 pointer
   //    arguments.
-  ThreadOffset<kPointerSize> jni_start =
-      is_synchronized
-          ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStartSynchronized)
-          : (is_fast_native
-                 ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastStart)
-                 : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart));
+  FrameOffset locked_object_handle_scope_offset(0xBEEFDEAD);
+  if (LIKELY(!is_critical_native)) {
+    // Skip this for @CriticalNative methods. They do not call JniMethodStart.
+    ThreadOffset<kPointerSize> jni_start =
+        is_synchronized
+            ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStartSynchronized)
+            : (is_fast_native
+                   ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastStart)
+                   : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodStart));
 
-  main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-  FrameOffset locked_object_handle_scope_offset(0);
-  if (is_synchronized) {
-    // Pass object for locking.
-    main_jni_conv->Next();  // Skip JNIEnv.
-    locked_object_handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    if (main_jni_conv->IsCurrentParamOnStack()) {
-      FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
-      __ CreateHandleScopeEntry(out_off, locked_object_handle_scope_offset,
-                                mr_conv->InterproceduralScratchRegister(), false);
-    } else {
-      ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
-      __ CreateHandleScopeEntry(out_reg, locked_object_handle_scope_offset,
-                                ManagedRegister::NoRegister(), false);
+    locked_object_handle_scope_offset = FrameOffset(0);
+    if (is_synchronized) {
+      // Pass object for locking.
+      main_jni_conv->Next();  // Skip JNIEnv.
+      locked_object_handle_scope_offset = main_jni_conv->CurrentParamHandleScopeEntryOffset();
+      main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
+      if (main_jni_conv->IsCurrentParamOnStack()) {
+        FrameOffset out_off = main_jni_conv->CurrentParamStackOffset();
+        __ CreateHandleScopeEntry(out_off, locked_object_handle_scope_offset,
+                                  mr_conv->InterproceduralScratchRegister(), false);
+      } else {
+        ManagedRegister out_reg = main_jni_conv->CurrentParamRegister();
+        __ CreateHandleScopeEntry(out_reg, locked_object_handle_scope_offset,
+                                  ManagedRegister::NoRegister(), false);
+      }
+      main_jni_conv->Next();
     }
-    main_jni_conv->Next();
+    if (main_jni_conv->IsCurrentParamInRegister()) {
+      __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
+      __ Call(main_jni_conv->CurrentParamRegister(),
+              Offset(jni_start),
+              main_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset(),
+                          main_jni_conv->InterproceduralScratchRegister());
+      __ CallFromThread(jni_start, main_jni_conv->InterproceduralScratchRegister());
+    }
+    if (is_synchronized) {  // Check for exceptions from monitor enter.
+      __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), main_out_arg_size);
+    }
   }
-  if (main_jni_conv->IsCurrentParamInRegister()) {
-    __ GetCurrentThread(main_jni_conv->CurrentParamRegister());
-    __ Call(main_jni_conv->CurrentParamRegister(),
-            Offset(jni_start),
-            main_jni_conv->InterproceduralScratchRegister());
-  } else {
-    __ GetCurrentThread(main_jni_conv->CurrentParamStackOffset(),
-                        main_jni_conv->InterproceduralScratchRegister());
-    __ CallFromThread(jni_start, main_jni_conv->InterproceduralScratchRegister());
+
+  // Store into stack_frame[saved_cookie_offset] the return value of JniMethodStart.
+  FrameOffset saved_cookie_offset(
+      FrameOffset(0xDEADBEEFu));  // @CriticalNative - use obviously bad value for debugging
+  if (LIKELY(!is_critical_native)) {
+    saved_cookie_offset = main_jni_conv->SavedLocalReferenceCookieOffset();
+    __ Store(saved_cookie_offset, main_jni_conv->IntReturnRegister(), 4 /* sizeof cookie */);
   }
-  if (is_synchronized) {  // Check for exceptions from monitor enter.
-    __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), main_out_arg_size);
-  }
-  FrameOffset saved_cookie_offset = main_jni_conv->SavedLocalReferenceCookieOffset();
-  __ Store(saved_cookie_offset, main_jni_conv->IntReturnRegister(), 4);
 
   // 7. Iterate over arguments placing values from managed calling convention in
   //    to the convention required for a native call (shuffling). For references
@@ -315,9 +386,13 @@
   for (uint32_t i = 0; i < args_count; ++i) {
     mr_conv->ResetIterator(FrameOffset(frame_size + main_out_arg_size));
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-    main_jni_conv->Next();  // Skip JNIEnv*.
-    if (is_static) {
-      main_jni_conv->Next();  // Skip Class for now.
+
+    // Skip the extra JNI parameters for now.
+    if (LIKELY(!is_critical_native)) {
+      main_jni_conv->Next();    // Skip JNIEnv*.
+      if (is_static) {
+        main_jni_conv->Next();  // Skip Class for now.
+      }
     }
     // Skip to the argument we're interested in.
     for (uint32_t j = 0; j < args_count - i - 1; ++j) {
@@ -326,7 +401,7 @@
     }
     CopyParameter(jni_asm.get(), mr_conv.get(), main_jni_conv.get(), frame_size, main_out_arg_size);
   }
-  if (is_static) {
+  if (is_static && !is_critical_native) {
     // Create argument for Class
     mr_conv->ResetIterator(FrameOffset(frame_size + main_out_arg_size));
     main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
@@ -344,24 +419,30 @@
     }
   }
 
-  // 8. Create 1st argument, the JNI environment ptr.
+  // Set the iterator back to the incoming Method*.
   main_jni_conv->ResetIterator(FrameOffset(main_out_arg_size));
-  // Register that will hold local indirect reference table
-  if (main_jni_conv->IsCurrentParamInRegister()) {
-    ManagedRegister jni_env = main_jni_conv->CurrentParamRegister();
-    DCHECK(!jni_env.Equals(main_jni_conv->InterproceduralScratchRegister()));
-    __ LoadRawPtrFromThread(jni_env, Thread::JniEnvOffset<kPointerSize>());
-  } else {
-    FrameOffset jni_env = main_jni_conv->CurrentParamStackOffset();
-    __ CopyRawPtrFromThread(jni_env,
-                            Thread::JniEnvOffset<kPointerSize>(),
-                            main_jni_conv->InterproceduralScratchRegister());
+  if (LIKELY(!is_critical_native)) {
+    // 8. Create 1st argument, the JNI environment ptr.
+    // Register that will hold local indirect reference table
+    if (main_jni_conv->IsCurrentParamInRegister()) {
+      ManagedRegister jni_env = main_jni_conv->CurrentParamRegister();
+      DCHECK(!jni_env.Equals(main_jni_conv->InterproceduralScratchRegister()));
+      __ LoadRawPtrFromThread(jni_env, Thread::JniEnvOffset<kPointerSize>());
+    } else {
+      FrameOffset jni_env = main_jni_conv->CurrentParamStackOffset();
+      __ CopyRawPtrFromThread(jni_env,
+                              Thread::JniEnvOffset<kPointerSize>(),
+                              main_jni_conv->InterproceduralScratchRegister());
+    }
   }
 
   // 9. Plant call to native code associated with method.
-  MemberOffset jni_entrypoint_offset = ArtMethod::EntryPointFromJniOffset(
-      InstructionSetPointerSize(instruction_set));
-  __ Call(main_jni_conv->MethodStackOffset(), jni_entrypoint_offset,
+  MemberOffset jni_entrypoint_offset =
+      ArtMethod::EntryPointFromJniOffset(InstructionSetPointerSize(instruction_set));
+  // FIXME: Not sure if MethodStackOffset will work here. What does it even do?
+  __ Call(main_jni_conv->MethodStackOffset(),
+          jni_entrypoint_offset,
+          // XX: Why not the jni conv scratch register?
           mr_conv->InterproceduralScratchRegister());
 
   // 10. Fix differences in result widths.
@@ -377,20 +458,45 @@
     }
   }
 
-  // 11. Save return value
+  // 11. Process return value
   FrameOffset return_save_location = main_jni_conv->ReturnValueSaveLocation();
   if (main_jni_conv->SizeOfReturnValue() != 0 && !reference_return) {
-    if ((instruction_set == kMips || instruction_set == kMips64) &&
-        main_jni_conv->GetReturnType() == Primitive::kPrimDouble &&
-        return_save_location.Uint32Value() % 8 != 0) {
-      // Ensure doubles are 8-byte aligned for MIPS
-      return_save_location = FrameOffset(return_save_location.Uint32Value()
-                                             + static_cast<size_t>(kMipsPointerSize));
+    if (LIKELY(!is_critical_native)) {
+      // For normal JNI, store the return value on the stack because the call to
+      // JniMethodEnd will clobber the return value. It will be restored in (13).
+      if ((instruction_set == kMips || instruction_set == kMips64) &&
+          main_jni_conv->GetReturnType() == Primitive::kPrimDouble &&
+          return_save_location.Uint32Value() % 8 != 0) {
+        // Ensure doubles are 8-byte aligned for MIPS
+        return_save_location = FrameOffset(return_save_location.Uint32Value()
+                                               + static_cast<size_t>(kMipsPointerSize));
+        // TODO: refactor this into the JniCallingConvention code
+        // as a return value alignment requirement.
+      }
+      CHECK_LT(return_save_location.Uint32Value(), frame_size + main_out_arg_size);
+      __ Store(return_save_location,
+               main_jni_conv->ReturnRegister(),
+               main_jni_conv->SizeOfReturnValue());
+    } else {
+      // For @CriticalNative only,
+      // move the JNI return register into the managed return register (if they don't match).
+      ManagedRegister jni_return_reg = main_jni_conv->ReturnRegister();
+      ManagedRegister mr_return_reg = mr_conv->ReturnRegister();
+
+      // Check if the JNI return register matches the managed return register.
+      // If they differ, only then do we have to do anything about it.
+      // Otherwise the return value is already in the right place when we return.
+      if (!jni_return_reg.Equals(mr_return_reg)) {
+        // This is typically only necessary on ARM32 due to native being softfloat
+        // while managed is hardfloat.
+        // -- For example VMOV {r0, r1} -> D0; VMOV r0 -> S0.
+        __ Move(mr_return_reg, jni_return_reg, main_jni_conv->SizeOfReturnValue());
+      } else if (jni_return_reg.IsNoRegister() && mr_return_reg.IsNoRegister()) {
+        // Sanity check: If the return value is passed on the stack for some reason,
+        // then make sure the size matches.
+        CHECK_EQ(main_jni_conv->SizeOfReturnValue(), mr_conv->SizeOfReturnValue());
+      }
     }
-    CHECK_LT(return_save_location.Uint32Value(), frame_size + main_out_arg_size);
-    __ Store(return_save_location,
-             main_jni_conv->ReturnRegister(),
-             main_jni_conv->SizeOfReturnValue());
   }
 
   // Increase frame size for out args if needed by the end_jni_conv.
@@ -398,6 +504,8 @@
   if (end_out_arg_size > current_out_arg_size) {
     size_t out_arg_size_diff = end_out_arg_size - current_out_arg_size;
     current_out_arg_size = end_out_arg_size;
+    // TODO: This is redundant for @CriticalNative but we need to
+    // conditionally do __DecreaseFrameSize below.
     __ IncreaseFrameSize(out_arg_size_diff);
     saved_cookie_offset = FrameOffset(saved_cookie_offset.SizeValue() + out_arg_size_diff);
     locked_object_handle_scope_offset =
@@ -407,65 +515,71 @@
   //     thread.
   end_jni_conv->ResetIterator(FrameOffset(end_out_arg_size));
 
-  ThreadOffset<kPointerSize> jni_end(-1);
-  if (reference_return) {
-    // Pass result.
-    jni_end = is_synchronized
-                  ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReferenceSynchronized)
-                  : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
-    SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
-    end_jni_conv->Next();
-  } else {
-    jni_end = is_synchronized
-                  ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndSynchronized)
-                  : (is_fast_native
-                         ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastEnd)
-                         : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd));
-  }
-  // Pass saved local reference state.
-  if (end_jni_conv->IsCurrentParamOnStack()) {
-    FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
-    __ Copy(out_off, saved_cookie_offset, end_jni_conv->InterproceduralScratchRegister(), 4);
-  } else {
-    ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
-    __ Load(out_reg, saved_cookie_offset, 4);
-  }
-  end_jni_conv->Next();
-  if (is_synchronized) {
-    // Pass object for unlocking.
+  if (LIKELY(!is_critical_native)) {
+    // 12. Call JniMethodEnd
+    ThreadOffset<kPointerSize> jni_end(-1);
+    if (reference_return) {
+      // Pass result.
+      jni_end = is_synchronized
+                    ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReferenceSynchronized)
+                    : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndWithReference);
+      SetNativeParameter(jni_asm.get(), end_jni_conv.get(), end_jni_conv->ReturnRegister());
+      end_jni_conv->Next();
+    } else {
+      jni_end = is_synchronized
+                    ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEndSynchronized)
+                    : (is_fast_native
+                           ? QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodFastEnd)
+                           : QUICK_ENTRYPOINT_OFFSET(kPointerSize, pJniMethodEnd));
+    }
+    // Pass saved local reference state.
     if (end_jni_conv->IsCurrentParamOnStack()) {
       FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
-      __ CreateHandleScopeEntry(out_off, locked_object_handle_scope_offset,
-                         end_jni_conv->InterproceduralScratchRegister(),
-                         false);
+      __ Copy(out_off, saved_cookie_offset, end_jni_conv->InterproceduralScratchRegister(), 4);
     } else {
       ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
-      __ CreateHandleScopeEntry(out_reg, locked_object_handle_scope_offset,
-                         ManagedRegister::NoRegister(), false);
+      __ Load(out_reg, saved_cookie_offset, 4);
     }
     end_jni_conv->Next();
-  }
-  if (end_jni_conv->IsCurrentParamInRegister()) {
-    __ GetCurrentThread(end_jni_conv->CurrentParamRegister());
-    __ Call(end_jni_conv->CurrentParamRegister(),
-            Offset(jni_end),
-            end_jni_conv->InterproceduralScratchRegister());
-  } else {
-    __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset(),
-                        end_jni_conv->InterproceduralScratchRegister());
-    __ CallFromThread(jni_end, end_jni_conv->InterproceduralScratchRegister());
-  }
+    if (is_synchronized) {
+      // Pass object for unlocking.
+      if (end_jni_conv->IsCurrentParamOnStack()) {
+        FrameOffset out_off = end_jni_conv->CurrentParamStackOffset();
+        __ CreateHandleScopeEntry(out_off, locked_object_handle_scope_offset,
+                           end_jni_conv->InterproceduralScratchRegister(),
+                           false);
+      } else {
+        ManagedRegister out_reg = end_jni_conv->CurrentParamRegister();
+        __ CreateHandleScopeEntry(out_reg, locked_object_handle_scope_offset,
+                           ManagedRegister::NoRegister(), false);
+      }
+      end_jni_conv->Next();
+    }
+    if (end_jni_conv->IsCurrentParamInRegister()) {
+      __ GetCurrentThread(end_jni_conv->CurrentParamRegister());
+      __ Call(end_jni_conv->CurrentParamRegister(),
+              Offset(jni_end),
+              end_jni_conv->InterproceduralScratchRegister());
+    } else {
+      __ GetCurrentThread(end_jni_conv->CurrentParamStackOffset(),
+                          end_jni_conv->InterproceduralScratchRegister());
+      __ CallFromThread(jni_end, end_jni_conv->InterproceduralScratchRegister());
+    }
 
-  // 13. Reload return value
-  if (main_jni_conv->SizeOfReturnValue() != 0 && !reference_return) {
-    __ Load(mr_conv->ReturnRegister(), return_save_location, mr_conv->SizeOfReturnValue());
-  }
+    // 13. Reload return value
+    if (main_jni_conv->SizeOfReturnValue() != 0 && !reference_return) {
+      __ Load(mr_conv->ReturnRegister(), return_save_location, mr_conv->SizeOfReturnValue());
+      // NIT: If it's @CriticalNative then we actually only need to do this IF
+      // the calling convention's native return register doesn't match the managed convention's
+      // return register.
+    }
+  }  // if (!is_critical_native)
 
   // 14. Move frame up now we're done with the out arg space.
   __ DecreaseFrameSize(current_out_arg_size);
 
   // 15. Process pending exceptions from JNI call or monitor exit.
-  __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), 0);
+  __ ExceptionPoll(main_jni_conv->InterproceduralScratchRegister(), 0 /* stack_adjust */);
 
   // 16. Remove activation - need to restore callee save registers since the GC may have changed
   //     them.
@@ -497,7 +611,8 @@
 static void CopyParameter(JNIMacroAssembler<kPointerSize>* jni_asm,
                           ManagedRuntimeCallingConvention* mr_conv,
                           JniCallingConvention* jni_conv,
-                          size_t frame_size, size_t out_arg_size) {
+                          size_t frame_size,
+                          size_t out_arg_size) {
   bool input_in_reg = mr_conv->IsCurrentParamInRegister();
   bool output_in_reg = jni_conv->IsCurrentParamInRegister();
   FrameOffset handle_scope_offset(0);
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 1d06f26..0bfcc3f 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -24,6 +24,7 @@
 namespace x86 {
 
 static_assert(kX86PointerSize == PointerSize::k32, "Unexpected x86 pointer size");
+static_assert(kStackAlignment >= 16u, "IA-32 cdecl requires at least 16 byte stack alignment");
 
 static constexpr ManagedRegister kCalleeSaveRegisters[] = {
     // Core registers.
@@ -190,9 +191,15 @@
 
 // JNI calling convention
 
-X86JniCallingConvention::X86JniCallingConvention(bool is_static, bool is_synchronized,
+X86JniCallingConvention::X86JniCallingConvention(bool is_static,
+                                                 bool is_synchronized,
+                                                 bool is_critical_native,
                                                  const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty, kX86PointerSize) {
+    : JniCallingConvention(is_static,
+                           is_synchronized,
+                           is_critical_native,
+                           shorty,
+                           kX86PointerSize) {
 }
 
 uint32_t X86JniCallingConvention::CoreSpillMask() const {
@@ -204,13 +211,31 @@
 }
 
 size_t X86JniCallingConvention::FrameSize() {
-  // Method*, return address and callee save area size, local reference segment state
-  size_t frame_data_size = static_cast<size_t>(kX86PointerSize) +
-      (2 + CalleeSaveRegisters().size()) * kFramePointerSize;
-  // References plus 2 words for HandleScope header
-  size_t handle_scope_size = HandleScope::SizeOf(kX86PointerSize, ReferenceCount());
+  // Method*, PC return address and callee save area size, local reference segment state
+  const size_t method_ptr_size = static_cast<size_t>(kX86PointerSize);
+  const size_t pc_return_addr_size = kFramePointerSize;
+  const size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
+  size_t frame_data_size = method_ptr_size + pc_return_addr_size + callee_save_area_size;
+
+  if (LIKELY(HasLocalReferenceSegmentState())) {                     // local ref. segment state
+    // Local reference segment state is sometimes excluded.
+    frame_data_size += kFramePointerSize;
+  }
+
+  // References plus link_ (pointer) and number_of_references_ (uint32_t) for HandleScope header
+  const size_t handle_scope_size = HandleScope::SizeOf(kX86PointerSize, ReferenceCount());
+
+  size_t total_size = frame_data_size;
+  if (LIKELY(HasHandleScope())) {
+    // HandleScope is sometimes excluded.
+    total_size += handle_scope_size;                                 // handle scope size
+  }
+
   // Plus return value spill area size
-  return RoundUp(frame_data_size + handle_scope_size + SizeOfReturnValue(), kStackAlignment);
+  total_size += SizeOfReturnValue();
+
+  return RoundUp(total_size, kStackAlignment);
+  // TODO: Same thing as x64 except using different pointer size. Refactor?
 }
 
 size_t X86JniCallingConvention::OutArgSize() {
@@ -239,11 +264,13 @@
 }
 
 size_t X86JniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = IsStatic() ? 1 : 0;  // count jclass
+  size_t static_args = HasSelfClass() ? 1 : 0;  // count jclass
   // regular argument parameters and this
   size_t param_args = NumArgs() + NumLongOrDoubleArgs();
   // count JNIEnv* and return pc (pushed after Method*)
-  size_t total_args = static_args + param_args + 2;
+  size_t internal_args = 1 /* return pc */ + (HasJniEnv() ? 1 : 0 /* jni env */);
+  // No register args.
+  size_t total_args = static_args + param_args + internal_args;
   return total_args;
 }
 
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index ff92fc9..be83cda 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -52,9 +52,13 @@
   DISALLOW_COPY_AND_ASSIGN(X86ManagedRuntimeCallingConvention);
 };
 
+// Implements the x86 cdecl calling convention.
 class X86JniCallingConvention FINAL : public JniCallingConvention {
  public:
-  X86JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  X86JniCallingConvention(bool is_static,
+                          bool is_synchronized,
+                          bool is_critical_native,
+                          const char* shorty);
   ~X86JniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index cbf10bd..8ca0ffe 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -25,8 +25,16 @@
 namespace x86_64 {
 
 constexpr size_t kFramePointerSize = static_cast<size_t>(PointerSize::k64);
-
 static_assert(kX86_64PointerSize == PointerSize::k64, "Unexpected x86_64 pointer size");
+static_assert(kStackAlignment >= 16u, "System V AMD64 ABI requires at least 16 byte stack alignment");
+
+// XMM0..XMM7 can be used to pass the first 8 floating args. The rest must go on the stack.
+// -- Managed and JNI calling conventions.
+constexpr size_t kMaxFloatOrDoubleRegisterArguments = 8u;
+// Up to how many integer-like (pointers, objects, longs, int, short, bool, etc) args can be
+// enregistered. The rest of the args must go on the stack.
+// -- JNI calling convention only (Managed excludes RDI, so it's actually 5).
+constexpr size_t kMaxIntLikeRegisterArguments = 6u;
 
 static constexpr ManagedRegister kCalleeSaveRegisters[] = {
     // Core registers.
@@ -130,7 +138,7 @@
     case 3: res = X86_64ManagedRegister::FromCpuRegister(R8); break;
     case 4: res = X86_64ManagedRegister::FromCpuRegister(R9); break;
     }
-  } else if (itr_float_and_doubles_ < 8) {
+  } else if (itr_float_and_doubles_ < kMaxFloatOrDoubleRegisterArguments) {
     // First eight float parameters are passed via XMM0..XMM7
     res = X86_64ManagedRegister::FromXmmRegister(
                                  static_cast<FloatRegister>(XMM0 + itr_float_and_doubles_));
@@ -165,9 +173,15 @@
 
 // JNI calling convention
 
-X86_64JniCallingConvention::X86_64JniCallingConvention(bool is_static, bool is_synchronized,
+X86_64JniCallingConvention::X86_64JniCallingConvention(bool is_static,
+                                                       bool is_synchronized,
+                                                       bool is_critical_native,
                                                        const char* shorty)
-    : JniCallingConvention(is_static, is_synchronized, shorty, kX86_64PointerSize) {
+    : JniCallingConvention(is_static,
+                           is_synchronized,
+                           is_critical_native,
+                           shorty,
+                           kX86_64PointerSize) {
 }
 
 uint32_t X86_64JniCallingConvention::CoreSpillMask() const {
@@ -179,13 +193,30 @@
 }
 
 size_t X86_64JniCallingConvention::FrameSize() {
-  // Method*, return address and callee save area size, local reference segment state
-  size_t frame_data_size = static_cast<size_t>(kX86_64PointerSize) +
-      (2 + CalleeSaveRegisters().size()) * kFramePointerSize;
+  // Method*, PC return address and callee save area size, local reference segment state
+  const size_t method_ptr_size = static_cast<size_t>(kX86_64PointerSize);
+  const size_t pc_return_addr_size = kFramePointerSize;
+  const size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
+  size_t frame_data_size = method_ptr_size + pc_return_addr_size + callee_save_area_size;
+
+  if (LIKELY(HasLocalReferenceSegmentState())) {                     // local ref. segment state
+    // Local reference segment state is sometimes excluded.
+    frame_data_size += kFramePointerSize;
+  }
+
   // References plus link_ (pointer) and number_of_references_ (uint32_t) for HandleScope header
-  size_t handle_scope_size = HandleScope::SizeOf(kX86_64PointerSize, ReferenceCount());
+  const size_t handle_scope_size = HandleScope::SizeOf(kX86_64PointerSize, ReferenceCount());
+
+  size_t total_size = frame_data_size;
+  if (LIKELY(HasHandleScope())) {
+    // HandleScope is sometimes excluded.
+    total_size += handle_scope_size;                                 // handle scope size
+  }
+
   // Plus return value spill area size
-  return RoundUp(frame_data_size + handle_scope_size + SizeOfReturnValue(), kStackAlignment);
+  total_size += SizeOfReturnValue();
+
+  return RoundUp(total_size, kStackAlignment);
 }
 
 size_t X86_64JniCallingConvention::OutArgSize() {
@@ -214,8 +245,9 @@
     case 3: res = X86_64ManagedRegister::FromCpuRegister(RCX); break;
     case 4: res = X86_64ManagedRegister::FromCpuRegister(R8); break;
     case 5: res = X86_64ManagedRegister::FromCpuRegister(R9); break;
+    static_assert(5u == kMaxIntLikeRegisterArguments - 1, "Missing case statement(s)");
     }
-  } else if (itr_float_and_doubles_ < 8) {
+  } else if (itr_float_and_doubles_ < kMaxFloatOrDoubleRegisterArguments) {
     // First eight float parameters are passed via XMM0..XMM7
     res = X86_64ManagedRegister::FromXmmRegister(
                                  static_cast<FloatRegister>(XMM0 + itr_float_and_doubles_));
@@ -224,24 +256,35 @@
 }
 
 FrameOffset X86_64JniCallingConvention::CurrentParamStackOffset() {
-  size_t offset = itr_args_
-      - std::min(8U, itr_float_and_doubles_)               // Float arguments passed through Xmm0..Xmm7
-      - std::min(6U, itr_args_ - itr_float_and_doubles_);  // Integer arguments passed through GPR
-  return FrameOffset(displacement_.Int32Value() - OutArgSize() + (offset * kFramePointerSize));
+  CHECK(IsCurrentParamOnStack());
+  size_t args_on_stack = itr_args_
+      - std::min(kMaxFloatOrDoubleRegisterArguments,
+                 static_cast<size_t>(itr_float_and_doubles_))
+          // Float arguments passed through Xmm0..Xmm7
+      - std::min(kMaxIntLikeRegisterArguments,
+                 static_cast<size_t>(itr_args_ - itr_float_and_doubles_));
+          // Integer arguments passed through GPR
+  size_t offset = displacement_.Int32Value() - OutArgSize() + (args_on_stack * kFramePointerSize);
+  CHECK_LT(offset, OutArgSize());
+  return FrameOffset(offset);
 }
 
+// TODO: Calling this "NumberArgs" is misleading.
+// It's really more like NumberSlots (like itr_slots_)
+// because doubles/longs get counted twice.
 size_t X86_64JniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = IsStatic() ? 1 : 0;  // count jclass
+  size_t static_args = HasSelfClass() ? 1 : 0;  // count jclass
   // regular argument parameters and this
   size_t param_args = NumArgs() + NumLongOrDoubleArgs();
   // count JNIEnv* and return pc (pushed after Method*)
-  size_t total_args = static_args + param_args + 2;
+  size_t internal_args = 1 /* return pc */ + (HasJniEnv() ? 1 : 0 /* jni env */);
+  size_t total_args = static_args + param_args + internal_args;
 
   // Float arguments passed through Xmm0..Xmm7
   // Other (integer) arguments passed through GPR (RDI, RSI, RDX, RCX, R8, R9)
   size_t total_stack_args = total_args
-                            - std::min(8U, static_cast<unsigned int>(NumFloatOrDoubleArgs()))
-                            - std::min(6U, static_cast<unsigned int>(NumArgs() - NumFloatOrDoubleArgs()));
+                            - std::min(kMaxFloatOrDoubleRegisterArguments, static_cast<size_t>(NumFloatOrDoubleArgs()))
+                            - std::min(kMaxIntLikeRegisterArguments, static_cast<size_t>(NumArgs() - NumFloatOrDoubleArgs()));
 
   return total_stack_args;
 }
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index b98f505..cdba334 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -48,7 +48,10 @@
 
 class X86_64JniCallingConvention FINAL : public JniCallingConvention {
  public:
-  X86_64JniCallingConvention(bool is_static, bool is_synchronized, const char* shorty);
+  X86_64JniCallingConvention(bool is_static,
+                             bool is_synchronized,
+                             bool is_critical_native,
+                             const char* shorty);
   ~X86_64JniCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;