jni: Do not create a managed frame for @CriticalNative.

Omit managed frame for @CriticalNative methods, do not check
for exceptions and make a tail call when possible.
Pass the method pointer in a hidden argument to prepare for
implementing late binding for @CriticalNative methods.
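
For context, a @CriticalNative native implementation receives neither
JNIEnv* nor jclass, which is what allows the stub to skip the handle
scope and exception check. A minimal sketch (class Foo and its add
method are hypothetical, not part of this change):

  #include <jni.h>

  // Java side (hypothetical):
  //   class Foo {
  //     @CriticalNative
  //     static native int add(int a, int b);
  //   }
  //
  // A regular static native method would be declared as:
  //   jint Java_Foo_add(JNIEnv* env, jclass klass, jint a, jint b);
  // With @CriticalNative, JNIEnv* and jclass are omitted, so the stub
  // can pass the arguments through unchanged and, when they fit, make
  // a tail call straight into the native code:
  extern "C" JNIEXPORT jint JNICALL Java_Foo_add(jint a, jint b) {
    return a + b;
  }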

This changes only the JNI compiler; Generic JNI shall be
updated in a separate change.

Performance improvements reported by Golem (art-opt-cc):
                                 x86 x86-64    arm  arm64
NativeDowncallStaticCritical6   +17%   +50%   +88%  +139%
NativeDowncallStaticCritical    +37%   +32%  +103%  +216%

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 112189621
Change-Id: I5758c8f478627f2eee8f615b4537a907c211b9f8
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 1f255e2..d12eb9b 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -26,7 +26,6 @@
 namespace x86 {
 
 static_assert(kX86PointerSize == PointerSize::k32, "Unexpected x86 pointer size");
-static_assert(kStackAlignment >= 16u, "IA-32 cdecl requires at least 16 byte stack alignment");
 
 static constexpr ManagedRegister kCalleeSaveRegisters[] = {
     // Core registers.
@@ -36,10 +35,12 @@
     // No hard float callee saves.
 };
 
-static constexpr uint32_t CalculateCoreCalleeSpillMask() {
+template <size_t size>
+static constexpr uint32_t CalculateCoreCalleeSpillMask(
+    const ManagedRegister (&callee_saves)[size]) {
   // The spilled PC gets a special marker.
   uint32_t result = 1 << kNumberOfCpuRegisters;
-  for (auto&& r : kCalleeSaveRegisters) {
+  for (auto&& r : callee_saves) {
     if (r.AsX86().IsCpuRegister()) {
       result |= (1 << r.AsX86().AsCpuRegister());
     }
@@ -47,16 +48,32 @@
   return result;
 }
 
-static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask();
+static constexpr uint32_t kCoreCalleeSpillMask = CalculateCoreCalleeSpillMask(kCalleeSaveRegisters);
 static constexpr uint32_t kFpCalleeSpillMask = 0u;
 
+static constexpr size_t kNativeStackAlignment = 16;  // IA-32 cdecl requires 16 byte alignment.
+static_assert(kNativeStackAlignment == kStackAlignment);
+
+static constexpr ManagedRegister kNativeCalleeSaveRegisters[] = {
+    // Core registers.
+    X86ManagedRegister::FromCpuRegister(EBX),
+    X86ManagedRegister::FromCpuRegister(EBP),
+    X86ManagedRegister::FromCpuRegister(ESI),
+    X86ManagedRegister::FromCpuRegister(EDI),
+    // No hard float callee saves.
+};
+
+static constexpr uint32_t kNativeCoreCalleeSpillMask =
+    CalculateCoreCalleeSpillMask(kNativeCalleeSaveRegisters);
+static constexpr uint32_t kNativeFpCalleeSpillMask = 0u;
+
 // Calling convention
 
-ManagedRegister X86ManagedRuntimeCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister X86ManagedRuntimeCallingConvention::InterproceduralScratchRegister() const {
   return X86ManagedRegister::FromCpuRegister(ECX);
 }
 
-ManagedRegister X86JniCallingConvention::InterproceduralScratchRegister() {
+ManagedRegister X86JniCallingConvention::InterproceduralScratchRegister() const {
   return X86ManagedRegister::FromCpuRegister(ECX);
 }
 
@@ -205,47 +222,81 @@
 }
 
 uint32_t X86JniCallingConvention::CoreSpillMask() const {
-  return kCoreCalleeSpillMask;
+  return is_critical_native_ ? 0u : kCoreCalleeSpillMask;
 }
 
 uint32_t X86JniCallingConvention::FpSpillMask() const {
-  return kFpCalleeSpillMask;
+  return is_critical_native_ ? 0u : kFpCalleeSpillMask;
 }
 
-size_t X86JniCallingConvention::FrameSize() {
+size_t X86JniCallingConvention::FrameSize() const {
+  if (is_critical_native_) {
+    CHECK(!SpillsMethod());
+    CHECK(!HasLocalReferenceSegmentState());
+    CHECK(!HasHandleScope());
+    CHECK(!SpillsReturnValue());
+    return 0u;  // There is no managed frame for @CriticalNative.
+  }
+
   // Method*, PC return address and callee save area size, local reference segment state
+  CHECK(SpillsMethod());
   const size_t method_ptr_size = static_cast<size_t>(kX86PointerSize);
   const size_t pc_return_addr_size = kFramePointerSize;
   const size_t callee_save_area_size = CalleeSaveRegisters().size() * kFramePointerSize;
-  size_t frame_data_size = method_ptr_size + pc_return_addr_size + callee_save_area_size;
+  size_t total_size = method_ptr_size + pc_return_addr_size + callee_save_area_size;
 
-  if (LIKELY(HasLocalReferenceSegmentState())) {                     // local ref. segment state
-    // Local reference segment state is sometimes excluded.
-    frame_data_size += kFramePointerSize;
-  }
+  CHECK(HasLocalReferenceSegmentState());
+  total_size += kFramePointerSize;
 
-  // References plus link_ (pointer) and number_of_references_ (uint32_t) for HandleScope header
-  const size_t handle_scope_size = HandleScope::SizeOf(kX86PointerSize, ReferenceCount());
-
-  size_t total_size = frame_data_size;
-  if (LIKELY(HasHandleScope())) {
-    // HandleScope is sometimes excluded.
-    total_size += handle_scope_size;                                 // handle scope size
-  }
+  CHECK(HasHandleScope());
+  total_size += HandleScope::SizeOf(kX86PointerSize, ReferenceCount());
 
   // Plus return value spill area size
+  CHECK(SpillsReturnValue());
   total_size += SizeOfReturnValue();
 
   return RoundUp(total_size, kStackAlignment);
-  // TODO: Same thing as x64 except using different pointer size. Refactor?
 }
 
-size_t X86JniCallingConvention::OutArgSize() {
-  return RoundUp(NumberOfOutgoingStackArgs() * kFramePointerSize, kStackAlignment);
+size_t X86JniCallingConvention::OutArgSize() const {
+  // Count param args, including JNIEnv* and jclass; count 8-byte args twice.
+  size_t all_args = NumberOfExtraArgumentsForJni() + NumArgs() + NumLongOrDoubleArgs();
+  // The size of outgoing arguments.
+  size_t size = all_args * kFramePointerSize;
+
+  // @CriticalNative can use a tail call as all managed callee saves are preserved by IA-32 cdecl.
+  static_assert((kCoreCalleeSpillMask & ~kNativeCoreCalleeSpillMask) == 0u);
+  static_assert((kFpCalleeSpillMask & ~kNativeFpCalleeSpillMask) == 0u);
+
+  if (is_critical_native_) {
+    // Add return address size for @CriticalNative.
+    // For normal native the return PC is part of the managed stack frame instead of out args.
+    size += kFramePointerSize;
+    // For @CriticalNative, we can make a tail call if there are no stack args
+    // and the return type is not an FP type (would need moving from ST0 to XMM0)
+    // and we do not need to extend the result.
+    bool return_type_ok = GetShorty()[0] == 'I' || GetShorty()[0] == 'J' || GetShorty()[0] == 'V';
+    DCHECK_EQ(
+        return_type_ok,
+        GetShorty()[0] != 'F' && GetShorty()[0] != 'D' && !RequiresSmallResultTypeExtension());
+    if (return_type_ok && size == kFramePointerSize) {
+      // Note: This is not aligned to kNativeStackAlignment but that's OK for tail call.
+      DCHECK_EQ(size, kFramePointerSize);
+      static_assert(kFramePointerSize < kNativeStackAlignment);
+      return kFramePointerSize;
+    }
+  }
+
+  return RoundUp(size, kNativeStackAlignment);
 }
 
 ArrayRef<const ManagedRegister> X86JniCallingConvention::CalleeSaveRegisters() const {
-  return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  if (UNLIKELY(IsCriticalNative())) {
+    // Do not spill anything, whether tail call or not (return PC is already on the stack).
+    return ArrayRef<const ManagedRegister>();
+  } else {
+    return ArrayRef<const ManagedRegister>(kCalleeSaveRegisters);
+  }
 }
 
 bool X86JniCallingConvention::IsCurrentParamInRegister() {
@@ -265,15 +316,21 @@
   return FrameOffset(displacement_.Int32Value() - OutArgSize() + (itr_slots_ * kFramePointerSize));
 }
 
-size_t X86JniCallingConvention::NumberOfOutgoingStackArgs() {
-  size_t static_args = HasSelfClass() ? 1 : 0;  // count jclass
-  // regular argument parameters and this
-  size_t param_args = NumArgs() + NumLongOrDoubleArgs();
-  // count JNIEnv* and return pc (pushed after Method*)
-  size_t internal_args = 1 /* return pc */ + (HasJniEnv() ? 1 : 0 /* jni env */);
-  // No register args.
-  size_t total_args = static_args + param_args + internal_args;
-  return total_args;
+ManagedRegister X86JniCallingConvention::HiddenArgumentRegister() const {
+  CHECK(IsCriticalNative());
+  // EAX is neither managed callee-save, nor argument register, nor scratch register.
+  DCHECK(std::none_of(kCalleeSaveRegisters,
+                      kCalleeSaveRegisters + std::size(kCalleeSaveRegisters),
+                      [](ManagedRegister callee_save) constexpr {
+                        return callee_save.Equals(X86ManagedRegister::FromCpuRegister(EAX));
+                      }));
+  DCHECK(!InterproceduralScratchRegister().Equals(X86ManagedRegister::FromCpuRegister(EAX)));
+  return X86ManagedRegister::FromCpuRegister(EAX);
+}
+
+bool X86JniCallingConvention::UseTailCall() const {
+  CHECK(IsCriticalNative());
+  return OutArgSize() == kFramePointerSize;
 }
 
 }  // namespace x86
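
For illustration, a standalone sketch (not ART code; the managed
callee-save list is an assumption, since the kCalleeSaveRegisters
contents are elided from the hunk above) of why the tail call is safe:
the managed callee saves must be a subset of the native (IA-32 cdecl)
callee saves, so the native callee already preserves everything the
managed caller expects to survive, exactly what the static_asserts in
OutArgSize() verify:

  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  // x86 core registers in encoding order (EAX = 0, ..., EDI = 7).
  enum Register { EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI };

  // Same shape as CalculateCoreCalleeSpillMask() above: fold a register
  // list into a bit mask at compile time.
  template <std::size_t size>
  constexpr uint32_t SpillMask(const Register (&regs)[size]) {
    uint32_t mask = 0u;
    for (Register r : regs) {
      mask |= 1u << r;
    }
    return mask;
  }

  // Assumed managed callee saves (EBP, ESI, EDI) vs. the cdecl set
  // (EBX, EBP, ESI, EDI) matching kNativeCalleeSaveRegisters above.
  constexpr Register kManagedSaves[] = {EBP, ESI, EDI};
  constexpr Register kNativeSaves[] = {EBX, EBP, ESI, EDI};

  // Every managed callee save is also a native callee save, so the
  // @CriticalNative stub may jump (tail call) instead of call: the
  // native callee restores those registers before returning straight
  // to the managed caller.
  static_assert((SpillMask(kManagedSaves) & ~SpillMask(kNativeSaves)) == 0u);

  int main() {
    std::printf("managed 0x%x native 0x%x\n",
                SpillMask(kManagedSaves), SpillMask(kNativeSaves));
    return 0;
  }

This also shows why EAX works as the hidden argument register: it is
neither in either callee-save set nor used for argument passing in
cdecl (all arguments go on the stack), so it stays free to carry the
method pointer up to the jump into native code.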