jni: Do not create a managed frame for @CriticalNative.

Omit the managed frame for @CriticalNative methods, do not check
for exceptions, and make a tail call when possible.
Pass the method pointer in a hidden argument to prepare for
implementing late binding for @CriticalNative methods.
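
For reference, a minimal sketch (not part of this change) of the native
side of a @CriticalNative method: it receives neither a JNIEnv* nor a
jclass, only the primitive arguments, which is why no managed frame or
exception check is needed around the call. The class and method names
below are hypothetical.

  #include <jni.h>

  // Native counterpart of a hypothetical Java declaration:
  //   @CriticalNative static native long add(long a, long b);
  // Note: no JNIEnv* and no jclass parameters.
  extern "C" JNIEXPORT jlong JNICALL Java_Example_add(jlong a, jlong b) {
    return a + b;
  }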

This change affects only the JNI compiler; Generic JNI will be
updated in a separate change.

Performance improvements reported by Golem (art-opt-cc):
                                 x86 x86-64    arm  arm64
NativeDowncallStaticCritical6   +17%   +50%   +88%  +139%
NativeDowncallStaticCritical    +37%   +32%  +103%  +216%

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 112189621
Change-Id: I5758c8f478627f2eee8f615b4537a907c211b9f8
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 540d72b..f4ea004 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -39,6 +39,9 @@
 
 constexpr size_t kFramePointerSize = 4;
 
+static constexpr size_t kNativeStackAlignment = 16;
+static_assert(kNativeStackAlignment == kStackAlignment);
+
 #define __ asm_.
 
 void X86JNIMacroAssembler::BuildFrame(size_t frame_size,
@@ -47,7 +50,15 @@
                                       const ManagedRegisterEntrySpills& entry_spills) {
   DCHECK_EQ(CodeSize(), 0U);  // Nothing emitted yet.
   cfi().SetCurrentCFAOffset(4);  // Return address on stack.
-  CHECK_ALIGNED(frame_size, kStackAlignment);
+  if (frame_size == kFramePointerSize) {
+    // For @CriticalNative tail call.
+    CHECK(method_reg.IsNoRegister());
+    CHECK(spill_regs.empty());
+  } else if (method_reg.IsNoRegister()) {
+    CHECK_ALIGNED(frame_size, kNativeStackAlignment);
+  } else {
+    CHECK_ALIGNED(frame_size, kStackAlignment);
+  }
   int gpr_count = 0;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
     Register spill = spill_regs[i].AsX86().AsCpuRegister();
@@ -59,12 +70,16 @@
 
   // return address then method on stack.
   int32_t adjust = frame_size - gpr_count * kFramePointerSize -
-      kFramePointerSize /*method*/ -
-      kFramePointerSize /*return address*/;
-  __ addl(ESP, Immediate(-adjust));
-  cfi().AdjustCFAOffset(adjust);
-  __ pushl(method_reg.AsX86().AsCpuRegister());
-  cfi().AdjustCFAOffset(kFramePointerSize);
+      kFramePointerSize /*return address*/ -
+      (method_reg.IsRegister() ? kFramePointerSize /*method*/ : 0u);
+  if (adjust != 0) {
+    __ addl(ESP, Immediate(-adjust));
+    cfi().AdjustCFAOffset(adjust);
+  }
+  if (method_reg.IsRegister()) {
+    __ pushl(method_reg.AsX86().AsCpuRegister());
+    cfi().AdjustCFAOffset(kFramePointerSize);
+  }
   DCHECK_EQ(static_cast<size_t>(cfi().GetCurrentCFAOffset()), frame_size);
 
   for (const ManagedRegisterSpill& spill : entry_spills) {
@@ -86,12 +101,14 @@
 void X86JNIMacroAssembler::RemoveFrame(size_t frame_size,
                                        ArrayRef<const ManagedRegister> spill_regs,
                                        bool may_suspend ATTRIBUTE_UNUSED) {
-  CHECK_ALIGNED(frame_size, kStackAlignment);
+  CHECK_ALIGNED(frame_size, kNativeStackAlignment);
   cfi().RememberState();
   // -kFramePointerSize for ArtMethod*.
   int adjust = frame_size - spill_regs.size() * kFramePointerSize - kFramePointerSize;
-  __ addl(ESP, Immediate(adjust));
-  cfi().AdjustCFAOffset(-adjust);
+  if (adjust != 0) {
+    __ addl(ESP, Immediate(adjust));
+    cfi().AdjustCFAOffset(-adjust);
+  }
   for (size_t i = 0; i < spill_regs.size(); ++i) {
     Register spill = spill_regs[i].AsX86().AsCpuRegister();
     __ popl(spill);
@@ -105,15 +122,19 @@
 }
 
 void X86JNIMacroAssembler::IncreaseFrameSize(size_t adjust) {
-  CHECK_ALIGNED(adjust, kStackAlignment);
-  __ addl(ESP, Immediate(-adjust));
-  cfi().AdjustCFAOffset(adjust);
+  if (adjust != 0u) {
+    CHECK_ALIGNED(adjust, kNativeStackAlignment);
+    __ addl(ESP, Immediate(-adjust));
+    cfi().AdjustCFAOffset(adjust);
+  }
 }
 
 static void DecreaseFrameSizeImpl(X86Assembler* assembler, size_t adjust) {
-  CHECK_ALIGNED(adjust, kStackAlignment);
-  assembler->addl(ESP, Immediate(adjust));
-  assembler->cfi().AdjustCFAOffset(-adjust);
+  if (adjust != 0u) {
+    CHECK_ALIGNED(adjust, kNativeStackAlignment);
+    assembler->addl(ESP, Immediate(adjust));
+    assembler->cfi().AdjustCFAOffset(-adjust);
+  }
 }
 
 void X86JNIMacroAssembler::DecreaseFrameSize(size_t adjust) {
@@ -301,7 +322,7 @@
       __ movl(dest.AsCpuRegister(), src.AsCpuRegister());
     } else if (src.IsX87Register() && dest.IsXmmRegister()) {
       // Pass via stack and pop X87 register
-      __ subl(ESP, Immediate(16));
+      IncreaseFrameSize(16);
       if (size == 4) {
         CHECK_EQ(src.AsX87Register(), ST0);
         __ fstps(Address(ESP, 0));
@@ -311,7 +332,7 @@
         __ fstpl(Address(ESP, 0));
         __ movsd(dest.AsXmmRegister(), Address(ESP, 0));
       }
-      __ addl(ESP, Immediate(16));
+      DecreaseFrameSize(16);
     } else {
       // TODO: x87, SSE
       UNIMPLEMENTED(FATAL) << ": Move " << dest << ", " << src;
@@ -487,6 +508,12 @@
   // TODO: not validating references
 }
 
+void X86JNIMacroAssembler::Jump(ManagedRegister mbase, Offset offset, ManagedRegister) {
+  X86ManagedRegister base = mbase.AsX86();
+  CHECK(base.IsCpuRegister());
+  __ jmp(Address(base.AsCpuRegister(), offset.Int32Value()));
+}
+
 void X86JNIMacroAssembler::Call(ManagedRegister mbase, Offset offset, ManagedRegister) {
   X86ManagedRegister base = mbase.AsX86();
   CHECK(base.IsCpuRegister());