Direct calls to @CriticalNative methods.

Emit direct calls from compiled managed code to the native
code registered with the method, avoiding the JNI stub.
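
For reference, a @CriticalNative method is a static native method with only
primitive parameters and return type; the code registered for it receives
those arguments directly, without JNIEnv* or jclass. Illustrative sketch
(not part of this change):

  import dalvik.annotation.optimization.CriticalNative;

  class NativeMath {
    @CriticalNative
    static native int add(int a, int b);  // native side: jint add(jint, jint)
  }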

Golem results:
art-opt-cc                       x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +12.5% +62.5% +75.9% +41.7%
NativeDowncallStaticCritical6 +55.6% +87.5% +72.1% +35.3%
art-opt                          x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +28.6% +85.6% +76.4% +38.4%
NativeDowncallStaticCritical6 +44.6% +44.6% +74.6% +32.2%

Test: Covered by 178-app-image-native-method.
Test: m test-art-host-gtest
Test: testrunner.py --host --debuggable --ndebuggable \
          --optimizing --jit --jit-on-first-use
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Test: testrunner.py --target --debuggable --ndebuggable \
          --optimizing --jit --jit-on-first-use -t 178
Test: aosp_cf_x86_phone-userdebug boots.
Test: aosp_cf_x86_phone-userdebug/jitzygote boots.
Bug: 112189621
Change-Id: I8b37da51e8fe0b7bc513bb81b127fe0416068866
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ec54376..4a0cc78 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -16,6 +16,7 @@
 
 #include "code_generator_x86_64.h"
 
+#include "arch/x86_64/jni_frame_x86_64.h"
 #include "art_method-inl.h"
 #include "class_table.h"
 #include "code_generator_utils.h"
@@ -978,6 +979,16 @@
   UNREACHABLE();
 }
 
+void CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(LocationSummary* locations) {
+  // The native code that we call directly (such as @CriticalNative methods or some
+  // intrinsic helpers, say Math.sin()) may clobber XMM registers that are non-volatile
+  // for ART but volatile for native calls. Blocking them as temporaries ensures that
+  // they are saved in the prologue and properly restored in the epilogue.
+  for (FloatRegister fp_reg : non_volatile_xmm_regs) {
+    locations->AddTemp(Location::FpuRegisterLocation(fp_reg));
+  }
+}
+
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorX86_64::GetSupportedInvokeStaticOrDirectDispatch(
       const HInvokeStaticOrDirect::DispatchInfo& desired_dispatch_info,
       ArtMethod* method ATTRIBUTE_UNUSED) {
@@ -998,7 +1009,7 @@
       break;
     }
     case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
-      callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+      callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
       break;
     case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative:
       DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -1032,15 +1043,61 @@
   switch (invoke->GetCodePtrLocation()) {
     case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
       __ call(&frame_entry_label_);
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
+    case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
+      HParallelMove parallel_move(GetGraph()->GetAllocator());
+      size_t out_frame_size =
+          PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorX86_64,
+                                    kNativeStackAlignment,
+                                    GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
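+      // If there are stack-passed arguments, reserve the out frame and emit the argument moves.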
+      if (out_frame_size != 0u) {
+        __ subq(CpuRegister(RSP), Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(out_frame_size);
+        GetMoveResolver()->EmitNativeCode(&parallel_move);
+      }
+      // (callee_method + offset_of_jni_entry_point)()
+      __ call(Address(callee_method.AsRegister<CpuRegister>(),
+                      ArtMethod::EntryPointFromJniOffset(kX86_64PointerSize).SizeValue()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+      // Zero-/sign-extend the result when needed due to native and managed ABI mismatch.
+      switch (invoke->GetType()) {
+        case DataType::Type::kBool:
+          __ movzxb(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt8:
+          __ movsxb(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kUint16:
+          __ movzxw(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt16:
+          __ movsxw(CpuRegister(RAX), CpuRegister(RAX));
+          break;
+        case DataType::Type::kInt32:
+        case DataType::Type::kInt64:
+        case DataType::Type::kFloat32:
+        case DataType::Type::kFloat64:
+        case DataType::Type::kVoid:
+          break;
+        default:
+          DCHECK(false) << invoke->GetType();
+          break;
+      }
+      if (out_frame_size != 0u) {
+        __ addq(CpuRegister(RSP), Immediate(out_frame_size));
+        __ cfi().AdjustCFAOffset(-out_frame_size);
+      }
+      break;
+    }
     case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
       // (callee_method + offset_of_quick_compiled_code)()
       __ call(Address(callee_method.AsRegister<CpuRegister>(),
                       ArtMethod::EntryPointFromQuickCompiledCodeOffset(
                           kX86_64PointerSize).SizeValue()));
+      RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
       break;
   }
-  RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
 
   DCHECK(!IsLeafMethod());
 }
@@ -2493,6 +2550,51 @@
   return Location::NoLocation();
 }
 
+Location CriticalNativeCallingConventionVisitorX86_64::GetNextLocation(DataType::Type type) {
+  DCHECK_NE(type, DataType::Type::kReference);
+
+  Location location = Location::NoLocation();
+  if (DataType::IsFloatingPointType(type)) {
+    if (fpr_index_ < kParameterFloatRegistersLength) {
+      location = Location::FpuRegisterLocation(kParameterFloatRegisters[fpr_index_]);
+      ++fpr_index_;
+    }
+  } else {
+    // Native ABI uses the same registers as managed, except that the method register RDI
+    // is a normal argument.
+    if (gpr_index_ < 1u + kParameterCoreRegistersLength) {
+      location = Location::RegisterLocation(
+          gpr_index_ == 0u ? RDI : kParameterCoreRegisters[gpr_index_ - 1u]);
+      ++gpr_index_;
+    }
+  }
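+  // No free argument register; pass the argument on the native stack.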
+  if (location.IsInvalid()) {
+    if (DataType::Is64BitType(type)) {
+      location = Location::DoubleStackSlot(stack_offset_);
+    } else {
+      location = Location::StackSlot(stack_offset_);
+    }
+    stack_offset_ += kFramePointerSize;
+
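+    // When building locations for the register allocator, represent stack arguments
+    // as Location::Any(); the actual moves to the native stack are prepared later
+    // by PrepareCriticalNativeCall() during code generation.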
+    if (for_register_allocation_) {
+      location = Location::Any();
+    }
+  }
+  return location;
+}
+
+Location CriticalNativeCallingConventionVisitorX86_64::GetReturnLocation(DataType::Type type)
+    const {
+  // We perform conversion to the managed ABI return register after the call if needed.
+  InvokeDexCallingConventionVisitorX86_64 dex_calling_convention;
+  return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorX86_64::GetMethodLocation() const {
+  // Pass the method in the hidden argument RAX.
+  return Location::RegisterLocation(RAX);
+}
+
 void LocationsBuilderX86_64::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
   // The trampoline uses the same calling convention as dex calling conventions,
   // except instead of loading arg0/r0 with the target Method*, arg0/r0 will contain
@@ -2514,7 +2616,14 @@
     return;
   }
 
-  HandleInvoke(invoke);
+  if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+    CriticalNativeCallingConventionVisitorX86_64 calling_convention_visitor(
+        /*for_register_allocation=*/ true);
+    CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+    CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(invoke->GetLocations());
+  } else {
+    HandleInvoke(invoke);
+  }
 }
 
 static bool TryGenerateIntrinsicCode(HInvoke* invoke, CodeGeneratorX86_64* codegen) {