Direct calls to @CriticalNative methods.
Emit direct calls from compiled managed code to the native
code registered with the method, avoiding the JNI stub.
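
For reference, a minimal sketch of the kind of method this applies to
(hypothetical names; @CriticalNative lives in
dalvik.annotation.optimization and requires a static native method with
only primitive arguments and return type):

  import dalvik.annotation.optimization.CriticalNative;

  public final class FastOps {
    // Eligible for the direct-call path: static, primitives only.
    @CriticalNative
    public static native int add(int a, int b);
  }

  // Matching native implementation, invoked without JNIEnv* or jclass:
  //   extern "C" jint Java_FastOps_add(jint a, jint b) { return a + b; }
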
Golem results:
art-opt-cc                       x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +12.5% +62.5% +75.9% +41.7%
NativeDowncallStaticCritical6 +55.6% +87.5% +72.1% +35.3%

art-opt                          x86 x86-64    arm  arm64
NativeDowncallStaticCritical  +28.6% +85.6% +76.4% +38.4%
NativeDowncallStaticCritical6 +44.6% +44.6% +74.6% +32.2%
Test: Covered by 178-app-image-native-method.
Test: m test-art-host-gtest
Test: testrunner.py --host --debuggable --ndebuggable \
--optimizing --jit --jit-on-first-use
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Test: testrunner.py --target --debuggable --ndebuggable \
--optimizing --jit --jit-on-first-use -t 178
Test: aosp_cf_x86_phone-userdebug boots.
Test: aosp_cf_x86_phone-userdebug/jitzygote boots.
Bug: 112189621
Change-Id: I8b37da51e8fe0b7bc513bb81b127fe0416068866
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index e9ef21a..595b31e 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -16,6 +16,7 @@
#include "code_generator_x86.h"
+#include "arch/x86/jni_frame_x86.h"
#include "art_method-inl.h"
#include "class_table.h"
#include "code_generator_utils.h"
@@ -1300,6 +1301,34 @@
return Location::NoLocation();
}
+Location CriticalNativeCallingConventionVisitorX86::GetNextLocation(DataType::Type type) {
+ DCHECK_NE(type, DataType::Type::kReference);
+
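+ // The native x86 ABI passes all arguments on the stack. When visiting for
+ // register allocation we still advance stack_offset_, but return
+ // Location::Any() so the register allocator is not constrained; the actual
+ // moves to the native stack frame are emitted at the call site.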
+ Location location;
+ if (DataType::Is64BitType(type)) {
+ location = Location::DoubleStackSlot(stack_offset_);
+ stack_offset_ += 2 * kFramePointerSize;
+ } else {
+ location = Location::StackSlot(stack_offset_);
+ stack_offset_ += kFramePointerSize;
+ }
+ if (for_register_allocation_) {
+ location = Location::Any();
+ }
+ return location;
+}
+
+Location CriticalNativeCallingConventionVisitorX86::GetReturnLocation(DataType::Type type) const {
+ // We perform conversion to the managed ABI return register after the call if needed.
+ InvokeDexCallingConventionVisitorX86 dex_calling_convention;
+ return dex_calling_convention.GetReturnLocation(type);
+}
+
+Location CriticalNativeCallingConventionVisitorX86::GetMethodLocation() const {
+ // Pass the method in the hidden argument EAX.
+ return Location::RegisterLocation(EAX);
+}
+
void CodeGeneratorX86::Move32(Location destination, Location source) {
if (source.Equals(destination)) {
return;
@@ -1374,11 +1403,13 @@
size_t elem_size = DataType::Size(DataType::Type::kInt32);
// Create stack space for 2 elements.
__ subl(ESP, Immediate(2 * elem_size));
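+ // Keep the CFI in sync with the temporary stack adjustment so unwinding
+ // through this code remains correct.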
+ __ cfi().AdjustCFAOffset(2 * elem_size);
__ movl(Address(ESP, 0), source.AsRegisterPairLow<Register>());
__ movl(Address(ESP, elem_size), source.AsRegisterPairHigh<Register>());
__ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
// And remove the temporary stack space we allocated.
__ addl(ESP, Immediate(2 * elem_size));
+ __ cfi().AdjustCFAOffset(-(2 * elem_size));
} else {
LOG(FATAL) << "Unimplemented";
}
@@ -2286,9 +2317,15 @@
return;
}
- HandleInvoke(invoke);
+ if (invoke->GetCodePtrLocation() == HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative) {
+ CriticalNativeCallingConventionVisitorX86 calling_convention_visitor(
+ /*for_register_allocation=*/ true);
+ CodeGenerator::CreateCommonInvokeLocationSummary(invoke, &calling_convention_visitor);
+ } else {
+ HandleInvoke(invoke);
+ }
- // For PC-relative dex cache the invoke has an extra input, the PC-relative address base.
+ // For PC-relative load kinds the invoke has an extra input, the PC-relative address base.
if (invoke->HasPcRelativeMethodLoadKind()) {
invoke->GetLocations()->SetInAt(invoke->GetSpecialInputIndex(), Location::RequiresRegister());
}
@@ -2989,6 +3026,7 @@
if (!in.IsDoubleStackSlot() || !out.IsStackSlot()) {
adjustment = DataType::Size(DataType::Type::kInt64);
__ subl(ESP, Immediate(adjustment));
+ __ cfi().AdjustCFAOffset(adjustment);
}
// Load the value to the FP stack, using temporaries if needed.
@@ -3005,6 +3043,7 @@
// Remove the temporary stack space we allocated.
if (adjustment != 0) {
__ addl(ESP, Immediate(adjustment));
+ __ cfi().AdjustCFAOffset(-adjustment);
}
break;
}
@@ -3039,6 +3078,7 @@
if (!in.IsDoubleStackSlot() || !out.IsDoubleStackSlot()) {
adjustment = DataType::Size(DataType::Type::kInt64);
__ subl(ESP, Immediate(adjustment));
+ __ cfi().AdjustCFAOffset(adjustment);
}
// Load the value to the FP stack, using temporaries if needed.
@@ -3055,6 +3095,7 @@
// Remove the temporary stack space we allocated.
if (adjustment != 0) {
__ addl(ESP, Immediate(adjustment));
+ __ cfi().AdjustCFAOffset(-adjustment);
}
break;
}
@@ -3551,6 +3592,7 @@
// Create stack space for 2 elements.
// TODO: enhance register allocator to ask for stack temporaries.
__ subl(ESP, Immediate(2 * elem_size));
+ __ cfi().AdjustCFAOffset(2 * elem_size);
// Load the values to the FP stack in reverse order, using temporaries if needed.
const bool is_wide = !is_float;
@@ -3591,6 +3633,7 @@
// And remove the temporary stack space we allocated.
__ addl(ESP, Immediate(2 * elem_size));
+ __ cfi().AdjustCFAOffset(-(2 * elem_size));
}
@@ -4934,7 +4977,6 @@
Register CodeGeneratorX86::GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke,
Register temp) {
- DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
Location location = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
if (!invoke->GetLocations()->Intrinsified()) {
return location.AsRegister<Register>();
@@ -4970,7 +5012,7 @@
break;
}
case HInvokeStaticOrDirect::MethodLoadKind::kRecursive:
- callee_method = invoke->GetLocations()->InAt(invoke->GetSpecialInputIndex());
+ callee_method = invoke->GetLocations()->InAt(invoke->GetCurrentMethodIndex());
break;
case HInvokeStaticOrDirect::MethodLoadKind::kBootImageLinkTimePcRelative: {
DCHECK(GetCompilerOptions().IsBootImage() || GetCompilerOptions().IsBootImageExtension());
@@ -5009,15 +5051,73 @@
switch (invoke->GetCodePtrLocation()) {
case HInvokeStaticOrDirect::CodePtrLocation::kCallSelf:
__ call(GetFrameEntryLabel());
+ RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
break;
+ case HInvokeStaticOrDirect::CodePtrLocation::kCallCriticalNative: {
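+ // Compute the size of the out frame and the moves of the arguments from
+ // their managed locations to the native stack frame (on x86, the native
+ // ABI passes all arguments on the stack).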
+ HParallelMove parallel_move(GetGraph()->GetAllocator());
+ size_t out_frame_size =
+ PrepareCriticalNativeCall<CriticalNativeCallingConventionVisitorX86,
+ kNativeStackAlignment,
+ GetCriticalNativeDirectCallFrameSize>(invoke, &parallel_move);
+ if (out_frame_size != 0u) {
+ __ subl(ESP, Immediate(out_frame_size));
+ __ cfi().AdjustCFAOffset(out_frame_size);
+ GetMoveResolver()->EmitNativeCode(&parallel_move);
+ }
+ // (callee_method + offset_of_jni_entry_point)()
+ __ call(Address(callee_method.AsRegister<Register>(),
+ ArtMethod::EntryPointFromJniOffset(kX86PointerSize).Int32Value()));
+ RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
+ if (out_frame_size == 0u && DataType::IsFloatingPointType(invoke->GetType())) {
+ // Create space for conversion.
+ out_frame_size = 8u;
+ __ subl(ESP, Immediate(out_frame_size));
+ __ cfi().AdjustCFAOffset(out_frame_size);
+ }
+ // Zero-/sign-extend or move the result when needed due to native and managed ABI mismatch.
+ switch (invoke->GetType()) {
+ case DataType::Type::kBool:
+ __ movzxb(EAX, AL);
+ break;
+ case DataType::Type::kInt8:
+ __ movsxb(EAX, AL);
+ break;
+ case DataType::Type::kUint16:
+ __ movzxw(EAX, EAX);
+ break;
+ case DataType::Type::kInt16:
+ __ movsxw(EAX, EAX);
+ break;
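+ // Native code returns float/double in ST(0) of the x87 stack, while
+ // the managed ABI expects it in XMM0, so bounce through the stack
+ // slot reserved above.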
+ case DataType::Type::kFloat32:
+ __ fstps(Address(ESP, 0));
+ __ movss(XMM0, Address(ESP, 0));
+ break;
+ case DataType::Type::kFloat64:
+ __ fstpl(Address(ESP, 0));
+ __ movsd(XMM0, Address(ESP, 0));
+ break;
+ case DataType::Type::kInt32:
+ case DataType::Type::kInt64:
+ case DataType::Type::kVoid:
+ break;
+ default:
+ DCHECK(false) << invoke->GetType();
+ break;
+ }
+ if (out_frame_size != 0u) {
+ __ addl(ESP, Immediate(out_frame_size));
+ __ cfi().AdjustCFAOffset(-out_frame_size);
+ }
+ break;
+ }
case HInvokeStaticOrDirect::CodePtrLocation::kCallArtMethod:
// (callee_method + offset_of_quick_compiled_code)()
__ call(Address(callee_method.AsRegister<Register>(),
ArtMethod::EntryPointFromQuickCompiledCodeOffset(
kX86PointerSize).Int32Value()));
+ RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
break;
}
- RecordPcInfo(invoke, invoke->GetDexPc(), slow_path);
DCHECK(!IsLeafMethod());
}
@@ -5072,7 +5172,6 @@
}
void CodeGeneratorX86::RecordBootImageMethodPatch(HInvokeStaticOrDirect* invoke) {
- DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
HX86ComputeBaseMethodAddress* method_address =
invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
boot_image_method_patches_.emplace_back(
@@ -5081,7 +5180,6 @@
}
void CodeGeneratorX86::RecordMethodBssEntryPatch(HInvokeStaticOrDirect* invoke) {
- DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
HX86ComputeBaseMethodAddress* method_address =
invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
// Add the patch entry and bind its label at the end of the instruction.
@@ -5126,7 +5224,6 @@
uint32_t boot_image_reference,
HInvokeStaticOrDirect* invoke) {
if (GetCompilerOptions().IsBootImage()) {
- DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
HX86ComputeBaseMethodAddress* method_address =
invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
DCHECK(method_address != nullptr);
@@ -5135,7 +5232,6 @@
__ leal(reg, Address(method_address_reg, CodeGeneratorX86::kDummy32BitOffset));
RecordBootImageIntrinsicPatch(method_address, boot_image_reference);
} else if (GetCompilerOptions().GetCompilePic()) {
- DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
HX86ComputeBaseMethodAddress* method_address =
invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
DCHECK(method_address != nullptr);
@@ -5160,7 +5256,6 @@
if (GetCompilerOptions().IsBootImage()) {
DCHECK_EQ(boot_image_offset, IntrinsicVisitor::IntegerValueOfInfo::kInvalidReference);
// Load the class the same way as for HLoadClass::LoadKind::kBootImageLinkTimePcRelative.
- DCHECK_EQ(invoke->InputCount(), invoke->GetNumberOfArguments() + 1u);
HX86ComputeBaseMethodAddress* method_address =
invoke->InputAt(invoke->GetSpecialInputIndex())->AsX86ComputeBaseMethodAddress();
DCHECK(method_address != nullptr);
@@ -6365,24 +6460,45 @@
__ movl(Address(ESP, destination.GetStackIndex()), source.AsRegister<Register>());
}
} else if (source.IsRegisterPair()) {
+ if (destination.IsRegisterPair()) {
+ __ movl(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairLow<Register>());
+ DCHECK_NE(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairHigh<Register>());
+ __ movl(destination.AsRegisterPairHigh<Register>(), source.AsRegisterPairHigh<Register>());
+ } else if (destination.IsFpuRegister()) {
size_t elem_size = DataType::Size(DataType::Type::kInt32);
- // Create stack space for 2 elements.
- __ subl(ESP, Immediate(2 * elem_size));
- __ movl(Address(ESP, 0), source.AsRegisterPairLow<Register>());
- __ movl(Address(ESP, elem_size), source.AsRegisterPairHigh<Register>());
+ // Push the 2 source registers to stack.
+ __ pushl(source.AsRegisterPairHigh<Register>());
+ __ cfi().AdjustCFAOffset(elem_size);
+ __ pushl(source.AsRegisterPairLow<Register>());
+ __ cfi().AdjustCFAOffset(elem_size);
+ // Load the destination register.
__ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
// And remove the temporary stack space we allocated.
__ addl(ESP, Immediate(2 * elem_size));
+ __ cfi().AdjustCFAOffset(-(2 * elem_size));
+ } else {
+ DCHECK(destination.IsDoubleStackSlot());
+ __ movl(Address(ESP, destination.GetStackIndex()), source.AsRegisterPairLow<Register>());
+ __ movl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)),
+ source.AsRegisterPairHigh<Register>());
+ }
} else if (source.IsFpuRegister()) {
if (destination.IsRegister()) {
__ movd(destination.AsRegister<Register>(), source.AsFpuRegister<XmmRegister>());
} else if (destination.IsFpuRegister()) {
__ movaps(destination.AsFpuRegister<XmmRegister>(), source.AsFpuRegister<XmmRegister>());
} else if (destination.IsRegisterPair()) {
- XmmRegister src_reg = source.AsFpuRegister<XmmRegister>();
- __ movd(destination.AsRegisterPairLow<Register>(), src_reg);
- __ psrlq(src_reg, Immediate(32));
- __ movd(destination.AsRegisterPairHigh<Register>(), src_reg);
+ size_t elem_size = DataType::Size(DataType::Type::kInt32);
+ // Create stack space for 2 elements.
+ __ subl(ESP, Immediate(2 * elem_size));
+ __ cfi().AdjustCFAOffset(2 * elem_size);
+ // Store the source register.
+ __ movsd(Address(ESP, 0), source.AsFpuRegister<XmmRegister>());
+ // And pop the values into destination registers.
+ __ popl(destination.AsRegisterPairLow<Register>());
+ __ cfi().AdjustCFAOffset(-elem_size);
+ __ popl(destination.AsRegisterPairHigh<Register>());
+ __ cfi().AdjustCFAOffset(-elem_size);
} else if (destination.IsStackSlot()) {
__ movss(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>());
} else if (destination.IsDoubleStackSlot()) {
@@ -6480,9 +6596,12 @@
__ xorpd(dest, dest);
} else {
__ pushl(high);
+ __ cfi().AdjustCFAOffset(4);
__ pushl(low);
+ __ cfi().AdjustCFAOffset(4);
__ movsd(dest, Address(ESP, 0));
__ addl(ESP, Immediate(8));
+ __ cfi().AdjustCFAOffset(-8);
}
} else {
DCHECK(destination.IsDoubleStackSlot()) << destination;
@@ -6520,10 +6639,12 @@
void ParallelMoveResolverX86::Exchange128(XmmRegister reg, int mem) {
size_t extra_slot = 4 * kX86WordSize;
__ subl(ESP, Immediate(extra_slot));
+ __ cfi().AdjustCFAOffset(extra_slot);
__ movups(Address(ESP, 0), XmmRegister(reg));
ExchangeMemory(0, mem + extra_slot, 4);
__ movups(XmmRegister(reg), Address(ESP, 0));
__ addl(ESP, Immediate(extra_slot));
+ __ cfi().AdjustCFAOffset(-extra_slot);
}
void ParallelMoveResolverX86::ExchangeMemory(int mem1, int mem2, int number_of_words) {