JNI: Inline fast-path for `JniMethodStart()`.
Golem results for art-opt-cc (higher is better):
linux-ia32                       before after
NativeDowncallStaticNormal       35.306 47.382 (+34.20%)
NativeDowncallStaticNormal6      32.951 42.247 (+28.21%)
NativeDowncallStaticNormalRefs6  17.866 41.355 (+131.5%)
NativeDowncallVirtualNormal      35.341 46.836 (+32.53%)
NativeDowncallVirtualNormal6     32.403 41.791 (+28.97%)
NativeDowncallVirtualNormalRefs6 32.131 40.500 (+26.05%)
linux-x64                        before after
NativeDowncallStaticNormal       33.350 43.716 (+31.08%)
NativeDowncallStaticNormal6      31.096 43.176 (+38.85%)
NativeDowncallStaticNormalRefs6  30.617 38.500 (+25.75%)
NativeDowncallVirtualNormal      33.234 43.672 (+32.41%)
NativeDowncallVirtualNormal6     30.617 42.247 (+37.98%)
NativeDowncallVirtualNormalRefs6 32.131 42.701 (+32.90%)
linux-armv7                      before after
NativeDowncallStaticNormal       7.8701 9.9651 (+26.62%)
NativeDowncallStaticNormal6      7.4147 8.9463 (+20.66%)
NativeDowncallStaticNormalRefs6  6.8830 8.3868 (+21.85%)
NativeDowncallVirtualNormal      7.8316 9.8377 (+25.61%)
NativeDowncallVirtualNormal6     7.4147 9.3596 (+26.23%)
NativeDowncallVirtualNormalRefs6 6.6794 8.4325 (+26.25%)
linux-armv8                      before after
NativeDowncallStaticNormal       7.6372 9.8571 (+29.07%)
NativeDowncallStaticNormal6      7.4147 9.4905 (+28.00%)
NativeDowncallStaticNormalRefs6  6.8527 8.6705 (+26.53%)
NativeDowncallVirtualNormal      7.4147 9.3183 (+25.67%)
NativeDowncallVirtualNormal6     7.0755 9.2593 (+30.86%)
NativeDowncallVirtualNormalRefs6 6.5604 8.2967 (+26.47%)
Note that NativeDowncallStaticNormalRefs6 on x86 has been jumping
between ~17.6 and ~32.4 for completely unrelated changes since
https://android-review.googlesource.com/1905055 landed. If we take
32.4 as the baseline, the improvement is only ~27.6%, in line with
the other x86 benchmarks.
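
For reference, the inlined fast path is roughly equivalent to the
following C++ sketch (illustrative only; the struct fields and the
kNativeStateValue/kMutatorLock encodings below are placeholders, not
the real Thread layout):

  #include <atomic>
  #include <cstddef>
  #include <cstdint>

  // Sketch of the logic emitted by TryToTransitionFromRunnableToNative().
  struct ThreadSketch {
    std::atomic<uint32_t> state_and_flags;  // packed thread state + flag bits
    void* held_mutexes[1];                  // held_mutexes[kMutatorLock]
  };

  constexpr uint32_t kRunnableStateValue = 0u;  // kRunnable with no flags set
  constexpr uint32_t kNativeStateValue = 1u;    // placeholder encoding
  constexpr size_t kMutatorLock = 0u;           // placeholder index

  bool TryTransitionRunnableToNative(ThreadSketch* self) {
    uint32_t expected = kRunnableStateValue;
    // Acquire CAS: succeeds only if the state is kRunnable and no flags
    // (suspend request, checkpoint, ...) are set; otherwise the caller
    // takes the JniMethodStart() slow path.
    if (!self->state_and_flags.compare_exchange_strong(
            expected, kNativeStateValue, std::memory_order_acquire)) {
      return false;
    }
    // Clear the recorded mutator lock, matching the str/mov of 0 in the
    // per-architecture code below.
    self->held_mutexes[kMutatorLock] = nullptr;
    return true;
  }
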
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: I771a4765bd3a7c4e58b94be4155515241ea6fa3c
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
index 3d45abd..b06f428 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.cc
@@ -1050,6 +1050,35 @@
asm_.StoreToOffset(kStoreWord, tr, sp, dest_offset.Int32Value());
}
+void ArmVIXLJNIMacroAssembler::TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) {
+ constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+ constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+ constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kArmPointerSize>();
+ constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+ Thread::HeldMutexOffset<kArmPointerSize>(kMutatorLock);
+
+ DCHECK_GE(scratch_regs.size(), 2u);
+ vixl32::Register scratch = AsVIXLRegister(scratch_regs[0].AsArm());
+ vixl32::Register scratch2 = AsVIXLRegister(scratch_regs[1].AsArm());
+
+ // CAS acquire, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+ vixl32::Label retry;
+ ___ Bind(&retry);
+ ___ Ldrex(scratch, MemOperand(tr, thread_flags_offset.Int32Value()));
+ ___ Mov(scratch2, kNativeStateValue);
+ // If any flags are set, go to the slow path.
+ ___ Cmp(scratch, kRunnableStateValue);
+ ___ B(ne, ArmVIXLJNIMacroLabel::Cast(label)->AsArm());
+ ___ Strex(scratch, scratch2, MemOperand(tr, thread_flags_offset.Int32Value()));
+ ___ Cmp(scratch, 0);
+ ___ B(ne, &retry);
+ ___ Dmb(DmbOptions::ISH); // Memory barrier "load-any" for the "acquire" operation.
+
+ // Clear `self->tlsPtr_.held_mutexes[kMutatorLock]`; `scratch` holds 0 at this point.
+ ___ Str(scratch, MemOperand(tr, thread_held_mutex_mutator_lock_offset.Int32Value()));
+}
+
void ArmVIXLJNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
vixl32::Register scratch = temps.Acquire();
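
On ARMv7 the LDREX/STREX loop itself carries no ordering; the DMB ISH
after the successful store is what gives the CAS its acquire semantics.
In std::atomic terms the emitted sequence corresponds roughly to the
sketch below (an illustration, not ART code):

  #include <atomic>
  #include <cstdint>

  // Relaxed LL/SC CAS followed by an acquire fence, mirroring the
  // ldrex/strex retry loop plus the trailing dmb ish above.
  bool CasThenAcquireFence(std::atomic<uint32_t>* flags,
                           uint32_t expected, uint32_t desired) {
    if (!flags->compare_exchange_strong(expected, desired,
                                        std::memory_order_relaxed)) {
      return false;  // corresponds to branching to the slow-path label
    }
    // "Load-any" barrier: later accesses may not be reordered before the
    // load part of the CAS.
    std::atomic_thread_fence(std::memory_order_acquire);
    return true;
  }
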
diff --git a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
index 49f5e7c..7b9d7de 100644
--- a/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
+++ b/compiler/utils/arm/jni_macro_assembler_arm_vixl.h
@@ -184,6 +184,10 @@
void Call(FrameOffset base, Offset offset) override;
void CallFromThread(ThreadOffset32 offset) override;
+ // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+ void TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+
// Generate suspend check and branch to `label` if there is a pending suspend request.
void SuspendCheck(JNIMacroLabel* label) override;
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.cc b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
index a505db0..8ae1d04 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.cc
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.cc
@@ -889,6 +889,34 @@
___ Str(scratch, MEM_OP(reg_x(SP), out_off.Int32Value()));
}
+void Arm64JNIMacroAssembler::TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs ATTRIBUTE_UNUSED) {
+ constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+ constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+ constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kArm64PointerSize>();
+ constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+ Thread::HeldMutexOffset<kArm64PointerSize>(kMutatorLock);
+
+ UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
+ Register scratch = temps.AcquireW();
+ Register scratch2 = temps.AcquireW();
+
+ // CAS acquire, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+ vixl::aarch64::Label retry;
+ ___ Bind(&retry);
+ static_assert(thread_flags_offset.Int32Value() == 0); // LDAXR/STXR require exact address.
+ ___ Ldaxr(scratch, MEM_OP(reg_x(TR)));
+ ___ Mov(scratch2, kNativeStateValue);
+ // If any flags are set, go to the slow path.
+ static_assert(kRunnableStateValue == 0u);
+ ___ Cbnz(scratch, Arm64JNIMacroLabel::Cast(label)->AsArm64());
+ ___ Stxr(scratch, scratch2, MEM_OP(reg_x(TR)));
+ ___ Cbnz(scratch, &retry);
+
+ // Clear `self->tlsPtr_.held_mutexes[kMutatorLock]`.
+ ___ Str(xzr, MEM_OP(reg_x(TR), thread_held_mutex_mutator_lock_offset.Int32Value()));
+}
+
void Arm64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
UseScratchRegisterScope temps(asm_.GetVIXLAssembler());
Register scratch = temps.AcquireW();
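
On ARM64 the acquire ordering comes from LDAXR itself, so no separate
barrier follows the loop; the static_assert reflects that exclusive
load/store instructions accept only a base register with no offset, so
the flags word must live at offset 0 in Thread. In std::atomic terms
(illustrative sketch only):

  #include <atomic>
  #include <cstdint>

  // Acquire CAS with no trailing fence: maps onto ldaxr/stxr on ARM64.
  bool AcquireCas(std::atomic<uint32_t>* flags,
                  uint32_t expected, uint32_t desired) {
    return flags->compare_exchange_strong(expected, desired,
                                          std::memory_order_acquire,
                                          std::memory_order_relaxed);
  }
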
diff --git a/compiler/utils/arm64/jni_macro_assembler_arm64.h b/compiler/utils/arm64/jni_macro_assembler_arm64.h
index b6e31c2..1c61d96 100644
--- a/compiler/utils/arm64/jni_macro_assembler_arm64.h
+++ b/compiler/utils/arm64/jni_macro_assembler_arm64.h
@@ -168,6 +168,10 @@
void Call(FrameOffset base, Offset offset) override;
void CallFromThread(ThreadOffset64 offset) override;
+ // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+ void TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+
// Generate suspend check and branch to `label` if there is a pending suspend request.
void SuspendCheck(JNIMacroLabel* label) override;
diff --git a/compiler/utils/jni_macro_assembler.h b/compiler/utils/jni_macro_assembler.h
index abb53b7..659ff4c 100644
--- a/compiler/utils/jni_macro_assembler.h
+++ b/compiler/utils/jni_macro_assembler.h
@@ -251,6 +251,10 @@
virtual void Call(FrameOffset base, Offset offset) = 0;
virtual void CallFromThread(ThreadOffset<kPointerSize> offset) = 0;
+ // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+ virtual void TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) = 0;
+
// Generate suspend check and branch to `label` if there is a pending suspend request.
virtual void SuspendCheck(JNIMacroLabel* label) = 0;
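
The intended call-site shape in the JNI compiler is not part of this
diff; the following is a hypothetical sketch using the existing
CreateLabel/Jump/Bind interface, with `jni_asm` and `scratch_regs`
supplied by the caller:

  // Hypothetical call-site sketch: emit the inline fast path and fall
  // back to the JniMethodStart() runtime call on the slow-path label.
  template <PointerSize kPointerSize>
  void EmitTransitionToNative(JNIMacroAssembler<kPointerSize>* jni_asm,
                              ArrayRef<const ManagedRegister> scratch_regs) {
    std::unique_ptr<JNIMacroLabel> slow_path = jni_asm->CreateLabel();
    std::unique_ptr<JNIMacroLabel> resume = jni_asm->CreateLabel();
    jni_asm->TryToTransitionFromRunnableToNative(slow_path.get(), scratch_regs);
    jni_asm->Jump(resume.get());
    jni_asm->Bind(slow_path.get());
    // ... emit the call to the JniMethodStart entrypoint here ...
    jni_asm->Bind(resume.get());
  }
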
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 4ba3aa1..fc92c30 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -589,6 +589,35 @@
__ movl(Address(ESP, offset), scratch);
}
+void X86JNIMacroAssembler::TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) {
+ constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+ constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+ constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kX86PointerSize>();
+ constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+ Thread::HeldMutexOffset<kX86PointerSize>(kMutatorLock);
+
+ // We need to preserve managed argument EAX.
+ DCHECK_GE(scratch_regs.size(), 2u);
+ Register saved_eax = scratch_regs[0].AsX86().AsCpuRegister();
+ Register scratch = scratch_regs[1].AsX86().AsCpuRegister();
+
+ // CAS acquire, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+ __ movl(saved_eax, EAX); // Save EAX.
+ static_assert(kRunnableStateValue == 0u);
+ __ xorl(EAX, EAX);
+ __ movl(scratch, Immediate(kNativeStateValue));
+ __ fs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value()), scratch);
+ // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+ __ movl(EAX, saved_eax); // Restore EAX; MOV does not change flags.
+ // If any flags are set, go to the slow path.
+ __ j(kNotZero, X86JNIMacroLabel::Cast(label)->AsX86());
+
+ // Clear `self->tlsPtr_.held_mutexes[kMutatorLock]`.
+ __ fs()->movl(Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value()),
+ Immediate(0));
+}
+
void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
__ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()),
Immediate(Thread::SuspendOrCheckpointRequestFlags()));
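
The x86 sequence leans on three ISA facts: CMPXCHG implicitly compares
against EAX (which also holds a managed argument, hence the save and
restore through scratch registers), MOV does not modify EFLAGS (so the
ZF produced by the CMPXCHG is still valid when we branch), and LOCK
CMPXCHG is a full barrier. In portable terms the emitted CAS is roughly
(illustrative only):

  #include <atomic>
  #include <cstdint>

  // A single strong CAS; on x86 this compiles to LOCK CMPXCHG, needs no
  // additional fence, and its success/failure maps to the ZF-based
  // j(kNotZero, slow_path) branch above.
  bool TransitionCas(std::atomic<uint32_t>* flags,
                     uint32_t runnable, uint32_t native_state) {
    uint32_t expected = runnable;  // must be 0: kRunnable with no flags set
    return flags->compare_exchange_strong(expected, native_state);
  }
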
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.h b/compiler/utils/x86/jni_macro_assembler_x86.h
index 058e040..0af6371 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.h
+++ b/compiler/utils/x86/jni_macro_assembler_x86.h
@@ -160,6 +160,10 @@
void Call(FrameOffset base, Offset offset) override;
void CallFromThread(ThreadOffset32 offset) override;
+ // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+ void TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+
// Generate suspend check and branch to `label` if there is a pending suspend request.
void SuspendCheck(JNIMacroLabel* label) override;
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
index de99e74..3ddb689 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.cc
@@ -671,6 +671,33 @@
__ movq(Address(CpuRegister(RSP), offset), scratch);
}
+void X86_64JNIMacroAssembler::TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs ATTRIBUTE_UNUSED) {
+ constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+ constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+ constexpr ThreadOffset64 thread_flags_offset = Thread::ThreadFlagsOffset<kX86_64PointerSize>();
+ constexpr ThreadOffset64 thread_held_mutex_mutator_lock_offset =
+ Thread::HeldMutexOffset<kX86_64PointerSize>(kMutatorLock);
+
+ CpuRegister rax(RAX); // RAX can be freely clobbered. It does not hold any argument.
+ CpuRegister scratch = GetScratchRegister();
+
+ // CAS acquire, old_value = kRunnableStateValue, new_value = kNativeStateValue, no flags.
+ static_assert(kRunnableStateValue == 0u);
+ __ xorl(rax, rax);
+ __ movl(scratch, Immediate(kNativeStateValue));
+ __ gs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value(), /*no_rip=*/ true),
+ scratch);
+ // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+ // If any flags are set, go to the slow path.
+ __ j(kNotZero, X86_64JNIMacroLabel::Cast(label)->AsX86_64());
+
+ // Clear `self->tlsPtr_.held_mutexes[kMutatorLock]`.
+ __ gs()->movq(
+ Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value(), /*no_rip=*/ true),
+ Immediate(0));
+}
+
void X86_64JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
__ gs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86_64PointerSize>(), true),
Immediate(Thread::SuspendOrCheckpointRequestFlags()));
diff --git a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
index 3e5dfb7..6eb7873 100644
--- a/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
+++ b/compiler/utils/x86_64/jni_macro_assembler_x86_64.h
@@ -180,6 +180,10 @@
void Call(FrameOffset base, Offset offset) override;
void CallFromThread(ThreadOffset64 offset) override;
+ // Generate fast-path for transition to Native. Go to `label` if any thread flag is set.
+ void TryToTransitionFromRunnableToNative(
+ JNIMacroLabel* label, ArrayRef<const ManagedRegister> scratch_regs) override;
+
// Generate suspend check and branch to `label` if there is a pending suspend request.
void SuspendCheck(JNIMacroLabel* label) override;