JNI: Inline fast-path for `JniMethodEnd()`.
Golem results for art-opt-cc (higher is better):
linux-ia32                        before  after
NativeDowncallStaticNormal        46.766  51.016 (+9.086%)
NativeDowncallStaticNormal6       42.268  45.748 (+8.235%)
NativeDowncallStaticNormalRefs6   41.355  44.776 (+8.272%)
NativeDowncallVirtualNormal       46.361  52.527 (+13.30%)
NativeDowncallVirtualNormal6      41.812  45.206 (+8.118%)
NativeDowncallVirtualNormalRefs6  40.500  44.169 (+9.059%)
(The NativeDowncallVirtualNormal result for x86 is skewed
by one extra good run, as Golem reports the best result in
the summary. Using the second-best and most frequent result,
50.5, the improvement is only around 8.9%: 50.5/46.361 ≈ 1.089.)
linux-x64                         before  after
NativeDowncallStaticNormal        44.169  47.976 (+8.620%)
NativeDowncallStaticNormal6       43.198  46.836 (+8.423%)
NativeDowncallStaticNormalRefs6   38.481  44.687 (+16.13%)
NativeDowncallVirtualNormal       43.672  47.405 (+8.547%)
NativeDowncallVirtualNormal6      42.268  45.726 (+8.182%)
NativeDowncallVirtualNormalRefs6  41.355  44.687 (+8.057%)
(The NativeDowncallStaticNormalRefs6 result for x86-64 is
a bit inflated because recent results jump between ~38.5
and ~40.5. Taking the latter as the baseline, the
improvement is only around 10.3%: 44.687/40.5 ≈ 1.103.)
linux-armv7                       before  after
NativeDowncallStaticNormal        10.659  14.620 (+37.16%)
NativeDowncallStaticNormal6       9.8377  13.120 (+33.36%)
NativeDowncallStaticNormalRefs6   8.8714  11.454 (+29.11%)
NativeDowncallVirtualNormal       10.511  14.349 (+36.51%)
NativeDowncallVirtualNormal6      9.9701  13.347 (+33.87%)
NativeDowncallVirtualNormalRefs6  8.9241  11.454 (+28.35%)
linux-armv8                       before  after
NativeDowncallStaticNormal        10.608  16.329 (+53.93%)
NativeDowncallStaticNormal6       10.179  15.347 (+50.76%)
NativeDowncallStaticNormalRefs6   9.2457  13.705 (+48.23%)
NativeDowncallVirtualNormal       9.9850  14.903 (+49.25%)
NativeDowncallVirtualNormal6      9.9206  14.757 (+48.75%)
NativeDowncallVirtualNormalRefs6  8.8235  12.789 (+44.94%)
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 172332525
Change-Id: Ie144bc4f7f82be95790ea7d3123b81a3b6bfa603
diff --git a/compiler/utils/x86/jni_macro_assembler_x86.cc b/compiler/utils/x86/jni_macro_assembler_x86.cc
index 8be2a32..1a0d521 100644
--- a/compiler/utils/x86/jni_macro_assembler_x86.cc
+++ b/compiler/utils/x86/jni_macro_assembler_x86.cc
@@ -618,6 +618,57 @@
Immediate(0));
}
+void X86JNIMacroAssembler::TryToTransitionFromNativeToRunnable(
+ JNIMacroLabel* label,
+ ArrayRef<const ManagedRegister> scratch_regs,
+ ManagedRegister return_reg) {
+ constexpr uint32_t kNativeStateValue = Thread::StoredThreadStateValue(ThreadState::kNative);
+ constexpr uint32_t kRunnableStateValue = Thread::StoredThreadStateValue(ThreadState::kRunnable);
+ constexpr ThreadOffset32 thread_flags_offset = Thread::ThreadFlagsOffset<kX86PointerSize>();
+ constexpr ThreadOffset32 thread_held_mutex_mutator_lock_offset =
+ Thread::HeldMutexOffset<kX86PointerSize>(kMutatorLock);
+ constexpr ThreadOffset32 thread_mutator_lock_offset =
+ Thread::MutatorLockOffset<kX86PointerSize>();
+
+ size_t scratch_index = 0u;
+ auto get_scratch_reg = [&]() {
+ while (true) {
+ DCHECK_LT(scratch_index, scratch_regs.size());
+ X86ManagedRegister scratch_reg = scratch_regs[scratch_index].AsX86();
+ ++scratch_index;
+ DCHECK(!scratch_reg.Overlaps(return_reg.AsX86()));
+ if (scratch_reg.AsCpuRegister() != EAX) {
+ return scratch_reg.AsCpuRegister();
+ }
+ }
+ };
+ Register scratch = get_scratch_reg();
+ bool preserve_eax = return_reg.AsX86().Overlaps(X86ManagedRegister::FromCpuRegister(EAX));
+ Register saved_eax = preserve_eax ? get_scratch_reg() : kNoRegister;
+
+ // CAS acquire, old_value = kNativeStateValue, new_value = kRunnableStateValue, no flags.
+ if (preserve_eax) {
+ __ movl(saved_eax, EAX); // Save EAX.
+ }
+ __ movl(EAX, Immediate(kNativeStateValue));
+ static_assert(kRunnableStateValue == 0u);
+ __ xorl(scratch, scratch);
+ __ fs()->LockCmpxchgl(Address::Absolute(thread_flags_offset.Uint32Value()), scratch);
+ // LOCK CMPXCHG has full barrier semantics, so we don't need barriers here.
+ if (preserve_eax) {
+ __ movl(EAX, saved_eax); // Restore EAX; MOV does not change flags.
+ }
+ // If any flags are set, or the state is not Native, go to the slow path.
+ // (While the thread can theoretically transition between different Suspended states,
+ // it would be very unexpected to see a state other than Native at this point.)
+ __ j(kNotZero, X86JNIMacroLabel::Cast(label)->AsX86());
+
+ // Set `self->tlsPtr_.held_mutexes[kMutatorLock]` to the mutator lock.
+ __ fs()->movl(scratch, Address::Absolute(thread_mutator_lock_offset.Uint32Value()));
+ __ fs()->movl(Address::Absolute(thread_held_mutex_mutator_lock_offset.Uint32Value()),
+ scratch);
+}
+
void X86JNIMacroAssembler::SuspendCheck(JNIMacroLabel* label) {
__ fs()->testl(Address::Absolute(Thread::ThreadFlagsOffset<kX86PointerSize>()),
Immediate(Thread::SuspendOrCheckpointRequestFlags()));
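
For readers who don't speak x86 assembly, here is a rough C++ sketch of
the state transition the inlined fast path performs. The struct, the
field names, and the kNativeStateValue encoding are illustrative
assumptions, not ART's actual Thread layout; only kRunnableStateValue
== 0 is taken from the static_assert in the diff above.

  #include <atomic>
  #include <cstdint>

  // Illustrative placeholders, not ART's real types.
  struct MutatorLock {};

  struct ThreadSketch {
    std::atomic<uint32_t> state_and_flags;  // Thread state + flag bits in one word.
    MutatorLock* held_mutator_lock;         // Stands in for held_mutexes[kMutatorLock].
  };

  constexpr uint32_t kNativeStateValue = 2u;    // Placeholder encoding.
  constexpr uint32_t kRunnableStateValue = 0u;  // Matches the static_assert.

  // Returns true if the fast path succeeded; false means "take the slow path".
  bool TryToTransitionFromNativeToRunnable(ThreadSketch* self,
                                           MutatorLock* mutator_lock) {
    uint32_t expected = kNativeStateValue;  // No flag bits set, state == Native.
    // The LOCK CMPXCHG in the generated code: it fails if any flag is set
    // or the state is not Native, because both live in the same 32-bit word.
    if (!self->state_and_flags.compare_exchange_strong(
            expected, kRunnableStateValue, std::memory_order_acquire)) {
      return false;
    }
    // Record that the mutator lock is now held, as the final movl pair does.
    self->held_mutator_lock = mutator_lock;
    return true;
  }

Keeping the thread state and the flag bits in a single 32-bit word is
what lets one LOCK CMPXCHG check both at once: any pending flag makes
the expected value mismatch, diverting the stub to the slow path.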