ARM/ARM64: Improve lock/unlock entrypoints.
Do the same work in fewer instructions: load the thread id up front
and use an XOR with the lock word to check the lock state and owner
thread id in a single test, reusing the _no_inline entrypoints as the
slow paths. Also export additional lock word constants and rename
LOCK_WORD_STATE_MASK to LOCK_WORD_STATE_MASK_SHIFTED.
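
For reference, a minimal C sketch of the logic the new fast paths
implement (illustration only, not ART code; load_exclusive,
store_exclusive and store_release are placeholders for the
ldrex/strex, ldaxr/stxr and dmb/stlr sequences in the assembly, the
ll/sc retry loops are omitted, and the LOCK_WORD_* constants are the
ones exported by this change):

  // Thin lock word layout assumed here (from the exported constants):
  //   bits  0-15: owner thread id
  //   bits 16-27: recursive lock count
  //   bits 28-29: GC state (must be preserved)
  //   bits 30-31: lock state (non-zero => fat lock / forwarding address)
  static bool try_lock_fast(uint32_t* lock_word, uint32_t thread_id) {
    uint32_t lw = load_exclusive(lock_word);
    uint32_t x = lw ^ thread_id;  // new value if unlocked, or owner check
    if ((lw & LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED) == 0) {
      // Unlocked: only the GC bits were set, so x already holds
      // (thread id, count 0, preserved GC bits).
      return store_exclusive(lock_word, x);  // followed by dmb ish
    }
    // Any state bit set, or owner != self -> slow path.
    if ((x & (LOCK_WORD_STATE_MASK_SHIFTED |
              LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)) != 0) {
      return false;
    }
    uint32_t incremented = lw + LOCK_WORD_THIN_LOCK_COUNT_ONE;
    if ((incremented & LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED) == 0) {
      return false;  // count overflowed into the GC bits -> slow path
    }
    return store_exclusive(lock_word, incremented);
  }

  static bool try_unlock_fast(uint32_t* lock_word, uint32_t thread_id) {
    uint32_t lw = load_exclusive(lock_word);
    uint32_t x = lw ^ thread_id;
    if ((x & LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED) == 0) {
      // Simply locked by this thread with count 0: x is the unlocked
      // word (zero except for the preserved GC bits).
      return store_release(lock_word, x);  // dmb ish + str/strex
    }
    if ((x & (LOCK_WORD_STATE_MASK_SHIFTED |
              LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)) != 0) {
      return false;  // fat lock, forwarding address, or not the owner
    }
    return store_exclusive(lock_word, lw - LOCK_WORD_THIN_LOCK_COUNT_ONE);
  }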
Test: Pixel 2 XL boots.
Test: testrunner.py --target --optimizing
Change-Id: I8003481116fd3dc6a1559b84fdc776b92dba0c68
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index cd00125..311e838 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -55,7 +55,7 @@
@ Load kSaveAllCalleeSaves Method* into rTemp.
ldr \rTemp, [\rTemp, #RUNTIME_SAVE_ALL_CALLEE_SAVES_METHOD_OFFSET]
str \rTemp, [sp, #0] @ Place Method* at bottom of stack.
- str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
+ str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
// Ugly compile-time check, but we only have the preprocessor.
#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVES != 36 + 64 + 12)
@@ -86,7 +86,7 @@
@ Load kSaveRefsOnly Method* into rTemp.
ldr \rTemp, [\rTemp, #RUNTIME_SAVE_REFS_ONLY_METHOD_OFFSET]
str \rTemp, [sp, #0] @ Place Method* at bottom of stack.
- str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
+ str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
// Ugly compile-time check, but we only have the preprocessor.
#if (FRAME_SIZE_SAVE_REFS_ONLY != 28 + 4)
@@ -147,13 +147,13 @@
@ Load kSaveRefsAndArgs Method* into rTemp.
ldr \rTemp, [\rTemp, #RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET]
str \rTemp, [sp, #0] @ Place Method* at bottom of stack.
- str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
+ str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
.endm
.macro SETUP_SAVE_REFS_AND_ARGS_FRAME_WITH_METHOD_IN_R0
SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
str r0, [sp, #0] @ Store ArtMethod* to bottom of stack.
- str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
+ str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
.endm
.macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
@@ -193,7 +193,7 @@
@ Load kSaveEverything Method* into rTemp.
ldr \rTemp, [\rTemp, #\runtime_method_offset]
str \rTemp, [sp, #0] @ Place Method* at bottom of stack.
- str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
+ str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET] @ Place sp in Thread::Current()->top_quick_frame.
// Ugly compile-time check, but we only have the preprocessor.
#if (FRAME_SIZE_SAVE_EVERYTHING != 56 + 128 + 8)
@@ -301,7 +301,7 @@
* exception is Thread::Current()->exception_ when the runtime method frame is ready.
*/
.macro DELIVER_PENDING_EXCEPTION_FRAME_READY
- mov r0, r9 @ pass Thread::Current
+ mov r0, rSELF @ pass Thread::Current
bl artDeliverPendingExceptionFromCode @ artDeliverPendingExceptionFromCode(Thread*)
.endm
@@ -318,7 +318,7 @@
.extern \cxx_name
ENTRY \c_name
SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r0 @ save all registers as basis for long jump context
- mov r0, r9 @ pass Thread::Current
+ mov r0, rSELF @ pass Thread::Current
bl \cxx_name @ \cxx_name(Thread*)
END \c_name
.endm
@@ -327,7 +327,7 @@
.extern \cxx_name
ENTRY \c_name
SETUP_SAVE_EVERYTHING_FRAME r0 @ save all registers as basis for long jump context
- mov r0, r9 @ pass Thread::Current
+ mov r0, rSELF @ pass Thread::Current
bl \cxx_name @ \cxx_name(Thread*)
END \c_name
.endm
@@ -336,7 +336,7 @@
.extern \cxx_name
ENTRY \c_name
SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r1 @ save all registers as basis for long jump context
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
bl \cxx_name @ \cxx_name(Thread*)
END \c_name
.endm
@@ -345,13 +345,13 @@
.extern \cxx_name
ENTRY \c_name
SETUP_SAVE_EVERYTHING_FRAME r2 @ save all registers as basis for long jump context
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
bl \cxx_name @ \cxx_name(Thread*)
END \c_name
.endm
.macro RETURN_OR_DELIVER_PENDING_EXCEPTION_REG reg
- ldr \reg, [r9, #THREAD_EXCEPTION_OFFSET] // Get exception field.
+ ldr \reg, [rSELF, #THREAD_EXCEPTION_OFFSET] @ Get exception field.
cbnz \reg, 1f
bx lr
1:
@@ -377,7 +377,7 @@
.extern \entrypoint
ENTRY \name
SETUP_SAVE_REFS_ONLY_FRAME r1 @ save callee saves in case of GC
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
bl \entrypoint @ (uint32_t field_idx, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -389,7 +389,7 @@
.extern \entrypoint
ENTRY \name
SETUP_SAVE_REFS_ONLY_FRAME r2 @ save callee saves in case of GC
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
bl \entrypoint @ (field_idx, Object*, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -401,7 +401,7 @@
.extern \entrypoint
ENTRY \name
SETUP_SAVE_REFS_ONLY_FRAME r3 @ save callee saves in case of GC
- mov r3, r9 @ pass Thread::Current
+ mov r3, rSELF @ pass Thread::Current
bl \entrypoint @ (field_idx, Object*, new_val, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME @ TODO: we can clearly save an add here
REFRESH_MARKING_REGISTER
@@ -448,7 +448,7 @@
@ save all registers as basis for long jump context
SETUP_SAVE_EVERYTHING_FRAME_CORE_REGS_SAVED r1
mov r0, lr @ pass the fault address stored in LR by the fault handler.
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
bl artThrowNullPointerExceptionFromSignal @ (Thread*)
END art_quick_throw_null_pointer_exception_from_signal
@@ -494,7 +494,7 @@
.macro INVOKE_TRAMPOLINE_BODY cxx_name
.extern \cxx_name
SETUP_SAVE_REFS_AND_ARGS_FRAME r2 @ save callee saves in case allocation triggers GC
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
mov r3, sp
bl \cxx_name @ (method_idx, this, Thread*, SP)
mov r12, r1 @ save Method*->code_
@@ -682,50 +682,48 @@
*/
.extern artLockObjectFromCode
ENTRY art_quick_lock_object
+ ldr r1, [rSELF, #THREAD_ID_OFFSET]
cbz r0, .Lslow_lock
.Lretry_lock:
- ldr r2, [r9, #THREAD_ID_OFFSET]
- ldrex r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
- mov r3, r1
- and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ zero the gc bits
- cbnz r3, .Lnot_unlocked @ already thin locked
- @ unlocked case - r1: original lock word that's zero except for the read barrier bits.
- orr r2, r1, r2 @ r2 holds thread id with count of 0 with preserved read barrier bits
- strex r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
- cbnz r3, .Llock_strex_fail @ store failed, retry
- dmb ish @ full (LoadLoad|LoadStore) memory barrier
+ ldrex r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+ eor r3, r2, r1 @ Prepare the value to store if unlocked
+ @ (thread id, count of 0 and preserved read barrier bits),
+ @ or prepare to compare thread id for recursive lock check
+ @ (lock_word.ThreadId() ^ self->ThreadId()).
+ ands ip, r2, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ Test the non-gc bits.
+ bne .Lnot_unlocked @ Check if unlocked.
+ @ unlocked case - store r3: original lock word plus thread id, preserved read barrier bits.
+ strex r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+ cbnz r2, .Llock_strex_fail @ If store failed, retry.
+ dmb ish @ Full (LoadLoad|LoadStore) memory barrier.
bx lr
-.Lnot_unlocked: @ r1: original lock word, r2: thread_id with count of 0 and zero read barrier bits
- lsr r3, r1, LOCK_WORD_STATE_SHIFT
- cbnz r3, .Lslow_lock @ if either of the top two bits are set, go slow path
- eor r2, r1, r2 @ lock_word.ThreadId() ^ self->ThreadId()
- uxth r2, r2 @ zero top 16 bits
- cbnz r2, .Lslow_lock @ lock word and self thread id's match -> recursive lock
- @ else contention, go to slow path
- mov r3, r1 @ copy the lock word to check count overflow.
- and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ zero the gc bits.
- add r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE @ increment count in lock word placing in r2 to check overflow
- lsr r3, r2, #LOCK_WORD_GC_STATE_SHIFT @ if the first gc state bit is set, we overflowed.
- cbnz r3, .Lslow_lock @ if we overflow the count go slow path
- add r2, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE @ increment count for real
- strex r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
- cbnz r3, .Llock_strex_fail @ strex failed, retry
+.Lnot_unlocked: @ r2: original lock word, r1: thread_id, r3: r2 ^ r1
+#if LOCK_WORD_THIN_LOCK_COUNT_SHIFT + LOCK_WORD_THIN_LOCK_COUNT_SIZE != LOCK_WORD_GC_STATE_SHIFT
+#error "Expecting thin lock count and gc state in consecutive bits."
+#endif
+ @ Check lock word state and thread id together,
+ bfc r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #(LOCK_WORD_THIN_LOCK_COUNT_SIZE + LOCK_WORD_GC_STATE_SIZE)
+ cbnz r3, .Lslow_lock @ if either of the top two bits are set, or the lock word's
+ @ thread id did not match, go slow path.
+ add r3, r2, #LOCK_WORD_THIN_LOCK_COUNT_ONE @ Increment the recursive lock count.
+ @ Extract the new thin lock count for overflow check.
+ ubfx r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #LOCK_WORD_THIN_LOCK_COUNT_SIZE
+ cbz r2, .Lslow_lock @ Zero as the new count indicates overflow, go slow path.
+ strex r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits.
+ cbnz r2, .Llock_strex_fail @ If strex failed, retry.
bx lr
.Llock_strex_fail:
b .Lretry_lock @ retry
-.Lslow_lock:
- SETUP_SAVE_REFS_ONLY_FRAME r1 @ save callee saves in case we block
- mov r1, r9 @ pass Thread::Current
- bl artLockObjectFromCode @ (Object* obj, Thread*)
- RESTORE_SAVE_REFS_ONLY_FRAME
- REFRESH_MARKING_REGISTER
- RETURN_IF_RESULT_IS_ZERO
- DELIVER_PENDING_EXCEPTION
+// Note: the slow path is the .Lslow_lock label in art_quick_lock_object_no_inline below (effectively a tail call).
END art_quick_lock_object
ENTRY art_quick_lock_object_no_inline
+ // This is also the slow path for art_quick_lock_object. Note that we
+ // need a local label; the assembler complains about the target being
+ // out of range if we try to jump to `art_quick_lock_object_no_inline`.
+.Lslow_lock:
SETUP_SAVE_REFS_ONLY_FRAME r1 @ save callee saves in case we block
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
bl artLockObjectFromCode @ (Object* obj, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -739,62 +737,59 @@
*/
.extern artUnlockObjectFromCode
ENTRY art_quick_unlock_object
+ ldr r1, [rSELF, #THREAD_ID_OFFSET]
cbz r0, .Lslow_unlock
.Lretry_unlock:
#ifndef USE_READ_BARRIER
- ldr r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+ ldr r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
#else
- ldrex r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ Need to use atomic instructions for read barrier
+ @ Need to use atomic instructions for read barrier.
+ ldrex r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
#endif
- lsr r2, r1, #LOCK_WORD_STATE_SHIFT
- cbnz r2, .Lslow_unlock @ if either of the top two bits are set, go slow path
- ldr r2, [r9, #THREAD_ID_OFFSET]
- mov r3, r1 @ copy lock word to check thread id equality
- and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ zero the gc bits
- eor r3, r3, r2 @ lock_word.ThreadId() ^ self->ThreadId()
- uxth r3, r3 @ zero top 16 bits
- cbnz r3, .Lslow_unlock @ do lock word and self thread id's match?
- mov r3, r1 @ copy lock word to detect transition to unlocked
- and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ zero the gc bits
- cmp r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
- bpl .Lrecursive_thin_unlock
- @ transition to unlocked
- mov r3, r1
- and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED @ r3: zero except for the preserved gc bits
- dmb ish @ full (LoadStore|StoreStore) memory barrier
+ eor r3, r2, r1 @ Prepare the value to store if simply locked
+ @ (mostly 0s, and preserved read barrier bits),
+ @ or prepare to compare thread id for recursive lock check
+ @ (lock_word.ThreadId() ^ self->ThreadId()).
+ ands ip, r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ Test the non-gc bits.
+ bne .Lnot_simply_locked @ Locked recursively or by other thread?
+ @ Transition to unlocked.
+ dmb ish @ Full (LoadStore|StoreStore) memory barrier.
#ifndef USE_READ_BARRIER
str r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
#else
strex r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
- cbnz r2, .Lunlock_strex_fail @ store failed, retry
+ cbnz r2, .Lunlock_strex_fail @ If the store failed, retry.
#endif
bx lr
-.Lrecursive_thin_unlock: @ r1: original lock word
- sub r1, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE @ decrement count
+.Lnot_simply_locked: @ r2: original lock word, r1: thread_id, r3: r2 ^ r1
+#if LOCK_WORD_THIN_LOCK_COUNT_SHIFT + LOCK_WORD_THIN_LOCK_COUNT_SIZE != LOCK_WORD_GC_STATE_SHIFT
+#error "Expecting thin lock count and gc state in consecutive bits."
+#endif
+ @ Check lock word state and thread id together,
+ bfc r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #(LOCK_WORD_THIN_LOCK_COUNT_SIZE + LOCK_WORD_GC_STATE_SIZE)
+ cbnz r3, .Lslow_unlock @ if either of the top two bits are set, or the lock word's
+ @ thread id did not match, go slow path.
+ sub r3, r2, #LOCK_WORD_THIN_LOCK_COUNT_ONE @ Decrement recursive lock count.
#ifndef USE_READ_BARRIER
- str r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+ str r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
#else
- strex r2, r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
- cbnz r2, .Lunlock_strex_fail @ store failed, retry
+ strex r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits.
+ cbnz r2, .Lunlock_strex_fail @ If the store failed, retry.
#endif
bx lr
.Lunlock_strex_fail:
b .Lretry_unlock @ retry
-.Lslow_unlock:
- @ save callee saves in case exception allocation triggers GC
- SETUP_SAVE_REFS_ONLY_FRAME r1
- mov r1, r9 @ pass Thread::Current
- bl artUnlockObjectFromCode @ (Object* obj, Thread*)
- RESTORE_SAVE_REFS_ONLY_FRAME
- REFRESH_MARKING_REGISTER
- RETURN_IF_RESULT_IS_ZERO
- DELIVER_PENDING_EXCEPTION
+// Note: the slow path is the .Lslow_unlock label in art_quick_unlock_object_no_inline below (effectively a tail call).
END art_quick_unlock_object
ENTRY art_quick_unlock_object_no_inline
+ // This is also the slow path for art_quick_unlock_object. Note that we
+ // need a local label; the assembler complains about the target being
+ // out of range if we try to jump to `art_quick_unlock_object_no_inline`.
+.Lslow_unlock:
@ save callee saves in case exception allocation triggers GC
SETUP_SAVE_REFS_ONLY_FRAME r1
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
bl artUnlockObjectFromCode @ (Object* obj, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -832,7 +827,7 @@
.Lthrow_class_cast_exception_for_bitstring_check:
SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r2 @ save all registers as basis for long jump context
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
bl artThrowClassCastExceptionForObject @ (Object*, Class*, Thread*)
bkpt
END art_quick_check_instance_of
@@ -917,7 +912,7 @@
add r3, r0, #MIRROR_OBJECT_ARRAY_DATA_OFFSET
POISON_HEAP_REF r2
str r2, [r3, r1, lsl #2]
- ldr r3, [r9, #THREAD_CARD_TABLE_OFFSET]
+ ldr r3, [rSELF, #THREAD_CARD_TABLE_OFFSET]
lsr r0, r0, #CARD_TABLE_CARD_SHIFT
strb r3, [r3, r0]
blx lr
@@ -945,7 +940,7 @@
add r3, r0, #MIRROR_OBJECT_ARRAY_DATA_OFFSET
POISON_HEAP_REF r2
str r2, [r3, r1, lsl #2]
- ldr r3, [r9, #THREAD_CARD_TABLE_OFFSET]
+ ldr r3, [rSELF, #THREAD_CARD_TABLE_OFFSET]
lsr r0, r0, #CARD_TABLE_CARD_SHIFT
strb r3, [r3, r0]
blx lr
@@ -954,7 +949,7 @@
/* No need to repeat restore cfi directives, the ones above apply here. */
SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r3
mov r1, r2
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
bl artThrowArrayStoreException @ (Class*, Class*, Thread*)
bkpt @ unreached
END art_quick_aput_obj
@@ -964,7 +959,7 @@
.extern \entrypoint
ENTRY \name
SETUP_SAVE_REFS_ONLY_FRAME r1 @ save callee saves in case of GC
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
bl \entrypoint @ (uint32_t type_idx, Method* method, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -977,7 +972,7 @@
.extern \entrypoint
ENTRY \name
SETUP_SAVE_REFS_ONLY_FRAME r2 @ save callee saves in case of GC
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
bl \entrypoint @ (uint32_t type_idx, Method* method, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -990,7 +985,7 @@
.extern \entrypoint
ENTRY \name
SETUP_SAVE_REFS_ONLY_FRAME r3 @ save callee saves in case of GC
- mov r3, r9 @ pass Thread::Current
+ mov r3, rSELF @ pass Thread::Current
@ (uint32_t type_idx, Method* method, int32_t component_count, Thread*)
bl \entrypoint
RESTORE_SAVE_REFS_ONLY_FRAME
@@ -1004,7 +999,7 @@
.extern \entrypoint
ENTRY \name
SETUP_SAVE_REFS_ONLY_FRAME r12 @ save callee saves in case of GC
- str r9, [sp, #-16]! @ expand the frame and pass Thread::Current
+ str rSELF, [sp, #-16]! @ expand the frame and pass Thread::Current
.cfi_adjust_cfa_offset 16
bl \entrypoint
add sp, #16 @ strip the extra frame
@@ -1023,7 +1018,7 @@
.extern \entrypoint
ENTRY \name
SETUP_SAVE_EVERYTHING_FRAME r1, \runtime_method_offset @ save everything in case of GC
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
bl \entrypoint @ (uint32_t index, Thread*)
cbz r0, 1f @ If result is null, deliver the OOME.
.cfi_remember_state
@@ -1065,9 +1060,9 @@
.extern artGet64StaticFromCompiledCode
ENTRY art_quick_get64_static
SETUP_SAVE_REFS_ONLY_FRAME r2 @ save callee saves in case of GC
- mov r1, r9 @ pass Thread::Current
- bl artGet64StaticFromCompiledCode @ (uint32_t field_idx, Thread*)
- ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
+ mov r1, rSELF @ pass Thread::Current
+ bl artGet64StaticFromCompiledCode @ (uint32_t field_idx, Thread*)
+ ldr r2, [rSELF, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
cbnz r2, 1f @ success if no exception pending
@@ -1091,9 +1086,9 @@
.extern artGet64InstanceFromCompiledCode
ENTRY art_quick_get64_instance
SETUP_SAVE_REFS_ONLY_FRAME r2 @ save callee saves in case of GC
- mov r2, r9 @ pass Thread::Current
- bl artGet64InstanceFromCompiledCode @ (field_idx, Object*, Thread*)
- ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
+ mov r2, rSELF @ pass Thread::Current
+ bl artGet64InstanceFromCompiledCode @ (field_idx, Object*, Thread*)
+ ldr r2, [rSELF, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
cbnz r2, 1f @ success if no exception pending
@@ -1125,7 +1120,7 @@
ENTRY art_quick_set64_instance
SETUP_SAVE_REFS_ONLY_FRAME r12 @ save callee saves in case of GC
@ r2:r3 contain the wide argument
- str r9, [sp, #-16]! @ expand the frame and pass Thread::Current
+ str rSELF, [sp, #-16]! @ expand the frame and pass Thread::Current
.cfi_adjust_cfa_offset 16
bl artSet64InstanceFromCompiledCode @ (field_idx, Object*, new_val, Thread*)
add sp, #16 @ release out args
@@ -1140,7 +1135,7 @@
ENTRY art_quick_set64_static
SETUP_SAVE_REFS_ONLY_FRAME r12 @ save callee saves in case of GC
@ r2:r3 contain the wide argument
- str r9, [sp, #-16]! @ expand the frame and pass Thread::Current
+ str rSELF, [sp, #-16]! @ expand the frame and pass Thread::Current
.cfi_adjust_cfa_offset 16
bl artSet64StaticFromCompiledCode @ (field_idx, new_val, Thread*)
add sp, #16 @ release out args
@@ -1185,12 +1180,12 @@
.macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name, isInitialized
ENTRY \c_name
// Fast path rosalloc allocation.
- // r0: type/return value, r9: Thread::Current
+ // r0: type/return value, rSELF (r9): Thread::Current
// r1, r2, r3, r12: free.
- ldr r3, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET] // Check if the thread local
+ ldr r3, [rSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET] // Check if the thread local
// allocation stack has room.
// TODO: consider using ldrd.
- ldr r12, [r9, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
+ ldr r12, [rSELF, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
cmp r3, r12
bhs .Lslow_path\c_name
@@ -1208,7 +1203,7 @@
// from the size. Since the size is
// already aligned we can combine the
// two shifts together.
- add r12, r9, r3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
+ add r12, rSELF, r3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
// Subtract pointer size since there
// are no runs for 0 byte allocations
// and the size is already aligned.
@@ -1236,9 +1231,9 @@
// local allocation stack and
// increment the thread local
// allocation stack top.
- ldr r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+ ldr r1, [rSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
str r3, [r1], #COMPRESSED_REFERENCE_SIZE // (Increment r1 as a side effect.)
- str r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+ str r1, [rSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
// Decrement the size of the free list
// After this "STR" the object is published to the thread local allocation stack,
@@ -1287,7 +1282,7 @@
.Lslow_path\c_name:
SETUP_SAVE_REFS_ONLY_FRAME r2 @ save callee saves in case of GC
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
bl \cxx_name @ (mirror::Class* cls, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -1301,7 +1296,7 @@
// The common fast path code for art_quick_alloc_object_resolved/initialized_tlab
// and art_quick_alloc_object_resolved/initialized_region_tlab.
//
-// r0: type r9: Thread::Current, r1, r2, r3, r12: free.
+// r0: type, rSELF (r9): Thread::Current, r1, r2, r3, r12: free.
// Need to preserve r0 to the slow path.
//
// If isInitialized=1 then the compiler assumes the object's class has already been initialized.
@@ -1313,7 +1308,7 @@
#if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
#error "Thread::thread_local_pos/end must be consecutive and are 8 byte aligned for performance"
#endif
- ldrd r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
+ ldrd r12, r3, [rSELF, #THREAD_LOCAL_POS_OFFSET]
sub r12, r3, r12 // Compute the remaining buf size.
ldr r3, [r0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (r3).
cmp r3, r12 // Check if it fits.
@@ -1326,9 +1321,9 @@
// "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
// Reload old thread_local_pos (r0)
// for the return value.
- ldr r2, [r9, #THREAD_LOCAL_POS_OFFSET]
+ ldr r2, [rSELF, #THREAD_LOCAL_POS_OFFSET]
add r1, r2, r3
- str r1, [r9, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
+ str r1, [rSELF, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
// After this "STR" the object is published to the thread local allocation stack,
// and it will be observable from a runtime internal (eg. Heap::VisitObjects) point of view.
// It is not yet visible to the running (user) compiled code until after the return.
@@ -1346,9 +1341,9 @@
//
// (Note: The actual check is done by checking that the object's class pointer is non-null.
// Also, unlike rosalloc, the object can never be observed as null).
- ldr r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
+ ldr r1, [rSELF, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
add r1, r1, #1
- str r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
+ str r1, [rSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
POISON_HEAP_REF r0
str r0, [r2, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer.
// Fence. This is "ish" not "ishst" so
@@ -1375,12 +1370,12 @@
.macro GENERATE_ALLOC_OBJECT_RESOLVED_TLAB name, entrypoint, isInitialized
ENTRY \name
// Fast path tlab allocation.
- // r0: type, r9: Thread::Current
+ // r0: type, rSELF (r9): Thread::Current
// r1, r2, r3, r12: free.
ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path\name, \isInitialized
.Lslow_path\name:
SETUP_SAVE_REFS_ONLY_FRAME r2 // Save callee saves in case of GC.
- mov r1, r9 // Pass Thread::Current.
+ mov r1, rSELF // Pass Thread::Current.
bl \entrypoint // (mirror::Class* klass, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -1397,7 +1392,7 @@
// The common fast path code for art_quick_alloc_array_resolved/initialized_tlab
// and art_quick_alloc_array_resolved/initialized_region_tlab.
//
-// r0: type r1: component_count r2: total_size r9: Thread::Current, r3, r12: free.
+// r0: type, r1: component_count, r2: total_size, rSELF (r9): Thread::Current, r3, r12: free.
// Need to preserve r0 and r1 to the slow path.
.macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel
and r2, r2, #OBJECT_ALIGNMENT_MASK_TOGGLED // Apply alignment mask
@@ -1409,7 +1404,7 @@
#if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
#error "Thread::thread_local_pos/end must be consecutive and are 8 byte aligned for performance"
#endif
- ldrd r3, r12, [r9, #THREAD_LOCAL_POS_OFFSET]
+ ldrd r3, r12, [rSELF, #THREAD_LOCAL_POS_OFFSET]
sub r12, r12, r3 // Compute the remaining buf size.
cmp r2, r12 // Check if the total_size fits.
// The array class is always initialized here. Unlike new-instance,
@@ -1417,10 +1412,10 @@
bhi \slowPathLabel
// "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
add r2, r2, r3
- str r2, [r9, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
- ldr r2, [r9, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
+ str r2, [rSELF, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
+ ldr r2, [rSELF, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
add r2, r2, #1
- str r2, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
+ str r2, [rSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
POISON_HEAP_REF r0
str r0, [r3, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer.
str r1, [r3, #MIRROR_ARRAY_LENGTH_OFFSET] // Store the array length.
@@ -1443,7 +1438,7 @@
// Fast path array allocation for region tlab allocation.
// r0: mirror::Class* type
// r1: int32_t component_count
- // r9: thread
+ // rSELF (r9): thread
// r2, r3, r12: free.
\size_setup .Lslow_path\name
ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\name
@@ -1452,7 +1447,7 @@
// r1: int32_t component_count
// r2: Thread* self
SETUP_SAVE_REFS_ONLY_FRAME r2 // save callee saves in case of GC
- mov r2, r9 // pass Thread::Current
+ mov r2, rSELF // pass Thread::Current
bl \entrypoint
RESTORE_SAVE_REFS_ONLY_FRAME
REFRESH_MARKING_REGISTER
@@ -1575,10 +1570,10 @@
.extern artQuickProxyInvokeHandler
ENTRY art_quick_proxy_invoke_handler
SETUP_SAVE_REFS_AND_ARGS_FRAME_WITH_METHOD_IN_R0
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
mov r3, sp @ pass SP
blx artQuickProxyInvokeHandler @ (Method* proxy method, receiver, Thread*, SP)
- ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
+ ldr r2, [rSELF, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
// Tear down the callee-save frame. Skip arg registers.
add sp, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
.cfi_adjust_cfa_offset -(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
@@ -1706,7 +1701,7 @@
.extern artQuickResolutionTrampoline
ENTRY art_quick_resolution_trampoline
SETUP_SAVE_REFS_AND_ARGS_FRAME r2
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
mov r3, sp @ pass SP
blx artQuickResolutionTrampoline @ (Method* called, receiver, Thread*, SP)
cbz r0, 1f @ is code pointer null? goto exception
@@ -1780,10 +1775,10 @@
blx artQuickGenericJniEndTrampoline
// Restore self pointer.
- mov r9, r11
+ mov rSELF, r11
// Pending exceptions possible.
- ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
+ ldr r2, [rSELF, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
cbnz r2, .Lexception_in_native
// Tear down the alloca.
@@ -1804,7 +1799,7 @@
.cfi_adjust_cfa_offset FRAME_SIZE_SAVE_REFS_AND_ARGS-FRAME_SIZE_SAVE_REFS_ONLY
.Lexception_in_native:
- ldr ip, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]
+ ldr ip, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
add ip, ip, #-1 // Remove the GenericJNI tag. ADD/SUB writing directly to SP is UNPREDICTABLE.
mov sp, ip
.cfi_def_cfa_register sp
@@ -1815,10 +1810,10 @@
.extern artQuickToInterpreterBridge
ENTRY art_quick_to_interpreter_bridge
SETUP_SAVE_REFS_AND_ARGS_FRAME r1
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
mov r2, sp @ pass SP
blx artQuickToInterpreterBridge @ (Method* method, Thread*, SP)
- ldr r2, [r9, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
+ ldr r2, [rSELF, #THREAD_EXCEPTION_OFFSET] @ load Thread::Current()->exception_
// Tear down the callee-save frame. Skip arg registers.
add sp, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
.cfi_adjust_cfa_offset -(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
@@ -1846,7 +1841,7 @@
SETUP_SAVE_REFS_AND_ARGS_FRAME r2
@ preserve r0 (not normally an arg) knowing there is a spare slot in kSaveRefsAndArgs.
str r0, [sp, #4]
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
mov r3, sp @ pass SP
blx artInstrumentationMethodEntryFromCode @ (Method*, Object*, Thread*, SP)
cbz r0, .Ldeliver_instrumentation_entry_exception
@@ -1872,7 +1867,7 @@
add r3, sp, #8 @ store fpr_res pointer, in kSaveEverything frame
add r2, sp, #136 @ store gpr_res pointer, in kSaveEverything frame
mov r1, sp @ pass SP
- mov r0, r9 @ pass Thread::Current
+ mov r0, rSELF @ pass Thread::Current
blx artInstrumentationMethodExitFromCode @ (Thread*, SP, gpr_res*, fpr_res*)
cbz r0, .Ldo_deliver_instrumentation_exception
@@ -1901,7 +1896,7 @@
.extern artDeoptimize
ENTRY art_quick_deoptimize
SETUP_SAVE_EVERYTHING_FRAME r0
- mov r0, r9 @ pass Thread::Current
+ mov r0, rSELF @ pass Thread::Current
blx artDeoptimize @ (Thread*)
END art_quick_deoptimize
@@ -1912,7 +1907,7 @@
.extern artDeoptimizeFromCompiledCode
ENTRY art_quick_deoptimize_from_compiled_code
SETUP_SAVE_EVERYTHING_FRAME r1
- mov r1, r9 @ pass Thread::Current
+ mov r1, rSELF @ pass Thread::Current
blx artDeoptimizeFromCompiledCode @ (DeoptimizationKind, Thread*)
END art_quick_deoptimize_from_compiled_code
@@ -2691,7 +2686,7 @@
.extern artInvokePolymorphic
ENTRY art_quick_invoke_polymorphic
SETUP_SAVE_REFS_AND_ARGS_FRAME r2
- mov r2, r9 @ pass Thread::Current
+ mov r2, rSELF @ pass Thread::Current
mov r3, sp @ pass SP
mov r0, #0 @ initialize 64-bit JValue as zero.
str r0, [sp, #-4]!
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index ac5b2b8..14d0cc7 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1151,45 +1151,36 @@
*/
.extern artLockObjectFromCode
ENTRY art_quick_lock_object
- cbz w0, .Lslow_lock
- add x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET // exclusive load/store has no immediate anymore
+ ldr w1, [xSELF, #THREAD_ID_OFFSET]
+ cbz w0, art_quick_lock_object_no_inline
+ // Exclusive load/store has no immediate anymore.
+ add x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET
.Lretry_lock:
- ldr w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
- ldaxr w1, [x4] // acquire needed only in most common case
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
- cbnz w3, .Lnot_unlocked // already thin locked
- // unlocked case - x1: original lock word that's zero except for the read barrier bits.
- orr x2, x1, x2 // x2 holds thread id with count of 0 with preserved read barrier bits
- stxr w3, w2, [x4]
- cbnz w3, .Llock_stxr_fail // store failed, retry
+ ldaxr w2, [x4] // Acquire needed only in most common case.
+ eor w3, w2, w1 // Prepare the value to store if unlocked
+ // (thread id, count of 0 and preserved read barrier bits),
+ // or prepare to compare thread id for recursive lock check
+ // (lock_word.ThreadId() ^ self->ThreadId()).
+ tst w2, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // Test the non-gc bits.
+ b.ne .Lnot_unlocked // Check if unlocked.
+ // unlocked case - store w3: original lock word plus thread id, preserved read barrier bits.
+ stxr w2, w3, [x4]
+ cbnz w2, .Lretry_lock // If the store failed, retry.
ret
-.Lnot_unlocked: // x1: original lock word
- lsr w3, w1, LOCK_WORD_STATE_SHIFT
- cbnz w3, .Lslow_lock // if either of the top two bits are set, go slow path
- eor w2, w1, w2 // lock_word.ThreadId() ^ self->ThreadId()
- uxth w2, w2 // zero top 16 bits
- cbnz w2, .Lslow_lock // lock word and self thread id's match -> recursive lock
- // else contention, go to slow path
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits.
- add w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE // increment count in lock word placing in w2 to check overflow
- lsr w3, w2, #LOCK_WORD_GC_STATE_SHIFT // if the first gc state bit is set, we overflowed.
- cbnz w3, .Lslow_lock // if we overflow the count go slow path
- add w2, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE // increment count for real
- stxr w3, w2, [x4]
- cbnz w3, .Llock_stxr_fail // store failed, retry
+.Lnot_unlocked: // w2: original lock word, w1: thread id, w3: w2 ^ w1
+ // Check lock word state and thread id together,
+ tst w3, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
+ b.ne art_quick_lock_object_no_inline
+ add w3, w2, #LOCK_WORD_THIN_LOCK_COUNT_ONE // Increment the recursive lock count.
+ tst w3, #LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED // Test the new thin lock count.
+ b.eq art_quick_lock_object_no_inline // Zero as the new count indicates overflow, go slow path.
+ stxr w2, w3, [x4]
+ cbnz w2, .Lretry_lock // If the store failed, retry.
ret
-.Llock_stxr_fail:
- b .Lretry_lock // retry
-.Lslow_lock:
- SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case we block
- mov x1, xSELF // pass Thread::Current
- bl artLockObjectFromCode // (Object* obj, Thread*)
- RESTORE_SAVE_REFS_ONLY_FRAME
- REFRESH_MARKING_REGISTER
- RETURN_IF_W0_IS_ZERO_OR_DELIVER
END art_quick_lock_object
ENTRY art_quick_lock_object_no_inline
+ // This is also the slow path for art_quick_lock_object.
SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case we block
mov x1, xSELF // pass Thread::Current
bl artLockObjectFromCode // (Object* obj, Thread*)
@@ -1206,54 +1197,46 @@
*/
.extern artUnlockObjectFromCode
ENTRY art_quick_unlock_object
- cbz x0, .Lslow_unlock
- add x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET // exclusive load/store has no immediate anymore
+ ldr w1, [xSELF, #THREAD_ID_OFFSET]
+ cbz x0, art_quick_unlock_object_no_inline
+ // Exclusive load/store has no immediate anymore.
+ add x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET
.Lretry_unlock:
#ifndef USE_READ_BARRIER
- ldr w1, [x4]
+ ldr w2, [x4]
#else
- ldxr w1, [x4] // Need to use atomic instructions for read barrier
+ ldxr w2, [x4] // Need to use atomic instructions for read barrier.
#endif
- lsr w2, w1, LOCK_WORD_STATE_SHIFT
- cbnz w2, .Lslow_unlock // if either of the top two bits are set, go slow path
- ldr w2, [xSELF, #THREAD_ID_OFFSET]
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
- eor w3, w3, w2 // lock_word.ThreadId() ^ self->ThreadId()
- uxth w3, w3 // zero top 16 bits
- cbnz w3, .Lslow_unlock // do lock word and self thread id's match?
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
- cmp w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
- bpl .Lrecursive_thin_unlock
- // transition to unlocked
- and w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED // w3: zero except for the preserved read barrier bits
+ eor w3, w2, w1 // Prepare the value to store if simply locked
+ // (mostly 0s, and preserved read barrier bits),
+ // or prepare to compare thread id for recursive lock check
+ // (lock_word.ThreadId() ^ self->ThreadId()).
+ tst w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // Test the non-gc bits.
+ b.ne .Lnot_simply_locked // Locked recursively or by other thread?
+ // Transition to unlocked.
#ifndef USE_READ_BARRIER
stlr w3, [x4]
#else
- stlxr w2, w3, [x4] // Need to use atomic instructions for read barrier
- cbnz w2, .Lunlock_stxr_fail // store failed, retry
+ stlxr w2, w3, [x4] // Need to use atomic instructions for read barrier.
+ cbnz w2, .Lretry_unlock // If the store failed, retry.
#endif
ret
-.Lrecursive_thin_unlock: // w1: original lock word
- sub w1, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE // decrement count
+.Lnot_simply_locked:
+ // Check lock word state and thread id together; branch to the slow path on mismatch.
+ tst w3, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
+ b.ne art_quick_unlock_object_no_inline
+ sub w3, w2, #LOCK_WORD_THIN_LOCK_COUNT_ONE // decrement count
#ifndef USE_READ_BARRIER
- str w1, [x4]
+ str w3, [x4]
#else
- stxr w2, w1, [x4] // Need to use atomic instructions for read barrier
- cbnz w2, .Lunlock_stxr_fail // store failed, retry
+ stxr w2, w3, [x4] // Need to use atomic instructions for read barrier.
+ cbnz w2, .Lretry_unlock // If the store failed, retry.
#endif
ret
-.Lunlock_stxr_fail:
- b .Lretry_unlock // retry
-.Lslow_unlock:
- SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case exception allocation triggers GC
- mov x1, xSELF // pass Thread::Current
- bl artUnlockObjectFromCode // (Object* obj, Thread*)
- RESTORE_SAVE_REFS_ONLY_FRAME
- REFRESH_MARKING_REGISTER
- RETURN_IF_W0_IS_ZERO_OR_DELIVER
END art_quick_unlock_object
ENTRY art_quick_unlock_object_no_inline
+ // This is also the slow path for art_quick_unlock_object.
SETUP_SAVE_REFS_ONLY_FRAME // save callee saves in case exception allocation triggers GC
mov x1, xSELF // pass Thread::Current
bl artUnlockObjectFromCode // (Object* obj, Thread*)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 8ab4ce1..b89d45f 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1292,7 +1292,7 @@
jz .Lslow_lock
.Lretry_lock:
movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx // ecx := lock word
- test LITERAL(LOCK_WORD_STATE_MASK), %ecx // test the 2 high bits.
+ test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx // test the 2 high bits.
jne .Lslow_lock // slow path if either of the two high bits are set.
movl %ecx, %edx // save lock word (edx) to keep read barrier bits.
andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx // zero the gc bits.
@@ -1362,7 +1362,7 @@
.Lretry_unlock:
movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx // ecx := lock word
movl %fs:THREAD_ID_OFFSET, %edx // edx := thread id
- test LITERAL(LOCK_WORD_STATE_MASK), %ecx
+ test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx
jnz .Lslow_unlock // lock word contains a monitor
cmpw %cx, %dx // does the thread id match?
jne .Lslow_unlock
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index eb945ed..c179033 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1312,7 +1312,7 @@
jz .Lslow_lock
.Lretry_lock:
movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx // ecx := lock word.
- test LITERAL(LOCK_WORD_STATE_MASK), %ecx // Test the 2 high bits.
+ test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx // Test the 2 high bits.
jne .Lslow_lock // Slow path if either of the two high bits are set.
movl %ecx, %edx // save lock word (edx) to keep read barrier bits.
andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx // zero the gc bits.
@@ -1362,7 +1362,7 @@
.Lretry_unlock:
movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx // ecx := lock word
movl %gs:THREAD_ID_OFFSET, %edx // edx := thread id
- test LITERAL(LOCK_WORD_STATE_MASK), %ecx
+ test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx
jnz .Lslow_unlock // lock word contains a monitor
cmpw %cx, %dx // does the thread id match?
jne .Lslow_unlock
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index 46630db..464c2b7 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -90,16 +90,24 @@
DEFINE_CHECK_EQ(static_cast<size_t>(MIN_LARGE_OBJECT_THRESHOLD), (static_cast<size_t>(art::gc::Heap::kMinLargeObjectThreshold)))
#define LOCK_WORD_STATE_SHIFT 30
DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kStateShift)))
-#define LOCK_WORD_STATE_MASK 0xc0000000
-DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_MASK), (static_cast<uint32_t>(art::LockWord::kStateMaskShifted)))
+#define LOCK_WORD_STATE_MASK_SHIFTED 0xc0000000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kStateMaskShifted)))
#define LOCK_WORD_READ_BARRIER_STATE_SHIFT 28
DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_READ_BARRIER_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kReadBarrierStateShift)))
#define LOCK_WORD_READ_BARRIER_STATE_MASK 0x10000000
DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShifted)))
#define LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED 0xefffffff
DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShiftedToggled)))
-#define LOCK_WORD_THIN_LOCK_COUNT_ONE 65536
-DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_ONE), (static_cast<int32_t>(art::LockWord::kThinLockCountOne)))
+#define LOCK_WORD_THIN_LOCK_COUNT_SIZE 12
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_SIZE), (static_cast<int32_t>(art::LockWord::kThinLockCountSize)))
+#define LOCK_WORD_THIN_LOCK_COUNT_SHIFT 16
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_SHIFT), (static_cast<int32_t>(art::LockWord::kThinLockCountShift)))
+#define LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED 0xfff0000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kThinLockCountMaskShifted)))
+#define LOCK_WORD_THIN_LOCK_COUNT_ONE 0x10000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_THIN_LOCK_COUNT_ONE), (static_cast<uint32_t>(art::LockWord::kThinLockCountOne)))
+#define LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED 0xffff
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kThinLockOwnerMaskShifted)))
#define LOCK_WORD_STATE_FORWARDING_ADDRESS 0x3
DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_FORWARDING_ADDRESS), (static_cast<uint32_t>(art::LockWord::kStateForwardingAddress)))
#define LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW 0x40000000
@@ -110,6 +118,8 @@
DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShifted)))
#define LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED 0xcfffffff
DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShiftedToggled)))
+#define LOCK_WORD_GC_STATE_SIZE 2
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_GC_STATE_SIZE), (static_cast<int32_t>(art::LockWord::kGCStateSize)))
#define LOCK_WORD_GC_STATE_SHIFT 28
DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_GC_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kGCStateShift)))
#define LOCK_WORD_MARK_BIT_SHIFT 29
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 09d856f..ce7fe34 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -75,16 +75,18 @@
// Remaining bits are the recursive lock count.
kThinLockCountSize = 32 - kThinLockOwnerSize - kStateSize - kReadBarrierStateSize -
kMarkBitStateSize,
- // Thin lock bits. Owner in lowest bits.
+ // Thin lock bits. Owner in lowest bits.
kThinLockOwnerShift = 0,
kThinLockOwnerMask = (1 << kThinLockOwnerSize) - 1,
+ kThinLockOwnerMaskShifted = kThinLockOwnerMask << kThinLockOwnerShift,
kThinLockMaxOwner = kThinLockOwnerMask,
// Count in higher bits.
kThinLockCountShift = kThinLockOwnerSize + kThinLockOwnerShift,
kThinLockCountMask = (1 << kThinLockCountSize) - 1,
kThinLockMaxCount = kThinLockCountMask,
kThinLockCountOne = 1 << kThinLockCountShift, // == 65536 (0x10000)
+ kThinLockCountMaskShifted = kThinLockCountMask << kThinLockCountShift,
// State in the highest bits.
kStateShift = kReadBarrierStateSize + kThinLockCountSize + kThinLockCountShift +
diff --git a/tools/cpp-define-generator/constant_lockword.def b/tools/cpp-define-generator/constant_lockword.def
index 08d5885..977d1ca 100644
--- a/tools/cpp-define-generator/constant_lockword.def
+++ b/tools/cpp-define-generator/constant_lockword.def
@@ -23,23 +23,29 @@
#define DEFINE_LOCK_WORD_EXPR(macro_name, type, constant_field_name) \
DEFINE_EXPR(LOCK_WORD_ ## macro_name, type, art::LockWord::constant_field_name)
+// FIXME: The naming is inconsistent; the `Shifted` -> `_SHIFTED` suffix is sometimes missing.
DEFINE_LOCK_WORD_EXPR(STATE_SHIFT, int32_t, kStateShift)
-DEFINE_LOCK_WORD_EXPR(STATE_MASK, uint32_t, kStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(STATE_MASK_SHIFTED, uint32_t, kStateMaskShifted)
DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_SHIFT, int32_t, kReadBarrierStateShift)
-DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK, uint32_t, kReadBarrierStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK, uint32_t, kReadBarrierStateMaskShifted)
DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK_TOGGLED, uint32_t, kReadBarrierStateMaskShiftedToggled)
-DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_ONE, int32_t, kThinLockCountOne)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_SIZE, int32_t, kThinLockCountSize)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_SHIFT, int32_t, kThinLockCountShift)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_MASK_SHIFTED, uint32_t, kThinLockCountMaskShifted)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_ONE, uint32_t, kThinLockCountOne)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_OWNER_MASK_SHIFTED, uint32_t, kThinLockOwnerMaskShifted)
-DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS, uint32_t, kStateForwardingAddress)
+DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS, uint32_t, kStateForwardingAddress)
DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS_OVERFLOW, uint32_t, kStateForwardingAddressOverflow)
DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS_SHIFT, uint32_t, kForwardingAddressShift)
-DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED, uint32_t, kGCStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED, uint32_t, kGCStateMaskShifted)
DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED_TOGGLED, uint32_t, kGCStateMaskShiftedToggled)
-DEFINE_LOCK_WORD_EXPR(GC_STATE_SHIFT, int32_t, kGCStateShift)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_SIZE, int32_t, kGCStateSize)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_SHIFT, int32_t, kGCStateShift)
-DEFINE_LOCK_WORD_EXPR(MARK_BIT_SHIFT, int32_t, kMarkBitStateShift)
-DEFINE_LOCK_WORD_EXPR(MARK_BIT_MASK_SHIFTED, uint32_t, kMarkBitStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(MARK_BIT_SHIFT, int32_t, kMarkBitStateShift)
+DEFINE_LOCK_WORD_EXPR(MARK_BIT_MASK_SHIFTED, uint32_t, kMarkBitStateMaskShifted)
#undef DEFINE_LOCK_WORD_EXPR