ARM/ARM64: Improve lock/unlock entrypoints.

Do the same in fewer instructions.
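
Load the thread id before the null check and use a single EOR to both
prepare the new lock word for the unlocked case and compare the owner
thread id for the recursive case. Roughly, the new lock fast path
behaves like the C-like sketch below (illustrative only; the
LOCK_WORD_* names are the generated LockWord constants, while
LoadLockWord/StoreLockWord/SlowPath/self_thread_id are placeholder
names, and the ldrex/strex retry loop and memory barriers are
omitted):

  uint32_t lw = LoadLockWord(obj);           // ldrex / ldaxr
  uint32_t xorred = lw ^ self_thread_id;     // one eor serves both uses
  if ((lw & LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED) == 0u) {
    // Unlocked: xorred is already the new lock word
    // (owner = self, count = 0, read barrier bits preserved).
    StoreLockWord(obj, xorred);
  } else if ((xorred & (LOCK_WORD_STATE_MASK_SHIFTED |
                        LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)) != 0u) {
    SlowPath(obj);  // Inflated/forwarding state or owned by another thread.
  } else {
    uint32_t new_lw = lw + LOCK_WORD_THIN_LOCK_COUNT_ONE;  // Recursive lock.
    if ((new_lw & LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED) == 0u) {
      SlowPath(obj);  // Thin lock count overflow.
    } else {
      StoreLockWord(obj, new_lw);
    }
  }

The unlock fast path uses the same EOR trick: if the non-gc bits of
lw ^ self_thread_id are zero, the object is thin-locked by the current
thread with a count of 0, and xorred is the unlocked lock word to
store.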

Test: Pixel 2 XL boots.
Test: testrunner.py --target --optimizing
Change-Id: I8003481116fd3dc6a1559b84fdc776b92dba0c68
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index cd00125..311e838 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -55,7 +55,7 @@
     @ Load kSaveAllCalleeSaves Method* into rTemp.
     ldr \rTemp, [\rTemp, #RUNTIME_SAVE_ALL_CALLEE_SAVES_METHOD_OFFSET]
     str \rTemp, [sp, #0]                          @ Place Method* at bottom of stack.
-    str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
+    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
 
      // Ugly compile-time check, but we only have the preprocessor.
 #if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVES != 36 + 64 + 12)
@@ -86,7 +86,7 @@
     @ Load kSaveRefsOnly Method* into rTemp.
     ldr \rTemp, [\rTemp, #RUNTIME_SAVE_REFS_ONLY_METHOD_OFFSET]
     str \rTemp, [sp, #0]                          @ Place Method* at bottom of stack.
-    str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
+    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
 
     // Ugly compile-time check, but we only have the preprocessor.
 #if (FRAME_SIZE_SAVE_REFS_ONLY != 28 + 4)
@@ -147,13 +147,13 @@
     @ Load kSaveRefsAndArgs Method* into rTemp.
     ldr \rTemp, [\rTemp, #RUNTIME_SAVE_REFS_AND_ARGS_METHOD_OFFSET]
     str \rTemp, [sp, #0]                          @ Place Method* at bottom of stack.
-    str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
+    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
 .endm
 
 .macro SETUP_SAVE_REFS_AND_ARGS_FRAME_WITH_METHOD_IN_R0
     SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
     str r0, [sp, #0]                              @ Store ArtMethod* to bottom of stack.
-    str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
+    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
 .endm
 
 .macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
@@ -193,7 +193,7 @@
     @ Load kSaveEverything Method* into rTemp.
     ldr \rTemp, [\rTemp, #\runtime_method_offset]
     str \rTemp, [sp, #0]                @ Place Method* at bottom of stack.
-    str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
+    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
 
     // Ugly compile-time check, but we only have the preprocessor.
 #if (FRAME_SIZE_SAVE_EVERYTHING != 56 + 128 + 8)
@@ -301,7 +301,7 @@
      * exception is Thread::Current()->exception_ when the runtime method frame is ready.
      */
 .macro DELIVER_PENDING_EXCEPTION_FRAME_READY
-    mov    r0, r9                              @ pass Thread::Current
+    mov    r0, rSELF                           @ pass Thread::Current
     bl     artDeliverPendingExceptionFromCode  @ artDeliverPendingExceptionFromCode(Thread*)
 .endm
 
@@ -318,7 +318,7 @@
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r0       @ save all registers as basis for long jump context
-    mov r0, r9                      @ pass Thread::Current
+    mov r0, rSELF                   @ pass Thread::Current
     bl  \cxx_name                   @ \cxx_name(Thread*)
 END \c_name
 .endm
@@ -327,7 +327,7 @@
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_EVERYTHING_FRAME r0  @ save all registers as basis for long jump context
-    mov r0, r9                      @ pass Thread::Current
+    mov r0, rSELF                   @ pass Thread::Current
     bl  \cxx_name                   @ \cxx_name(Thread*)
 END \c_name
 .endm
@@ -336,7 +336,7 @@
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r1       @ save all registers as basis for long jump context
-    mov r1, r9                      @ pass Thread::Current
+    mov r1, rSELF                   @ pass Thread::Current
     bl  \cxx_name                   @ \cxx_name(Thread*)
 END \c_name
 .endm
@@ -345,13 +345,13 @@
     .extern \cxx_name
 ENTRY \c_name
     SETUP_SAVE_EVERYTHING_FRAME r2  @ save all registers as basis for long jump context
-    mov r2, r9                      @ pass Thread::Current
+    mov r2, rSELF                   @ pass Thread::Current
     bl  \cxx_name                   @ \cxx_name(Thread*)
 END \c_name
 .endm
 
 .macro  RETURN_OR_DELIVER_PENDING_EXCEPTION_REG reg
-    ldr \reg, [r9, #THREAD_EXCEPTION_OFFSET]   // Get exception field.
+    ldr \reg, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ Get exception field.
     cbnz \reg, 1f
     bx lr
 1:
@@ -377,7 +377,7 @@
     .extern \entrypoint
 ENTRY \name
     SETUP_SAVE_REFS_ONLY_FRAME r1        @ save callee saves in case of GC
-    mov    r1, r9                        @ pass Thread::Current
+    mov    r1, rSELF                     @ pass Thread::Current
     bl     \entrypoint                   @ (uint32_t field_idx, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -389,7 +389,7 @@
     .extern \entrypoint
 ENTRY \name
     SETUP_SAVE_REFS_ONLY_FRAME r2        @ save callee saves in case of GC
-    mov    r2, r9                        @ pass Thread::Current
+    mov    r2, rSELF                     @ pass Thread::Current
     bl     \entrypoint                   @ (field_idx, Object*, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -401,7 +401,7 @@
     .extern \entrypoint
 ENTRY \name
     SETUP_SAVE_REFS_ONLY_FRAME r3        @ save callee saves in case of GC
-    mov    r3, r9                        @ pass Thread::Current
+    mov    r3, rSELF                     @ pass Thread::Current
     bl     \entrypoint                   @ (field_idx, Object*, new_val, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME         @ TODO: we can clearly save an add here
     REFRESH_MARKING_REGISTER
@@ -448,7 +448,7 @@
     @ save all registers as basis for long jump context
     SETUP_SAVE_EVERYTHING_FRAME_CORE_REGS_SAVED r1
     mov r0, lr                      @ pass the fault address stored in LR by the fault handler.
-    mov r1, r9                      @ pass Thread::Current
+    mov r1, rSELF                   @ pass Thread::Current
     bl  artThrowNullPointerExceptionFromSignal  @ (Thread*)
 END art_quick_throw_null_pointer_exception_from_signal
 
@@ -494,7 +494,7 @@
 .macro INVOKE_TRAMPOLINE_BODY cxx_name
     .extern \cxx_name
     SETUP_SAVE_REFS_AND_ARGS_FRAME r2     @ save callee saves in case allocation triggers GC
-    mov    r2, r9                         @ pass Thread::Current
+    mov    r2, rSELF                      @ pass Thread::Current
     mov    r3, sp
     bl     \cxx_name                      @ (method_idx, this, Thread*, SP)
     mov    r12, r1                        @ save Method*->code_
@@ -682,50 +682,48 @@
      */
     .extern artLockObjectFromCode
 ENTRY art_quick_lock_object
+    ldr    r1, [rSELF, #THREAD_ID_OFFSET]
     cbz    r0, .Lslow_lock
 .Lretry_lock:
-    ldr    r2, [r9, #THREAD_ID_OFFSET]
-    ldrex  r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    mov    r3, r1
-    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
-    cbnz   r3, .Lnot_unlocked         @ already thin locked
-    @ unlocked case - r1: original lock word that's zero except for the read barrier bits.
-    orr    r2, r1, r2                 @ r2 holds thread id with count of 0 with preserved read barrier bits
-    strex  r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   r3, .Llock_strex_fail      @ store failed, retry
-    dmb    ish                        @ full (LoadLoad|LoadStore) memory barrier
+    ldrex  r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    eor    r3, r2, r1                 @ Prepare the value to store if unlocked
+                                      @   (thread id, count of 0 and preserved read barrier bits),
+                                      @ or prepare to compare thread id for recursive lock check
+                                      @   (lock_word.ThreadId() ^ self->ThreadId()).
+    ands   ip, r2, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ Test the non-gc bits.
+    bne    .Lnot_unlocked             @ Check if unlocked.
+    @ unlocked case - store r3: original lock word plus thread id, preserved read barrier bits.
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    cbnz   r2, .Llock_strex_fail      @ If store failed, retry.
+    dmb    ish                        @ Full (LoadLoad|LoadStore) memory barrier.
     bx lr
-.Lnot_unlocked:  @ r1: original lock word, r2: thread_id with count of 0 and zero read barrier bits
-    lsr    r3, r1, LOCK_WORD_STATE_SHIFT
-    cbnz   r3, .Lslow_lock            @ if either of the top two bits are set, go slow path
-    eor    r2, r1, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
-    uxth   r2, r2                     @ zero top 16 bits
-    cbnz   r2, .Lslow_lock            @ lock word and self thread id's match -> recursive lock
-                                      @ else contention, go to slow path
-    mov    r3, r1                     @ copy the lock word to check count overflow.
-    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits.
-    add    r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count in lock word placing in r2 to check overflow
-    lsr    r3, r2, #LOCK_WORD_GC_STATE_SHIFT    @ if the first gc state bit is set, we overflowed.
-    cbnz   r3, .Lslow_lock            @ if we overflow the count go slow path
-    add    r2, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count for real
-    strex  r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
-    cbnz   r3, .Llock_strex_fail      @ strex failed, retry
+.Lnot_unlocked:  @ r2: original lock word, r1: thread_id, r3: r2 ^ r1
+#if LOCK_WORD_THIN_LOCK_COUNT_SHIFT + LOCK_WORD_THIN_LOCK_COUNT_SIZE != LOCK_WORD_GC_STATE_SHIFT
+#error "Expecting thin lock count and gc state in consecutive bits."
+#endif
+                                      @ Check lock word state and thread id together,
+    bfc    r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #(LOCK_WORD_THIN_LOCK_COUNT_SIZE + LOCK_WORD_GC_STATE_SIZE)
+    cbnz   r3, .Lslow_lock            @ if either of the top two bits are set, or the lock word's
+                                      @ thread id did not match, go slow path.
+    add    r3, r2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ Increment the recursive lock count.
+                                      @ Extract the new thin lock count for overflow check.
+    ubfx   r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #LOCK_WORD_THIN_LOCK_COUNT_SIZE
+    cbz    r2, .Lslow_lock            @ Zero as the new count indicates overflow, go slow path.
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits.
+    cbnz   r2, .Llock_strex_fail      @ If strex failed, retry.
     bx lr
 .Llock_strex_fail:
     b      .Lretry_lock               @ retry
-.Lslow_lock:
-    SETUP_SAVE_REFS_ONLY_FRAME r1     @ save callee saves in case we block
-    mov    r1, r9                     @ pass Thread::Current
-    bl     artLockObjectFromCode      @ (Object* obj, Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    REFRESH_MARKING_REGISTER
-    RETURN_IF_RESULT_IS_ZERO
-    DELIVER_PENDING_EXCEPTION
+// Note: the slow path is actually art_quick_lock_object_no_inline (tail call).
 END art_quick_lock_object
 
 ENTRY art_quick_lock_object_no_inline
+    // This is also the slow path for art_quick_lock_object. Note that we
+    // need a local label; the assembler complains about the branch target
+    // being out of range if we try to jump to `art_quick_lock_object_no_inline`.
+.Lslow_lock:
     SETUP_SAVE_REFS_ONLY_FRAME r1     @ save callee saves in case we block
-    mov    r1, r9                     @ pass Thread::Current
+    mov    r1, rSELF                  @ pass Thread::Current
     bl     artLockObjectFromCode      @ (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -739,62 +737,59 @@
      */
     .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object
+    ldr    r1, [rSELF, #THREAD_ID_OFFSET]
     cbz    r0, .Lslow_unlock
 .Lretry_unlock:
 #ifndef USE_READ_BARRIER
-    ldr    r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    ldr    r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
 #else
-    ldrex  r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ Need to use atomic instructions for read barrier
+                                      @ Need to use atomic instructions for read barrier.
+    ldrex  r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
 #endif
-    lsr    r2, r1, #LOCK_WORD_STATE_SHIFT
-    cbnz   r2, .Lslow_unlock          @ if either of the top two bits are set, go slow path
-    ldr    r2, [r9, #THREAD_ID_OFFSET]
-    mov    r3, r1                     @ copy lock word to check thread id equality
-    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
-    eor    r3, r3, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
-    uxth   r3, r3                     @ zero top 16 bits
-    cbnz   r3, .Lslow_unlock          @ do lock word and self thread id's match?
-    mov    r3, r1                     @ copy lock word to detect transition to unlocked
-    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
-    cmp    r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
-    bpl    .Lrecursive_thin_unlock
-    @ transition to unlocked
-    mov    r3, r1
-    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED  @ r3: zero except for the preserved gc bits
-    dmb    ish                        @ full (LoadStore|StoreStore) memory barrier
+    eor    r3, r2, r1                 @ Prepare the value to store if simply locked
+                                      @   (mostly 0s, and preserved read barrier bits),
+                                      @ or prepare to compare thread id for recursive lock check
+                                      @   (lock_word.ThreadId() ^ self->ThreadId()).
+    ands   ip, r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ Test the non-gc bits.
+    bne    .Lnot_simply_locked        @ Locked recursively or by other thread?
+    @ Transition to unlocked.
+    dmb    ish                        @ Full (LoadStore|StoreStore) memory barrier.
 #ifndef USE_READ_BARRIER
     str    r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
 #else
     strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits
-    cbnz   r2, .Lunlock_strex_fail    @ store failed, retry
+    cbnz   r2, .Lunlock_strex_fail    @ If the store failed, retry.
 #endif
     bx     lr
-.Lrecursive_thin_unlock:  @ r1: original lock word
-    sub    r1, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ decrement count
+.Lnot_simply_locked:  @ r2: original lock word, r1: thread_id, r3: r2 ^ r1
+#if LOCK_WORD_THIN_LOCK_COUNT_SHIFT + LOCK_WORD_THIN_LOCK_COUNT_SIZE != LOCK_WORD_GC_STATE_SHIFT
+#error "Expecting thin lock count and gc state in consecutive bits."
+#endif
+                                      @ Check lock word state and thread id together,
+    bfc    r3, #LOCK_WORD_THIN_LOCK_COUNT_SHIFT, #(LOCK_WORD_THIN_LOCK_COUNT_SIZE + LOCK_WORD_GC_STATE_SIZE)
+    cbnz   r3, .Lslow_unlock          @ if either of the top two bits are set, or the lock word's
+                                      @ thread id did not match, go slow path.
+    sub    r3, r2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ Decrement recursive lock count.
 #ifndef USE_READ_BARRIER
-    str    r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    str    r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
 #else
-    strex  r2, r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits
-    cbnz   r2, .Lunlock_strex_fail    @ store failed, retry
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits.
+    cbnz   r2, .Lunlock_strex_fail    @ If the store failed, retry.
 #endif
     bx     lr
 .Lunlock_strex_fail:
     b      .Lretry_unlock             @ retry
-.Lslow_unlock:
-    @ save callee saves in case exception allocation triggers GC
-    SETUP_SAVE_REFS_ONLY_FRAME r1
-    mov    r1, r9                     @ pass Thread::Current
-    bl     artUnlockObjectFromCode    @ (Object* obj, Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    REFRESH_MARKING_REGISTER
-    RETURN_IF_RESULT_IS_ZERO
-    DELIVER_PENDING_EXCEPTION
+// Note: the slow path is actually art_quick_unlock_object_no_inline (tail call).
 END art_quick_unlock_object
 
 ENTRY art_quick_unlock_object_no_inline
+    // This is also the slow path for art_quick_unlock_object. Note that we
+    // need a local label; the assembler complains about the branch target
+    // being out of range if we try to jump to `art_quick_unlock_object_no_inline`.
+.Lslow_unlock:
     @ save callee saves in case exception allocation triggers GC
     SETUP_SAVE_REFS_ONLY_FRAME r1
-    mov    r1, r9                     @ pass Thread::Current
+    mov    r1, rSELF                  @ pass Thread::Current
     bl     artUnlockObjectFromCode    @ (Object* obj, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -832,7 +827,7 @@
 
 .Lthrow_class_cast_exception_for_bitstring_check:
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r2       @ save all registers as basis for long jump context
-    mov r2, r9                      @ pass Thread::Current
+    mov r2, rSELF                   @ pass Thread::Current
     bl  artThrowClassCastExceptionForObject  @ (Object*, Class*, Thread*)
     bkpt
 END art_quick_check_instance_of
@@ -917,7 +912,7 @@
     add r3, r0, #MIRROR_OBJECT_ARRAY_DATA_OFFSET
     POISON_HEAP_REF r2
     str r2, [r3, r1, lsl #2]
-    ldr r3, [r9, #THREAD_CARD_TABLE_OFFSET]
+    ldr r3, [rSELF, #THREAD_CARD_TABLE_OFFSET]
     lsr r0, r0, #CARD_TABLE_CARD_SHIFT
     strb r3, [r3, r0]
     blx lr
@@ -945,7 +940,7 @@
     add r3, r0, #MIRROR_OBJECT_ARRAY_DATA_OFFSET
     POISON_HEAP_REF r2
     str r2, [r3, r1, lsl #2]
-    ldr r3, [r9, #THREAD_CARD_TABLE_OFFSET]
+    ldr r3, [rSELF, #THREAD_CARD_TABLE_OFFSET]
     lsr r0, r0, #CARD_TABLE_CARD_SHIFT
     strb r3, [r3, r0]
     blx lr
@@ -954,7 +949,7 @@
     /* No need to repeat restore cfi directives, the ones above apply here. */
     SETUP_SAVE_ALL_CALLEE_SAVES_FRAME r3
     mov r1, r2
-    mov r2, r9                     @ pass Thread::Current
+    mov r2, rSELF                  @ pass Thread::Current
     bl artThrowArrayStoreException @ (Class*, Class*, Thread*)
     bkpt                           @ unreached
 END art_quick_aput_obj
@@ -964,7 +959,7 @@
     .extern \entrypoint
 ENTRY \name
     SETUP_SAVE_REFS_ONLY_FRAME r1     @ save callee saves in case of GC
-    mov    r1, r9                     @ pass Thread::Current
+    mov    r1, rSELF                  @ pass Thread::Current
     bl     \entrypoint     @ (uint32_t type_idx, Method* method, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -977,7 +972,7 @@
     .extern \entrypoint
 ENTRY \name
     SETUP_SAVE_REFS_ONLY_FRAME r2     @ save callee saves in case of GC
-    mov    r2, r9                     @ pass Thread::Current
+    mov    r2, rSELF                  @ pass Thread::Current
     bl     \entrypoint     @ (uint32_t type_idx, Method* method, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -990,7 +985,7 @@
     .extern \entrypoint
 ENTRY \name
     SETUP_SAVE_REFS_ONLY_FRAME r3     @ save callee saves in case of GC
-    mov    r3, r9                     @ pass Thread::Current
+    mov    r3, rSELF                  @ pass Thread::Current
     @ (uint32_t type_idx, Method* method, int32_t component_count, Thread*)
     bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
@@ -1004,7 +999,7 @@
     .extern \entrypoint
 ENTRY \name
     SETUP_SAVE_REFS_ONLY_FRAME r12    @ save callee saves in case of GC
-    str    r9, [sp, #-16]!            @ expand the frame and pass Thread::Current
+    str    rSELF, [sp, #-16]!         @ expand the frame and pass Thread::Current
     .cfi_adjust_cfa_offset 16
     bl     \entrypoint
     add    sp, #16                    @ strip the extra frame
@@ -1023,7 +1018,7 @@
     .extern \entrypoint
 ENTRY \name
     SETUP_SAVE_EVERYTHING_FRAME r1, \runtime_method_offset    @ save everything in case of GC
-    mov    r1, r9                     @ pass Thread::Current
+    mov    r1, rSELF                  @ pass Thread::Current
     bl     \entrypoint                @ (uint32_t index, Thread*)
     cbz    r0, 1f                     @ If result is null, deliver the OOME.
     .cfi_remember_state
@@ -1065,9 +1060,9 @@
     .extern artGet64StaticFromCompiledCode
 ENTRY art_quick_get64_static
     SETUP_SAVE_REFS_ONLY_FRAME r2        @ save callee saves in case of GC
-    mov    r1, r9                        @ pass Thread::Current
-    bl     artGet64StaticFromCompiledCode        @ (uint32_t field_idx, Thread*)
-    ldr    r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
+    mov    r1, rSELF                     @ pass Thread::Current
+    bl     artGet64StaticFromCompiledCode  @ (uint32_t field_idx, Thread*)
+    ldr    r2, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
     cbnz   r2, 1f                        @ success if no exception pending
@@ -1091,9 +1086,9 @@
     .extern artGet64InstanceFromCompiledCode
 ENTRY art_quick_get64_instance
     SETUP_SAVE_REFS_ONLY_FRAME r2        @ save callee saves in case of GC
-    mov    r2, r9                        @ pass Thread::Current
-    bl     artGet64InstanceFromCompiledCode      @ (field_idx, Object*, Thread*)
-    ldr    r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
+    mov    r2, rSELF                     @ pass Thread::Current
+    bl     artGet64InstanceFromCompiledCode  @ (field_idx, Object*, Thread*)
+    ldr    r2, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
     cbnz   r2, 1f                        @ success if no exception pending
@@ -1125,7 +1120,7 @@
 ENTRY art_quick_set64_instance
     SETUP_SAVE_REFS_ONLY_FRAME r12       @ save callee saves in case of GC
                                          @ r2:r3 contain the wide argument
-    str    r9, [sp, #-16]!               @ expand the frame and pass Thread::Current
+    str    rSELF, [sp, #-16]!            @ expand the frame and pass Thread::Current
     .cfi_adjust_cfa_offset 16
     bl     artSet64InstanceFromCompiledCode      @ (field_idx, Object*, new_val, Thread*)
     add    sp, #16                       @ release out args
@@ -1140,7 +1135,7 @@
 ENTRY art_quick_set64_static
     SETUP_SAVE_REFS_ONLY_FRAME r12        @ save callee saves in case of GC
                                           @ r2:r3 contain the wide argument
-    str    r9, [sp, #-16]!                @ expand the frame and pass Thread::Current
+    str    rSELF, [sp, #-16]!             @ expand the frame and pass Thread::Current
     .cfi_adjust_cfa_offset 16
     bl     artSet64StaticFromCompiledCode @ (field_idx, new_val, Thread*)
     add    sp, #16                        @ release out args
@@ -1185,12 +1180,12 @@
 .macro ART_QUICK_ALLOC_OBJECT_ROSALLOC c_name, cxx_name, isInitialized
 ENTRY \c_name
     // Fast path rosalloc allocation.
-    // r0: type/return value, r9: Thread::Current
+    // r0: type/return value, rSELF (r9): Thread::Current
     // r1, r2, r3, r12: free.
-    ldr    r3, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]     // Check if the thread local
+    ldr    r3, [rSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]  // Check if the thread local
                                                               // allocation stack has room.
                                                               // TODO: consider using ldrd.
-    ldr    r12, [r9, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
+    ldr    r12, [rSELF, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
     cmp    r3, r12
     bhs    .Lslow_path\c_name
 
@@ -1208,7 +1203,7 @@
                                                               // from the size. Since the size is
                                                               // already aligned we can combine the
                                                               // two shifts together.
-    add    r12, r9, r3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
+    add    r12, rSELF, r3, lsr #(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT - POINTER_SIZE_SHIFT)
                                                              // Subtract pointer size since there
                                                               // are no runs for 0 byte allocations
                                                               // and the size is already aligned.
@@ -1236,9 +1231,9 @@
                                                               // local allocation stack and
                                                               // increment the thread local
                                                               // allocation stack top.
-    ldr    r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+    ldr    r1, [rSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
     str    r3, [r1], #COMPRESSED_REFERENCE_SIZE               // (Increment r1 as a side effect.)
-    str    r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+    str    r1, [rSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
                                                               // Decrement the size of the free list
 
     // After this "STR" the object is published to the thread local allocation stack,
@@ -1287,7 +1282,7 @@
 
 .Lslow_path\c_name:
     SETUP_SAVE_REFS_ONLY_FRAME r2     @ save callee saves in case of GC
-    mov    r1, r9                     @ pass Thread::Current
+    mov    r1, rSELF                  @ pass Thread::Current
     bl     \cxx_name                  @ (mirror::Class* cls, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -1301,7 +1296,7 @@
 // The common fast path code for art_quick_alloc_object_resolved/initialized_tlab
 // and art_quick_alloc_object_resolved/initialized_region_tlab.
 //
-// r0: type r9: Thread::Current, r1, r2, r3, r12: free.
+// r0: type, rSELF (r9): Thread::Current, r1, r2, r3, r12: free.
 // Need to preserve r0 to the slow path.
 //
 // If isInitialized=1 then the compiler assumes the object's class has already been initialized.
@@ -1313,7 +1308,7 @@
 #if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
 #error "Thread::thread_local_pos/end must be consecutive and are 8 byte aligned for performance"
 #endif
-    ldrd   r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
+    ldrd   r12, r3, [rSELF, #THREAD_LOCAL_POS_OFFSET]
     sub    r12, r3, r12                                       // Compute the remaining buf size.
     ldr    r3, [r0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (r3).
     cmp    r3, r12                                            // Check if it fits.
@@ -1326,9 +1321,9 @@
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
                                                               // Reload old thread_local_pos (r0)
                                                               // for the return value.
-    ldr    r2, [r9, #THREAD_LOCAL_POS_OFFSET]
+    ldr    r2, [rSELF, #THREAD_LOCAL_POS_OFFSET]
     add    r1, r2, r3
-    str    r1, [r9, #THREAD_LOCAL_POS_OFFSET]                 // Store new thread_local_pos.
+    str    r1, [rSELF, #THREAD_LOCAL_POS_OFFSET]              // Store new thread_local_pos.
     // After this "STR" the object is published to the thread local allocation stack,
     // and it will be observable from a runtime internal (eg. Heap::VisitObjects) point of view.
     // It is not yet visible to the running (user) compiled code until after the return.
@@ -1346,9 +1341,9 @@
     //
     // (Note: The actual check is done by checking that the object's class pointer is non-null.
     // Also, unlike rosalloc, the object can never be observed as null).
-    ldr    r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]             // Increment thread_local_objects.
+    ldr    r1, [rSELF, #THREAD_LOCAL_OBJECTS_OFFSET]          // Increment thread_local_objects.
     add    r1, r1, #1
-    str    r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
+    str    r1, [rSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
     POISON_HEAP_REF r0
     str    r0, [r2, #MIRROR_OBJECT_CLASS_OFFSET]              // Store the class pointer.
                                                               // Fence. This is "ish" not "ishst" so
@@ -1375,12 +1370,12 @@
 .macro GENERATE_ALLOC_OBJECT_RESOLVED_TLAB name, entrypoint, isInitialized
 ENTRY \name
     // Fast path tlab allocation.
-    // r0: type, r9: Thread::Current
+    // r0: type, rSELF (r9): Thread::Current
     // r1, r2, r3, r12: free.
     ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lslow_path\name, \isInitialized
 .Lslow_path\name:
     SETUP_SAVE_REFS_ONLY_FRAME r2                             // Save callee saves in case of GC.
-    mov    r1, r9                                             // Pass Thread::Current.
+    mov    r1, rSELF                                          // Pass Thread::Current.
     bl     \entrypoint                                        // (mirror::Class* klass, Thread*)
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -1397,7 +1392,7 @@
 // The common fast path code for art_quick_alloc_array_resolved/initialized_tlab
 // and art_quick_alloc_array_resolved/initialized_region_tlab.
 //
-// r0: type r1: component_count r2: total_size r9: Thread::Current, r3, r12: free.
+// r0: type, r1: component_count, r2: total_size, rSELF (r9): Thread::Current, r3, r12: free.
 // Need to preserve r0 and r1 to the slow path.
 .macro ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE slowPathLabel
     and    r2, r2, #OBJECT_ALIGNMENT_MASK_TOGGLED             // Apply alignment mask
@@ -1409,7 +1404,7 @@
 #if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
 #error "Thread::thread_local_pos/end must be consecutive and are 8 byte aligned for performance"
 #endif
-    ldrd   r3, r12, [r9, #THREAD_LOCAL_POS_OFFSET]
+    ldrd   r3, r12, [rSELF, #THREAD_LOCAL_POS_OFFSET]
     sub    r12, r12, r3                                       // Compute the remaining buf size.
     cmp    r2, r12                                            // Check if the total_size fits.
     // The array class is always initialized here. Unlike new-instance,
@@ -1417,10 +1412,10 @@
     bhi    \slowPathLabel
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
     add    r2, r2, r3
-    str    r2, [r9, #THREAD_LOCAL_POS_OFFSET]                 // Store new thread_local_pos.
-    ldr    r2, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]             // Increment thread_local_objects.
+    str    r2, [rSELF, #THREAD_LOCAL_POS_OFFSET]              // Store new thread_local_pos.
+    ldr    r2, [rSELF, #THREAD_LOCAL_OBJECTS_OFFSET]          // Increment thread_local_objects.
     add    r2, r2, #1
-    str    r2, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
+    str    r2, [rSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
     POISON_HEAP_REF r0
     str    r0, [r3, #MIRROR_OBJECT_CLASS_OFFSET]              // Store the class pointer.
     str    r1, [r3, #MIRROR_ARRAY_LENGTH_OFFSET]              // Store the array length.
@@ -1443,7 +1438,7 @@
     // Fast path array allocation for region tlab allocation.
     // r0: mirror::Class* type
     // r1: int32_t component_count
-    // r9: thread
+    // rSELF (r9): thread
     // r2, r3, r12: free.
     \size_setup .Lslow_path\name
     ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED_WITH_SIZE .Lslow_path\name
@@ -1452,7 +1447,7 @@
     // r1: int32_t component_count
     // r2: Thread* self
     SETUP_SAVE_REFS_ONLY_FRAME r2  // save callee saves in case of GC
-    mov    r2, r9                  // pass Thread::Current
+    mov    r2, rSELF               // pass Thread::Current
     bl     \entrypoint
     RESTORE_SAVE_REFS_ONLY_FRAME
     REFRESH_MARKING_REGISTER
@@ -1575,10 +1570,10 @@
      .extern artQuickProxyInvokeHandler
 ENTRY art_quick_proxy_invoke_handler
     SETUP_SAVE_REFS_AND_ARGS_FRAME_WITH_METHOD_IN_R0
-    mov     r2, r9                 @ pass Thread::Current
+    mov     r2, rSELF              @ pass Thread::Current
     mov     r3, sp                 @ pass SP
     blx     artQuickProxyInvokeHandler  @ (Method* proxy method, receiver, Thread*, SP)
-    ldr     r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
+    ldr     r2, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
     // Tear down the callee-save frame. Skip arg registers.
     add     sp, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
     .cfi_adjust_cfa_offset -(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
@@ -1706,7 +1701,7 @@
     .extern artQuickResolutionTrampoline
 ENTRY art_quick_resolution_trampoline
     SETUP_SAVE_REFS_AND_ARGS_FRAME r2
-    mov     r2, r9                 @ pass Thread::Current
+    mov     r2, rSELF              @ pass Thread::Current
     mov     r3, sp                 @ pass SP
     blx     artQuickResolutionTrampoline  @ (Method* called, receiver, Thread*, SP)
     cbz     r0, 1f                 @ is code pointer null? goto exception
@@ -1780,10 +1775,10 @@
     blx artQuickGenericJniEndTrampoline
 
     // Restore self pointer.
-    mov r9, r11
+    mov rSELF, r11
 
     // Pending exceptions possible.
-    ldr r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
+    ldr r2, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
     cbnz r2, .Lexception_in_native
 
     // Tear down the alloca.
@@ -1804,7 +1799,7 @@
     .cfi_adjust_cfa_offset FRAME_SIZE_SAVE_REFS_AND_ARGS-FRAME_SIZE_SAVE_REFS_ONLY
 
 .Lexception_in_native:
-    ldr ip, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]
+    ldr ip, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
     add ip, ip, #-1  // Remove the GenericJNI tag. ADD/SUB writing directly to SP is UNPREDICTABLE.
     mov sp, ip
     .cfi_def_cfa_register sp
@@ -1815,10 +1810,10 @@
     .extern artQuickToInterpreterBridge
 ENTRY art_quick_to_interpreter_bridge
     SETUP_SAVE_REFS_AND_ARGS_FRAME r1
-    mov     r1, r9                 @ pass Thread::Current
+    mov     r1, rSELF              @ pass Thread::Current
     mov     r2, sp                 @ pass SP
     blx     artQuickToInterpreterBridge    @ (Method* method, Thread*, SP)
-    ldr     r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
+    ldr     r2, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
     // Tear down the callee-save frame. Skip arg registers.
     add     sp, #(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
     .cfi_adjust_cfa_offset -(FRAME_SIZE_SAVE_REFS_AND_ARGS - FRAME_SIZE_SAVE_REFS_ONLY)
@@ -1846,7 +1841,7 @@
     SETUP_SAVE_REFS_AND_ARGS_FRAME r2
     @ preserve r0 (not normally an arg) knowing there is a spare slot in kSaveRefsAndArgs.
     str   r0, [sp, #4]
-    mov   r2, r9         @ pass Thread::Current
+    mov   r2, rSELF      @ pass Thread::Current
     mov   r3, sp         @ pass SP
     blx   artInstrumentationMethodEntryFromCode  @ (Method*, Object*, Thread*, SP)
     cbz   r0, .Ldeliver_instrumentation_entry_exception
@@ -1872,7 +1867,7 @@
     add   r3, sp, #8     @ store fpr_res pointer, in kSaveEverything frame
     add   r2, sp, #136   @ store gpr_res pointer, in kSaveEverything frame
     mov   r1, sp         @ pass SP
-    mov   r0, r9         @ pass Thread::Current
+    mov   r0, rSELF      @ pass Thread::Current
     blx   artInstrumentationMethodExitFromCode  @ (Thread*, SP, gpr_res*, fpr_res*)
 
     cbz   r0, .Ldo_deliver_instrumentation_exception
@@ -1901,7 +1896,7 @@
     .extern artDeoptimize
 ENTRY art_quick_deoptimize
     SETUP_SAVE_EVERYTHING_FRAME r0
-    mov    r0, r9         @ pass Thread::Current
+    mov    r0, rSELF      @ pass Thread::Current
     blx    artDeoptimize  @ (Thread*)
 END art_quick_deoptimize
 
@@ -1912,7 +1907,7 @@
     .extern artDeoptimizeFromCompiledCode
 ENTRY art_quick_deoptimize_from_compiled_code
     SETUP_SAVE_EVERYTHING_FRAME r1
-    mov    r1, r9                         @ pass Thread::Current
+    mov    r1, rSELF                      @ pass Thread::Current
     blx    artDeoptimizeFromCompiledCode  @ (DeoptimizationKind, Thread*)
 END art_quick_deoptimize_from_compiled_code
 
@@ -2691,7 +2686,7 @@
 .extern artInvokePolymorphic
 ENTRY art_quick_invoke_polymorphic
     SETUP_SAVE_REFS_AND_ARGS_FRAME r2
-    mov     r2, r9                 @ pass Thread::Current
+    mov     r2, rSELF              @ pass Thread::Current
     mov     r3, sp                 @ pass SP
     mov     r0, #0                 @ initialize 64-bit JValue as zero.
     str     r0, [sp, #-4]!
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index ac5b2b8..14d0cc7 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1151,45 +1151,36 @@
      */
     .extern artLockObjectFromCode
 ENTRY art_quick_lock_object
-    cbz    w0, .Lslow_lock
-    add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET  // exclusive load/store has no immediate anymore
+    ldr    w1, [xSELF, #THREAD_ID_OFFSET]
+    cbz    w0, art_quick_lock_object_no_inline
+                                      // Exclusive load/store has no immediate anymore.
+    add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET
 .Lretry_lock:
-    ldr    w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
-    ldaxr  w1, [x4]                   // acquire needed only in most common case
-    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
-    cbnz   w3, .Lnot_unlocked         // already thin locked
-    // unlocked case - x1: original lock word that's zero except for the read barrier bits.
-    orr    x2, x1, x2                 // x2 holds thread id with count of 0 with preserved read barrier bits
-    stxr   w3, w2, [x4]
-    cbnz   w3, .Llock_stxr_fail       // store failed, retry
+    ldaxr  w2, [x4]                   // Acquire needed only in most common case.
+    eor    w3, w2, w1                 // Prepare the value to store if unlocked
+                                      //   (thread id, count of 0 and preserved read barrier bits),
+                                      // or prepare to compare thread id for recursive lock check
+                                      //   (lock_word.ThreadId() ^ self->ThreadId()).
+    tst    w2, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // Test the non-gc bits.
+    b.ne   .Lnot_unlocked             // Check if unlocked.
+    // unlocked case - store w3: original lock word plus thread id, preserved read barrier bits.
+    stxr   w2, w3, [x4]
+    cbnz   w2, .Lretry_lock           // If the store failed, retry.
     ret
-.Lnot_unlocked:  // x1: original lock word
-    lsr    w3, w1, LOCK_WORD_STATE_SHIFT
-    cbnz   w3, .Lslow_lock            // if either of the top two bits are set, go slow path
-    eor    w2, w1, w2                 // lock_word.ThreadId() ^ self->ThreadId()
-    uxth   w2, w2                     // zero top 16 bits
-    cbnz   w2, .Lslow_lock            // lock word and self thread id's match -> recursive lock
-                                      // else contention, go to slow path
-    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits.
-    add    w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count in lock word placing in w2 to check overflow
-    lsr    w3, w2, #LOCK_WORD_GC_STATE_SHIFT     // if the first gc state bit is set, we overflowed.
-    cbnz   w3, .Lslow_lock            // if we overflow the count go slow path
-    add    w2, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count for real
-    stxr   w3, w2, [x4]
-    cbnz   w3, .Llock_stxr_fail       // store failed, retry
+.Lnot_unlocked:  // w2: original lock word, w1: thread id, w3: w2 ^ w1
+                                      // Check lock word state and thread id together.
+    tst    w3, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
+    b.ne   art_quick_lock_object_no_inline
+    add    w3, w2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // Increment the recursive lock count.
+    tst    w3, #LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED  // Test the new thin lock count.
+    b.eq   art_quick_lock_object_no_inline  // Zero as the new count indicates overflow, go slow path.
+    stxr   w2, w3, [x4]
+    cbnz   w2, .Lretry_lock           // If the store failed, retry.
     ret
-.Llock_stxr_fail:
-    b      .Lretry_lock               // retry
-.Lslow_lock:
-    SETUP_SAVE_REFS_ONLY_FRAME        // save callee saves in case we block
-    mov    x1, xSELF                  // pass Thread::Current
-    bl     artLockObjectFromCode      // (Object* obj, Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    REFRESH_MARKING_REGISTER
-    RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_lock_object
 
 ENTRY art_quick_lock_object_no_inline
+    // This is also the slow path for art_quick_lock_object.
     SETUP_SAVE_REFS_ONLY_FRAME        // save callee saves in case we block
     mov    x1, xSELF                  // pass Thread::Current
     bl     artLockObjectFromCode      // (Object* obj, Thread*)
@@ -1206,54 +1197,46 @@
      */
     .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object
-    cbz    x0, .Lslow_unlock
-    add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET  // exclusive load/store has no immediate anymore
+    ldr    w1, [xSELF, #THREAD_ID_OFFSET]
+    cbz    x0, art_quick_unlock_object_no_inline
+                                      // Exclusive load/store has no immediate anymore.
+    add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET
 .Lretry_unlock:
 #ifndef USE_READ_BARRIER
-    ldr    w1, [x4]
+    ldr    w2, [x4]
 #else
-    ldxr   w1, [x4]                   // Need to use atomic instructions for read barrier
+    ldxr   w2, [x4]                   // Need to use atomic instructions for read barrier.
 #endif
-    lsr    w2, w1, LOCK_WORD_STATE_SHIFT
-    cbnz   w2, .Lslow_unlock          // if either of the top two bits are set, go slow path
-    ldr    w2, [xSELF, #THREAD_ID_OFFSET]
-    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
-    eor    w3, w3, w2                 // lock_word.ThreadId() ^ self->ThreadId()
-    uxth   w3, w3                     // zero top 16 bits
-    cbnz   w3, .Lslow_unlock          // do lock word and self thread id's match?
-    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
-    cmp    w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
-    bpl    .Lrecursive_thin_unlock
-    // transition to unlocked
-    and    w3, w1, #LOCK_WORD_GC_STATE_MASK_SHIFTED  // w3: zero except for the preserved read barrier bits
+    eor    w3, w2, w1                 // Prepare the value to store if simply locked
+                                      //   (mostly 0s, and preserved read barrier bits),
+                                      // or prepare to compare thread id for recursive lock check
+                                      //   (lock_word.ThreadId() ^ self->ThreadId()).
+    tst    w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // Test the non-gc bits.
+    b.ne   .Lnot_simply_locked        // Locked recursively or by other thread?
+    // Transition to unlocked.
 #ifndef USE_READ_BARRIER
     stlr   w3, [x4]
 #else
-    stlxr  w2, w3, [x4]               // Need to use atomic instructions for read barrier
-    cbnz   w2, .Lunlock_stxr_fail     // store failed, retry
+    stlxr  w2, w3, [x4]               // Need to use atomic instructions for read barrier.
+    cbnz   w2, .Lretry_unlock         // If the store failed, retry.
 #endif
     ret
-.Lrecursive_thin_unlock:  // w1: original lock word
-    sub    w1, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // decrement count
+.Lnot_simply_locked:  // w2: original lock word, w1: thread id, w3: w2 ^ w1
+                                      // Check lock word state and thread id together.
+    tst    w3, #(LOCK_WORD_STATE_MASK_SHIFTED | LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED)
+    b.ne   art_quick_unlock_object_no_inline
+    sub    w3, w2, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // decrement count
 #ifndef USE_READ_BARRIER
-    str    w1, [x4]
+    str    w3, [x4]
 #else
-    stxr   w2, w1, [x4]               // Need to use atomic instructions for read barrier
-    cbnz   w2, .Lunlock_stxr_fail     // store failed, retry
+    stxr   w2, w3, [x4]               // Need to use atomic instructions for read barrier.
+    cbnz   w2, .Lretry_unlock         // If the store failed, retry.
 #endif
     ret
-.Lunlock_stxr_fail:
-    b      .Lretry_unlock             // retry
-.Lslow_unlock:
-    SETUP_SAVE_REFS_ONLY_FRAME        // save callee saves in case exception allocation triggers GC
-    mov    x1, xSELF                  // pass Thread::Current
-    bl     artUnlockObjectFromCode    // (Object* obj, Thread*)
-    RESTORE_SAVE_REFS_ONLY_FRAME
-    REFRESH_MARKING_REGISTER
-    RETURN_IF_W0_IS_ZERO_OR_DELIVER
 END art_quick_unlock_object
 
 ENTRY art_quick_unlock_object_no_inline
+    // This is also the slow path for art_quick_unlock_object.
     SETUP_SAVE_REFS_ONLY_FRAME        // save callee saves in case exception allocation triggers GC
     mov    x1, xSELF                  // pass Thread::Current
     bl     artUnlockObjectFromCode    // (Object* obj, Thread*)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 8ab4ce1..b89d45f 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1292,7 +1292,7 @@
     jz   .Lslow_lock
 .Lretry_lock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx  // ecx := lock word
-    test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // test the 2 high bits.
+    test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx  // test the 2 high bits.
     jne  .Lslow_lock                      // slow path if either of the two high bits are set.
     movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
     andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
@@ -1362,7 +1362,7 @@
 .Lretry_unlock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx  // ecx := lock word
     movl %fs:THREAD_ID_OFFSET, %edx       // edx := thread id
-    test LITERAL(LOCK_WORD_STATE_MASK), %ecx
+    test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx
     jnz  .Lslow_unlock                    // lock word contains a monitor
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index eb945ed..c179033 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1312,7 +1312,7 @@
     jz   .Lslow_lock
 .Lretry_lock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx  // ecx := lock word.
-    test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // Test the 2 high bits.
+    test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx  // Test the 2 high bits.
     jne  .Lslow_lock                      // Slow path if either of the two high bits are set.
     movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
     andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
@@ -1362,7 +1362,7 @@
 .Lretry_unlock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx  // ecx := lock word
     movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
-    test LITERAL(LOCK_WORD_STATE_MASK), %ecx
+    test LITERAL(LOCK_WORD_STATE_MASK_SHIFTED), %ecx
     jnz  .Lslow_unlock                    // lock word contains a monitor
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index 46630db..464c2b7 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -90,16 +90,24 @@
 DEFINE_CHECK_EQ(static_cast<size_t>(MIN_LARGE_OBJECT_THRESHOLD), (static_cast<size_t>(art::gc::Heap::kMinLargeObjectThreshold)))
 #define LOCK_WORD_STATE_SHIFT 30
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kStateShift)))
-#define LOCK_WORD_STATE_MASK 0xc0000000
-DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_MASK), (static_cast<uint32_t>(art::LockWord::kStateMaskShifted)))
+#define LOCK_WORD_STATE_MASK_SHIFTED 0xc0000000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kStateMaskShifted)))
 #define LOCK_WORD_READ_BARRIER_STATE_SHIFT 28
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_READ_BARRIER_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kReadBarrierStateShift)))
 #define LOCK_WORD_READ_BARRIER_STATE_MASK 0x10000000
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShifted)))
 #define LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED 0xefffffff
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShiftedToggled)))
-#define LOCK_WORD_THIN_LOCK_COUNT_ONE 65536
-DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_ONE), (static_cast<int32_t>(art::LockWord::kThinLockCountOne)))
+#define LOCK_WORD_THIN_LOCK_COUNT_SIZE 12
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_SIZE), (static_cast<int32_t>(art::LockWord::kThinLockCountSize)))
+#define LOCK_WORD_THIN_LOCK_COUNT_SHIFT 16
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_SHIFT), (static_cast<int32_t>(art::LockWord::kThinLockCountShift)))
+#define LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED 0xfff0000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_THIN_LOCK_COUNT_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kThinLockCountMaskShifted)))
+#define LOCK_WORD_THIN_LOCK_COUNT_ONE 0x10000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_THIN_LOCK_COUNT_ONE), (static_cast<uint32_t>(art::LockWord::kThinLockCountOne)))
+#define LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED 0xffff
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_THIN_LOCK_OWNER_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kThinLockOwnerMaskShifted)))
 #define LOCK_WORD_STATE_FORWARDING_ADDRESS 0x3
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_FORWARDING_ADDRESS), (static_cast<uint32_t>(art::LockWord::kStateForwardingAddress)))
 #define LOCK_WORD_STATE_FORWARDING_ADDRESS_OVERFLOW 0x40000000
@@ -110,6 +118,8 @@
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShifted)))
 #define LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED 0xcfffffff
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShiftedToggled)))
+#define LOCK_WORD_GC_STATE_SIZE 2
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_GC_STATE_SIZE), (static_cast<int32_t>(art::LockWord::kGCStateSize)))
 #define LOCK_WORD_GC_STATE_SHIFT 28
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_GC_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kGCStateShift)))
 #define LOCK_WORD_MARK_BIT_SHIFT 29
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 09d856f..ce7fe34 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -75,16 +75,18 @@
     // Remaining bits are the recursive lock count.
     kThinLockCountSize = 32 - kThinLockOwnerSize - kStateSize - kReadBarrierStateSize -
         kMarkBitStateSize,
-    // Thin lock bits. Owner in lowest bits.
 
+    // Thin lock bits. Owner in lowest bits.
     kThinLockOwnerShift = 0,
     kThinLockOwnerMask = (1 << kThinLockOwnerSize) - 1,
+    kThinLockOwnerMaskShifted = kThinLockOwnerMask << kThinLockOwnerShift,
     kThinLockMaxOwner = kThinLockOwnerMask,
     // Count in higher bits.
     kThinLockCountShift = kThinLockOwnerSize + kThinLockOwnerShift,
     kThinLockCountMask = (1 << kThinLockCountSize) - 1,
     kThinLockMaxCount = kThinLockCountMask,
     kThinLockCountOne = 1 << kThinLockCountShift,  // == 65536 (0x10000)
+    kThinLockCountMaskShifted = kThinLockCountMask << kThinLockCountShift,
 
     // State in the highest bits.
     kStateShift = kReadBarrierStateSize + kThinLockCountSize + kThinLockCountShift +
diff --git a/tools/cpp-define-generator/constant_lockword.def b/tools/cpp-define-generator/constant_lockword.def
index 08d5885..977d1ca 100644
--- a/tools/cpp-define-generator/constant_lockword.def
+++ b/tools/cpp-define-generator/constant_lockword.def
@@ -23,23 +23,29 @@
 #define DEFINE_LOCK_WORD_EXPR(macro_name, type, constant_field_name) \
   DEFINE_EXPR(LOCK_WORD_ ## macro_name, type, art::LockWord::constant_field_name)
 
+// FIXME: The naming is inconsistent; the `Shifted` -> `_SHIFTED` suffix is sometimes missing.
 DEFINE_LOCK_WORD_EXPR(STATE_SHIFT,               int32_t,  kStateShift)
-DEFINE_LOCK_WORD_EXPR(STATE_MASK,                uint32_t, kStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(STATE_MASK_SHIFTED,        uint32_t, kStateMaskShifted)
 DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_SHIFT,  int32_t,  kReadBarrierStateShift)
-DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK,   uint32_t,  kReadBarrierStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK,   uint32_t, kReadBarrierStateMaskShifted)
 DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK_TOGGLED, uint32_t, kReadBarrierStateMaskShiftedToggled)
-DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_ONE,       int32_t,  kThinLockCountOne)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_SIZE,      int32_t,  kThinLockCountSize)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_SHIFT,     int32_t,  kThinLockCountShift)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_MASK_SHIFTED, uint32_t, kThinLockCountMaskShifted)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_ONE,       uint32_t, kThinLockCountOne)
+DEFINE_LOCK_WORD_EXPR(THIN_LOCK_OWNER_MASK_SHIFTED, uint32_t, kThinLockOwnerMaskShifted)
 
-DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS, uint32_t, kStateForwardingAddress)
+DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS,  uint32_t, kStateForwardingAddress)
 DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS_OVERFLOW, uint32_t, kStateForwardingAddressOverflow)
 DEFINE_LOCK_WORD_EXPR(STATE_FORWARDING_ADDRESS_SHIFT, uint32_t, kForwardingAddressShift)
 
-DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED,   uint32_t,  kGCStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED,     uint32_t,  kGCStateMaskShifted)
 DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED_TOGGLED, uint32_t, kGCStateMaskShiftedToggled)
-DEFINE_LOCK_WORD_EXPR(GC_STATE_SHIFT,   int32_t,  kGCStateShift)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_SIZE,             int32_t,  kGCStateSize)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_SHIFT,            int32_t,  kGCStateShift)
 
-DEFINE_LOCK_WORD_EXPR(MARK_BIT_SHIFT, int32_t, kMarkBitStateShift)
-DEFINE_LOCK_WORD_EXPR(MARK_BIT_MASK_SHIFTED, uint32_t, kMarkBitStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(MARK_BIT_SHIFT,            int32_t,  kMarkBitStateShift)
+DEFINE_LOCK_WORD_EXPR(MARK_BIT_MASK_SHIFTED,     uint32_t, kMarkBitStateMaskShifted)
 
 #undef DEFINE_LOCK_WORD_EXPR