JNI: Faster mutator locking during transition.

Add a mutator lock pointer to `Thread`. Loading the pointer
from the `Thread` object is faster on ARM and ARM64 than
loading the global `Locks::mutator_lock_`, and it also makes
the lock accessible to JNI stubs if we decide to inline
`JniMethodStart()` and `JniMethodEnd()`.
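
As a rough illustration of the idea (simplified stand-in types,
not the actual ART declarations), the cached pointer turns the
lookup into a single load at a fixed offset within the Thread
object:

  // Sketch only: `MutatorMutexSketch`, `GlobalLocks` and `ThreadSketch`
  // are simplified stand-ins for the real MutatorMutex, Locks and Thread.
  struct MutatorMutexSketch {};
  struct GlobalLocks { static inline MutatorMutexSketch* mutator_lock_ = nullptr; };

  struct ThreadSketch {
    struct TlsPtr {
      MutatorMutexSketch* mutator_lock = nullptr;  // Cached copy of the global pointer.
    } tlsPtr_;

    // Done once, when the thread is constructed/attached.
    void CacheMutatorLock() { tlsPtr_.mutator_lock = GlobalLocks::mutator_lock_; }

    // Hot path during state transitions: one load from the Thread object,
    // no need to materialize the address of the global.
    MutatorMutexSketch* GetMutatorLock() { return tlsPtr_.mutator_lock; }
  };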

Pass the lock level `kMutatorLock` explicitly as a constant
from the `MutatorMutex` transition functions to let the
compiler evaluate most of the level-dependent conditions
statically and avoid emitting unnecessary code.
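
A hedged sketch of why the explicit constant helps (simplified
lock levels and bookkeeping, not the real `RegisterAsUnlockedImpl()`):
when the level is a compile-time constant, the level-dependent
branches fold away.

  // Sketch only: simplified lock levels and per-level bookkeeping.
  enum LockLevel { kThreadWaitLock, kThreadWaitWakeLock, kMutatorLock, kMonitorLock, kLockLevelCount };

  struct MutexSketch {
    void* held_slot[kLockLevelCount] = {};

    void RegisterAsUnlockedImpl(LockLevel level) {
      if (level != kMonitorLock) {          // Statically true when level == kMutatorLock.
        if (level == kThreadWaitLock) {     // Statically false when level == kMutatorLock.
          level = kThreadWaitWakeLock;
        }
        held_slot[level] = nullptr;         // Bookkeeping at a now-constant index.
      }
    }

    // Transition call site: the constant argument lets the compiler drop both
    // branches above and keep only the store to the kMutatorLock slot.
    void TransitionFromRunnableToSuspended() { RegisterAsUnlockedImpl(kMutatorLock); }
  };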

Golem results for art-opt-cc (higher is better):
linux-armv7                      before after
NativeDowncallStaticNormal       6.3694 7.2394 (+13.66%)
NativeDowncallStaticNormal6      6.0663 6.8527 (+12.96%)
NativeDowncallStaticNormalRefs6  5.7061 6.3945 (+12.06%)
NativeDowncallVirtualNormal      5.7088 7.2081 (+26.26%)
NativeDowncallVirtualNormal6     5.4563 6.7929 (+24.49%)
NativeDowncallVirtualNormalRefs6 5.1595 6.3415 (+22.91%)
linux-armv8                      before after
NativeDowncallStaticNormal       6.4229 7.0423 (+9.642%)
NativeDowncallStaticNormal6      6.2651 6.8527 (+9.379%)
NativeDowncallStaticNormalRefs6  5.8824 6.3976 (+8.760%)
NativeDowncallVirtualNormal      6.2651 6.8527 (+9.379%)
NativeDowncallVirtualNormal6     6.0663 6.6163 (+9.066%)
NativeDowncallVirtualNormalRefs6 5.6630 6.1408 (+8.436%)
There does not seem to be a measurable difference for x86
and x86-64.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Bug: 172332525
Change-Id: I2ad511a2fe7bac250549c43789cf3fb5e2de9e25
diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h
index 2a1a08d..dba1e12 100644
--- a/runtime/base/mutex-inl.h
+++ b/runtime/base/mutex-inl.h
@@ -93,7 +93,12 @@
     CheckUnattachedThread(level_);
     return;
   }
-  LockLevel level = level_;
+  RegisterAsLockedImpl(self, level_);
+}
+
+inline void BaseMutex::RegisterAsLockedImpl(Thread* self, LockLevel level) {
+  DCHECK(self != nullptr);
+  DCHECK_EQ(level_, level);
   // It would be nice to avoid this condition checking in the non-debug case,
   // but that would make the various methods that check if a mutex is held not
   // work properly for thread wait locks. Since the vast majority of lock
@@ -159,8 +164,13 @@
     CheckUnattachedThread(level_);
     return;
   }
-  if (level_ != kMonitorLock) {
-    auto level = level_;
+  RegisterAsUnlockedImpl(self, level_);
+}
+
+inline void BaseMutex::RegisterAsUnlockedImpl(Thread* self, LockLevel level) {
+  DCHECK(self != nullptr);
+  DCHECK_EQ(level_, level);
+  if (level != kMonitorLock) {
     if (UNLIKELY(level == kThreadWaitLock) && self->GetHeldMutex(kThreadWaitWakeLock) == this) {
       level = kThreadWaitWakeLock;
     }
@@ -292,11 +302,11 @@
 
 inline void MutatorMutex::TransitionFromRunnableToSuspended(Thread* self) {
   AssertSharedHeld(self);
-  RegisterAsUnlocked(self);
+  RegisterAsUnlockedImpl(self, kMutatorLock);
 }
 
 inline void MutatorMutex::TransitionFromSuspendedToRunnable(Thread* self) {
-  RegisterAsLocked(self);
+  RegisterAsLockedImpl(self, kMutatorLock);
   AssertSharedHeld(self);
 }
 
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 8f2a8ea..87e9525 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -106,8 +106,11 @@
   // Add this mutex to those owned by self, and perform appropriate checking.
   // For this call only, self may also be another suspended thread.
   void RegisterAsLocked(Thread* self);
+  void RegisterAsLockedImpl(Thread* self, LockLevel level);
 
   void RegisterAsUnlocked(Thread* self);
+  void RegisterAsUnlockedImpl(Thread* self, LockLevel level);
+
   void CheckSafeToWait(Thread* self);
 
   friend class ScopedContentionRecorder;
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 609f081..c19e000 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -126,7 +126,8 @@
                         sizeof(void*) * kNumRosAllocThreadLocalSizeBracketsInThread);
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_alloc_stack_top, thread_local_alloc_stack_end,
                         sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_alloc_stack_end, held_mutexes, sizeof(void*));
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_alloc_stack_end, mutator_lock, sizeof(void*));
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, mutator_lock, held_mutexes, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, held_mutexes, flip_function,
                         sizeof(void*) * kLockLevelCount);
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, flip_function, method_verifier, sizeof(void*));
diff --git a/runtime/oat.h b/runtime/oat.h
index bc9a2ca..acb3d30 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Inlining across dex files for bss within OAT.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '1', '1', '\0' } };
+  // Last oat version changed reason: JNI: Faster mutator locking during transition.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '1', '2', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 9d96e9d..f5bf5fb 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -75,7 +75,7 @@
         for (int i = kLockLevelCount - 1; i >= 0; --i) {
           BaseMutex* held_mutex = self->GetHeldMutex(static_cast<LockLevel>(i));
           if (held_mutex != nullptr &&
-              held_mutex != Locks::mutator_lock_ &&
+              held_mutex != GetMutatorLock() &&
               held_mutex != cond_var_mutex) {
             CHECK(Locks::IsExpectedOnWeakRefAccess(held_mutex))
                 << "Holding unexpected mutex " << held_mutex->GetName()
@@ -150,7 +150,7 @@
     if (check_locks) {
       bool bad_mutexes_held = false;
       for (int i = kLockLevelCount - 1; i >= 0; --i) {
-        // We expect no locks except the mutator_lock_. User code suspension lock is OK as long as
+        // We expect no locks except the mutator lock. User code suspension lock is OK as long as
         // we aren't going to be held suspended due to SuspendReason::kForUserCode.
         if (i != kMutatorLock && i != kUserCodeSuspensionLock) {
           BaseMutex* held_mutex = GetHeldMutex(static_cast<LockLevel>(i));
@@ -234,8 +234,8 @@
   DCHECK_EQ(this, Thread::Current());
   // Change to non-runnable state, thereby appearing suspended to the system.
   TransitionToSuspendedAndRunCheckpoints(new_state);
-  // Mark the release of the share of the mutator_lock_.
-  Locks::mutator_lock_->TransitionFromRunnableToSuspended(this);
+  // Mark the release of the share of the mutator lock.
+  GetMutatorLock()->TransitionFromRunnableToSuspended(this);
   // Once suspended - check the active suspend barrier flag
   PassActiveSuspendBarriers();
 }
@@ -246,7 +246,7 @@
   int16_t old_state = old_state_and_flags.as_struct.state;
   DCHECK_NE(static_cast<ThreadState>(old_state), kRunnable);
   do {
-    Locks::mutator_lock_->AssertNotHeld(this);  // Otherwise we starve GC..
+    GetMutatorLock()->AssertNotHeld(this);  // Otherwise we starve GC.
     old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
     DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
     if (LIKELY(old_state_and_flags.as_struct.flags == 0)) {
@@ -260,8 +260,8 @@
       if (LIKELY(tls32_.state_and_flags.as_atomic_int.CompareAndSetWeakAcquire(
                                                  old_state_and_flags.as_int,
                                                  new_state_and_flags.as_int))) {
-        // Mark the acquisition of a share of the mutator_lock_.
-        Locks::mutator_lock_->TransitionFromSuspendedToRunnable(this);
+        // Mark the acquisition of a share of the mutator lock.
+        GetMutatorLock()->TransitionFromSuspendedToRunnable(this);
         break;
       }
     } else if ((old_state_and_flags.as_struct.flags & kActiveSuspendBarrier) != 0) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index d54330a..9fb8d62 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2297,6 +2297,8 @@
       is_runtime_thread_(false) {
   wait_mutex_ = new Mutex("a thread wait mutex", LockLevel::kThreadWaitLock);
   wait_cond_ = new ConditionVariable("a thread wait condition variable", *wait_mutex_);
+  tlsPtr_.mutator_lock = Locks::mutator_lock_;
+  DCHECK(tlsPtr_.mutator_lock != nullptr);
   tlsPtr_.instrumentation_stack =
       new std::map<uintptr_t, instrumentation::InstrumentationStackFrame>;
   tlsPtr_.name = new std::string(kThreadNameDuringStartup);
diff --git a/runtime/thread.h b/runtime/thread.h
index 7e60582..9478980 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1346,6 +1346,11 @@
     return old_state;
   }
 
+  MutatorMutex* GetMutatorLock() RETURN_CAPABILITY(Locks::mutator_lock_) {
+    DCHECK_EQ(tlsPtr_.mutator_lock, Locks::mutator_lock_);
+    return tlsPtr_.mutator_lock;
+  }
+
   void VerifyStackImpl() REQUIRES_SHARED(Locks::mutator_lock_);
 
   void DumpState(std::ostream& os) const REQUIRES_SHARED(Locks::mutator_lock_);
@@ -1639,6 +1644,7 @@
                                thread_local_objects(0),
                                thread_local_alloc_stack_top(nullptr),
                                thread_local_alloc_stack_end(nullptr),
+                               mutator_lock(nullptr),
                                flip_function(nullptr),
                                method_verifier(nullptr),
                                thread_local_mark_stack(nullptr),
@@ -1782,6 +1788,10 @@
     StackReference<mirror::Object>* thread_local_alloc_stack_top;
     StackReference<mirror::Object>* thread_local_alloc_stack_end;
 
+    // Pointer to the mutator lock.
+    // This is the same as `Locks::mutator_lock_` but cached for faster state transitions.
+    MutatorMutex* mutator_lock;
+
     // Support for Mutex lock hierarchy bug detection.
     BaseMutex* held_mutexes[kLockLevelCount];