30 files changed, 1435 insertions, 1136 deletions
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc index 3fe822666f..47a67c72c0 100644 --- a/compiler/jni/jni_cfi_test_expected.inc +++ b/compiler/jni/jni_cfi_test_expected.inc @@ -1,9 +1,12 @@ // TODO These arrays should be generated automatically or have instructions for re-creation. +// For now, the gc_is_marking offset can be adjusted by tweaking the last CL that made a +// similar change. + static constexpr uint8_t expected_asm_kThumb2[] = { 0x2D, 0xE9, 0xE0, 0x4D, 0x2D, 0xED, 0x10, 0x8A, 0x81, 0xB0, 0x00, 0x90, 0x19, 0x91, 0x8D, 0xED, 0x1A, 0x0A, 0x1B, 0x92, 0x1C, 0x93, 0x88, 0xB0, 0x08, 0xB0, 0x01, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x4D, - 0xD9, 0xF8, 0x24, 0x80, 0x70, 0x47, + 0xD9, 0xF8, 0x20, 0x80, 0x70, 0x47, }; static constexpr uint8_t expected_cfi_kThumb2[] = { 0x44, 0x0E, 0x1C, 0x85, 0x07, 0x86, 0x06, 0x87, 0x05, 0x88, 0x04, 0x8A, @@ -86,7 +89,7 @@ static constexpr uint8_t expected_cfi_kThumb2[] = { // 0x00000024: .cfi_restore: r10 // 0x00000024: .cfi_restore: r11 // 0x00000024: .cfi_restore: r14 -// 0x00000024: ldr r8, [tr, #36] ; is_gc_marking +// 0x00000024: ldr r8, [tr, #32] ; is_gc_marking // 0x00000028: bx lr // 0x0000002a: .cfi_restore_state // 0x0000002a: .cfi_def_cfa_offset: 112 @@ -101,7 +104,7 @@ static constexpr uint8_t expected_asm_kArm64[] = { 0xF3, 0x53, 0x45, 0xA9, 0xF5, 0x5B, 0x46, 0xA9, 0xF7, 0x63, 0x47, 0xA9, 0xF9, 0x6B, 0x48, 0xA9, 0xFB, 0x73, 0x49, 0xA9, 0xFD, 0x7B, 0x4A, 0xA9, 0xE8, 0x27, 0x41, 0x6D, 0xEA, 0x2F, 0x42, 0x6D, 0xEC, 0x37, 0x43, 0x6D, - 0xEE, 0x3F, 0x44, 0x6D, 0x74, 0x26, 0x40, 0xB9, 0xFF, 0xC3, 0x02, 0x91, + 0xEE, 0x3F, 0x44, 0x6D, 0x74, 0x22, 0x40, 0xB9, 0xFF, 0xC3, 0x02, 0x91, 0xC0, 0x03, 0x5F, 0xD6, }; static constexpr uint8_t expected_cfi_kArm64[] = { @@ -188,7 +191,7 @@ static constexpr uint8_t expected_cfi_kArm64[] = { // 0x0000006c: ldp d14, d15, [sp, #64] // 0x00000070: .cfi_restore_extended: r78 // 0x00000070: .cfi_restore_extended: r79 -// 0x00000070: ldr w20, [tr, #48] ; is_gc_marking +// 0x00000070: ldr w20, [tr, #32] ; is_gc_marking // 0x00000074: add sp, sp, #0xb0 (176) // 0x00000078: .cfi_def_cfa_offset: 0 // 0x00000078: ret diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc index aea7f14762..6e0048eb20 100644 --- a/compiler/utils/assembler_thumb_test_expected.cc.inc +++ b/compiler/utils/assembler_thumb_test_expected.cc.inc @@ -76,7 +76,7 @@ const char* const VixlJniHelpersResults = { " e4: f1bb 0f00 cmp.w r11, #0\n" " e8: bf18 it ne\n" " ea: 46e3 movne r11, r12\n" - " ec: f8d9 c09c ldr.w r12, [r9, #156]\n" + " ec: f8d9 c094 ldr.w r12, [r9, #148]\n" " f0: f1bc 0f00 cmp.w r12, #0\n" " f4: d16f bne 0x1d6 @ imm = #222\n" " f6: f8cd c7ff str.w r12, [sp, #2047]\n" @@ -151,10 +151,10 @@ const char* const VixlJniHelpersResults = { " 206: b001 add sp, #4\n" " 208: ecbd 8a10 vpop {s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31}\n" " 20c: e8bd 4de0 pop.w {r5, r6, r7, r8, r10, r11, lr}\n" - " 210: f8d9 8024 ldr.w r8, [r9, #36]\n" + " 210: f8d9 8020 ldr.w r8, [r9, #32]\n" " 214: 4770 bx lr\n" - " 216: f8d9 009c ldr.w r0, [r9, #156]\n" - " 21a: f8d9 e2d0 ldr.w lr, [r9, #720]\n" + " 216: f8d9 0094 ldr.w r0, [r9, #148]\n" + " 21a: f8d9 e2c4 ldr.w lr, [r9, #708]\n" " 21e: 47f0 blx lr\n" }; diff --git a/openjdkjvmti/ti_stack.cc b/openjdkjvmti/ti_stack.cc index 9af8861260..f399a6738b 100644 --- a/openjdkjvmti/ti_stack.cc +++ b/openjdkjvmti/ti_stack.cc @@ -363,7 +363,13 @@ static void 
RunCheckpointAndWait(Data* data, size_t max_frame_count) REQUIRES_SHARED(art::Locks::mutator_lock_) { // Note: requires the mutator lock as the checkpoint requires the mutator lock. GetAllStackTracesVectorClosure<Data> closure(max_frame_count, data); - size_t barrier_count = art::Runtime::Current()->GetThreadList()->RunCheckpoint(&closure, nullptr); + // TODO(b/253671779): Replace this use of RunCheckpointUnchecked() with RunCheckpoint(). This is + // currently not possible, since the following undesirable call chain (abbreviated here) is then + // possible and exercised by current tests: (jvmti) GetAllStackTraces -> <this function> -> + // RunCheckpoint -> GetStackTraceVisitor -> EncodeMethodId -> Class::EnsureMethodIds -> + // Class::Alloc -> AllocObjectWithAllocator -> potentially suspends, or runs GC, etc. -> CHECK + // failure. + size_t barrier_count = art::Runtime::Current()->GetThreadList()->RunCheckpointUnchecked(&closure); if (barrier_count == 0) { return; } diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc index 13eebbff04..590b3b5ebe 100644 --- a/openjdkjvmti/ti_thread.cc +++ b/openjdkjvmti/ti_thread.cc @@ -900,17 +900,13 @@ jvmtiError ThreadUtil::SuspendOther(art::Thread* self, } } } - bool timeout = true; art::Thread* ret_target = art::Runtime::Current()->GetThreadList()->SuspendThreadByPeer( - target_jthread, - art::SuspendReason::kForUserCode, - &timeout); - if (ret_target == nullptr && !timeout) { + target_jthread, art::SuspendReason::kForUserCode); + if (ret_target == nullptr) { // TODO It would be good to get more information about why exactly the thread failed to // suspend. return ERR(INTERNAL); - } else if (!timeout) { - // we didn't time out and got a result. + } else { return OK; } // We timed out. Just go around and try again. @@ -927,11 +923,7 @@ jvmtiError ThreadUtil::SuspendSelf(art::Thread* self) { // This can only happen if we race with another thread to suspend 'self' and we lose. return ERR(THREAD_SUSPENDED); } - // We shouldn't be able to fail this. - if (!self->ModifySuspendCount(self, +1, nullptr, art::SuspendReason::kForUserCode)) { - // TODO More specific error would be nice. - return ERR(INTERNAL); - } + self->IncrementSuspendCount(self, nullptr, nullptr, art::SuspendReason::kForUserCode); } // Once we have requested the suspend we actually go to sleep. We need to do this after releasing // the suspend_lock to make sure we can be woken up. This call gains the mutator lock causing us diff --git a/runtime/base/locks.h b/runtime/base/locks.h index c15e5dee71..e8c83fe2b2 100644 --- a/runtime/base/locks.h +++ b/runtime/base/locks.h @@ -108,10 +108,6 @@ enum LockLevel : uint8_t { kClassLinkerClassesLock, // TODO rename. kSubtypeCheckLock, kBreakpointLock, - // This is a generic lock level for a lock meant to be gained after having a - // monitor lock. - kPostMonitorLock, - kMonitorLock, kMonitorListLock, kThreadListLock, kAllocTrackerLock, @@ -125,7 +121,10 @@ enum LockLevel : uint8_t { kRuntimeShutdownLock, kTraceLock, kHeapBitmapLock, - + // This is a generic lock level for a lock meant to be gained after having a + // monitor lock. + kPostMonitorLock, + kMonitorLock, // This is a generic lock level for a top-level lock meant to be gained after having the // mutator_lock_. kPostMutatorTopLockLevel, @@ -138,7 +137,7 @@ enum LockLevel : uint8_t { kUserCodeSuspensionLock, kZygoteCreationLock, - // The highest valid lock level. Use this if there is code that should only be called with no + // The highest valid lock level. 
Use this for locks that should only be acquired with no // other locks held. Since this is the highest lock level we also allow it to be held even if the // runtime or current thread is not fully set-up yet (for example during thread attach). Note that // this lock also has special behavior around the mutator_lock_. Since the mutator_lock_ is not diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h index dba1e1299b..712b61d4ac 100644 --- a/runtime/base/mutex-inl.h +++ b/runtime/base/mutex-inl.h @@ -60,43 +60,44 @@ static inline void CheckUnattachedThread(LockLevel level) NO_THREAD_SAFETY_ANALY // The check below enumerates the cases where we expect not to be able to check the validity of // locks on a thread. Lock checking is disabled to avoid deadlock when checking shutdown lock. // TODO: tighten this check. - if (kDebugLocking) { - CHECK(!Locks::IsSafeToCallAbortRacy() || - // Used during thread creation to avoid races with runtime shutdown. Thread::Current not - // yet established. - level == kRuntimeShutdownLock || - // Thread Ids are allocated/released before threads are established. - level == kAllocatedThreadIdsLock || - // Thread LDT's are initialized without Thread::Current established. - level == kModifyLdtLock || - // Threads are unregistered while holding the thread list lock, during this process they - // no longer exist and so we expect an unlock with no self. - level == kThreadListLock || - // Ignore logging which may or may not have set up thread data structures. - level == kLoggingLock || - // When transitioning from suspended to runnable, a daemon thread might be in - // a situation where the runtime is shutting down. To not crash our debug locking - // mechanism we just pass null Thread* to the MutexLock during that transition - // (see Thread::TransitionFromSuspendedToRunnable). - level == kThreadSuspendCountLock || - // Avoid recursive death. - level == kAbortLock || - // Locks at the absolute top of the stack can be locked at any time. - level == kTopLockLevel || - // The unexpected signal handler may be catching signals from any thread. - level == kUnexpectedSignalLock) << level; - } + CHECK(!Locks::IsSafeToCallAbortRacy() || + // Used during thread creation to avoid races with runtime shutdown. Thread::Current not + // yet established. + level == kRuntimeShutdownLock || + // Thread Ids are allocated/released before threads are established. + level == kAllocatedThreadIdsLock || + // Thread LDT's are initialized without Thread::Current established. + level == kModifyLdtLock || + // Threads are unregistered while holding the thread list lock, during this process they + // no longer exist and so we expect an unlock with no self. + level == kThreadListLock || + // Ignore logging which may or may not have set up thread data structures. + level == kLoggingLock || + // When transitioning from suspended to runnable, a daemon thread might be in + // a situation where the runtime is shutting down. To not crash our debug locking + // mechanism we just pass null Thread* to the MutexLock during that transition + // (see Thread::TransitionFromSuspendedToRunnable). + level == kThreadSuspendCountLock || + // Avoid recursive death. + level == kAbortLock || + // Locks at the absolute top of the stack can be locked at any time. + level == kTopLockLevel || + // The unexpected signal handler may be catching signals from any thread. 
+ level == kUnexpectedSignalLock) + << level; } -inline void BaseMutex::RegisterAsLocked(Thread* self) { +inline void BaseMutex::RegisterAsLocked(Thread* self, bool check) { if (UNLIKELY(self == nullptr)) { - CheckUnattachedThread(level_); - return; + if (check) { + CheckUnattachedThread(level_); + } + } else { + RegisterAsLockedImpl(self, level_, check); } - RegisterAsLockedImpl(self, level_); } -inline void BaseMutex::RegisterAsLockedImpl(Thread* self, LockLevel level) { +inline void BaseMutex::RegisterAsLockedImpl(Thread* self, LockLevel level, bool check) { DCHECK(self != nullptr); DCHECK_EQ(level_, level); // It would be nice to avoid this condition checking in the non-debug case, @@ -107,7 +108,7 @@ inline void BaseMutex::RegisterAsLockedImpl(Thread* self, LockLevel level) { if (UNLIKELY(level == kThreadWaitLock) && self->GetHeldMutex(kThreadWaitLock) != nullptr) { level = kThreadWaitWakeLock; } - if (kDebugLocking) { + if (check) { // Check if a bad Mutex of this level or lower is held. bool bad_mutexes_held = false; // Specifically allow a kTopLockLevel lock to be gained when the current thread holds the @@ -161,10 +162,12 @@ inline void BaseMutex::RegisterAsLockedImpl(Thread* self, LockLevel level) { inline void BaseMutex::RegisterAsUnlocked(Thread* self) { if (UNLIKELY(self == nullptr)) { - CheckUnattachedThread(level_); - return; + if (kDebugLocking) { + CheckUnattachedThread(level_); + } + } else { + RegisterAsUnlockedImpl(self, level_); } - RegisterAsUnlockedImpl(self , level_); } inline void BaseMutex::RegisterAsUnlockedImpl(Thread* self, LockLevel level) { @@ -306,7 +309,7 @@ inline void MutatorMutex::TransitionFromRunnableToSuspended(Thread* self) { } inline void MutatorMutex::TransitionFromSuspendedToRunnable(Thread* self) { - RegisterAsLockedImpl(self, kMutatorLock); + RegisterAsLockedImpl(self, kMutatorLock, kDebugLocking); AssertSharedHeld(self); } diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc index 728dc842c2..ffc2168025 100644 --- a/runtime/base/mutex.cc +++ b/runtime/base/mutex.cc @@ -246,11 +246,12 @@ void BaseMutex::DumpAll(std::ostream& os) { } void BaseMutex::CheckSafeToWait(Thread* self) { - if (self == nullptr) { - CheckUnattachedThread(level_); + if (!kDebugLocking) { return; } - if (kDebugLocking) { + if (self == nullptr) { + CheckUnattachedThread(level_); + } else { CHECK(self->GetHeldMutex(level_) == this || level_ == kMonitorLock) << "Waiting on unacquired mutex: " << name_; bool bad_mutexes_held = false; @@ -570,6 +571,7 @@ bool Mutex::IsDumpFrequent(Thread* thread, uint64_t try_times) { } } +template <bool kCheck> bool Mutex::ExclusiveTryLock(Thread* self) { DCHECK(self == nullptr || self == Thread::Current()); if (kDebugLocking && !recursive_) { @@ -600,7 +602,7 @@ bool Mutex::ExclusiveTryLock(Thread* self) { #endif DCHECK_EQ(GetExclusiveOwnerTid(), 0); exclusive_owner_.store(SafeGetTid(self), std::memory_order_relaxed); - RegisterAsLocked(self); + RegisterAsLocked(self, kCheck); } recursion_count_++; if (kDebugLocking) { @@ -611,6 +613,9 @@ bool Mutex::ExclusiveTryLock(Thread* self) { return true; } +template bool Mutex::ExclusiveTryLock<false>(Thread* self); +template bool Mutex::ExclusiveTryLock<true>(Thread* self); + bool Mutex::ExclusiveTryLockWithSpinning(Thread* self) { // Spin a small number of times, since this affects our ability to respond to suspension // requests. 
We spin repeatedly only if the mutex repeatedly becomes available and unavailable @@ -923,7 +928,7 @@ void ReaderWriterMutex::HandleSharedLockContention(Thread* self, int32_t cur_sta } #endif -bool ReaderWriterMutex::SharedTryLock(Thread* self) { +bool ReaderWriterMutex::SharedTryLock(Thread* self, bool check) { DCHECK(self == nullptr || self == Thread::Current()); #if ART_USE_FUTEXES bool done = false; @@ -947,7 +952,7 @@ bool ReaderWriterMutex::SharedTryLock(Thread* self) { PLOG(FATAL) << "pthread_mutex_trylock failed for " << name_; } #endif - RegisterAsLocked(self); + RegisterAsLocked(self, check); AssertSharedHeld(self); return true; } diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h index 87e9525557..925f7a2f45 100644 --- a/runtime/base/mutex.h +++ b/runtime/base/mutex.h @@ -103,10 +103,11 @@ class BaseMutex { BaseMutex(const char* name, LockLevel level); virtual ~BaseMutex(); - // Add this mutex to those owned by self, and perform appropriate checking. - // For this call only, self may also be another suspended thread. - void RegisterAsLocked(Thread* self); - void RegisterAsLockedImpl(Thread* self, LockLevel level); + // Add this mutex to those owned by self, and optionally perform lock order checking. Caller + // may wish to disable checking for trylock calls that cannot result in deadlock. For this call + // only, self may also be another suspended thread. + void RegisterAsLocked(Thread* self, bool check = kDebugLocking); + void RegisterAsLockedImpl(Thread* self, LockLevel level, bool check); void RegisterAsUnlocked(Thread* self); void RegisterAsUnlockedImpl(Thread* self, LockLevel level); @@ -183,7 +184,10 @@ class LOCKABLE Mutex : public BaseMutex { void ExclusiveLock(Thread* self) ACQUIRE(); void Lock(Thread* self) ACQUIRE() { ExclusiveLock(self); } - // Returns true if acquires exclusive access, false otherwise. + // Returns true if acquires exclusive access, false otherwise. The `check` argument specifies + // whether lock level checking should be performed. Should be defaulted unless we are using + // TryLock instead of Lock for deadlock avoidance. + template <bool kCheck = kDebugLocking> bool ExclusiveTryLock(Thread* self) TRY_ACQUIRE(true); bool TryLock(Thread* self) TRY_ACQUIRE(true) { return ExclusiveTryLock(self); } // Equivalent to ExclusiveTryLock, but retry for a short period before giving up. @@ -342,7 +346,7 @@ class SHARED_LOCKABLE ReaderWriterMutex : public BaseMutex { void ReaderLock(Thread* self) ACQUIRE_SHARED() { SharedLock(self); } // Try to acquire share of ReaderWriterMutex. - bool SharedTryLock(Thread* self) SHARED_TRYLOCK_FUNCTION(true); + bool SharedTryLock(Thread* self, bool check = kDebugLocking) SHARED_TRYLOCK_FUNCTION(true); // Release a share of the access. void SharedUnlock(Thread* self) RELEASE_SHARED() ALWAYS_INLINE; diff --git a/runtime/cha.cc b/runtime/cha.cc index 8b77f76146..0c548d384a 100644 --- a/runtime/cha.cc +++ b/runtime/cha.cc @@ -237,11 +237,10 @@ class CHACheckpoint final : public Closure { : barrier_(0), method_headers_(method_headers) {} - void Run(Thread* thread) override { + void Run(Thread* thread) override REQUIRES_SHARED(Locks::mutator_lock_) { // Note thread and self may not be equal if thread was already suspended at // the point of the request. 
Thread* self = Thread::Current(); - ScopedObjectAccess soa(self); CHAStackVisitor visitor(thread, nullptr, method_headers_); visitor.WalkStack(); barrier_.Pass(self); diff --git a/runtime/debugger.cc b/runtime/debugger.cc index a7b818ed99..3c8e6e612d 100644 --- a/runtime/debugger.cc +++ b/runtime/debugger.cc @@ -340,7 +340,11 @@ void Dbg::DdmSetThreadNotification(bool enable) { Dbg::DdmSendThreadNotification(thread, CHUNK_TYPE("THCR")); finish_barrier.Pass(cls_self); }); - size_t checkpoints = Runtime::Current()->GetThreadList()->RunCheckpoint(&fc); + // TODO(b/253671779): The above eventually results in calls to EventHandler::DispatchEvent, + // which does a ScopedThreadStateChange, which amounts to a thread state change inside the + // checkpoint run method. Hence the normal check would fail, and thus we specify Unchecked + // here. + size_t checkpoints = Runtime::Current()->GetThreadList()->RunCheckpointUnchecked(&fc); ScopedThreadSuspension sts(self, ThreadState::kWaitingForCheckPointsToRun); finish_barrier.Increment(self, checkpoints); } diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc index 9b4af663a0..a2d5a436d0 100644 --- a/runtime/entrypoints_order_test.cc +++ b/runtime/entrypoints_order_test.cc @@ -70,7 +70,6 @@ class EntrypointsOrderTest : public CommonArtTest { EXPECT_OFFSET_DIFFP(Thread, tls32_, daemon, throwing_OutOfMemoryError, 4); EXPECT_OFFSET_DIFFP(Thread, tls32_, throwing_OutOfMemoryError, no_thread_suspension, 4); EXPECT_OFFSET_DIFFP(Thread, tls32_, no_thread_suspension, thread_exit_check_count, 4); - EXPECT_OFFSET_DIFFP(Thread, tls32_, thread_exit_check_count, is_transitioning_to_runnable, 4); // TODO: Better connection. Take alignment into account. EXPECT_OFFSET_DIFF_GT3(Thread, tls32_.thread_exit_check_count, tls64_.trace_clock_base, 4, @@ -108,13 +107,14 @@ class EntrypointsOrderTest : public CommonArtTest { EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, name, pthread_self, sizeof(void*)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, pthread_self, last_no_thread_suspension_cause, sizeof(void*)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, last_no_thread_suspension_cause, active_suspend_barriers, - sizeof(void*)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, active_suspend_barriers, thread_local_start, - sizeof(Thread::tls_ptr_sized_values::active_suspend_barriers)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_start, thread_local_pos, sizeof(void*)); + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, last_no_thread_suspension_cause, + active_suspendall_barrier, sizeof(void*)); + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, active_suspendall_barrier, + active_suspend1_barriers, sizeof(void*)); + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, active_suspend1_barriers, thread_local_pos, sizeof(void*)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_pos, thread_local_end, sizeof(void*)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_end, thread_local_limit, sizeof(void*)); + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_end, thread_local_start, sizeof(void*)); + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_start, thread_local_limit, sizeof(void*)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_limit, thread_local_objects, sizeof(void*)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_objects, checkpoint_function, sizeof(size_t)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, checkpoint_function, jni_entrypoints, @@ -137,11 +137,12 @@ class EntrypointsOrderTest : public CommonArtTest { Thread, tlsPtr_, top_reflective_handle_scope, method_trace_buffer, sizeof(void*)); 
EXPECT_OFFSET_DIFFP( Thread, tlsPtr_, method_trace_buffer, method_trace_buffer_index, sizeof(void*)); + EXPECT_OFFSET_DIFFP( + Thread, tlsPtr_, method_trace_buffer_index, thread_exit_flags, sizeof(void*)); // The first field after tlsPtr_ is forced to a 16 byte alignment so it might have some space. auto offset_tlsptr_end = OFFSETOF_MEMBER(Thread, tlsPtr_) + sizeof(decltype(reinterpret_cast<Thread*>(16)->tlsPtr_)); - CHECKED(offset_tlsptr_end - OFFSETOF_MEMBER(Thread, tlsPtr_.method_trace_buffer_index) == - sizeof(void*), + CHECKED(offset_tlsptr_end - OFFSETOF_MEMBER(Thread, tlsPtr_.thread_exit_flags) == sizeof(void*), "async_exception last field"); } diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc index 787e997ba0..0cb2609740 100644 --- a/runtime/gc/collector/concurrent_copying.cc +++ b/runtime/gc/collector/concurrent_copying.cc @@ -480,7 +480,8 @@ class ConcurrentCopying::ThreadFlipVisitor : public Closure, public RootVisitor } void Run(Thread* thread) override REQUIRES_SHARED(Locks::mutator_lock_) { - // Note: self is not necessarily equal to thread since thread may be suspended. + // We are either running this in the target thread, or the target thread will wait for us + // before switching back to runnable. Thread* self = Thread::Current(); CHECK(thread == self || thread->GetState() != ThreadState::kRunnable) << thread->GetState() << " thread " << thread << " self " << self; @@ -495,7 +496,6 @@ class ConcurrentCopying::ThreadFlipVisitor : public Closure, public RootVisitor // We can use the non-CAS VisitRoots functions below because we update thread-local GC roots // only. thread->VisitRoots(this, kVisitRootFlagAllRoots); - concurrent_copying_->GetBarrier().Pass(self); } void VisitRoots(mirror::Object*** roots, @@ -764,17 +764,12 @@ void ConcurrentCopying::FlipThreadRoots() { } Thread* self = Thread::Current(); Locks::mutator_lock_->AssertNotHeld(self); - gc_barrier_->Init(self, 0); ThreadFlipVisitor thread_flip_visitor(this, heap_->use_tlab_); FlipCallback flip_callback(this); - size_t barrier_count = Runtime::Current()->GetThreadList()->FlipThreadRoots( + Runtime::Current()->GetThreadList()->FlipThreadRoots( &thread_flip_visitor, &flip_callback, this, GetHeap()->GetGcPauseListener()); - { - ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun); - gc_barrier_->Increment(self, barrier_count); - } is_asserting_to_space_invariant_ = true; QuasiAtomic::ThreadFenceForConstructor(); if (kVerboseMode) { diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc index 23ddd4fc3a..6c6c9e2e18 100644 --- a/runtime/gc/collector/mark_compact.cc +++ b/runtime/gc/collector/mark_compact.cc @@ -688,7 +688,6 @@ class MarkCompact::ThreadFlipVisitor : public Closure { CHECK(collector_->compacting_); thread->SweepInterpreterCache(collector_); thread->AdjustTlab(collector_->black_objs_slide_diff_); - collector_->GetBarrier().Pass(self); } private: @@ -736,15 +735,10 @@ void MarkCompact::RunPhases() { { // Compaction pause - gc_barrier_.Init(self, 0); ThreadFlipVisitor visitor(this); FlipCallback callback(this); - size_t barrier_count = runtime->GetThreadList()->FlipThreadRoots( + runtime->GetThreadList()->FlipThreadRoots( &visitor, &callback, this, GetHeap()->GetGcPauseListener()); - { - ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun); - gc_barrier_.Increment(self, barrier_count); - } } if (IsValidFd(uffd_)) { diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index 
2ce7666be0..e49d25b479 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -2723,113 +2723,122 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, } ScopedThreadStateChange tsc(self, ThreadState::kWaitingPerformingGc); Locks::mutator_lock_->AssertNotHeld(self); - if (self->IsHandlingStackOverflow()) { - // If we are throwing a stack overflow error we probably don't have enough remaining stack - // space to run the GC. - // Count this as a GC in case someone is waiting for it to complete. - gcs_completed_.fetch_add(1, std::memory_order_release); - return collector::kGcTypeNone; - } - bool compacting_gc; + SelfDeletingTask* clear; // Unconditionally set below. { - gc_complete_lock_->AssertNotHeld(self); - ScopedThreadStateChange tsc2(self, ThreadState::kWaitingForGcToComplete); - MutexLock mu(self, *gc_complete_lock_); - // Ensure there is only one GC at a time. - WaitForGcToCompleteLocked(gc_cause, self); - if (requested_gc_num != GC_NUM_ANY && !GCNumberLt(GetCurrentGcNum(), requested_gc_num)) { - // The appropriate GC was already triggered elsewhere. - return collector::kGcTypeNone; - } - compacting_gc = IsMovingGc(collector_type_); - // GC can be disabled if someone has a used GetPrimitiveArrayCritical. - if (compacting_gc && disable_moving_gc_count_ != 0) { - LOG(WARNING) << "Skipping GC due to disable moving GC count " << disable_moving_gc_count_; - // Again count this as a GC. + // We should not ever become runnable and re-suspend while executing a GC. + // This would likely cause a deadlock if we acted on a suspension request. + // TODO: We really want to assert that we don't transition to kRunnable. + ScopedAssertNoThreadSuspension("Performing GC"); + if (self->IsHandlingStackOverflow()) { + // If we are throwing a stack overflow error we probably don't have enough remaining stack + // space to run the GC. + // Count this as a GC in case someone is waiting for it to complete. gcs_completed_.fetch_add(1, std::memory_order_release); return collector::kGcTypeNone; } - if (gc_disabled_for_shutdown_) { - gcs_completed_.fetch_add(1, std::memory_order_release); - return collector::kGcTypeNone; - } - collector_type_running_ = collector_type_; - last_gc_cause_ = gc_cause; - } - if (gc_cause == kGcCauseForAlloc && runtime->HasStatsEnabled()) { - ++runtime->GetStats()->gc_for_alloc_count; - ++self->GetStats()->gc_for_alloc_count; - } - const size_t bytes_allocated_before_gc = GetBytesAllocated(); - - DCHECK_LT(gc_type, collector::kGcTypeMax); - DCHECK_NE(gc_type, collector::kGcTypeNone); - - collector::GarbageCollector* collector = nullptr; - // TODO: Clean this up. - if (compacting_gc) { - DCHECK(current_allocator_ == kAllocatorTypeBumpPointer || - current_allocator_ == kAllocatorTypeTLAB || - current_allocator_ == kAllocatorTypeRegion || - current_allocator_ == kAllocatorTypeRegionTLAB); - switch (collector_type_) { - case kCollectorTypeSS: - semi_space_collector_->SetFromSpace(bump_pointer_space_); - semi_space_collector_->SetToSpace(temp_space_); - semi_space_collector_->SetSwapSemiSpaces(true); - collector = semi_space_collector_; - break; - case kCollectorTypeCMC: - collector = mark_compact_; - break; - case kCollectorTypeCC: - collector::ConcurrentCopying* active_cc_collector; - if (use_generational_cc_) { - // TODO: Other threads must do the flip checkpoint before they start poking at - // active_concurrent_copying_collector_. So we should not concurrency here. - active_cc_collector = (gc_type == collector::kGcTypeSticky) ? 
- young_concurrent_copying_collector_ : concurrent_copying_collector_; - active_concurrent_copying_collector_.store(active_cc_collector, - std::memory_order_relaxed); - DCHECK(active_cc_collector->RegionSpace() == region_space_); - collector = active_cc_collector; - } else { - collector = active_concurrent_copying_collector_.load(std::memory_order_relaxed); + bool compacting_gc; + { + gc_complete_lock_->AssertNotHeld(self); + ScopedThreadStateChange tsc2(self, ThreadState::kWaitingForGcToComplete); + MutexLock mu(self, *gc_complete_lock_); + // Ensure there is only one GC at a time. + WaitForGcToCompleteLocked(gc_cause, self); + if (requested_gc_num != GC_NUM_ANY && !GCNumberLt(GetCurrentGcNum(), requested_gc_num)) { + // The appropriate GC was already triggered elsewhere. + return collector::kGcTypeNone; + } + compacting_gc = IsMovingGc(collector_type_); + // GC can be disabled if someone has a used GetPrimitiveArrayCritical. + if (compacting_gc && disable_moving_gc_count_ != 0) { + LOG(WARNING) << "Skipping GC due to disable moving GC count " << disable_moving_gc_count_; + // Again count this as a GC. + gcs_completed_.fetch_add(1, std::memory_order_release); + return collector::kGcTypeNone; + } + if (gc_disabled_for_shutdown_) { + gcs_completed_.fetch_add(1, std::memory_order_release); + return collector::kGcTypeNone; + } + collector_type_running_ = collector_type_; + last_gc_cause_ = gc_cause; + } + if (gc_cause == kGcCauseForAlloc && runtime->HasStatsEnabled()) { + ++runtime->GetStats()->gc_for_alloc_count; + ++self->GetStats()->gc_for_alloc_count; + } + const size_t bytes_allocated_before_gc = GetBytesAllocated(); + + DCHECK_LT(gc_type, collector::kGcTypeMax); + DCHECK_NE(gc_type, collector::kGcTypeNone); + + collector::GarbageCollector* collector = nullptr; + // TODO: Clean this up. + if (compacting_gc) { + DCHECK(current_allocator_ == kAllocatorTypeBumpPointer || + current_allocator_ == kAllocatorTypeTLAB || + current_allocator_ == kAllocatorTypeRegion || + current_allocator_ == kAllocatorTypeRegionTLAB); + switch (collector_type_) { + case kCollectorTypeSS: + semi_space_collector_->SetFromSpace(bump_pointer_space_); + semi_space_collector_->SetToSpace(temp_space_); + semi_space_collector_->SetSwapSemiSpaces(true); + collector = semi_space_collector_; + break; + case kCollectorTypeCMC: + collector = mark_compact_; + break; + case kCollectorTypeCC: + collector::ConcurrentCopying* active_cc_collector; + if (use_generational_cc_) { + // TODO: Other threads must do the flip checkpoint before they start poking at + // active_concurrent_copying_collector_. So we should not concurrency here. + active_cc_collector = (gc_type == collector::kGcTypeSticky) ? + young_concurrent_copying_collector_ : + concurrent_copying_collector_; + active_concurrent_copying_collector_.store(active_cc_collector, + std::memory_order_relaxed); + DCHECK(active_cc_collector->RegionSpace() == region_space_); + collector = active_cc_collector; + } else { + collector = active_concurrent_copying_collector_.load(std::memory_order_relaxed); + } + break; + default: + LOG(FATAL) << "Invalid collector type " << static_cast<size_t>(collector_type_); + } + // temp_space_ will be null for kCollectorTypeCMC. + if (temp_space_ != nullptr && + collector != active_concurrent_copying_collector_.load(std::memory_order_relaxed)) { + temp_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE); + if (kIsDebugBuild) { + // Try to read each page of the memory map in case mprotect didn't work properly + // b/19894268. 
+ temp_space_->GetMemMap()->TryReadable(); } - break; - default: - LOG(FATAL) << "Invalid collector type " << static_cast<size_t>(collector_type_); - } - // temp_space_ will be null for kCollectorTypeCMC. - if (temp_space_ != nullptr - && collector != active_concurrent_copying_collector_.load(std::memory_order_relaxed)) { - temp_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE); - if (kIsDebugBuild) { - // Try to read each page of the memory map in case mprotect didn't work properly b/19894268. - temp_space_->GetMemMap()->TryReadable(); + CHECK(temp_space_->IsEmpty()); } - CHECK(temp_space_->IsEmpty()); + } else if (current_allocator_ == kAllocatorTypeRosAlloc || + current_allocator_ == kAllocatorTypeDlMalloc) { + collector = FindCollectorByGcType(gc_type); + } else { + LOG(FATAL) << "Invalid current allocator " << current_allocator_; } - } else if (current_allocator_ == kAllocatorTypeRosAlloc || - current_allocator_ == kAllocatorTypeDlMalloc) { - collector = FindCollectorByGcType(gc_type); - } else { - LOG(FATAL) << "Invalid current allocator " << current_allocator_; - } - CHECK(collector != nullptr) - << "Could not find garbage collector with collector_type=" - << static_cast<size_t>(collector_type_) << " and gc_type=" << gc_type; - collector->Run(gc_cause, clear_soft_references || runtime->IsZygote()); - IncrementFreedEver(); - RequestTrim(self); - // Collect cleared references. - SelfDeletingTask* clear = reference_processor_->CollectClearedReferences(self); - // Grow the heap so that we know when to perform the next GC. - GrowForUtilization(collector, bytes_allocated_before_gc); - old_native_bytes_allocated_.store(GetNativeBytes()); - LogGC(gc_cause, collector); - FinishGC(self, gc_type); + CHECK(collector != nullptr) << "Could not find garbage collector with collector_type=" + << static_cast<size_t>(collector_type_) + << " and gc_type=" << gc_type; + collector->Run(gc_cause, clear_soft_references || runtime->IsZygote()); + IncrementFreedEver(); + RequestTrim(self); + // Collect cleared references. + clear = reference_processor_->CollectClearedReferences(self); + // Grow the heap so that we know when to perform the next GC. + GrowForUtilization(collector, bytes_allocated_before_gc); + old_native_bytes_allocated_.store(GetNativeBytes()); + LogGC(gc_cause, collector); + FinishGC(self, gc_type); + } // Actually enqueue all cleared references. Do this after the GC has officially finished since // otherwise we can deadlock. clear->Run(self); diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc index 940b82dc0a..995f06e14e 100644 --- a/runtime/mirror/object.cc +++ b/runtime/mirror/object.cc @@ -185,7 +185,8 @@ void Object::SetHashCodeSeed(uint32_t new_seed) { hash_code_seed.store(new_seed, std::memory_order_relaxed); } -int32_t Object::IdentityHashCode() { +template <bool kAllowInflation> +int32_t Object::IdentityHashCodeHelper() { ObjPtr<Object> current_this = this; // The this pointer may get invalidated by thread suspension. while (true) { LockWord lw = current_this->GetLockWord(false); @@ -203,6 +204,9 @@ int32_t Object::IdentityHashCode() { break; } case LockWord::kThinLocked: { + if (!kAllowInflation) { + return 0; + } // Inflate the thin lock to a monitor and stick the hash code inside of the monitor. May // fail spuriously. 
Thread* self = Thread::Current(); @@ -230,6 +234,12 @@ int32_t Object::IdentityHashCode() { } } +int32_t Object::IdentityHashCode() { return IdentityHashCodeHelper</* kAllowInflation= */ true>(); } + +int32_t Object::IdentityHashCodeNoInflation() { + return IdentityHashCodeHelper</* kAllowInflation= */ false>(); +} + void Object::CheckFieldAssignmentImpl(MemberOffset field_offset, ObjPtr<Object> new_value) { ObjPtr<Class> c = GetClass(); Runtime* runtime = Runtime::Current(); diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h index 95b9f86a4b..5e770f45a5 100644 --- a/runtime/mirror/object.h +++ b/runtime/mirror/object.h @@ -137,11 +137,16 @@ class MANAGED LOCKABLE Object { REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_); + // Returns a nonzero value that fits into lockword slot. int32_t IdentityHashCode() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); + // Identical to the above, but returns 0 if monitor inflation would otherwise be needed. + int32_t IdentityHashCodeNoInflation() REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); + static constexpr MemberOffset MonitorOffset() { return OFFSET_OF_OBJECT_MEMBER(Object, monitor_); } @@ -726,6 +731,10 @@ class MANAGED LOCKABLE Object { REQUIRES_SHARED(Locks::mutator_lock_); private: + template <bool kAllowInflation> + int32_t IdentityHashCodeHelper() REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); + // Get a field with acquire semantics. template<typename kSize> ALWAYS_INLINE kSize GetFieldAcquire(MemberOffset field_offset) diff --git a/runtime/monitor.cc b/runtime/monitor.cc index 3fed8d43b7..4e9e4d946e 100644 --- a/runtime/monitor.cc +++ b/runtime/monitor.cc @@ -1005,7 +1005,7 @@ bool Monitor::Deflate(Thread* self, ObjPtr<mirror::Object> obj) { if (monitor->num_waiters_.load(std::memory_order_relaxed) > 0) { return false; } - if (!monitor->monitor_lock_.ExclusiveTryLock(self)) { + if (!monitor->monitor_lock_.ExclusiveTryLock</* check= */ false>(self)) { // We cannot deflate a monitor that's currently held. It's unclear whether we should if // we could. return false; @@ -1065,13 +1065,10 @@ void Monitor::InflateThinLocked(Thread* self, Handle<mirror::Object> obj, LockWo ThreadList* thread_list = Runtime::Current()->GetThreadList(); // Suspend the owner, inflate. First change to blocked and give up mutator_lock_. self->SetMonitorEnterObject(obj.Get()); - bool timed_out; Thread* owner; { ScopedThreadSuspension sts(self, ThreadState::kWaitingForLockInflation); - owner = thread_list->SuspendThreadByThreadId(owner_thread_id, - SuspendReason::kInternal, - &timed_out); + owner = thread_list->SuspendThreadByThreadId(owner_thread_id, SuspendReason::kInternal); } if (owner != nullptr) { // We succeeded in suspending the thread, check the lock's status didn't change. diff --git a/runtime/mutator_gc_coord.md b/runtime/mutator_gc_coord.md index aba8421a3e..836213d187 100644 --- a/runtime/mutator_gc_coord.md +++ b/runtime/mutator_gc_coord.md @@ -100,7 +100,11 @@ thread suspension mechanism, and then make them runnable again on release. Logically the mutator lock is held in shared/reader mode if ***either*** the underlying reader-writer lock is held in shared mode, ***or*** if a mutator is in -runnable state. +runnable state. 
These two ways of holding the mutator mutex are ***not*** +equivalent: In particular, we rely on the garbage collector never actually +entering a "runnable" state while active (see below). However, it often runs with +the explicit mutator mutex in shared mode, thus blocking others from acquiring it +in exclusive mode. Suspension and checkpoint API ----------------------------- @@ -217,6 +221,86 @@ processing, after acquiring `reference_processor_lock_`. This means that empty checkpoints do not preclude client threads from being in the middle of an operation that involves a weak reference access, while nonempty checkpoints do. +**Suspending the GC** +Under unusual conditions, the GC can run on any thread. This means that when +thread *A* suspends thread *B* for some other reason, Thread *B* might be +running the garbage collector and conceivably thus cause it to block. This +would be very deadlock prone. If Thread *A* allocates while Thread *B* is +suspended in the GC, and the allocation requires the GC's help to complete, we +deadlock. + +Thus we ensure that the GC, together with anything else that can block GCs, +cannot be blocked for thread suspension requests. This is accomplished by +ensuring that it always appears to be in a suspended thread state. Since we +only check for suspend requests when entering the runnable state, suspend +requests go unnoticed until the GC completes. It may physically acquire and +release the actual `mutator_lock_` in either shared or exclusive mode. + +Thread Suspension Mechanics +--------------------------- + +Thread suspension is initiated by a registered thread, except that, for testing +purposes, `SuspendAll` may be invoked with `self == nullptr`. We never suspend +the initiating thread, explicitly exclusing it from `SuspendAll()`, and failing +`SuspendThreadBy...()` requests to that effect. + +The suspend calls invoke `IncrementSuspendCount()` to increment the thread +suspend count for each thread. That adds a "suspend barrier" (atomic counter) to +the per-thread list of such counters to decrement. It normally sets the +`kSuspendRequest` ("should enter safepoint handler") and `kActiveSuspendBarrier` +("need to notify us when suspended") flags. + +After setting these two flags, we check whether the thread is suspended and +`kSuspendRequest` is still set. Since the thread is already suspended, it cannot +be expected to respond to "pass the suspend barrier" (decrement the atomic +counter) in a timely fashion. Hence we do so on its behalf. This decrements +the "barrier" and removes it from the thread's list of barriers to decrement, +and clears `kActiveSuspendBarrier`. `kSuspendRequest` remains to ensure the +thread doesn't prematurely return to runnable state. + +If `SuspendAllInternal()` does not immediately see a suspended state, then it is up +to the target thread to decrement the suspend barrier. +`TransitionFromRunnableToSuspended()` calls +`TransitionToSuspendedAndRunCheckpoints()`, which changes the thread state +and returns. `TransitionFromRunnableToSuspended()` then calls +`CheckActiveSuspendBarriers()` to check for the `kActiveSuspendBarrier` flag +and decrement the suspend barrier if set. + +The `suspend_count_lock_` is not consistently held in the target thread +during this process. 
Thus correctness in resolving the race between a +suspension-requesting thread and a target thread voluntarily suspending relies +on first requesting suspension, and then checking whether the target is +already suspended, The detailed correctness argument is given in a comment +inside `SuspendAllInternal()`. This also ensures that the barrier cannot be +decremented after the stack frame holding the barrier goes away. + +This relies on the fact that the two stores in the two threads to the state and +kActiveSuspendBarrier flag are ordered with respect to the later loads. That's +guaranteed, since they are all stored in a single `atomic<>`. Thus even relaxed +accesses are OK. + +The actual suspend barrier representation still varies between `SuspendAll()` +and `SuspendThreadBy...()`. The former relies on the fact that only one such +barrier can be in use at a time, while the latter maintains a linked list of +active suspend barriers for each target thread, relying on the fact that each +one can appear on the list of only one thread, and we can thus use list nodes +allocated in the stack frames of requesting threads. + +**Avoiding suspension cycles** + +Any thread can issue a `SuspendThreadByPeer()` or `SuspendAll()` request. But if +Thread A increments Thread B's suspend count while Thread B increments Thread +A's suspend count, and they then both suspend during a subsequent thread +transition, we're deadlocked. + +For `SuspendAll()`, we enforce a requirement that at most one `SuspendAll()` +request is running at one time. In addition, in all cases, we refuse to initiate +a suspend request from a registered thread that is also being asked to suspend +(i.e. the suspend count is nonzero). Instead the requestor waits for that +condition to change. This means that we cannot create a cycle in which each +thread has asked to suspend the next one, and thus no thread can progress. The +required atomicity of the requestor suspend count check with setting the suspend +count of the target(s) target is ensured by holding `suspend_count_lock_`. [^1]: Some comments in the code refer to a not-yet-really-implemented scheme in which the compiler-generated code would load through the address at diff --git a/runtime/native/dalvik_system_VMStack.cc b/runtime/native/dalvik_system_VMStack.cc index 71078c9ad2..bf441b4b1d 100644 --- a/runtime/native/dalvik_system_VMStack.cc +++ b/runtime/native/dalvik_system_VMStack.cc @@ -48,19 +48,29 @@ static ResultT GetThreadStack(const ScopedFastNativeObjectAccess& soa, } else { // Never allow suspending the heap task thread since it may deadlock if allocations are // required for the stack trace. - Thread* heap_task_thread = - Runtime::Current()->GetHeap()->GetTaskProcessor()->GetRunningThread(); - // heap_task_thread could be null if the daemons aren't yet started. - if (heap_task_thread != nullptr && decoded_peer == heap_task_thread->GetPeerFromOtherThread()) { + Thread* heap_task_thread = nullptr; + for (int i = 0; i < 20; ++i) { + heap_task_thread = Runtime::Current()->GetHeap()->GetTaskProcessor()->GetRunningThread(); + if (heap_task_thread != nullptr) { + break; + } + // heap_task_thread could be null if the daemons aren't fully running yet. But it can appear + // in enumerations, and thus we could try to get its stack even before that. Try to wait + // for a non-null value so we can avoid suspending it. 
+ static constexpr int kHeapTaskDaemonSleepMicros = 5000; + usleep(kHeapTaskDaemonSleepMicros); + } + if (heap_task_thread == nullptr) { + LOG(ERROR) << "No HeapTaskDaemon; refusing to get thread stack."; + return nullptr; + } + if (decoded_peer == heap_task_thread->GetPeerFromOtherThread()) { return nullptr; } // Suspend thread to build stack trace. ScopedThreadSuspension sts(soa.Self(), ThreadState::kNative); ThreadList* thread_list = Runtime::Current()->GetThreadList(); - bool timed_out; - Thread* thread = thread_list->SuspendThreadByPeer(peer, - SuspendReason::kInternal, - &timed_out); + Thread* thread = thread_list->SuspendThreadByPeer(peer, SuspendReason::kInternal); if (thread != nullptr) { // Must be runnable to create returned array. { @@ -70,9 +80,6 @@ static ResultT GetThreadStack(const ScopedFastNativeObjectAccess& soa, // Restart suspended thread. bool resumed = thread_list->Resume(thread, SuspendReason::kInternal); DCHECK(resumed); - } else if (timed_out) { - LOG(ERROR) << "Trying to get thread's stack failed as the thread failed to suspend within a " - "generous timeout."; } } return trace; diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc index 570c554782..fd67a0a7fa 100644 --- a/runtime/native/java_lang_Thread.cc +++ b/runtime/native/java_lang_Thread.cc @@ -147,11 +147,8 @@ static void Thread_setNativeName(JNIEnv* env, jobject peer, jstring java_name) { // thread list lock to avoid this, as setting the thread name causes mutator to lock/unlock // in the DDMS send code. ThreadList* thread_list = Runtime::Current()->GetThreadList(); - bool timed_out; // Take suspend thread lock to avoid races with threads trying to suspend this one. - Thread* thread = thread_list->SuspendThreadByPeer(peer, - SuspendReason::kInternal, - &timed_out); + Thread* thread = thread_list->SuspendThreadByPeer(peer, SuspendReason::kInternal); if (thread != nullptr) { { ScopedObjectAccess soa(env); @@ -159,9 +156,6 @@ static void Thread_setNativeName(JNIEnv* env, jobject peer, jstring java_name) { } bool resumed = thread_list->Resume(thread, SuspendReason::kInternal); DCHECK(resumed); - } else if (timed_out) { - LOG(ERROR) << "Trying to set thread name to '" << name.c_str() << "' failed as the thread " - "failed to suspend within a generous timeout."; } } diff --git a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc index 081ec2043a..f20cd281e8 100644 --- a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc +++ b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc @@ -59,7 +59,6 @@ static jobjectArray DdmVmInternal_getStackTraceById(JNIEnv* env, jclass, jint th trace = Thread::InternalStackTraceToStackTraceElementArray(soa, internal_trace); } else { ThreadList* thread_list = Runtime::Current()->GetThreadList(); - bool timed_out; // Check for valid thread if (thin_lock_id == ThreadList::kInvalidThreadId) { @@ -67,9 +66,7 @@ static jobjectArray DdmVmInternal_getStackTraceById(JNIEnv* env, jclass, jint th } // Suspend thread to build stack trace. - Thread* thread = thread_list->SuspendThreadByThreadId(thin_lock_id, - SuspendReason::kInternal, - &timed_out); + Thread* thread = thread_list->SuspendThreadByThreadId(thin_lock_id, SuspendReason::kInternal); if (thread != nullptr) { { ScopedObjectAccess soa(env); @@ -79,11 +76,6 @@ static jobjectArray DdmVmInternal_getStackTraceById(JNIEnv* env, jclass, jint th // Restart suspended thread. 
bool resumed = thread_list->Resume(thread, SuspendReason::kInternal); DCHECK(resumed); - } else { - if (timed_out) { - LOG(ERROR) << "Trying to get thread's stack by id failed as the thread failed to suspend " - "within a generous timeout."; - } } } return trace; diff --git a/runtime/oat.h b/runtime/oat.h index d041726bfb..b0c8661243 100644 --- a/runtime/oat.h +++ b/runtime/oat.h @@ -44,8 +44,8 @@ std::ostream& operator<<(std::ostream& stream, StubType stub_type); class PACKED(4) OatHeader { public: static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } }; - // Last oat version changed reason: Add DEX magic to oat files. - static constexpr std::array<uint8_t, 4> kOatVersion{{'2', '3', '3', '\0'}}; + // Last oat version changed reason: Change suspend barrier data structure. + static constexpr std::array<uint8_t, 4> kOatVersion{{'2', '3', '4', '\0'}}; static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline"; static constexpr const char* kDebuggableKey = "debuggable"; diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h index ce50471815..dc9fa6f0ee 100644 --- a/runtime/thread-inl.h +++ b/runtime/thread-inl.h @@ -17,8 +17,6 @@ #ifndef ART_RUNTIME_THREAD_INL_H_ #define ART_RUNTIME_THREAD_INL_H_ -#include "thread.h" - #include "arch/instruction_set.h" #include "base/aborting.h" #include "base/casts.h" @@ -28,8 +26,8 @@ #include "jni/jni_env_ext.h" #include "managed_stack-inl.h" #include "obj_ptr-inl.h" -#include "suspend_reason.h" #include "thread-current-inl.h" +#include "thread.h" #include "thread_pool.h" namespace art { @@ -249,7 +247,8 @@ inline void Thread::TransitionToSuspendedAndRunCheckpoints(ThreadState new_state } } -inline void Thread::PassActiveSuspendBarriers() { +inline void Thread::CheckActiveSuspendBarriers() { + DCHECK_NE(GetState(), ThreadState::kRunnable); while (true) { StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_relaxed); if (LIKELY(!state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest) && @@ -257,7 +256,7 @@ inline void Thread::PassActiveSuspendBarriers() { !state_and_flags.IsFlagSet(ThreadFlag::kActiveSuspendBarrier))) { break; } else if (state_and_flags.IsFlagSet(ThreadFlag::kActiveSuspendBarrier)) { - PassActiveSuspendBarriers(this); + PassActiveSuspendBarriers(); } else { // Impossible LOG(FATAL) << "Fatal, thread transitioned into suspended without running the checkpoint"; @@ -265,6 +264,20 @@ inline void Thread::PassActiveSuspendBarriers() { } } +inline void Thread::AddSuspend1Barrier(WrappedSuspend1Barrier* suspend1_barrier) { + suspend1_barrier->next_ = tlsPtr_.active_suspend1_barriers; + tlsPtr_.active_suspend1_barriers = suspend1_barrier; +} + +inline void Thread::RemoveFirstSuspend1Barrier() { + tlsPtr_.active_suspend1_barriers = tlsPtr_.active_suspend1_barriers->next_; +} + +inline bool Thread::HasActiveSuspendBarrier() { + return tlsPtr_.active_suspend1_barriers != nullptr || + tlsPtr_.active_suspendall_barrier != nullptr; +} + inline void Thread::TransitionFromRunnableToSuspended(ThreadState new_state) { // Note: JNI stubs inline a fast path of this method that transitions to suspended if // there are no flags set and then clears the `held_mutexes[kMutatorLock]` (this comes @@ -280,7 +293,7 @@ inline void Thread::TransitionFromRunnableToSuspended(ThreadState new_state) { // Mark the release of the share of the mutator lock. 
GetMutatorLock()->TransitionFromRunnableToSuspended(this); // Once suspended - check the active suspend barrier flag - PassActiveSuspendBarriers(); + CheckActiveSuspendBarriers(); } inline ThreadState Thread::TransitionFromSuspendedToRunnable() { @@ -290,6 +303,7 @@ inline ThreadState Thread::TransitionFromSuspendedToRunnable() { // inlined from the `GetMutatorLock()->TransitionFromSuspendedToRunnable(this)` below). // Therefore any code added here (other than debug build assertions) should be gated // on some flag being set, so that the JNI stub can take the slow path to get here. + DCHECK(this == Current()); StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); ThreadState old_state = old_state_and_flags.GetState(); DCHECK_NE(old_state, ThreadState::kRunnable); @@ -311,7 +325,7 @@ inline ThreadState Thread::TransitionFromSuspendedToRunnable() { break; } } else if (old_state_and_flags.IsFlagSet(ThreadFlag::kActiveSuspendBarrier)) { - PassActiveSuspendBarriers(this); + PassActiveSuspendBarriers(); } else if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest) || old_state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest))) { // Checkpoint flags should not be set while in suspended state. @@ -333,7 +347,6 @@ inline ThreadState Thread::TransitionFromSuspendedToRunnable() { thread_to_pass = this; } MutexLock mu(thread_to_pass, *Locks::thread_suspend_count_lock_); - ScopedTransitioningToRunnable scoped_transitioning_to_runnable(this); // Reload state and flags after locking the mutex. old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); DCHECK_EQ(old_state, old_state_and_flags.GetState()); @@ -345,14 +358,15 @@ inline ThreadState Thread::TransitionFromSuspendedToRunnable() { DCHECK_EQ(old_state, old_state_and_flags.GetState()); } DCHECK_EQ(GetSuspendCount(), 0); - } else if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)) || - UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kWaitingForFlipFunction))) { - // It's possible that some thread runs this thread's flip-function in - // Thread::GetPeerFromOtherThread() even though it was runnable. + } else if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction))) { + DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)); + // Do this before transitioning to runnable, both because we shouldn't wait in a runnable + // state, and so that the thread running the flip function can DCHECK we're not runnable. WaitForFlipFunction(this); - } else { - DCHECK(old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)); - if (EnsureFlipFunctionStarted(this, old_state_and_flags)) { + } else if (old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)) { + // Logically acquire mutator lock in shared mode. 
+ DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)); + if (EnsureFlipFunctionStarted(this, this, old_state_and_flags)) { break; } } @@ -360,6 +374,7 @@ inline ThreadState Thread::TransitionFromSuspendedToRunnable() { old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); DCHECK_EQ(old_state, old_state_and_flags.GetState()); } + DCHECK_EQ(this->GetState(), ThreadState::kRunnable); return static_cast<ThreadState>(old_state); } @@ -438,35 +453,70 @@ inline void Thread::PoisonObjectPointersIfDebug() { } } -inline bool Thread::ModifySuspendCount(Thread* self, - int delta, - AtomicInteger* suspend_barrier, - SuspendReason reason) { - if (delta > 0 && - (((gUseUserfaultfd || gUseReadBarrier) && this != self) || suspend_barrier != nullptr)) { - // When delta > 0 (requesting a suspend), ModifySuspendCountInternal() may fail either if - // active_suspend_barriers is full or we are in the middle of a thread flip. Retry in a loop. - while (true) { - if (LIKELY(ModifySuspendCountInternal(self, delta, suspend_barrier, reason))) { - return true; - } else { - // Failure means the list of active_suspend_barriers is full or we are in the middle of a - // thread flip, we should release the thread_suspend_count_lock_ (to avoid deadlock) and - // wait till the target thread has executed or Thread::PassActiveSuspendBarriers() or the - // flip function. Note that we could not simply wait for the thread to change to a suspended - // state, because it might need to run checkpoint function before the state change or - // resumes from the resume_cond_, which also needs thread_suspend_count_lock_. - // - // The list of active_suspend_barriers is very unlikely to be full since more than - // kMaxSuspendBarriers threads need to execute SuspendAllInternal() simultaneously, and - // target thread stays in kRunnable in the mean time. - Locks::thread_suspend_count_lock_->ExclusiveUnlock(self); - NanoSleep(100000); - Locks::thread_suspend_count_lock_->ExclusiveLock(self); - } +inline void Thread::IncrementSuspendCount(Thread* self, + AtomicInteger* suspendall_barrier, + WrappedSuspend1Barrier* suspend1_barrier, + SuspendReason reason) { + if (kIsDebugBuild) { + Locks::thread_suspend_count_lock_->AssertHeld(self); + if (this != self && !IsSuspended()) { + Locks::thread_list_lock_->AssertHeld(self); } - } else { - return ModifySuspendCountInternal(self, delta, suspend_barrier, reason); + } + if (UNLIKELY(reason == SuspendReason::kForUserCode)) { + Locks::user_code_suspension_lock_->AssertHeld(self); + } + + uint32_t flags = enum_cast<uint32_t>(ThreadFlag::kSuspendRequest); + if (suspendall_barrier != nullptr) { + DCHECK(suspend1_barrier == nullptr); + DCHECK(tlsPtr_.active_suspendall_barrier == nullptr); + tlsPtr_.active_suspendall_barrier = suspendall_barrier; + flags |= enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier); + } else if (suspend1_barrier != nullptr) { + AddSuspend1Barrier(suspend1_barrier); + flags |= enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier); + } + + ++tls32_.suspend_count; + if (reason == SuspendReason::kForUserCode) { + ++tls32_.user_code_suspend_count; + } + + // Two bits might be set simultaneously. 
+ tls32_.state_and_flags.fetch_or(flags, std::memory_order_release); + TriggerSuspend(); +} + +inline void Thread::IncrementSuspendCount(Thread* self) { + IncrementSuspendCount(self, nullptr, nullptr, SuspendReason::kInternal); +} + +inline void Thread::DecrementSuspendCount(Thread* self, bool for_user_code) { + if (kIsDebugBuild) { + Locks::thread_suspend_count_lock_->AssertHeld(self); + if (this != self && !IsSuspended()) { + Locks::thread_list_lock_->AssertHeld(self); + } + } + if (UNLIKELY(tls32_.suspend_count <= 0)) { + UnsafeLogFatalForSuspendCount(self, this); + UNREACHABLE(); + } + if (for_user_code) { + Locks::user_code_suspension_lock_->AssertHeld(self); + if (UNLIKELY(tls32_.user_code_suspend_count <= 0)) { + LOG(ERROR) << "user_code_suspend_count incorrect"; + UnsafeLogFatalForSuspendCount(self, this); + UNREACHABLE(); + } + --tls32_.user_code_suspend_count; + } + + --tls32_.suspend_count; + + if (tls32_.suspend_count == 0) { + AtomicClearFlag(ThreadFlag::kSuspendRequest, std::memory_order_release); } } @@ -499,6 +549,34 @@ inline void Thread::ResetDefaultStackEnd() { tlsPtr_.stack_end = tlsPtr_.stack_begin + GetStackOverflowReservedBytes(kRuntimeISA); } +inline void Thread::NotifyOnThreadExit(ThreadExitFlag* tef) { + tef->next_ = tlsPtr_.thread_exit_flags; + if (tef->next_ == nullptr) { + tlsPtr_.thread_exit_flags = tef; + } else { + DCHECK(!tef->next_->HasExited()); + tef->next_->prev_ = tef; + } + tef->prev_ = nullptr; +} + +inline void Thread::UnregisterThreadExitFlag(ThreadExitFlag* tef) { + if (tef->HasExited()) { + // List is no longer used; each client will deallocate its own ThreadExitFlag. + return; + } + // Remove tef from the list. + if (tef->next_ != nullptr) { + tef->next_->prev_ = tef->prev_; + } + if (tef->prev_ == nullptr) { + DCHECK_EQ(tlsPtr_.thread_exit_flags, tef); + tlsPtr_.thread_exit_flags = tef->next_; + } else { + tef->prev_->next_ = tef->next_; + } +} + } // namespace art #endif // ART_RUNTIME_THREAD_INL_H_ diff --git a/runtime/thread.cc b/runtime/thread.cc index 4c9d5efe42..58bde009a8 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -144,9 +144,6 @@ thread_local Thread* Thread::self_tls_ = nullptr; #endif static constexpr bool kVerifyImageObjectsMarked = kIsDebugBuild; -// Amount of time (in microseconds) that we sleep if another thread is running -// flip function of the thread that we are interested in. -static constexpr size_t kSuspendTimeDuringFlip = 5'000; // For implicit overflow checks we reserve an extra piece of memory at the bottom // of the stack (lowest memory). The higher portion of the memory @@ -1454,7 +1451,7 @@ uint64_t Thread::GetCpuMicroTime() const { } // Attempt to rectify locks so that we dump thread list with required locks before exiting. 
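// Illustrative sketch, not ART source: the intrusive doubly-linked list behind
// NotifyOnThreadExit / UnregisterThreadExitFlag / SignalExitFlags nearby. The
// nodes are allocated (usually on the stack) by the waiters, so thread exit
// only flips flags and never allocates or frees memory. Names are stand-ins.
#include <cassert>

namespace sketch {

struct ExitFlag {
  bool exited = false;
  ExitFlag* prev = nullptr;
  ExitFlag* next = nullptr;
};

struct TargetThread {
  ExitFlag* exit_flags = nullptr;  // list head; guarded by a thread-list lock

  void NotifyOnExit(ExitFlag* f) {          // register a waiter
    f->prev = nullptr;
    f->next = exit_flags;
    if (exit_flags != nullptr) {
      exit_flags->prev = f;
    }
    exit_flags = f;
  }

  void Unregister(ExitFlag* f) {            // waiter no longer interested
    if (f->exited) {
      return;  // list already torn down; the waiter still owns the node
    }
    if (f->next != nullptr) { f->next->prev = f->prev; }
    if (f->prev == nullptr) {
      assert(exit_flags == f);
      exit_flags = f->next;
    } else {
      f->prev->next = f->next;
    }
  }

  void SignalExit() {                       // called once when the thread exits
    for (ExitFlag* f = exit_flags; f != nullptr; f = f->next) {
      f->exited = true;
    }
    exit_flags = nullptr;                   // list is never used again
  }
};

}  // namespace sketch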
-static void UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread) NO_THREAD_SAFETY_ANALYSIS { +void Thread::UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread) NO_THREAD_SAFETY_ANALYSIS { LOG(ERROR) << *thread << " suspend count already zero."; Locks::thread_suspend_count_lock_->Unlock(self); if (!Locks::mutator_lock_->IsSharedHeld(self)) { @@ -1472,141 +1469,53 @@ static void UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread) NO_THREA std::ostringstream ss; Runtime::Current()->GetThreadList()->Dump(ss); LOG(FATAL) << ss.str(); + UNREACHABLE(); } -bool Thread::ModifySuspendCountInternal(Thread* self, - int delta, - AtomicInteger* suspend_barrier, - SuspendReason reason) { - if (kIsDebugBuild) { - DCHECK(delta == -1 || delta == +1) - << reason << " " << delta << " " << this; - Locks::thread_suspend_count_lock_->AssertHeld(self); - if (this != self && !IsSuspended()) { - Locks::thread_list_lock_->AssertHeld(self); - } - } - // User code suspensions need to be checked more closely since they originate from code outside of - // the runtime's control. - if (UNLIKELY(reason == SuspendReason::kForUserCode)) { - Locks::user_code_suspension_lock_->AssertHeld(self); - if (UNLIKELY(delta + tls32_.user_code_suspend_count < 0)) { - LOG(ERROR) << "attempting to modify suspend count in an illegal way."; - return false; - } - } - if (UNLIKELY(delta < 0 && tls32_.suspend_count <= 0)) { - UnsafeLogFatalForSuspendCount(self, this); - return false; - } - - if (delta > 0 && this != self && tlsPtr_.flip_function != nullptr) { - // Force retry of a suspend request if it's in the middle of a thread flip to avoid a - // deadlock. b/31683379. - return false; - } - - uint32_t flags = enum_cast<uint32_t>(ThreadFlag::kSuspendRequest); - if (delta > 0 && suspend_barrier != nullptr) { - uint32_t available_barrier = kMaxSuspendBarriers; - for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) { - if (tlsPtr_.active_suspend_barriers[i] == nullptr) { - available_barrier = i; - break; - } - } - if (available_barrier == kMaxSuspendBarriers) { - // No barrier spaces available, we can't add another. - return false; - } - tlsPtr_.active_suspend_barriers[available_barrier] = suspend_barrier; - flags |= enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier); - } - - tls32_.suspend_count += delta; - switch (reason) { - case SuspendReason::kForUserCode: - tls32_.user_code_suspend_count += delta; - break; - case SuspendReason::kInternal: - break; - } - - if (tls32_.suspend_count == 0) { - AtomicClearFlag(ThreadFlag::kSuspendRequest); - } else { - // Two bits might be set simultaneously. - tls32_.state_and_flags.fetch_or(flags, std::memory_order_seq_cst); - TriggerSuspend(); - } - return true; -} - -bool Thread::PassActiveSuspendBarriers(Thread* self) { - // Grab the suspend_count lock and copy the current set of - // barriers. Then clear the list and the flag. The ModifySuspendCount - // function requires the lock so we prevent a race between setting +bool Thread::PassActiveSuspendBarriers() { + DCHECK_EQ(this, Thread::Current()); + DCHECK_NE(GetState(), ThreadState::kRunnable); + // Grab the suspend_count lock and copy the current set of barriers. Then clear the list and the + // flag. The IncrementSuspendCount function requires the lock so we prevent a race between setting // the kActiveSuspendBarrier flag and clearing it. 
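// Illustrative sketch, not ART source: the target-thread side of passing a
// suspend barrier, as in PassActiveSuspendBarriers nearby. ART wakes waiters
// with a futex when the count hits zero; this portable stand-in uses a
// condition variable. Names and orderings are simplified assumptions.
#include <atomic>
#include <condition_variable>
#include <mutex>
#include <vector>

namespace sketch {

struct SuspendBarrier {
  std::atomic<int32_t> pending;   // threads that still have to arrive
  std::mutex m;
  std::condition_variable cv;

  explicit SuspendBarrier(int32_t n) : pending(n) {}

  void Arrive() {
    // Release: the arriving thread's transition to "suspended" happens-before
    // the requester's acquire load that sees the counter reach zero.
    if (pending.fetch_sub(1, std::memory_order_release) == 1) {
      std::lock_guard<std::mutex> lg(m);
      cv.notify_all();            // last arriver wakes the requester
    }
  }

  void WaitUntilPassed() {        // requester side
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [this] { return pending.load(std::memory_order_acquire) == 0; });
  }
};

// A suspending thread snapshots every barrier it owes while holding the
// suspend-count lock, clears its "active barrier" flag, drops the lock, and
// then arrives at each barrier:
inline void PassAll(std::vector<SuspendBarrier*>& owed) {
  for (SuspendBarrier* b : owed) {
    b->Arrive();
  }
  owed.clear();
}

}  // namespace sketch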
- AtomicInteger* pass_barriers[kMaxSuspendBarriers]; + std::vector<AtomicInteger*> pass_barriers{}; { - MutexLock mu(self, *Locks::thread_suspend_count_lock_); + MutexLock mu(this, *Locks::thread_suspend_count_lock_); if (!ReadFlag(ThreadFlag::kActiveSuspendBarrier)) { - // quick exit test: the barriers have already been claimed - this is - // possible as there may be a race to claim and it doesn't matter - // who wins. - // All of the callers of this function (except the SuspendAllInternal) - // will first test the kActiveSuspendBarrier flag without lock. Here - // double-check whether the barrier has been passed with the - // suspend_count lock. + // quick exit test: the barriers have already been claimed - this is possible as there may + // be a race to claim and it doesn't matter who wins. + // All of the callers of this function (except the SuspendAllInternal) will first test the + // kActiveSuspendBarrier flag without lock. Here double-check whether the barrier has been + // passed with the suspend_count lock. return false; } - - for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) { - pass_barriers[i] = tlsPtr_.active_suspend_barriers[i]; - tlsPtr_.active_suspend_barriers[i] = nullptr; + if (tlsPtr_.active_suspendall_barrier != nullptr) { + // We have at most one active active_suspendall_barrier. See thread.h comment. + pass_barriers.push_back(tlsPtr_.active_suspendall_barrier); + tlsPtr_.active_suspendall_barrier = nullptr; } + for (WrappedSuspend1Barrier* w = tlsPtr_.active_suspend1_barriers; w != nullptr; w = w->next_) { + pass_barriers.push_back(&(w->barrier_)); + } + tlsPtr_.active_suspend1_barriers = nullptr; AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); } uint32_t barrier_count = 0; - for (uint32_t i = 0; i < kMaxSuspendBarriers; i++) { - AtomicInteger* pending_threads = pass_barriers[i]; - if (pending_threads != nullptr) { - bool done = false; - do { - int32_t cur_val = pending_threads->load(std::memory_order_relaxed); - CHECK_GT(cur_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << cur_val; - // Reduce value by 1. - done = pending_threads->CompareAndSetWeakRelaxed(cur_val, cur_val - 1); + for (AtomicInteger* barrier : pass_barriers) { + ++barrier_count; + int32_t old_val = barrier->fetch_sub(1, std::memory_order_release); + CHECK_GT(old_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << old_val; #if ART_USE_FUTEXES - if (done && (cur_val - 1) == 0) { // Weak CAS may fail spuriously. 
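// Illustrative sketch, not ART source: the unlocked fast check followed by a
// re-check under the lock, as in the hunk above. Callers first test the
// "active barrier" flag without the lock; the flag is re-read under the
// suspend-count lock before the barrier list is consumed, so losing a race is
// harmless. Names are stand-ins, and both checks are merged into one function.
#include <atomic>
#include <mutex>

namespace sketch {

std::atomic<bool> active_barrier{false};
std::mutex suspend_count_lock;

// Returns true if this call consumed the barriers, false if someone else
// already did (or none were ever installed).
inline bool MaybePassBarriers() {
  if (!active_barrier.load(std::memory_order_relaxed)) {
    return false;                      // cheap exit on the common path
  }
  std::lock_guard<std::mutex> lg(suspend_count_lock);
  if (!active_barrier.load(std::memory_order_relaxed)) {
    return false;                      // raced: the barriers were already taken
  }
  // ... snapshot and clear the barrier list here, still under the lock ...
  active_barrier.store(false, std::memory_order_relaxed);
  return true;                         // caller now decrements the barriers
}

}  // namespace sketch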
- futex(pending_threads->Address(), FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0); - } -#endif - } while (!done); - ++barrier_count; + if (old_val == 1) { + futex(barrier->Address(), FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0); } +#endif } CHECK_GT(barrier_count, 0U); return true; } -void Thread::ClearSuspendBarrier(AtomicInteger* target) { - CHECK(ReadFlag(ThreadFlag::kActiveSuspendBarrier)); - bool clear_flag = true; - for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) { - AtomicInteger* ptr = tlsPtr_.active_suspend_barriers[i]; - if (ptr == target) { - tlsPtr_.active_suspend_barriers[i] = nullptr; - } else if (ptr != nullptr) { - clear_flag = false; - } - } - if (LIKELY(clear_flag)) { - AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); - } -} - void Thread::RunCheckpointFunction() { DCHECK_EQ(Thread::Current(), this); CHECK(!GetStateAndFlags(std::memory_order_relaxed).IsAnyOfFlagsSet(FlipFunctionFlags())); @@ -1641,28 +1550,25 @@ void Thread::RunEmptyCheckpoint() { } bool Thread::RequestCheckpoint(Closure* function) { - StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); - if (old_state_and_flags.GetState() != ThreadState::kRunnable) { - return false; // Fail, thread is suspended and so can't run a checkpoint. - } - - // We must be runnable to request a checkpoint. - DCHECK_EQ(old_state_and_flags.GetState(), ThreadState::kRunnable); - StateAndFlags new_state_and_flags = old_state_and_flags; - new_state_and_flags.SetFlag(ThreadFlag::kCheckpointRequest); - bool success = tls32_.state_and_flags.CompareAndSetStrongSequentiallyConsistent( - old_state_and_flags.GetValue(), new_state_and_flags.GetValue()); - if (success) { - // Succeeded setting checkpoint flag, now insert the actual checkpoint. - if (tlsPtr_.checkpoint_function == nullptr) { - tlsPtr_.checkpoint_function = function; - } else { - checkpoint_overflow_.push_back(function); - } - CHECK(ReadFlag(ThreadFlag::kCheckpointRequest)); - TriggerSuspend(); + StateAndFlags old_state_and_flags(0 /* unused */), new_state_and_flags(0 /* unused */); + do { + old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); + if (old_state_and_flags.GetState() != ThreadState::kRunnable) { + return false; // Fail, thread is suspended and so can't run a checkpoint. + } + new_state_and_flags = old_state_and_flags; + new_state_and_flags.SetFlag(ThreadFlag::kCheckpointRequest); + } while (!tls32_.state_and_flags.CompareAndSetWeakSequentiallyConsistent( + old_state_and_flags.GetValue(), new_state_and_flags.GetValue())); + // Succeeded setting checkpoint flag, now insert the actual checkpoint. + if (tlsPtr_.checkpoint_function == nullptr) { + tlsPtr_.checkpoint_function = function; + } else { + checkpoint_overflow_.push_back(function); } - return success; + DCHECK(ReadFlag(ThreadFlag::kCheckpointRequest)); + TriggerSuspend(); + return true; } bool Thread::RequestEmptyCheckpoint() { @@ -1694,8 +1600,8 @@ class BarrierClosure : public Closure { barrier_.Pass(self); } - void Wait(Thread* self, ThreadState suspend_state) { - if (suspend_state != ThreadState::kRunnable) { + void Wait(Thread* self, ThreadState wait_state) { + if (wait_state != ThreadState::kRunnable) { barrier_.Increment<Barrier::kDisallowHoldingLocks>(self, 1); } else { barrier_.Increment<Barrier::kAllowHoldingLocks>(self, 1); @@ -1708,9 +1614,9 @@ class BarrierClosure : public Closure { }; // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution. 
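// Illustrative sketch, not ART source: the weak-CAS retry loop used by
// RequestCheckpoint above. The checkpoint flag may only be installed while the
// target's packed word still says Runnable; a weak CAS can fail spuriously, so
// the caller re-reads and retries until it either installs the flag or
// observes a non-runnable state. The bit layout here is made up.
#include <atomic>
#include <cstdint>

namespace sketch {

constexpr uint32_t kStateMask         = 0xffu;
constexpr uint32_t kRunnable          = 1u;
constexpr uint32_t kCheckpointRequest = 1u << 8;

// Returns true if the flag was installed while the target was runnable; false
// if the target was (or became) suspended and needs another delivery path.
inline bool RequestCheckpoint(std::atomic<uint32_t>& state_and_flags) {
  uint32_t old_val = state_and_flags.load(std::memory_order_relaxed);
  do {
    if ((old_val & kStateMask) != kRunnable) {
      return false;
    }
    // compare_exchange_weak reloads old_val on failure, so the state check
    // above always runs against fresh data.
  } while (!state_and_flags.compare_exchange_weak(
      old_val, old_val | kCheckpointRequest, std::memory_order_seq_cst));
  // Only after the flag is durably set is the closure queued for the target.
  return true;
}

}  // namespace sketch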
-bool Thread::RequestSynchronousCheckpoint(Closure* function, ThreadState suspend_state) { +bool Thread::RequestSynchronousCheckpoint(Closure* function, ThreadState wait_state) { Thread* self = Thread::Current(); - if (this == Thread::Current()) { + if (this == self) { Locks::thread_list_lock_->AssertExclusiveHeld(self); // Unlock the tll before running so that the state is the same regardless of thread. Locks::thread_list_lock_->ExclusiveUnlock(self); @@ -1722,185 +1628,228 @@ bool Thread::RequestSynchronousCheckpoint(Closure* function, ThreadState suspend // The current thread is not this thread. if (GetState() == ThreadState::kTerminated) { - Locks::thread_list_lock_->ExclusiveUnlock(self); + Locks::thread_list_lock_->Unlock(self); return false; } - struct ScopedThreadListLockUnlock { - explicit ScopedThreadListLockUnlock(Thread* self_in) RELEASE(*Locks::thread_list_lock_) - : self_thread(self_in) { - Locks::thread_list_lock_->AssertHeld(self_thread); - Locks::thread_list_lock_->Unlock(self_thread); - } - - ~ScopedThreadListLockUnlock() ACQUIRE(*Locks::thread_list_lock_) { - Locks::thread_list_lock_->AssertNotHeld(self_thread); - Locks::thread_list_lock_->Lock(self_thread); - } - - Thread* self_thread; - }; - - for (;;) { - Locks::thread_list_lock_->AssertExclusiveHeld(self); - // If this thread is runnable, try to schedule a checkpoint. Do some gymnastics to not hold the - // suspend-count lock for too long. - if (GetState() == ThreadState::kRunnable) { - BarrierClosure barrier_closure(function); - bool installed = false; - { - MutexLock mu(self, *Locks::thread_suspend_count_lock_); - installed = RequestCheckpoint(&barrier_closure); - } - if (installed) { - // Relinquish the thread-list lock. We should not wait holding any locks. We cannot - // reacquire it since we don't know if 'this' hasn't been deleted yet. - Locks::thread_list_lock_->ExclusiveUnlock(self); - ScopedThreadStateChange sts(self, suspend_state); - barrier_closure.Wait(self, suspend_state); - return true; - } - // Fall-through. - } - - // This thread is not runnable, make sure we stay suspended, then run the checkpoint. - // Note: ModifySuspendCountInternal also expects the thread_list_lock to be held in - // certain situations. + Locks::thread_list_lock_->AssertExclusiveHeld(self); + // If target "this" thread is runnable, try to schedule a checkpoint. Do some gymnastics to not + // hold the suspend-count lock for too long. + if (GetState() == ThreadState::kRunnable) { + BarrierClosure barrier_closure(function); + bool installed = false; { - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - - if (!ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal)) { - // Just retry the loop. - sched_yield(); - continue; - } + MutexLock mu(self, *Locks::thread_suspend_count_lock_); + installed = RequestCheckpoint(&barrier_closure); + } + if (installed) { + // Relinquish the thread-list lock. We should not wait holding any locks. We cannot + // reacquire it since we don't know if 'this' hasn't been deleted yet. + Locks::thread_list_lock_->ExclusiveUnlock(self); + ScopedThreadStateChange sts(self, wait_state); + // Wait state can be kRunnable, in which case, for lock ordering purposes, it's as if we ran + // the closure ourselves. This means that the target thread should not acquire a pre-mutator + // lock without running the checkpoint, and the closure should not acquire a pre-mutator + // lock or suspend. + barrier_closure.Wait(self, wait_state); + return true; } + // No longer runnable. Fall-through. 
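// Illustrative sketch, not ART source: the BarrierClosure idea used in the
// runnable path above. The requester wraps its closure so that, whichever
// thread ends up running it, completion is signalled through a one-shot latch
// the requester can block on. Names are stand-ins.
#include <condition_variable>
#include <functional>
#include <mutex>
#include <utility>

namespace sketch {

class BarrierClosure {
 public:
  explicit BarrierClosure(std::function<void()> fn) : fn_(std::move(fn)) {}

  // Run on the target thread (or on its behalf by the requester).
  void Run() {
    fn_();
    std::lock_guard<std::mutex> lg(m_);
    done_ = true;
    cv_.notify_all();
  }

  // Called by the requester after the closure has been queued on the target.
  void Wait() {
    std::unique_lock<std::mutex> lk(m_);
    cv_.wait(lk, [this] { return done_; });
  }

 private:
  std::function<void()> fn_;
  std::mutex m_;
  std::condition_variable cv_;
  bool done_ = false;
};

}  // namespace sketch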
+ } + + // Target "this" thread was not runnable. Suspend it, hopefully redundantly, + // but it might have become runnable in the meantime. + // Although this is a thread suspension, the target thread only blocks while we run the + // checkpoint, which is presumed to terminate quickly even if other threads are blocked. + // Note: IncrementSuspendCount also expects the thread_list_lock to be held in + // certain situations. + { + bool is_suspended = false; + WrappedSuspend1Barrier wrapped_barrier{}; { - // Release for the wait. The suspension will keep us from being deleted. Reacquire after so - // that we can call ModifySuspendCount without racing against ThreadList::Unregister. - ScopedThreadListLockUnlock stllu(self); - { - ScopedThreadStateChange sts(self, suspend_state); - while (GetState() == ThreadState::kRunnable) { - // We became runnable again. Wait till the suspend triggered in ModifySuspendCount - // moves us to suspended. - sched_yield(); + MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_); + // If wait_state is kRunnable, function may not suspend. We thus never block because + // we ourselves are being asked to suspend. + if (UNLIKELY(wait_state != ThreadState::kRunnable && self->GetSuspendCount() != 0)) { + // We are being asked to suspend while we are suspending another thread that may be + // responsible for our suspension. This is likely to result in deadlock if we each + // block on the suspension request. Instead we wait for the situation to change. + ThreadExitFlag target_status; + NotifyOnThreadExit(&target_status); + for (int iter_count = 1; self->GetSuspendCount() != 0; ++iter_count) { + Locks::thread_suspend_count_lock_->Unlock(self); + Locks::thread_list_lock_->Unlock(self); + { + ScopedThreadStateChange sts(self, wait_state); + usleep(ThreadList::kThreadSuspendSleepUs); + } + CHECK_LT(iter_count, ThreadList::kMaxSuspendRetries); + Locks::thread_list_lock_->Lock(self); + if (target_status.HasExited()) { + Locks::thread_list_lock_->Unlock(self); + return false; + } + Locks::thread_suspend_count_lock_->Lock(self); } + UnregisterThreadExitFlag(&target_status); } - // Ensure that the flip function for this thread, if pending, is finished *before* - // the checkpoint function is run. Otherwise, we may end up with both `to' and 'from' - // space references on the stack, confusing the GC's thread-flip logic. The caller is - // runnable so can't have a pending flip function. - DCHECK_EQ(self->GetState(), ThreadState::kRunnable); - DCHECK( - !self->GetStateAndFlags(std::memory_order_relaxed).IsAnyOfFlagsSet(FlipFunctionFlags())); - EnsureFlipFunctionStarted(self); - while (GetStateAndFlags(std::memory_order_acquire).IsAnyOfFlagsSet(FlipFunctionFlags())) { - usleep(kSuspendTimeDuringFlip); + IncrementSuspendCount(self, nullptr, &wrapped_barrier, SuspendReason::kInternal); + DCHECK_GT(GetSuspendCount(), 0); + DCHECK_EQ(self->GetSuspendCount(), 0); + // Since we've incremented the suspend count, "this" thread can no longer disappear. + Locks::thread_list_lock_->ExclusiveUnlock(self); + if (IsSuspended()) { + // See the discussion in mutator_gc_coord.md and SuspendAllInternal for the race here. 
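// Illustrative sketch, not ART source: the deadlock-avoidance back-off in the
// hunk above. If the requester itself has a pending suspension while trying to
// suspend the target, it releases its locks, sleeps briefly, and re-checks,
// abandoning the request if the target exits. The retry bound, sleep interval,
// and lock wrapper below are invented for the sketch.
#include <chrono>
#include <exception>
#include <thread>

namespace sketch {

struct BothLocks {            // stands in for thread_list + suspend_count locks
  void Lock() {}
  void Unlock() {}
};

constexpr int kMaxRetries = 1000;                 // made-up bound
constexpr std::chrono::microseconds kSleep(100);  // made-up back-off

// Returns false if the target exited while we were backing off.
inline bool BackOffWhileSuspended(BothLocks& locks,
                                  int (*my_suspend_count)(),
                                  const bool& target_exited) {
  for (int i = 1; my_suspend_count() != 0; ++i) {
    locks.Unlock();                       // never sleep while holding the locks
    std::this_thread::sleep_for(kSleep);  // let whoever suspended us finish
    locks.Lock();
    if (target_exited) {
      locks.Unlock();
      return false;                       // target is gone; abandon the request
    }
    if (i >= kMaxRetries) {
      std::terminate();                   // the real code CHECK-fails here
    }
  }
  return true;                            // safe to proceed with the suspension
}

}  // namespace sketch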
+ RemoveFirstSuspend1Barrier(); + if (!HasActiveSuspendBarrier()) { + AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); + } + is_suspended = true; } - - function->Run(this); } - - { - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - - DCHECK_NE(GetState(), ThreadState::kRunnable); - bool updated = ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); - DCHECK(updated); - // Imitate ResumeAll, the thread may be waiting on Thread::resume_cond_ since we raised its - // suspend count. Now the suspend_count_ is lowered so we must do the broadcast. - Thread::resume_cond_->Broadcast(self); + if (!is_suspended) { + bool success = + Runtime::Current()->GetThreadList()->WaitForSuspendBarrier(&wrapped_barrier.barrier_); + CHECK(success); } - // Release the thread_list_lock_ to be consistent with the barrier-closure path. - Locks::thread_list_lock_->ExclusiveUnlock(self); + // Ensure that the flip function for this thread, if pending, is finished *before* + // the checkpoint function is run. Otherwise, we may end up with both `to' and 'from' + // space references on the stack, confusing the GC's thread-flip logic. The caller is + // runnable so can't have a pending flip function. + DCHECK_EQ(self->GetState(), ThreadState::kRunnable); + DCHECK(!self->GetStateAndFlags(std::memory_order_relaxed).IsAnyOfFlagsSet(FlipFunctionFlags())); + EnsureFlipFunctionStarted(self, this); + if (ReadFlag(ThreadFlag::kRunningFlipFunction)) { + WaitForFlipFunction(self); + } + function->Run(this); + } - return true; // We're done, break out of the loop. + { + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + DCHECK_NE(GetState(), ThreadState::kRunnable); + DCHECK_GT(GetSuspendCount(), 0); + DecrementSuspendCount(self); + resume_cond_->Broadcast(self); } + + return true; } void Thread::SetFlipFunction(Closure* function) { // This is called with all threads suspended, except for the calling thread. DCHECK(IsSuspended() || Thread::Current() == this); DCHECK(function != nullptr); - DCHECK(tlsPtr_.flip_function == nullptr); - tlsPtr_.flip_function = function; + DCHECK(GetFlipFunction() == nullptr); + tlsPtr_.flip_function.store(function, std::memory_order_relaxed); DCHECK(!GetStateAndFlags(std::memory_order_relaxed).IsAnyOfFlagsSet(FlipFunctionFlags())); AtomicSetFlag(ThreadFlag::kPendingFlipFunction, std::memory_order_release); } -bool Thread::EnsureFlipFunctionStarted(Thread* self, StateAndFlags old_state_and_flags) { +bool Thread::EnsureFlipFunctionStarted(Thread* self, + Thread* target, + StateAndFlags old_state_and_flags, + ThreadExitFlag* tef, + bool* finished) { + // Note: *target may have been destroyed. We have to be careful about + // accessing it. That is the reason this is static and not a member function. 
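// Illustrative sketch, not ART source: the already-suspended race handled
// above. The requester installs a one-shot barrier together with the suspend
// count increment, but if the target is already suspended it may never arrive
// at that barrier (barriers are passed on state transitions), so the barrier
// is withdrawn instead of waited on. The polling wait and the field layout
// below are simplifications.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>
#include <vector>

namespace sketch {

struct OneShotBarrier {
  std::atomic<int32_t> pending{1};
};

struct TargetThread {
  std::atomic<bool> suspended{false};
  std::vector<OneShotBarrier*> barriers;   // guarded by the suspend-count lock
  int suspend_count = 0;                   // guarded by the same lock

  bool IsSuspended() const { return suspended.load(std::memory_order_acquire); }
};

// Called with the suspend-count lock held (the lock itself is elided here).
inline void SuspendForCheckpoint(TargetThread& t, OneShotBarrier& b) {
  t.barriers.push_back(&b);
  ++t.suspend_count;
  if (t.IsSuspended()) {
    // Already suspended: it will not arrive at our barrier, so take it back.
    t.barriers.pop_back();
  } else {
    // Still running: it will observe the request, suspend, and decrement the
    // barrier. The real code waits outside the locks and with a timeout.
    while (b.pending.load(std::memory_order_acquire) != 0) {
      std::this_thread::sleep_for(std::chrono::microseconds(10));
    }
  }
  // Either way the target stays suspended; the checkpoint closure can now be
  // run on its behalf, after which the suspend count is dropped again.
}

}  // namespace sketch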
bool become_runnable; + DCHECK(self == Current()); + bool check_exited = (tef != nullptr); + auto maybe_release = [=]() NO_THREAD_SAFETY_ANALYSIS /* conditionally unlocks */ { + if (check_exited) { + Locks::thread_list_lock_->Unlock(self); + } + }; + auto set_finished = [=](bool value) { + if (finished != nullptr) { + *finished = value; + } + }; + + if (check_exited) { + Locks::thread_list_lock_->Lock(self); + if (tef->HasExited()) { + Locks::thread_list_lock_->Unlock(self); + set_finished(true); + return false; + } + } if (old_state_and_flags.GetValue() == 0) { become_runnable = false; - old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); + old_state_and_flags = target->GetStateAndFlags(std::memory_order_relaxed); } else { become_runnable = true; + DCHECK(!check_exited); + DCHECK(target == self); + DCHECK(old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)); DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest)); } - - while (old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)) { + while (true) { + DCHECK(!check_exited || (Locks::thread_list_lock_->IsExclusiveHeld(self) && !tef->HasExited())); + if (!old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)) { + maybe_release(); + set_finished(!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)); + return false; + } DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)); StateAndFlags new_state_and_flags = old_state_and_flags.WithFlag(ThreadFlag::kRunningFlipFunction) .WithoutFlag(ThreadFlag::kPendingFlipFunction); if (become_runnable) { - DCHECK_EQ(self, this); + DCHECK_EQ(self, target); DCHECK_NE(self->GetState(), ThreadState::kRunnable); new_state_and_flags = new_state_and_flags.WithState(ThreadState::kRunnable); } - if (tls32_.state_and_flags.CompareAndSetWeakAcquire(old_state_and_flags.GetValue(), - new_state_and_flags.GetValue())) { + if (target->tls32_.state_and_flags.CompareAndSetWeakAcquire(old_state_and_flags.GetValue(), + new_state_and_flags.GetValue())) { if (become_runnable) { - GetMutatorLock()->TransitionFromSuspendedToRunnable(this); + self->GetMutatorLock()->TransitionFromSuspendedToRunnable(self); } art::Locks::mutator_lock_->AssertSharedHeld(self); - RunFlipFunction(self, /*notify=*/ true); - DCHECK(!GetStateAndFlags(std::memory_order_relaxed).IsAnyOfFlagsSet(FlipFunctionFlags())); + maybe_release(); + // Thread will not go away while kRunningFlipFunction is set. + target->RunFlipFunction(self); + DCHECK(!target->GetStateAndFlags(std::memory_order_relaxed) + .IsAnyOfFlagsSet(FlipFunctionFlags())); + set_finished(true); return true; - } else { - old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); - if (become_runnable && old_state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest)) { - break; - } } + if (become_runnable) { + DCHECK(!check_exited); // We didn't acquire thread_list_lock_ . + // Let caller retry. + return false; + } + old_state_and_flags = target->GetStateAndFlags(std::memory_order_acquire); } - return false; + // Unreachable. } -void Thread::RunFlipFunction(Thread* self, bool notify) { - // This function is called for suspended threads and by the thread running - // `ThreadList::FlipThreadRoots()` after we've successfully set the flag - // `ThreadFlag::kRunningFlipFunction`. This flag is not set if the thread is - // running the flip function right after transitioning to Runnable as - // no other thread may run checkpoints on a thread that's actually Runnable. 
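// Illustrative sketch, not ART source: the claim step inside
// EnsureFlipFunctionStarted above. Several threads may race to run one
// target's flip function; a single CAS moves the packed word from "pending" to
// "running", and only the winner executes the work. Bit values are invented.
#include <atomic>
#include <cstdint>

namespace sketch {

constexpr uint32_t kPendingFlip = 1u << 4;
constexpr uint32_t kRunningFlip = 1u << 5;

// Returns true if the caller won the race and must run (then clear) the flip.
inline bool TryClaimFlip(std::atomic<uint32_t>& state_and_flags) {
  uint32_t old_val = state_and_flags.load(std::memory_order_relaxed);
  while ((old_val & kPendingFlip) != 0) {
    uint32_t new_val = (old_val & ~kPendingFlip) | kRunningFlip;
    // Acquire: the winner must see the closure that was published before the
    // pending bit was set.
    if (state_and_flags.compare_exchange_weak(old_val, new_val,
                                              std::memory_order_acquire)) {
      return true;   // we own the flip function now
    }
    // old_val was reloaded by the failed CAS; someone else may have claimed it.
  }
  return false;      // no longer pending: either finished or running elsewhere
}

}  // namespace sketch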
- DCHECK_EQ(notify, ReadFlag(ThreadFlag::kRunningFlipFunction)); +void Thread::RunFlipFunction(Thread* self) { + // This function is called either by the thread running `ThreadList::FlipThreadRoots()` or when + // a thread becomes runnable, after we've successfully set the kRunningFlipFunction ThreadFlag. + DCHECK(ReadFlag(ThreadFlag::kRunningFlipFunction)); - Closure* flip_function = tlsPtr_.flip_function; - tlsPtr_.flip_function = nullptr; + Closure* flip_function = GetFlipFunction(); + tlsPtr_.flip_function.store(nullptr, std::memory_order_relaxed); DCHECK(flip_function != nullptr); flip_function->Run(this); - - if (notify) { - // Clear the `ThreadFlag::kRunningFlipFunction` and `ThreadFlag::kWaitingForFlipFunction`. - // Check if the latter was actually set, indicating that there is at least one waiting thread. - constexpr uint32_t kFlagsToClear = enum_cast<uint32_t>(ThreadFlag::kRunningFlipFunction) | - enum_cast<uint32_t>(ThreadFlag::kWaitingForFlipFunction); - StateAndFlags old_state_and_flags( - tls32_.state_and_flags.fetch_and(~kFlagsToClear, std::memory_order_release)); - if (old_state_and_flags.IsFlagSet(ThreadFlag::kWaitingForFlipFunction)) { - // Notify all threads that are waiting for completion (at least one). - // TODO: Should we create a separate mutex and condition variable instead - // of piggy-backing on the `thread_suspend_count_lock_` and `resume_cond_`? - MutexLock mu(self, *Locks::thread_suspend_count_lock_); - resume_cond_->Broadcast(self); - } - } + // The following also serves as a very approximate check for a deallocated thread. + DCHECK_NE(GetState(), ThreadState::kTerminated); + DCHECK(!ReadFlag(ThreadFlag::kPendingFlipFunction)); + AtomicClearFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_release); + // From here on this thread may go away, and it is no longer safe to access. + + // Notify all threads that are waiting for completion. + // TODO: Should we create a separate mutex and condition variable instead + // of piggy-backing on the `thread_suspend_count_lock_` and `resume_cond_`? + MutexLock mu(self, *Locks::thread_suspend_count_lock_); + resume_cond_->Broadcast(self); } -void Thread::WaitForFlipFunction(Thread* self) { +void Thread::WaitForFlipFunction(Thread* self) const { // Another thread is running the flip function. Wait for it to complete. // Check the flag while holding the mutex so that we do not miss the broadcast. // Repeat the check after waiting to guard against spurious wakeups (and because @@ -1910,19 +1859,39 @@ void Thread::WaitForFlipFunction(Thread* self) { StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_acquire); DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)); if (!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)) { - DCHECK(!old_state_and_flags.IsAnyOfFlagsSet(FlipFunctionFlags())); - break; + return; } - if (!old_state_and_flags.IsFlagSet(ThreadFlag::kWaitingForFlipFunction)) { - // Mark that there is a waiting thread. - StateAndFlags new_state_and_flags = - old_state_and_flags.WithFlag(ThreadFlag::kWaitingForFlipFunction); - if (!tls32_.state_and_flags.CompareAndSetWeakRelaxed(old_state_and_flags.GetValue(), - new_state_and_flags.GetValue())) { - continue; // Retry. - } + // We sometimes hold mutator lock here. OK since the flip function must complete quickly. 
+ resume_cond_->WaitHoldingLocks(self); + } +} + +void Thread::WaitForFlipFunctionTestingExited(Thread* self, ThreadExitFlag* tef) { + Locks::thread_list_lock_->Lock(self); + if (tef->HasExited()) { + Locks::thread_list_lock_->Unlock(self); + return; + } + // We need to hold suspend_count_lock_ to avoid missed wakeups when the flip function finishes. + // We need to hold thread_list_lock_ because the tef test result is only valid while we hold the + // lock, and once kRunningFlipFunction is no longer set, "this" may be deallocated. Hence the + // complicated locking dance. + MutexLock mu(self, *Locks::thread_suspend_count_lock_); + while (true) { + StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_acquire); + DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)); + Locks::thread_list_lock_->Unlock(self); // So we can wait or return. + if (!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)) { + return; + } + resume_cond_->WaitHoldingLocks(self); + Locks::thread_suspend_count_lock_->Unlock(self); // To re-lock thread_list_lock. + Locks::thread_list_lock_->Lock(self); + Locks::thread_suspend_count_lock_->Lock(self); + if (tef->HasExited()) { + Locks::thread_list_lock_->Unlock(self); + return; } - resume_cond_->Wait(self); } } @@ -2242,19 +2211,27 @@ struct StackDumpVisitor : public MonitorObjectsStackVisitor { if (obj == nullptr) { os << msg << "an unknown object"; } else { - if ((obj->GetLockWord(true).GetState() == LockWord::kThinLocked) && - Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) { - // Getting the identity hashcode here would result in lock inflation and suspension of the - // current thread, which isn't safe if this is the only runnable thread. - os << msg << StringPrintf("<@addr=0x%" PRIxPTR "> (a %s)", - reinterpret_cast<intptr_t>(obj.Ptr()), - obj->PrettyTypeOf().c_str()); + const std::string pretty_type(obj->PrettyTypeOf()); + // It's often unsafe to allow lock inflation here. We may be the only runnable thread, or + // this may be called from a checkpoint. We get the hashcode on a best effort basis. + static constexpr int kNumRetries = 3; + static constexpr int kSleepMicros = 10; + int32_t hash_code; + for (int i = 0;; ++i) { + hash_code = obj->IdentityHashCodeNoInflation(); + if (hash_code != 0 || i == kNumRetries) { + break; + } + usleep(kSleepMicros); + } + if (hash_code == 0) { + os << msg + << StringPrintf("<@addr=0x%" PRIxPTR "> (a %s)", + reinterpret_cast<intptr_t>(obj.Ptr()), + pretty_type.c_str()); } else { - // - waiting on <0x6008c468> (a java.lang.Class<java.lang.ref.ReferenceQueue>) - // Call PrettyTypeOf before IdentityHashCode since IdentityHashCode can cause thread - // suspension and move pretty_object. - const std::string pretty_type(obj->PrettyTypeOf()); - os << msg << StringPrintf("<0x%08x> (a %s)", obj->IdentityHashCode(), pretty_type.c_str()); + // - waiting on <0x608c468> (a java.lang.Class<java.lang.ref.ReferenceQueue>) + os << msg << StringPrintf("<0x%08x> (a %s)", hash_code, pretty_type.c_str()); } } if (owner_tid != ThreadList::kInvalidThreadId) { @@ -2452,6 +2429,13 @@ void Thread::NotifyThreadGroup(ScopedObjectAccessAlreadyRunnable& soa, jobject t soa.Self(), thread_group_object, thread_object); } +void Thread::SignalExitFlags() { + for (ThreadExitFlag* tef = tlsPtr_.thread_exit_flags; tef != nullptr; tef = tef->next_) { + tef->exited_ = true; + } + tlsPtr_.thread_exit_flags = nullptr; // Now unused. 
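// Illustrative sketch, not ART source: the best-effort hash lookup in the
// stack-dump hunk above. Inflating a thin lock just to obtain an identity hash
// may be unsafe here (we might be the only runnable thread, or be inside a
// checkpoint), so the code retries a non-inflating query a few times and falls
// back to printing the address. The stand-in query below is a placeholder.
#include <chrono>
#include <cstdint>
#include <thread>

namespace sketch {

// Placeholder stand-in: would return the cached identity hash, or 0 if
// computing it would require lock inflation.
inline int32_t IdentityHashNoInflation(const void* obj) {
  return static_cast<int32_t>(reinterpret_cast<uintptr_t>(obj) & 0x7fffffff);
}

inline int32_t BestEffortHash(const void* obj, int num_retries = 3) {
  int32_t hash = 0;
  for (int i = 0;; ++i) {
    hash = IdentityHashNoInflation(obj);
    if (hash != 0 || i == num_retries) {
      break;                        // got a hash, or gave up after the retries
    }
    std::this_thread::sleep_for(std::chrono::microseconds(10));
  }
  return hash;                      // 0 means "print the raw address instead"
}

}  // namespace sketch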
+} + Thread::Thread(bool daemon) : tls32_(daemon), wait_monitor_(nullptr), @@ -2476,12 +2460,10 @@ Thread::Thread(bool daemon) tlsPtr_.rosalloc_runs + kNumRosAllocThreadLocalSizeBracketsInThread, gc::allocator::RosAlloc::GetDedicatedFullRun()); tlsPtr_.checkpoint_function = nullptr; - for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) { - tlsPtr_.active_suspend_barriers[i] = nullptr; - } - tlsPtr_.flip_function = nullptr; + tlsPtr_.active_suspendall_barrier = nullptr; + tlsPtr_.active_suspend1_barriers = nullptr; + tlsPtr_.flip_function.store(nullptr, std::memory_order_relaxed); tlsPtr_.thread_local_mark_stack = nullptr; - tls32_.is_transitioning_to_runnable = false; ResetTlab(); } @@ -2625,8 +2607,8 @@ Thread::~Thread() { CHECK(!ReadFlag(ThreadFlag::kEmptyCheckpointRequest)); CHECK(tlsPtr_.checkpoint_function == nullptr); CHECK_EQ(checkpoint_overflow_.size(), 0u); - CHECK(tlsPtr_.flip_function == nullptr); - CHECK_EQ(tls32_.is_transitioning_to_runnable, false); + // A pending flip function request is OK. FlipThreadRoots will have been notified that we + // exited, and nobody will attempt to process the request. // Make sure we processed all deoptimization requests. CHECK(tlsPtr_.deoptimization_context_stack == nullptr) << "Missed deoptimization"; @@ -4717,9 +4699,11 @@ bool Thread::IsAotCompiler() { mirror::Object* Thread::GetPeerFromOtherThread() { DCHECK(tlsPtr_.jpeer == nullptr); // Ensure that opeer is not obsolete. - EnsureFlipFunctionStarted(Thread::Current()); - while (GetStateAndFlags(std::memory_order_acquire).IsAnyOfFlagsSet(FlipFunctionFlags())) { - usleep(kSuspendTimeDuringFlip); + Thread* self = Thread::Current(); + EnsureFlipFunctionStarted(self, this); + if (ReadFlag(ThreadFlag::kRunningFlipFunction)) { + // Does not release mutator lock. Hence no new flip requests can be issued. + WaitForFlipFunction(self); } return tlsPtr_.opeer; } diff --git a/runtime/thread.h b/runtime/thread.h index 08c803d064..56c3f474b4 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -47,6 +47,7 @@ #include "reflective_handle_scope.h" #include "runtime_globals.h" #include "runtime_stats.h" +#include "suspend_reason.h" #include "thread_state.h" namespace unwindstack { @@ -101,7 +102,6 @@ class RootVisitor; class ScopedObjectAccessAlreadyRunnable; class ShadowFrame; class StackedShadowFrameRecord; -enum class SuspendReason : char; class Thread; class ThreadList; enum VisitRootFlags : uint8_t; @@ -133,29 +133,25 @@ enum class ThreadFlag : uint32_t { kEmptyCheckpointRequest = 1u << 2, // Register that at least 1 suspend barrier needs to be passed. + // Changes to this flag are guarded by suspend_count_lock_ . kActiveSuspendBarrier = 1u << 3, // Marks that a "flip function" needs to be executed on this thread. + // Set only while holding thread_list_lock_. kPendingFlipFunction = 1u << 4, // Marks that the "flip function" is being executed by another thread. // - // This is used to guards against multiple threads trying to run the + // This is used to guard against multiple threads trying to run the // "flip function" for the same thread while the thread is suspended. // - // This is not needed when the thread is running the flip function - // on its own after transitioning to Runnable. + // Set either by the thread itself or while holding thread_list_lock, Prevents a thread from + // exiting. kRunningFlipFunction = 1u << 5, - // Marks that a thread is wating for "flip function" to complete. 
- // - // This is used to check if we need to broadcast the completion of the - // "flip function" to other threads. See also `kRunningFlipFunction`. - kWaitingForFlipFunction = 1u << 6, - // Request that compiled JNI stubs do not transition to Native or Runnable with // inlined code, but take a slow path for monitoring method entry and exit events. - kMonitorJniEntryExit = 1u << 7, + kMonitorJniEntryExit = 1u << 6, // Indicates the last flag. Used for checking that the flags do not overlap thread state. kLastFlag = kMonitorJniEntryExit @@ -187,6 +183,28 @@ enum class WeakRefAccessState : int32_t { kDisabled }; +// See Thread.tlsPtr_.active_suspend1_barriers below for explanation. +struct WrappedSuspend1Barrier { + WrappedSuspend1Barrier() : barrier_(1), next_(nullptr) {} + AtomicInteger barrier_; + struct WrappedSuspend1Barrier* next_ GUARDED_BY(Locks::thread_suspend_count_lock_); +}; + +// Mostly opaque structure allocated by the client of NotifyOnThreadExit. Allows a client to +// check whether the thread still exists after temporarily releasing thread_list_lock_, usually +// because we need to wait for something. +class ThreadExitFlag { + public: + ThreadExitFlag() : exited_(false) {} + bool HasExited() REQUIRES(Locks::thread_list_lock_) { return exited_; } + + private: + ThreadExitFlag* next_ GUARDED_BY(Locks::thread_list_lock_); + ThreadExitFlag* prev_ GUARDED_BY(Locks::thread_list_lock_); + bool exited_ GUARDED_BY(*Locks::thread_list_lock_); + friend class Thread; +}; + // This should match RosAlloc::kNumThreadLocalSizeBrackets. static constexpr size_t kNumRosAllocThreadLocalSizeBracketsInThread = 16; @@ -320,7 +338,10 @@ class Thread { } bool IsSuspended() const { - StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_relaxed); + // We need to ensure that once we return true, all prior accesses to the Java data by "this" + // thread are complete. Hence we need "acquire" ordering here, and "release" when the flags + // are set. + StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_acquire); return state_and_flags.GetState() != ThreadState::kRunnable && state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest); } @@ -336,20 +357,30 @@ class Thread { return tls32_.define_class_counter; } - // If delta > 0 and (this != self or suspend_barrier is not null), this function may temporarily - // release thread_suspend_count_lock_ internally. + // Increment suspend count and optionally install at most one suspend barrier. ALWAYS_INLINE - bool ModifySuspendCount(Thread* self, - int delta, - AtomicInteger* suspend_barrier, - SuspendReason reason) - WARN_UNUSED + void IncrementSuspendCount(Thread* self, + AtomicInteger* suspendall_barrier, + WrappedSuspend1Barrier* suspend1_barrier, + SuspendReason reason) REQUIRES(Locks::thread_suspend_count_lock_); + + // The same, but default reason to kInternal, and barriers to nullptr. + ALWAYS_INLINE void IncrementSuspendCount(Thread* self) REQUIRES(Locks::thread_suspend_count_lock_); + // Follows one of the above calls. For_user_code indicates if SuspendReason was kForUserCode. + ALWAYS_INLINE void DecrementSuspendCount(Thread* self, bool for_user_code = false) + REQUIRES(Locks::thread_suspend_count_lock_); + + private: + NO_RETURN static void UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread); + + public: // Requests a checkpoint closure to run on another thread. 
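// Illustrative sketch, not ART source: why the IsSuspended() change above reads
// the packed word with acquire ordering. The suspending thread's writes (made
// while it was still runnable) must be visible to anyone who observes it as
// suspended, which needs a release store on the state change paired with an
// acquire load on the query. Plain std::atomic stand-ins below.
#include <atomic>
#include <cassert>

namespace sketch {

std::atomic<bool> suspended{false};
int guarded_data = 0;   // stands in for heap/stack state owned by the target

void TargetThread() {
  guarded_data = 42;                                  // work done while runnable
  suspended.store(true, std::memory_order_release);   // "I am now suspended"
}

void ObserverThread() {
  if (suspended.load(std::memory_order_acquire)) {    // IsSuspended()
    // Guaranteed to see every write the target made before suspending.
    assert(guarded_data == 42);
  }
}

}  // namespace sketch

// Usage: run the two functions on separate std::threads and join them; the
// assertion can never fire, which is exactly the guarantee IsSuspended() needs.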
The closure will be run when the // thread notices the request, either in an explicit runtime CheckSuspend() call, or in a call // originating from a compiler generated suspend point check. This returns true if the closure - // was added and will (eventually) be executed. It returns false otherwise. + // was added and will (eventually) be executed. It returns false if this was impossible + // because the thread was suspended, and we thus did nothing. // // Since multiple closures can be queued and some closures can delay other threads from running, // no closure should attempt to suspend another thread while running. @@ -363,27 +394,37 @@ class Thread { // RequestSynchronousCheckpoint releases the thread_list_lock_ as a part of its execution. This is // due to the fact that Thread::Current() needs to go to sleep to allow the targeted thread to - // execute the checkpoint for us if it is Runnable. The suspend_state is the state that the thread + // execute the checkpoint for us if it is Runnable. The wait_state is the state that the thread // will go into while it is awaiting the checkpoint to be run. - // NB Passing ThreadState::kRunnable may cause the current thread to wait in a condition variable - // while holding the mutator_lock_. Callers should ensure that this will not cause any problems - // for the closure or the rest of the system. + // The closure may be run on Thread::Current() on behalf of "this" thread. + // Thus for lock ordering purposes, the closure should be runnable by the caller. This also + // sometimes makes it reasonable to pass ThreadState::kRunnable as wait_state: We may wait on + // a condition variable for the "this" thread to act, but for lock ordering purposes, this is + // exactly as though Thread::Current() had run the closure. // NB Since multiple closures can be queued and some closures can delay other threads from running // no closure should attempt to suspend another thread while running. bool RequestSynchronousCheckpoint(Closure* function, - ThreadState suspend_state = ThreadState::kWaiting) - REQUIRES_SHARED(Locks::mutator_lock_) - RELEASE(Locks::thread_list_lock_) - REQUIRES(!Locks::thread_suspend_count_lock_); + ThreadState wait_state = ThreadState::kWaiting) + REQUIRES_SHARED(Locks::mutator_lock_) RELEASE(Locks::thread_list_lock_) + REQUIRES(!Locks::thread_suspend_count_lock_); bool RequestEmptyCheckpoint() REQUIRES(Locks::thread_suspend_count_lock_); + Closure* GetFlipFunction() { return tlsPtr_.flip_function.load(std::memory_order_relaxed); } + // Set the flip function. This is done with all threads suspended, except for the calling thread. - void SetFlipFunction(Closure* function); + void SetFlipFunction(Closure* function) REQUIRES(Locks::thread_suspend_count_lock_) + REQUIRES(Locks::thread_list_lock_); - // Wait for the flip function to complete if still running on another thread. - void WaitForFlipFunction(Thread* self); + // Wait for the flip function to complete if still running on another thread. Assumes the "this" + // thread remains live. + void WaitForFlipFunction(Thread* self) const REQUIRES(!Locks::thread_suspend_count_lock_); + + // An enhanced version of the above that uses tef to safely return if the thread exited in the + // meantime. 
+ void WaitForFlipFunctionTestingExited(Thread* self, ThreadExitFlag* tef) + REQUIRES(!Locks::thread_suspend_count_lock_, !Locks::thread_list_lock_); gc::accounting::AtomicStack<mirror::Object>* GetThreadLocalMarkStack() { CHECK(gUseReadBarrier); @@ -520,7 +561,7 @@ class Thread { // the thread's stack may have not been flipped yet and peer may be a from-space (stale) ref. // This function will force a flip for the other thread if necessary. // Since we hold a shared mutator lock, a new flip function cannot be concurrently - // installed + // installed. mirror::Object* GetPeerFromOtherThread() REQUIRES_SHARED(Locks::mutator_lock_); bool HasPeer() const { @@ -644,6 +685,19 @@ class Thread { void NotifyThreadGroup(ScopedObjectAccessAlreadyRunnable& soa, jobject thread_group = nullptr) REQUIRES_SHARED(Locks::mutator_lock_); + // Request notification when this thread is unregistered or exits. The caller must allocate a + // ThreadExitFlag, and pass it in. The caller is responsible for either waiting until the + // thread has exited, or uregistering the ThreadExitFlag, and then, and only then, deallocating + // the ThreadExitFlag. (This scheme avoids an allocation and questions about what to do if the + // allocation fails. Allows detection of thread exit after temporary release of + // thread_list_lock_) + void NotifyOnThreadExit(ThreadExitFlag* tef) REQUIRES(Locks::thread_list_lock_); + void UnregisterThreadExitFlag(ThreadExitFlag* tef) REQUIRES(Locks::thread_list_lock_); + + // Called when thread is unregistered. May be called repeatedly, in which case only newly + // registered clients are processed. + void SignalExitFlags() REQUIRES(Locks::thread_list_lock_); + // JNI methods JNIEnvExt* GetJniEnv() const { return tlsPtr_.jni_env; @@ -1241,18 +1295,39 @@ class Thread { tlsPtr_.held_mutexes[level] = mutex; } - void ClearSuspendBarrier(AtomicInteger* target) - REQUIRES(Locks::thread_suspend_count_lock_); + // Possibly check that no mutexes at level kMonitorLock or above are subsequently acquired. + // Only invoked by the thread itself. + void DisallowPreMonitorMutexes() { + if (kIsDebugBuild) { + CHECK(this == Thread::Current()); + CHECK(GetHeldMutex(kMonitorLock) == nullptr); + // Pretend we hold a kMonitorLock level mutex to detect disallowed mutex + // acquisitions by checkpoint Run() methods. We don't normally register or thus check + // kMonitorLock level mutexes, but this is an exception. + static Mutex placeholder_mutex("checkpoint placeholder mutex", kMonitorLock); + SetHeldMutex(kMonitorLock, &placeholder_mutex); + } + } + + // Undo the effect of the previous call. Again only invoked by the thread itself. + void AllowPreMonitorMutexes() { + if (kIsDebugBuild) { + CHECK(GetHeldMutex(kMonitorLock) != nullptr); + SetHeldMutex(kMonitorLock, nullptr); + } + } bool ReadFlag(ThreadFlag flag) const { return GetStateAndFlags(std::memory_order_relaxed).IsFlagSet(flag); } void AtomicSetFlag(ThreadFlag flag, std::memory_order order = std::memory_order_seq_cst) { + // Since we discard the returned value, memory_order_release will often suffice. tls32_.state_and_flags.fetch_or(enum_cast<uint32_t>(flag), order); } void AtomicClearFlag(ThreadFlag flag, std::memory_order order = std::memory_order_seq_cst) { + // Since we discard the returned value, memory_order_release will often suffice. 
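// Illustrative sketch, not ART source: the placeholder-mutex trick behind
// DisallowPreMonitorMutexes above. A debug-only checker tracks the lowest lock
// level currently held and requires every new lock to sit strictly below it;
// pretending to hold a monitor-level lock therefore flags any later attempt to
// take such a lock (say, from a checkpoint closure) without acquiring a real
// mutex. Levels, the checker, and its rule direction are invented stand-ins.
#include <cassert>
#include <climits>

namespace sketch {

enum LockLevel : int { kLowLevelLock = 1, kMonitorLevel = 10, kMutatorLevel = 20 };

thread_local int tls_lowest_held_level = INT_MAX;  // INT_MAX: nothing held

inline void OnAcquire(int level) {
  // Hierarchy rule in this sketch: a new lock must be at a strictly lower
  // level than every lock already held. (Release tracking is omitted.)
  assert(level < tls_lowest_held_level && "lock level violation");
  tls_lowest_held_level = level;
}

inline void DisallowPreMonitorMutexes() {   // before running checkpoint closures
  assert(tls_lowest_held_level > kMonitorLevel);
  tls_lowest_held_level = kMonitorLevel;    // pretend-hold; no real mutex taken
}

inline void AllowPreMonitorMutexes() {      // after the closures have run
  assert(tls_lowest_held_level == kMonitorLevel);
  tls_lowest_held_level = INT_MAX;
}

}  // namespace sketch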
tls32_.state_and_flags.fetch_and(~enum_cast<uint32_t>(flag), order); } @@ -1345,14 +1420,6 @@ class Thread { bool ProtectStack(bool fatal_on_error = true); bool UnprotectStack(); - bool IsTransitioningToRunnable() const { - return tls32_.is_transitioning_to_runnable; - } - - void SetIsTransitioningToRunnable(bool value) { - tls32_.is_transitioning_to_runnable = value; - } - uint32_t DecrementForceInterpreterCount() REQUIRES(Locks::thread_list_lock_) { return --tls32_.force_interpreter_count; } @@ -1463,8 +1530,7 @@ class Thread { static constexpr uint32_t FlipFunctionFlags() { return enum_cast<uint32_t>(ThreadFlag::kPendingFlipFunction) | - enum_cast<uint32_t>(ThreadFlag::kRunningFlipFunction) | - enum_cast<uint32_t>(ThreadFlag::kWaitingForFlipFunction); + enum_cast<uint32_t>(ThreadFlag::kRunningFlipFunction); } static constexpr uint32_t StoredThreadStateValue(ThreadState state) { @@ -1486,6 +1552,7 @@ class Thread { private: explicit Thread(bool daemon); + // A runnable thread should only be deleted from the thread itself. ~Thread() REQUIRES(!Locks::mutator_lock_, !Locks::thread_suspend_count_lock_); void Destroy(bool should_run_callbacks); @@ -1513,7 +1580,8 @@ class Thread { // Avoid use, callers should use SetState. // Used only by `Thread` destructor and stack trace collection in semi-space GC (currently - // disabled by `kStoreStackTraces = false`). + // disabled by `kStoreStackTraces = false`). May not be called on a runnable thread other + // than Thread::Current(). // NO_THREAD_SAFETY_ANALYSIS: This function is "Unsafe" and can be called in // different states, so clang cannot perform the thread safety analysis. ThreadState SetStateUnsafe(ThreadState new_state) NO_THREAD_SAFETY_ANALYSIS { @@ -1522,12 +1590,13 @@ class Thread { if (old_state == new_state) { // Nothing to do. } else if (old_state == ThreadState::kRunnable) { + DCHECK_EQ(this, Thread::Current()); // Need to run pending checkpoint and suspend barriers. Run checkpoints in runnable state in // case they need to use a ScopedObjectAccess. If we are holding the mutator lock and a SOA // attempts to TransitionFromSuspendedToRunnable, it results in a deadlock. TransitionToSuspendedAndRunCheckpoints(new_state); // Since we transitioned to a suspended state, check the pass barrier requests. - PassActiveSuspendBarriers(); + CheckActiveSuspendBarriers(); } else { while (true) { StateAndFlags new_state_and_flags = old_state_and_flags; @@ -1601,8 +1670,25 @@ class Thread { REQUIRES(!Locks::thread_suspend_count_lock_, !Roles::uninterruptible_) REQUIRES_SHARED(Locks::mutator_lock_); - ALWAYS_INLINE void PassActiveSuspendBarriers() - REQUIRES(!Locks::thread_suspend_count_lock_, !Roles::uninterruptible_); + // Call PassActiveSuspendBarriers() if there are active barriers. Only called on current thread. + ALWAYS_INLINE void CheckActiveSuspendBarriers() + REQUIRES(!Locks::thread_suspend_count_lock_, !Locks::mutator_lock_, !Roles::uninterruptible_); + + // Decrement all "suspend barriers" for the current thread, notifying threads that requested our + // suspension. Only called on current thread, when suspended. If suspend_count_ > 0 then we + // promise that we are and will remain "suspended" until the suspend count is decremented. + bool PassActiveSuspendBarriers() + REQUIRES(!Locks::thread_suspend_count_lock_, !Locks::mutator_lock_); + + // Add an entry to active_suspend1_barriers. 
+ ALWAYS_INLINE void AddSuspend1Barrier(WrappedSuspend1Barrier* suspend1_barrier) + REQUIRES(Locks::thread_suspend_count_lock_); + + // Remove last-added entry from active_suspend1_barriers. + // Only makes sense if we're still holding thread_suspend_count_lock_ since insertion. + ALWAYS_INLINE void RemoveFirstSuspend1Barrier() REQUIRES(Locks::thread_suspend_count_lock_); + + ALWAYS_INLINE bool HasActiveSuspendBarrier() REQUIRES(Locks::thread_suspend_count_lock_); // Registers the current thread as the jit sensitive thread. Should be called just once. static void SetJitSensitiveThread() { @@ -1618,13 +1704,6 @@ class Thread { is_sensitive_thread_hook_ = is_sensitive_thread_hook; } - bool ModifySuspendCountInternal(Thread* self, - int delta, - AtomicInteger* suspend_barrier, - SuspendReason reason) - WARN_UNUSED - REQUIRES(Locks::thread_suspend_count_lock_); - // Runs a single checkpoint function. If there are no more pending checkpoint functions it will // clear the kCheckpointRequest flag. The caller is responsible for calling this in a loop until // the kCheckpointRequest flag is cleared. @@ -1633,9 +1712,6 @@ class Thread { REQUIRES_SHARED(Locks::mutator_lock_); void RunEmptyCheckpoint(); - bool PassActiveSuspendBarriers(Thread* self) - REQUIRES(!Locks::thread_suspend_count_lock_); - // Install the protected region for implicit stack checks. void InstallImplicitProtection(); @@ -1739,21 +1815,31 @@ class Thread { // Format state and flags as a hex string. For diagnostic output. std::string StateAndFlagsAsHexString() const; - // Run the flip function and, if requested, notify other threads that may have tried + // Run the flip function and notify other threads that may have tried // to do that concurrently. - void RunFlipFunction(Thread* self, bool notify) REQUIRES_SHARED(Locks::mutator_lock_); + void RunFlipFunction(Thread* self) REQUIRES_SHARED(Locks::mutator_lock_); - // Ensure that thread flip function started running. If no other thread is executing - // it, the calling thread shall run the flip function and then notify other threads + // Ensure that thread flip function for thread target started running. If no other thread is + // executing it, the calling thread shall run the flip function and then notify other threads // that have tried to do that concurrently. After this function returns, the - // `ThreadFlag::kPendingFlipFunction` is cleared but another thread may still - // run the flip function as indicated by the `ThreadFlag::kRunningFlipFunction`. - // A non-zero 'old_state_and_flags' indicates that the thread should logically - // acquire mutator lock if we win the race to run the flip function, if a - // suspend request is not already set. A zero 'old_state_and_flags' indicates - // we already hold the mutator lock. - // Returns true if this call ran the flip function. - bool EnsureFlipFunctionStarted(Thread* self, StateAndFlags old_state_and_flags = StateAndFlags(0)) + // `ThreadFlag::kPendingFlipFunction` is cleared but another thread may still be running the + // flip function as indicated by the `ThreadFlag::kRunningFlipFunction`. Optional arguments: + // - old_state_and_flags indicates the current and state and flags value for the thread, with + // at least kPendingFlipFunction set. The thread should logically acquire the + // mutator lock before running the flip function. A special zero value indicates that the + // thread already holds the mutator lock, and the actual state_and_flags must be read. + // A non-zero value implies this == Current(). 
+ // - If tef is non-null, we check that the target thread has not yet exited, as indicated by + // tef. In that case, we acquire thread_list_lock_ as needed. + // - If finished is non-null, we assign to *finished to indicate whether the flip was known to + // be completed when we returned. + // Returns true if and only if we acquired the mutator lock (which implies that we ran the flip + // function after finding old_state_and_flags unchanged). + static bool EnsureFlipFunctionStarted(Thread* self, + Thread* target, + StateAndFlags old_state_and_flags = StateAndFlags(0), + ThreadExitFlag* tef = nullptr, + /*out*/ bool* finished = nullptr) TRY_ACQUIRE_SHARED(true, Locks::mutator_lock_); static void ThreadExitCallback(void* arg); @@ -1799,7 +1885,6 @@ class Thread { throwing_OutOfMemoryError(false), no_thread_suspension(0), thread_exit_check_count(0), - is_transitioning_to_runnable(false), is_gc_marking(false), is_deopt_check_required(false), weak_ref_access_enabled(WeakRefAccessState::kVisiblyEnabled), @@ -1809,8 +1894,7 @@ class Thread { make_visibly_initialized_counter(0), define_class_counter(0), num_name_readers(0), - shared_method_hotness(kSharedMethodHotnessThreshold) - {} + shared_method_hotness(kSharedMethodHotnessThreshold) {} // The state and flags field must be changed atomically so that flag values aren't lost. // See `StateAndFlags` for bit assignments of `ThreadFlag` and `ThreadState` values. @@ -1846,11 +1930,6 @@ class Thread { // How many times has our pthread key's destructor been called? uint32_t thread_exit_check_count; - // True if the thread is in TransitionFromSuspendedToRunnable(). This is used to distinguish the - // non-runnable threads (eg. kNative, kWaiting) that are about to transition to runnable from - // the rest of them. - bool32_t is_transitioning_to_runnable; - // True if the GC is in the marking phase. This is used for the CC collector only. This is // thread local so that we can simplify the logic to check for the fast path of read barriers of // GC roots. @@ -1955,9 +2034,11 @@ class Thread { name(nullptr), pthread_self(0), last_no_thread_suspension_cause(nullptr), - thread_local_start(nullptr), + active_suspendall_barrier(nullptr), + active_suspend1_barriers(nullptr), thread_local_pos(nullptr), thread_local_end(nullptr), + thread_local_start(nullptr), thread_local_limit(nullptr), thread_local_objects(0), checkpoint_function(nullptr), @@ -1969,7 +2050,8 @@ class Thread { async_exception(nullptr), top_reflective_handle_scope(nullptr), method_trace_buffer(nullptr), - method_trace_buffer_index(0) { + method_trace_buffer_index(0), + thread_exit_flags(nullptr) { std::fill(held_mutexes, held_mutexes + kLockLevelCount, nullptr); } @@ -2067,27 +2149,37 @@ class Thread { // If no_thread_suspension_ is > 0, what is causing that assertion. const char* last_no_thread_suspension_cause; - // Pending barriers that require passing or NULL if non-pending. Installation guarding by - // Locks::thread_suspend_count_lock_. - // They work effectively as art::Barrier, but implemented directly using AtomicInteger and futex - // to avoid additional cost of a mutex and a condition variable, as used in art::Barrier. - AtomicInteger* active_suspend_barriers[kMaxSuspendBarriers]; - - // Thread-local allocation pointer. Moved here to force alignment for thread_local_pos on ARM. - uint8_t* thread_local_start; + // After a thread observes a suspend request and enters a suspended state, + // it notifies the requestor by arriving at a "suspend barrier". 
This consists of decrementing + // the atomic integer representing the barrier. (This implementation was introduced in 2015 to + // minimize cost. There may be other options.) These atomic integer barriers are always + // stored on the requesting thread's stack. They are referenced from the target thread's + // data structure in one of two ways; in either case the data structure referring to these + // barriers is guarded by suspend_count_lock: + // 1. A SuspendAll barrier is directly referenced from the target thread. Only one of these + // can be active at a time: + AtomicInteger* active_suspendall_barrier GUARDED_BY(Locks::thread_suspend_count_lock_); + // 2. For individual thread suspensions, active barriers are embedded in a struct that is used + // to link together all suspend requests for this thread. Unlike the SuspendAll case, each + // barrier is referenced by a single target thread, and thus can appear only on a single list. + // The struct as a whole is still stored on the requesting thread's stack. + WrappedSuspend1Barrier* active_suspend1_barriers GUARDED_BY(Locks::thread_suspend_count_lock_); // thread_local_pos and thread_local_end must be consecutive for ldrd and are 8 byte aligned for // potentially better performance. uint8_t* thread_local_pos; uint8_t* thread_local_end; + // Thread-local allocation pointer. Can be moved above the preceding two to correct alignment. + uint8_t* thread_local_start; + // Thread local limit is how much we can expand the thread local buffer to, it is greater or // equal to thread_local_end. uint8_t* thread_local_limit; size_t thread_local_objects; - // Pending checkpoint function or null if non-pending. If this checkpoint is set and someone\ + // Pending checkpoint function or null if non-pending. If this checkpoint is set and someone // requests another checkpoint, it goes to the checkpoint overflow list. Closure* checkpoint_function GUARDED_BY(Locks::thread_suspend_count_lock_); @@ -2110,8 +2202,9 @@ class Thread { // Support for Mutex lock hierarchy bug detection. BaseMutex* held_mutexes[kLockLevelCount]; - // The function used for thread flip. - Closure* flip_function; + // The function used for thread flip. Set while holding Locks::thread_suspend_count_lock_ and + // with all other threads suspended. May be cleared while being read. + std::atomic<Closure*> flip_function; union { // Thread-local mark stack for the concurrent copying collector. @@ -2131,6 +2224,9 @@ class Thread { // The index of the next free entry in method_trace_buffer. size_t method_trace_buffer_index; + + // Pointer to the first node of an intrusively doubly-linked list of ThreadExitFlags. + ThreadExitFlag* thread_exit_flags; } tlsPtr_; // Small thread-local cache to be used from the interpreter. 
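// Illustrative sketch, not ART source: the requester side of the stack-allocated
// SuspendAll barrier described in the comment above. The counter starts at the
// number of other threads; each one decrements it when it suspends, and the
// requester waits for the count to drain (the real code uses a futex and a
// timeout). The polling wait and field names below are simplifications.
#include <atomic>
#include <chrono>
#include <cstdint>
#include <thread>
#include <vector>

namespace sketch {

struct TargetThread {
  // Set under the suspend-count lock together with the suspend-request flag.
  std::atomic<int32_t>* owed_suspendall_barrier = nullptr;

  void OnBecomingSuspended() {
    std::atomic<int32_t>* b = owed_suspendall_barrier;
    owed_suspendall_barrier = nullptr;
    if (b != nullptr) {
      b->fetch_sub(1, std::memory_order_release);  // arrive at the barrier
    }
  }
};

inline void SuspendAll(std::vector<TargetThread*>& others) {
  // The barrier lives in this stack frame, so we must not return until every
  // target has arrived.
  std::atomic<int32_t> barrier(static_cast<int32_t>(others.size()));
  for (TargetThread* t : others) {
    t->owed_suspendall_barrier = &barrier;  // plus setting its suspend request
  }
  while (barrier.load(std::memory_order_acquire) != 0) {
    std::this_thread::sleep_for(std::chrono::microseconds(100));
  }
  // All other threads are now suspended.
}

}  // namespace sketch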
@@ -2278,20 +2374,6 @@ class ScopedDebugDisallowReadBarriers { Thread* const self_; }; -class ScopedTransitioningToRunnable : public ValueObject { - public: - explicit ScopedTransitioningToRunnable(Thread* self) - : self_(self) { - DCHECK_EQ(self, Thread::Current()); - self_->SetIsTransitioningToRunnable(true); - } - - ~ScopedTransitioningToRunnable() { self_->SetIsTransitioningToRunnable(false); } - - private: - Thread* const self_; -}; - class ThreadLifecycleCallback { public: virtual ~ThreadLifecycleCallback() {} diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc index 5c7132425c..f2e6f19b3f 100644 --- a/runtime/thread_list.cc +++ b/runtime/thread_list.cc @@ -67,10 +67,10 @@ namespace art { using android::base::StringPrintf; static constexpr uint64_t kLongThreadSuspendThreshold = MsToNs(5); -// Use 0 since we want to yield to prevent blocking for an unpredictable amount of time. -static constexpr useconds_t kThreadSuspendInitialSleepUs = 0; -static constexpr useconds_t kThreadSuspendMaxYieldUs = 3000; -static constexpr useconds_t kThreadSuspendMaxSleepUs = 5000; +// Track the number of times SuspendAll loops for various reasons. +// TODO(hboehm): Consider changing this back to kIsDebugBuild. However this has been known to only +// fail in non-debug builds. +static constexpr bool kTrackLoopReasons = true; // Whether we should try to dump the native stack of unattached threads. See commit ed8b723 for // some history. @@ -79,7 +79,7 @@ static constexpr bool kDumpUnattachedThreadNativeStackForSigQuit = true; ThreadList::ThreadList(uint64_t thread_suspend_timeout_ns) : suspend_all_count_(0), unregistering_count_(0), - suspend_all_historam_("suspend all histogram", 16, 64), + suspend_all_histogram_("suspend all histogram", 16, 64), long_suspend_(false), shut_down_(false), thread_suspend_timeout_ns_(thread_suspend_timeout_ns), @@ -140,10 +140,10 @@ void ThreadList::DumpForSigQuit(std::ostream& os) { { ScopedObjectAccess soa(Thread::Current()); // Only print if we have samples. - if (suspend_all_historam_.SampleSize() > 0) { + if (suspend_all_histogram_.SampleSize() > 0) { Histogram<uint64_t>::CumulativeData data; - suspend_all_historam_.CreateHistogram(&data); - suspend_all_historam_.PrintConfidenceIntervals(os, 0.99, data); // Dump time to suspend. + suspend_all_histogram_.CreateHistogram(&data); + suspend_all_histogram_.PrintConfidenceIntervals(os, 0.99, data); // Dump time to suspend. } } bool dump_native_stack = Runtime::Current()->GetDumpNativeStackOnSigQuit(); @@ -279,11 +279,11 @@ void ThreadList::Dump(std::ostream& os, bool dump_native_stack) { } } -void ThreadList::AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2) { +void ThreadList::AssertOtherThreadsAreSuspended(Thread* self) { MutexLock mu(self, *Locks::thread_list_lock_); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); for (const auto& thread : list_) { - if (thread != ignore1 && thread != ignore2) { + if (thread != self) { CHECK(thread->IsSuspended()) << "\nUnsuspended thread: <<" << *thread << "\n" << "self: <<" << *Thread::Current(); @@ -310,20 +310,9 @@ NO_RETURN static void UnsafeLogFatalForThreadSuspendAllTimeout() { } #endif -// Unlike suspending all threads where we can wait to acquire the mutator_lock_, suspending an -// individual thread requires polling. delay_us is the requested sleep wait. If delay_us is 0 then -// we use sched_yield instead of calling usleep. 
-// Although there is the possibility, here and elsewhere, that usleep could return -1 and -// errno = EINTR, there should be no problem if interrupted, so we do not check. -static void ThreadSuspendSleep(useconds_t delay_us) { - if (delay_us == 0) { - sched_yield(); - } else { - usleep(delay_us); - } -} - -size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback) { +size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, + Closure* callback, + bool allow_lock_checking) { Thread* self = Thread::Current(); Locks::mutator_lock_->AssertNotExclusiveHeld(self); Locks::thread_list_lock_->AssertNotHeld(self); @@ -332,9 +321,12 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback std::vector<Thread*> suspended_count_modified_threads; size_t count = 0; { - // Call a checkpoint function for each thread, threads which are suspended get their checkpoint - // manually called. + // Call a checkpoint function for each thread. We directly invoke the function on behalf of + // suspended threads. MutexLock mu(self, *Locks::thread_list_lock_); + if (kIsDebugBuild && allow_lock_checking) { + self->DisallowPreMonitorMutexes(); + } MutexLock mu2(self, *Locks::thread_suspend_count_lock_); count = list_.size(); for (const auto& thread : list_) { @@ -345,39 +337,35 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback // This thread will run its checkpoint some time in the near future. if (requested_suspend) { // The suspend request is now unnecessary. - bool updated = - thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); - DCHECK(updated); + thread->DecrementSuspendCount(self); requested_suspend = false; } break; } else { - // The thread is probably suspended, try to make sure that it stays suspended. - if (thread->GetState() == ThreadState::kRunnable) { - // Spurious fail, try again. - continue; - } + // The thread was, and probably still is, suspended. if (!requested_suspend) { - bool updated = - thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal); - DCHECK(updated); + // This does not risk suspension cycles: We may have a pending suspension request, + // but it cannot block us: Checkpoint Run() functions may not suspend, thus we cannot + // be blocked from decrementing the count again. + thread->IncrementSuspendCount(self); requested_suspend = true; - if (thread->IsSuspended()) { - break; - } - // The thread raced us to become Runnable. Try to RequestCheckpoint() again. - } else { - // The thread previously raced our suspend request to become Runnable but - // since it is suspended again, it must honor that suspend request now. - DCHECK(thread->IsSuspended()); + } + if (thread->IsSuspended()) { + // We saw it suspended after incrementing suspend count, so it will stay that way. break; } } + // We only get here if the thread entered kRunnable again. Retry immediately. } + // At this point, either the thread was runnable, and will run the checkpoint itself, + // or requested_suspend is true, and the thread is safely suspended. if (requested_suspend) { + DCHECK(thread->IsSuspended()); suspended_count_modified_threads.push_back(thread); } } + // Thread either has honored or will honor the checkpoint, or it has been added to + // suspended_count_modified_threads. } // Run the callback to be called inside this critical section. 
if (callback != nullptr) { @@ -388,6 +376,7 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback // Run the checkpoint on ourself while we wait for threads to suspend. checkpoint_function->Run(self); + bool mutator_lock_held = Locks::mutator_lock_->IsSharedHeld(self); bool repeat = true; // Run the checkpoint on the suspended threads. while (repeat) { @@ -396,21 +385,23 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback if (thread != nullptr) { // We know for sure that the thread is suspended at this point. DCHECK(thread->IsSuspended()); - // Make sure there is no pending flip function before running checkpoint - // on behalf of thread. - thread->EnsureFlipFunctionStarted(self); - if (thread->GetStateAndFlags(std::memory_order_acquire) - .IsAnyOfFlagsSet(Thread::FlipFunctionFlags())) { - // There is another thread running the flip function for 'thread'. - // Instead of waiting for it to complete, move to the next thread. - repeat = true; - continue; - } + if (mutator_lock_held) { + // Make sure there is no pending flip function before running Java-heap-accessing + // checkpoint on behalf of thread. + Thread::EnsureFlipFunctionStarted(self, thread); + if (thread->GetStateAndFlags(std::memory_order_acquire) + .IsAnyOfFlagsSet(Thread::FlipFunctionFlags())) { + // There is another thread running the flip function for 'thread'. + // Instead of waiting for it to complete, move to the next thread. + repeat = true; + continue; + } + } // O.w. the checkpoint will not access Java data structures, and doesn't care whether + // the flip function has been called. checkpoint_function->Run(thread); { MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); - DCHECK(updated); + thread->DecrementSuspendCount(self); } // We are done with 'thread' so set it to nullptr so that next outer // loop iteration, if any, skips 'thread'. @@ -429,6 +420,9 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback Thread::resume_cond_->Broadcast(self); } + if (kIsDebugBuild && allow_lock_checking) { + self->AllowPreMonitorMutexes(); + } return count; } @@ -537,113 +531,153 @@ void ThreadList::RunEmptyCheckpoint() { } } +// Separate function to disable just the right amount of thread-safety analysis. +ALWAYS_INLINE void AcquireMutatorLockSharedUncontended(Thread* self) + ACQUIRE_SHARED(*Locks::mutator_lock_) NO_THREAD_SAFETY_ANALYSIS { + bool success = Locks::mutator_lock_->SharedTryLock(self, /*check=*/false); + CHECK(success); +} + // A checkpoint/suspend-all hybrid to switch thread roots from // from-space to to-space refs. Used to synchronize threads at a point // to mark the initiation of marking while maintaining the to-space // invariant. 
-size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, - Closure* flip_callback, - gc::collector::GarbageCollector* collector, - gc::GcPauseListener* pause_listener) { +void ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, + Closure* flip_callback, + gc::collector::GarbageCollector* collector, + gc::GcPauseListener* pause_listener) { TimingLogger::ScopedTiming split("ThreadListFlip", collector->GetTimings()); Thread* self = Thread::Current(); Locks::mutator_lock_->AssertNotHeld(self); Locks::thread_list_lock_->AssertNotHeld(self); Locks::thread_suspend_count_lock_->AssertNotHeld(self); CHECK_NE(self->GetState(), ThreadState::kRunnable); - size_t runnable_thread_count = 0; - std::vector<Thread*> other_threads; collector->GetHeap()->ThreadFlipBegin(self); // Sync with JNI critical calls. // ThreadFlipBegin happens before we suspend all the threads, so it does not // count towards the pause. const uint64_t suspend_start_time = NanoTime(); - SuspendAllInternal(self, self, nullptr); + VLOG(threads) << "Suspending all for thread flip"; + SuspendAllInternal(self); if (pause_listener != nullptr) { pause_listener->StartPause(); } // Run the flip callback for the collector. Locks::mutator_lock_->ExclusiveLock(self); - suspend_all_historam_.AdjustAndAddValue(NanoTime() - suspend_start_time); + suspend_all_histogram_.AdjustAndAddValue(NanoTime() - suspend_start_time); flip_callback->Run(self); - // Releasing mutator-lock *before* setting up flip function in the threads - // leaves a gap for another thread trying to suspend all threads. That thread - // gets to run with mutator-lock, thereby accessing the heap, without running - // its flip function. It's not a problem with CC as the gc-thread hasn't - // started marking yet and the from-space is accessible. By delaying releasing - // mutator-lock until after the flip function are running on all threads we - // fix that without increasing pause time, except for any thread that might be - // trying to suspend all. Even though the change works irrespective of the GC, - // it has been limited to userfaultfd GC to keep the change behind the flag. - // - // TODO: It's a temporary change as aosp/2377951 is going to clean-up at a - // broad scale, including not allowing concurrent suspend-all. - // Resume runnable threads. + std::vector<Thread*> flipping_threads; // All suspended threads. Includes us. + int thread_count; + // Flipping threads might exit between the time we resume them and try to run the flip function. + // Track that in a parallel vector. + std::unique_ptr<ThreadExitFlag[]> exit_flags; { TimingLogger::ScopedTiming split2("ResumeRunnableThreads", collector->GetTimings()); MutexLock mu(self, *Locks::thread_list_lock_); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - --suspend_all_count_; + thread_count = list_.size(); + exit_flags.reset(new ThreadExitFlag[thread_count]); + flipping_threads.resize(thread_count, nullptr); + int i = 1; for (Thread* thread : list_) { // Set the flip function for all threads because once we start resuming any threads, // they may need to run the flip function on behalf of other threads, even this one. + DCHECK(thread == self || thread->IsSuspended()); thread->SetFlipFunction(thread_flip_visitor); - if (thread == self) { - continue; - } - // Resume early the threads that were runnable but are suspended just for this thread flip or - // about to transition from non-runnable (eg. 
kNative at the SOA entry in a JNI function) to - // runnable (both cases waiting inside Thread::TransitionFromSuspendedToRunnable), or waiting - // for the thread flip to end at the JNI critical section entry (kWaitingForGcThreadFlip), - ThreadState state = thread->GetState(); - if ((state == ThreadState::kWaitingForGcThreadFlip || thread->IsTransitioningToRunnable()) && - thread->GetSuspendCount() == 1) { - // The thread will resume right after the broadcast. - bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); - DCHECK(updated); - ++runnable_thread_count; - } else { - other_threads.push_back(thread); - } + // Put ourselves first, so other threads are more likely to have finished before we get + // there. + int thread_index = thread == self ? 0 : i++; + flipping_threads[thread_index] = thread; + thread->NotifyOnThreadExit(&exit_flags[thread_index]); } - Thread::resume_cond_->Broadcast(self); + DCHECK(i == thread_count); } - collector->RegisterPause(NanoTime() - suspend_start_time); if (pause_listener != nullptr) { pause_listener->EndPause(); } - collector->GetHeap()->ThreadFlipEnd(self); + // Any new threads created after this will be created by threads that already ran their flip + // functions. In the normal GC use case in which the flip function converts all local references + // to to-space references, these newly created threads will also see only to-space references. - // Try to run the closure on the other threads. + // Resume threads, making sure that we do not release suspend_count_lock_ until we've reacquired + // the mutator_lock_ in shared mode, and decremented suspend_all_count_. This avoids a + // concurrent SuspendAll, and ensures that newly started threads see a correct value of + // suspend_all_count. { - TimingLogger::ScopedTiming split3("FlipOtherThreads", collector->GetTimings()); - for (Thread* thread : other_threads) { - thread->EnsureFlipFunctionStarted(self); - DCHECK(!thread->ReadFlag(ThreadFlag::kPendingFlipFunction)); - } - // Try to run the flip function for self. - self->EnsureFlipFunctionStarted(self); - DCHECK(!self->ReadFlag(ThreadFlag::kPendingFlipFunction)); + MutexLock mu(self, *Locks::thread_list_lock_); + Locks::thread_suspend_count_lock_->Lock(self); + ResumeAllInternal(self); } - Locks::mutator_lock_->ExclusiveUnlock(self); + collector->RegisterPause(NanoTime() - suspend_start_time); - // Resume other threads. - { - TimingLogger::ScopedTiming split4("ResumeOtherThreads", collector->GetTimings()); - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - for (const auto& thread : other_threads) { - bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); - DCHECK(updated); + // Since all threads were suspended, they will attempt to run the flip function before + // reentering a runnable state. We will also attempt to run the flip functions ourselves. Any + // intervening checkpoint request will do the same. Exactly one of those flip function attempts + // will succeed, and the target thread will not be able to reenter a runnable state until one of + // them does. + + // Try to run the closure on the other threads. + TimingLogger::ScopedTiming split3("RunningThreadFlips", collector->GetTimings()); + // Reacquire the mutator lock while holding suspend_count_lock. This cannot fail, since we + // do not acquire the mutator lock unless suspend_all_count was read as 0 while holding + // suspend_count_lock. We did not release suspend_count_lock since releasing the mutator + // lock. 
+ AcquireMutatorLockSharedUncontended(self); + + Locks::thread_suspend_count_lock_->Unlock(self); + // Concurrent SuspendAll may now see zero suspend_all_count_, but block on mutator_lock_. + + collector->GetHeap()->ThreadFlipEnd(self); + + for (int i = 0; i < thread_count; ++i) { + bool finished; + Thread::EnsureFlipFunctionStarted( + self, flipping_threads[i], Thread::StateAndFlags(0), &exit_flags[i], &finished); + if (finished) { + MutexLock mu2(self, *Locks::thread_list_lock_); + flipping_threads[i]->UnregisterThreadExitFlag(&exit_flags[i]); + flipping_threads[i] = nullptr; } - Thread::resume_cond_->Broadcast(self); } + // Make sure all flips complete before we return. + for (int i = 0; i < thread_count; ++i) { + if (UNLIKELY(flipping_threads[i] != nullptr)) { + flipping_threads[i]->WaitForFlipFunctionTestingExited(self, &exit_flags[i]); + MutexLock mu2(self, *Locks::thread_list_lock_); + flipping_threads[i]->UnregisterThreadExitFlag(&exit_flags[i]); + } + } + Locks::mutator_lock_->SharedUnlock(self); +} - return runnable_thread_count + other_threads.size() + 1; // +1 for self. +bool ThreadList::WaitForSuspendBarrier(AtomicInteger* barrier) { +#if ART_USE_FUTEXES + timespec wait_timeout; + InitTimeSpec(false, CLOCK_MONOTONIC, NsToMs(thread_suspend_timeout_ns_), 0, &wait_timeout); +#endif + while (true) { + int32_t cur_val = barrier->load(std::memory_order_acquire); + if (cur_val <= 0) { + CHECK_EQ(cur_val, 0); + return true; + } +#if ART_USE_FUTEXES + if (futex(barrier->Address(), FUTEX_WAIT_PRIVATE, cur_val, &wait_timeout, nullptr, 0) != 0) { + if (errno == ETIMEDOUT) { + return false; + } else if (errno != EAGAIN && errno != EINTR) { + PLOG(FATAL) << "futex wait for suspend barrier failed"; + } + } +#endif + // Else spin wait. This is likely to be slow, but ART_USE_FUTEXES is set on Linux, + // including all targets. + } } void ThreadList::SuspendAll(const char* cause, bool long_suspend) { @@ -658,7 +692,7 @@ void ThreadList::SuspendAll(const char* cause, bool long_suspend) { ScopedTrace trace("Suspending mutator threads"); const uint64_t start_time = NanoTime(); - SuspendAllInternal(self, self); + SuspendAllInternal(self); // All threads are known to have suspended (but a thread may still own the mutator lock) // Make sure this thread grabs exclusive access to the mutator lock and its protected data. #if HAVE_TIMED_RWLOCK @@ -682,16 +716,21 @@ void ThreadList::SuspendAll(const char* cause, bool long_suspend) { const uint64_t end_time = NanoTime(); const uint64_t suspend_time = end_time - start_time; - suspend_all_historam_.AdjustAndAddValue(suspend_time); + suspend_all_histogram_.AdjustAndAddValue(suspend_time); if (suspend_time > kLongThreadSuspendThreshold) { LOG(WARNING) << "Suspending all threads took: " << PrettyDuration(suspend_time); } if (kDebugLocking) { // Debug check that all threads are suspended. - AssertThreadsAreSuspended(self, self); + AssertOtherThreadsAreSuspended(self); } } + + // SuspendAllInternal blocks if we are in the middle of a flip. + DCHECK(!self->ReadFlag(ThreadFlag::kPendingFlipFunction)); + DCHECK(!self->ReadFlag(ThreadFlag::kRunningFlipFunction)); + ATraceBegin((std::string("Mutator threads suspended for ") + cause).c_str()); if (self != nullptr) { @@ -701,11 +740,32 @@ void ThreadList::SuspendAll(const char* cause, bool long_suspend) { } } +// Only used for kIsDebugBuild statistics. 
+struct DelayInfo { + DelayInfo() : suspend_all_(0), self_susp_(0) {} + int suspend_all_; + int self_susp_; +}; + +static inline std::ostream& operator<<(std::ostream& os, DelayInfo di) { + if (kTrackLoopReasons) { + return os << "suspend_all = " << di.suspend_all_ << " self_suspend = " << di.self_susp_; + } else { + return os << "(no delay info)"; + } +} + +static inline bool TrackIfFalse(bool cond, int* field) { + if (kTrackLoopReasons && !cond) { + ++*field; + } + return cond; +} + // Ensures all threads running Java suspend and that those not running Java don't start. -void ThreadList::SuspendAllInternal(Thread* self, - Thread* ignore1, - Thread* ignore2, - SuspendReason reason) { +void ThreadList::SuspendAllInternal(Thread* self, SuspendReason reason) { + // self can be nullptr if this is an unregistered thread. + const uint64_t start_time = NanoTime(); Locks::mutator_lock_->AssertNotExclusiveHeld(self); Locks::thread_list_lock_->AssertNotHeld(self); Locks::thread_suspend_count_lock_->AssertNotHeld(self); @@ -723,91 +783,102 @@ void ThreadList::SuspendAllInternal(Thread* self, // The atomic counter for number of threads that need to pass the barrier. AtomicInteger pending_threads; - uint32_t num_ignored = 0; - if (ignore1 != nullptr) { - ++num_ignored; - } - if (ignore2 != nullptr && ignore1 != ignore2) { - ++num_ignored; - } - { - MutexLock mu(self, *Locks::thread_list_lock_); - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - // Update global suspend all state for attaching threads. - ++suspend_all_count_; - pending_threads.store(list_.size() - num_ignored, std::memory_order_relaxed); - // Increment everybody's suspend count (except those that should be ignored). - for (const auto& thread : list_) { - if (thread == ignore1 || thread == ignore2) { - continue; + + DelayInfo di; + for (int iter_count = 1;; ++iter_count) { + { + MutexLock mu(self, *Locks::thread_list_lock_); + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + if (LIKELY(TrackIfFalse(suspend_all_count_ == 0, &di.suspend_all_) && + TrackIfFalse(self == nullptr || self->GetSuspendCount() == 0, &di.self_susp_))) { + // The above condition remains valid while we hold thread_suspend_count_lock_ . + bool found_myself = false; + // Update global suspend all state for attaching threads. + ++suspend_all_count_; + pending_threads.store(list_.size() - (self == nullptr ? 0 : 1), std::memory_order_relaxed); + // Increment everybody else's suspend count. + for (const auto& thread : list_) { + if (thread == self) { + found_myself = true; + } else { + VLOG(threads) << "requesting thread suspend: " << *thread; + DCHECK_EQ(suspend_all_count_, 1); + thread->IncrementSuspendCount(self, &pending_threads, nullptr, reason); + if (thread->IsSuspended()) { + // Effectively pass the barrier on behalf of the already suspended thread. + // The thread itself cannot yet have acted on our request since we still hold the + // suspend_count_lock_, and it will notice that kActiveSuspendBarrier has already + // been cleared if and when it acquires the lock in PassActiveSuspendBarriers(). + DCHECK_EQ(thread->tlsPtr_.active_suspendall_barrier, &pending_threads); + pending_threads.fetch_sub(1, std::memory_order_seq_cst); + thread->tlsPtr_.active_suspendall_barrier = nullptr; + if (!thread->HasActiveSuspendBarrier()) { + thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); + } + } + // else: + // The target thread was not yet suspended, and hence will be forced to execute + // TransitionFromRunnableToSuspended shortly. 
Since we set the kSuspendRequest flag + // before checking, and it checks kActiveSuspendBarrier after noticing kSuspendRequest, + // it must notice kActiveSuspendBarrier when it does. Thus it is guaranteed to + // decrement the suspend barrier. We're relying on store; load ordering here, but + // that's not a problem, since state and flags all reside in the same atomic, and + // are thus properly ordered, even for relaxed accesses. + } + } + DCHECK(self == nullptr || found_myself); + break; } - VLOG(threads) << "requesting thread suspend: " << *thread; - bool updated = thread->ModifySuspendCount(self, +1, &pending_threads, reason); - DCHECK(updated); - - // Must install the pending_threads counter first, then check thread->IsSuspend() and clear - // the counter. Otherwise there's a race with Thread::TransitionFromRunnableToSuspended() - // that can lead a thread to miss a call to PassActiveSuspendBarriers(). - if (thread->IsSuspended()) { - // Only clear the counter for the current thread. - thread->ClearSuspendBarrier(&pending_threads); - pending_threads.fetch_sub(1, std::memory_order_seq_cst); + } + // The if (LIKELY ...) condition above didn't hold. This is a bad time to initiate a suspend. + // Either the suspendall_barrier is already in use, or proceeding at this time risks deadlock. + // Note that in the event of a competing Suspend or SuspendAll, we are about to be "suspended" + // anyway, but this wouldn't stop us from continuing to loop. We hold no locks, so it's safe + // to sleep and retry. + if (iter_count >= kMaxSuspendRetries) { + LOG(FATAL) << "Too many SuspendAll retries: " << di; + } + { + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + DCHECK_LE(suspend_all_count_, 1); + if (suspend_all_count_ != 0) { + // This may take a while, and we're not runnable, and thus would otherwise not block. + Thread::resume_cond_->WaitHoldingLocks(self); + continue; } } + usleep(kThreadSuspendSleepUs); + // We're already not runnable, so an attempt to suspend us should succeed. } - // Wait for the barrier to be passed by all runnable threads. This wait - // is done with a timeout so that we can detect problems. -#if ART_USE_FUTEXES - timespec wait_timeout; - InitTimeSpec(false, CLOCK_MONOTONIC, NsToMs(thread_suspend_timeout_ns_), 0, &wait_timeout); -#endif - const uint64_t start_time = NanoTime(); - while (true) { - int32_t cur_val = pending_threads.load(std::memory_order_relaxed); - if (LIKELY(cur_val > 0)) { -#if ART_USE_FUTEXES - if (futex(pending_threads.Address(), FUTEX_WAIT_PRIVATE, cur_val, &wait_timeout, nullptr, 0) - != 0) { - if ((errno == EAGAIN) || (errno == EINTR)) { - // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning. - continue; - } - if (errno == ETIMEDOUT) { - const uint64_t wait_time = NanoTime() - start_time; - MutexLock mu(self, *Locks::thread_list_lock_); - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - std::ostringstream oss; - for (const auto& thread : list_) { - if (thread == ignore1 || thread == ignore2) { - continue; - } - if (!thread->IsSuspended()) { - oss << std::endl << "Thread not suspended: " << *thread; - } - } - LOG(kIsDebugBuild ? ::android::base::FATAL : ::android::base::ERROR) - << "Timed out waiting for threads to suspend, waited for " - << PrettyDuration(wait_time) - << oss.str(); - } else { - PLOG(FATAL) << "futex wait failed for SuspendAllInternal()"; - } - } // else re-check pending_threads in the next iteration (this may be a spurious wake-up). -#else - // Spin wait. 
This is likely to be slow, but on most architecture ART_USE_FUTEXES is set. - UNUSED(start_time); -#endif - } else { - CHECK_EQ(cur_val, 0); - break; + if (!WaitForSuspendBarrier(&pending_threads)) { + const uint64_t wait_time = NanoTime() - start_time; + MutexLock mu(self, *Locks::thread_list_lock_); + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + std::ostringstream oss; + for (const auto& thread : list_) { + if (thread != self && !thread->IsSuspended()) { + oss << std::endl << "Thread not suspended: " << *thread; + } } + LOG(::android::base::FATAL) << "Timed out waiting for threads to suspend, waited for " + << PrettyDuration(wait_time) << oss.str(); } } void ThreadList::ResumeAll() { Thread* self = Thread::Current(); + if (kDebugLocking) { + // Debug check that all threads are suspended. + AssertOtherThreadsAreSuspended(self); + } + MutexLock mu(self, *Locks::thread_list_lock_); + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + ResumeAllInternal(self); +} +// Holds thread_list_lock_ and suspend_count_lock_ +void ThreadList::ResumeAllInternal(Thread* self) { if (self != nullptr) { VLOG(threads) << *self << " ResumeAll starting"; } else { @@ -818,38 +889,30 @@ void ThreadList::ResumeAll() { ScopedTrace trace("Resuming mutator threads"); - if (kDebugLocking) { - // Debug check that all threads are suspended. - AssertThreadsAreSuspended(self, self); - } - long_suspend_ = false; Locks::mutator_lock_->ExclusiveUnlock(self); - { - MutexLock mu(self, *Locks::thread_list_lock_); - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - // Update global suspend all state for attaching threads. - --suspend_all_count_; - // Decrement the suspend counts for all threads. - for (const auto& thread : list_) { - if (thread == self) { - continue; - } - bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); - DCHECK(updated); - } - // Broadcast a notification to all suspended threads, some or all of - // which may choose to wake up. No need to wait for them. - if (self != nullptr) { - VLOG(threads) << *self << " ResumeAll waking others"; - } else { - VLOG(threads) << "Thread[null] ResumeAll waking others"; + // Decrement the suspend counts for all threads. + for (const auto& thread : list_) { + if (thread != self) { + thread->DecrementSuspendCount(self); } - Thread::resume_cond_->Broadcast(self); } + // Update global suspend all state for attaching threads. Unblocks other SuspendAlls once + // suspend_count_lock_ is released. + --suspend_all_count_; + + // Broadcast a notification to all suspended threads, some or all of + // which may choose to wake up. No need to wait for them. 
+ if (self != nullptr) { + VLOG(threads) << *self << " ResumeAll waking others"; + } else { + VLOG(threads) << "Thread[null] ResumeAll waking others"; + } + Thread::resume_cond_->Broadcast(self); + if (self != nullptr) { VLOG(threads) << *self << " ResumeAll complete"; } else { @@ -882,11 +945,7 @@ bool ThreadList::Resume(Thread* thread, SuspendReason reason) { << ") thread not within thread list"; return false; } - if (UNLIKELY(!thread->ModifySuspendCount(self, -1, nullptr, reason))) { - LOG(ERROR) << "Resume(" << reinterpret_cast<void*>(thread) - << ") could not modify suspend count."; - return false; - } + thread->DecrementSuspendCount(self, /*for_user_code=*/(reason == SuspendReason::kForUserCode)); } { @@ -912,40 +971,19 @@ static void ThreadSuspendByPeerWarning(ScopedObjectAccess& soa, } } -Thread* ThreadList::SuspendThreadByPeer(jobject peer, - SuspendReason reason, - bool* timed_out) { - bool request_suspension = true; - const uint64_t start_time = NanoTime(); - int self_suspend_count = 0; - useconds_t sleep_us = kThreadSuspendInitialSleepUs; - *timed_out = false; +Thread* ThreadList::SuspendThreadByPeer(jobject peer, SuspendReason reason) { + bool is_suspended = false; Thread* const self = Thread::Current(); - Thread* suspended_thread = nullptr; VLOG(threads) << "SuspendThreadByPeer starting"; - while (true) { - Thread* thread; + Thread* thread; + WrappedSuspend1Barrier wrapped_barrier{}; + for (int iter_count = 1;; ++iter_count) { { - // Note: this will transition to runnable and potentially suspend. We ensure only one thread - // is requesting another suspend, to avoid deadlock, by requiring this function be called - // holding Locks::thread_list_suspend_thread_lock_. Its important this thread suspend rather - // than request thread suspension, to avoid potential cycles in threads requesting each other - // suspend. + // Note: this will transition to runnable and potentially suspend. ScopedObjectAccess soa(self); MutexLock thread_list_mu(self, *Locks::thread_list_lock_); thread = Thread::FromManagedThread(soa, peer); if (thread == nullptr) { - if (suspended_thread != nullptr) { - MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_); - // If we incremented the suspend count but the thread reset its peer, we need to - // re-decrement it since it is shutting down and may deadlock the runtime in - // ThreadList::WaitForOtherNonDaemonThreadsToExit. - bool updated = suspended_thread->ModifySuspendCount(soa.Self(), - -1, - nullptr, - reason); - DCHECK(updated); - } ThreadSuspendByPeerWarning(soa, ::android::base::WARNING, "No such thread for suspend", @@ -953,84 +991,64 @@ Thread* ThreadList::SuspendThreadByPeer(jobject peer, return nullptr; } if (!Contains(thread)) { - CHECK(suspended_thread == nullptr); VLOG(threads) << "SuspendThreadByPeer failed for unattached thread: " << reinterpret_cast<void*>(thread); return nullptr; } + // IsSuspended on the current thread will fail as the current thread is changed into + // Runnable above. As the suspend count is now raised if this is the current thread + // it will self suspend on transition to Runnable, making it hard to work with. It's simpler + // to just explicitly handle the current thread in the callers to this code. 
+ CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger"; VLOG(threads) << "SuspendThreadByPeer found thread: " << *thread; { MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_); - if (request_suspension) { - if (self->GetSuspendCount() > 0) { - // We hold the suspend count lock but another thread is trying to suspend us. Its not - // safe to try to suspend another thread in case we get a cycle. Start the loop again - // which will allow this thread to be suspended. - ++self_suspend_count; - continue; - } - CHECK(suspended_thread == nullptr); - suspended_thread = thread; - bool updated = suspended_thread->ModifySuspendCount(self, +1, nullptr, reason); - DCHECK(updated); - request_suspension = false; - } else { - // If the caller isn't requesting suspension, a suspension should have already occurred. - CHECK_GT(thread->GetSuspendCount(), 0); - } - // IsSuspended on the current thread will fail as the current thread is changed into - // Runnable above. As the suspend count is now raised if this is the current thread - // it will self suspend on transition to Runnable, making it hard to work with. It's simpler - // to just explicitly handle the current thread in the callers to this code. - CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger"; - // If thread is suspended (perhaps it was already not Runnable but didn't have a suspend - // count, or else we've waited and it has self suspended) or is the current thread, we're - // done. - if (thread->IsSuspended()) { - VLOG(threads) << "SuspendThreadByPeer thread suspended: " << *thread; - if (ATraceEnabled()) { - std::string name; - thread->GetThreadName(name); - ATraceBegin(StringPrintf("SuspendThreadByPeer suspended %s for peer=%p", name.c_str(), - peer).c_str()); - } - return thread; - } - const uint64_t total_delay = NanoTime() - start_time; - if (total_delay >= thread_suspend_timeout_ns_) { - if (suspended_thread == nullptr) { - ThreadSuspendByPeerWarning(soa, - ::android::base::FATAL, - "Failed to issue suspend request", - peer); - } else { - CHECK_EQ(suspended_thread, thread); - LOG(WARNING) << "Suspended thread state_and_flags: " - << suspended_thread->StateAndFlagsAsHexString() - << ", self_suspend_count = " << self_suspend_count; - // Explicitly release thread_suspend_count_lock_; we haven't held it for long, so - // seeing threads blocked on it is not informative. - Locks::thread_suspend_count_lock_->Unlock(self); - ThreadSuspendByPeerWarning(soa, - ::android::base::FATAL, - "Thread suspension timed out", - peer); + if (LIKELY(self->GetSuspendCount() == 0)) { + thread->IncrementSuspendCount(self, nullptr, &wrapped_barrier, reason); + if (thread->IsSuspended()) { + // See the discussion in mutator_gc_coord.md and SuspendAllInternal for the race here. + thread->RemoveFirstSuspend1Barrier(); + if (!thread->HasActiveSuspendBarrier()) { + thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); + } + is_suspended = true; } - UNREACHABLE(); - } else if (sleep_us == 0 && - total_delay > static_cast<uint64_t>(kThreadSuspendMaxYieldUs) * 1000) { - // We have spun for kThreadSuspendMaxYieldUs time, switch to sleeps to prevent - // excessive CPU usage. - sleep_us = kThreadSuspendMaxYieldUs / 2; + DCHECK_GT(thread->GetSuspendCount(), 0); + break; } + // Else we either hold the suspend count lock but another thread is trying to suspend us, + // making it unsafe to try to suspend another thread in case we get a cycle. 
In either + // case, start the loop again, which will allow this thread to be suspended. } - // Release locks and come out of runnable state. } - VLOG(threads) << "SuspendThreadByPeer waiting to allow thread chance to suspend"; - ThreadSuspendSleep(sleep_us); - // This may stay at 0 if sleep_us == 0, but this is WAI since we want to avoid using usleep at - // all if possible. This shouldn't be an issue since time to suspend should always be small. - sleep_us = std::min(sleep_us * 2, kThreadSuspendMaxSleepUs); + // All locks are released, and we should quickly exit the suspend-unfriendly state. Retry. + if (iter_count >= kMaxSuspendRetries) { + LOG(FATAL) << "Too many suspend retries"; + } + usleep(kThreadSuspendSleepUs); + } + // Now wait for target to decrement suspend barrier. + if (is_suspended || WaitForSuspendBarrier(&wrapped_barrier.barrier_)) { + // wrapped_barrier.barrier_ has been decremented and will no longer be accessed. + VLOG(threads) << "SuspendThreadByPeer thread suspended: " << *thread; + if (ATraceEnabled()) { + std::string name; + thread->GetThreadName(name); + ATraceBegin( + StringPrintf("SuspendThreadByPeer suspended %s for peer=%p", name.c_str(), peer).c_str()); + } + DCHECK(thread->IsSuspended()); + return thread; + } else { + LOG(WARNING) << "Suspended thread state_and_flags: " << thread->StateAndFlagsAsHexString(); + // thread still has a pointer to wrapped_barrier. Returning and continuing would be unsafe + // without additional cleanup. + { + ScopedObjectAccess soa(self); + ThreadSuspendByPeerWarning( + soa, ::android::base::FATAL, "SuspendThreadByPeer timed out", peer); + } + UNREACHABLE(); } } @@ -1040,101 +1058,74 @@ static void ThreadSuspendByThreadIdWarning(LogSeverity severity, LOG(severity) << StringPrintf("%s: %d", message, thread_id); } -Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, - SuspendReason reason, - bool* timed_out) { - const uint64_t start_time = NanoTime(); - useconds_t sleep_us = kThreadSuspendInitialSleepUs; - *timed_out = false; - Thread* suspended_thread = nullptr; +Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason) { + bool is_suspended = false; Thread* const self = Thread::Current(); CHECK_NE(thread_id, kInvalidThreadId); VLOG(threads) << "SuspendThreadByThreadId starting"; - while (true) { + Thread* thread; + WrappedSuspend1Barrier wrapped_barrier{}; + for (int iter_count = 1;; ++iter_count) { { - // Note: this will transition to runnable and potentially suspend. We ensure only one thread - // is requesting another suspend, to avoid deadlock, by requiring this function be called - // holding Locks::thread_list_suspend_thread_lock_. Its important this thread suspend rather - // than request thread suspension, to avoid potential cycles in threads requesting each other - // suspend. + // Note: this will transition to runnable and potentially suspend. ScopedObjectAccess soa(self); MutexLock thread_list_mu(self, *Locks::thread_list_lock_); - Thread* thread = nullptr; - for (const auto& it : list_) { - if (it->GetThreadId() == thread_id) { - thread = it; - break; - } - } + thread = FindThreadByThreadId(thread_id); if (thread == nullptr) { - CHECK(suspended_thread == nullptr) << "Suspended thread " << suspended_thread - << " no longer in thread list"; // There's a race in inflating a lock and the owner giving up ownership and then dying. 
ThreadSuspendByThreadIdWarning(::android::base::WARNING, "No such thread id for suspend", thread_id); return nullptr; } - VLOG(threads) << "SuspendThreadByThreadId found thread: " << *thread; DCHECK(Contains(thread)); + CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger"; + VLOG(threads) << "SuspendThreadByThreadId found thread: " << *thread; { MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_); - if (suspended_thread == nullptr) { - if (self->GetSuspendCount() > 0) { - // We hold the suspend count lock but another thread is trying to suspend us. Its not - // safe to try to suspend another thread in case we get a cycle. Start the loop again - // which will allow this thread to be suspended. - continue; - } - bool updated = thread->ModifySuspendCount(self, +1, nullptr, reason); - DCHECK(updated); - suspended_thread = thread; - } else { - CHECK_EQ(suspended_thread, thread); - // If the caller isn't requesting suspension, a suspension should have already occurred. - CHECK_GT(thread->GetSuspendCount(), 0); - } - // IsSuspended on the current thread will fail as the current thread is changed into - // Runnable above. As the suspend count is now raised if this is the current thread - // it will self suspend on transition to Runnable, making it hard to work with. It's simpler - // to just explicitly handle the current thread in the callers to this code. - CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger"; - // If thread is suspended (perhaps it was already not Runnable but didn't have a suspend - // count, or else we've waited and it has self suspended) or is the current thread, we're - // done. - if (thread->IsSuspended()) { - if (ATraceEnabled()) { - std::string name; - thread->GetThreadName(name); - ATraceBegin(StringPrintf("SuspendThreadByThreadId suspended %s id=%d", - name.c_str(), thread_id).c_str()); - } - VLOG(threads) << "SuspendThreadByThreadId thread suspended: " << *thread; - return thread; - } - const uint64_t total_delay = NanoTime() - start_time; - if (total_delay >= thread_suspend_timeout_ns_) { - ThreadSuspendByThreadIdWarning(::android::base::WARNING, - "Thread suspension timed out", - thread_id); - if (suspended_thread != nullptr) { - bool updated = thread->ModifySuspendCount(soa.Self(), -1, nullptr, reason); - DCHECK(updated); + if (LIKELY(self->GetSuspendCount() == 0)) { + thread->IncrementSuspendCount(self, nullptr, &wrapped_barrier, reason); + if (thread->IsSuspended()) { + // See the discussion in mutator_gc_coord.md and SuspendAllInternal for the race here. + thread->RemoveFirstSuspend1Barrier(); + if (!thread->HasActiveSuspendBarrier()) { + thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); + } + is_suspended = true; } - *timed_out = true; - return nullptr; - } else if (sleep_us == 0 && - total_delay > static_cast<uint64_t>(kThreadSuspendMaxYieldUs) * 1000) { - // We have spun for kThreadSuspendMaxYieldUs time, switch to sleeps to prevent - // excessive CPU usage. - sleep_us = kThreadSuspendMaxYieldUs / 2; + DCHECK_GT(thread->GetSuspendCount(), 0); + break; } + // Else we either hold the suspend count lock but another thread is trying to suspend us, + // making it unsafe to try to suspend another thread in case we get a cycle. In either + // case, start the loop again, which will allow this thread to be suspended. } - // Release locks and come out of runnable state. 
} - VLOG(threads) << "SuspendThreadByThreadId waiting to allow thread chance to suspend"; - ThreadSuspendSleep(sleep_us); - sleep_us = std::min(sleep_us * 2, kThreadSuspendMaxSleepUs); + // All locks are released, and we should quickly exit the suspend-unfriendly state. Retry. + if (iter_count >= kMaxSuspendRetries) { + LOG(FATAL) << "Too many suspend retries"; + } + usleep(kThreadSuspendSleepUs); + } + // Now wait for target to decrement suspend barrier. + if (is_suspended || WaitForSuspendBarrier(&wrapped_barrier.barrier_)) { + // wrapped_barrier.barrier_ has been decremented and will no longer be accessed. + VLOG(threads) << "SuspendThreadByThreadId thread suspended: " << *thread; + if (ATraceEnabled()) { + std::string name; + thread->GetThreadName(name); + ATraceBegin( + StringPrintf("SuspendThreadByPeer suspended %s for id=%d", name.c_str(), thread_id) + .c_str()); + } + DCHECK(thread->IsSuspended()); + return thread; + } else { + // thread still has a pointer to wrapped_barrier. Returning and continuing would be unsafe + // without additional cleanup. + ThreadSuspendByThreadIdWarning( + ::android::base::FATAL, "SuspendThreadByThreadId timed out", thread_id); + UNREACHABLE(); } } @@ -1210,8 +1201,7 @@ void ThreadList::SuspendAllDaemonThreadsForShutdown() { // daemons. CHECK(thread->IsDaemon()) << *thread; if (thread != self) { - bool updated = thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal); - DCHECK(updated); + thread->IncrementSuspendCount(self); ++daemons_left; } // We are shutting down the runtime, set the JNI functions of all the JNIEnvs to be @@ -1311,11 +1301,10 @@ void ThreadList::Register(Thread* self) { // SuspendAll requests. MutexLock mu(self, *Locks::thread_list_lock_); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - // Modify suspend count in increments of 1 to maintain invariants in ModifySuspendCount. While - // this isn't particularly efficient the suspend counts are most commonly 0 or 1. - for (int delta = suspend_all_count_; delta > 0; delta--) { - bool updated = self->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal); - DCHECK(updated); + if (suspend_all_count_ == 1) { + self->IncrementSuspendCount(self); + } else { + DCHECK_EQ(suspend_all_count_, 0); } CHECK(!Contains(self)); list_.push_back(self); @@ -1361,6 +1350,7 @@ void ThreadList::Unregister(Thread* self, bool should_run_callbacks) { // Note: deliberately not using MutexLock that could hold a stale self pointer. { MutexLock mu(self, *Locks::thread_list_lock_); + self->SignalExitFlags(); if (!Contains(self)) { std::string thread_name; self->GetThreadName(thread_name); @@ -1370,16 +1360,19 @@ void ThreadList::Unregister(Thread* self, bool should_run_callbacks) { break; } else { MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - if (!self->IsSuspended()) { - list_.remove(self); - break; + if (!self->IsSuspended() && !self->ReadFlag(ThreadFlag::kRunningFlipFunction)) { + Thread::StateAndFlags state_and_flags = self->GetStateAndFlags(std::memory_order_acquire); + if (!state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)) { + list_.remove(self); + break; + } } } } // In the case where we are not suspended yet, sleep to leave other threads time to execute. // This is important if there are realtime threads. b/111277984 usleep(1); - // We failed to remove the thread due to a suspend request, loop and try again. + // We failed to remove the thread due to a suspend request or the like, loop and try again. 
} delete self; @@ -1418,13 +1411,11 @@ void ThreadList::VisitRootsForSuspendedThreads(RootVisitor* visitor) { MutexLock mu(self, *Locks::thread_list_lock_); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); for (Thread* thread : list_) { - bool suspended = thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal); - DCHECK(suspended); + thread->IncrementSuspendCount(self); if (thread == self || thread->IsSuspended()) { threads_to_visit.push_back(thread); } else { - bool resumed = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); - DCHECK(resumed); + thread->DecrementSuspendCount(self); } } } @@ -1439,9 +1430,9 @@ void ThreadList::VisitRootsForSuspendedThreads(RootVisitor* visitor) { { MutexLock mu2(self, *Locks::thread_suspend_count_lock_); for (Thread* thread : threads_to_visit) { - bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); - DCHECK(updated); + thread->DecrementSuspendCount(self); } + Thread::resume_cond_->Broadcast(self); } } diff --git a/runtime/thread_list.h b/runtime/thread_list.h index 51fac4a6ed..cad51aa969 100644 --- a/runtime/thread_list.h +++ b/runtime/thread_list.h @@ -49,7 +49,11 @@ class ThreadList { static constexpr uint32_t kInvalidThreadId = 0; static constexpr uint32_t kMainThreadId = 1; static constexpr uint64_t kDefaultThreadSuspendTimeout = - kIsDebugBuild ? 50'000'000'000ull : 10'000'000'000ull; + kIsDebugBuild ? 2'000'000'000ull : 4'000'000'000ull; + // We fail more aggressively in debug builds to catch potential issues early. + // The number of times we may retry when we find ourselves in a suspend-unfriendly state. + static constexpr int kMaxSuspendRetries = kIsDebugBuild ? 500 : 5000; + static constexpr useconds_t kThreadSuspendSleepUs = 100; explicit ThreadList(uint64_t thread_suspend_timeout_ns); ~ThreadList(); @@ -70,7 +74,7 @@ class ThreadList { bool Resume(Thread* thread, SuspendReason reason = SuspendReason::kInternal) REQUIRES(!Locks::thread_suspend_count_lock_) WARN_UNUSED; - // Suspends all threads and gets exclusive access to the mutator lock. + // Suspends all other threads and gets exclusive access to the mutator lock. // If long_suspend is true, then other threads who try to suspend will never timeout. // long_suspend is currenly used for hprof since large heaps take a long time. void SuspendAll(const char* cause, bool long_suspend = false) @@ -81,10 +85,7 @@ class ThreadList { // Suspend a thread using a peer, typically used by the debugger. Returns the thread on success, // else null. The peer is used to identify the thread to avoid races with the thread terminating. - // If the suspension times out then *timeout is set to true. - Thread* SuspendThreadByPeer(jobject peer, - SuspendReason reason, - bool* timed_out) + Thread* SuspendThreadByPeer(jobject peer, SuspendReason reason) REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); @@ -92,8 +93,8 @@ class ThreadList { // Suspend a thread using its thread id, typically used by lock/monitor inflation. Returns the // thread on success else null. The thread id is used to identify the thread to avoid races with // the thread terminating. Note that as thread ids are recycled this may not suspend the expected - // thread, that may be terminating. If the suspension times out then *timeout is set to true. - Thread* SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason, bool* timed_out) + // thread, that may be terminating. 
+ Thread* SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason) REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); @@ -113,11 +114,24 @@ class ThreadList { // Running threads are not suspended but run the checkpoint inside of the suspend check. The // return value includes already suspended threads for b/24191051. Runs or requests the // callback, if non-null, inside the thread_list_lock critical section after determining the - // runnable/suspended states of the threads. Does not wait for completion of the callbacks in - // running threads. - size_t RunCheckpoint(Closure* checkpoint_function, Closure* callback = nullptr) + // runnable/suspended states of the threads. Does not wait for completion of the checkpoint + // function in running threads. If the caller holds the mutator lock, then all instances of the + // checkpoint function are run with the mutator lock. If the caller does not hold the mutator + // lock (see mutator_gc_coord.md) then, since the checkpoint code may not acquire or release the + // mutator lock, the checkpoint will have no way to access Java data. + // TODO: Is it possible to just require the mutator lock here? + size_t RunCheckpoint(Closure* checkpoint_function, + Closure* callback = nullptr, + bool allow_lock_checking = true) REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); + // Convenience version of the above to disable lock checking inside Run function. Hopefully this + // and the third parameter above will eventually disappear. + size_t RunCheckpointUnchecked(Closure* checkpoint_function, Closure* callback = nullptr) + REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_) { + return RunCheckpoint(checkpoint_function, callback, false); + } + // Run an empty checkpoint on threads. Wait until threads pass the next suspend point or are // suspended. This is used to ensure that the threads finish or aren't in the middle of an // in-flight mutator heap access (eg. a read barrier.) Runnable threads will respond by @@ -126,12 +140,17 @@ class ThreadList { void RunEmptyCheckpoint() REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); - // Flip thread roots from from-space refs to to-space refs. Used by - // the concurrent moving collectors. - size_t FlipThreadRoots(Closure* thread_flip_visitor, - Closure* flip_callback, - gc::collector::GarbageCollector* collector, - gc::GcPauseListener* pause_listener) + // Used to flip thread roots from from-space refs to to-space refs. Used only by the concurrent + // moving collectors during a GC, and hence cannot be called from multiple threads concurrently. + // + // Briefly suspends all threads to atomically install a checkpoint-like thread_flip_visitor + // function to be run on each thread. Run flip_callback while threads are suspended. + // Thread_flip_visitors are run by each thread before it becomes runnable, or by us. We do not + // return until all thread_flip_visitors have been run. + void FlipThreadRoots(Closure* thread_flip_visitor, + Closure* flip_callback, + gc::collector::GarbageCollector* collector, + gc::GcPauseListener* pause_listener) REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); @@ -186,26 +205,29 @@ class ThreadList { REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_, !Locks::mutator_lock_); + // Wait for suspend barrier to reach zero. Return false on timeout. 
+ bool WaitForSuspendBarrier(AtomicInteger* barrier); + private: uint32_t AllocThreadId(Thread* self); void ReleaseThreadId(Thread* self, uint32_t id) REQUIRES(!Locks::allocated_thread_ids_lock_); - size_t RunCheckpoint(Closure* checkpoint_function, bool includeSuspended) - REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); - void DumpUnattachedThreads(std::ostream& os, bool dump_native_stack) REQUIRES(!Locks::thread_list_lock_); void SuspendAllDaemonThreadsForShutdown() REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); - void SuspendAllInternal(Thread* self, - Thread* ignore1, - Thread* ignore2 = nullptr, - SuspendReason reason = SuspendReason::kInternal) - REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); + void ResumeAllInternal(Thread* self) + REQUIRES(Locks::thread_list_lock_, Locks::thread_suspend_count_lock_) + UNLOCK_FUNCTION(Locks::mutator_lock_); - void AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2 = nullptr) + void SuspendAllInternal(Thread* self, SuspendReason reason = SuspendReason::kInternal) + REQUIRES(!Locks::thread_list_lock_, + !Locks::thread_suspend_count_lock_, + !Locks::mutator_lock_); + + void AssertOtherThreadsAreSuspended(Thread* self) REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); std::bitset<kMaxThreadId> allocated_ids_ GUARDED_BY(Locks::allocated_thread_ids_lock_); @@ -213,7 +235,10 @@ class ThreadList { // The actual list of all threads. std::list<Thread*> list_ GUARDED_BY(Locks::thread_list_lock_); - // Ongoing suspend all requests, used to ensure threads added to list_ respect SuspendAll. + // Ongoing suspend all requests, used to ensure threads added to list_ respect SuspendAll, and + // to ensure that only one SuspendAll or FlipThreadRoots call is active at a time. The value is + // always either 0 or 1. Locks::thread_suspend_count_lock_ must be held continuously while these + // two functions modify suspend counts of all other threads and modify suspend_all_count_. int suspend_all_count_ GUARDED_BY(Locks::thread_suspend_count_lock_); // Number of threads unregistering, ~ThreadList blocks until this hits 0. @@ -221,7 +246,7 @@ class ThreadList { // Thread suspend time histogram. Only modified when all the threads are suspended, so guarding // by mutator lock ensures no thread can read when another thread is modifying it. - Histogram<uint64_t> suspend_all_historam_ GUARDED_BY(Locks::mutator_lock_); + Histogram<uint64_t> suspend_all_histogram_ GUARDED_BY(Locks::mutator_lock_); // Whether or not the current thread suspension is long. 
bool long_suspend_; @@ -237,6 +262,9 @@ class ThreadList { friend class Thread; + friend class Mutex; + friend class BaseMutex; + DISALLOW_COPY_AND_ASSIGN(ThreadList); }; diff --git a/test/129-ThreadGetId/expected-stdout.txt b/test/129-ThreadGetId/expected-stdout.txt index aadf90d9d7..4455320f38 100644 --- a/test/129-ThreadGetId/expected-stdout.txt +++ b/test/129-ThreadGetId/expected-stdout.txt @@ -1,2 +1,8 @@ +Thread finished +Thread finished +Thread finished +Thread finished +Thread finished +All joined HeapTaskDaemon depth 0 Finishing diff --git a/test/129-ThreadGetId/src/Main.java b/test/129-ThreadGetId/src/Main.java index 50e8c098de..d46bb25448 100644 --- a/test/129-ThreadGetId/src/Main.java +++ b/test/129-ThreadGetId/src/Main.java @@ -20,19 +20,42 @@ import java.util.Map; public class Main implements Runnable { static final int NUMBER_OF_THREADS = 5; static final int TOTAL_OPERATIONS = 900; + static int[] progress = new int[NUMBER_OF_THREADS]; + int index; + + Main(int i) { + index = i; + } public static void main(String[] args) throws Exception { final Thread[] threads = new Thread[NUMBER_OF_THREADS]; + Thread watchdog = new Thread() { + public void run() { + try { + Thread.sleep(50_000); + System.out.print("Watchdog timed out: "); + for (int i = 0; i < NUMBER_OF_THREADS; ++i) { + System.out.print(progress[i] + ", "); + } + System.out.println(""); + System.err.println("Watchdog thread timed out"); + System.exit(1); + } catch (InterruptedException e) {} + } + }; + watchdog.start(); for (int t = 0; t < threads.length; t++) { - threads[t] = new Thread(new Main()); + threads[t] = new Thread(new Main(t)); threads[t].start(); } for (Thread t : threads) { t.join(); } + System.out.println("All joined"); // Do this test after the other part to leave some time for the heap task daemon to start // up. test_getStackTraces(); + watchdog.interrupt(); System.out.println("Finishing"); } @@ -87,8 +110,10 @@ public class Main implements Runnable { } public void run() { - for (int i = 0; i < TOTAL_OPERATIONS; ++i) { + for (int i = 1; i <= TOTAL_OPERATIONS; ++i) { test_getId(); + progress[index] = i; } + System.out.println("Thread finished"); } } diff --git a/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc b/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc index ae1d8306d1..9ae1bedd23 100644 --- a/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc +++ b/test/2011-stack-walk-concurrent-instrument/stack_walk_concurrent.cc @@ -81,10 +81,8 @@ extern "C" JNIEXPORT void JNICALL Java_Main_waitAndInstrumentStack(JNIEnv*, jobject target) { while (!instrument_waiting) { } - bool timed_out = false; Thread* other = Runtime::Current()->GetThreadList()->SuspendThreadByPeer( - target, SuspendReason::kInternal, &timed_out); - CHECK(!timed_out); + target, SuspendReason::kInternal); CHECK(other != nullptr); ScopedSuspendAll ssa(__FUNCTION__); Runtime::Current()->GetInstrumentation()->InstrumentThreadStack(other, |
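As a closing reference for the timed waits above, here is a hedged, portable sketch of the pattern ThreadList::WaitForSuspendBarrier() implements: wait until an atomic counter, decremented by the threads being suspended, reaches zero, or report failure after a timeout. This is not the ART implementation; ART futex-waits on the counter's address when ART_USE_FUTEXES is set and otherwise spins, whereas this version simply polls with a short sleep, and the function name and parameters here are invented.

    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <thread>

    // Returns true once *counter reaches zero, or false if 'timeout' elapses first.
    bool WaitForCounterZero(std::atomic<int32_t>* counter, std::chrono::nanoseconds timeout) {
      const auto deadline = std::chrono::steady_clock::now() + timeout;
      while (true) {
        int32_t cur = counter->load(std::memory_order_acquire);
        if (cur <= 0) {
          return true;  // Every waited-for thread has decremented the barrier.
        }
        if (std::chrono::steady_clock::now() >= deadline) {
          return false;  // The caller decides whether a timeout is fatal.
        }
        // Brief back-off; a real futex wait would block here until the counter changes.
        std::this_thread::sleep_for(std::chrono::microseconds(100));
      }
    }

A caller would treat a false return the way the patched SuspendAllInternal and SuspendThreadByPeer do: collect which threads never suspended and fail loudly.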