diff options
-rw-r--r-- | compiler/jni/jni_compiler_test.cc | 4
-rw-r--r-- | openjdkjvmti/ti_thread.cc | 2
-rw-r--r-- | runtime/entrypoints/quick/quick_jni_entrypoints.cc | 2
-rw-r--r-- | runtime/entrypoints/quick/quick_trampoline_entrypoints.cc | 2
-rw-r--r-- | runtime/mutator_gc_coord.md | 119
-rw-r--r-- | runtime/thread-inl.h | 22
-rw-r--r-- | runtime/thread.cc | 56
-rw-r--r-- | runtime/thread.h | 9
-rw-r--r-- | runtime/thread_list.cc | 6
-rw-r--r-- | runtime/thread_list.h | 11
10 files changed, 195 insertions, 38 deletions
diff --git a/compiler/jni/jni_compiler_test.cc b/compiler/jni/jni_compiler_test.cc index 35ee1edafd..1a408bb724 100644 --- a/compiler/jni/jni_compiler_test.cc +++ b/compiler/jni/jni_compiler_test.cc @@ -912,14 +912,14 @@ void JniCompilerTest::CompileAndRun_fooJJ_synchronizedImpl() { gLogVerbosity.systrace_lock_logging = true; InitEntryPoints(&self->tlsPtr_.jni_entrypoints, &self->tlsPtr_.quick_entrypoints, - self->ReadFlag(ThreadFlag::kMonitorJniEntryExit)); + self->ReadFlag(ThreadFlag::kMonitorJniEntryExit, std::memory_order_relaxed)); result = env_->CallNonvirtualLongMethod(jobj_, jklass_, jmethod_, a, b); EXPECT_EQ(a | b, result); EXPECT_EQ(5, gJava_MyClassNatives_fooJJ_synchronized_calls[gCurrentJni]); gLogVerbosity.systrace_lock_logging = false; InitEntryPoints(&self->tlsPtr_.jni_entrypoints, &self->tlsPtr_.quick_entrypoints, - self->ReadFlag(ThreadFlag::kMonitorJniEntryExit)); + self->ReadFlag(ThreadFlag::kMonitorJniEntryExit, std::memory_order_relaxed)); gJava_MyClassNatives_fooJJ_synchronized_calls[gCurrentJni] = 0; } diff --git a/openjdkjvmti/ti_thread.cc b/openjdkjvmti/ti_thread.cc index 9c71d4e3cc..f89a9d9a14 100644 --- a/openjdkjvmti/ti_thread.cc +++ b/openjdkjvmti/ti_thread.cc @@ -550,7 +550,7 @@ static jint GetJavaStateFromInternal(const InternalThreadState& state) { // Suspends the current thread if it has any suspend requests on it. void ThreadUtil::SuspendCheck(art::Thread* self) { - DCHECK(!self->ReadFlag(art::ThreadFlag::kSuspensionImmune)); + DCHECK(!self->ReadFlag(art::ThreadFlag::kSuspensionImmune, std::memory_order_relaxed)); art::ScopedObjectAccess soa(self); // Really this is only needed if we are in FastJNI and actually have the mutator_lock_ already. self->FullSuspendCheck(); diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc index 1359fef086..d4dc4714ba 100644 --- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc @@ -158,7 +158,7 @@ extern uint64_t GenericJniMethodEnd(Thread* self, // @CriticalNative does not do a state transition. @FastNative usually does not do a state // transition either but it performs a suspend check that may do state transitions. if (LIKELY(normal_native)) { - if (UNLIKELY(self->ReadFlag(ThreadFlag::kMonitorJniEntryExit))) { + if (UNLIKELY(self->ReadFlag(ThreadFlag::kMonitorJniEntryExit, std::memory_order_relaxed))) { artJniMonitoredMethodEnd(self); } else { artJniMethodEnd(self); diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index 6c817d5e16..90c0aea478 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -2101,7 +2101,7 @@ extern "C" const void* artQuickGenericJniTrampoline(Thread* self, return nullptr; // Report error. } } - if (UNLIKELY(self->ReadFlag(ThreadFlag::kMonitorJniEntryExit))) { + if (UNLIKELY(self->ReadFlag(ThreadFlag::kMonitorJniEntryExit, std::memory_order_relaxed))) { artJniMonitoredMethodStart(self); } else { artJniMethodStart(self); diff --git a/runtime/mutator_gc_coord.md b/runtime/mutator_gc_coord.md index 01e3ef025d..35996a5f8e 100644 --- a/runtime/mutator_gc_coord.md +++ b/runtime/mutator_gc_coord.md @@ -308,6 +308,125 @@ to prevent a single thread suspension of a thread currently between it will complete before it can be affected by suspension requests from other threads. 
+
+Thread state transitions, flags, and memory ordering
+----------------------------------------------------
+
+Logically, when a thread transitions to state `kRunnable`, it acquires the mutator
+lock (in shared mode). When it changes its state back to suspended, it releases
+that lock. These must enforce proper happens-before ordering with respect to a
+thread acquiring the mutator lock in exclusive mode. Thus changing the thread
+state to suspended normally requires a release operation, to ensure visibility
+to a thread wishing to "suspend". Conversely, when we change the state back to
+`kRunnable` in `TransitionFromSuspendedToRunnable` we do so with an acquire
+compare-exchange operation to check that no suspension request remains.
+
+Any suspending thread should clear `kSuspendRequest` with a release
+operation. This, together with the acquire compare-exchange when returning to
+runnable state, ensures that suspend-count decrements happen-before a thread
+becomes runnable again.
+
+Flags are often set and tested with relaxed ordering, even when they may
+communicate a request to another thread. The reason this is safe varies.
+Generally the request is confirmed by some other better-synchronized
+interaction, which establishes the necessary ordering. In particular:
+
+`kCheckpointRequest`
+: See below. The call happens-before checkpoint execution since
+`checkpoint_function` accesses are guarded by a lock, as are updates to the
+flag. Checkpoint completion ordering is ensured by waiting for both
+`ThreadList::RunCheckpoint()` to complete, and for a barrier indicating
+completion in "runnable" threads.
+
+`kEmptyCheckpointRequest`
+: Currently (12/2024) in flux. See below and b/382722942. We currently use
+acquire/release ordering to access this flag in some places to partially
+mitigate memory ordering issues here.
+
+`kActiveSuspendBarrier`
+: Changes are protected by `thread_suspend_count_lock_`, and
+`PassActiveSuspendBarriers` rechecks it while holding that lock.
+
+`kPendingFlipFunction`
+: Set while holding `thread_list_lock_`, but this lock is not consistently held
+while checking the flag, notably in `EnsureFlipFunctionStarted()`. We need
+acquire/release ordering to make sure that the requestor happens-before flip
+function execution, and the flip function is properly visible, among other
+things. We still commonly read the flag using a relaxed operation, but we
+confirm it with an acquire compare-exchange operation in
+`EnsureFlipFunctionStarted()` before acting on it.
+
+`kRunningFlipFunction`
+: Cleared with release ordering, and read with acquire ordering by
+`WaitForFlipFunction` and its callers, thus ensuring that flip function
+execution happens-before completion of `WaitForFlipFunction`.
+
+`kSuspensionImmune`
+: Guarded by `thread_suspend_count_lock_`.
+
+### Checkpoint-specific considerations
+
+Checkpoints expose additional tricky issues, since they are also used to ensure
+that certain global state changes have propagated to all running Java threads.
+
+`ThreadList::RunCheckpoint()` guarantees that "if at point _X_ `RunCheckpoint()`
+has returned, and all checkpoints have been properly observed to have completed,
+then every thread has executed a code sequence _S_ during which it remained in
+a suspended state, such that the call to `RunCheckpoint` happens-before the end
+of _S_, and the beginning of _S_ happened-before _X_."
+
+For each thread _T_, we attempt to atomically install the `kCheckpointRequest`
+flag while ensuring that _T_ is runnable.
If this succeeds, the fact that +`tlsPtr_.checkpoint_function` is protected by `thread_suspend_count_lock_`, and +`checkpoint_function` is written by the requestor, and read by _T_ at a suspend +point, ensures that the call happens-before _S_. Normally a barrier ensures that +_S_ happens-before _X_ . + +If we are unable to do so for a thread _T_ because we find it suspended, we run +`checkpoint_function` ourselves. Before running it we set `kSuspendRequest` +(with a release store) while _T_ is in _S_, preventing _T_ from terminating _S_ +by becoming runnable, after an acquire load of the flags. This ensures that the +`RunCheckpoint()` call happens-before the end of _S_. Since we learned of _T_'s +suspended state via an acquire load of its state, which it stored with a release +store, we know that the beginning of _S_ happens-before we return from +`RunCheckpoint()`, and hence before _X_ . + +The case of empty checkpoints is worth highlighting. We have traditionally +optimized that by avoiding ever suspending the affected threads. This appears +correct if all operations are sequentially consistent. But once memory model +issues are considered, it appears more expensive to do fully correctly than it +is to forego the optimization, as we explain below: + +We need to ensure that if Thread _A_ performs some update _U_ and then calls +`RunEmptyCheckpoint()` then, when `RunEmptyCheckpoint()` returns, every Thread +_B_ will have passed a suspend point at which _U_ became guaranteed visible to +that thread. This could be ensured in different ways, depending on the observed +state of Thread _B_: + +Runnable +: Use acquire/release ordering when setting and detecting the +`kEmptyCheckpointRequest` flag, and then use a barrier to observe when thread _B_ +has done so. In this case, Thread _B_ actually executes code at a suspend point. +The acquire release ordering on `kEmptyCheckpointRequest` ensures that _U_ +happens-before the suspend point, and the barrier ensures that the suspend point +happens-before the `RunEmptyCheckpoint()` return. + +Suspended +: In this case, we have no mechanism for the thread to execute synchronization +operations, and things become trickier. Thread _B_ is suspended, but it may +restart at any time. Thread _A_ wants to conclude that Thread _B_ is already +suspended at a suspend point, and thus it is safe to ignore Thread _B_. +Effectively, it wants to perform the update _U_, read _B_'s state, and continue, +while _B_ may change its state back to runnable, and then read the result of +_U_. Both threads effectively perform a store (either _U_ or to the state of +_B_) and then read the other value. With only acquire/release ordering, this can +result in _A_ seeing the old suspended state, and _B_ failing to see the update +_U_, which is an incorrect outcome. +: The canonical fix for this kind of `store; load` reordering problem is to make +all operations sequentially consistent. There does not appear to be an easy way +to limit this overhead to just empty checkpoint execution. Thus it appears to be +better to forego this "optimization". + [^1]: In the most recent versions of ART, compiler-generated code loads through the address at `tlsPtr_.suspend_trigger`. 
A thread suspension is requested by setting this to null, triggering a `SIGSEGV`, causing that thread to diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h index 432453e311..431d66d7ff 100644 --- a/runtime/thread-inl.h +++ b/runtime/thread-inl.h @@ -83,7 +83,11 @@ inline void Thread::AllowThreadSuspension() { inline void Thread::CheckSuspend(bool implicit) { DCHECK_EQ(Thread::Current(), this); while (true) { - StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_relaxed); + // Memory_order_relaxed should be OK, since RunCheckpointFunction shares a lock with the + // requestor, and FullSuspendCheck() re-checks later. But we currently need memory_order_acquire + // for the empty checkpoint path. + // TODO (b/382722942): Revisit after we fix RunEmptyCheckpoint(). + StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_acquire); if (LIKELY(!state_and_flags.IsAnyOfFlagsSet(SuspendOrCheckpointRequestFlags()))) { break; } else if (state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest)) { @@ -110,7 +114,8 @@ inline void Thread::CheckEmptyCheckpointFromWeakRefAccess(BaseMutex* cond_var_mu Thread* self = Thread::Current(); DCHECK_EQ(self, this); for (;;) { - if (ReadFlag(ThreadFlag::kEmptyCheckpointRequest)) { + // TODO (b/382722942): Revisit memory ordering after we fix RunEmptyCheckpoint(). + if (ReadFlag(ThreadFlag::kEmptyCheckpointRequest, std::memory_order_acquire)) { RunEmptyCheckpoint(); // Check we hold only an expected mutex when accessing weak ref. if (kIsDebugBuild) { @@ -135,7 +140,8 @@ inline void Thread::CheckEmptyCheckpointFromWeakRefAccess(BaseMutex* cond_var_mu inline void Thread::CheckEmptyCheckpointFromMutex() { DCHECK_EQ(Thread::Current(), this); for (;;) { - if (ReadFlag(ThreadFlag::kEmptyCheckpointRequest)) { + // TODO (b/382722942): Revisit memory ordering after we fix RunEmptyCheckpoint(). + if (ReadFlag(ThreadFlag::kEmptyCheckpointRequest, std::memory_order_acquire)) { RunEmptyCheckpoint(); } else { break; @@ -234,7 +240,11 @@ inline void Thread::AssertThreadSuspensionIsAllowable(bool check_locks) const { inline void Thread::TransitionToSuspendedAndRunCheckpoints(ThreadState new_state) { DCHECK_NE(new_state, ThreadState::kRunnable); while (true) { - StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); + // memory_order_relaxed is OK for ordinary checkpoints, which enforce ordering via + // thread_suspend_count_lock_ . It is not currently OK for empty checkpoints. + // TODO (b/382722942): Consider changing back to memory_order_relaxed after fixing empty + // checkpoints. + StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_acquire); DCHECK_EQ(old_state_and_flags.GetState(), ThreadState::kRunnable); if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest))) { IncrementStatsCounter(&checkpoint_count_); @@ -265,6 +275,8 @@ inline void Thread::TransitionToSuspendedAndRunCheckpoints(ThreadState new_state inline void Thread::CheckActiveSuspendBarriers() { DCHECK_NE(GetState(), ThreadState::kRunnable); while (true) { + // memory_order_relaxed is OK here, since PassActiveSuspendBarriers() rechecks with + // thread_suspend_count_lock_ . 
StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_relaxed); if (LIKELY(!state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest) && !state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest) && @@ -540,7 +552,7 @@ inline void Thread::IncrementSuspendCount(Thread* self) { } inline void Thread::DecrementSuspendCount(Thread* self, bool for_user_code) { - DCHECK(ReadFlag(ThreadFlag::kSuspendRequest)); + DCHECK(ReadFlag(ThreadFlag::kSuspendRequest, std::memory_order_relaxed)); Locks::thread_suspend_count_lock_->AssertHeld(self); if (UNLIKELY(tls32_.suspend_count <= 0)) { UnsafeLogFatalForSuspendCount(self, this); diff --git a/runtime/thread.cc b/runtime/thread.cc index 5bd97eb59a..a94dbd697d 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -1530,7 +1530,7 @@ bool Thread::PassActiveSuspendBarriers() { std::vector<AtomicInteger*> pass_barriers{}; { MutexLock mu(this, *Locks::thread_suspend_count_lock_); - if (!ReadFlag(ThreadFlag::kActiveSuspendBarrier)) { + if (!ReadFlag(ThreadFlag::kActiveSuspendBarrier, std::memory_order_relaxed)) { // Quick exit test: The barriers have already been claimed - this is possible as there may // be a race to claim and it doesn't matter who wins. All of the callers of this function // (except SuspendAllInternal) will first test the kActiveSuspendBarrier flag without the @@ -1604,7 +1604,8 @@ void Thread::RunEmptyCheckpoint() { // Note: Empty checkpoint does not access the thread's stack, // so we do not need to check for the flip function. DCHECK_EQ(Thread::Current(), this); - AtomicClearFlag(ThreadFlag::kEmptyCheckpointRequest); + // See mutator_gc_coord.md and b/382722942 for memory ordering discussion. + AtomicClearFlag(ThreadFlag::kEmptyCheckpointRequest, std::memory_order_release); Runtime::Current()->GetThreadList()->EmptyCheckpointBarrier()->Pass(this); } @@ -1626,7 +1627,7 @@ bool Thread::RequestCheckpoint(Closure* function) { } else { checkpoint_overflow_.push_back(function); } - DCHECK(ReadFlag(ThreadFlag::kCheckpointRequest)); + DCHECK(ReadFlag(ThreadFlag::kCheckpointRequest, std::memory_order_relaxed)); TriggerSuspend(); return true; } @@ -1790,7 +1791,8 @@ bool Thread::RequestSynchronousCheckpoint(Closure* function, ThreadState wait_st // Since we're runnable, and kPendingFlipFunction is set with all threads suspended, it // cannot be set again here. Thus kRunningFlipFunction is either already set after the // EnsureFlipFunctionStarted call, or will not be set before we call Run(). - if (ReadFlag(ThreadFlag::kRunningFlipFunction)) { + // See mutator_gc_coord.md for a discussion of memory ordering for thread flags. + if (ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_acquire)) { WaitForFlipFunction(self); } function->Run(this); @@ -1831,7 +1833,8 @@ bool Thread::EnsureFlipFunctionStarted(Thread* self, DCHECK(self == Current()); bool check_exited = (tef != nullptr); // Check that the thread can't unexpectedly exit while we are running. 
- DCHECK(self == target || check_exited || target->ReadFlag(ThreadFlag::kSuspendRequest) || + DCHECK(self == target || check_exited || + target->ReadFlag(ThreadFlag::kSuspendRequest, std::memory_order_relaxed) || Locks::thread_list_lock_->IsExclusiveHeld(self)) << *target; bool become_runnable; @@ -1857,6 +1860,8 @@ bool Thread::EnsureFlipFunctionStarted(Thread* self, target->VerifyState(); if (old_state_and_flags.GetValue() == 0) { become_runnable = false; + // Memory_order_relaxed is OK here, since we re-check with memory_order_acquire below before + // acting on a pending flip function. old_state_and_flags = target->GetStateAndFlags(std::memory_order_relaxed); } else { become_runnable = true; @@ -1868,8 +1873,12 @@ bool Thread::EnsureFlipFunctionStarted(Thread* self, while (true) { DCHECK(!check_exited || (Locks::thread_list_lock_->IsExclusiveHeld(self) && !tef->HasExited())); if (!old_state_and_flags.IsFlagSet(ThreadFlag::kPendingFlipFunction)) { + // Re-read kRunningFlipFunction flag with acquire ordering to ensure that if we claim + // flip function has run then its execution happened-before our return. + bool running_flip = + target->ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_acquire); maybe_release(); - set_finished(!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)); + set_finished(!running_flip); return false; } DCHECK(!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)); @@ -1900,7 +1909,8 @@ bool Thread::EnsureFlipFunctionStarted(Thread* self, // Let caller retry. return false; } - old_state_and_flags = target->GetStateAndFlags(std::memory_order_acquire); + // Again, we re-read with memory_order_acquire before acting on the flags. + old_state_and_flags = target->GetStateAndFlags(std::memory_order_relaxed); } // Unreachable. } @@ -1908,14 +1918,14 @@ bool Thread::EnsureFlipFunctionStarted(Thread* self, void Thread::RunFlipFunction(Thread* self) { // This function is called either by the thread running `ThreadList::FlipThreadRoots()` or when // a thread becomes runnable, after we've successfully set the kRunningFlipFunction ThreadFlag. - DCHECK(ReadFlag(ThreadFlag::kRunningFlipFunction)); + DCHECK(ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_relaxed)); Closure* flip_function = GetFlipFunction(); tlsPtr_.flip_function.store(nullptr, std::memory_order_relaxed); DCHECK(flip_function != nullptr); VerifyState(); flip_function->Run(this); - DCHECK(!ReadFlag(ThreadFlag::kPendingFlipFunction)); + DCHECK(!ReadFlag(ThreadFlag::kPendingFlipFunction, std::memory_order_relaxed)); VerifyState(); AtomicClearFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_release); // From here on this thread may go away, and it is no longer safe to access. @@ -1933,12 +1943,12 @@ void Thread::WaitForFlipFunction(Thread* self) const { // Repeat the check after waiting to guard against spurious wakeups (and because // we share the `thread_suspend_count_lock_` and `resume_cond_` with other code). // Check that the thread can't unexpectedly exit while we are running. 
- DCHECK(self == this || ReadFlag(ThreadFlag::kSuspendRequest) || + DCHECK(self == this || ReadFlag(ThreadFlag::kSuspendRequest, std::memory_order_relaxed) || Locks::thread_list_lock_->IsExclusiveHeld(self)); MutexLock mu(self, *Locks::thread_suspend_count_lock_); while (true) { - StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_acquire); - if (!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)) { + // See mutator_gc_coord.md for a discussion of memory ordering for thread flags. + if (!ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_acquire)) { return; } // We sometimes hold mutator lock here. OK since the flip function must complete quickly. @@ -1958,9 +1968,10 @@ void Thread::WaitForFlipFunctionTestingExited(Thread* self, ThreadExitFlag* tef) // complicated locking dance. MutexLock mu(self, *Locks::thread_suspend_count_lock_); while (true) { - StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_acquire); + // See mutator_gc_coord.md for a discussion of memory ordering for thread flags. + bool running_flip = ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_acquire); Locks::thread_list_lock_->Unlock(self); // So we can wait or return. - if (!old_state_and_flags.IsFlagSet(ThreadFlag::kRunningFlipFunction)) { + if (!running_flip) { return; } resume_cond_->WaitHoldingLocks(self); @@ -1976,7 +1987,7 @@ void Thread::WaitForFlipFunctionTestingExited(Thread* self, ThreadExitFlag* tef) void Thread::FullSuspendCheck(bool implicit) { ScopedTrace trace(__FUNCTION__); - DCHECK(!ReadFlag(ThreadFlag::kSuspensionImmune)); + DCHECK(!ReadFlag(ThreadFlag::kSuspensionImmune, std::memory_order_relaxed)); DCHECK(this == Thread::Current()); VLOG(threads) << this << " self-suspending"; // Make thread appear suspended to other threads, release mutator_lock_. @@ -2707,9 +2718,9 @@ Thread::~Thread() { tlsPtr_.jni_env = nullptr; } CHECK_NE(GetState(), ThreadState::kRunnable); - CHECK(!ReadFlag(ThreadFlag::kCheckpointRequest)); - CHECK(!ReadFlag(ThreadFlag::kEmptyCheckpointRequest)); - CHECK(!ReadFlag(ThreadFlag::kSuspensionImmune)); + CHECK(!ReadFlag(ThreadFlag::kCheckpointRequest, std::memory_order_relaxed)); + CHECK(!ReadFlag(ThreadFlag::kEmptyCheckpointRequest, std::memory_order_relaxed)); + CHECK(!ReadFlag(ThreadFlag::kSuspensionImmune, std::memory_order_relaxed)); CHECK(tlsPtr_.checkpoint_function == nullptr); CHECK_EQ(checkpoint_overflow_.size(), 0u); // A pending flip function request is OK. FlipThreadRoots will have been notified that we @@ -4818,11 +4829,11 @@ mirror::Object* Thread::GetPeerFromOtherThread() { // mutator lock in exclusive mode, and we should not have a pending flip function. if (kIsDebugBuild && Locks::thread_list_lock_->IsExclusiveHeld(self)) { Locks::mutator_lock_->AssertExclusiveHeld(self); - CHECK(!ReadFlag(ThreadFlag::kPendingFlipFunction)); + CHECK(!ReadFlag(ThreadFlag::kPendingFlipFunction, std::memory_order_relaxed)); } // Ensure that opeer is not obsolete. EnsureFlipFunctionStarted(self, this); - if (ReadFlag(ThreadFlag::kRunningFlipFunction)) { + if (ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_acquire)) { // Does not release mutator lock. Hence no new flip requests can be issued. 
WaitForFlipFunction(self); } @@ -4833,7 +4844,8 @@ mirror::Object* Thread::LockedGetPeerFromOtherThread(ThreadExitFlag* tef) { DCHECK(tlsPtr_.jpeer == nullptr); Thread* self = Thread::Current(); Locks::thread_list_lock_->AssertHeld(self); - if (ReadFlag(ThreadFlag::kPendingFlipFunction)) { + // memory_order_relaxed is OK here, because we recheck it later with acquire order. + if (ReadFlag(ThreadFlag::kPendingFlipFunction, std::memory_order_relaxed)) { // It is unsafe to call EnsureFlipFunctionStarted with thread_list_lock_. Thus we temporarily // release it, taking care to handle the case in which "this" thread disapppears while we no // longer hold it. @@ -4844,7 +4856,7 @@ mirror::Object* Thread::LockedGetPeerFromOtherThread(ThreadExitFlag* tef) { return nullptr; } } - if (ReadFlag(ThreadFlag::kRunningFlipFunction)) { + if (ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_acquire)) { // Does not release mutator lock. Hence no new flip requests can be issued. WaitForFlipFunction(self); } diff --git a/runtime/thread.h b/runtime/thread.h index 8ce9854212..3ccf7c3c97 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -127,6 +127,7 @@ enum class ThreadFlag : uint32_t { kSuspendRequest = 1u << 0, // Request that the thread do some checkpoint work and then continue. + // Only modified while holding thread_suspend_count_lock_ . kCheckpointRequest = 1u << 1, // Request that the thread do empty checkpoint and then continue. @@ -158,7 +159,7 @@ enum class ThreadFlag : uint32_t { // in any case not check for such requests; other clients of SuspendAll might. // Prevents a situation in which we are asked to suspend just before we suspend all // other threads, and then notice the suspension request and suspend ourselves, - // leading to deadlock. Guarded by suspend_count_lock_ . + // leading to deadlock. Guarded by thread_suspend_count_lock_ . // Should not ever be set when we try to transition to kRunnable. // TODO(b/296639267): Generalize use to prevent SuspendAll from blocking // in-progress GC. @@ -1451,8 +1452,10 @@ class EXPORT Thread { // Undo the effect of the previous call. Again only invoked by the thread itself. void AllowPreMonitorMutexes(); - bool ReadFlag(ThreadFlag flag) const { - return GetStateAndFlags(std::memory_order_relaxed).IsFlagSet(flag); + // Read a flag with the given memory order. See mutator_gc_coord.md for memory ordering + // considerations. + bool ReadFlag(ThreadFlag flag, std::memory_order order) const { + return GetStateAndFlags(order).IsFlagSet(flag); } void AtomicSetFlag(ThreadFlag flag, std::memory_order order = std::memory_order_seq_cst) { diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc index 3665b34cd5..6a8a7f566c 100644 --- a/runtime/thread_list.cc +++ b/runtime/thread_list.cc @@ -557,7 +557,7 @@ void ThreadList::RunEmptyCheckpoint() { std::find(runnable_thread_ids.begin(), runnable_thread_ids.end(), tid) != runnable_thread_ids.end(); if (is_in_runnable_thread_ids && - thread->ReadFlag(ThreadFlag::kEmptyCheckpointRequest)) { + thread->ReadFlag(ThreadFlag::kEmptyCheckpointRequest, std::memory_order_relaxed)) { // Found a runnable thread that hasn't responded to the empty checkpoint request. // Assume it's stuck and safe to dump its stack. thread->Dump(LOG_STREAM(FATAL_WITHOUT_ABORT), @@ -877,8 +877,8 @@ void ThreadList::SuspendAll(const char* cause, bool long_suspend) { } // SuspendAllInternal blocks if we are in the middle of a flip. 
- DCHECK(!self->ReadFlag(ThreadFlag::kPendingFlipFunction)); - DCHECK(!self->ReadFlag(ThreadFlag::kRunningFlipFunction)); + DCHECK(!self->ReadFlag(ThreadFlag::kPendingFlipFunction, std::memory_order_relaxed)); + DCHECK(!self->ReadFlag(ThreadFlag::kRunningFlipFunction, std::memory_order_relaxed)); ATraceBegin((std::string("Mutator threads suspended for ") + cause).c_str()); diff --git a/runtime/thread_list.h b/runtime/thread_list.h index 1cf28989fb..4bf08dc9c1 100644 --- a/runtime/thread_list.h +++ b/runtime/thread_list.h @@ -138,6 +138,13 @@ class ThreadList { // threads must act on that. It is possible that on return there will be threads which have not, // and will not, run the checkpoint_function, and neither have/will any of their ancestors. // + // We guarantee that if a thread calls RunCheckpoint() then, if at point X RunCheckpoint() has + // returned, and all checkpoints have been properly observed to have completed (usually via a + // barrier), then every thread has executed a code sequence S during which it remained in a + // suspended state, such that the call to `RunCheckpoint` happens-before the end of S, and the + // beginning of S happened-before X. Thus after a RunCheckpoint() call, no preexisting + // thread can still be relying on global information it caches between suspend points. + // // TODO: Is it possible to simplify mutator_lock handling here? Should this wait for completion? EXPORT size_t RunCheckpoint(Closure* checkpoint_function, Closure* callback = nullptr, @@ -157,6 +164,10 @@ class ThreadList { // in-flight mutator heap access (eg. a read barrier.) Runnable threads will respond by // decrementing the empty checkpoint barrier count. This works even when the weak ref access is // disabled. Only one concurrent use is currently supported. + // TODO(b/382722942): This is intended to guarantee the analogous memory ordering property to + // RunCheckpoint(). It over-optimizes by always avoiding thread suspension and hence does not in + // fact guarantee this. (See the discussion in `mutator_gc_coord.md`.) Fix this by implementing + // this with RunCheckpoint() instead. void RunEmptyCheckpoint() REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_);
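
The ordering rules documented in `runtime/mutator_gc_coord.md` above (a release operation when a thread leaves the runnable state, an acquire compare-exchange when it returns to it, and relaxed flag reads that are confirmed by a better-synchronized re-check) can be illustrated outside ART. The following is a minimal, hypothetical C++ sketch, not ART code: the names `state_and_flags`, `kSuspendRequest`, `TransitionToRunnable`, `TransitionToSuspended`, and `ResumeTarget` are invented stand-ins loosely modeled on `Thread::StateAndFlags` and the transitions described in the document.

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>
#include <thread>

namespace {

// Hypothetical packed state-and-flags word, loosely modeled on Thread::StateAndFlags.
constexpr uint32_t kStateRunnable  = 0u;       // state lives in the low byte
constexpr uint32_t kStateSuspended = 1u;
constexpr uint32_t kStateMask      = 0xffu;
constexpr uint32_t kSuspendRequest = 1u << 8;  // flag bits live above the state

std::atomic<uint32_t> state_and_flags{kStateSuspended};
int shared_data = 0;  // stands in for data the resuming thread wants to publish

// Mutator side: return to runnable with an acquire compare-exchange that also verifies no
// suspend request remains (cf. TransitionFromSuspendedToRunnable). The acquire pairs with the
// requester's release below, so shared_data is visible once the CAS succeeds.
void TransitionToRunnable() {
  uint32_t expected = kStateSuspended;  // suspended, no flags set
  while (!state_and_flags.compare_exchange_weak(
      expected, kStateRunnable, std::memory_order_acquire, std::memory_order_relaxed)) {
    expected = kStateSuspended;  // a flag is still set (or spurious failure): spin until clear
  }
}

// Mutator side: leave the runnable state with a release store so that writes made while
// runnable are visible to anyone who observes the suspended state with an acquire load.
// (A real runtime would use a CAS loop here so concurrent flag updates are not lost.)
void TransitionToSuspended() {
  uint32_t current = state_and_flags.load(std::memory_order_relaxed);
  state_and_flags.store((current & ~kStateMask) | kStateSuspended, std::memory_order_release);
}

// Requester side: publish data, then clear the request flag with a release RMW, so the
// publication happens-before the target's acquire CAS succeeds and it becomes runnable again.
void ResumeTarget() {
  shared_data = 42;
  state_and_flags.fetch_and(~kSuspendRequest, std::memory_order_release);
}

}  // namespace

int main() {
  // The target starts out suspended with a suspend request pending.
  state_and_flags.store(kStateSuspended | kSuspendRequest, std::memory_order_relaxed);

  std::thread requester(ResumeTarget);
  std::thread target([] {
    TransitionToRunnable();            // spins until kSuspendRequest is cleared
    std::cout << shared_data << '\n';  // guaranteed to print 42
    TransitionToSuspended();
  });
  requester.join();
  target.join();
  return 0;
}
```

The sketch omits everything a real runtime needs (checkpoints, flip functions, futex-style waiting instead of spinning), and it deliberately does not model the empty-checkpoint case: as the document argues, once the other thread is merely observed to be suspended rather than synchronized with, the resulting store-then-load pattern needs sequential consistency, or a fallback to `RunCheckpoint()`, rather than acquire/release alone.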