author | 2022-09-10 00:08:42 +0000
committer | 2022-09-10 00:08:42 +0000
commit | 7c8835df16147b9096dd4c9380ab4b5f700ea17d (patch)
tree | ce0f48061b3ba7d5dc1da4d0dda78520f2629d4a /runtime
parent | 7c39c86b17c91e651de1fcc0876fe5565e3f5204 (diff)
Revert "Thread suspension cleanup and deadlock fix"
This reverts commit 7c39c86b17c91e651de1fcc0876fe5565e3f5204.
Reason for revert: We're seeing a number of new, somewhat rare, timeouts on multiple tests.
Change-Id: Ida9a4f80b64b6fedc16db176a8df9c2e985ef482
Diffstat (limited to 'runtime')
-rw-r--r-- | runtime/cha.cc | 2
-rw-r--r-- | runtime/entrypoints_order_test.cc | 12
-rw-r--r-- | runtime/monitor.cc | 5
-rw-r--r-- | runtime/mutator_gc_coord.md | 71
-rw-r--r-- | runtime/native/dalvik_system_VMStack.cc | 8
-rw-r--r-- | runtime/native/java_lang_Thread.cc | 8
-rw-r--r-- | runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc | 10
-rw-r--r-- | runtime/oat.h | 4
-rw-r--r-- | runtime/thread-inl.h | 121
-rw-r--r-- | runtime/thread.cc | 317
-rw-r--r-- | runtime/thread.h | 198
-rw-r--r-- | runtime/thread_list.cc | 527
-rw-r--r-- | runtime/thread_list.h | 29
13 files changed, 668 insertions, 644 deletions
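The reverted change had replaced ART's fixed-size array of per-thread suspend barriers with a single SuspendAll barrier plus a linked list of per-suspension barriers (`WrappedSuspend1Barrier`); this revert restores the array-based `active_suspend_barriers` design, as the diff below shows. In either design, a "suspend barrier" is essentially an atomic counter that the requesting thread installs and that each target decrements once it has reached a suspended state, with the requester waiting (with a timeout) for the counter to hit zero. The following is a minimal, illustrative C++ sketch of that handshake only, not code from this commit: the `SuspendBarrier` type, `Pass()`/`Wait()` names, and the yield-based spin wait are assumptions for readability (the runtime uses futexes and per-thread state flags instead).

```cpp
// Illustrative sketch of an atomic-counter suspend barrier (not ART code).
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

struct SuspendBarrier {
  std::atomic<int> pending;  // number of targets that still have to suspend
  explicit SuspendBarrier(int n) : pending(n) {}

  // Called once by each target thread after it has reached a suspended state.
  void Pass() { pending.fetch_sub(1, std::memory_order_release); }

  // Called by the requesting thread. Here we spin with a deadline; the real
  // runtime futex-waits and raises a fatal timeout instead of returning false.
  bool Wait(std::chrono::milliseconds timeout) {
    auto deadline = std::chrono::steady_clock::now() + timeout;
    while (pending.load(std::memory_order_acquire) > 0) {
      if (std::chrono::steady_clock::now() > deadline) return false;
      std::this_thread::yield();
    }
    return true;
  }
};

int main() {
  constexpr int kTargets = 4;
  SuspendBarrier barrier(kTargets);
  std::vector<std::thread> targets;
  for (int i = 0; i < kTargets; ++i) {
    targets.emplace_back([&barrier] {
      // ...target notices the suspend request, stops mutating, then:
      barrier.Pass();
      // ...and parks until resumed.
    });
  }
  bool ok = barrier.Wait(std::chrono::milliseconds(100));
  std::cout << (ok ? "all targets suspended\n" : "timed out waiting\n");
  for (auto& t : targets) t.join();
}
```

The timeout on the wait is what surfaces as the "failed to suspend within a generous timeout" log messages restored in this revert; the timeouts mentioned in the revert reason are of that kind.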
diff --git a/runtime/cha.cc b/runtime/cha.cc index 0c032aa125..d19d4f6e30 100644 --- a/runtime/cha.cc +++ b/runtime/cha.cc @@ -241,7 +241,7 @@ class CHACheckpoint final : public Closure { // Note thread and self may not be equal if thread was already suspended at // the point of the request. Thread* self = Thread::Current(); - Locks::mutator_lock_->AssertSharedHeld(Thread::Current()); + ScopedObjectAccess soa(self); CHAStackVisitor visitor(thread, nullptr, method_headers_); visitor.WalkStack(); barrier_.Pass(self); diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc index 8821aed259..2cd58dbf1b 100644 --- a/runtime/entrypoints_order_test.cc +++ b/runtime/entrypoints_order_test.cc @@ -111,15 +111,13 @@ class EntrypointsOrderTest : public CommonRuntimeTest { sizeof(void*)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, last_no_thread_suspension_cause, checkpoint_function, sizeof(void*)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, checkpoint_function, active_suspendall_barrier, - sizeof(void*)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, active_suspendall_barrier, active_suspend1_barriers, - sizeof(void*)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, active_suspend1_barriers, thread_local_pos, + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, checkpoint_function, active_suspend_barriers, sizeof(void*)); + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, active_suspend_barriers, thread_local_start, + sizeof(Thread::tls_ptr_sized_values::active_suspend_barriers)); + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_start, thread_local_pos, sizeof(void*)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_pos, thread_local_end, sizeof(void*)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_end, thread_local_start, sizeof(void*)); - EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_start, thread_local_limit, sizeof(void*)); + EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_end, thread_local_limit, sizeof(void*)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_limit, thread_local_objects, sizeof(void*)); EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_objects, jni_entrypoints, sizeof(size_t)); diff --git a/runtime/monitor.cc b/runtime/monitor.cc index 0cd05485e6..4e64c95b8c 100644 --- a/runtime/monitor.cc +++ b/runtime/monitor.cc @@ -1065,10 +1065,13 @@ void Monitor::InflateThinLocked(Thread* self, Handle<mirror::Object> obj, LockWo ThreadList* thread_list = Runtime::Current()->GetThreadList(); // Suspend the owner, inflate. First change to blocked and give up mutator_lock_. self->SetMonitorEnterObject(obj.Get()); + bool timed_out; Thread* owner; { ScopedThreadSuspension sts(self, ThreadState::kWaitingForLockInflation); - owner = thread_list->SuspendThreadByThreadId(owner_thread_id, SuspendReason::kInternal); + owner = thread_list->SuspendThreadByThreadId(owner_thread_id, + SuspendReason::kInternal, + &timed_out); } if (owner != nullptr) { // We succeeded in suspending the thread, check the lock's status didn't change. diff --git a/runtime/mutator_gc_coord.md b/runtime/mutator_gc_coord.md index e724de4a76..b3635c558a 100644 --- a/runtime/mutator_gc_coord.md +++ b/runtime/mutator_gc_coord.md @@ -217,77 +217,6 @@ processing, after acquiring `reference_processor_lock_`. This means that empty checkpoints do not preclude client threads from being in the middle of an operation that involves a weak reference access, while nonempty checkpoints do. 
-Thread Suspension Mechanics ---------------------------- - -Thread suspension is initiated by a registered thread, except that, for testing -purposes, `SuspendAll` may be invoked with `self == nullptr`. We never suspend -the initiating thread, explicitly exclusing it from `SuspendAll()`, and failing -`SuspendThreadBy...()` requests to that effect. - -The suspend calls invoke `IncrementSuspendCount()` to increment the thread -suspend count for each thread. That adds a "suspend barrier" (atomic counter) to -the per-thread list of such counters to decrement. It normally sets the -`kSuspendRequest` ("should enter safepoint handler") and `kActiveSuspendBarrier` -("need to notify us when suspended") flags. - -After setting these two flags, we check whether the thread is suspended and -`kSuspendRequest` is still set. Since the thread is already suspended, it cannot -be expected to respond to "pass the suspend barrier" (decrement the atomic -counter) in a timely fashion. Hence we do so on its behalf. This decrements -the "barrier" and removes it from the thread's list of barriers to decrement, -and clears `kActiveSuspendBarrier`. `kSuspendRequest` remains to ensure the -thread doesn't prematurely return to runnable state. - -If `SuspendAllInternal()` does not immediately see a suspended state, then it is up -to the target thread to decrement the suspend barrier. -`TransitionFromRunnableToSuspended()` calls -`TransitionToSuspendedAndRunCheckpoints()`, which changes the thread state, and -then calls `CheckActiveSuspendBarriers()` to check for the -`kActiveSuspendBarrier` flag and decrement the suspend barrier if set. - -The `suspend_count_lock_` is not consistently held in the target thread -during this process. Thus correctness in resolving the race between a -suspension-requesting thread and a target thread voluntarily suspending relies -on the following: If the requesting thread sets the flags with -`kActiveSuspendBarrier` before the target's state is changed to suspended, then -the target thread will see `kActiveSuspendBarrier` set, and will attempt to -handle the barrier. If, on the other hand, the target thread changes the thread -state first, then the requesting thread will see the suspended state, and handle -the barrier itself. Since the actual update of suspend counts and suspend -barrier data structures is done under the `suspend_count_lock_`, we always -ensure that either the requestor removes/clears the barrier for a given target, -or the target thread(s) decrement the barrier, but not both. This also ensures -that the barrier cannot be decremented after the stack frame holding the barrier -goes away. - -This relies on the fact that the two stores in the two threads to the state and -kActiveSuspendBarrier flag are ordered with respect to the later loads. That's -guaranteed, since they are all stored in a single `atomic<>`. Thus even relaxed -accesses are OK. - -The actual suspend barrier representation still varies between `SuspendAll()` -and `SuspendThreadBy...()`. The former relies on the fact that only one such -barrier can be in use at a time, while the latter maintains a linked list of -active suspend barriers for each target thread, relying on the fact that each -one can appear on the list of only one thread, and we can thus use list nodes -allocated in the stack frames of requesting threads. - -**Avoiding suspension cycles** - -Any thread can issue a `SuspendThreadByPeer()` or `SuspendAll()` request. 
But if -Thread A increments Thread B's suspend count while Thread B increments Thread -A's suspend count, and they then both suspend during a subsequent thread -transition, we're deadlocked. - -For `SuspendAll()`, we enforce a requirement that at most one `SuspendAll()` -request is running at one time. In addition, in all cases, we refuse to initiate -a suspend request from a registered thread that is also being asked to suspend -(i.e. the suspend count is nonzero). Instead the requestor waits for that -condition to change. This means that we cannot create a cycle in which each -thread has asked to suspend the next one, and thus no thread can progress. The -required atomicity of the requestor suspend count check with setting the suspend -count of the target(s) target is ensured by holding `suspend_count_lock_`. [^1]: Some comments in the code refer to a not-yet-really-implemented scheme in which the compiler-generated code would load through the address at diff --git a/runtime/native/dalvik_system_VMStack.cc b/runtime/native/dalvik_system_VMStack.cc index 7b21de204d..71078c9ad2 100644 --- a/runtime/native/dalvik_system_VMStack.cc +++ b/runtime/native/dalvik_system_VMStack.cc @@ -57,7 +57,10 @@ static ResultT GetThreadStack(const ScopedFastNativeObjectAccess& soa, // Suspend thread to build stack trace. ScopedThreadSuspension sts(soa.Self(), ThreadState::kNative); ThreadList* thread_list = Runtime::Current()->GetThreadList(); - Thread* thread = thread_list->SuspendThreadByPeer(peer, SuspendReason::kInternal); + bool timed_out; + Thread* thread = thread_list->SuspendThreadByPeer(peer, + SuspendReason::kInternal, + &timed_out); if (thread != nullptr) { // Must be runnable to create returned array. { @@ -67,6 +70,9 @@ static ResultT GetThreadStack(const ScopedFastNativeObjectAccess& soa, // Restart suspended thread. bool resumed = thread_list->Resume(thread, SuspendReason::kInternal); DCHECK(resumed); + } else if (timed_out) { + LOG(ERROR) << "Trying to get thread's stack failed as the thread failed to suspend within a " + "generous timeout."; } } return trace; diff --git a/runtime/native/java_lang_Thread.cc b/runtime/native/java_lang_Thread.cc index fd67a0a7fa..570c554782 100644 --- a/runtime/native/java_lang_Thread.cc +++ b/runtime/native/java_lang_Thread.cc @@ -147,8 +147,11 @@ static void Thread_setNativeName(JNIEnv* env, jobject peer, jstring java_name) { // thread list lock to avoid this, as setting the thread name causes mutator to lock/unlock // in the DDMS send code. ThreadList* thread_list = Runtime::Current()->GetThreadList(); + bool timed_out; // Take suspend thread lock to avoid races with threads trying to suspend this one. 
- Thread* thread = thread_list->SuspendThreadByPeer(peer, SuspendReason::kInternal); + Thread* thread = thread_list->SuspendThreadByPeer(peer, + SuspendReason::kInternal, + &timed_out); if (thread != nullptr) { { ScopedObjectAccess soa(env); @@ -156,6 +159,9 @@ static void Thread_setNativeName(JNIEnv* env, jobject peer, jstring java_name) { } bool resumed = thread_list->Resume(thread, SuspendReason::kInternal); DCHECK(resumed); + } else if (timed_out) { + LOG(ERROR) << "Trying to set thread name to '" << name.c_str() << "' failed as the thread " + "failed to suspend within a generous timeout."; } } diff --git a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc index f20cd281e8..081ec2043a 100644 --- a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc +++ b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc @@ -59,6 +59,7 @@ static jobjectArray DdmVmInternal_getStackTraceById(JNIEnv* env, jclass, jint th trace = Thread::InternalStackTraceToStackTraceElementArray(soa, internal_trace); } else { ThreadList* thread_list = Runtime::Current()->GetThreadList(); + bool timed_out; // Check for valid thread if (thin_lock_id == ThreadList::kInvalidThreadId) { @@ -66,7 +67,9 @@ static jobjectArray DdmVmInternal_getStackTraceById(JNIEnv* env, jclass, jint th } // Suspend thread to build stack trace. - Thread* thread = thread_list->SuspendThreadByThreadId(thin_lock_id, SuspendReason::kInternal); + Thread* thread = thread_list->SuspendThreadByThreadId(thin_lock_id, + SuspendReason::kInternal, + &timed_out); if (thread != nullptr) { { ScopedObjectAccess soa(env); @@ -76,6 +79,11 @@ static jobjectArray DdmVmInternal_getStackTraceById(JNIEnv* env, jclass, jint th // Restart suspended thread. bool resumed = thread_list->Resume(thread, SuspendReason::kInternal); DCHECK(resumed); + } else { + if (timed_out) { + LOG(ERROR) << "Trying to get thread's stack by id failed as the thread failed to suspend " + "within a generous timeout."; + } } } return trace; diff --git a/runtime/oat.h b/runtime/oat.h index 99cc4e9087..341e70bbe9 100644 --- a/runtime/oat.h +++ b/runtime/oat.h @@ -32,8 +32,8 @@ class InstructionSetFeatures; class PACKED(4) OatHeader { public: static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } }; - // Last oat version changed reason: Change suspend barrier data structure. - static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '2', '8', '\0' } }; + // Last oat version changed reason: Don't use instrumentation stubs for native methods. 
+ static constexpr std::array<uint8_t, 4> kOatVersion { { '2', '2', '7', '\0' } }; static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline"; static constexpr const char* kDebuggableKey = "debuggable"; diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h index 5d45c599fe..4110ed2851 100644 --- a/runtime/thread-inl.h +++ b/runtime/thread-inl.h @@ -27,6 +27,7 @@ #include "jni/jni_env_ext.h" #include "managed_stack-inl.h" #include "obj_ptr.h" +#include "suspend_reason.h" #include "thread-current-inl.h" #include "thread_pool.h" @@ -221,7 +222,7 @@ inline void Thread::TransitionToSuspendedAndRunCheckpoints(ThreadState new_state } } -inline void Thread::CheckActiveSuspendBarriers() { +inline void Thread::PassActiveSuspendBarriers() { while (true) { StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_relaxed); if (LIKELY(!state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest) && @@ -229,7 +230,7 @@ inline void Thread::CheckActiveSuspendBarriers() { !state_and_flags.IsFlagSet(ThreadFlag::kActiveSuspendBarrier))) { break; } else if (state_and_flags.IsFlagSet(ThreadFlag::kActiveSuspendBarrier)) { - PassActiveSuspendBarriers(); + PassActiveSuspendBarriers(this); } else { // Impossible LOG(FATAL) << "Fatal, thread transitioned into suspended without running the checkpoint"; @@ -237,19 +238,6 @@ inline void Thread::CheckActiveSuspendBarriers() { } } -inline void Thread::AddSuspend1Barrier(WrappedSuspend1Barrier* suspend1_barrier) { - suspend1_barrier->next_ = tlsPtr_.active_suspend1_barriers; - tlsPtr_.active_suspend1_barriers = suspend1_barrier; -} - -inline void Thread::RemoveFirstSuspend1Barrier() { - tlsPtr_.active_suspend1_barriers = tlsPtr_.active_suspend1_barriers->next_; -} - -inline bool Thread::HasActiveSuspendBarrier() { - return tlsPtr_.active_suspend1_barriers != nullptr || tlsPtr_.active_suspendall_barrier != nullptr; -} - inline void Thread::TransitionFromRunnableToSuspended(ThreadState new_state) { // Note: JNI stubs inline a fast path of this method that transitions to suspended if // there are no flags set and then clears the `held_mutexes[kMutatorLock]` (this comes @@ -265,7 +253,7 @@ inline void Thread::TransitionFromRunnableToSuspended(ThreadState new_state) { // Mark the release of the share of the mutator lock. GetMutatorLock()->TransitionFromRunnableToSuspended(this); // Once suspended - check the active suspend barrier flag - CheckActiveSuspendBarriers(); + PassActiveSuspendBarriers(); } inline ThreadState Thread::TransitionFromSuspendedToRunnable() { @@ -296,7 +284,7 @@ inline ThreadState Thread::TransitionFromSuspendedToRunnable() { break; } } else if (old_state_and_flags.IsFlagSet(ThreadFlag::kActiveSuspendBarrier)) { - PassActiveSuspendBarriers(); + PassActiveSuspendBarriers(this); } else if (UNLIKELY(old_state_and_flags.IsFlagSet(ThreadFlag::kCheckpointRequest) || old_state_and_flags.IsFlagSet(ThreadFlag::kEmptyCheckpointRequest))) { // Checkpoint flags should not be set while in suspended state. 
@@ -436,80 +424,35 @@ inline void Thread::PoisonObjectPointersIfDebug() { } } -inline void Thread::IncrementSuspendCount(Thread* self, - AtomicInteger* suspendall_barrier, - WrappedSuspend1Barrier* suspend1_barrier, - SuspendReason reason) { - if (kIsDebugBuild) { - Locks::thread_suspend_count_lock_->AssertHeld(self); - if (this != self && !IsSuspended()) { - Locks::thread_list_lock_->AssertHeld(self); - } - } - if (UNLIKELY(reason == SuspendReason::kForUserCode)) { - Locks::user_code_suspension_lock_->AssertHeld(self); - } - - // Avoid deadlocks during a thread flip. b/31683379. This condition is not necessary for - // RunCheckpoint, but it does not use a suspend barrier. - DCHECK(!gUseReadBarrier - || this == self - || (suspendall_barrier == nullptr && suspend1_barrier == nullptr) - || GetFlipFunction() == nullptr); - - uint32_t flags = enum_cast<uint32_t>(ThreadFlag::kSuspendRequest); - if (suspendall_barrier != nullptr) { - DCHECK(suspend1_barrier == nullptr); - DCHECK(tlsPtr_.active_suspendall_barrier == nullptr); - tlsPtr_.active_suspendall_barrier = suspendall_barrier; - flags |= enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier); - } else if (suspend1_barrier != nullptr) { - AddSuspend1Barrier(suspend1_barrier); - flags |= enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier); - } - - ++tls32_.suspend_count; - if (reason == SuspendReason::kForUserCode) { - ++tls32_.user_code_suspend_count; - } - - // Two bits might be set simultaneously. - tls32_.state_and_flags.fetch_or(flags, std::memory_order_seq_cst); - TriggerSuspend(); -} - -inline void Thread::DecrementSuspendCount(Thread* self, SuspendReason reason) { - if (kIsDebugBuild) { - Locks::thread_suspend_count_lock_->AssertHeld(self); - if (this != self && !IsSuspended()) { - Locks::thread_list_lock_->AssertHeld(self); - } - } - if (UNLIKELY(tls32_.suspend_count <= 0)) { - UnsafeLogFatalForSuspendCount(self, this); - UNREACHABLE(); - } - if (UNLIKELY(reason == SuspendReason::kForUserCode)) { - Locks::user_code_suspension_lock_->AssertHeld(self); - if (UNLIKELY(tls32_.user_code_suspend_count <= 0)) { - LOG(ERROR) << "user_code_suspend_count incorrect"; - UnsafeLogFatalForSuspendCount(self, this); - UNREACHABLE(); +inline bool Thread::ModifySuspendCount(Thread* self, + int delta, + AtomicInteger* suspend_barrier, + SuspendReason reason) { + if (delta > 0 && ((gUseReadBarrier && this != self) || suspend_barrier != nullptr)) { + // When delta > 0 (requesting a suspend), ModifySuspendCountInternal() may fail either if + // active_suspend_barriers is full or we are in the middle of a thread flip. Retry in a loop. + while (true) { + if (LIKELY(ModifySuspendCountInternal(self, delta, suspend_barrier, reason))) { + return true; + } else { + // Failure means the list of active_suspend_barriers is full or we are in the middle of a + // thread flip, we should release the thread_suspend_count_lock_ (to avoid deadlock) and + // wait till the target thread has executed or Thread::PassActiveSuspendBarriers() or the + // flip function. Note that we could not simply wait for the thread to change to a suspended + // state, because it might need to run checkpoint function before the state change or + // resumes from the resume_cond_, which also needs thread_suspend_count_lock_. + // + // The list of active_suspend_barriers is very unlikely to be full since more than + // kMaxSuspendBarriers threads need to execute SuspendAllInternal() simultaneously, and + // target thread stays in kRunnable in the mean time. 
+ Locks::thread_suspend_count_lock_->ExclusiveUnlock(self); + NanoSleep(100000); + Locks::thread_suspend_count_lock_->ExclusiveLock(self); + } } + } else { + return ModifySuspendCountInternal(self, delta, suspend_barrier, reason); } - - --tls32_.suspend_count; - if (reason == SuspendReason::kForUserCode) { - --tls32_.user_code_suspend_count; - } - - if (tls32_.suspend_count == 0) { - AtomicClearFlag(ThreadFlag::kSuspendRequest); - } -} - -inline void Thread::IncrementSuspendCount(Thread* self) { - IncrementSuspendCount(self, nullptr, nullptr, SuspendReason::kInternal); } inline ShadowFrame* Thread::PushShadowFrame(ShadowFrame* new_top_frame) { diff --git a/runtime/thread.cc b/runtime/thread.cc index 6de88e6973..d233e6f174 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -1436,8 +1436,7 @@ uint64_t Thread::GetCpuMicroTime() const { } // Attempt to rectify locks so that we dump thread list with required locks before exiting. -void Thread::UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread) - NO_THREAD_SAFETY_ANALYSIS { +static void UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread) NO_THREAD_SAFETY_ANALYSIS { LOG(ERROR) << *thread << " suspend count already zero."; Locks::thread_suspend_count_lock_->Unlock(self); if (!Locks::mutator_lock_->IsSharedHeld(self)) { @@ -1455,51 +1454,141 @@ void Thread::UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread) std::ostringstream ss; Runtime::Current()->GetThreadList()->Dump(ss); LOG(FATAL) << ss.str(); - UNREACHABLE(); } -bool Thread::PassActiveSuspendBarriers() { - DCHECK_EQ(this, Thread::Current()); - // Grab the suspend_count lock and copy the current set of barriers. Then clear the list and the - // flag. The IncrementSuspendCount function requires the lock so we prevent a race between setting +bool Thread::ModifySuspendCountInternal(Thread* self, + int delta, + AtomicInteger* suspend_barrier, + SuspendReason reason) { + if (kIsDebugBuild) { + DCHECK(delta == -1 || delta == +1) + << reason << " " << delta << " " << this; + Locks::thread_suspend_count_lock_->AssertHeld(self); + if (this != self && !IsSuspended()) { + Locks::thread_list_lock_->AssertHeld(self); + } + } + // User code suspensions need to be checked more closely since they originate from code outside of + // the runtime's control. + if (UNLIKELY(reason == SuspendReason::kForUserCode)) { + Locks::user_code_suspension_lock_->AssertHeld(self); + if (UNLIKELY(delta + tls32_.user_code_suspend_count < 0)) { + LOG(ERROR) << "attempting to modify suspend count in an illegal way."; + return false; + } + } + if (UNLIKELY(delta < 0 && tls32_.suspend_count <= 0)) { + UnsafeLogFatalForSuspendCount(self, this); + return false; + } + + if (gUseReadBarrier && delta > 0 && this != self && tlsPtr_.flip_function != nullptr) { + // Force retry of a suspend request if it's in the middle of a thread flip to avoid a + // deadlock. b/31683379. + return false; + } + + uint32_t flags = enum_cast<uint32_t>(ThreadFlag::kSuspendRequest); + if (delta > 0 && suspend_barrier != nullptr) { + uint32_t available_barrier = kMaxSuspendBarriers; + for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) { + if (tlsPtr_.active_suspend_barriers[i] == nullptr) { + available_barrier = i; + break; + } + } + if (available_barrier == kMaxSuspendBarriers) { + // No barrier spaces available, we can't add another. 
+ return false; + } + tlsPtr_.active_suspend_barriers[available_barrier] = suspend_barrier; + flags |= enum_cast<uint32_t>(ThreadFlag::kActiveSuspendBarrier); + } + + tls32_.suspend_count += delta; + switch (reason) { + case SuspendReason::kForUserCode: + tls32_.user_code_suspend_count += delta; + break; + case SuspendReason::kInternal: + break; + } + + if (tls32_.suspend_count == 0) { + AtomicClearFlag(ThreadFlag::kSuspendRequest); + } else { + // Two bits might be set simultaneously. + tls32_.state_and_flags.fetch_or(flags, std::memory_order_seq_cst); + TriggerSuspend(); + } + return true; +} + +bool Thread::PassActiveSuspendBarriers(Thread* self) { + // Grab the suspend_count lock and copy the current set of + // barriers. Then clear the list and the flag. The ModifySuspendCount + // function requires the lock so we prevent a race between setting // the kActiveSuspendBarrier flag and clearing it. - std::vector<AtomicInteger*> pass_barriers{}; + AtomicInteger* pass_barriers[kMaxSuspendBarriers]; { - MutexLock mu(this, *Locks::thread_suspend_count_lock_); + MutexLock mu(self, *Locks::thread_suspend_count_lock_); if (!ReadFlag(ThreadFlag::kActiveSuspendBarrier)) { - // quick exit test: the barriers have already been claimed - this is possible as there may - // be a race to claim and it doesn't matter who wins. - // All of the callers of this function (except the SuspendAllInternal) will first test the - // kActiveSuspendBarrier flag without lock. Here double-check whether the barrier has been - // passed with the suspend_count lock. + // quick exit test: the barriers have already been claimed - this is + // possible as there may be a race to claim and it doesn't matter + // who wins. + // All of the callers of this function (except the SuspendAllInternal) + // will first test the kActiveSuspendBarrier flag without lock. Here + // double-check whether the barrier has been passed with the + // suspend_count lock. return false; } - if (tlsPtr_.active_suspendall_barrier != nullptr) { - pass_barriers.push_back(tlsPtr_.active_suspendall_barrier); - tlsPtr_.active_suspendall_barrier = nullptr; - } - for (WrappedSuspend1Barrier* w = tlsPtr_.active_suspend1_barriers; w != nullptr; w = w->next_) { - pass_barriers.push_back(&(w->barrier_)); + + for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) { + pass_barriers[i] = tlsPtr_.active_suspend_barriers[i]; + tlsPtr_.active_suspend_barriers[i] = nullptr; } - tlsPtr_.active_suspend1_barriers = nullptr; AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); } uint32_t barrier_count = 0; - for (AtomicInteger* barrier : pass_barriers) { - ++barrier_count; - int32_t old_val = barrier->fetch_sub(1, std::memory_order_release); - CHECK_GT(old_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << old_val; + for (uint32_t i = 0; i < kMaxSuspendBarriers; i++) { + AtomicInteger* pending_threads = pass_barriers[i]; + if (pending_threads != nullptr) { + bool done = false; + do { + int32_t cur_val = pending_threads->load(std::memory_order_relaxed); + CHECK_GT(cur_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << cur_val; + // Reduce value by 1. + done = pending_threads->CompareAndSetWeakRelaxed(cur_val, cur_val - 1); #if ART_USE_FUTEXES - if (old_val == 1) { - futex(barrier->Address(), FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0); - } + if (done && (cur_val - 1) == 0) { // Weak CAS may fail spuriously. 
+ futex(pending_threads->Address(), FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0); + } #endif + } while (!done); + ++barrier_count; + } } CHECK_GT(barrier_count, 0U); return true; } +void Thread::ClearSuspendBarrier(AtomicInteger* target) { + CHECK(ReadFlag(ThreadFlag::kActiveSuspendBarrier)); + bool clear_flag = true; + for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) { + AtomicInteger* ptr = tlsPtr_.active_suspend_barriers[i]; + if (ptr == target) { + tlsPtr_.active_suspend_barriers[i] = nullptr; + } else if (ptr != nullptr) { + clear_flag = false; + } + } + if (LIKELY(clear_flag)) { + AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); + } +} + void Thread::RunCheckpointFunction() { // If this thread is suspended and another thread is running the checkpoint on its behalf, // we may have a pending flip function that we need to run for the sake of those checkpoints @@ -1551,25 +1640,28 @@ void Thread::RunEmptyCheckpoint() { } bool Thread::RequestCheckpoint(Closure* function) { - StateAndFlags old_state_and_flags(0 /* unused */), new_state_and_flags(0 /* unused */); - do { - old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); - if (old_state_and_flags.GetState() != ThreadState::kRunnable) { - return false; // Fail, thread is suspended and so can't run a checkpoint. - } - new_state_and_flags = old_state_and_flags; - new_state_and_flags.SetFlag(ThreadFlag::kCheckpointRequest); - } while (!tls32_.state_and_flags.CompareAndSetWeakSequentiallyConsistent( - old_state_and_flags.GetValue(), new_state_and_flags.GetValue())); - // Succeeded setting checkpoint flag, now insert the actual checkpoint. - if (tlsPtr_.checkpoint_function == nullptr) { - tlsPtr_.checkpoint_function = function; - } else { - checkpoint_overflow_.push_back(function); + StateAndFlags old_state_and_flags = GetStateAndFlags(std::memory_order_relaxed); + if (old_state_and_flags.GetState() != ThreadState::kRunnable) { + return false; // Fail, thread is suspended and so can't run a checkpoint. } - DCHECK(ReadFlag(ThreadFlag::kCheckpointRequest)); - TriggerSuspend(); - return true; + + // We must be runnable to request a checkpoint. + DCHECK_EQ(old_state_and_flags.GetState(), ThreadState::kRunnable); + StateAndFlags new_state_and_flags = old_state_and_flags; + new_state_and_flags.SetFlag(ThreadFlag::kCheckpointRequest); + bool success = tls32_.state_and_flags.CompareAndSetStrongSequentiallyConsistent( + old_state_and_flags.GetValue(), new_state_and_flags.GetValue()); + if (success) { + // Succeeded setting checkpoint flag, now insert the actual checkpoint. + if (tlsPtr_.checkpoint_function == nullptr) { + tlsPtr_.checkpoint_function = function; + } else { + checkpoint_overflow_.push_back(function); + } + CHECK(ReadFlag(ThreadFlag::kCheckpointRequest)); + TriggerSuspend(); + } + return success; } bool Thread::RequestEmptyCheckpoint() { @@ -1648,85 +1740,85 @@ bool Thread::RequestSynchronousCheckpoint(Closure* function, ThreadState suspend Thread* self_thread; }; - Locks::thread_list_lock_->AssertExclusiveHeld(self); - // If this thread is runnable, try to schedule a checkpoint. Do some gymnastics to not hold the - // suspend-count lock for too long. - if (GetState() == ThreadState::kRunnable) { - BarrierClosure barrier_closure(function); - bool installed = false; - { - MutexLock mu(self, *Locks::thread_suspend_count_lock_); - installed = RequestCheckpoint(&barrier_closure); - } - if (installed) { - // Relinquish the thread-list lock. We should not wait holding any locks. 
We cannot - // reacquire it since we don't know if 'this' hasn't been deleted yet. - Locks::thread_list_lock_->ExclusiveUnlock(self); - ScopedThreadStateChange sts(self, suspend_state); - barrier_closure.Wait(self, suspend_state); - return true; + for (;;) { + Locks::thread_list_lock_->AssertExclusiveHeld(self); + // If this thread is runnable, try to schedule a checkpoint. Do some gymnastics to not hold the + // suspend-count lock for too long. + if (GetState() == ThreadState::kRunnable) { + BarrierClosure barrier_closure(function); + bool installed = false; + { + MutexLock mu(self, *Locks::thread_suspend_count_lock_); + installed = RequestCheckpoint(&barrier_closure); + } + if (installed) { + // Relinquish the thread-list lock. We should not wait holding any locks. We cannot + // reacquire it since we don't know if 'this' hasn't been deleted yet. + Locks::thread_list_lock_->ExclusiveUnlock(self); + ScopedThreadStateChange sts(self, suspend_state); + barrier_closure.Wait(self, suspend_state); + return true; + } + // Fall-through. } - // No longer runnable. Fall-through. - } - // This thread is not runnable, make sure we stay suspended, then run the checkpoint. - // Note: IncrementSuspendCount also expects the thread_list_lock to be held in - // certain situations. - while (true) { - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + // This thread is not runnable, make sure we stay suspended, then run the checkpoint. + // Note: ModifySuspendCountInternal also expects the thread_list_lock to be held in + // certain situations. + { + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - if (GetFlipFunction() == nullptr) { - IncrementSuspendCount(self); - break; + if (!ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal)) { + // Just retry the loop. + sched_yield(); + continue; + } } - DCHECK(gUseReadBarrier); // Flip functions are only used by CC collector. - sched_yield(); - } - { - // Release for the wait. The suspension will keep the target thread from being unregistered - // and deleted. Reacquire after so that we can call DecrementSuspendCount without racing - // against ThreadList::Unregister. - ScopedThreadListLockUnlock stllu(self); { - ScopedThreadStateChange sts(self, suspend_state); - while (GetState() == ThreadState::kRunnable) { - // The target thread became runnable again. Wait till the suspend triggered in - // IncrementSuspendCount moves the target thread to suspended. - sched_yield(); + // Release for the wait. The suspension will keep us from being deleted. Reacquire after so + // that we can call ModifySuspendCount without racing against ThreadList::Unregister. + ScopedThreadListLockUnlock stllu(self); + { + ScopedThreadStateChange sts(self, suspend_state); + while (GetState() == ThreadState::kRunnable) { + // We became runnable again. Wait till the suspend triggered in ModifySuspendCount + // moves us to suspended. + sched_yield(); + } } - } - function->Run(this); - } + function->Run(this); + } - { - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + { + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - // Target was in a suspended state with a nonzero suspend count, so it must stay suspended. 
- DCHECK_NE(GetState(), ThreadState::kRunnable); - DecrementSuspendCount(self); - } + DCHECK_NE(GetState(), ThreadState::kRunnable); + bool updated = ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); + DCHECK(updated); + } - { - // Imitate ResumeAll, the thread may be waiting on Thread::resume_cond_ since we raised its - // suspend count. Now the suspend_count_ is lowered so we must do the broadcast. - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - Thread::resume_cond_->Broadcast(self); - } + { + // Imitate ResumeAll, the thread may be waiting on Thread::resume_cond_ since we raised its + // suspend count. Now the suspend_count_ is lowered so we must do the broadcast. + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + Thread::resume_cond_->Broadcast(self); + } - // Release the thread_list_lock_ to be consistent with the barrier-closure path. - Locks::thread_list_lock_->ExclusiveUnlock(self); + // Release the thread_list_lock_ to be consistent with the barrier-closure path. + Locks::thread_list_lock_->ExclusiveUnlock(self); - return true; // We're done, break out of the loop. + return true; // We're done, break out of the loop. + } } void Thread::SetFlipFunction(Closure* function) { // This is called with all threads suspended, except for the calling thread. DCHECK(IsSuspended() || Thread::Current() == this); DCHECK(function != nullptr); - DCHECK(GetFlipFunction() == nullptr); - tlsPtr_.flip_function.store(function, std::memory_order_relaxed); + DCHECK(tlsPtr_.flip_function == nullptr); + tlsPtr_.flip_function = function; DCHECK(!GetStateAndFlags(std::memory_order_relaxed).IsAnyOfFlagsSet(FlipFunctionFlags())); AtomicSetFlag(ThreadFlag::kPendingFlipFunction, std::memory_order_release); } @@ -1758,8 +1850,8 @@ void Thread::RunFlipFunction(Thread* self, bool notify) { // no other thread may run checkpoints on a thread that's actually Runnable. DCHECK_EQ(notify, ReadFlag(ThreadFlag::kRunningFlipFunction)); - Closure* flip_function = GetFlipFunction(); - tlsPtr_.flip_function.store(nullptr, std::memory_order_relaxed); + Closure* flip_function = tlsPtr_.flip_function; + tlsPtr_.flip_function = nullptr; DCHECK(flip_function != nullptr); flip_function->Run(this); @@ -2354,9 +2446,10 @@ Thread::Thread(bool daemon) tlsPtr_.rosalloc_runs + kNumRosAllocThreadLocalSizeBracketsInThread, gc::allocator::RosAlloc::GetDedicatedFullRun()); tlsPtr_.checkpoint_function = nullptr; - tlsPtr_.active_suspendall_barrier = nullptr; - tlsPtr_.active_suspend1_barriers = nullptr; - tlsPtr_.flip_function.store(nullptr, std::memory_order_relaxed); + for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) { + tlsPtr_.active_suspend_barriers[i] = nullptr; + } + tlsPtr_.flip_function = nullptr; tlsPtr_.thread_local_mark_stack = nullptr; tls32_.is_transitioning_to_runnable = false; ResetTlab(); @@ -2504,7 +2597,7 @@ Thread::~Thread() { CHECK(!ReadFlag(ThreadFlag::kEmptyCheckpointRequest)); CHECK(tlsPtr_.checkpoint_function == nullptr); CHECK_EQ(checkpoint_overflow_.size(), 0u); - CHECK(GetFlipFunction() == nullptr); + CHECK(tlsPtr_.flip_function == nullptr); CHECK_EQ(tls32_.is_transitioning_to_runnable, false); // Make sure we processed all deoptimization requests. 
diff --git a/runtime/thread.h b/runtime/thread.h index d979cce6c1..6b1c16c907 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -46,7 +46,6 @@ #include "reflective_handle_scope.h" #include "runtime_globals.h" #include "runtime_stats.h" -#include "suspend_reason.h" #include "thread_state.h" namespace unwindstack { @@ -102,6 +101,7 @@ class RootVisitor; class ScopedObjectAccessAlreadyRunnable; class ShadowFrame; class StackedShadowFrameRecord; +enum class SuspendReason : char; class Thread; class ThreadList; enum VisitRootFlags : uint8_t; @@ -133,7 +133,6 @@ enum class ThreadFlag : uint32_t { kEmptyCheckpointRequest = 1u << 2, // Register that at least 1 suspend barrier needs to be passed. - // Changes to this flag are guarded by suspend_count_lock_ . kActiveSuspendBarrier = 1u << 3, // Marks that a "flip function" needs to be executed on this thread. @@ -141,7 +140,7 @@ enum class ThreadFlag : uint32_t { // Marks that the "flip function" is being executed by another thread. // - // This is used to guard against multiple threads trying to run the + // This is used to guards against multiple threads trying to run the // "flip function" for the same thread while the thread is suspended. // // This is not needed when the thread is running the flip function @@ -188,13 +187,6 @@ enum class WeakRefAccessState : int32_t { kDisabled }; -// See Thread.tlsPtr_.active_suspend1_barriers below for explanation. -struct WrappedSuspend1Barrier { - WrappedSuspend1Barrier() : barrier_(1), next_(nullptr) {} - AtomicInteger barrier_; - struct WrappedSuspend1Barrier* next_ GUARDED_BY(Locks::thread_suspend_count_lock_); -}; - // This should match RosAlloc::kNumThreadLocalSizeBrackets. static constexpr size_t kNumRosAllocThreadLocalSizeBracketsInThread = 16; @@ -314,7 +306,7 @@ class Thread { } bool IsSuspended() const { - StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_acquire); + StateAndFlags state_and_flags = GetStateAndFlags(std::memory_order_relaxed); return state_and_flags.GetState() != ThreadState::kRunnable && state_and_flags.IsFlagSet(ThreadFlag::kSuspendRequest); } @@ -330,32 +322,20 @@ class Thread { return tls32_.define_class_counter; } - // Increment suspend count and optionally install at most one suspend barrier. - // Should not be invoked on another thread while the target's flip_function is not null. + // If delta > 0 and (this != self or suspend_barrier is not null), this function may temporarily + // release thread_suspend_count_lock_ internally. ALWAYS_INLINE - void IncrementSuspendCount(Thread* self, - AtomicInteger* suspend_all_barrier, - WrappedSuspend1Barrier* suspend1_barrier, - SuspendReason reason) - REQUIRES(Locks::thread_suspend_count_lock_); - - // The same, but default reason to kInternal, and barriers to nullptr. - ALWAYS_INLINE void IncrementSuspendCount(Thread* self) + bool ModifySuspendCount(Thread* self, + int delta, + AtomicInteger* suspend_barrier, + SuspendReason reason) + WARN_UNUSED REQUIRES(Locks::thread_suspend_count_lock_); - ALWAYS_INLINE void DecrementSuspendCount(Thread* self, - SuspendReason reason = SuspendReason::kInternal) - REQUIRES(Locks::thread_suspend_count_lock_); - - private: - NO_RETURN static void UnsafeLogFatalForSuspendCount(Thread* self, Thread* thread); - - public: // Requests a checkpoint closure to run on another thread. The closure will be run when the // thread notices the request, either in an explicit runtime CheckSuspend() call, or in a call // originating from a compiler generated suspend point check. 
This returns true if the closure - // was added and will (eventually) be executed. It returns false if the thread was found to be - // runnable. + // was added and will (eventually) be executed. It returns false otherwise. // // Since multiple closures can be queued and some closures can delay other threads from running, // no closure should attempt to suspend another thread while running. @@ -385,14 +365,8 @@ class Thread { bool RequestEmptyCheckpoint() REQUIRES(Locks::thread_suspend_count_lock_); - Closure* GetFlipFunction() { - return tlsPtr_.flip_function.load(std::memory_order_relaxed); - } - // Set the flip function. This is done with all threads suspended, except for the calling thread. - void SetFlipFunction(Closure* function) - REQUIRES(Locks::thread_suspend_count_lock_) - REQUIRES(Locks::thread_list_lock_); + void SetFlipFunction(Closure* function); // Ensure that thread flip function started running. If no other thread is executing // it, the calling thread shall run the flip function and then notify other threads @@ -1229,6 +1203,9 @@ class Thread { tlsPtr_.held_mutexes[level] = mutex; } + void ClearSuspendBarrier(AtomicInteger* target) + REQUIRES(Locks::thread_suspend_count_lock_); + bool ReadFlag(ThreadFlag flag) const { return GetStateAndFlags(std::memory_order_relaxed).IsFlagSet(flag); } @@ -1474,7 +1451,6 @@ class Thread { private: explicit Thread(bool daemon); - // A runnable thread should only be deleted from the thread itself. ~Thread() REQUIRES(!Locks::mutator_lock_, !Locks::thread_suspend_count_lock_); void Destroy(); @@ -1502,8 +1478,7 @@ class Thread { // Avoid use, callers should use SetState. // Used only by `Thread` destructor and stack trace collection in semi-space GC (currently - // disabled by `kStoreStackTraces = false`). May not be called on a runnable thread other - // than Thread::Current(). + // disabled by `kStoreStackTraces = false`). // NO_THREAD_SAFETY_ANALYSIS: This function is "Unsafe" and can be called in // different states, so clang cannot perform the thread safety analysis. ThreadState SetStateUnsafe(ThreadState new_state) NO_THREAD_SAFETY_ANALYSIS { @@ -1512,13 +1487,12 @@ class Thread { if (old_state == new_state) { // Nothing to do. } else if (old_state == ThreadState::kRunnable) { - DCHECK_EQ(this, Thread::Current()); // Need to run pending checkpoint and suspend barriers. Run checkpoints in runnable state in // case they need to use a ScopedObjectAccess. If we are holding the mutator lock and a SOA // attempts to TransitionFromSuspendedToRunnable, it results in a deadlock. TransitionToSuspendedAndRunCheckpoints(new_state); // Since we transitioned to a suspended state, check the pass barrier requests. - CheckActiveSuspendBarriers(); + PassActiveSuspendBarriers(); } else { while (true) { StateAndFlags new_state_and_flags = old_state_and_flags; @@ -1591,26 +1565,9 @@ class Thread { REQUIRES(!Locks::thread_suspend_count_lock_, !Roles::uninterruptible_) REQUIRES_SHARED(Locks::mutator_lock_); - // Call PassActiveSuspendBarriers() if there are active barriers. Only called on current thread. - ALWAYS_INLINE void CheckActiveSuspendBarriers() + ALWAYS_INLINE void PassActiveSuspendBarriers() REQUIRES(!Locks::thread_suspend_count_lock_, !Roles::uninterruptible_); - // Decrement all "suspend barriers" for the current thread, notifying threads that requested our - // suspension. Only called on current thread. - bool PassActiveSuspendBarriers() - REQUIRES(!Locks::thread_suspend_count_lock_); - - // Add an entry to active_suspend1_barriers. 
- ALWAYS_INLINE void AddSuspend1Barrier(WrappedSuspend1Barrier* suspend1_barrier) - REQUIRES(Locks::thread_suspend_count_lock_); - - // Remove last-added entry from active_suspend1_barriers. - // Only makes sense if we're still holding thread_suspend_count_lock_ since insertion. - ALWAYS_INLINE void RemoveFirstSuspend1Barrier() - REQUIRES(Locks::thread_suspend_count_lock_); - - ALWAYS_INLINE bool HasActiveSuspendBarrier() REQUIRES(Locks::thread_suspend_count_lock_); - // Registers the current thread as the jit sensitive thread. Should be called just once. static void SetJitSensitiveThread() { if (jit_sensitive_thread_ == nullptr) { @@ -1625,6 +1582,13 @@ class Thread { is_sensitive_thread_hook_ = is_sensitive_thread_hook; } + bool ModifySuspendCountInternal(Thread* self, + int delta, + AtomicInteger* suspend_barrier, + SuspendReason reason) + WARN_UNUSED + REQUIRES(Locks::thread_suspend_count_lock_); + // Runs a single checkpoint function. If there are no more pending checkpoint functions it will // clear the kCheckpointRequest flag. The caller is responsible for calling this in a loop until // the kCheckpointRequest flag is cleared. @@ -1633,6 +1597,9 @@ class Thread { REQUIRES_SHARED(Locks::mutator_lock_); void RunEmptyCheckpoint(); + bool PassActiveSuspendBarriers(Thread* self) + REQUIRES(!Locks::thread_suspend_count_lock_); + // Install the protected region for implicit stack checks. void InstallImplicitProtection(); @@ -1918,47 +1885,45 @@ class Thread { } tls64_; struct PACKED(sizeof(void*)) tls_ptr_sized_values { - tls_ptr_sized_values() : card_table(nullptr), - exception(nullptr), - stack_end(nullptr), - managed_stack(), - suspend_trigger(nullptr), - jni_env(nullptr), - tmp_jni_env(nullptr), - self(nullptr), - opeer(nullptr), - jpeer(nullptr), - stack_begin(nullptr), - stack_size(0), - deps_or_stack_trace_sample(), - wait_next(nullptr), - monitor_enter_object(nullptr), - top_handle_scope(nullptr), - class_loader_override(nullptr), - long_jump_context(nullptr), - instrumentation_stack(nullptr), - stacked_shadow_frame_record(nullptr), - deoptimization_context_stack(nullptr), - frame_id_to_shadow_frame(nullptr), - name(nullptr), - pthread_self(0), - last_no_thread_suspension_cause(nullptr), - checkpoint_function(nullptr), - active_suspendall_barrier(nullptr), - active_suspend1_barriers(nullptr), - thread_local_pos(nullptr), - thread_local_end(nullptr), - thread_local_start(nullptr), - thread_local_limit(nullptr), - thread_local_objects(0), - thread_local_alloc_stack_top(nullptr), - thread_local_alloc_stack_end(nullptr), - mutator_lock(nullptr), - flip_function(nullptr), - method_verifier(nullptr), - thread_local_mark_stack(nullptr), - async_exception(nullptr), - top_reflective_handle_scope(nullptr) { + tls_ptr_sized_values() : card_table(nullptr), + exception(nullptr), + stack_end(nullptr), + managed_stack(), + suspend_trigger(nullptr), + jni_env(nullptr), + tmp_jni_env(nullptr), + self(nullptr), + opeer(nullptr), + jpeer(nullptr), + stack_begin(nullptr), + stack_size(0), + deps_or_stack_trace_sample(), + wait_next(nullptr), + monitor_enter_object(nullptr), + top_handle_scope(nullptr), + class_loader_override(nullptr), + long_jump_context(nullptr), + instrumentation_stack(nullptr), + stacked_shadow_frame_record(nullptr), + deoptimization_context_stack(nullptr), + frame_id_to_shadow_frame(nullptr), + name(nullptr), + pthread_self(0), + last_no_thread_suspension_cause(nullptr), + checkpoint_function(nullptr), + thread_local_start(nullptr), + thread_local_pos(nullptr), + 
thread_local_end(nullptr), + thread_local_limit(nullptr), + thread_local_objects(0), + thread_local_alloc_stack_top(nullptr), + thread_local_alloc_stack_end(nullptr), + mutator_lock(nullptr), + flip_function(nullptr), + method_verifier(nullptr), + thread_local_mark_stack(nullptr), + async_exception(nullptr), + top_reflective_handle_scope(nullptr) { std::fill(held_mutexes, held_mutexes + kLockLevelCount, nullptr); } @@ -2068,30 +2033,20 @@ class Thread { // requests another checkpoint, it goes to the checkpoint overflow list. Closure* checkpoint_function GUARDED_BY(Locks::thread_suspend_count_lock_); - // After a thread observes a suspend request and enters a suspended state, - // it notifies the requestor by arriving at a "suspend barrier". This consists of decrementing - // the atomic integer representing the barrier. (This implementation was introduced in 2015 to - // minimize cost. There may be other options.) These atomic integer barriers are always - // stored on the requesting thread's stack. They are referenced from the target thread's - // data structure in one of two ways; in either case the data structure referring to these - // barriers is guarded by suspend_count_lock: - // 1. A SuspendAll barrier is directly referenced from the target thread. Only one of these - // can be active at a time: - AtomicInteger* active_suspendall_barrier GUARDED_BY(Locks::thread_suspend_count_lock_); - // 2. For individual thread suspensions, active barriers are embedded in a struct that is used - // to link together all suspend requests for this thread. Unlike the SuspendAll case, each - // barrier is referenced by a single target thread, and thus can appear only on a single list. - // The struct as a whole is still stored on the requesting thread's stack. - WrappedSuspend1Barrier* active_suspend1_barriers GUARDED_BY(Locks::thread_suspend_count_lock_); + // Pending barriers that require passing or NULL if non-pending. Installation guarding by + // Locks::thread_suspend_count_lock_. + // They work effectively as art::Barrier, but implemented directly using AtomicInteger and futex + // to avoid additional cost of a mutex and a condition variable, as used in art::Barrier. + AtomicInteger* active_suspend_barriers[kMaxSuspendBarriers]; + + // Thread-local allocation pointer. Moved here to force alignment for thread_local_pos on ARM. + uint8_t* thread_local_start; // thread_local_pos and thread_local_end must be consecutive for ldrd and are 8 byte aligned for // potentially better performance. uint8_t* thread_local_pos; uint8_t* thread_local_end; - // Thread-local allocation pointer. Can be moved above the preceding two to correct alignment. - uint8_t* thread_local_start; - // Thread local limit is how much we can expand the thread local buffer to, it is greater or // equal to thread_local_end. uint8_t* thread_local_limit; @@ -2118,8 +2073,7 @@ class Thread { BaseMutex* held_mutexes[kLockLevelCount]; // The function used for thread flip. - // Set only while holding Locks::thread_suspend_count_lock_ . May be cleared while being read. - std::atomic<Closure*> flip_function; + Closure* flip_function; // Current method verifier, used for root marking. 
verifier::MethodVerifier* method_verifier; diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc index 77e373d7de..6b23c349ef 100644 --- a/runtime/thread_list.cc +++ b/runtime/thread_list.cc @@ -62,7 +62,10 @@ namespace art { using android::base::StringPrintf; static constexpr uint64_t kLongThreadSuspendThreshold = MsToNs(5); -static constexpr useconds_t kThreadSuspendSleepUs = 1000; +// Use 0 since we want to yield to prevent blocking for an unpredictable amount of time. +static constexpr useconds_t kThreadSuspendInitialSleepUs = 0; +static constexpr useconds_t kThreadSuspendMaxYieldUs = 3000; +static constexpr useconds_t kThreadSuspendMaxSleepUs = 5000; // Whether we should try to dump the native stack of unattached threads. See commit ed8b723 for // some history. @@ -71,7 +74,7 @@ static constexpr bool kDumpUnattachedThreadNativeStackForSigQuit = true; ThreadList::ThreadList(uint64_t thread_suspend_timeout_ns) : suspend_all_count_(0), unregistering_count_(0), - suspend_all_histogram_("suspend all histogram", 16, 64), + suspend_all_historam_("suspend all histogram", 16, 64), long_suspend_(false), shut_down_(false), thread_suspend_timeout_ns_(thread_suspend_timeout_ns), @@ -132,10 +135,10 @@ void ThreadList::DumpForSigQuit(std::ostream& os) { { ScopedObjectAccess soa(Thread::Current()); // Only print if we have samples. - if (suspend_all_histogram_.SampleSize() > 0) { + if (suspend_all_historam_.SampleSize() > 0) { Histogram<uint64_t>::CumulativeData data; - suspend_all_histogram_.CreateHistogram(&data); - suspend_all_histogram_.PrintConfidenceIntervals(os, 0.99, data); // Dump time to suspend. + suspend_all_historam_.CreateHistogram(&data); + suspend_all_historam_.PrintConfidenceIntervals(os, 0.99, data); // Dump time to suspend. } } bool dump_native_stack = Runtime::Current()->GetDumpNativeStackOnSigQuit(); @@ -257,11 +260,11 @@ void ThreadList::Dump(std::ostream& os, bool dump_native_stack) { } } -void ThreadList::AssertOtherThreadsAreSuspended(Thread* self) { +void ThreadList::AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2) { MutexLock mu(self, *Locks::thread_list_lock_); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); for (const auto& thread : list_) { - if (thread != self) { + if (thread != ignore1 && thread != ignore2) { CHECK(thread->IsSuspended()) << "\nUnsuspended thread: <<" << *thread << "\n" << "self: <<" << *Thread::Current(); @@ -288,6 +291,19 @@ NO_RETURN static void UnsafeLogFatalForThreadSuspendAllTimeout() { } #endif +// Unlike suspending all threads where we can wait to acquire the mutator_lock_, suspending an +// individual thread requires polling. delay_us is the requested sleep wait. If delay_us is 0 then +// we use sched_yield instead of calling usleep. +// Although there is the possibility, here and elsewhere, that usleep could return -1 and +// errno = EINTR, there should be no problem if interrupted, so we do not check. +static void ThreadSuspendSleep(useconds_t delay_us) { + if (delay_us == 0) { + sched_yield(); + } else { + usleep(delay_us); + } +} + size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback) { Thread* self = Thread::Current(); Locks::mutator_lock_->AssertNotExclusiveHeld(self); @@ -310,7 +326,9 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback // This thread will run its checkpoint some time in the near future. if (requested_suspend) { // The suspend request is now unnecessary. 
- thread->DecrementSuspendCount(self); + bool updated = + thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); + DCHECK(updated); requested_suspend = false; } break; @@ -321,9 +339,9 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback continue; } if (!requested_suspend) { - // Since we don't suspend other threads to run checkpoint_function, we claim this is - // safe even with flip_function set. - thread->IncrementSuspendCount(self); + bool updated = + thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal); + DCHECK(updated); requested_suspend = true; if (thread->IsSuspended()) { break; @@ -358,7 +376,8 @@ size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback checkpoint_function->Run(thread); { MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - thread->DecrementSuspendCount(self); + bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); + DCHECK(updated); } } @@ -497,14 +516,14 @@ size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, // ThreadFlipBegin happens before we suspend all the threads, so it does not count towards the // pause. const uint64_t suspend_start_time = NanoTime(); - SuspendAllInternal(self); + SuspendAllInternal(self, self, nullptr); if (pause_listener != nullptr) { pause_listener->StartPause(); } // Run the flip callback for the collector. Locks::mutator_lock_->ExclusiveLock(self); - suspend_all_histogram_.AdjustAndAddValue(NanoTime() - suspend_start_time); + suspend_all_historam_.AdjustAndAddValue(NanoTime() - suspend_start_time); flip_callback->Run(self); Locks::mutator_lock_->ExclusiveUnlock(self); collector->RegisterPause(NanoTime() - suspend_start_time); @@ -535,7 +554,8 @@ size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, if ((state == ThreadState::kWaitingForGcThreadFlip || thread->IsTransitioningToRunnable()) && thread->GetSuspendCount() == 1) { // The thread will resume right after the broadcast. - thread->DecrementSuspendCount(self); + bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); + DCHECK(updated); ++runnable_thread_count; } else { other_threads.push_back(thread); @@ -564,7 +584,8 @@ size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, TimingLogger::ScopedTiming split4("ResumeOtherThreads", collector->GetTimings()); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); for (const auto& thread : other_threads) { - thread->DecrementSuspendCount(self); + bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); + DCHECK(updated); } Thread::resume_cond_->Broadcast(self); } @@ -572,32 +593,6 @@ size_t ThreadList::FlipThreadRoots(Closure* thread_flip_visitor, return runnable_thread_count + other_threads.size() + 1; // +1 for self. } -bool ThreadList::WaitForSuspendBarrier(AtomicInteger* barrier) { -#if ART_USE_FUTEXES - timespec wait_timeout; - InitTimeSpec(false, CLOCK_MONOTONIC, NsToMs(thread_suspend_timeout_ns_), 0, &wait_timeout); -#endif - while (true) { - int32_t cur_val = barrier->load(std::memory_order_acquire); - if (cur_val <= 0) { - CHECK_EQ(cur_val, 0); - return true; - } -#if ART_USE_FUTEXES - if (futex(barrier->Address(), FUTEX_WAIT_PRIVATE, cur_val, &wait_timeout, nullptr, 0) - != 0) { - if (errno == ETIMEDOUT) { - return false; - } else if (errno != EAGAIN && errno != EINTR) { - PLOG(FATAL) << "futex wait for suspend barrier failed"; - } - } -#endif - // Else spin wait. 
This is likely to be slow, but ART_USE_FUTEXES is set on Linux, - // including all targets. - } -} - void ThreadList::SuspendAll(const char* cause, bool long_suspend) { Thread* self = Thread::Current(); @@ -610,7 +605,7 @@ void ThreadList::SuspendAll(const char* cause, bool long_suspend) { ScopedTrace trace("Suspending mutator threads"); const uint64_t start_time = NanoTime(); - SuspendAllInternal(self); + SuspendAllInternal(self, self); // All threads are known to have suspended (but a thread may still own the mutator lock) // Make sure this thread grabs exclusive access to the mutator lock and its protected data. #if HAVE_TIMED_RWLOCK @@ -634,14 +629,14 @@ void ThreadList::SuspendAll(const char* cause, bool long_suspend) { const uint64_t end_time = NanoTime(); const uint64_t suspend_time = end_time - start_time; - suspend_all_histogram_.AdjustAndAddValue(suspend_time); + suspend_all_historam_.AdjustAndAddValue(suspend_time); if (suspend_time > kLongThreadSuspendThreshold) { LOG(WARNING) << "Suspending all threads took: " << PrettyDuration(suspend_time); } if (kDebugLocking) { // Debug check that all threads are suspended. - AssertOtherThreadsAreSuspended(self); + AssertThreadsAreSuspended(self, self); } } ATraceBegin((std::string("Mutator threads suspended for ") + cause).c_str()); @@ -654,8 +649,10 @@ void ThreadList::SuspendAll(const char* cause, bool long_suspend) { } // Ensures all threads running Java suspend and that those not running Java don't start. -void ThreadList::SuspendAllInternal(Thread* self, SuspendReason reason) { - const uint64_t start_time = NanoTime(); +void ThreadList::SuspendAllInternal(Thread* self, + Thread* ignore1, + Thread* ignore2, + SuspendReason reason) { Locks::mutator_lock_->AssertNotExclusiveHeld(self); Locks::thread_list_lock_->AssertNotHeld(self); Locks::thread_suspend_count_lock_->AssertNotHeld(self); @@ -673,77 +670,85 @@ void ThreadList::SuspendAllInternal(Thread* self, SuspendReason reason) { // The atomic counter for number of threads that need to pass the barrier. AtomicInteger pending_threads; - while (true) { - { - MutexLock mu(self, *Locks::thread_list_lock_); - MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - bool in_flip = false; - for (const auto& thread : list_) { - if (thread->GetFlipFunction() != nullptr) { - in_flip = true; - break; - } - } - if (LIKELY(suspend_all_count_ == 0 - && (self == nullptr || self->GetSuspendCount() == 0) - && !in_flip)) { - // The above condition remains valid while we hold thread_suspend_count_lock_ . - bool found_myself = false; - // Update global suspend all state for attaching threads. - ++suspend_all_count_; - pending_threads.store(list_.size() - (self == nullptr ? 0 : 1), std::memory_order_relaxed); - // Increment everybody else's suspend count. - for (const auto& thread : list_) { - if (thread == self) { - found_myself = true; - } else { - VLOG(threads) << "requesting thread suspend: " << *thread; - DCHECK_EQ(suspend_all_count_, 1); - thread->IncrementSuspendCount(self, &pending_threads, nullptr, reason); - - // Must install the pending_threads counter first, then check thread->IsSuspended() - // and clear the counter. Otherwise there's a race with - // Thread::TransitionFromRunnableToSuspended() that can lead a thread to miss a call - // to PassActiveSuspendBarriers(). - if (thread->IsSuspended()) { - // Effectively pass the barrier on behalf of the already suspended thread. 
- DCHECK_EQ(thread->tlsPtr_.active_suspendall_barrier, &pending_threads); - pending_threads.fetch_sub(1, std::memory_order_seq_cst); - thread->tlsPtr_.active_suspendall_barrier = nullptr; - if (!thread->HasActiveSuspendBarrier()) { - thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); - } - } - } - } - DCHECK(self == nullptr || found_myself); - break; - } - } - // The if (LIKELY ...) condition above didn't hold. This is a bad time to initiate a suspend. - // Either the suspendall_barrier is already in use, or proceeding at this time risks deadlock. - // See b/31683379 for an explanation of the thread flip condition. Note that in the event of a - // competing Suspend or SuspendAll, we are about to be suspended anyway. We hold no locks, - // so it's safe to sleep and retry. - VLOG(threads) << "SuspendAll is sleeping"; - usleep(kThreadSuspendSleepUs); - // We're already not runnable, so an attempt to suspend us should succeed. + uint32_t num_ignored = 0; + if (ignore1 != nullptr) { + ++num_ignored; } - - if (!WaitForSuspendBarrier(&pending_threads)) { - const uint64_t wait_time = NanoTime() - start_time; + if (ignore2 != nullptr && ignore1 != ignore2) { + ++num_ignored; + } + { MutexLock mu(self, *Locks::thread_list_lock_); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - std::ostringstream oss; + // Update global suspend all state for attaching threads. + ++suspend_all_count_; + pending_threads.store(list_.size() - num_ignored, std::memory_order_relaxed); + // Increment everybody's suspend count (except those that should be ignored). for (const auto& thread : list_) { - if (thread != self && !thread->IsSuspended()) { - oss << std::endl << "Thread not suspended: " << *thread; + if (thread == ignore1 || thread == ignore2) { + continue; + } + VLOG(threads) << "requesting thread suspend: " << *thread; + bool updated = thread->ModifySuspendCount(self, +1, &pending_threads, reason); + DCHECK(updated); + + // Must install the pending_threads counter first, then check thread->IsSuspend() and clear + // the counter. Otherwise there's a race with Thread::TransitionFromRunnableToSuspended() + // that can lead a thread to miss a call to PassActiveSuspendBarriers(). + if (thread->IsSuspended()) { + // Only clear the counter for the current thread. + thread->ClearSuspendBarrier(&pending_threads); + pending_threads.fetch_sub(1, std::memory_order_seq_cst); } } - LOG(::android::base::FATAL) - << "Timed out waiting for threads to suspend, waited for " - << PrettyDuration(wait_time) - << oss.str(); + } + + // Wait for the barrier to be passed by all runnable threads. This wait + // is done with a timeout so that we can detect problems. +#if ART_USE_FUTEXES + timespec wait_timeout; + InitTimeSpec(false, CLOCK_MONOTONIC, NsToMs(thread_suspend_timeout_ns_), 0, &wait_timeout); +#endif + const uint64_t start_time = NanoTime(); + while (true) { + int32_t cur_val = pending_threads.load(std::memory_order_relaxed); + if (LIKELY(cur_val > 0)) { +#if ART_USE_FUTEXES + if (futex(pending_threads.Address(), FUTEX_WAIT_PRIVATE, cur_val, &wait_timeout, nullptr, 0) + != 0) { + if ((errno == EAGAIN) || (errno == EINTR)) { + // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning. 
+ continue; + } + if (errno == ETIMEDOUT) { + const uint64_t wait_time = NanoTime() - start_time; + MutexLock mu(self, *Locks::thread_list_lock_); + MutexLock mu2(self, *Locks::thread_suspend_count_lock_); + std::ostringstream oss; + for (const auto& thread : list_) { + if (thread == ignore1 || thread == ignore2) { + continue; + } + if (!thread->IsSuspended()) { + oss << std::endl << "Thread not suspended: " << *thread; + } + } + LOG(kIsDebugBuild ? ::android::base::FATAL : ::android::base::ERROR) + << "Timed out waiting for threads to suspend, waited for " + << PrettyDuration(wait_time) + << oss.str(); + } else { + PLOG(FATAL) << "futex wait failed for SuspendAllInternal()"; + } + } // else re-check pending_threads in the next iteration (this may be a spurious wake-up). +#else + // Spin wait. This is likely to be slow, but on most architecture ART_USE_FUTEXES is set. + UNUSED(start_time); +#endif + } else { + CHECK_EQ(cur_val, 0); + break; + } } } @@ -762,7 +767,7 @@ void ThreadList::ResumeAll() { if (kDebugLocking) { // Debug check that all threads are suspended. - AssertOtherThreadsAreSuspended(self); + AssertThreadsAreSuspended(self, self); } long_suspend_ = false; @@ -775,9 +780,11 @@ void ThreadList::ResumeAll() { --suspend_all_count_; // Decrement the suspend counts for all threads. for (const auto& thread : list_) { - if (thread != self) { - thread->DecrementSuspendCount(self); + if (thread == self) { + continue; } + bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); + DCHECK(updated); } // Broadcast a notification to all suspended threads, some or all of @@ -822,7 +829,11 @@ bool ThreadList::Resume(Thread* thread, SuspendReason reason) { << ") thread not within thread list"; return false; } - thread->DecrementSuspendCount(self, reason); + if (UNLIKELY(!thread->ModifySuspendCount(self, -1, nullptr, reason))) { + LOG(ERROR) << "Resume(" << reinterpret_cast<void*>(thread) + << ") could not modify suspend count."; + return false; + } } { @@ -852,19 +863,40 @@ static void ThreadSuspendByPeerWarning(Thread* self, } } -Thread* ThreadList::SuspendThreadByPeer(jobject peer, SuspendReason reason) { - bool is_suspended = false; +Thread* ThreadList::SuspendThreadByPeer(jobject peer, + SuspendReason reason, + bool* timed_out) { + bool request_suspension = true; + const uint64_t start_time = NanoTime(); + int self_suspend_count = 0; + useconds_t sleep_us = kThreadSuspendInitialSleepUs; + *timed_out = false; Thread* const self = Thread::Current(); + Thread* suspended_thread = nullptr; VLOG(threads) << "SuspendThreadByPeer starting"; - Thread* thread; - WrappedSuspend1Barrier wrapped_barrier{}; while (true) { + Thread* thread; { - // Note: this will transition to runnable and potentially suspend. + // Note: this will transition to runnable and potentially suspend. We ensure only one thread + // is requesting another suspend, to avoid deadlock, by requiring this function be called + // holding Locks::thread_list_suspend_thread_lock_. Its important this thread suspend rather + // than request thread suspension, to avoid potential cycles in threads requesting each other + // suspend. 
ScopedObjectAccess soa(self); MutexLock thread_list_mu(self, *Locks::thread_list_lock_); thread = Thread::FromManagedThread(soa, peer); if (thread == nullptr) { + if (suspended_thread != nullptr) { + MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_); + // If we incremented the suspend count but the thread reset its peer, we need to + // re-decrement it since it is shutting down and may deadlock the runtime in + // ThreadList::WaitForOtherNonDaemonThreadsToExit. + bool updated = suspended_thread->ModifySuspendCount(soa.Self(), + -1, + nullptr, + reason); + DCHECK(updated); + } ThreadSuspendByPeerWarning(self, ::android::base::WARNING, "No such thread for suspend", @@ -872,138 +904,188 @@ Thread* ThreadList::SuspendThreadByPeer(jobject peer, SuspendReason reason) { return nullptr; } if (!Contains(thread)) { + CHECK(suspended_thread == nullptr); VLOG(threads) << "SuspendThreadByPeer failed for unattached thread: " << reinterpret_cast<void*>(thread); return nullptr; } - // IsSuspended on the current thread will fail as the current thread is changed into - // Runnable above. As the suspend count is now raised if this is the current thread - // it will self suspend on transition to Runnable, making it hard to work with. It's simpler - // to just explicitly handle the current thread in the callers to this code. - CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger"; VLOG(threads) << "SuspendThreadByPeer found thread: " << *thread; { MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_); - if (LIKELY(self->GetSuspendCount() == 0 && thread->GetFlipFunction() == nullptr)) { - thread->IncrementSuspendCount(self, nullptr, &wrapped_barrier, reason); - if (thread->IsSuspended()) { - // See the discussion in mutator_gc_coord.md for the race here. - thread->RemoveFirstSuspend1Barrier(); - if (!thread->HasActiveSuspendBarrier()) { - thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); - } - is_suspended = true; + if (request_suspension) { + if (self->GetSuspendCount() > 0) { + // We hold the suspend count lock but another thread is trying to suspend us. Its not + // safe to try to suspend another thread in case we get a cycle. Start the loop again + // which will allow this thread to be suspended. + ++self_suspend_count; + continue; } - DCHECK_GT(thread->GetSuspendCount(), 0); - break; + CHECK(suspended_thread == nullptr); + suspended_thread = thread; + bool updated = suspended_thread->ModifySuspendCount(self, +1, nullptr, reason); + DCHECK(updated); + request_suspension = false; + } else { + // If the caller isn't requesting suspension, a suspension should have already occurred. + CHECK_GT(thread->GetSuspendCount(), 0); + } + // IsSuspended on the current thread will fail as the current thread is changed into + // Runnable above. As the suspend count is now raised if this is the current thread + // it will self suspend on transition to Runnable, making it hard to work with. It's simpler + // to just explicitly handle the current thread in the callers to this code. + CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger"; + // If thread is suspended (perhaps it was already not Runnable but didn't have a suspend + // count, or else we've waited and it has self suspended) or is the current thread, we're + // done. 
+ if (thread->IsSuspended()) { + VLOG(threads) << "SuspendThreadByPeer thread suspended: " << *thread; + if (ATraceEnabled()) { + std::string name; + thread->GetThreadName(name); + ATraceBegin(StringPrintf("SuspendThreadByPeer suspended %s for peer=%p", name.c_str(), + peer).c_str()); + } + return thread; + } + const uint64_t total_delay = NanoTime() - start_time; + if (total_delay >= thread_suspend_timeout_ns_) { + if (suspended_thread == nullptr) { + ThreadSuspendByPeerWarning(self, + ::android::base::FATAL, + "Failed to issue suspend request", + peer); + } else { + CHECK_EQ(suspended_thread, thread); + LOG(WARNING) << "Suspended thread state_and_flags: " + << suspended_thread->StateAndFlagsAsHexString() + << ", self_suspend_count = " << self_suspend_count; + // Explicitly release thread_suspend_count_lock_; we haven't held it for long, so + // seeing threads blocked on it is not informative. + Locks::thread_suspend_count_lock_->Unlock(self); + ThreadSuspendByPeerWarning(self, + ::android::base::FATAL, + "Thread suspension timed out", + peer); + } + UNREACHABLE(); + } else if (sleep_us == 0 && + total_delay > static_cast<uint64_t>(kThreadSuspendMaxYieldUs) * 1000) { + // We have spun for kThreadSuspendMaxYieldUs time, switch to sleeps to prevent + // excessive CPU usage. + sleep_us = kThreadSuspendMaxYieldUs / 2; } - // Else we either hold the suspend count lock but another thread is trying to suspend us, - // making it unsafe to try to suspend another thread in case we get a cycle. Or we're - // currently in the middle of a flip, and could otherwise encounter b/31683379. In either - // case, start the loop again, which will allow this thread to be suspended. } + // Release locks and come out of runnable state. } - // All locks are released, and we should quickly exit the suspend-unfriendly state. Retry. - VLOG(threads) << "SuspendThreadByPeer is sleeping"; - usleep(kThreadSuspendSleepUs); - } - // Now wait for target to decrement suspend barrier. - if (is_suspended || WaitForSuspendBarrier(&wrapped_barrier.barrier_)) { - // wrapped_barrier.barrier_ has been decremented and will no longer be accessed. - VLOG(threads) << "SuspendThreadByPeer thread suspended: " << *thread; - if (ATraceEnabled()) { - std::string name; - thread->GetThreadName(name); - ATraceBegin(StringPrintf("SuspendThreadByPeer suspended %s for peer=%p", name.c_str(), - peer).c_str()); - } - return thread; - } else { - LOG(WARNING) << "Suspended thread state_and_flags: " - << thread->StateAndFlagsAsHexString(); - // thread still has a pointer to wrapped_barrier. Returning and continuing would be unsafe - // without additional cleanup. - ThreadSuspendByPeerWarning(self, - ::android::base::FATAL, - "SuspendThreadByPeer timed out", - peer); - UNREACHABLE(); + VLOG(threads) << "SuspendThreadByPeer waiting to allow thread chance to suspend"; + ThreadSuspendSleep(sleep_us); + // This may stay at 0 if sleep_us == 0, but this is WAI since we want to avoid using usleep at + // all if possible. This shouldn't be an issue since time to suspend should always be small. 
+ sleep_us = std::min(sleep_us * 2, kThreadSuspendMaxSleepUs); } } - static void ThreadSuspendByThreadIdWarning(LogSeverity severity, const char* message, uint32_t thread_id) { LOG(severity) << StringPrintf("%s: %d", message, thread_id); } -Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason) { - bool is_suspended = false; +Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, + SuspendReason reason, + bool* timed_out) { + const uint64_t start_time = NanoTime(); + useconds_t sleep_us = kThreadSuspendInitialSleepUs; + *timed_out = false; + Thread* suspended_thread = nullptr; Thread* const self = Thread::Current(); CHECK_NE(thread_id, kInvalidThreadId); VLOG(threads) << "SuspendThreadByThreadId starting"; - Thread* thread; - WrappedSuspend1Barrier wrapped_barrier{}; while (true) { { - // Note: this will transition to runnable and potentially suspend. + // Note: this will transition to runnable and potentially suspend. We ensure only one thread + // is requesting another suspend, to avoid deadlock, by requiring this function be called + // holding Locks::thread_list_suspend_thread_lock_. Its important this thread suspend rather + // than request thread suspension, to avoid potential cycles in threads requesting each other + // suspend. ScopedObjectAccess soa(self); MutexLock thread_list_mu(self, *Locks::thread_list_lock_); - thread = FindThreadByThreadId(thread_id); + Thread* thread = nullptr; + for (const auto& it : list_) { + if (it->GetThreadId() == thread_id) { + thread = it; + break; + } + } if (thread == nullptr) { + CHECK(suspended_thread == nullptr) << "Suspended thread " << suspended_thread + << " no longer in thread list"; // There's a race in inflating a lock and the owner giving up ownership and then dying. ThreadSuspendByThreadIdWarning(::android::base::WARNING, "No such thread id for suspend", thread_id); return nullptr; } - DCHECK(Contains(thread)); - CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger"; VLOG(threads) << "SuspendThreadByThreadId found thread: " << *thread; + DCHECK(Contains(thread)); { MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_); - if (LIKELY(self->GetSuspendCount() == 0 && thread->GetFlipFunction() == nullptr)) { - thread->IncrementSuspendCount(self, nullptr, &wrapped_barrier, reason); - if (thread->IsSuspended()) { - // See the discussion in mutator_gc_coord.md for the race here. - thread->RemoveFirstSuspend1Barrier(); - if (!thread->HasActiveSuspendBarrier()) { - thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier); - } - is_suspended = true; + if (suspended_thread == nullptr) { + if (self->GetSuspendCount() > 0) { + // We hold the suspend count lock but another thread is trying to suspend us. Its not + // safe to try to suspend another thread in case we get a cycle. Start the loop again + // which will allow this thread to be suspended. + continue; } - DCHECK_GT(thread->GetSuspendCount(), 0); - break; + bool updated = thread->ModifySuspendCount(self, +1, nullptr, reason); + DCHECK(updated); + suspended_thread = thread; + } else { + CHECK_EQ(suspended_thread, thread); + // If the caller isn't requesting suspension, a suspension should have already occurred. + CHECK_GT(thread->GetSuspendCount(), 0); + } + // IsSuspended on the current thread will fail as the current thread is changed into + // Runnable above. As the suspend count is now raised if this is the current thread + // it will self suspend on transition to Runnable, making it hard to work with. 
It's simpler + // to just explicitly handle the current thread in the callers to this code. + CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger"; + // If thread is suspended (perhaps it was already not Runnable but didn't have a suspend + // count, or else we've waited and it has self suspended) or is the current thread, we're + // done. + if (thread->IsSuspended()) { + if (ATraceEnabled()) { + std::string name; + thread->GetThreadName(name); + ATraceBegin(StringPrintf("SuspendThreadByThreadId suspended %s id=%d", + name.c_str(), thread_id).c_str()); + } + VLOG(threads) << "SuspendThreadByThreadId thread suspended: " << *thread; + return thread; + } + const uint64_t total_delay = NanoTime() - start_time; + if (total_delay >= thread_suspend_timeout_ns_) { + ThreadSuspendByThreadIdWarning(::android::base::WARNING, + "Thread suspension timed out", + thread_id); + if (suspended_thread != nullptr) { + bool updated = thread->ModifySuspendCount(soa.Self(), -1, nullptr, reason); + DCHECK(updated); + } + *timed_out = true; + return nullptr; + } else if (sleep_us == 0 && + total_delay > static_cast<uint64_t>(kThreadSuspendMaxYieldUs) * 1000) { + // We have spun for kThreadSuspendMaxYieldUs time, switch to sleeps to prevent + // excessive CPU usage. + sleep_us = kThreadSuspendMaxYieldUs / 2; } - // Else we either hold the suspend count lock but another thread is trying to suspend us, - // making it unsafe to try to suspend another thread in case we get a cycle. Or we're - // currently in the middle of a flip, and could otherwise encounter b/31683379. In either - // case, start the loop again, which will allow this thread to be suspended. } + // Release locks and come out of runnable state. } - // All locks are released, and we should quickly exit the suspend-unfriendly state. Retry. - VLOG(threads) << "SuspendThreadByThreadId is sleeping"; - usleep(kThreadSuspendSleepUs); - } - // Now wait for target to decrement suspend barrier. - if (is_suspended || WaitForSuspendBarrier(&wrapped_barrier.barrier_)) { - // wrapped_barrier.barrier_ has been decremented and will no longer be accessed. - VLOG(threads) << "SuspendThreadByThreadId thread suspended: " << *thread; - if (ATraceEnabled()) { - std::string name; - thread->GetThreadName(name); - ATraceBegin(StringPrintf("SuspendThreadByPeer suspended %s for id=%d", - name.c_str(), thread_id).c_str()); - } - return thread; - } else { - // thread still has a pointer to wrapped_barrier. Returning and continuing would be unsafe without - // additional cleanup. - ThreadSuspendByThreadIdWarning(::android::base::FATAL, - "SuspendThreadByThreadId timed out", - thread_id); - UNREACHABLE(); + VLOG(threads) << "SuspendThreadByThreadId waiting to allow thread chance to suspend"; + ThreadSuspendSleep(sleep_us); + sleep_us = std::min(sleep_us * 2, kThreadSuspendMaxSleepUs); } } @@ -1079,7 +1161,8 @@ void ThreadList::SuspendAllDaemonThreadsForShutdown() { // daemons. CHECK(thread->IsDaemon()) << *thread; if (thread != self) { - thread->IncrementSuspendCount(self); + bool updated = thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal); + DCHECK(updated); ++daemons_left; } // We are shutting down the runtime, set the JNI functions of all the JNIEnvs to be @@ -1179,10 +1262,11 @@ void ThreadList::Register(Thread* self) { // SuspendAll requests. 
MutexLock mu(self, *Locks::thread_list_lock_); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); - if (suspend_all_count_ == 1) { - self->IncrementSuspendCount(self); - } else { - DCHECK_EQ(suspend_all_count_, 0); + // Modify suspend count in increments of 1 to maintain invariants in ModifySuspendCount. While + // this isn't particularly efficient the suspend counts are most commonly 0 or 1. + for (int delta = suspend_all_count_; delta > 0; delta--) { + bool updated = self->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal); + DCHECK(updated); } CHECK(!Contains(self)); list_.push_back(self); @@ -1288,11 +1372,13 @@ void ThreadList::VisitRootsForSuspendedThreads(RootVisitor* visitor) { MutexLock mu(self, *Locks::thread_list_lock_); MutexLock mu2(self, *Locks::thread_suspend_count_lock_); for (Thread* thread : list_) { - thread->IncrementSuspendCount(self); + bool suspended = thread->ModifySuspendCount(self, +1, nullptr, SuspendReason::kInternal); + DCHECK(suspended); if (thread == self || thread->IsSuspended()) { threads_to_visit.push_back(thread); } else { - thread->DecrementSuspendCount(self); + bool resumed = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); + DCHECK(resumed); } } } @@ -1307,7 +1393,8 @@ void ThreadList::VisitRootsForSuspendedThreads(RootVisitor* visitor) { { MutexLock mu2(self, *Locks::thread_suspend_count_lock_); for (Thread* thread : threads_to_visit) { - thread->DecrementSuspendCount(self); + bool updated = thread->ModifySuspendCount(self, -1, nullptr, SuspendReason::kInternal); + DCHECK(updated); } } } diff --git a/runtime/thread_list.h b/runtime/thread_list.h index a17a11b3ce..29b0c52186 100644 --- a/runtime/thread_list.h +++ b/runtime/thread_list.h @@ -70,7 +70,7 @@ class ThreadList { bool Resume(Thread* thread, SuspendReason reason = SuspendReason::kInternal) REQUIRES(!Locks::thread_suspend_count_lock_) WARN_UNUSED; - // Suspends all other threads and gets exclusive access to the mutator lock. + // Suspends all threads and gets exclusive access to the mutator lock. // If long_suspend is true, then other threads who try to suspend will never timeout. // long_suspend is currenly used for hprof since large heaps take a long time. void SuspendAll(const char* cause, bool long_suspend = false) @@ -81,8 +81,10 @@ class ThreadList { // Suspend a thread using a peer, typically used by the debugger. Returns the thread on success, // else null. The peer is used to identify the thread to avoid races with the thread terminating. + // If the suspension times out then *timeout is set to true. Thread* SuspendThreadByPeer(jobject peer, - SuspendReason reason) + SuspendReason reason, + bool* timed_out) REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); @@ -90,8 +92,8 @@ class ThreadList { // Suspend a thread using its thread id, typically used by lock/monitor inflation. Returns the // thread on success else null. The thread id is used to identify the thread to avoid races with // the thread terminating. Note that as thread ids are recycled this may not suspend the expected - // thread, that may be terminating. - Thread* SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason) + // thread, that may be terminating. If the suspension times out then *timeout is set to true. 
+ Thread* SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason, bool* timed_out) REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); @@ -111,9 +113,8 @@ class ThreadList { // Running threads are not suspended but run the checkpoint inside of the suspend check. The // return value includes already suspended threads for b/24191051. Runs or requests the // callback, if non-null, inside the thread_list_lock critical section after determining the - // runnable/suspended states of the threads. Does not wait for completion of the checkpoint - // function in running threads. If the caller holds the mutator lock, then all instances of the - // checkpoint function are run with the mutator lock. + // runnable/suspended states of the threads. Does not wait for completion of the callbacks in + // running threads. size_t RunCheckpoint(Closure* checkpoint_function, Closure* callback = nullptr) REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); @@ -196,15 +197,12 @@ class ThreadList { REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); void SuspendAllInternal(Thread* self, + Thread* ignore1, + Thread* ignore2 = nullptr, SuspendReason reason = SuspendReason::kInternal) - REQUIRES(!Locks::thread_list_lock_, - !Locks::thread_suspend_count_lock_, - !Locks::mutator_lock_); - - // Wait for suspend barrier to reach zero. Return false on timeout. - bool WaitForSuspendBarrier(AtomicInteger* barrier); + REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); - void AssertOtherThreadsAreSuspended(Thread* self) + void AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2 = nullptr) REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_); std::bitset<kMaxThreadId> allocated_ids_ GUARDED_BY(Locks::allocated_thread_ids_lock_); @@ -213,7 +211,6 @@ class ThreadList { std::list<Thread*> list_ GUARDED_BY(Locks::thread_list_lock_); // Ongoing suspend all requests, used to ensure threads added to list_ respect SuspendAll. - // The value is always either 0 or 1. int suspend_all_count_ GUARDED_BY(Locks::thread_suspend_count_lock_); // Number of threads unregistering, ~ThreadList blocks until this hits 0. @@ -221,7 +218,7 @@ class ThreadList { // Thread suspend time histogram. Only modified when all the threads are suspended, so guarding // by mutator lock ensures no thread can read when another thread is modifying it. - Histogram<uint64_t> suspend_all_histogram_ GUARDED_BY(Locks::mutator_lock_); + Histogram<uint64_t> suspend_all_historam_ GUARDED_BY(Locks::mutator_lock_); // Whether or not the current thread suspension is long. bool long_suspend_; |
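The SuspendThreadByPeer/SuspendThreadByThreadId loops restored by this revert poll the target thread with a yield-then-sleep backoff instead of waiting on a per-suspend futex barrier: they begin with a zero-length wait (sched_yield via ThreadSuspendSleep), switch to sleeps of kThreadSuspendMaxYieldUs / 2 once roughly kThreadSuspendMaxYieldUs of wall time has elapsed, and then double the sleep up to kThreadSuspendMaxSleepUs until the target reports IsSuspended() or thread_suspend_timeout_ns_ expires. The following is a minimal, self-contained sketch of that backoff pattern only; NowNs, BackoffWait, and WaitWithBackoff are hypothetical names introduced for illustration and are not part of the ART sources.

// Sketch of the spin-then-sleep backoff used while polling for a thread to suspend.
// Assumes a POSIX environment; times are microseconds unless noted otherwise.
#include <sched.h>
#include <time.h>
#include <unistd.h>
#include <stdint.h>
#include <algorithm>
#include <atomic>

namespace {

constexpr useconds_t kInitialSleepUs = 0;   // Start by yielding, not sleeping.
constexpr useconds_t kMaxYieldUs = 3000;    // Yield budget before switching to sleeps.
constexpr useconds_t kMaxSleepUs = 5000;    // Cap on the per-iteration sleep.

uint64_t NowNs() {
  timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return static_cast<uint64_t>(ts.tv_sec) * 1000000000ull + ts.tv_nsec;
}

// Yield when delay_us == 0, otherwise sleep. As in ThreadSuspendSleep, an
// EINTR-interrupted usleep is harmless here, so its result is ignored.
void BackoffWait(useconds_t delay_us) {
  if (delay_us == 0) {
    sched_yield();
  } else {
    usleep(delay_us);
  }
}

}  // namespace

// Polls `done` until it becomes true or `timeout_ns` elapses. Returns true on
// success, false on timeout (the caller decides how to report it, cf. *timed_out).
bool WaitWithBackoff(const std::atomic<bool>& done, uint64_t timeout_ns) {
  const uint64_t start_ns = NowNs();
  useconds_t sleep_us = kInitialSleepUs;
  while (!done.load(std::memory_order_acquire)) {
    const uint64_t elapsed_ns = NowNs() - start_ns;
    if (elapsed_ns >= timeout_ns) {
      return false;
    }
    if (sleep_us == 0 && elapsed_ns > static_cast<uint64_t>(kMaxYieldUs) * 1000) {
      // Yielded long enough; switch to real sleeps to avoid excessive CPU usage.
      sleep_us = kMaxYieldUs / 2;
    }
    BackoffWait(sleep_us);
    sleep_us = std::min<useconds_t>(sleep_us * 2, kMaxSleepUs);  // Capped exponential backoff.
  }
  return true;
}

Starting from a zero-length wait avoids blocking for an unpredictable amount of time when the target suspends quickly, while the capped exponential sleeps bound CPU consumption when it does not; once the overall timeout is exceeded, the callers above either set *timed_out and return null or abort, depending on the suspension path.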