Reduce and consistently report suspend timeouts am: d00d24530a
Original change: https://android-review.googlesource.com/c/platform/art/+/2959623
Change-Id: I84ac2d41f4d296cdea153d1e2bfee8fbd192ddf8
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index cbd329f..3458508 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -1054,8 +1054,11 @@
}
}
-void Monitor::InflateThinLocked(Thread* self, Handle<mirror::Object> obj, LockWord lock_word,
- uint32_t hash_code) {
+void Monitor::InflateThinLocked(Thread* self,
+ Handle<mirror::Object> obj,
+ LockWord lock_word,
+ uint32_t hash_code,
+ int attempt_of_4) {
DCHECK_EQ(lock_word.GetState(), LockWord::kThinLocked);
uint32_t owner_thread_id = lock_word.ThinLockOwner();
if (owner_thread_id == self->GetThreadId()) {
@@ -1068,7 +1071,8 @@
Thread* owner;
{
ScopedThreadSuspension sts(self, ThreadState::kWaitingForLockInflation);
- owner = thread_list->SuspendThreadByThreadId(owner_thread_id, SuspendReason::kInternal);
+ owner = thread_list->SuspendThreadByThreadId(
+ owner_thread_id, SuspendReason::kInternal, attempt_of_4);
}
if (owner != nullptr) {
// We succeeded in suspending the thread, check the lock's status didn't change.
@@ -1107,6 +1111,7 @@
uint32_t thread_id = self->GetThreadId();
size_t contention_count = 0;
constexpr size_t kExtraSpinIters = 100;
+ int inflation_attempt = 1;
StackHandleScope<1> hs(self);
Handle<mirror::Object> h_obj(hs.NewHandle(obj));
while (true) {
@@ -1153,7 +1158,7 @@
continue; // Go again.
} else {
// We'd overflow the recursion count, so inflate the monitor.
- InflateThinLocked(self, h_obj, lock_word, 0);
+ InflateThinLocked(self, h_obj, lock_word, 0, inflation_attempt++);
}
} else {
if (trylock) {
@@ -1174,7 +1179,7 @@
} else {
contention_count = 0;
// No ordering required for initial lockword read. Install rereads it anyway.
- InflateThinLocked(self, h_obj, lock_word, 0);
+ InflateThinLocked(self, h_obj, lock_word, 0, inflation_attempt++);
}
}
continue; // Start from the beginning.
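
The contention loop above now carries a running inflation_attempt and hands it to InflateThinLocked, which forwards it to SuspendThreadByThreadId. A reduced sketch of the caller-side pattern; TryThinLock and TryInflate are hypothetical stand-ins for the thin-lock fast path and the inflation call, not ART's actual control flow:

    bool TryThinLock();                 // hypothetical thin-lock fast path
    void TryInflate(int attempt_of_4);  // hypothetical stand-in for InflateThinLocked

    void LockWithInflation() {
      int inflation_attempt = 1;
      while (true) {
        if (TryThinLock()) {
          return;  // acquired without inflating
        }
        // Pass the running attempt count: the suspend machinery quarters its
        // timeout for attempts 1..4 and aborts only once attempt 4 times out.
        TryInflate(inflation_attempt++);
      }
    }
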
diff --git a/runtime/monitor.h b/runtime/monitor.h
index d142212..0ee2dbb 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -156,8 +156,14 @@
}
// Inflate the lock on obj. May fail to inflate for spurious reasons, always re-check.
- static void InflateThinLocked(Thread* self, Handle<mirror::Object> obj, LockWord lock_word,
- uint32_t hash_code) REQUIRES_SHARED(Locks::mutator_lock_);
+ // attempt_of_4 is in 1..4 inclusive, or 0. A non-zero value indicates that we are retrying
+ // up to 4 times, and should abort only on attempt 4. Zero means we try only once, with the
+ // full suspend timeout instead of a quarter of it.
+ static void InflateThinLocked(Thread* self,
+ Handle<mirror::Object> obj,
+ LockWord lock_word,
+ uint32_t hash_code,
+ int attempt_of_4 = 0) REQUIRES_SHARED(Locks::mutator_lock_);
// Not exclusive because ImageWriter calls this during a Heap::VisitObjects() that
// does not allow a thread suspension in the middle. TODO: maybe make this exclusive.
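
The comment above pins down the attempt_of_4 contract. A minimal sketch of the implied timeout arithmetic; kTotalTimeoutNs is an illustrative value standing in for thread_suspend_timeout_ns_:

    #include <cassert>
    #include <cstdint>

    // Illustrative value only; ART reads the real budget from thread_suspend_timeout_ns_.
    constexpr uint64_t kTotalTimeoutNs = 4'000'000'000;

    struct AttemptBudget {
      uint64_t wait_ns;  // how long this attempt may wait
      bool may_abort;    // whether a timeout may abort the runtime
    };

    AttemptBudget BudgetFor(int attempt_of_4) {
      assert(attempt_of_4 >= 0 && attempt_of_4 <= 4);
      if (attempt_of_4 == 0) {
        return {kTotalTimeoutNs, true};  // single try with the full budget
      }
      return {kTotalTimeoutNs / 4, attempt_of_4 == 4};  // a quarter per attempt
    }
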
diff --git a/runtime/native/jdk_internal_misc_Unsafe.cc b/runtime/native/jdk_internal_misc_Unsafe.cc
index 10c6b2d..ba64c81 100644
--- a/runtime/native/jdk_internal_misc_Unsafe.cc
+++ b/runtime/native/jdk_internal_misc_Unsafe.cc
@@ -491,8 +491,9 @@
ThrowIllegalArgumentException("Argument to unpark() was not a Thread");
return;
}
- art::MutexLock mu(soa.Self(), *art::Locks::thread_list_lock_);
- art::Thread* thread = art::Thread::FromManagedThread(soa, mirror_thread);
+ Thread* self = soa.Self();
+ art::MutexLock mu(self, *art::Locks::thread_list_lock_);
+ art::Thread* thread = art::Thread::FromManagedThread(self, mirror_thread);
if (thread != nullptr) {
thread->Unpark();
} else {
diff --git a/runtime/native/sun_misc_Unsafe.cc b/runtime/native/sun_misc_Unsafe.cc
index 573b5a9..38fe725 100644
--- a/runtime/native/sun_misc_Unsafe.cc
+++ b/runtime/native/sun_misc_Unsafe.cc
@@ -531,8 +531,9 @@
ThrowIllegalArgumentException("Argument to unpark() was not a Thread");
return;
}
- art::MutexLock mu(soa.Self(), *art::Locks::thread_list_lock_);
- art::Thread* thread = art::Thread::FromManagedThread(soa, mirror_thread);
+ Thread* self = soa.Self();
+ art::MutexLock mu(self, *art::Locks::thread_list_lock_);
+ art::Thread* thread = art::Thread::FromManagedThread(self, mirror_thread);
if (thread != nullptr) {
thread->Unpark();
} else {
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 83ab469..2fcc4b0 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -284,10 +284,20 @@
tlsPtr_.active_suspend1_barriers = suspend1_barrier;
}
-inline void Thread::RemoveFirstSuspend1Barrier() {
+inline void Thread::RemoveFirstSuspend1Barrier(WrappedSuspend1Barrier* suspend1_barrier) {
+ DCHECK_EQ(tlsPtr_.active_suspend1_barriers, suspend1_barrier);
tlsPtr_.active_suspend1_barriers = tlsPtr_.active_suspend1_barriers->next_;
}
+inline void Thread::RemoveSuspend1Barrier(WrappedSuspend1Barrier* barrier) {
+ // 'barrier' should be in the list. If not, we will get a SIGSEGV with fault address of 4 or 8.
+ WrappedSuspend1Barrier** last = &tlsPtr_.active_suspend1_barriers;
+ while (*last != barrier) {
+ last = &((*last)->next_);
+ }
+ *last = (*last)->next_;
+}
+
inline bool Thread::HasActiveSuspendBarrier() {
return tlsPtr_.active_suspend1_barriers != nullptr ||
tlsPtr_.active_suspendall_barrier != nullptr;
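
RemoveSuspend1Barrier above uses the classic pointer-to-pointer unlink, so removing the head of the list and removing an interior node are the same rewrite. A self-contained sketch of the idiom, with Node standing in for WrappedSuspend1Barrier:

    struct Node {
      Node* next = nullptr;
    };

    // Unlink 'target' from the list rooted at *head. 'last' always points at the
    // link to rewrite, so the head and interior cases are the same operation. As
    // with RemoveSuspend1Barrier, a missing 'target' walks off the end and faults.
    void Remove(Node** head, Node* target) {
      Node** last = head;
      while (*last != target) {
        last = &((*last)->next);
      }
      *last = (*last)->next;
    }
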
diff --git a/runtime/thread.cc b/runtime/thread.cc
index e1ee900..052602c 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -675,16 +675,15 @@
return nullptr;
}
-Thread* Thread::FromManagedThread(const ScopedObjectAccessAlreadyRunnable& soa,
- ObjPtr<mirror::Object> thread_peer) {
+Thread* Thread::FromManagedThread(Thread* self, ObjPtr<mirror::Object> thread_peer) {
ArtField* f = WellKnownClasses::java_lang_Thread_nativePeer;
Thread* result = reinterpret_cast64<Thread*>(f->GetLong(thread_peer));
// Check that if we have a result it is either suspended or we hold the thread_list_lock_
// to stop it from going away.
if (kIsDebugBuild) {
- MutexLock mu(soa.Self(), *Locks::thread_suspend_count_lock_);
+ MutexLock mu(self, *Locks::thread_suspend_count_lock_);
if (result != nullptr && !result->IsSuspended()) {
- Locks::thread_list_lock_->AssertHeld(soa.Self());
+ Locks::thread_list_lock_->AssertHeld(self);
}
}
return result;
@@ -692,7 +691,7 @@
Thread* Thread::FromManagedThread(const ScopedObjectAccessAlreadyRunnable& soa,
jobject java_thread) {
- return FromManagedThread(soa, soa.Decode<mirror::Object>(java_thread));
+ return FromManagedThread(soa.Self(), soa.Decode<mirror::Object>(java_thread));
}
static size_t FixStackSize(size_t stack_size) {
@@ -1518,20 +1517,27 @@
}
tlsPtr_.active_suspend1_barriers = nullptr;
AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
+ CHECK_GT(pass_barriers.size(), 0U); // Since kActiveSuspendBarrier was set.
+ // Decrement suspend barrier(s) while we still hold the lock, since SuspendThread may
+ // remove and deallocate suspend barriers while holding suspend_count_lock_.
+ // There will typically only be a single barrier to pass here.
+ for (AtomicInteger*& barrier : pass_barriers) {
+ int32_t old_val = barrier->fetch_sub(1, std::memory_order_release);
+ CHECK_GT(old_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << old_val;
+ if (old_val != 1) {
+ // Count is still positive, so there is no waiter to wake; null the entry so the wake loop below skips it.
+ barrier = nullptr;
+ }
+ }
}
-
- uint32_t barrier_count = 0;
+ // Finally, do the futex wakes after releasing the lock.
for (AtomicInteger* barrier : pass_barriers) {
- ++barrier_count;
- int32_t old_val = barrier->fetch_sub(1, std::memory_order_release);
- CHECK_GT(old_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << old_val;
#if ART_USE_FUTEXES
- if (old_val == 1) {
+ if (barrier != nullptr) {
futex(barrier->Address(), FUTEX_WAKE_PRIVATE, INT_MAX, nullptr, nullptr, 0);
}
#endif
}
- CHECK_GT(barrier_count, 0U);
return true;
}
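
The reworked PassActiveSuspendBarriers splits each barrier pass into two phases: decrement while still holding thread_suspend_count_lock_, then wake outside the lock. A portable sketch of the same shape using C++20 atomic notify in place of futexes; note it additionally assumes each waiter keeps its barrier alive until notified, whereas the futex version tolerates a wake aimed at a stale address:

    #include <atomic>
    #include <mutex>
    #include <vector>

    std::mutex count_lock;  // stands in for thread_suspend_count_lock_

    void PassBarriers(std::vector<std::atomic<int>*>& barriers) {
      std::vector<std::atomic<int>*> to_wake;
      {
        std::lock_guard<std::mutex> lock(count_lock);
        for (std::atomic<int>* b : barriers) {
          // Decrement under the lock: a timed-out suspender may unlink and
          // deallocate its barrier while holding this lock, so the decrement
          // must not race with that.
          if (b->fetch_sub(1, std::memory_order_release) == 1) {
            to_wake.push_back(b);  // we took it to zero; a waiter may be blocked
          }
        }
      }
      // Wake outside the lock, so waiters never contend for it on wakeup.
      for (std::atomic<int>* b : to_wake) {
        b->notify_all();  // C++20 stand-in for FUTEX_WAKE
      }
    }
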
@@ -1721,7 +1727,7 @@
Locks::thread_list_lock_->ExclusiveUnlock(self);
if (IsSuspended()) {
// See the discussion in mutator_gc_coord.md and SuspendAllInternal for the race here.
- RemoveFirstSuspend1Barrier();
+ RemoveFirstSuspend1Barrier(&wrapped_barrier);
if (!HasActiveSuspendBarrier()) {
AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
}
diff --git a/runtime/thread.h b/runtime/thread.h
index b1afac1..a59b10a 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -200,7 +200,7 @@
// See Thread.tlsPtr_.active_suspend1_barriers below for explanation.
struct WrappedSuspend1Barrier {
WrappedSuspend1Barrier() : barrier_(1), next_(nullptr) {}
- AtomicInteger barrier_;
+ AtomicInteger barrier_; // Only updated while holding thread_suspend_count_lock_.
struct WrappedSuspend1Barrier* next_ GUARDED_BY(Locks::thread_suspend_count_lock_);
};
@@ -300,8 +300,7 @@
void CheckEmptyCheckpointFromWeakRefAccess(BaseMutex* cond_var_mutex);
void CheckEmptyCheckpointFromMutex();
- static Thread* FromManagedThread(const ScopedObjectAccessAlreadyRunnable& ts,
- ObjPtr<mirror::Object> thread_peer)
+ static Thread* FromManagedThread(Thread* self, ObjPtr<mirror::Object> thread_peer)
REQUIRES(Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_)
REQUIRES_SHARED(Locks::mutator_lock_);
static Thread* FromManagedThread(const ScopedObjectAccessAlreadyRunnable& ts, jobject thread)
@@ -1762,7 +1761,14 @@
// Remove last-added entry from active_suspend1_barriers.
// Only makes sense if we're still holding thread_suspend_count_lock_ since insertion.
- ALWAYS_INLINE void RemoveFirstSuspend1Barrier() REQUIRES(Locks::thread_suspend_count_lock_);
+ // We redundantly pass in the barrier to be removed in order to enable a DCHECK.
+ ALWAYS_INLINE void RemoveFirstSuspend1Barrier(WrappedSuspend1Barrier* suspend1_barrier)
+ REQUIRES(Locks::thread_suspend_count_lock_);
+
+ // Remove the "barrier" from the list no matter where it appears. Called only under exceptional
+ // circumstances. The barrier must be in the list.
+ ALWAYS_INLINE void RemoveSuspend1Barrier(WrappedSuspend1Barrier* suspend1_barrier)
+ REQUIRES(Locks::thread_suspend_count_lock_);
ALWAYS_INLINE bool HasActiveSuspendBarrier() REQUIRES(Locks::thread_suspend_count_lock_);
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index d5face1..5e63b27 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -659,8 +659,9 @@
// failures are expected.
static constexpr bool kShortSuspendTimeouts = false;
+static constexpr unsigned kSuspendBarrierIters = kShortSuspendTimeouts ? 5 : 20;
+
#if ART_USE_FUTEXES
-static constexpr int kSuspendBarrierIters = 5;
// Returns true if it timed out.
static bool WaitOnceForSuspendBarrier(AtomicInteger* barrier,
@@ -669,8 +670,9 @@
timespec wait_timeout;
if (kShortSuspendTimeouts) {
timeout_ns = MsToNs(kSuspendBarrierIters);
+ CHECK_GE(NsToMs(timeout_ns / kSuspendBarrierIters), 1ul);
} else {
- DCHECK_GE(NsToMs(timeout_ns / kSuspendBarrierIters), 100ul);
+ DCHECK_GE(NsToMs(timeout_ns / kSuspendBarrierIters), 10ul);
}
InitTimeSpec(false, CLOCK_MONOTONIC, NsToMs(timeout_ns / kSuspendBarrierIters), 0, &wait_timeout);
if (futex(barrier->Address(), FUTEX_WAIT_PRIVATE, cur_val, &wait_timeout, nullptr, 0) != 0) {
@@ -684,16 +686,15 @@
}
#else
-static constexpr int kSuspendBarrierIters = 10;
static bool WaitOnceForSuspendBarrier(AtomicInteger* barrier,
int32_t cur_val,
uint64_t timeout_ns) {
- static constexpr int kIters = kShortSuspendTimeouts ? 10'000 : 1'000'000;
- if (!kShortSuspendTimeouts) {
- DCHECK_GE(NsToMs(timeout_ns / kSuspendBarrierIters), 100ul);
- }
- for (int i = 0; i < kIters; ++i) {
+ // In the normal case, aim for a couple of hundred milliseconds.
+ // Not constexpr: the iteration count depends on the runtime timeout_ns argument.
+ const unsigned kInnerIters =
+ kShortSuspendTimeouts ? 1'000 : (timeout_ns / 1000) / kSuspendBarrierIters;
+ DCHECK_GE(kInnerIters, 1'000u);
+ for (unsigned i = 0; i < kInnerIters; ++i) {
sched_yield();
if (barrier->load(std::memory_order_acquire) == 0) {
return false;
@@ -701,7 +702,8 @@
}
return true;
}
-#endif
+
+#endif // ART_USE_FUTEXES
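
In the non-futex fallback above, one call to WaitOnceForSuspendBarrier burns roughly one timeout slice by yielding, on the stated assumption that a sched_yield costs on the order of a microsecond. A reduced sketch of the slice computation; iters_per_timeout plays the role of kSuspendBarrierIters:

    #include <sched.h>

    #include <atomic>
    #include <cstdint>

    // One wait "slice": roughly timeout_ns / iters_per_timeout, assuming each
    // sched_yield() costs on the order of 1us. Returns true on timeout, matching
    // the ART helper's convention.
    bool WaitOnceByYielding(std::atomic<int32_t>* barrier,
                            uint64_t timeout_ns,
                            unsigned iters_per_timeout /* e.g. 20 */) {
      const unsigned inner_iters =
          static_cast<unsigned>((timeout_ns / 1000) / iters_per_timeout);
      for (unsigned i = 0; i < inner_iters; ++i) {
        sched_yield();
        if (barrier->load(std::memory_order_acquire) == 0) {
          return false;  // barrier passed
        }
      }
      return true;  // this slice timed out; the caller decides whether to retry
    }
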
// Return a short string describing the scheduling state of the thread with the given tid.
static std::string GetThreadState(pid_t t) {
@@ -726,18 +728,23 @@
#endif
}
-std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barrier, pid_t t) {
+std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barrier,
+ pid_t t,
+ int attempt_of_4) {
// Only fail after kSuspendBarrierIters timeouts, to make us robust against app freezing.
-#if ART_USE_FUTEXES
const uint64_t start_time = NanoTime();
-#endif
+ uint64_t timeout_ns =
+ attempt_of_4 == 0 ? thread_suspend_timeout_ns_ : thread_suspend_timeout_ns_ / 4;
+ bool collect_state = (t != 0 && (attempt_of_4 == 0 || attempt_of_4 == 4));
int32_t cur_val = barrier->load(std::memory_order_acquire);
if (cur_val <= 0) {
DCHECK_EQ(cur_val, 0);
return std::nullopt;
}
- int i = 0;
- if (WaitOnceForSuspendBarrier(barrier, cur_val, thread_suspend_timeout_ns_)) {
+ unsigned i = 0;
+ if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns)) {
i = 1;
}
cur_val = barrier->load(std::memory_order_acquire);
@@ -747,14 +754,13 @@
}
// Long wait; gather information in case of timeout.
- std::string sampled_state = t == 0 ? "" : GetThreadState(t);
+ std::string sampled_state = collect_state ? GetThreadState(t) : "";
while (i < kSuspendBarrierIters) {
- if (WaitOnceForSuspendBarrier(barrier, cur_val, thread_suspend_timeout_ns_)) {
+ if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns)) {
++i;
#if ART_USE_FUTEXES
if (!kShortSuspendTimeouts) {
- CHECK_GE(NanoTime() - start_time,
- i * thread_suspend_timeout_ns_ / kSuspendBarrierIters - 1'000'000);
+ CHECK_GE(NanoTime() - start_time, i * timeout_ns / kSuspendBarrierIters - 1'000'000);
}
#endif
}
@@ -764,11 +770,10 @@
return std::nullopt;
}
}
- std::string result = t == 0 ? "" :
- "Target states: [" + sampled_state + ", " + GetThreadState(t) +
- "]" + std::to_string(cur_val) + "@" +
- std::to_string((uintptr_t)barrier) + "->";
- return result + std::to_string(barrier->load(std::memory_order_acquire));
+ return collect_state ? "Target states: [" + sampled_state + ", " + GetThreadState(t) + "]" +
+ std::to_string(cur_val) + "@" + std::to_string((uintptr_t)barrier) +
+ " Final wait time: " + PrettyDuration(NanoTime() - start_time) :
+ "";
}
void ThreadList::SuspendAll(const char* cause, bool long_suspend) {
@@ -834,7 +839,6 @@
// Ensures all threads running Java suspend and that those not running Java don't start.
void ThreadList::SuspendAllInternal(Thread* self, SuspendReason reason) {
// self can be nullptr if this is an unregistered thread.
- const uint64_t start_time = NanoTime();
Locks::mutator_lock_->AssertNotExclusiveHeld(self);
Locks::thread_list_lock_->AssertNotHeld(self);
Locks::thread_suspend_count_lock_->AssertNotHeld(self);
@@ -914,24 +918,47 @@
// We're already not runnable, so an attempt to suspend us should succeed.
}
- if (WaitForSuspendBarrier(&pending_threads).has_value()) {
- const uint64_t wait_time = NanoTime() - start_time;
- MutexLock mu(self, *Locks::thread_list_lock_);
- MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
- std::ostringstream oss;
- oss << "Unsuspended threads: ";
- Thread* culprit = nullptr;
- for (const auto& thread : list_) {
- if (thread != self && !thread->IsSuspended()) {
- culprit = thread;
- oss << *thread << ", ";
- }
+ Thread* culprit = nullptr;
+ pid_t tid = 0;
+ std::ostringstream oss;
+ for (int attempt_of_4 = 1; attempt_of_4 <= 4; ++attempt_of_4) {
+ auto result = WaitForSuspendBarrier(&pending_threads, tid, attempt_of_4);
+ if (!result.has_value()) {
+ // Wait succeeded.
+ break;
}
- oss << "waited for " << PrettyDuration(wait_time);
- if (culprit == nullptr) {
- LOG(FATAL) << "SuspendAll timeout. " << oss.str();
- } else {
- culprit->AbortInThis("SuspendAll timeout. " + oss.str());
+ if (attempt_of_4 == 3) {
+ // Second-to-last attempt; try to gather more information in case we time out.
+ MutexLock mu(self, *Locks::thread_list_lock_);
+ MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
+ oss << "Unsuspended threads: ";
+ for (const auto& thread : list_) {
+ if (thread != self && !thread->IsSuspended()) {
+ culprit = thread;
+ oss << *thread << ", ";
+ }
+ }
+ if (culprit != nullptr) {
+ tid = culprit->GetTid();
+ }
+ } else if (attempt_of_4 == 4) {
+ // Final attempt still timed out.
+ if (culprit == nullptr) {
+ LOG(FATAL) << "SuspendAll timeout. Couldn't find holdouts.";
+ } else {
+ std::string name;
+ culprit->GetThreadName(name);
+ oss << "Info for " << *culprit << ":";
+ std::string thr_descr =
+ StringPrintf("%s tid: %d, state&flags: 0x%x, priority: %d, barrier value: %d, ",
+ name.c_str(),
+ tid,
+ culprit->GetStateAndFlags(std::memory_order_relaxed).GetValue(),
+ culprit->GetNativePriority(),
+ pending_threads.load());
+ oss << thr_descr << result.value();
+ culprit->AbortInThis("SuspendAll timeout: " + oss.str());
+ }
}
}
}
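
SuspendAllInternal now escalates over four quarter-timeout rounds: rounds 1 and 2 simply wait, round 3 additionally snapshots the unsuspended threads so the final report can show before/after state, and only round 4 aborts. A skeleton of that escalation; WaitQuarter, CollectHoldouts, and Abort are hypothetical stand-ins for WaitForSuspendBarrier, the thread-list scan, and AbortInThis:

    #include <string>

    bool WaitQuarter(int attempt_of_4);               // hypothetical quarter-timeout wait
    std::string CollectHoldouts();                    // hypothetical thread-list scan
    [[noreturn]] void Abort(const std::string& msg);  // hypothetical AbortInThis

    void WaitWithEscalation() {
      std::string diagnostics;
      for (int attempt_of_4 = 1; attempt_of_4 <= 4; ++attempt_of_4) {
        if (WaitQuarter(attempt_of_4)) {
          return;  // barrier passed within this quarter of the budget
        }
        if (attempt_of_4 == 3) {
          // Pay for the (lock-taking) scan only once failure looks likely, so
          // the final report can show state before and after the last wait.
          diagnostics = CollectHoldouts();
        } else if (attempt_of_4 == 4) {
          Abort("SuspendAll timeout: " + diagnostics);
        }
      }
    }
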
@@ -1007,15 +1034,15 @@
// To check IsSuspended.
MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
if (UNLIKELY(!thread->IsSuspended())) {
- LOG(ERROR) << "Resume(" << reinterpret_cast<void*>(thread)
- << ") thread not suspended";
+ LOG(reason == SuspendReason::kForUserCode ? ERROR : FATAL)
+ << "Resume(" << reinterpret_cast<void*>(thread) << ") thread not suspended";
return false;
}
if (!Contains(thread)) {
// We only expect threads within the thread-list to have been suspended otherwise we can't
// stop such threads from delete-ing themselves.
- LOG(ERROR) << "Resume(" << reinterpret_cast<void*>(thread)
- << ") thread not within thread list";
+ LOG(reason == SuspendReason::kForUserCode ? ERROR : FATAL)
+ << "Resume(" << reinterpret_cast<void*>(thread) << ") thread not within thread list";
return false;
}
thread->DecrementSuspendCount(self, /*for_user_code=*/(reason == SuspendReason::kForUserCode));
@@ -1026,135 +1053,42 @@
return true;
}
-static void ThreadSuspendByPeerWarning(ScopedObjectAccess& soa,
- LogSeverity severity,
- const char* message,
- jobject peer) REQUIRES_SHARED(Locks::mutator_lock_) {
- ObjPtr<mirror::Object> name =
- WellKnownClasses::java_lang_Thread_name->GetObject(soa.Decode<mirror::Object>(peer));
- if (name == nullptr) {
- LOG(severity) << message << ": " << peer;
- } else {
- LOG(severity) << message << ": " << peer << ":" << name->AsString()->ToModifiedUtf8();
- }
-}
-
-Thread* ThreadList::SuspendThreadByPeer(jobject peer, SuspendReason reason) {
+bool ThreadList::SuspendThread(Thread* self,
+ Thread* thread,
+ SuspendReason reason,
+ ThreadState self_state,
+ const char* func_name,
+ int attempt_of_4) {
bool is_suspended = false;
- Thread* const self = Thread::Current();
- VLOG(threads) << "SuspendThreadByPeer starting";
- Thread* thread;
- WrappedSuspend1Barrier wrapped_barrier{};
- for (int iter_count = 1;; ++iter_count) {
- {
- // Note: this will transition to runnable and potentially suspend.
- ScopedObjectAccess soa(self);
- MutexLock thread_list_mu(self, *Locks::thread_list_lock_);
- thread = Thread::FromManagedThread(soa, peer);
- if (thread == nullptr) {
- ThreadSuspendByPeerWarning(soa,
- ::android::base::WARNING,
- "No such thread for suspend",
- peer);
- return nullptr;
- }
- if (!Contains(thread)) {
- VLOG(threads) << "SuspendThreadByPeer failed for unattached thread: "
- << reinterpret_cast<void*>(thread);
- return nullptr;
- }
- // IsSuspended on the current thread will fail as the current thread is changed into
- // Runnable above. As the suspend count is now raised if this is the current thread
- // it will self suspend on transition to Runnable, making it hard to work with. It's simpler
- // to just explicitly handle the current thread in the callers to this code.
- CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger";
- VLOG(threads) << "SuspendThreadByPeer found thread: " << *thread;
- {
- MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
- if (LIKELY(self->GetSuspendCount() == 0)) {
- thread->IncrementSuspendCount(self, nullptr, &wrapped_barrier, reason);
- if (thread->IsSuspended()) {
- // See the discussion in mutator_gc_coord.md and SuspendAllInternal for the race here.
- thread->RemoveFirstSuspend1Barrier();
- if (!thread->HasActiveSuspendBarrier()) {
- thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
- }
- is_suspended = true;
- }
- DCHECK_GT(thread->GetSuspendCount(), 0);
- break;
- }
- // Else we hold the suspend count lock but another thread is trying to suspend us,
- // making it unsafe to try to suspend another thread in case we get a cycle.
- // We start the loop again, which will allow this thread to be suspended.
- }
- }
- // All locks are released, and we should quickly exit the suspend-unfriendly state. Retry.
- if (iter_count >= kMaxSuspendRetries) {
- LOG(FATAL) << "Too many suspend retries";
- }
- usleep(kThreadSuspendSleepUs);
- }
- // Now wait for target to decrement suspend barrier.
- if (is_suspended || !WaitForSuspendBarrier(&wrapped_barrier.barrier_).has_value()) {
- // wrapped_barrier.barrier_ has been decremented and will no longer be accessed.
- VLOG(threads) << "SuspendThreadByPeer thread suspended: " << *thread;
- if (ATraceEnabled()) {
- std::string name;
- thread->GetThreadName(name);
- ATraceBegin(
- StringPrintf("SuspendThreadByPeer suspended %s for peer=%p", name.c_str(), peer).c_str());
- }
- DCHECK(thread->IsSuspended());
- return thread;
- } else {
- LOG(WARNING) << "Suspended thread state_and_flags: " << thread->StateAndFlagsAsHexString();
- // thread still has a pointer to wrapped_barrier. Returning and continuing would be unsafe
- // without additional cleanup.
- {
- ScopedObjectAccess soa(self);
- ThreadSuspendByPeerWarning(
- soa, ::android::base::FATAL, "SuspendThreadByPeer timed out", peer);
- }
- UNREACHABLE();
- }
-}
-
-Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason) {
- bool is_suspended = false;
- Thread* const self = Thread::Current();
- CHECK_NE(thread_id, kInvalidThreadId);
- VLOG(threads) << "SuspendThreadByThreadId starting";
- Thread* thread;
- pid_t tid;
+ VLOG(threads) << func_name << "starting";
+ pid_t tid = thread->GetTid();
uint8_t suspended_count;
uint8_t checkpoint_count;
WrappedSuspend1Barrier wrapped_barrier{};
static_assert(sizeof wrapped_barrier.barrier_ == sizeof(uint32_t));
- for (int iter_count = 1;; ++iter_count) {
+ ThreadExitFlag tef;
+ bool exited = false;
+ thread->NotifyOnThreadExit(&tef);
+ int iter_count = 1;
+ do {
{
+ Locks::mutator_lock_->AssertSharedHeld(self);
+ Locks::thread_list_lock_->AssertHeld(self);
// Note: this will transition to runnable and potentially suspend.
- ScopedObjectAccess soa(self);
- MutexLock thread_list_mu(self, *Locks::thread_list_lock_);
- thread = FindThreadByThreadId(thread_id);
- if (thread == nullptr) {
- // There's a race in inflating a lock and the owner giving up ownership and then dying.
- LOG(WARNING) << StringPrintf("No such thread id %d for suspend", thread_id);
- return nullptr;
- }
DCHECK(Contains(thread));
- CHECK_NE(thread, self) << "Attempt to suspend the current thread for the debugger";
- VLOG(threads) << "SuspendThreadByThreadId found thread: " << *thread;
+ // This implementation fails if thread == self. Let the clients handle that case
+ // appropriately.
+ CHECK_NE(thread, self) << func_name << "(self)";
+ VLOG(threads) << func_name << " suspending: " << *thread;
{
MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
if (LIKELY(self->GetSuspendCount() == 0)) {
- tid = thread->GetTid();
suspended_count = thread->suspended_count_;
checkpoint_count = thread->checkpoint_count_;
thread->IncrementSuspendCount(self, nullptr, &wrapped_barrier, reason);
if (thread->IsSuspended()) {
// See the discussion in mutator_gc_coord.md and SuspendAllInternal for the race here.
- thread->RemoveFirstSuspend1Barrier();
+ thread->RemoveFirstSuspend1Barrier(&wrapped_barrier);
if (!thread->HasActiveSuspendBarrier()) {
thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
}
@@ -1172,7 +1106,23 @@
if (iter_count >= kMaxSuspendRetries) {
LOG(FATAL) << "Too many suspend retries";
}
- usleep(kThreadSuspendSleepUs);
+ Locks::thread_list_lock_->ExclusiveUnlock(self);
+ {
+ ScopedThreadSuspension sts(self, ThreadState::kSuspended);
+ usleep(kThreadSuspendSleepUs);
+ ++iter_count;
+ }
+ Locks::thread_list_lock_->ExclusiveLock(self);
+ exited = tef.HasExited();
+ } while (!exited);
+ thread->UnregisterThreadExitFlag(&tef);
+ Locks::thread_list_lock_->ExclusiveUnlock(self);
+ self->TransitionFromRunnableToSuspended(self_state);
+ if (exited) {
+ // This is OK: There's a race in inflating a lock and the owner giving up ownership and then
+ // dying.
+ LOG(WARNING) << StringPrintf("Thread with tid %d exited before suspending", tid);
+ return false;
}
// Now wait for target to decrement suspend barrier.
std::optional<std::string> failure_info;
@@ -1180,26 +1130,31 @@
// As an experiment, redundantly trigger suspension. TODO: Remove this.
std::atomic_thread_fence(std::memory_order_seq_cst);
thread->TriggerSuspend();
- failure_info = WaitForSuspendBarrier(&wrapped_barrier.barrier_, tid);
+ failure_info = WaitForSuspendBarrier(&wrapped_barrier.barrier_, tid, attempt_of_4);
if (!failure_info.has_value()) {
is_suspended = true;
}
}
- if (is_suspended) {
- // wrapped_barrier.barrier_ has been decremented and will no longer be accessed.
- VLOG(threads) << "SuspendThreadByThreadId thread suspended: " << *thread;
- if (ATraceEnabled()) {
- std::string name;
- thread->GetThreadName(name);
- ATraceBegin(
- StringPrintf("SuspendThreadByPeer suspended %s for id=%d", name.c_str(), thread_id)
- .c_str());
+ while (!is_suspended) {
+ if (attempt_of_4 > 0 && attempt_of_4 < 4) {
+ // Caller will try again. Give up and resume the thread for now. We need to make sure
+ // that wrapped_barrier is removed from the list before we deallocate it.
+ MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
+ if (wrapped_barrier.barrier_.load() == 0) {
+ // Succeeded in the meantime.
+ is_suspended = true;
+ continue;
+ }
+ thread->RemoveSuspend1Barrier(&wrapped_barrier);
+ if (!thread->HasActiveSuspendBarrier()) {
+ thread->AtomicClearFlag(ThreadFlag::kActiveSuspendBarrier);
+ }
+ // Do not call Resume(), since we are probably not fully suspended.
+ thread->DecrementSuspendCount(self,
+ /*for_user_code=*/(reason == SuspendReason::kForUserCode));
+ Thread::resume_cond_->Broadcast(self);
+ return false;
}
- DCHECK(thread->IsSuspended());
- return thread;
- } else {
- // thread still has a pointer to wrapped_barrier. Returning and continuing would be unsafe
- // without additional cleanup.
std::string name;
thread->GetThreadName(name);
WrappedSuspend1Barrier* first_barrier;
@@ -1207,12 +1162,13 @@
MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
first_barrier = thread->tlsPtr_.active_suspend1_barriers;
}
- // 'thread' should still be suspended, and hence stick around. Try to abort there, since its
- // stack trace is much more interesting than ours.
- thread->AbortInThis(StringPrintf(
- "Caused SuspendThreadByThreadId to time out: %d (%s), state&flags: 0x%x, priority: %d,"
+ // 'thread' should still have a suspend request pending, and hence stick around. Try to abort
+ // there, since its stack trace is much more interesting than ours.
+ std::string message = StringPrintf(
+ "%s timed out: %d (%s), state&flags: 0x%x, priority: %d,"
" barriers: %p, ours: %p, barrier value: %d, nsusps: %d, ncheckpts: %d, thread_info: %s",
- thread_id,
+ func_name,
+ thread->GetTid(),
name.c_str(),
thread->GetStateAndFlags(std::memory_order_relaxed).GetValue(),
thread->GetNativePriority(),
@@ -1221,9 +1177,81 @@
wrapped_barrier.barrier_.load(),
thread->suspended_count_ - suspended_count,
thread->checkpoint_count_ - checkpoint_count,
- failure_info.value().c_str()));
- UNREACHABLE();
+ failure_info.value().c_str());
+ // Check one last time whether thread passed the suspend barrier. Empirically this seems to
+ // happen maybe between 1 and 5% of the time.
+ if (wrapped_barrier.barrier_.load() != 0) {
+ // thread still has a pointer to wrapped_barrier. Returning and continuing would be unsafe
+ // without additional cleanup.
+ thread->AbortInThis(message);
+ UNREACHABLE();
+ }
+ is_suspended = true;
}
+ // wrapped_barrier.barrier_ has been decremented and will no longer be accessed.
+ VLOG(threads) << func_name << " suspended: " << *thread;
+ if (ATraceEnabled()) {
+ std::string name;
+ thread->GetThreadName(name);
+ ATraceBegin(
+ StringPrintf("%s suspended %s for tid=%d", func_name, name.c_str(), thread->GetTid())
+ .c_str());
+ }
+ DCHECK(thread->IsSuspended());
+ return true;
+}
+
+Thread* ThreadList::SuspendThreadByPeer(jobject peer, SuspendReason reason) {
+ Thread* const self = Thread::Current();
+ ThreadState old_self_state = self->GetState();
+ self->TransitionFromSuspendedToRunnable();
+ Locks::thread_list_lock_->ExclusiveLock(self);
+ ObjPtr<mirror::Object> thread_ptr = self->DecodeJObject(peer);
+ Thread* thread = Thread::FromManagedThread(self, thread_ptr);
+ if (thread == nullptr || !Contains(thread)) {
+ if (thread == nullptr) {
+ ObjPtr<mirror::Object> name = WellKnownClasses::java_lang_Thread_name->GetObject(thread_ptr);
+ std::string thr_name = (name == nullptr ? "<unknown>" : name->AsString()->ToModifiedUtf8());
+ LOG(WARNING) << "No such thread for suspend"
+ << ": " << peer << ":" << thr_name;
+ } else {
+ LOG(WARNING) << "SuspendThreadByPeer failed for unattached thread: "
+ << reinterpret_cast<void*>(thread);
+ }
+ Locks::thread_list_lock_->ExclusiveUnlock(self);
+ self->TransitionFromRunnableToSuspended(old_self_state);
+ return nullptr;
+ }
+ VLOG(threads) << "SuspendThreadByPeer found thread: " << *thread;
+ // Releases thread_list_lock_ and mutator lock.
+ bool success = SuspendThread(self, thread, reason, old_self_state, __func__, 0);
+ Locks::thread_list_lock_->AssertNotHeld(self);
+ return success ? thread : nullptr;
+}
+
+Thread* ThreadList::SuspendThreadByThreadId(uint32_t thread_id,
+ SuspendReason reason,
+ int attempt_of_4) {
+ Thread* const self = Thread::Current();
+ ThreadState old_self_state = self->GetState();
+ CHECK_NE(thread_id, kInvalidThreadId);
+ VLOG(threads) << "SuspendThreadByThreadId starting";
+ self->TransitionFromSuspendedToRunnable();
+ Locks::thread_list_lock_->ExclusiveLock(self);
+ Thread* thread = FindThreadByThreadId(thread_id);
+ if (thread == nullptr) {
+ // There's a race in inflating a lock and the owner giving up ownership and then dying.
+ LOG(WARNING) << StringPrintf("No such thread id %d for suspend", thread_id);
+ Locks::thread_list_lock_->ExclusiveUnlock(self);
+ self->TransitionFromRunnableToSuspended(old_self_state);
+ return nullptr;
+ }
+ DCHECK(Contains(thread));
+ VLOG(threads) << "SuspendThreadByThreadId found thread: " << *thread;
+ // Releases thread_list_lock_ and mutator lock.
+ bool success = SuspendThread(self, thread, reason, old_self_state, __func__, attempt_of_4);
+ Locks::thread_list_lock_->AssertNotHeld(self);
+ return success ? thread : nullptr;
}
Thread* ThreadList::FindThreadByThreadId(uint32_t thread_id) {
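
SuspendThread's retry loop parks a ThreadExitFlag on the target before it sleeps, so a target that exits between retries ends the loop instead of leaving us to touch a dead Thread. A miniature of that handshake under simplifying assumptions (single watcher, no locking shown; TryToSuspend and SleepBriefly are hypothetical):

    #include <atomic>

    bool TryToSuspend();  // hypothetical: one IncrementSuspendCount attempt
    void SleepBriefly();  // hypothetical: drop the locks and usleep, as above

    struct ExitFlag {
      std::atomic<bool> exited{false};
    };

    struct Target {
      ExitFlag* watcher = nullptr;  // guarded by thread_list_lock_ in the real code
      void Exit() {                 // run by the target thread as it shuts down
        if (watcher != nullptr) {
          watcher->exited.store(true, std::memory_order_release);
        }
      }
    };

    bool SuspendLoop(Target* t) {
      ExitFlag tef;
      t->watcher = &tef;         // NotifyOnThreadExit analogue
      do {
        if (TryToSuspend()) {
          t->watcher = nullptr;  // UnregisterThreadExitFlag analogue
          return true;
        }
        SleepBriefly();
      } while (!tef.exited.load(std::memory_order_acquire));
      return false;  // target exited first; the caller just logs a warning
    }
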
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index cb77440..8be2d4e 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -17,6 +17,10 @@
#ifndef ART_RUNTIME_THREAD_LIST_H_
#define ART_RUNTIME_THREAD_LIST_H_
+#include <bitset>
+#include <list>
+#include <vector>
+
#include "barrier.h"
#include "base/histogram.h"
#include "base/mutex.h"
@@ -25,10 +29,7 @@
#include "jni.h"
#include "reflective_handle_scope.h"
#include "suspend_reason.h"
-
-#include <bitset>
-#include <list>
-#include <vector>
+#include "thread_state.h"
namespace art HIDDEN {
namespace gc {
@@ -94,8 +95,10 @@
// Suspend a thread using its thread id, typically used by lock/monitor inflation. Returns the
// thread on success else null. The thread id is used to identify the thread to avoid races with
// the thread terminating. Note that as thread ids are recycled this may not suspend the expected
- // thread, that may be terminating.
- Thread* SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason)
+ // thread, which may be terminating. 'attempt_of_4' is zero if this is the only
+ // attempt, or 1..4 to try up to 4 times, each with a quarter of the timeout.
+ // TODO: Reconsider the use of thread_id, now that we have ThreadExitFlag.
+ Thread* SuspendThreadByThreadId(uint32_t thread_id, SuspendReason reason, int attempt_of_4 = 0)
REQUIRES(!Locks::mutator_lock_,
!Locks::thread_list_lock_,
!Locks::thread_suspend_count_lock_);
@@ -217,7 +220,9 @@
// the diagnostic information. If 0 is passed, we return an empty string on timeout. Normally
// the caller does not hold the mutator lock. See the comment at the call in
// RequestSynchronousCheckpoint for the only exception.
- std::optional<std::string> WaitForSuspendBarrier(AtomicInteger* barrier, pid_t t = 0)
+ std::optional<std::string> WaitForSuspendBarrier(AtomicInteger* barrier,
+ pid_t t = 0,
+ int attempt_of_4 = 0)
REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_);
private:
@@ -234,6 +239,19 @@
REQUIRES(Locks::thread_list_lock_, Locks::thread_suspend_count_lock_)
UNLOCK_FUNCTION(Locks::mutator_lock_);
+ // Helper to actually suspend a single thread. This is called with thread_list_lock_ held and
+ // the caller guarantees that *thread is valid until that is released. We "release" the
+ // mutator lock by switching to self_state. 'attempt_of_4' is 0 if we attempt only once,
+ // and 1..4 if we are going to try 4 times, each with a quarter of the full timeout.
+ // 'func_name' is used only to identify ourselves for logging.
+ bool SuspendThread(Thread* self,
+ Thread* thread,
+ SuspendReason reason,
+ ThreadState self_state,
+ const char* func_name,
+ int attempt_of_4) RELEASE(Locks::thread_list_lock_)
+ RELEASE_SHARED(Locks::mutator_lock_);
+
void SuspendAllInternal(Thread* self, SuspendReason reason = SuspendReason::kInternal)
REQUIRES(!Locks::thread_list_lock_,
!Locks::thread_suspend_count_lock_,