summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Hans Boehm <hboehm@google.com> 2024-09-19 10:04:31 -0700
committer Hans Boehm <hboehm@google.com> 2024-09-21 00:06:03 +0000
commite23d7335d327e3ad2d1c6c88261ea22835ecb342 (patch)
tree822b10aca4a6ec5a863d15c08d639fe9eb1967c7
parented67125787c5ec24285e694e56f6e4f1f7fef1be (diff)
More suspend timeout improvements
1) Increase suspend timeouts when the request comes from a low priority thread. We used to double the timeout, we now triple it. 2) Rather than including /proc/<tid>/stat prefixes in timeout messages. use /proc/<pid>/task/<tid>/stat. The former reports running times for the process, while the latter reports thread-specific numbers. (In spite of what the man page states.) 3) Improve the timeout message format to eliminate some redundant information and instead include 10 more characters from the stat files, better exposing priority-related fields. 4) We print a calculated approximation of the total wait time before the timeout. Just printing the final wait time turned out to be confusing. 5) Rather than lengthening the third out of 4 wait attempts more than the others, we now lengthen the 4th and final one instead. This means the two stat file snapshots may be further apart. This should somewhat reduce timeouts, but mostly it further improves the message we get when we do time out. Bug: 366061900 Test: Treehugger Test: Check messages with kShortSuspendTimeouts set. Change-Id: Ife42f0b9c37c4fabe82c076c66923d9592df53f7
-rw-r--r--libartbase/base/utils.cc8
-rw-r--r--runtime/thread_list.cc47
2 files changed, 34 insertions, 21 deletions
diff --git a/libartbase/base/utils.cc b/libartbase/base/utils.cc
index 3d4f670e64..da53d0b99f 100644
--- a/libartbase/base/utils.cc
+++ b/libartbase/base/utils.cc
@@ -377,9 +377,11 @@ std::string GetProcessStatus(const char* key) {
size_t GetOsThreadStat(pid_t tid, char* buf, size_t len) {
#if defined(__linux__)
- static constexpr int NAME_BUF_SIZE = 50;
+ static constexpr int NAME_BUF_SIZE = 60;
char file_name_buf[NAME_BUF_SIZE];
- snprintf(file_name_buf, NAME_BUF_SIZE, "/proc/%d/stat", tid);
+ // We don't use just /proc/<pid>/stat since, in spite of some documentation to the contrary,
+ // those report utime and stime values for the whole process, not just the thread.
+ snprintf(file_name_buf, NAME_BUF_SIZE, "/proc/%d/task/%d/stat", getpid(), tid);
int stat_fd = open(file_name_buf, O_RDONLY | O_CLOEXEC);
if (stat_fd >= 0) {
ssize_t bytes_read = TEMP_FAILURE_RETRY(read(stat_fd, buf, len));
@@ -398,7 +400,7 @@ size_t GetOsThreadStat(pid_t tid, char* buf, size_t len) {
}
std::string GetOsThreadStatQuick(pid_t tid) {
- static constexpr int BUF_SIZE = 90;
+ static constexpr int BUF_SIZE = 100;
char buf[BUF_SIZE];
#if defined(__linux__)
if (GetOsThreadStat(tid, buf, BUF_SIZE) == 0) {
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index c5b80bcdc3..3665b34cd5 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -768,15 +768,22 @@ std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barr
#endif
uint64_t timeout_ns =
attempt_of_4 == 0 ? thread_suspend_timeout_ns_ : thread_suspend_timeout_ns_ / 4;
- if (attempt_of_4 != 1 && getpriority(PRIO_PROCESS, 0 /* this thread */) > 0) {
- // We're a low priority thread, and thus have a longer ANR timeout. Double the suspend
- // timeout. To avoid the getpriority system call in the common case, we fail to double the
- // first of 4 waits, but then triple the third one to compensate.
- if (attempt_of_4 == 3) {
- timeout_ns *= 3;
- } else {
- timeout_ns *= 2;
+
+ uint64_t avg_wait_multiplier = 1;
+ uint64_t wait_multiplier = 1;
+ if (attempt_of_4 != 1) {
+ // TODO: RequestSynchronousCheckpoint routinely passes attempt_of_4 = 0. Can
+ // we avoid the getpriority() call?
+ if (getpriority(PRIO_PROCESS, 0 /* this thread */) > 0) {
+ // We're a low priority thread, and thus have a longer ANR timeout. Increase the suspend
+ // timeout.
+ avg_wait_multiplier = 3;
}
+ // To avoid the system calls in the common case, we fail to increase the first of 4 waits, but
+ // then compensate during the last one. This also allows somewhat longer thread monitoring
+ // before we time out.
+ wait_multiplier = attempt_of_4 == 4 ? 2 * avg_wait_multiplier - 1 : avg_wait_multiplier;
+ timeout_ns *= wait_multiplier;
}
bool collect_state = (t != 0 && (attempt_of_4 == 0 || attempt_of_4 == 4));
int32_t cur_val = barrier->load(std::memory_order_acquire);
@@ -811,9 +818,14 @@ std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barr
return std::nullopt;
}
}
+ uint64_t final_wait_time = NanoTime() - start_time;
+ uint64_t total_wait_time = attempt_of_4 == 0 ?
+ final_wait_time :
+ 4 * final_wait_time * avg_wait_multiplier / wait_multiplier;
return collect_state ? "Target states: [" + sampled_state + ", " + GetOsThreadStatQuick(t) + "]" +
- std::to_string(cur_val) + "@" + std::to_string((uintptr_t)barrier) +
- " Final wait time: " + PrettyDuration(NanoTime() - start_time) :
+ (cur_val == 0 ? "(barrier now passed)" : "") +
+ " Final wait time: " + PrettyDuration(final_wait_time) +
+ "; appr. total wait time: " + PrettyDuration(total_wait_time) :
"";
}
@@ -972,7 +984,7 @@ void ThreadList::SuspendAllInternal(Thread* self, SuspendReason reason) {
// Second to the last attempt; Try to gather more information in case we time out.
MutexLock mu(self, *Locks::thread_list_lock_);
MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
- oss << "Unsuspended threads: ";
+ oss << "remaining threads: ";
for (const auto& thread : list_) {
if (thread != self && !thread->IsSuspended()) {
culprit = thread;
@@ -989,16 +1001,15 @@ void ThreadList::SuspendAllInternal(Thread* self, SuspendReason reason) {
} else {
std::string name;
culprit->GetThreadName(name);
- oss << "Info for " << *culprit << ":";
+ oss << "Info for " << name << ": ";
std::string thr_descr =
- StringPrintf("%s tid: %d, state&flags: 0x%x, priority: %d, barrier value: %d, ",
- name.c_str(),
- tid,
+ StringPrintf("state&flags: 0x%x, Java/native priority: %d/%d, barrier value: %d, ",
culprit->GetStateAndFlags(std::memory_order_relaxed).GetValue(),
culprit->GetNativePriority(),
+ getpriority(PRIO_PROCESS /* really thread */, culprit->GetTid()),
pending_threads.load());
oss << thr_descr << result.value();
- culprit->AbortInThis("SuspendAll timeout: " + oss.str());
+ culprit->AbortInThis("SuspendAll timeout; " + oss.str());
}
}
}
@@ -1204,13 +1215,13 @@ bool ThreadList::SuspendThread(Thread* self,
// 'thread' should still have a suspend request pending, and hence stick around. Try to abort
// there, since its stack trace is much more interesting than ours.
std::string message = StringPrintf(
- "%s timed out: %d (%s), state&flags: 0x%x, priority: %d,"
+ "%s timed out: %s: state&flags: 0x%x, Java/native priority: %d/%d,"
" barriers: %p, ours: %p, barrier value: %d, nsusps: %d, ncheckpts: %d, thread_info: %s",
func_name,
- thread->GetTid(),
name.c_str(),
thread->GetStateAndFlags(std::memory_order_relaxed).GetValue(),
thread->GetNativePriority(),
+ getpriority(PRIO_PROCESS /* really thread */, thread->GetTid()),
first_barrier,
&wrapped_barrier,
wrapped_barrier.barrier_.load(),