summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Hans Boehm <hboehm@google.com> 2025-01-24 13:42:17 -0800
committer Hans Boehm <hboehm@google.com> 2025-02-13 09:26:23 -0800
commit4bcbbe50ab5c85c5b988fe8bcf46895c850cc4ca (patch)
tree927434a6af8e67a0a9fc2110baff1179effaa646
parentf937d3d23083efc167ff68b41216373bffd23c95 (diff)
Extend suspend timeout for debug activities
Extend the debugging suspend timeout substantially if it appears we are running concurrently with an ANR dump. There is some concern that we may be regularly getting suspend timeouts because too many cycles are devoted to an ANR dump at the time. Extend it almost indefinitely if the culprit thread is in a tracing stop 't' state. This makes debugging slightly easier, and should reduce the number of spurious bug reports we have to process. Add GetOsThreadStatQuick test and include minimal test for GetStateFromStatString. Bug: 330444460 Bug: 392053307 Test: Treehugger Change-Id: I820e7283fda84fa9fd9820ff9948b992e10547ec
-rw-r--r--libartbase/base/utils.cc12
-rw-r--r--libartbase/base/utils.h12
-rw-r--r--libartbase/base/utils_test.cc7
-rw-r--r--runtime/runtime.cc4
-rw-r--r--runtime/runtime.h2
-rw-r--r--runtime/signal_catcher.h2
-rw-r--r--runtime/thread.cc2
-rw-r--r--runtime/thread_list.cc26
8 files changed, 56 insertions, 11 deletions
diff --git a/libartbase/base/utils.cc b/libartbase/base/utils.cc
index e77d37d29b..efb727f950 100644
--- a/libartbase/base/utils.cc
+++ b/libartbase/base/utils.cc
@@ -424,6 +424,18 @@ std::string GetOsThreadStatQuick(pid_t tid) {
#endif
}
// Extract the single-character thread state from a /proc/.../stat string (or
// prefix of one), e.g. 'R', 'S', 't'. Returns '?' if the state cannot be found.
//
// Per proc(5), the state is the third field, immediately following the
// parenthesized comm field. The comm field may itself contain ')' characters,
// so we must search for the LAST right parenthesis, not the first.
char GetStateFromStatString(const std::string& stat_output) {
  size_t rparen_pos = stat_output.rfind(')');
  if (rparen_pos == std::string::npos) {
    return '?';
  }
  // The state character is the first non-space character after the ')'.
  size_t state_pos = stat_output.find_first_not_of(' ', rparen_pos + 1);
  // Check state_pos (not rparen_pos) so we never index with npos when the
  // string ends at, or contains only spaces after, the ')'.
  if (state_pos == std::string::npos) {
    return '?';
  }
  return stat_output[state_pos];
}
+
std::string GetOtherThreadOsStats() {
#if defined(__linux__)
DIR* dir = opendir("/proc/self/task");
diff --git a/libartbase/base/utils.h b/libartbase/base/utils.h
index 4b86651f95..fdb7f6650a 100644
--- a/libartbase/base/utils.h
+++ b/libartbase/base/utils.h
@@ -152,14 +152,18 @@ inline void ForceRead(const T* pointer) {
// there is an I/O error.
std::string GetProcessStatus(const char* key);
-// Copy a prefix of /proc/tid/stat of the given length into buf. Return the number of bytes
-// actually read, 0 on error.
+// Copy a prefix of /proc/pid/task/tid/stat of the given length into buf. Return the number of
+// bytes actually read, 0 on error.
size_t GetOsThreadStat(pid_t tid, char* buf, size_t len);
-// Return a short prefix of /proc/tid/stat as quickly and robustly as possible. Used for debugging
-// timing issues and possibly issues with /proc itself. Always atomic.
+// Return a short prefix of /proc/pid/task/tid/stat as quickly and robustly as possible. Used for
+// debugging timing issues and possibly issues with /proc itself. Always atomic.
std::string GetOsThreadStatQuick(pid_t tid);
+// Given a /proc/.../stat string or prefix, such as those returned by the above, return the single
+// character representation of the thread state from that string, or '?' if it can't be found.
+char GetStateFromStatString(const std::string& stat_output);
+
// Return a concatenation of the output of GetOsThreadStatQuick(tid) for all other tids.
// Less robust against concurrent change, but individual stat strings should still always
// be consistent. Called only when we are nearly certain to crash anyway.
diff --git a/libartbase/base/utils_test.cc b/libartbase/base/utils_test.cc
index ab8627fa7d..a422fc5369 100644
--- a/libartbase/base/utils_test.cc
+++ b/libartbase/base/utils_test.cc
@@ -124,6 +124,13 @@ TEST_F(UtilsTest, GetProcessStatus) {
EXPECT_EQ("<unknown>", GetProcessStatus("InvalidFieldName"));
}
+TEST_F(UtilsTest, GetOsThreadStatQuick) {
+ std::string my_stat = GetOsThreadStatQuick(GetTid());
+ EXPECT_GT(my_stat.length(), 20);
+ EXPECT_LT(my_stat.length(), 1000);
+ EXPECT_EQ('R', GetStateFromStatString(my_stat));
+}
+
TEST_F(UtilsTest, StringSplit) {
auto range = SplitString("[ab[c[[d[e[", '[');
auto it = range.begin();
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index a16bde3469..e2c30d619c 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -2450,8 +2450,8 @@ void Runtime::DumpDeoptimizations(std::ostream& os) {
}
}
-std::optional<uint64_t> Runtime::SiqQuitNanoTime() const {
- return signal_catcher_ != nullptr ? signal_catcher_->SiqQuitNanoTime() : std::nullopt;
+std::optional<uint64_t> Runtime::SigQuitNanoTime() const {
+ return signal_catcher_ != nullptr ? signal_catcher_->SigQuitNanoTime() : std::nullopt;
}
void Runtime::DumpForSigQuit(std::ostream& os) {
diff --git a/runtime/runtime.h b/runtime/runtime.h
index e43d1d3432..63a0707424 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -314,7 +314,7 @@ class Runtime {
void DetachCurrentThread(bool should_run_callbacks = true) REQUIRES(!Locks::mutator_lock_);
// If we are handling SIQQUIT return the time when we received it.
- std::optional<uint64_t> SiqQuitNanoTime() const;
+ std::optional<uint64_t> SigQuitNanoTime() const;
void DumpDeoptimizations(std::ostream& os);
void DumpForSigQuit(std::ostream& os);
diff --git a/runtime/signal_catcher.h b/runtime/signal_catcher.h
index 79014ea022..4d5d71ede8 100644
--- a/runtime/signal_catcher.h
+++ b/runtime/signal_catcher.h
@@ -42,7 +42,7 @@ class SignalCatcher {
void HandleSigQuit() REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_,
!Locks::thread_suspend_count_lock_);
- std::optional<uint64_t> SiqQuitNanoTime() const { return sigquit_nanotime_; }
+ std::optional<uint64_t> SigQuitNanoTime() const { return sigquit_nanotime_; }
private:
// NO_THREAD_SAFETY_ANALYSIS for static function calling into member function with excludes lock.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 89465979da..5fc0ce6790 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2439,7 +2439,7 @@ Thread::DumpOrder Thread::DumpStack(std::ostream& os,
/*check_suspended=*/ !force_dump_stack,
/*dump_locks=*/ !force_dump_stack);
Runtime* runtime = Runtime::Current();
- std::optional<uint64_t> start = runtime != nullptr ? runtime->SiqQuitNanoTime() : std::nullopt;
+ std::optional<uint64_t> start = runtime != nullptr ? runtime->SigQuitNanoTime() : std::nullopt;
if (start.has_value()) {
os << "DumpLatencyMs: " << static_cast<float>(nanotime - start.value()) / 1000000.0 << "\n";
}
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 2acf58bfce..b4f79ee5a8 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -720,7 +720,7 @@ static constexpr unsigned kSuspendBarrierIters = kShortSuspendTimeouts ? 5 : 20;
#if ART_USE_FUTEXES
-// Returns true if it timed out.
+// Returns true if it timed out. Times out after timeout_ns/kSuspendBarrierIters nsecs
static bool WaitOnceForSuspendBarrier(AtomicInteger* barrier,
int32_t cur_val,
uint64_t timeout_ns) {
@@ -804,10 +804,23 @@ std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barr
return std::nullopt;
}
+ // Extra timeout to compensate for concurrent thread dumps, so that we are less likely to time
+ // out during ANR dumps.
+ uint64_t dump_adjustment_ns = 0;
+ // Total timeout increment if we see a concurrent thread dump. Distributed evenly across
+ // remaining iterations.
+ static constexpr uint64_t kDumpWaitNSecs = 30'000'000'000ull; // 30 seconds
+ // Replacement timeout if thread is stopped for tracing, probably by a debugger.
+ static constexpr uint64_t kTracingWaitNSecs = 7'200'000'000'000ull; // wait a bit < 2 hours;
+
// Long wait; gather information in case of timeout.
std::string sampled_state = collect_state ? GetOsThreadStatQuick(t) : "";
+ if (collect_state && GetStateFromStatString(sampled_state) == 't') {
+ LOG(WARNING) << "Thread suspension nearly timed out due to Tracing stop (debugger attached?)";
+ timeout_ns = kTracingWaitNSecs;
+ }
while (i < kSuspendBarrierIters) {
- if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns)) {
+ if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns + dump_adjustment_ns)) {
++i;
#if ART_USE_FUTEXES
if (!kShortSuspendTimeouts) {
@@ -820,6 +833,15 @@ std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barr
DCHECK_EQ(cur_val, 0);
return std::nullopt;
}
+ std::optional<uint64_t> last_sigquit_nanotime = Runtime::Current()->SigQuitNanoTime();
+ if (last_sigquit_nanotime.has_value() && i < kSuspendBarrierIters) {
+ // Adjust dump_adjustment_ns to reflect the number of iterations we have left and how long
+ // ago we started dumping threads.
+ uint64_t new_unscaled_adj = kDumpWaitNSecs + last_sigquit_nanotime.value() - NanoTime();
+ // Scale by the fraction of iterations still remaining.
+ dump_adjustment_ns = new_unscaled_adj * kSuspendBarrierIters / (kSuspendBarrierIters - i);
+ }
+ // Keep the old dump_adjustment_ns if SigQuitNanoTime() was cleared.
}
uint64_t final_wait_time = NanoTime() - start_time;
uint64_t total_wait_time = attempt_of_4 == 0 ?