author     2025-01-24 13:42:17 -0800
committer  2025-02-13 09:26:23 -0800
commit     4bcbbe50ab5c85c5b988fe8bcf46895c850cc4ca (patch)
tree       927434a6af8e67a0a9fc2110baff1179effaa646
parent     f937d3d23083efc167ff68b41216373bffd23c95 (diff)
Extend suspend timeout for debug activities
Extend the debugging suspend timeout substantially if it appears we
are running concurrently with an ANR dump. There is some concern
that we may be regularly getting suspend timeouts because too many
CPU cycles are being devoted to an ANR dump at the same time.

Extend it almost indefinitely if the culprit thread is in a tracing
stop 't' state. This makes debugging slightly easier, and should reduce
the number of spurious bug reports we have to process.

Add a GetOsThreadStatQuick test and include a minimal test for
GetStateFromStatString.

Bug: 330444460
Bug: 392053307
Test: Treehugger
Change-Id: I820e7283fda84fa9fd9820ff9948b992e10547ec
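
For context on the 't' check described above: the thread state is the field that immediately follows the parenthesized command name in /proc/<pid>/task/<tid>/stat, and 't' means the thread is in a tracing stop, typically because a debugger or another ptrace-based tool has stopped it. The standalone sketch below is illustrative only and is not part of this change; StateFromStatLine() merely mirrors what the new GetStateFromStatString() helper does, except that it uses rfind() so a ')' inside the command name cannot confuse it.

// Illustrative sketch only; not ART code.
#include <fstream>
#include <iostream>
#include <string>

// Roughly what GetStateFromStatString() does: locate the ')' that closes the comm
// field and return the state character that follows it, or '?' on a malformed line.
static char StateFromStatLine(const std::string& line) {
  size_t rparen = line.rfind(')');
  if (rparen == std::string::npos || rparen + 2 >= line.length()) {
    return '?';
  }
  return line[rparen + 2];
}

int main() {
  // /proc/self/stat describes the main thread; the per-thread files under
  // /proc/self/task/<tid>/stat use the same format.
  std::ifstream stat_file("/proc/self/stat");
  std::string line;
  std::getline(stat_file, line);
  // This thread is currently running, so the expected output is 'R'.
  std::cout << "state: " << StateFromStatLine(line) << '\n';
  return 0;
}
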
-rw-r--r--  libartbase/base/utils.cc      | 12
-rw-r--r--  libartbase/base/utils.h       | 12
-rw-r--r--  libartbase/base/utils_test.cc |  7
-rw-r--r--  runtime/runtime.cc            |  4
-rw-r--r--  runtime/runtime.h             |  2
-rw-r--r--  runtime/signal_catcher.h      |  2
-rw-r--r--  runtime/thread.cc             |  2
-rw-r--r--  runtime/thread_list.cc        | 26
8 files changed, 56 insertions, 11 deletions
diff --git a/libartbase/base/utils.cc b/libartbase/base/utils.cc
index e77d37d29b..efb727f950 100644
--- a/libartbase/base/utils.cc
+++ b/libartbase/base/utils.cc
@@ -424,6 +424,18 @@ std::string GetOsThreadStatQuick(pid_t tid) {
 #endif
 }
 
+char GetStateFromStatString(const std::string& stat_output) {
+  size_t rparen_pos = stat_output.find(")");
+  if (rparen_pos == std::string::npos || rparen_pos >= stat_output.length() - 3) {
+    return '?';
+  }
+  size_t state_pos = stat_output.find_first_not_of(" ", rparen_pos + 1);
+  if (state_pos == std::string::npos) {
+    return '?';
+  }
+  return stat_output[state_pos];
+}
+
 std::string GetOtherThreadOsStats() {
 #if defined(__linux__)
   DIR* dir = opendir("/proc/self/task");
diff --git a/libartbase/base/utils.h b/libartbase/base/utils.h
index 4b86651f95..fdb7f6650a 100644
--- a/libartbase/base/utils.h
+++ b/libartbase/base/utils.h
@@ -152,14 +152,18 @@ inline void ForceRead(const T* pointer) {
 // there is an I/O error.
 std::string GetProcessStatus(const char* key);
 
-// Copy a prefix of /proc/tid/stat of the given length into buf. Return the number of bytes
-// actually read, 0 on error.
+// Copy a prefix of /proc/pid/task/tid/stat of the given length into buf. Return the number of
+// bytes actually read, 0 on error.
 size_t GetOsThreadStat(pid_t tid, char* buf, size_t len);
 
-// Return a short prefix of /proc/tid/stat as quickly and robustly as possible. Used for debugging
-// timing issues and possibly issues with /proc itself. Always atomic.
+// Return a short prefix of /proc/pid/task/tid/stat as quickly and robustly as possible. Used for
+// debugging timing issues and possibly issues with /proc itself. Always atomic.
 std::string GetOsThreadStatQuick(pid_t tid);
 
+// Given a /proc/.../stat string or prefix, such as those returned by the above, return the single
+// character representation of the thread state from that string, or '?' if it can't be found.
+char GetStateFromStatString(const std::string& stat_output);
+
 // Return a concatenation of the output of GetOsThreadStatQuick(tid) for all other tids.
 // Less robust against concurrent change, but individual stat strings should still always
 // be consistent. Called only when we are nearly certain to crash anyway.
diff --git a/libartbase/base/utils_test.cc b/libartbase/base/utils_test.cc
index ab8627fa7d..a422fc5369 100644
--- a/libartbase/base/utils_test.cc
+++ b/libartbase/base/utils_test.cc
@@ -124,6 +124,13 @@ TEST_F(UtilsTest, GetProcessStatus) {
   EXPECT_EQ("<unknown>", GetProcessStatus("InvalidFieldName"));
 }
 
+TEST_F(UtilsTest, GetOsThreadStatQuick) {
+  std::string my_stat = GetOsThreadStatQuick(GetTid());
+  EXPECT_GT(my_stat.length(), 20);
+  EXPECT_LT(my_stat.length(), 1000);
+  EXPECT_EQ('R', GetStateFromStatString(my_stat));
+}
+
 TEST_F(UtilsTest, StringSplit) {
   auto range = SplitString("[ab[c[[d[e[", '[');
   auto it = range.begin();
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index a16bde3469..e2c30d619c 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -2450,8 +2450,8 @@ void Runtime::DumpDeoptimizations(std::ostream& os) {
   }
 }
 
-std::optional<uint64_t> Runtime::SiqQuitNanoTime() const {
-  return signal_catcher_ != nullptr ? signal_catcher_->SiqQuitNanoTime() : std::nullopt;
+std::optional<uint64_t> Runtime::SigQuitNanoTime() const {
+  return signal_catcher_ != nullptr ? signal_catcher_->SigQuitNanoTime() : std::nullopt;
 }
 
 void Runtime::DumpForSigQuit(std::ostream& os) {
diff --git a/runtime/runtime.h b/runtime/runtime.h
index e43d1d3432..63a0707424 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -314,7 +314,7 @@ class Runtime {
   void DetachCurrentThread(bool should_run_callbacks = true) REQUIRES(!Locks::mutator_lock_);
 
   // If we are handling SIQQUIT return the time when we received it.
-  std::optional<uint64_t> SiqQuitNanoTime() const;
+  std::optional<uint64_t> SigQuitNanoTime() const;
 
   void DumpDeoptimizations(std::ostream& os);
   void DumpForSigQuit(std::ostream& os);
diff --git a/runtime/signal_catcher.h b/runtime/signal_catcher.h
index 79014ea022..4d5d71ede8 100644
--- a/runtime/signal_catcher.h
+++ b/runtime/signal_catcher.h
@@ -42,7 +42,7 @@ class SignalCatcher {
   void HandleSigQuit() REQUIRES(!Locks::mutator_lock_, !Locks::thread_list_lock_,
                                 !Locks::thread_suspend_count_lock_);
 
-  std::optional<uint64_t> SiqQuitNanoTime() const { return sigquit_nanotime_; }
+  std::optional<uint64_t> SigQuitNanoTime() const { return sigquit_nanotime_; }
 
  private:
   // NO_THREAD_SAFETY_ANALYSIS for static function calling into member function with excludes lock.
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 89465979da..5fc0ce6790 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2439,7 +2439,7 @@ Thread::DumpOrder Thread::DumpStack(std::ostream& os,
               /*check_suspended=*/ !force_dump_stack,
               /*dump_locks=*/ !force_dump_stack);
   Runtime* runtime = Runtime::Current();
-  std::optional<uint64_t> start = runtime != nullptr ? runtime->SiqQuitNanoTime() : std::nullopt;
+  std::optional<uint64_t> start = runtime != nullptr ? runtime->SigQuitNanoTime() : std::nullopt;
   if (start.has_value()) {
     os << "DumpLatencyMs: " << static_cast<float>(nanotime - start.value()) / 1000000.0 << "\n";
   }
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 2acf58bfce..b4f79ee5a8 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -720,7 +720,7 @@ static constexpr unsigned kSuspendBarrierIters = kShortSuspendTimeouts ? 5 : 20;
 
 #if ART_USE_FUTEXES
 
-// Returns true if it timed out.
+// Returns true if it timed out. Times out after timeout_ns/kSuspendBarrierIters nsecs.
 static bool WaitOnceForSuspendBarrier(AtomicInteger* barrier,
                                       int32_t cur_val,
                                       uint64_t timeout_ns) {
@@ -804,10 +804,23 @@ std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barr
     return std::nullopt;
   }
 
+  // Extra timeout to compensate for concurrent thread dumps, so that we are less likely to time
+  // out during ANR dumps.
+  uint64_t dump_adjustment_ns = 0;
+  // Total timeout increment if we see a concurrent thread dump. Distributed evenly across
+  // remaining iterations.
+  static constexpr uint64_t kDumpWaitNSecs = 30'000'000'000ull;  // 30 seconds
+  // Replacement timeout if the thread is stopped for tracing, probably by a debugger.
+  static constexpr uint64_t kTracingWaitNSecs = 7'200'000'000'000ull;  // Wait a bit under 2 hours.
+
   // Long wait; gather information in case of timeout.
   std::string sampled_state = collect_state ? GetOsThreadStatQuick(t) : "";
+  if (collect_state && GetStateFromStatString(sampled_state) == 't') {
+    LOG(WARNING) << "Thread suspension nearly timed out due to tracing stop (debugger attached?)";
+    timeout_ns = kTracingWaitNSecs;
+  }
   while (i < kSuspendBarrierIters) {
-    if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns)) {
+    if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns + dump_adjustment_ns)) {
       ++i;
 #if ART_USE_FUTEXES
       if (!kShortSuspendTimeouts) {
@@ -820,6 +833,15 @@ std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barr
       DCHECK_EQ(cur_val, 0);
       return std::nullopt;
     }
+    std::optional<uint64_t> last_sigquit_nanotime = Runtime::Current()->SigQuitNanoTime();
+    if (last_sigquit_nanotime.has_value() && i < kSuspendBarrierIters) {
+      // Adjust dump_adjustment_ns to reflect the number of iterations we have left and how long
+      // ago we started dumping threads.
+      uint64_t new_unscaled_adj = kDumpWaitNSecs + last_sigquit_nanotime.value() - NanoTime();
+      // Scale by the fraction of iterations still remaining.
+      dump_adjustment_ns = new_unscaled_adj * kSuspendBarrierIters / (kSuspendBarrierIters - i);
+    }
+    // Keep the old dump_adjustment_ns if SigQuitNanoTime() was cleared.
   }
   uint64_t final_wait_time = NanoTime() - start_time;
   uint64_t total_wait_time = attempt_of_4 == 0 ?
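
A brief worked example of the scaling used in the WaitForSuspendBarrier() hunk above, since the factor is easy to misread: each WaitOnceForSuspendBarrier() call sleeps for (timeout_ns + dump_adjustment_ns) / kSuspendBarrierIters, so spreading an extra budget over only the iterations that remain requires scaling the budget up by kSuspendBarrierIters / (kSuspendBarrierIters - i). The snippet below is only a sanity check of that arithmetic, not ART code; the constants simply mirror the non-debug kSuspendBarrierIters value and kDumpWaitNSecs.

#include <cstdint>
#include <iostream>

int main() {
  constexpr uint64_t kIters = 20;                   // mirrors kSuspendBarrierIters (non-debug)
  constexpr uint64_t kExtraNs = 30'000'000'000ull;  // mirrors kDumpWaitNSecs (30 s)
  constexpr uint64_t i = 5;                         // iterations already completed

  // Scale the remaining budget so that the per-iteration division by kIters still
  // spreads all of it over the (kIters - i) iterations that are left.
  constexpr uint64_t dump_adjustment_ns = kExtraNs * kIters / (kIters - i);

  // Extra time actually added across the remaining iterations; prints the full 30 s budget.
  constexpr uint64_t added_ns = (dump_adjustment_ns / kIters) * (kIters - i);
  std::cout << added_ns << " of " << kExtraNs << " extra ns are applied\n";
  return 0;
}

In the actual change the budget is recomputed on every iteration against NanoTime(), so it shrinks as the ANR dump ages; the snippet only demonstrates the scaling factor itself.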