diff options
author | 2024-04-17 15:06:10 -0700 | |
---|---|---|
committer | 2024-04-19 23:59:23 +0000 | |
commit | 55e99bd1c5a403c4bddc023403593c9199af56f2 (patch) | |
tree | d3ff0c260735e4d45c033b9818fcfda04389b789 | |
parent | 1df1ff5f6c8c06fd1f03b85761f774d086ac65c8 (diff) |
Add debug info for failure to become 1-threaded
Add more information for the 'Failed to reach single-threaded state'
crashes in the zygote.
We capture abbreviated /proc/.../stat for all other threads
in the process at that point. After doing so, we also re-read and
re-print the line for the requesting process, to see if anything
changed while we were capturing the others.
This should tell us whether we're still dealing with the expected
daemons not exiting fast enough, or whether we have a different issue
as well.
This involved moving the /proc/.../stat printing code to enable reuse.
We also now use the same code in the primary test to determine if we
are single-threaded at the OS level.
Test: Standalone test program attached to b/310106506.
Test: art/test/testrunner/testrunner.py --host -b with kNumTries = 1
Bug: 310106506
Bug: 333219584
Bug: 315721088
Bug: 314247665
Bug: 314225311
Bug: 319209976
Bug: 317838599
Change-Id: I09aa9dd1bca13f7cd3bfcc1551bceaf1eb7458c2
-rw-r--r-- | libartbase/base/utils.cc | 73 | ||||
-rw-r--r-- | libartbase/base/utils.h | 13 | ||||
-rw-r--r-- | runtime/runtime.cc | 27 | ||||
-rw-r--r-- | runtime/thread_list.cc | 27 |
4 files changed, 103 insertions, 37 deletions
diff --git a/libartbase/base/utils.cc b/libartbase/base/utils.cc index 9efc1477cc..0ec262e696 100644 --- a/libartbase/base/utils.cc +++ b/libartbase/base/utils.cc @@ -366,4 +366,77 @@ std::string GetProcessStatus(const char* key) { return "<unknown>"; } +size_t GetOsThreadStat(pid_t tid, char* buf, size_t len) { +#if defined(__linux__) + static constexpr int NAME_BUF_SIZE = 50; + char file_name_buf[NAME_BUF_SIZE]; + snprintf(file_name_buf, NAME_BUF_SIZE, "/proc/%d/stat", tid); + int stat_fd = open(file_name_buf, O_RDONLY | O_CLOEXEC); + if (stat_fd >= 0) { + ssize_t bytes_read = TEMP_FAILURE_RETRY(read(stat_fd, buf, len)); + CHECK_GT(bytes_read, 0) << strerror(errno); + int ret = close(stat_fd); + CHECK_EQ(ret, 0) << strerror(errno); + buf[len - 1] = '\0'; + return bytes_read; + } +#else + UNUSED(tid); + UNUSED(buf); + UNUSED(len); +#endif + return 0; +} + +std::string GetOsThreadStatQuick(pid_t tid) { + static constexpr int BUF_SIZE = 90; + char buf[BUF_SIZE]; +#if defined(__linux__) + if (GetOsThreadStat(tid, buf, BUF_SIZE) == 0) { + snprintf(buf, BUF_SIZE, "Unknown state: %d", tid); + } +#else + UNUSED(tid); + strcpy(buf, "Unknown state"); // snprintf may not be usable. +#endif + return buf; +} + +std::string GetOtherThreadOsStats() { +#if defined(__linux__) + DIR* dir = opendir("/proc/self/task"); + if (dir == nullptr) { + return std::string("Failed to open /proc/self/task: ") + strerror(errno); + } + pid_t me = GetTid(); + struct dirent* de; + std::string result; + bool found_me = false; + errno = 0; + while ((de = readdir(dir)) != nullptr) { + if (de->d_name[0] == '.') { + continue; + } + pid_t tid = atoi(de->d_name); + if (tid == me) { + found_me = true; + } else { + if (!result.empty()) { + result += "; "; + } + result += tid == 0 ? std::string("bad tid: ") + de->d_name : GetOsThreadStatQuick(tid); + } + } + if (errno == EBADF) { + result += "(Bad directory)"; + } + if (!found_me) { + result += "(Failed to find requestor)"; + } + return result; +#else + return "Can't get other threads"; +#endif +} + } // namespace art diff --git a/libartbase/base/utils.h b/libartbase/base/utils.h index fb91f26279..b5e03dcdb8 100644 --- a/libartbase/base/utils.h +++ b/libartbase/base/utils.h @@ -145,6 +145,19 @@ inline void ForceRead(const T* pointer) { // there is an I/O error. std::string GetProcessStatus(const char* key); +// Copy a prefix of /proc/tid/stat of the given length into buf. Return the number of bytes +// actually read, 0 on error. +size_t GetOsThreadStat(pid_t tid, char* buf, size_t len); + +// Return a short prefix of /proc/tid/stat as quickly and robustly as possible. Used for debugging +// timing issues and possibly issues with /proc itself. Always atomic. +std::string GetOsThreadStatQuick(pid_t tid); + +// Return a concatenation of the output of GetOsThreadStatQuick(tid) for all other tids. +// Less robust against concurrent change, but individual stat strings should still always +// be consistent. Called only when we are nearly certain to crash anyway. +std::string GetOtherThreadOsStats(); + } // namespace art #endif // ART_LIBARTBASE_BASE_UTILS_H_ diff --git a/runtime/runtime.cc b/runtime/runtime.cc index 6416199192..df66c5e175 100644 --- a/runtime/runtime.cc +++ b/runtime/runtime.cc @@ -768,19 +768,16 @@ static void WaitUntilSingleThreaded() { // break atomicity of the read. static constexpr size_t kNumTries = 1000; static constexpr size_t kNumThreadsIndex = 20; - static constexpr ssize_t BUF_SIZE = 500; - static constexpr ssize_t BUF_PRINT_SIZE = 150; // Only log this much on failure to limit length. + static constexpr size_t BUF_SIZE = 500; + static constexpr size_t BUF_PRINT_SIZE = 150; // Only log this much on failure to limit length. static_assert(BUF_SIZE > BUF_PRINT_SIZE); char buf[BUF_SIZE]; - ssize_t bytes_read = -1; + size_t bytes_read = 0; + for (size_t tries = 0; tries < kNumTries; ++tries) { - int stat_fd = open("/proc/self/stat", O_RDONLY | O_CLOEXEC); - CHECK(stat_fd >= 0) << strerror(errno); - bytes_read = TEMP_FAILURE_RETRY(read(stat_fd, buf, BUF_SIZE)); - CHECK(bytes_read >= 0) << strerror(errno); - int ret = close(stat_fd); - DCHECK(ret == 0) << strerror(errno); - ssize_t pos = 0; + bytes_read = GetOsThreadStat(getpid(), buf, BUF_SIZE); + CHECK_NE(bytes_read, 0u); + size_t pos = 0; while (pos < bytes_read && buf[pos++] != ')') {} ++pos; // We're now positioned at the beginning of the third field. Don't count blanks embedded in @@ -800,8 +797,14 @@ static void WaitUntilSingleThreaded() { usleep(1000); } buf[std::min(BUF_PRINT_SIZE, bytes_read)] = '\0'; // Truncate buf before printing. - LOG(FATAL) << "Failed to reach single-threaded state: bytes_read = " << bytes_read - << " stat contents = \"" << buf << "...\""; + LOG(ERROR) << "Not single threaded: bytes_read = " << bytes_read << " stat contents = \"" << buf + << "...\""; + LOG(ERROR) << "Other threads' abbreviated stats: " << GetOtherThreadOsStats(); + bytes_read = GetOsThreadStat(getpid(), buf, BUF_PRINT_SIZE); + CHECK_NE(bytes_read, 0u); + LOG(ERROR) << "After re-read: bytes_read = " << bytes_read << " stat contents = \"" << buf + << "...\""; + LOG(FATAL) << "Failed to reach single-threaded state"; #else // Not Linux; shouldn't matter, but this has a high probability of working slowly. usleep(20'000); #endif diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc index 02e3f5475a..8ecc89c0a1 100644 --- a/runtime/thread_list.cc +++ b/runtime/thread_list.cc @@ -705,29 +705,6 @@ static bool WaitOnceForSuspendBarrier(AtomicInteger* barrier, #endif // ART_USE_FUTEXES -// Return a short string describing the scheduling state of the thread with the given tid. -static std::string GetThreadState(pid_t t) { -#if defined(__linux__) - static constexpr int BUF_SIZE = 90; - char file_name_buf[BUF_SIZE]; - char buf[BUF_SIZE]; - snprintf(file_name_buf, BUF_SIZE, "/proc/%d/stat", t); - int stat_fd = open(file_name_buf, O_RDONLY | O_CLOEXEC); - if (stat_fd < 0) { - return std::string("failed to get thread state: ") + std::string(strerror(errno)); - } - CHECK(stat_fd >= 0) << strerror(errno); - ssize_t bytes_read = TEMP_FAILURE_RETRY(read(stat_fd, buf, BUF_SIZE)); - CHECK(bytes_read >= 0) << strerror(errno); - int ret = close(stat_fd); - DCHECK(ret == 0) << strerror(errno); - buf[BUF_SIZE - 1] = '\0'; - return buf; -#else - return "unknown state"; -#endif -} - std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barrier, pid_t t, int attempt_of_4) { @@ -754,7 +731,7 @@ std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barr } // Long wait; gather information in case of timeout. - std::string sampled_state = collect_state ? GetThreadState(t) : ""; + std::string sampled_state = collect_state ? GetOsThreadStatQuick(t) : ""; while (i < kSuspendBarrierIters) { if (WaitOnceForSuspendBarrier(barrier, cur_val, timeout_ns)) { ++i; @@ -770,7 +747,7 @@ std::optional<std::string> ThreadList::WaitForSuspendBarrier(AtomicInteger* barr return std::nullopt; } } - return collect_state ? "Target states: [" + sampled_state + ", " + GetThreadState(t) + "]" + + return collect_state ? "Target states: [" + sampled_state + ", " + GetOsThreadStatQuick(t) + "]" + std::to_string(cur_val) + "@" + std::to_string((uintptr_t)barrier) + " Final wait time: " + PrettyDuration(NanoTime() - start_time) : ""; |