diff options
| author | 2021-04-24 11:27:06 +0800 | |
|---|---|---|
| committer | 2021-05-17 18:19:16 +0000 | |
| commit | 0d290721a8d63d5e0d38a64d3ecf922a91becb10 (patch) | |
| tree | 02daf3bb3ae703da0a9b99a307f8bbfb2c03c748 /runtime/base/mutex.cc | |
| parent | c63d9672264e894d8d409e8d582b4e086b26abca (diff) | |
Dump java backtrace when contending lock too long time.
It's easy to cause problems when time-consuming work is executed inside
a synchronized method/block. Previously, we used "dvm_lock_sample" in
the event log to help us investigate such cases.
But that information always arrives after the fact: "dvm_lock_sample" is
printed only after the owner thread has exited the critical section, so
we cannot tell which statement got stuck or was the most time-consuming.
This implementation provides much more information about what the
contending thread is doing: when a thread contends for a lock for a long
time, we print the backtrace of the thread holding the lock.
If many threads are waiting on the same lock, the backtrace may be
requested and printed multiple times, which is helpful for finding and
locating problems.
This mechanism is only suitable for very long wait times, not lots of
short wait times.
This feature is heavyweight, so it is not enabled by default. It can be
turned on with "-XX:MonitorTimeoutEnable" in AndroidRuntime::startVm.
Bug:140968481
Test: Manually check contention log messages.
Signed-off-by: wangguibo <wangguibo@xiaomi.com>
Change-Id: I29784d11355ae3c581723254848a87cd80e71456
Diffstat (limited to 'runtime/base/mutex.cc')
| -rw-r--r-- | runtime/base/mutex.cc | 82 |
1 files changed, 80 insertions, 2 deletions
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc index 0b8c781858..6574ec0db6 100644 --- a/runtime/base/mutex.cc +++ b/runtime/base/mutex.cc @@ -31,11 +31,18 @@ #include "mutex-inl.h" #include "scoped_thread_state_change-inl.h" #include "thread-inl.h" +#include "thread.h" +#include "thread_list.h" namespace art { using android::base::StringPrintf; +static constexpr uint64_t kIntervalMillis = 50; +static constexpr int kMonitorTimeoutTryMax = 5; + +static const char* kLastDumpStackTime = "LastDumpStackTime"; + struct AllMutexData { // A guard for all_mutexes_ that's not a mutex (Mutexes must CAS to acquire and busy wait). Atomic<const BaseMutex*> all_mutexes_guard; @@ -45,6 +52,13 @@ struct AllMutexData { }; static struct AllMutexData gAllMutexData[kAllMutexDataSize]; +struct DumpStackLastTimeTLSData : public art::TLSData { + explicit DumpStackLastTimeTLSData(uint64_t last_dump_time_ms) { + last_dump_time_ms_ = last_dump_time_ms; + } + uint64_t last_dump_time_ms_; +}; + #if ART_USE_FUTEXES static bool ComputeRelativeTimeSpec(timespec* result_ts, const timespec& lhs, const timespec& rhs) { const int32_t one_sec = 1000 * 1000 * 1000; // one second in nanoseconds. @@ -443,15 +457,28 @@ void Mutex::ExclusiveLock(Thread* self) { if (UNLIKELY(should_respond_to_empty_checkpoint_request_)) { self->CheckEmptyCheckpointFromMutex(); } + + uint64_t wait_start_ms = enable_monitor_timeout_ ? MilliTime() : 0; + uint64_t try_times = 0; do { + timespec timeout_ts; + timeout_ts.tv_sec = 0; + timeout_ts.tv_nsec = Runtime::Current()->GetMonitorTimeoutNs(); if (futex(state_and_contenders_.Address(), FUTEX_WAIT_PRIVATE, cur_state, - nullptr, nullptr, 0) != 0) { + enable_monitor_timeout_ ? &timeout_ts : nullptr , nullptr, 0) != 0) { // We only went to sleep after incrementing and contenders and checking that the // lock is still held by someone else. EAGAIN and EINTR both indicate a spurious // failure, try again from the beginning. 
We don't use TEMP_FAILURE_RETRY so we can // intentionally retry to acquire the lock. if ((errno != EAGAIN) && (errno != EINTR)) { - PLOG(FATAL) << "futex wait failed for " << name_; + if (errno == ETIMEDOUT) { + try_times++; + if (try_times <= kMonitorTimeoutTryMax) { + DumpStack(self, wait_start_ms, try_times); + } + } else { + PLOG(FATAL) << "futex wait failed for " << name_; + } } } SleepIfRuntimeDeleted(self); @@ -481,6 +508,57 @@ void Mutex::ExclusiveLock(Thread* self) { } } +void Mutex::DumpStack(Thread* self, uint64_t wait_start_ms, uint64_t try_times) { + ScopedObjectAccess soa(self); + Locks::thread_list_lock_->ExclusiveLock(self); + std::string owner_stack_dump; + pid_t owner_tid = GetExclusiveOwnerTid(); + Thread *owner = Runtime::Current()->GetThreadList()->FindThreadByTid(owner_tid); + if (owner != nullptr) { + if (IsDumpFrequent(owner, try_times)) { + Locks::thread_list_lock_->ExclusiveUnlock(self); + LOG(WARNING) << "Contention with tid " << owner_tid << ", monitor id " << monitor_id_; + return; + } + struct CollectStackTrace : public Closure { + void Run(art::Thread* thread) override + REQUIRES_SHARED(art::Locks::mutator_lock_) { + if (IsDumpFrequent(thread)) { + return; + } + thread->SetCustomTLS(kLastDumpStackTime, new DumpStackLastTimeTLSData(MilliTime())); + thread->DumpJavaStack(oss); + } + std::ostringstream oss; + }; + CollectStackTrace owner_trace; + owner->RequestSynchronousCheckpoint(&owner_trace); + owner_stack_dump = owner_trace.oss.str(); + uint64_t wait_ms = MilliTime() - wait_start_ms; + LOG(WARNING) << "Monitor contention with tid " << owner_tid << ", wait time: " << wait_ms + << "ms, monitor id: " << monitor_id_ + << "\nPerfMonitor owner thread(" << owner_tid << ") stack is:\n" + << owner_stack_dump; + } else { + Locks::thread_list_lock_->ExclusiveUnlock(self); + } +} + +bool Mutex::IsDumpFrequent(Thread* thread, uint64_t try_times) { + uint64_t last_dump_time_ms = 0; + DumpStackLastTimeTLSData* tls_data = + 
reinterpret_cast<DumpStackLastTimeTLSData*>(thread->GetCustomTLS(kLastDumpStackTime)); + if (tls_data != nullptr) { + last_dump_time_ms = tls_data->last_dump_time_ms_; + } + uint64_t interval = MilliTime() - last_dump_time_ms; + if (interval < kIntervalMillis * try_times) { + return true; + } else { + return false; + } +} + bool Mutex::ExclusiveTryLock(Thread* self) { DCHECK(self == nullptr || self == Thread::Current()); if (kDebugLocking && !recursive_) { |