Handle suspend requests in getReferent()
When waiting in getReferent() or the like, use a TimedWait so that we
can periodically check for suspend requests, thus avoiding deadlocks
that can arise from blocking indefinitely in a runnable state. A
simplified sketch of the resulting wait pattern appears below.
This is not particularly clean, and may introduce short delays
when we would otherwise deadlock. It's also a bit risky because
we are now releasing the mutator lock in code that previously didn't.
This is a hopefully more correct replacement for aosp/1784003, which
overlooked some of the complications here.
This does not handle a similar problem in the JNI weak reference code.
Each additional context in which we release the mutator lock adds
further risk.
Bug: 195336624
Bug: 195664026
Test: Build and boot AOSP with much shorter timeouts.
Test: Confirm that the timeout code is invoked.
Change-Id: I0ffb2ffd105bed9dcb8664f92b17cfbcf756a6e0
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 6574ec0..a5fb40d 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -1105,10 +1105,14 @@
}
bool ConditionVariable::TimedWait(Thread* self, int64_t ms, int32_t ns) {
+ guard_.CheckSafeToWait(self);
+ return TimedWaitHoldingLocks(self, ms, ns);
+}
+
+bool ConditionVariable::TimedWaitHoldingLocks(Thread* self, int64_t ms, int32_t ns) {
DCHECK(self == nullptr || self == Thread::Current());
bool timed_out = false;
guard_.AssertExclusiveHeld(self);
- guard_.CheckSafeToWait(self);
unsigned int old_recursion_count = guard_.recursion_count_;
#if ART_USE_FUTEXES
timespec rel_ts;
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 8f2a8ea..d4fb778 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -465,11 +465,13 @@
// TODO: No thread safety analysis on Wait and TimedWait as they call mutex operations via their
// pointer copy, thereby defeating annotalysis.
void Wait(Thread* self) NO_THREAD_SAFETY_ANALYSIS;
+ // Returns true on timeout.
bool TimedWait(Thread* self, int64_t ms, int32_t ns) NO_THREAD_SAFETY_ANALYSIS;
// Variant of Wait that should be used with caution. Doesn't validate that no mutexes are held
// when waiting.
// TODO: remove this.
void WaitHoldingLocks(Thread* self) NO_THREAD_SAFETY_ANALYSIS;
+ bool TimedWaitHoldingLocks(Thread* self, int64_t ms, int32_t ns) NO_THREAD_SAFETY_ANALYSIS;
void CheckSafeToWait(Thread* self) NO_THREAD_SAFETY_ANALYSIS {
if (kDebugLocking) {
diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc
index e34d140..ea0ea4b 100644
--- a/runtime/gc/reference_processor.cc
+++ b/runtime/gc/reference_processor.cc
@@ -147,15 +147,12 @@
}
}
}
- // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
- // presence of threads blocking for weak ref access.
- self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_);
if (!started_trace) {
ATraceBegin("GetReferent blocked");
started_trace = true;
start_millis = MilliTime();
}
- condition_.WaitHoldingLocks(self);
+ WaitUntilDoneProcessingReferences(self);
}
if (started_trace) {
finish_trace(start_millis);
@@ -380,13 +377,34 @@
}
void ReferenceProcessor::WaitUntilDoneProcessingReferences(Thread* self) {
- // Wait until we are done processing reference.
+ // Wait until we are done processing references.
+ // TODO: We must hold reference_processor_lock_ to wait, and we cannot release and reacquire
+ // the mutator lock while we hold it. But we shouldn't remain runnable while we're asleep.
+ // Is there a way to do this more cleanly if we release the mutator lock in the condvar
+ // implementation? Without such a fix, we still need to be careful that we only very rarely
+ // need checkpoint or suspend requests to be serviced while we're waiting here; waiting for
+ // a timeout is better than a deadlock, but not cheap. See b/195664026 .
+ bool warned = false;
while ((!kUseReadBarrier && SlowPathEnabled()) ||
(kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
// presence of threads blocking for weak ref access.
self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_);
- condition_.WaitHoldingLocks(self);
+ if (condition_.TimedWaitHoldingLocks(self, /*ms=*/ 10, /*ns=*/ 0)) {
+ // Timed out.
+ // We should rarely get here. If we do, temporarily release reference_processor_lock_ and
+ // mutator lock, so we can respond to checkpoint and suspend requests.
+ Locks::reference_processor_lock_->ExclusiveUnlock(self);
+ {
+ ScopedThreadSuspension sts(self, ThreadState::kSuspended);
+ if (!warned) {
+ LOG(WARNING) << "Long wait for reference processor.";
+ warned = true;
+ }
+ usleep(100);
+ }
+ Locks::reference_processor_lock_->ExclusiveLock(self);
+ }
}
}
diff --git a/runtime/gc/reference_processor.h b/runtime/gc/reference_processor.h
index 54de5cc..8ea7bb1 100644
--- a/runtime/gc/reference_processor.h
+++ b/runtime/gc/reference_processor.h
@@ -58,7 +58,7 @@
// GetReferent fast path as an optimization.
void EnableSlowPath() REQUIRES_SHARED(Locks::mutator_lock_);
void BroadcastForSlowPath(Thread* self);
- // Decode the referent, may block if references are being processed.
+ // Decode the referent, may block and allow suspension if references are being processed.
ObjPtr<mirror::Object> GetReferent(Thread* self, ObjPtr<mirror::Reference> reference)
REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::reference_processor_lock_);
// Collects the cleared references and returns a task, to be executed after FinishGC, that will
@@ -89,7 +89,7 @@
// referents.
void StartPreservingReferences(Thread* self) REQUIRES(!Locks::reference_processor_lock_);
void StopPreservingReferences(Thread* self) REQUIRES(!Locks::reference_processor_lock_);
- // Wait until reference processing is done.
+ // Wait until reference processing is done. May temporarily release both required locks.
void WaitUntilDoneProcessingReferences(Thread* self)
REQUIRES_SHARED(Locks::mutator_lock_)
REQUIRES(Locks::reference_processor_lock_);