-rw-r--r-- | compiler/optimizing/intrinsics_arm64.cc | 3
-rw-r--r-- | compiler/optimizing/intrinsics_arm_vixl.cc | 4
-rw-r--r-- | compiler/optimizing/intrinsics_x86.cc | 5
-rw-r--r-- | compiler/optimizing/intrinsics_x86_64.cc | 5
-rw-r--r-- | runtime/gc/allocation_record.cc | 1
-rw-r--r-- | runtime/gc/collector/concurrent_copying.cc | 44
-rw-r--r-- | runtime/gc/collector/mark_sweep.cc | 10
-rw-r--r-- | runtime/gc/collector/semi_space.cc | 5
-rw-r--r-- | runtime/gc/reference_processor.cc | 263
-rw-r--r-- | runtime/gc/reference_processor.h | 43
-rw-r--r-- | runtime/gc/reference_queue.cc | 49
-rw-r--r-- | runtime/gc/reference_queue.h | 22
-rw-r--r-- | runtime/intern_table.cc | 1
-rw-r--r-- | runtime/intern_table.h | 2
-rw-r--r-- | runtime/interpreter/interpreter_common.h | 1
-rw-r--r-- | runtime/jit/jit_code_cache.cc | 1
-rw-r--r-- | runtime/jni/java_vm_ext.cc | 4
-rw-r--r-- | runtime/thread-inl.h | 19
-rw-r--r-- | runtime/thread.h | 44
19 files changed, 330 insertions, 196 deletions
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 77b55e455e..0236f0d5a9 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -3475,7 +3475,8 @@ void IntrinsicCodeGeneratorARM64::VisitReferenceGetReferent(HInvoke* invoke) { Register temp = temps.AcquireW(); __ Ldr(temp, MemOperand(tr, Thread::WeakRefAccessEnabledOffset<kArm64PointerSize>().Uint32Value())); - __ Cbz(temp, slow_path->GetEntryLabel()); + static_assert(enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled) == 0); + __ Cbnz(temp, slow_path->GetEntryLabel()); } { diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index a4a3457c37..303ac171a7 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -2517,8 +2517,8 @@ void IntrinsicCodeGeneratorARMVIXL::VisitReferenceGetReferent(HInvoke* invoke) { vixl32::Register temp = temps.Acquire(); __ Ldr(temp, MemOperand(tr, Thread::WeakRefAccessEnabledOffset<kArmPointerSize>().Uint32Value())); - __ Cmp(temp, 0); - __ B(eq, slow_path->GetEntryLabel()); + __ Cmp(temp, enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled)); + __ B(ne, slow_path->GetEntryLabel()); } { diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc index 7c2537495a..3a3886432a 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -3346,8 +3346,9 @@ void IntrinsicCodeGeneratorX86::VisitReferenceGetReferent(HInvoke* invoke) { if (kEmitCompilerReadBarrier) { // Check self->GetWeakRefAccessEnabled(). ThreadOffset32 offset = Thread::WeakRefAccessEnabledOffset<kX86PointerSize>(); - __ fs()->cmpl(Address::Absolute(offset), Immediate(0)); - __ j(kEqual, slow_path->GetEntryLabel()); + __ fs()->cmpl(Address::Absolute(offset), + Immediate(enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled))); + __ j(kNotEqual, slow_path->GetEntryLabel()); } // Load the java.lang.ref.Reference class, use the output register as a temporary. diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index d5a7cb10e1..e3be98732b 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -3098,8 +3098,9 @@ void IntrinsicCodeGeneratorX86_64::VisitReferenceGetReferent(HInvoke* invoke) { if (kEmitCompilerReadBarrier) { // Check self->GetWeakRefAccessEnabled(). ThreadOffset64 offset = Thread::WeakRefAccessEnabledOffset<kX86_64PointerSize>(); - __ gs()->cmpl(Address::Absolute(offset, /* no_rip= */ true), Immediate(0)); - __ j(kEqual, slow_path->GetEntryLabel()); + __ gs()->cmpl(Address::Absolute(offset, /* no_rip= */ true), + Immediate(enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled))); + __ j(kNotEqual, slow_path->GetEntryLabel()); } // Load the java.lang.ref.Reference class, use the output register as a temporary. diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc index 60fb71d952..ffd4d2b582 100644 --- a/runtime/gc/allocation_record.cc +++ b/runtime/gc/allocation_record.cc @@ -23,6 +23,7 @@ #include "obj_ptr-inl.h" #include "object_callbacks.h" #include "stack.h" +#include "thread-inl.h" // For GetWeakRefAccessEnabled(). 
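All four intrinsics changes above compile the same new fast-path test: the generated Reference.get() may take the inline path only when the per-thread flag equals WeakRefAccessState::kVisiblyEnabled (which is why the static_assert pins that value to 0 for the arm64 Cbnz); any other value, including the not-yet-confirmed kEnabled, now branches to the slow path. A minimal standalone sketch of that logic, using stand-in types rather than ART's:

    #include <atomic>
    #include <cstdint>

    enum class WeakRefAccessState : int32_t { kVisiblyEnabled = 0, kEnabled, kDisabled };

    // Stand-ins for the per-thread flag and the two paths; names are illustrative only.
    struct FakeThread {
      std::atomic<WeakRefAccessState> weak_ref_access_enabled{WeakRefAccessState::kVisiblyEnabled};
    };
    struct FakeReference { void* referent = nullptr; };

    void* GetReferentSlowPath(FakeThread&, FakeReference& ref) { return ref.referent; }  // placeholder
    void* LoadReferentWithReadBarrier(FakeReference& ref) { return ref.referent; }       // placeholder

    void* GetReferentIntrinsic(FakeThread& self, FakeReference& ref) {
      // Mirrors Cbnz / B(ne) / j(kNotEqual) above: anything other than kVisiblyEnabled,
      // including kEnabled that this thread has not yet observed with an acquire load,
      // diverts to the slow path.
      if (self.weak_ref_access_enabled.load(std::memory_order_relaxed) !=
          WeakRefAccessState::kVisiblyEnabled) {
        return GetReferentSlowPath(self, ref);
      }
      return LoadReferentWithReadBarrier(ref);
    }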
#include <android-base/properties.h> diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc index 3a60191310..34a4089702 100644 --- a/runtime/gc/collector/concurrent_copying.cc +++ b/runtime/gc/collector/concurrent_copying.cc @@ -1621,18 +1621,28 @@ void ConcurrentCopying::CopyingPhase() { { TimingLogger::ScopedTiming split7("Process mark stacks and References", GetTimings()); + + // Process the mark stack once in the thread local stack mode. This marks most of the live + // objects, aside from weak ref accesses with read barriers (Reference::GetReferent() and + // system weaks) that may happen concurrently while we are processing the mark stack and newly + // mark/gray objects and push refs on the mark stack. + ProcessMarkStack(); + + ReferenceProcessor* rp = GetHeap()->GetReferenceProcessor(); + bool clear_soft_references = GetCurrentIteration()->GetClearSoftReferences(); + rp->Setup(self, this, /*concurrent=*/ true, clear_soft_references); + if (!clear_soft_references) { + // Forward as many SoftReferences as possible before inhibiting reference access. + rp->ForwardSoftReferences(GetTimings()); + } + // We transition through three mark stack modes (thread-local, shared, GC-exclusive). The - // primary reasons are the fact that we need to use a checkpoint to process thread-local mark + // primary reasons are that we need to use a checkpoint to process thread-local mark // stacks, but after we disable weak refs accesses, we can't use a checkpoint due to a deadlock // issue because running threads potentially blocking at WaitHoldingLocks, and that once we // reach the point where we process weak references, we can avoid using a lock when accessing // the GC mark stack, which makes mark stack processing more efficient. - // Process the mark stack once in the thread local stack mode. This marks most of the live - // objects, aside from weak ref accesses with read barriers (Reference::GetReferent() and system - // weaks) that may happen concurrently while we processing the mark stack and newly mark/gray - // objects and push refs on the mark stack. - ProcessMarkStack(); // Switch to the shared mark stack mode. That is, revoke and process thread-local mark stacks // for the last time before transitioning to the shared mark stack mode, which would process new // refs that may have been concurrently pushed onto the mark stack during the ProcessMarkStack() @@ -1649,6 +1659,7 @@ void ConcurrentCopying::CopyingPhase() { // forever as the checkpoint never finishes (See runtime/mutator_gc_coord.md). SwitchToSharedMarkStackMode(); CHECK(!self->GetWeakRefAccessEnabled()); + // Now that weak refs accesses are disabled, once we exhaust the shared mark stack again here // (which may be non-empty if there were refs found on thread-local mark stacks during the above // SwitchToSharedMarkStackMode() call), we won't have new refs to process, that is, mutators @@ -1656,6 +1667,7 @@ void ConcurrentCopying::CopyingPhase() { // before we process weak refs below. ProcessMarkStack(); CheckEmptyMarkStack(); + // Switch to the GC exclusive mark stack mode so that we can process the mark stack without a // lock from this point on. SwitchToGcExclusiveMarkStackMode(); @@ -1663,24 +1675,23 @@ void ConcurrentCopying::CopyingPhase() { if (kVerboseMode) { LOG(INFO) << "ProcessReferences"; } - // Process weak references. This may produce new refs to process and have them processed via - // ProcessMarkStack (in the GC exclusive mark stack mode). + // Process weak references. 
This also marks through finalizers. Although + // reference processing is "disabled", some accesses will proceed once we've ensured that + // objects directly reachable by the mutator are marked, i.e. before we mark through + // finalizers. ProcessReferences(self); CheckEmptyMarkStack(); + // JNI WeakGlobalRefs and most other system weaks cannot be processed until we're done marking + // through finalizers, since such references to finalizer-reachable objects must be preserved. if (kVerboseMode) { LOG(INFO) << "SweepSystemWeaks"; } SweepSystemWeaks(self); + CheckEmptyMarkStack(); + ReenableWeakRefAccess(self); if (kVerboseMode) { LOG(INFO) << "SweepSystemWeaks done"; } - // Process the mark stack here one last time because the above SweepSystemWeaks() call may have - // marked some objects (strings alive) as hash_set::Erase() can call the hash function for - // arbitrary elements in the weak intern table in InternTable::Table::SweepWeaks(). - ProcessMarkStack(); - CheckEmptyMarkStack(); - // Re-enable weak ref accesses. - ReenableWeakRefAccess(self); // Free data for class loaders that we unloaded. Runtime::Current()->GetClassLinker()->CleanupClassLoaders(); // Marking is done. Disable marking. @@ -3830,8 +3841,7 @@ void ConcurrentCopying::DelayReferenceReferent(ObjPtr<mirror::Class> klass, void ConcurrentCopying::ProcessReferences(Thread* self) { // We don't really need to lock the heap bitmap lock as we use CAS to mark in bitmaps. WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); - GetHeap()->GetReferenceProcessor()->ProcessReferences( - /*concurrent=*/ true, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), this); + GetHeap()->GetReferenceProcessor()->ProcessReferences(self, GetTimings()); } void ConcurrentCopying::RevokeAllThreadLocalBuffers() { diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc index e3cd1eefe9..bd5ce37b2c 100644 --- a/runtime/gc/collector/mark_sweep.cc +++ b/runtime/gc/collector/mark_sweep.cc @@ -177,11 +177,7 @@ void MarkSweep::RunPhases() { void MarkSweep::ProcessReferences(Thread* self) { WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); - GetHeap()->GetReferenceProcessor()->ProcessReferences( - true, - GetTimings(), - GetCurrentIteration()->GetClearSoftReferences(), - this); + GetHeap()->GetReferenceProcessor()->ProcessReferences(self, GetTimings()); } void MarkSweep::PausePhase() { @@ -213,7 +209,9 @@ void MarkSweep::PausePhase() { Runtime::Current()->DisallowNewSystemWeaks(); // Enable the reference processing slow path, needs to be done with mutators paused since there // is no lock in the GetReferent fast path. 
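Taken together, the CopyingPhase() hunks above reorder the reference-related steps of the phase. A condensed, compilable outline of the new sequence, using no-op stand-ins (the real phase does far more work at each step):

    #include <cstdio>

    namespace sketch {
    // Every function here is a stand-in; only the ordering is meaningful.
    void ProcessMarkStack()        { std::puts("drain mark stacks (thread-local mode)"); }
    void SetupReferenceProcessor() { std::puts("rp->Setup(self, this, /*concurrent=*/true, clear_soft)"); }
    void ForwardSoftReferences()   { std::puts("mark through SoftReferences while weak access is still enabled"); }
    void DisableWeakRefAccess()    { std::puts("SwitchToSharedMarkStackMode(): checkpoint disables weak ref access"); }
    void DrainSharedMarkStack()    { std::puts("ProcessMarkStack(), then SwitchToGcExclusiveMarkStackMode()"); }
    void ProcessReferences()       { std::puts("enqueue java.lang.ref.References, mark through finalizers"); }
    void SweepSystemWeaks()        { std::puts("sweep JNI weak globals etc. after finalizer marking"); }
    void ReenableWeakRefAccess()   { std::puts("set per-thread flags back to kEnabled"); }

    void CopyingPhaseReferenceSteps(bool clear_soft_references) {
      ProcessMarkStack();              // 1. most live objects get marked here
      SetupReferenceProcessor();       // 2. new: Setup() runs before weak ref access is disabled
      if (!clear_soft_references) {
        ForwardSoftReferences();       // 3. new: forward SoftReferences early, access still enabled
      }
      DisableWeakRefAccess();          // 4.
      DrainSharedMarkStack();          // 5.
      ProcessReferences();             // 6. advances rp_state_ (see reference_processor.cc below)
      SweepSystemWeaks();              // 7. deferred until finalizer-reachable objects are marked
      ReenableWeakRefAccess();         // 8. the extra ProcessMarkStack() after sweeping is gone
    }
    }  // namespace sketch

    int main() { sketch::CopyingPhaseReferenceSteps(/*clear_soft_references=*/false); }

The mark_sweep.cc and semi_space.cc changes follow the same protocol: Setup() first, then either EnableSlowPath() for the concurrent mark-sweep pause or an immediate ProcessReferences() for semi-space.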
- GetHeap()->GetReferenceProcessor()->EnableSlowPath(); + ReferenceProcessor* rp = GetHeap()->GetReferenceProcessor(); + rp->Setup(self, this, /*concurrent=*/true, GetCurrentIteration()->GetClearSoftReferences()); + rp->EnableSlowPath(); } void MarkSweep::PreCleanCards() { diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc index 0848f34d0b..53b060483f 100644 --- a/runtime/gc/collector/semi_space.cc +++ b/runtime/gc/collector/semi_space.cc @@ -145,8 +145,9 @@ void SemiSpace::InitializePhase() { void SemiSpace::ProcessReferences(Thread* self) { WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); - GetHeap()->GetReferenceProcessor()->ProcessReferences( - false, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), this); + ReferenceProcessor* rp = GetHeap()->GetReferenceProcessor(); + rp->Setup(self, this, /*concurrent=*/false, GetCurrentIteration()->GetClearSoftReferences()); + rp->ProcessReferences(self, GetTimings()); } void SemiSpace::MarkingPhase() { diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc index e34d140db4..df8b3fed64 100644 --- a/runtime/gc/reference_processor.cc +++ b/runtime/gc/reference_processor.cc @@ -32,6 +32,7 @@ #include "reflection.h" #include "scoped_thread_state_change-inl.h" #include "task_processor.h" +#include "thread-inl.h" #include "thread_pool.h" #include "well_known_classes.h" @@ -42,7 +43,6 @@ static constexpr bool kAsyncReferenceQueueAdd = false; ReferenceProcessor::ReferenceProcessor() : collector_(nullptr), - preserving_references_(false), condition_("reference processor condition", *Locks::reference_processor_lock_) , soft_reference_queue_(Locks::reference_queue_soft_references_lock_), weak_reference_queue_(Locks::reference_queue_weak_references_lock_), @@ -89,17 +89,20 @@ void ReferenceProcessor::BroadcastForSlowPath(Thread* self) { ObjPtr<mirror::Object> ReferenceProcessor::GetReferent(Thread* self, ObjPtr<mirror::Reference> reference) { - if (!kUseReadBarrier || self->GetWeakRefAccessEnabled()) { - // Under read barrier / concurrent copying collector, it's not safe to call GetReferent() when - // weak ref access is disabled as the call includes a read barrier which may push a ref onto the - // mark stack and interfere with termination of marking. - const ObjPtr<mirror::Object> referent = reference->GetReferent(); - // If the referent is null then it is already cleared, we can just return null since there is no - // scenario where it becomes non-null during the reference processing phase. - if (UNLIKELY(!SlowPathEnabled()) || referent == nullptr) { - return referent; - } + auto slow_path_required = [this, self]() { + return kUseReadBarrier ? !self->GetWeakRefAccessEnabled() : SlowPathEnabled(); + }; + if (!slow_path_required()) { + return reference->GetReferent(); + } + // If the referent is null then it is already cleared, we can just return null since there is no + // scenario where it becomes non-null during the reference processing phase. + // A read barrier may be unsafe here, and we use the result only when it's null or marked. 
+ ObjPtr<mirror::Object> referent = reference->template GetReferent<kWithoutReadBarrier>(); + if (referent.IsNull()) { + return referent; } + bool started_trace = false; uint64_t start_millis; auto finish_trace = [](uint64_t start_millis) { @@ -112,50 +115,47 @@ ObjPtr<mirror::Object> ReferenceProcessor::GetReferent(Thread* self, }; MutexLock mu(self, *Locks::reference_processor_lock_); - while ((!kUseReadBarrier && SlowPathEnabled()) || - (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) { - ObjPtr<mirror::Object> referent = reference->GetReferent<kWithoutReadBarrier>(); - // If the referent became cleared, return it. Don't need barrier since thread roots can't get - // updated until after we leave the function due to holding the mutator lock. - if (referent == nullptr) { - if (started_trace) { - finish_trace(start_millis); + // Keeping reference_processor_lock_ blocks the broadcast when we try to reenable the fast path. + while (slow_path_required()) { + DCHECK(collector_ != nullptr); + constexpr bool kOtherReadBarrier = kUseReadBarrier && !kUseBakerReadBarrier; + if (UNLIKELY(reference->IsFinalizerReferenceInstance() + || rp_state_ == RpState::kStarting /* too early to determine mark state */ + || (kOtherReadBarrier && reference->IsPhantomReferenceInstance()))) { + // Odd cases in which it doesn't hurt to just wait, or the wait is likely to be very brief. + + // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the + // presence of threads blocking for weak ref access. + self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_); + if (!started_trace) { + ATraceBegin("GetReferent blocked"); + started_trace = true; + start_millis = MilliTime(); } - return nullptr; + condition_.WaitHoldingLocks(self); + continue; } - // Try to see if the referent is already marked by using the is_marked_callback. We can return - // it to the mutator as long as the GC is not preserving references. - if (LIKELY(collector_ != nullptr)) { - // If it's null it means not marked, but it could become marked if the referent is reachable - // by finalizer referents. So we cannot return in this case and must block. Otherwise, we - // can return it to the mutator as long as the GC is not preserving references, in which - // case only black nodes can be safely returned. If the GC is preserving references, the - // mutator could take a white field from a grey or white node and move it somewhere else - // in the heap causing corruption since this field would get swept. - // Use the cached referent instead of calling GetReferent since other threads could call - // Reference.clear() after we did the null check resulting in a null pointer being - // incorrectly passed to IsMarked. b/33569625 - ObjPtr<mirror::Object> forwarded_ref = collector_->IsMarked(referent.Ptr()); - if (forwarded_ref != nullptr) { - // Non null means that it is marked. - if (!preserving_references_ || - (LIKELY(!reference->IsFinalizerReferenceInstance()) && reference->IsUnprocessed())) { - if (started_trace) { - finish_trace(start_millis); - } - return forwarded_ref; - } - } + DCHECK(!reference->IsPhantomReferenceInstance()); + + if (rp_state_ == RpState::kInitClearingDone) { + // Reachable references have their final referent values. + break; } - // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the - // presence of threads blocking for weak ref access. 
- self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_); - if (!started_trace) { - ATraceBegin("GetReferent blocked"); - started_trace = true; - start_millis = MilliTime(); + // Although reference processing is not done, we can always predict the correct return value + // based on the current mark state. No additional marking from finalizers has been done, since + // we hold reference_processor_lock_, which is required to advance to kInitClearingDone. + DCHECK(rp_state_ == RpState::kInitMarkingDone); + // Re-load and re-check referent, since the current one may have been read before we acquired + // reference_lock. In particular a Reference.clear() call may have intervened. (b/33569625) + referent = reference->GetReferent<kWithoutReadBarrier>(); + ObjPtr<mirror::Object> forwarded_ref = + referent.IsNull() ? nullptr : collector_->IsMarked(referent.Ptr()); + // Either the referent was marked, and forwarded_ref is the correct return value, or it + // was not, and forwarded_ref == null, which is again the correct return value. + if (started_trace) { + finish_trace(start_millis); } - condition_.WaitHoldingLocks(self); + return forwarded_ref; } if (started_trace) { finish_trace(start_millis); @@ -163,36 +163,64 @@ ObjPtr<mirror::Object> ReferenceProcessor::GetReferent(Thread* self, return reference->GetReferent(); } -void ReferenceProcessor::StartPreservingReferences(Thread* self) { - MutexLock mu(self, *Locks::reference_processor_lock_); - preserving_references_ = true; +// Forward SoftReferences. Can be done before we disable Reference access. Only +// invoked if we are not clearing SoftReferences. +uint32_t ReferenceProcessor::ForwardSoftReferences(TimingLogger* timings) { + TimingLogger::ScopedTiming split( + concurrent_ ? "ForwardSoftReferences" : "(Paused)ForwardSoftReferences", timings); + // We used to argue that we should be smarter about doing this conditionally, but it's unclear + // that's actually better than the more predictable strategy of basically only clearing + // SoftReferences just before we would otherwise run out of memory. + uint32_t non_null_refs = soft_reference_queue_.ForwardSoftReferences(collector_); + if (ATraceEnabled()) { + static constexpr size_t kBufSize = 80; + char buf[kBufSize]; + snprintf(buf, kBufSize, "Marking for %" PRIu32 " SoftReferences", non_null_refs); + ATraceBegin(buf); + collector_->ProcessMarkStack(); + ATraceEnd(); + } else { + collector_->ProcessMarkStack(); + } + return non_null_refs; } -void ReferenceProcessor::StopPreservingReferences(Thread* self) { +void ReferenceProcessor::Setup(Thread* self, + collector::GarbageCollector* collector, + bool concurrent, + bool clear_soft_references) { + DCHECK(collector != nullptr); MutexLock mu(self, *Locks::reference_processor_lock_); - preserving_references_ = false; - // We are done preserving references, some people who are blocked may see a marked referent. - condition_.Broadcast(self); + collector_ = collector; + rp_state_ = RpState::kStarting; + concurrent_ = concurrent; + clear_soft_references_ = clear_soft_references; } // Process reference class instances and schedule finalizations. -void ReferenceProcessor::ProcessReferences(bool concurrent, - TimingLogger* timings, - bool clear_soft_references, - collector::GarbageCollector* collector) { - TimingLogger::ScopedTiming t(concurrent ? __FUNCTION__ : "(Paused)ProcessReferences", timings); - Thread* self = Thread::Current(); +// We advance rp_state_ to signal partial completion for the benefit of GetReferent. 
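The rewritten GetReferent() above replaces the preserving_references_ flag with a three-state protocol: instead of blocking whenever references are being preserved, a blocked caller can usually predict the final answer from the current mark state. A simplified standalone rendering of the per-iteration decision (hypothetical names; tracing, locking, and the actual IsMarked() call are omitted):

    #include <cstdint>

    enum class RpState : uint8_t { kStarting, kInitMarkingDone, kInitClearingDone };

    enum class Action : uint8_t {
      kWait,                 // block on condition_ until reference processing advances
      kUseFinalReferent,     // the referent already has its final value; read it normally
      kAnswerFromMarkState,  // reload the referent and return collector->IsMarked(referent)
    };

    Action DecideWhileSlowPathRequired(RpState state,
                                       bool is_finalizer_reference,
                                       bool is_phantom_reference,
                                       bool non_baker_read_barrier) {
      // Odd cases in which waiting is harmless or expected to be brief.
      if (is_finalizer_reference ||
          state == RpState::kStarting ||                     // too early to trust the mark state
          (non_baker_read_barrier && is_phantom_reference)) {
        return Action::kWait;
      }
      if (state == RpState::kInitClearingDone) {
        // Mutator-reachable references have been cleared or preserved already.
        return Action::kUseFinalReferent;
      }
      // kInitMarkingDone: no marking from finalizers has happened yet, because advancing to
      // kInitClearingDone requires reference_processor_lock_, which the caller holds. The
      // current mark state therefore already predicts the final answer.
      return Action::kAnswerFromMarkState;
    }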
+void ReferenceProcessor::ProcessReferences(Thread* self, TimingLogger* timings) { + TimingLogger::ScopedTiming t(concurrent_ ? __FUNCTION__ : "(Paused)ProcessReferences", timings); + if (!clear_soft_references_) { + // Forward any additional SoftReferences we discovered late, now that reference access has been + // inhibited. + while (!soft_reference_queue_.IsEmpty()) { + ForwardSoftReferences(timings); + } + } { MutexLock mu(self, *Locks::reference_processor_lock_); - collector_ = collector; if (!kUseReadBarrier) { - CHECK_EQ(SlowPathEnabled(), concurrent) << "Slow path must be enabled iff concurrent"; + CHECK_EQ(SlowPathEnabled(), concurrent_) << "Slow path must be enabled iff concurrent"; } else { - // Weak ref access is enabled at Zygote compaction by SemiSpace (concurrent == false). - CHECK_EQ(!self->GetWeakRefAccessEnabled(), concurrent); + // Weak ref access is enabled at Zygote compaction by SemiSpace (concurrent_ == false). + CHECK_EQ(!self->GetWeakRefAccessEnabled(), concurrent_); } + DCHECK(rp_state_ == RpState::kStarting); + rp_state_ = RpState::kInitMarkingDone; + condition_.Broadcast(self); } - if (kIsDebugBuild && collector->IsTransactionActive()) { + if (kIsDebugBuild && collector_->IsTransactionActive()) { // In transaction mode, we shouldn't enqueue any Reference to the queues. // See DelayReferenceReferent(). DCHECK(soft_reference_queue_.IsEmpty()); @@ -200,75 +228,84 @@ void ReferenceProcessor::ProcessReferences(bool concurrent, DCHECK(finalizer_reference_queue_.IsEmpty()); DCHECK(phantom_reference_queue_.IsEmpty()); } - // Unless required to clear soft references with white references, preserve some white referents. - if (!clear_soft_references) { - TimingLogger::ScopedTiming split(concurrent ? "ForwardSoftReferences" : - "(Paused)ForwardSoftReferences", timings); - if (concurrent) { - StartPreservingReferences(self); - } - // TODO: Add smarter logic for preserving soft references. The behavior should be a conditional - // mark if the SoftReference is supposed to be preserved. - uint32_t non_null_refs = soft_reference_queue_.ForwardSoftReferences(collector); - if (ATraceEnabled()) { - static constexpr size_t kBufSize = 80; - char buf[kBufSize]; - snprintf(buf, kBufSize, "Marking for %" PRIu32 " SoftReferences", non_null_refs); - ATraceBegin(buf); - collector->ProcessMarkStack(); - ATraceEnd(); - } else { - collector->ProcessMarkStack(); - } - if (concurrent) { - StopPreservingReferences(self); - } - } // Clear all remaining soft and weak references with white referents. - soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector); - weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector); + // This misses references only reachable through finalizers. + soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_); + weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_); + // Defer PhantomReference processing until we've finished marking through finalizers. { - TimingLogger::ScopedTiming t2(concurrent ? "EnqueueFinalizerReferences" : - "(Paused)EnqueueFinalizerReferences", timings); - if (concurrent) { - StartPreservingReferences(self); - } + // TODO: Capture mark state of some system weaks here. If the referent was marked here, + // then it is now safe to return, since it can only refer to marked objects. If it becomes + // marked below, that is no longer guaranteed. 
+ MutexLock mu(self, *Locks::reference_processor_lock_); + rp_state_ = RpState::kInitClearingDone; + // At this point, all mutator-accessible data is marked (black). Objects enqueued for + // finalization will only be made available to the mutator via CollectClearedReferences after + // we're fully done marking. Soft and WeakReferences accessible to the mutator have been + // processed and refer only to black objects. Thus there is no danger of the mutator getting + // access to non-black objects. Weak reference processing is still nominally suspended, + // But many kinds of references, including all java.lang.ref ones, are handled normally from + // here on. See GetReferent(). + } + { + TimingLogger::ScopedTiming t2( + concurrent_ ? "EnqueueFinalizerReferences" : "(Paused)EnqueueFinalizerReferences", timings); // Preserve all white objects with finalize methods and schedule them for finalization. FinalizerStats finalizer_stats = - finalizer_reference_queue_.EnqueueFinalizerReferences(&cleared_references_, collector); + finalizer_reference_queue_.EnqueueFinalizerReferences(&cleared_references_, collector_); if (ATraceEnabled()) { static constexpr size_t kBufSize = 80; char buf[kBufSize]; snprintf(buf, kBufSize, "Marking from %" PRIu32 " / %" PRIu32 " finalizers", finalizer_stats.num_enqueued_, finalizer_stats.num_refs_); ATraceBegin(buf); - collector->ProcessMarkStack(); + collector_->ProcessMarkStack(); ATraceEnd(); } else { - collector->ProcessMarkStack(); - } - if (concurrent) { - StopPreservingReferences(self); + collector_->ProcessMarkStack(); } } - // Clear all finalizer referent reachable soft and weak references with white referents. - soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector); - weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector); - // Clear all phantom references with white referents. - phantom_reference_queue_.ClearWhiteReferences(&cleared_references_, collector); + + // Process all soft and weak references with white referents, where the references are reachable + // only from finalizers. It is unclear that there is any way to do this without slightly + // violating some language spec. We choose to apply normal Reference processing rules for these. + // This exposes the following issues: + // 1) In the case of an unmarked referent, we may end up enqueuing an "unreachable" reference. + // This appears unavoidable, since we need to clear the reference for safety, unless we + // mark the referent and undo finalization decisions for objects we encounter during marking. + // (Some versions of the RI seem to do something along these lines.) + // Or we could clear the reference without enqueuing it, which also seems strange and + // unhelpful. + // 2) In the case of a marked referent, we will preserve a reference to objects that may have + // been enqueued for finalization. Again fixing this would seem to involve at least undoing + // previous finalization / reference clearing decisions. (This would also mean than an object + // containing both a strong and a WeakReference to the same referent could see the + // WeakReference cleared.) + // The treatment in (2) is potentially quite dangerous, since Reference.get() can e.g. return a + // finalized object containing pointers to native objects that have already been deallocated. + // But it can be argued that this is just an instance of the broader rule that it is not safe + // for finalizers to access otherwise inaccessible finalizable objects. 
+ soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_, + /*report_cleared=*/ true); + weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_, + /*report_cleared=*/ true); + + // Clear all phantom references with white referents. It's fine to do this just once here. + phantom_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_); + // At this point all reference queues other than the cleared references should be empty. DCHECK(soft_reference_queue_.IsEmpty()); DCHECK(weak_reference_queue_.IsEmpty()); DCHECK(finalizer_reference_queue_.IsEmpty()); DCHECK(phantom_reference_queue_.IsEmpty()); + { MutexLock mu(self, *Locks::reference_processor_lock_); // Need to always do this since the next GC may be concurrent. Doing this for only concurrent // could result in a stale is_marked_callback_ being called before the reference processing // starts since there is a small window of time where slow_path_enabled_ is enabled but the // callback isn't yet set. - collector_ = nullptr; - if (!kUseReadBarrier && concurrent) { + if (!kUseReadBarrier && concurrent_) { // Done processing, disable the slow path and broadcast to the waiters. DisableSlowPath(self); } diff --git a/runtime/gc/reference_processor.h b/runtime/gc/reference_processor.h index 51c50544bf..0f84211a87 100644 --- a/runtime/gc/reference_processor.h +++ b/runtime/gc/reference_processor.h @@ -46,19 +46,29 @@ class Heap; class ReferenceProcessor { public: ReferenceProcessor(); - void ProcessReferences(bool concurrent, - TimingLogger* timings, - bool clear_soft_references, - gc::collector::GarbageCollector* collector) + + // Initialize for a reference processing pass. Called before suspending weak + // access. + void Setup(Thread* self, + collector::GarbageCollector* collector, + bool concurrent, + bool clear_soft_references) + REQUIRES(!Locks::reference_processor_lock_); + // Enqueue all types of java.lang.ref.References, and mark through finalizers. + // Assumes there is no concurrent mutator-driven marking, i.e. all potentially + // mutator-accessible objects should be marked before this. + void ProcessReferences(Thread* self, TimingLogger* timings) REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_) REQUIRES(!Locks::reference_processor_lock_); + // The slow path bool is contained in the reference class object, can only be set once // Only allow setting this with mutators suspended so that we can avoid using a lock in the // GetReferent fast path as an optimization. void EnableSlowPath() REQUIRES_SHARED(Locks::mutator_lock_); void BroadcastForSlowPath(Thread* self); - // Decode the referent, may block if references are being processed. + // Decode the referent, may block if references are being processed. In the normal + // no-read-barrier or Baker-read-barrier cases, we assume reference is not a PhantomReference. ObjPtr<mirror::Object> GetReferent(Thread* self, ObjPtr<mirror::Reference> reference) REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::reference_processor_lock_); // Collects the cleared references and returns a task, to be executed after FinishGC, that will @@ -78,27 +88,30 @@ class ReferenceProcessor { void ClearReferent(ObjPtr<mirror::Reference> ref) REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::reference_processor_lock_); + uint32_t ForwardSoftReferences(TimingLogger* timings) + REQUIRES_SHARED(Locks::mutator_lock_); private: bool SlowPathEnabled() REQUIRES_SHARED(Locks::mutator_lock_); // Called by ProcessReferences. 
void DisableSlowPath(Thread* self) REQUIRES(Locks::reference_processor_lock_) REQUIRES_SHARED(Locks::mutator_lock_); - // If we are preserving references it means that some dead objects may become live, we use start - // and stop preserving to block mutators using GetReferrent from getting access to these - // referents. - void StartPreservingReferences(Thread* self) REQUIRES(!Locks::reference_processor_lock_); - void StopPreservingReferences(Thread* self) REQUIRES(!Locks::reference_processor_lock_); // Wait until reference processing is done. void WaitUntilDoneProcessingReferences(Thread* self) REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::reference_processor_lock_); // Collector which is clearing references, used by the GetReferent to return referents which are - // already marked. - collector::GarbageCollector* collector_ GUARDED_BY(Locks::reference_processor_lock_); - // Boolean for whether or not we are preserving references (either soft references or finalizers). - // If this is true, then we cannot return a referent (see comment in GetReferent). - bool preserving_references_ GUARDED_BY(Locks::reference_processor_lock_); + // already marked. Only updated by thread currently running GC. + // Guarded by reference_processor_lock_ when not read by collector. Only the collector changes + // it. + collector::GarbageCollector* collector_; + // Reference processor state. Only valid while weak reference processing is suspended. + // Used by GetReferent and friends to return early. + enum class RpState : uint8_t { kStarting, kInitMarkingDone, kInitClearingDone }; + RpState rp_state_ GUARDED_BY(Locks::reference_processor_lock_); + bool concurrent_; // Running concurrently with mutator? Only used by GC thread. + bool clear_soft_references_; // Only used by GC thread. + // Condition that people wait on if they attempt to get the referent of a reference while // processing is in progress. Broadcast when an empty checkpoint is requested, but not for other // checkpoints or thread suspensions. See mutator_gc_coord.md. diff --git a/runtime/gc/reference_queue.cc b/runtime/gc/reference_queue.cc index 568ca04c1d..6bdacaf18c 100644 --- a/runtime/gc/reference_queue.cc +++ b/runtime/gc/reference_queue.cc @@ -131,7 +131,8 @@ size_t ReferenceQueue::GetLength() const { } void ReferenceQueue::ClearWhiteReferences(ReferenceQueue* cleared_references, - collector::GarbageCollector* collector) { + collector::GarbageCollector* collector, + bool report_cleared) { while (!IsEmpty()) { ObjPtr<mirror::Reference> ref = DequeuePendingReference(); mirror::HeapReference<mirror::Object>* referent_addr = ref->GetReferentReferenceAddr(); @@ -145,6 +146,15 @@ void ReferenceQueue::ClearWhiteReferences(ReferenceQueue* cleared_references, ref->ClearReferent<false>(); } cleared_references->EnqueueReference(ref); + if (report_cleared) { + static bool already_reported = false; + if (!already_reported) { + // TODO: Maybe do this only if the queue is non-null? + LOG(WARNING) + << "Cleared Reference was only reachable from finalizer (only reported once)"; + already_reported = true; + } + } } // Delay disabling the read barrier until here so that the ClearReferent call above in // transaction mode will trigger the read barrier. 
@@ -182,22 +192,33 @@ FinalizerStats ReferenceQueue::EnqueueFinalizerReferences(ReferenceQueue* cleare } uint32_t ReferenceQueue::ForwardSoftReferences(MarkObjectVisitor* visitor) { - if (UNLIKELY(IsEmpty())) { - return 0; - } uint32_t num_refs(0); - const ObjPtr<mirror::Reference> head = list_; - ObjPtr<mirror::Reference> ref = head; + Thread* self = Thread::Current(); + static constexpr int SR_BUF_SIZE = 32; + ObjPtr<mirror::Reference> buf[SR_BUF_SIZE]; + int n_entries; + bool empty; do { - mirror::HeapReference<mirror::Object>* referent_addr = ref->GetReferentReferenceAddr(); - if (referent_addr->AsMirrorPtr() != nullptr) { - // do_atomic_update is false because mutators can't access the referent due to the weak ref - // access blocking. - visitor->MarkHeapReference(referent_addr, /*do_atomic_update=*/ false); - ++num_refs; + { + // Acquire lock only a few times and hold it as briefly as possible. + MutexLock mu(self, *lock_); + empty = IsEmpty(); + for (n_entries = 0; n_entries < SR_BUF_SIZE && !empty; ++n_entries) { + // Dequeuing the Reference here means it could possibly be enqueued again during this GC. + // That's unlikely and benign. + buf[n_entries] = DequeuePendingReference(); + empty = IsEmpty(); + } + } + for (int i = 0; i < n_entries; ++i) { + mirror::HeapReference<mirror::Object>* referent_addr = buf[i]->GetReferentReferenceAddr(); + if (referent_addr->AsMirrorPtr() != nullptr) { + visitor->MarkHeapReference(referent_addr, /*do_atomic_update=*/ true); + ++num_refs; + } + DisableReadBarrierForReference(buf[i]->AsReference()); } - ref = ref->GetPendingNext(); - } while (LIKELY(ref != head)); + } while (!empty); return num_refs; } diff --git a/runtime/gc/reference_queue.h b/runtime/gc/reference_queue.h index 06243c40a2..3fda7167d4 100644 --- a/runtime/gc/reference_queue.h +++ b/runtime/gc/reference_queue.h @@ -90,17 +90,20 @@ class ReferenceQueue { collector::GarbageCollector* collector) REQUIRES_SHARED(Locks::mutator_lock_); - // Walks the reference list marking any references subject to the reference clearing policy. - // References with a black referent are removed from the list. References with white referents - // biased toward saving are blackened and also removed from the list. - // Returns the number of non-null soft references. + // Walks the reference list marking and dequeuing any references subject to the reference + // clearing policy. References with a black referent are removed from the list. References + // with white referents biased toward saving are blackened and also removed from the list. + // Returns the number of non-null soft references. May be called concurrently with + // AtomicEnqueueIfNotEnqueued(). uint32_t ForwardSoftReferences(MarkObjectVisitor* visitor) + REQUIRES(!*lock_) REQUIRES_SHARED(Locks::mutator_lock_); // Unlink the reference list clearing references objects with white referents. Cleared references // registered to a reference queue are scheduled for appending by the heap worker thread. void ClearWhiteReferences(ReferenceQueue* cleared_references, - collector::GarbageCollector* collector) + collector::GarbageCollector* collector, + bool report_cleared = false) REQUIRES_SHARED(Locks::mutator_lock_); void Dump(std::ostream& os) const REQUIRES_SHARED(Locks::mutator_lock_); @@ -109,9 +112,12 @@ class ReferenceQueue { bool IsEmpty() const { return list_ == nullptr; } + + // Clear this queue. Only safe after handing off the contents elsewhere for further processing. 
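The ForwardSoftReferences() rewrite above stops walking the circular pending list in place and instead drains the queue in batches of up to 32 references, holding the queue lock only while refilling a local buffer and doing the marking outside it. The same batch-drain pattern in a generic, self-contained form (std::deque and std::mutex stand in for the intrusive Reference list and its lock):

    #include <cstddef>
    #include <deque>
    #include <mutex>
    #include <vector>

    // Generic sketch of the pattern; the real code dequeues mirror::Reference objects and calls
    // MarkObjectVisitor::MarkHeapReference(referent_addr, /*do_atomic_update=*/ true).
    template <typename T, typename Visit>
    size_t DrainInBatches(std::deque<T>& queue, std::mutex& lock, Visit&& visit) {
      constexpr size_t kBufSize = 32;  // mirrors SR_BUF_SIZE
      size_t processed = 0;
      bool empty;
      do {
        std::vector<T> buf;
        buf.reserve(kBufSize);
        {
          // Hold the lock only while moving up to kBufSize entries out of the queue.
          std::lock_guard<std::mutex> guard(lock);
          empty = queue.empty();
          while (!empty && buf.size() < kBufSize) {
            buf.push_back(queue.front());
            queue.pop_front();
            empty = queue.empty();
          }
        }
        for (T& item : buf) {
          visit(item);  // marking happens with the lock released
          ++processed;
        }
      } while (!empty);
      return processed;
    }

As the new comments note, dequeuing means a reference could in principle be re-enqueued during the same collection, which is unlikely and benign; MarkHeapReference now uses do_atomic_update=true, presumably because this pass can run while mutators still have weak ref access.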
void Clear() { list_ = nullptr; } + mirror::Reference* GetList() REQUIRES_SHARED(Locks::mutator_lock_) { return list_; } @@ -124,8 +130,10 @@ class ReferenceQueue { // Lock, used for parallel GC reference enqueuing. It allows for multiple threads simultaneously // calling AtomicEnqueueIfNotEnqueued. Mutex* const lock_; - // The actual reference list. Only a root for the mark compact GC since it will be null for other - // GC types. Not an ObjPtr since it is accessed from multiple threads. + // The actual reference list. Only a root for the mark compact GC since it + // will be null during root marking for other GC types. Not an ObjPtr since it + // is accessed from multiple threads. Points to a singly-linked circular list + // using the pendingNext field. mirror::Reference* list_; DISALLOW_IMPLICIT_CONSTRUCTORS(ReferenceQueue); diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc index 25c45cba6a..5d4c50727e 100644 --- a/runtime/intern_table.cc +++ b/runtime/intern_table.cc @@ -32,6 +32,7 @@ #include "object_callbacks.h" #include "scoped_thread_state_change-inl.h" #include "thread.h" +#include "thread-inl.h" namespace art { diff --git a/runtime/intern_table.h b/runtime/intern_table.h index db6016eba7..9d1720ffa5 100644 --- a/runtime/intern_table.h +++ b/runtime/intern_table.h @@ -116,7 +116,7 @@ class InternTable { REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_); // Only used by image writer. Special version that may not cause thread suspension since the GC - // cannot be running while we are doing image writing. Maybe be called while while holding a + // cannot be running while we are doing image writing. Maybe be called while holding a // lock since there will not be thread suspension. ObjPtr<mirror::String> InternStrongImageString(ObjPtr<mirror::String> s) REQUIRES_SHARED(Locks::mutator_lock_); diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h index 1809227f2c..e9b78f3259 100644 --- a/runtime/interpreter/interpreter_common.h +++ b/runtime/interpreter/interpreter_common.h @@ -59,6 +59,7 @@ #include "obj_ptr.h" #include "stack.h" #include "thread.h" +#include "thread-inl.h" #include "unstarted_runtime.h" #include "verifier/method_verifier.h" #include "well_known_classes.h" diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc index 34ddfc4233..ad44ac2094 100644 --- a/runtime/jit/jit_code_cache.cc +++ b/runtime/jit/jit_code_cache.cc @@ -57,6 +57,7 @@ #include "scoped_thread_state_change-inl.h" #include "stack.h" #include "thread-current-inl.h" +#include "thread-inl.h" #include "thread_list.h" namespace art { diff --git a/runtime/jni/java_vm_ext.cc b/runtime/jni/java_vm_ext.cc index 12e6d22252..94108c69de 100644 --- a/runtime/jni/java_vm_ext.cc +++ b/runtime/jni/java_vm_ext.cc @@ -864,6 +864,10 @@ ObjPtr<mirror::Object> JavaVMExt::DecodeWeakGlobalLocked(Thread* self, IndirectR if (kDebugLocking) { Locks::jni_weak_globals_lock_->AssertHeld(self); } + // TODO: Handle the already null case without waiting. + // TODO: Otherwise we should just wait for kInitMarkingDone, and track which weak globals were + // marked at that point. We would only need one mark bit per entry in the weak_globals_ table, + // and a quick pass over that early on during reference processing. 
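The TODO above outlines a possible refinement rather than anything this change implements: snapshot, at kInitMarkingDone, which weak globals already have null or marked referents, so DecodeWeakGlobal() could answer for those entries without waiting for reference processing to finish. A purely hypothetical illustration of that idea (none of these names exist in ART):

    #include <cstddef>
    #include <vector>

    struct WeakEntry { const void* referent = nullptr; };

    // Placeholder for "was this object marked by the time rp_state_ reached kInitMarkingDone?";
    // a real collector would consult its mark bitmaps / region state here.
    bool IsMarkedAtInitMarkingDone(const void* /*obj*/) { return false; }

    class WeakGlobalsMarkSnapshot {
     public:
      // One quick pass, early in reference processing, holding the weak globals lock:
      // one bit per entry in the weak_globals_ table.
      void Capture(const std::vector<WeakEntry>& table) {
        decided_.assign(table.size(), false);
        for (size_t i = 0; i < table.size(); ++i) {
          decided_[i] = table[i].referent == nullptr ||
                        IsMarkedAtInitMarkingDone(table[i].referent);
        }
      }
      // Entries that were already null, or whose referent was already marked, could be decoded
      // immediately; everything else would still wait as DecodeWeakGlobalLocked() does today.
      bool CanAnswerWithoutWaiting(size_t i) const { return decided_[i]; }

     private:
      std::vector<bool> decided_;
    };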
WaitForWeakGlobalsAccess(self); return weak_globals_.Get(ref); } diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h index 3c1e7a0a00..c0dcf8846b 100644 --- a/runtime/thread-inl.h +++ b/runtime/thread-inl.h @@ -372,6 +372,25 @@ inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) { return false; } +inline bool Thread::GetWeakRefAccessEnabled() const { + CHECK(kUseReadBarrier); + DCHECK(this == Thread::Current()); + WeakRefAccessState s = tls32_.weak_ref_access_enabled.load(std::memory_order_acquire); + if (s == WeakRefAccessState::kVisiblyEnabled) { + return true; + } else if (s == WeakRefAccessState::kDisabled) { + return false; + } + DCHECK(s == WeakRefAccessState::kEnabled) + << "state = " << static_cast<std::underlying_type_t<WeakRefAccessState>>(s); + // The state is only changed back to DISABLED during a checkpoint. Thus no other thread can + // change the value concurrently here. No other thread reads the value we store here, so there + // is no need for a release store. + tls32_.weak_ref_access_enabled.store(WeakRefAccessState::kVisiblyEnabled, + std::memory_order_relaxed); + return true; +} + inline void Thread::SetThreadLocalAllocationStack(StackReference<mirror::Object>* start, StackReference<mirror::Object>* end) { DCHECK(Thread::Current() == this) << "Should be called by self"; diff --git a/runtime/thread.h b/runtime/thread.h index 1085a563c4..7eb57e914c 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -171,6 +171,20 @@ enum class DeoptimizationMethodType { kDefault // dex pc may or may not advance depending on other conditions. }; +// For the CC colector, normal weak reference access can be disabled on a per-thread basis, while +// processing references. After finishing, the reference processor asynchronously sets the +// per-thread flags back to kEnabled with release memory ordering semantics. Each mutator thread +// should check its flag with acquire semantics before assuming that it is enabled. However, +// that is often too expensive, so the reading thread sets it to kVisiblyEnabled after seeing it +// kEnabled. The Reference.get() intrinsic can thus read it in relaxed mode, and reread (by +// resorting to the slow path) with acquire semantics if it sees a value of kEnabled rather than +// kVisiblyEnabled. +enum class WeakRefAccessState : int32_t { + kVisiblyEnabled = 0, // Enabled, and previously read with acquire load by this thread. + kEnabled, + kDisabled +}; + // This should match RosAlloc::kNumThreadLocalSizeBrackets. static constexpr size_t kNumRosAllocThreadLocalSizeBracketsInThread = 16; @@ -994,14 +1008,13 @@ class Thread { void SetIsGcMarkingAndUpdateEntrypoints(bool is_marking); - bool GetWeakRefAccessEnabled() const { - CHECK(kUseReadBarrier); - return tls32_.weak_ref_access_enabled; - } + bool GetWeakRefAccessEnabled() const; // Only safe for current thread. void SetWeakRefAccessEnabled(bool enabled) { CHECK(kUseReadBarrier); - tls32_.weak_ref_access_enabled = enabled; + WeakRefAccessState new_state = enabled ? 
+ WeakRefAccessState::kEnabled : WeakRefAccessState::kDisabled; + tls32_.weak_ref_access_enabled.store(new_state, std::memory_order_release); } uint32_t GetDisableThreadFlipCount() const { @@ -1672,7 +1685,7 @@ class Thread { thread_exit_check_count(0), is_transitioning_to_runnable(false), is_gc_marking(false), - weak_ref_access_enabled(true), + weak_ref_access_enabled(WeakRefAccessState::kVisiblyEnabled), disable_thread_flip_count(0), user_code_suspend_count(0), force_interpreter_count(0), @@ -1728,14 +1741,17 @@ class Thread { AtomicInteger park_state_; - // True if the thread is allowed to access a weak ref (Reference::GetReferent() and system - // weaks) and to potentially mark an object alive/gray. This is used for concurrent reference - // processing of the CC collector only. This is thread local so that we can enable/disable weak - // ref access by using a checkpoint and avoid a race around the time weak ref access gets - // disabled and concurrent reference processing begins (if weak ref access is disabled during a - // pause, this is not an issue.) Other collectors use Runtime::DisallowNewSystemWeaks() and - // ReferenceProcessor::EnableSlowPath(). - bool32_t weak_ref_access_enabled; + // Determines whether the thread is allowed to directly access a weak ref + // (Reference::GetReferent() and system weaks) and to potentially mark an object alive/gray. + // This is used for concurrent reference processing of the CC collector only. This is thread + // local so that we can enable/disable weak ref access by using a checkpoint and avoid a race + // around the time weak ref access gets disabled and concurrent reference processing begins + // (if weak ref access is disabled during a pause, this is not an issue.) Other collectors use + // Runtime::DisallowNewSystemWeaks() and ReferenceProcessor::EnableSlowPath(). Can be + // concurrently accessed by GetReferent() and set (by iterating over threads). + // Can be changed from kEnabled to kVisiblyEnabled by readers. No other concurrent access is + // possible when that happens. + mutable std::atomic<WeakRefAccessState> weak_ref_access_enabled; // A thread local version of Heap::disable_thread_flip_count_. This keeps track of how many // levels of (nested) JNI critical sections the thread is in and is used to detect a nested JNI |
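The WeakRefAccessState comment above (and the new Thread::GetWeakRefAccessEnabled() in thread-inl.h) describe the protocol this enum enables: the reference processor publishes kEnabled with a release store, the owning mutator confirms it once with an acquire load and then privately upgrades its flag to kVisiblyEnabled, after which relaxed reads (as in the compiled intrinsics at the top of this change) are sufficient. A standalone sketch of that pattern, with illustrative names:

    #include <atomic>
    #include <cstdint>

    enum class WeakRefAccessState : int32_t { kVisiblyEnabled = 0, kEnabled, kDisabled };

    // One flag per mutator thread; this is a sketch, not ART's Thread.
    struct PerThreadWeakRefFlag {
      mutable std::atomic<WeakRefAccessState> state{WeakRefAccessState::kVisiblyEnabled};

      // GC side: disabled via a checkpoint on the thread, re-enabled asynchronously later.
      void SetEnabled(bool enabled) {
        state.store(enabled ? WeakRefAccessState::kEnabled : WeakRefAccessState::kDisabled,
                    std::memory_order_release);
      }

      // Owning thread only. One acquire load pairs with the GC's release store; after that the
      // thread records kVisiblyEnabled, so later checks (and the intrinsic fast path) can use a
      // relaxed load compared against kVisiblyEnabled.
      bool WeakRefAccessEnabled() const {
        WeakRefAccessState s = state.load(std::memory_order_acquire);
        if (s == WeakRefAccessState::kVisiblyEnabled) return true;
        if (s == WeakRefAccessState::kDisabled) return false;
        // s == kEnabled: the enabling store has now been observed with acquire ordering. Only
        // this thread ever writes kVisiblyEnabled, so a relaxed store is sufficient.
        state.store(WeakRefAccessState::kVisiblyEnabled, std::memory_order_relaxed);
        return true;
      }
    };

Seeing kEnabled in the compiled fast path simply costs one trip through the slow path, which performs the acquire load before deciding.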