Revert "Reduce pauses for weak reference access"

This reverts commit 0ab5b6d2afbdd71a18f8fb9b1fcf39e54cfd55a5.

Reason for revert: Breaks CMS builds

Change-Id: Ib3dfcc90ac5b7259c7f718a0373b48acc2ba10b2
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 0236f0d..77b55e4 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3475,8 +3475,7 @@
     Register temp = temps.AcquireW();
     __ Ldr(temp,
            MemOperand(tr, Thread::WeakRefAccessEnabledOffset<kArm64PointerSize>().Uint32Value()));
-    static_assert(enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled) == 0);
-    __ Cbnz(temp, slow_path->GetEntryLabel());
+    __ Cbz(temp, slow_path->GetEntryLabel());
   }
 
   {
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 303ac17..a4a3457 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -2517,8 +2517,8 @@
     vixl32::Register temp = temps.Acquire();
     __ Ldr(temp,
            MemOperand(tr, Thread::WeakRefAccessEnabledOffset<kArmPointerSize>().Uint32Value()));
-    __ Cmp(temp, enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled));
-    __ B(ne, slow_path->GetEntryLabel());
+    __ Cmp(temp, 0);
+    __ B(eq, slow_path->GetEntryLabel());
   }
 
   {
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 3a38864..7c25374 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3346,9 +3346,8 @@
   if (kEmitCompilerReadBarrier) {
     // Check self->GetWeakRefAccessEnabled().
     ThreadOffset32 offset = Thread::WeakRefAccessEnabledOffset<kX86PointerSize>();
-    __ fs()->cmpl(Address::Absolute(offset),
-                  Immediate(enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled)));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
+    __ fs()->cmpl(Address::Absolute(offset), Immediate(0));
+    __ j(kEqual, slow_path->GetEntryLabel());
   }
 
   // Load the java.lang.ref.Reference class, use the output register as a temporary.
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index e3be987..d5a7cb1 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -3098,9 +3098,8 @@
   if (kEmitCompilerReadBarrier) {
     // Check self->GetWeakRefAccessEnabled().
     ThreadOffset64 offset = Thread::WeakRefAccessEnabledOffset<kX86_64PointerSize>();
-    __ gs()->cmpl(Address::Absolute(offset, /* no_rip= */ true),
-                  Immediate(enum_cast<int32_t>(WeakRefAccessState::kVisiblyEnabled)));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
+    __ gs()->cmpl(Address::Absolute(offset, /* no_rip= */ true), Immediate(0));
+    __ j(kEqual, slow_path->GetEntryLabel());
   }
 
   // Load the java.lang.ref.Reference class, use the output register as a temporary.
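
Note on the intrinsics hunks above: all four backends revert the Reference.getReferent()
fast-path test from "is the per-thread state exactly WeakRefAccessState::kVisiblyEnabled (0)?"
back to "is the per-thread boolean non-zero?". A minimal C++ sketch of the two predicates
(illustrative only; the function names are invented here, and the real checks are the emitted
compare-and-branch sequences shown in the hunks):

```cpp
#include <cstdint>

enum class WeakRefAccessState : int32_t { kVisiblyEnabled = 0, kEnabled, kDisabled };

// Reverted (old) behavior: the flag is a plain bool32_t, 0 means "access disabled".
bool NeedsSlowPathAfterRevert(int32_t weak_ref_access_enabled) {
  return weak_ref_access_enabled == 0;  // Cbz / j(kEqual) to the slow path.
}

// Behavior being reverted: any state other than kVisiblyEnabled (== 0) takes the slow path.
bool NeedsSlowPathBeforeRevert(WeakRefAccessState state) {
  return state != WeakRefAccessState::kVisiblyEnabled;  // Cbnz / j(kNotEqual).
}
```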
diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc
index ffd4d2b..60fb71d 100644
--- a/runtime/gc/allocation_record.cc
+++ b/runtime/gc/allocation_record.cc
@@ -23,7 +23,6 @@
 #include "obj_ptr-inl.h"
 #include "object_callbacks.h"
 #include "stack.h"
-#include "thread-inl.h"  // For GetWeakRefAccessEnabled().
 
 #include <android-base/properties.h>
 
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 34a4089..3a60191 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -1621,28 +1621,18 @@
 
   {
     TimingLogger::ScopedTiming split7("Process mark stacks and References", GetTimings());
-
-    // Process the mark stack once in the thread local stack mode. This marks most of the live
-    // objects, aside from weak ref accesses with read barriers (Reference::GetReferent() and
-    // system weaks) that may happen concurrently while we are processing the mark stack and newly
-    // mark/gray objects and push refs on the mark stack.
-    ProcessMarkStack();
-
-    ReferenceProcessor* rp = GetHeap()->GetReferenceProcessor();
-    bool clear_soft_references = GetCurrentIteration()->GetClearSoftReferences();
-    rp->Setup(self, this, /*concurrent=*/ true, clear_soft_references);
-    if (!clear_soft_references) {
-      // Forward as many SoftReferences as possible before inhibiting reference access.
-      rp->ForwardSoftReferences(GetTimings());
-    }
-
     // We transition through three mark stack modes (thread-local, shared, GC-exclusive). The
-    // primary reasons are that we need to use a checkpoint to process thread-local mark
+    // primary reasons are the fact that we need to use a checkpoint to process thread-local mark
     // stacks, but after we disable weak refs accesses, we can't use a checkpoint due to a deadlock
     // issue because running threads potentially blocking at WaitHoldingLocks, and that once we
     // reach the point where we process weak references, we can avoid using a lock when accessing
     // the GC mark stack, which makes mark stack processing more efficient.
 
+    // Process the mark stack once in the thread local stack mode. This marks most of the live
+    // objects, aside from weak ref accesses with read barriers (Reference::GetReferent() and system
+    // weaks) that may happen concurrently while we are processing the mark stack and newly mark/gray
+    // objects and push refs on the mark stack.
+    ProcessMarkStack();
     // Switch to the shared mark stack mode. That is, revoke and process thread-local mark stacks
     // for the last time before transitioning to the shared mark stack mode, which would process new
     // refs that may have been concurrently pushed onto the mark stack during the ProcessMarkStack()
@@ -1659,7 +1649,6 @@
     // forever as the checkpoint never finishes (See runtime/mutator_gc_coord.md).
     SwitchToSharedMarkStackMode();
     CHECK(!self->GetWeakRefAccessEnabled());
-
     // Now that weak refs accesses are disabled, once we exhaust the shared mark stack again here
     // (which may be non-empty if there were refs found on thread-local mark stacks during the above
     // SwitchToSharedMarkStackMode() call), we won't have new refs to process, that is, mutators
@@ -1667,7 +1656,6 @@
     // before we process weak refs below.
     ProcessMarkStack();
     CheckEmptyMarkStack();
-
     // Switch to the GC exclusive mark stack mode so that we can process the mark stack without a
     // lock from this point on.
     SwitchToGcExclusiveMarkStackMode();
@@ -1675,23 +1663,24 @@
     if (kVerboseMode) {
       LOG(INFO) << "ProcessReferences";
     }
-    // Process weak references. This also marks through finalizers. Although
-    // reference processing is "disabled", some accesses will proceed once we've ensured that
-    // objects directly reachable by the mutator are marked, i.e. before we mark through
-    // finalizers.
+    // Process weak references. This may produce new refs to process and have them processed via
+    // ProcessMarkStack (in the GC exclusive mark stack mode).
     ProcessReferences(self);
     CheckEmptyMarkStack();
-    // JNI WeakGlobalRefs and most other system weaks cannot be processed until we're done marking
-    // through finalizers, since such references to finalizer-reachable objects must be preserved.
     if (kVerboseMode) {
       LOG(INFO) << "SweepSystemWeaks";
     }
     SweepSystemWeaks(self);
-    CheckEmptyMarkStack();
-    ReenableWeakRefAccess(self);
     if (kVerboseMode) {
       LOG(INFO) << "SweepSystemWeaks done";
     }
+    // Process the mark stack here one last time because the above SweepSystemWeaks() call may have
+    // marked some objects (strings) alive, as hash_set::Erase() can call the hash function for
+    // arbitrary elements in the weak intern table in InternTable::Table::SweepWeaks().
+    ProcessMarkStack();
+    CheckEmptyMarkStack();
+    // Re-enable weak ref accesses.
+    ReenableWeakRefAccess(self);
     // Free data for class loaders that we unloaded.
     Runtime::Current()->GetClassLinker()->CleanupClassLoaders();
     // Marking is done. Disable marking.
@@ -3841,7 +3830,8 @@
 void ConcurrentCopying::ProcessReferences(Thread* self) {
   // We don't really need to lock the heap bitmap lock as we use CAS to mark in bitmaps.
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-  GetHeap()->GetReferenceProcessor()->ProcessReferences(self, GetTimings());
+  GetHeap()->GetReferenceProcessor()->ProcessReferences(
+      /*concurrent=*/ true, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), this);
 }
 
 void ConcurrentCopying::RevokeAllThreadLocalBuffers() {
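
For context, the reference-related steps of ConcurrentCopying::CopyingPhase() in their restored
order, condensed from the comments above into a runnable toy (the stubs below only print; they
stand in for the real members, and the CheckEmptyMarkStack() calls are omitted):

```cpp
#include <iostream>

// Stand-ins for the real ConcurrentCopying members; illustrative only.
static void ProcessMarkStack()                 { std::cout << "ProcessMarkStack\n"; }
static void SwitchToSharedMarkStackMode()      { std::cout << "SwitchToSharedMarkStackMode\n"; }
static void SwitchToGcExclusiveMarkStackMode() { std::cout << "SwitchToGcExclusiveMarkStackMode\n"; }
static void ProcessReferences()                { std::cout << "ProcessReferences\n"; }
static void SweepSystemWeaks()                 { std::cout << "SweepSystemWeaks\n"; }
static void ReenableWeakRefAccess()            { std::cout << "ReenableWeakRefAccess\n"; }

int main() {
  ProcessMarkStack();                  // Thread-local mode: marks most live objects.
  SwitchToSharedMarkStackMode();       // Weak ref access is disabled by this point (see the CHECK).
  ProcessMarkStack();                  // Drain refs found during the mode switch.
  SwitchToGcExclusiveMarkStackMode();  // Mark stack can now be processed without a lock.
  ProcessReferences();                 // May push new refs; processed in exclusive mode.
  SweepSystemWeaks();                  // May mark strings alive via InternTable sweeping.
  ProcessMarkStack();                  // Drain anything marked by SweepSystemWeaks().
  ReenableWeakRefAccess();
}
```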
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index bd5ce37..e3cd1ee 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -177,7 +177,11 @@
 
 void MarkSweep::ProcessReferences(Thread* self) {
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-  GetHeap()->GetReferenceProcessor()->ProcessReferences(self, GetTimings());
+  GetHeap()->GetReferenceProcessor()->ProcessReferences(
+      true,
+      GetTimings(),
+      GetCurrentIteration()->GetClearSoftReferences(),
+      this);
 }
 
 void MarkSweep::PausePhase() {
@@ -209,9 +213,7 @@
   Runtime::Current()->DisallowNewSystemWeaks();
   // Enable the reference processing slow path, needs to be done with mutators paused since there
   // is no lock in the GetReferent fast path.
-  ReferenceProcessor* rp = GetHeap()->GetReferenceProcessor();
-  rp->Setup(self, this, /*concurrent=*/true, GetCurrentIteration()->GetClearSoftReferences());
-  rp->EnableSlowPath();
+  GetHeap()->GetReferenceProcessor()->EnableSlowPath();
 }
 
 void MarkSweep::PreCleanCards() {
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 53b0604..0848f34 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -145,9 +145,8 @@
 
 void SemiSpace::ProcessReferences(Thread* self) {
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-  ReferenceProcessor* rp = GetHeap()->GetReferenceProcessor();
-  rp->Setup(self, this, /*concurrent=*/false, GetCurrentIteration()->GetClearSoftReferences());
-  rp->ProcessReferences(self, GetTimings());
+  GetHeap()->GetReferenceProcessor()->ProcessReferences(
+      false, GetTimings(), GetCurrentIteration()->GetClearSoftReferences(), this);
 }
 
 void SemiSpace::MarkingPhase() {
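
Both MarkSweep and SemiSpace go back to configuring the reference processor through the
ProcessReferences() call itself instead of a separate Setup() step. A toy sketch of the restored
call shape (the types here are stand-ins, not ART's real classes):

```cpp
#include <cstdio>

struct GarbageCollector {};
struct TimingLogger {};

struct ReferenceProcessor {
  // Restored (post-revert) entry point: configuration is passed in directly.
  void ProcessReferences(bool concurrent, TimingLogger* timings,
                         bool clear_soft_references, GarbageCollector* collector) {
    (void)timings; (void)collector;
    std::printf("ProcessReferences(concurrent=%d, clear_soft=%d)\n",
                concurrent, clear_soft_references);
  }
};

int main() {
  ReferenceProcessor rp;
  TimingLogger timings;
  GarbageCollector gc;
  // SemiSpace runs paused (concurrent == false); MarkSweep and CC run concurrently.
  rp.ProcessReferences(/*concurrent=*/ false, &timings, /*clear_soft_references=*/ false, &gc);
  rp.ProcessReferences(/*concurrent=*/ true, &timings, /*clear_soft_references=*/ true, &gc);
}
```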
diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc
index df8b3fe..e34d140 100644
--- a/runtime/gc/reference_processor.cc
+++ b/runtime/gc/reference_processor.cc
@@ -32,7 +32,6 @@
 #include "reflection.h"
 #include "scoped_thread_state_change-inl.h"
 #include "task_processor.h"
-#include "thread-inl.h"
 #include "thread_pool.h"
 #include "well_known_classes.h"
 
@@ -43,6 +42,7 @@
 
 ReferenceProcessor::ReferenceProcessor()
     : collector_(nullptr),
+      preserving_references_(false),
       condition_("reference processor condition", *Locks::reference_processor_lock_) ,
       soft_reference_queue_(Locks::reference_queue_soft_references_lock_),
       weak_reference_queue_(Locks::reference_queue_weak_references_lock_),
@@ -89,20 +89,17 @@
 
 ObjPtr<mirror::Object> ReferenceProcessor::GetReferent(Thread* self,
                                                        ObjPtr<mirror::Reference> reference) {
-  auto slow_path_required = [this, self]() {
-    return kUseReadBarrier ? !self->GetWeakRefAccessEnabled() : SlowPathEnabled();
-  };
-  if (!slow_path_required()) {
-    return reference->GetReferent();
+  if (!kUseReadBarrier || self->GetWeakRefAccessEnabled()) {
+    // Under read barrier / concurrent copying collector, it's not safe to call GetReferent() when
+    // weak ref access is disabled as the call includes a read barrier which may push a ref onto the
+    // mark stack and interfere with termination of marking.
+    const ObjPtr<mirror::Object> referent = reference->GetReferent();
+    // If the referent is null then it is already cleared, we can just return null since there is no
+    // scenario where it becomes non-null during the reference processing phase.
+    if (UNLIKELY(!SlowPathEnabled()) || referent == nullptr) {
+      return referent;
+    }
   }
-  // If the referent is null then it is already cleared, we can just return null since there is no
-  // scenario where it becomes non-null during the reference processing phase.
-  // A read barrier may be unsafe here, and we use the result only when it's null or marked.
-  ObjPtr<mirror::Object> referent = reference->template GetReferent<kWithoutReadBarrier>();
-  if (referent.IsNull()) {
-    return referent;
-  }
-
   bool started_trace = false;
   uint64_t start_millis;
   auto finish_trace = [](uint64_t start_millis) {
@@ -115,47 +112,50 @@
   };
 
   MutexLock mu(self, *Locks::reference_processor_lock_);
-  // Keeping reference_processor_lock_ blocks the broadcast when we try to reenable the fast path.
-  while (slow_path_required()) {
-    DCHECK(collector_ != nullptr);
-    constexpr bool kOtherReadBarrier = kUseReadBarrier && !kUseBakerReadBarrier;
-    if (UNLIKELY(reference->IsFinalizerReferenceInstance()
-                 || rp_state_ == RpState::kStarting /* too early to determine mark state */
-                 || (kOtherReadBarrier && reference->IsPhantomReferenceInstance()))) {
-      // Odd cases in which it doesn't hurt to just wait, or the wait is likely to be very brief.
-
-      // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
-      // presence of threads blocking for weak ref access.
-      self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_);
-      if (!started_trace) {
-        ATraceBegin("GetReferent blocked");
-        started_trace = true;
-        start_millis = MilliTime();
+  while ((!kUseReadBarrier && SlowPathEnabled()) ||
+         (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) {
+    ObjPtr<mirror::Object> referent = reference->GetReferent<kWithoutReadBarrier>();
+    // If the referent became cleared, return it. Don't need barrier since thread roots can't get
+    // updated until after we leave the function due to holding the mutator lock.
+    if (referent == nullptr) {
+      if (started_trace) {
+        finish_trace(start_millis);
       }
-      condition_.WaitHoldingLocks(self);
-      continue;
+      return nullptr;
     }
-    DCHECK(!reference->IsPhantomReferenceInstance());
-
-    if (rp_state_ == RpState::kInitClearingDone) {
-      // Reachable references have their final referent values.
-      break;
+    // Try to see if the referent is already marked by using the is_marked_callback. We can return
+    // it to the mutator as long as the GC is not preserving references.
+    if (LIKELY(collector_ != nullptr)) {
+      // If it's null it means not marked, but it could become marked if the referent is reachable
+      // by finalizer referents. So we cannot return in this case and must block. Otherwise, we
+      // can return it to the mutator as long as the GC is not preserving references, in which
+      // case only black nodes can be safely returned. If the GC is preserving references, the
+      // mutator could take a white field from a grey or white node and move it somewhere else
+      // in the heap causing corruption since this field would get swept.
+      // Use the cached referent instead of calling GetReferent since other threads could call
+      // Reference.clear() after we did the null check resulting in a null pointer being
+      // incorrectly passed to IsMarked. b/33569625
+      ObjPtr<mirror::Object> forwarded_ref = collector_->IsMarked(referent.Ptr());
+      if (forwarded_ref != nullptr) {
+        // Non null means that it is marked.
+        if (!preserving_references_ ||
+           (LIKELY(!reference->IsFinalizerReferenceInstance()) && reference->IsUnprocessed())) {
+          if (started_trace) {
+            finish_trace(start_millis);
+          }
+          return forwarded_ref;
+        }
+      }
     }
-    // Although reference processing is not done, we can always predict the correct return value
-    // based on the current mark state. No additional marking from finalizers has been done, since
-    // we hold reference_processor_lock_, which is required to advance to kInitClearingDone.
-    DCHECK(rp_state_ == RpState::kInitMarkingDone);
-    // Re-load and re-check referent, since the current one may have been read before we acquired
-    // reference_lock. In particular a Reference.clear() call may have intervened. (b/33569625)
-    referent = reference->GetReferent<kWithoutReadBarrier>();
-    ObjPtr<mirror::Object> forwarded_ref =
-        referent.IsNull() ? nullptr : collector_->IsMarked(referent.Ptr());
-    // Either the referent was marked, and forwarded_ref is the correct return value, or it
-    // was not, and forwarded_ref == null, which is again the correct return value.
-    if (started_trace) {
-      finish_trace(start_millis);
+    // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the
+    // presence of threads blocking for weak ref access.
+    self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_);
+    if (!started_trace) {
+      ATraceBegin("GetReferent blocked");
+      started_trace = true;
+      start_millis = MilliTime();
     }
-    return forwarded_ref;
+    condition_.WaitHoldingLocks(self);
   }
   if (started_trace) {
     finish_trace(start_millis);
@@ -163,64 +163,36 @@
   return reference->GetReferent();
 }
 
-// Forward SoftReferences. Can be done before we disable Reference access. Only
-// invoked if we are not clearing SoftReferences.
-uint32_t ReferenceProcessor::ForwardSoftReferences(TimingLogger* timings) {
-  TimingLogger::ScopedTiming split(
-      concurrent_ ? "ForwardSoftReferences" : "(Paused)ForwardSoftReferences", timings);
-  // We used to argue that we should be smarter about doing this conditionally, but it's unclear
-  // that's actually better than the more predictable strategy of basically only clearing
-  // SoftReferences just before we would otherwise run out of memory.
-  uint32_t non_null_refs = soft_reference_queue_.ForwardSoftReferences(collector_);
-  if (ATraceEnabled()) {
-    static constexpr size_t kBufSize = 80;
-    char buf[kBufSize];
-    snprintf(buf, kBufSize, "Marking for %" PRIu32 " SoftReferences", non_null_refs);
-    ATraceBegin(buf);
-    collector_->ProcessMarkStack();
-    ATraceEnd();
-  } else {
-    collector_->ProcessMarkStack();
-  }
-  return non_null_refs;
+void ReferenceProcessor::StartPreservingReferences(Thread* self) {
+  MutexLock mu(self, *Locks::reference_processor_lock_);
+  preserving_references_ = true;
 }
 
-void ReferenceProcessor::Setup(Thread* self,
-                               collector::GarbageCollector* collector,
-                               bool concurrent,
-                               bool clear_soft_references) {
-  DCHECK(collector != nullptr);
+void ReferenceProcessor::StopPreservingReferences(Thread* self) {
   MutexLock mu(self, *Locks::reference_processor_lock_);
-  collector_ = collector;
-  rp_state_ = RpState::kStarting;
-  concurrent_ = concurrent;
-  clear_soft_references_ = clear_soft_references;
+  preserving_references_ = false;
+  // We are done preserving references; threads that were blocked may now see a marked referent.
+  condition_.Broadcast(self);
 }
 
 // Process reference class instances and schedule finalizations.
-// We advance rp_state_ to signal partial completion for the benefit of GetReferent.
-void ReferenceProcessor::ProcessReferences(Thread* self, TimingLogger* timings) {
-  TimingLogger::ScopedTiming t(concurrent_ ? __FUNCTION__ : "(Paused)ProcessReferences", timings);
-  if (!clear_soft_references_) {
-    // Forward any additional SoftReferences we discovered late, now that reference access has been
-    // inhibited.
-    while (!soft_reference_queue_.IsEmpty()) {
-      ForwardSoftReferences(timings);
-    }
-  }
+void ReferenceProcessor::ProcessReferences(bool concurrent,
+                                           TimingLogger* timings,
+                                           bool clear_soft_references,
+                                           collector::GarbageCollector* collector) {
+  TimingLogger::ScopedTiming t(concurrent ? __FUNCTION__ : "(Paused)ProcessReferences", timings);
+  Thread* self = Thread::Current();
   {
     MutexLock mu(self, *Locks::reference_processor_lock_);
+    collector_ = collector;
     if (!kUseReadBarrier) {
-      CHECK_EQ(SlowPathEnabled(), concurrent_) << "Slow path must be enabled iff concurrent";
+      CHECK_EQ(SlowPathEnabled(), concurrent) << "Slow path must be enabled iff concurrent";
     } else {
-      // Weak ref access is enabled at Zygote compaction by SemiSpace (concurrent_ == false).
-      CHECK_EQ(!self->GetWeakRefAccessEnabled(), concurrent_);
+      // Weak ref access is enabled at Zygote compaction by SemiSpace (concurrent == false).
+      CHECK_EQ(!self->GetWeakRefAccessEnabled(), concurrent);
     }
-    DCHECK(rp_state_ == RpState::kStarting);
-    rp_state_ = RpState::kInitMarkingDone;
-    condition_.Broadcast(self);
   }
-  if (kIsDebugBuild && collector_->IsTransactionActive()) {
+  if (kIsDebugBuild && collector->IsTransactionActive()) {
     // In transaction mode, we shouldn't enqueue any Reference to the queues.
     // See DelayReferenceReferent().
     DCHECK(soft_reference_queue_.IsEmpty());
@@ -228,84 +200,75 @@
     DCHECK(finalizer_reference_queue_.IsEmpty());
     DCHECK(phantom_reference_queue_.IsEmpty());
   }
-  // Clear all remaining soft and weak references with white referents.
-  // This misses references only reachable through finalizers.
-  soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_);
-  weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_);
-  // Defer PhantomReference processing until we've finished marking through finalizers.
-  {
-    // TODO: Capture mark state of some system weaks here. If the referent was marked here,
-    // then it is now safe to return, since it can only refer to marked objects. If it becomes
-    // marked below, that is no longer guaranteed.
-    MutexLock mu(self, *Locks::reference_processor_lock_);
-    rp_state_ = RpState::kInitClearingDone;
-    // At this point, all mutator-accessible data is marked (black). Objects enqueued for
-    // finalization will only be made available to the mutator via CollectClearedReferences after
-    // we're fully done marking. Soft and WeakReferences accessible to the mutator have been
-    // processed and refer only to black objects.  Thus there is no danger of the mutator getting
-    // access to non-black objects.  Weak reference processing is still nominally suspended,
-    // But many kinds of references, including all java.lang.ref ones, are handled normally from
-    // here on. See GetReferent().
+  // Unless required to clear soft references with white references, preserve some white referents.
+  if (!clear_soft_references) {
+    TimingLogger::ScopedTiming split(concurrent ? "ForwardSoftReferences" :
+        "(Paused)ForwardSoftReferences", timings);
+    if (concurrent) {
+      StartPreservingReferences(self);
+    }
+    // TODO: Add smarter logic for preserving soft references. The behavior should be a conditional
+    // mark if the SoftReference is supposed to be preserved.
+    uint32_t non_null_refs = soft_reference_queue_.ForwardSoftReferences(collector);
+    if (ATraceEnabled()) {
+      static constexpr size_t kBufSize = 80;
+      char buf[kBufSize];
+      snprintf(buf, kBufSize, "Marking for %" PRIu32 " SoftReferences", non_null_refs);
+      ATraceBegin(buf);
+      collector->ProcessMarkStack();
+      ATraceEnd();
+    } else {
+      collector->ProcessMarkStack();
+    }
+    if (concurrent) {
+      StopPreservingReferences(self);
+    }
   }
+  // Clear all remaining soft and weak references with white referents.
+  soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
+  weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
   {
-    TimingLogger::ScopedTiming t2(
-        concurrent_ ? "EnqueueFinalizerReferences" : "(Paused)EnqueueFinalizerReferences", timings);
+    TimingLogger::ScopedTiming t2(concurrent ? "EnqueueFinalizerReferences" :
+        "(Paused)EnqueueFinalizerReferences", timings);
+    if (concurrent) {
+      StartPreservingReferences(self);
+    }
     // Preserve all white objects with finalize methods and schedule them for finalization.
     FinalizerStats finalizer_stats =
-        finalizer_reference_queue_.EnqueueFinalizerReferences(&cleared_references_, collector_);
+        finalizer_reference_queue_.EnqueueFinalizerReferences(&cleared_references_, collector);
     if (ATraceEnabled()) {
       static constexpr size_t kBufSize = 80;
       char buf[kBufSize];
       snprintf(buf, kBufSize, "Marking from %" PRIu32 " / %" PRIu32 " finalizers",
                finalizer_stats.num_enqueued_, finalizer_stats.num_refs_);
       ATraceBegin(buf);
-      collector_->ProcessMarkStack();
+      collector->ProcessMarkStack();
       ATraceEnd();
     } else {
-      collector_->ProcessMarkStack();
+      collector->ProcessMarkStack();
+    }
+    if (concurrent) {
+      StopPreservingReferences(self);
     }
   }
-
-  // Process all soft and weak references with white referents, where the references are reachable
-  // only from finalizers. It is unclear that there is any way to do this without slightly
-  // violating some language spec. We choose to apply normal Reference processing rules for these.
-  // This exposes the following issues:
-  // 1) In the case of an unmarked referent, we may end up enqueuing an "unreachable" reference.
-  //    This appears unavoidable, since we need to clear the reference for safety, unless we
-  //    mark the referent and undo finalization decisions for objects we encounter during marking.
-  //    (Some versions of the RI seem to do something along these lines.)
-  //    Or we could clear the reference without enqueuing it, which also seems strange and
-  //    unhelpful.
-  // 2) In the case of a marked referent, we will preserve a reference to objects that may have
-  //    been enqueued for finalization. Again fixing this would seem to involve at least undoing
-  //    previous finalization / reference clearing decisions. (This would also mean than an object
-  //    containing both a strong and a WeakReference to the same referent could see the
-  //    WeakReference cleared.)
-  // The treatment in (2) is potentially quite dangerous, since Reference.get() can e.g. return a
-  // finalized object containing pointers to native objects that have already been deallocated.
-  // But it can be argued that this is just an instance of the broader rule that it is not safe
-  // for finalizers to access otherwise inaccessible finalizable objects.
-  soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_,
-                                             /*report_cleared=*/ true);
-  weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_,
-                                             /*report_cleared=*/ true);
-
-  // Clear all phantom references with white referents. It's fine to do this just once here.
-  phantom_reference_queue_.ClearWhiteReferences(&cleared_references_, collector_);
-
+  // Clear all finalizer referent reachable soft and weak references with white referents.
+  soft_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
+  weak_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
+  // Clear all phantom references with white referents.
+  phantom_reference_queue_.ClearWhiteReferences(&cleared_references_, collector);
   // At this point all reference queues other than the cleared references should be empty.
   DCHECK(soft_reference_queue_.IsEmpty());
   DCHECK(weak_reference_queue_.IsEmpty());
   DCHECK(finalizer_reference_queue_.IsEmpty());
   DCHECK(phantom_reference_queue_.IsEmpty());
-
   {
     MutexLock mu(self, *Locks::reference_processor_lock_);
     // Need to always do this since the next GC may be concurrent. Doing this for only concurrent
     // could result in a stale is_marked_callback_ being called before the reference processing
     // starts since there is a small window of time where slow_path_enabled_ is enabled but the
     // callback isn't yet set.
-    if (!kUseReadBarrier && concurrent_) {
+    collector_ = nullptr;
+    if (!kUseReadBarrier && concurrent) {
       // Done processing, disable the slow path and broadcast to the waiters.
       DisableSlowPath(self);
     }
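
The restored GetReferent() slow-path loop above decides, on each iteration, whether it can answer
the mutator immediately or must block. A condensed view of that decision as a free function
(parameter names are stand-ins for the values the real loop reads under
reference_processor_lock_):

```cpp
struct Object {};

enum class Outcome { kReturnNull, kReturnForwarded, kBlock };

Outcome Classify(Object* cached_referent,      // reference->GetReferent<kWithoutReadBarrier>()
                 Object* forwarded_ref,        // collector_->IsMarked(cached_referent), or null
                 bool preserving_references,   // preserving_references_
                 bool is_finalizer_reference,  // reference->IsFinalizerReferenceInstance()
                 bool is_unprocessed) {        // reference->IsUnprocessed()
  if (cached_referent == nullptr) {
    return Outcome::kReturnNull;               // Already cleared; it cannot become non-null here.
  }
  if (forwarded_ref != nullptr &&
      (!preserving_references ||
       (!is_finalizer_reference && is_unprocessed))) {
    return Outcome::kReturnForwarded;          // Marked and safe to expose to the mutator.
  }
  return Outcome::kBlock;                      // Run the empty checkpoint, then WaitHoldingLocks().
}
```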
diff --git a/runtime/gc/reference_processor.h b/runtime/gc/reference_processor.h
index 0f84211..51c5054 100644
--- a/runtime/gc/reference_processor.h
+++ b/runtime/gc/reference_processor.h
@@ -46,29 +46,19 @@
 class ReferenceProcessor {
  public:
   ReferenceProcessor();
-
-  // Initialize for a reference processing pass. Called before suspending weak
-  // access.
-  void Setup(Thread* self,
-             collector::GarbageCollector* collector,
-             bool concurrent,
-             bool clear_soft_references)
-      REQUIRES(!Locks::reference_processor_lock_);
-  // Enqueue all types of java.lang.ref.References, and mark through finalizers.
-  // Assumes there is no concurrent mutator-driven marking, i.e. all potentially
-  // mutator-accessible objects should be marked before this.
-  void ProcessReferences(Thread* self, TimingLogger* timings)
+  void ProcessReferences(bool concurrent,
+                         TimingLogger* timings,
+                         bool clear_soft_references,
+                         gc::collector::GarbageCollector* collector)
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(Locks::heap_bitmap_lock_)
       REQUIRES(!Locks::reference_processor_lock_);
-
   // The slow path bool is contained in the reference class object, can only be set once
   // Only allow setting this with mutators suspended so that we can avoid using a lock in the
   // GetReferent fast path as an optimization.
   void EnableSlowPath() REQUIRES_SHARED(Locks::mutator_lock_);
   void BroadcastForSlowPath(Thread* self);
-  // Decode the referent, may block if references are being processed. In the normal
-  // no-read-barrier or Baker-read-barrier cases, we assume reference is not a PhantomReference.
+  // Decode the referent, may block if references are being processed.
   ObjPtr<mirror::Object> GetReferent(Thread* self, ObjPtr<mirror::Reference> reference)
       REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::reference_processor_lock_);
   // Collects the cleared references and returns a task, to be executed after FinishGC, that will
@@ -88,30 +78,27 @@
   void ClearReferent(ObjPtr<mirror::Reference> ref)
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::reference_processor_lock_);
-  uint32_t ForwardSoftReferences(TimingLogger* timings)
-      REQUIRES_SHARED(Locks::mutator_lock_);
 
  private:
   bool SlowPathEnabled() REQUIRES_SHARED(Locks::mutator_lock_);
   // Called by ProcessReferences.
   void DisableSlowPath(Thread* self) REQUIRES(Locks::reference_processor_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
+  // If we are preserving references, some dead objects may become live. We use start and stop
+  // preserving to block mutators using GetReferent from getting access to these referents.
+  void StartPreservingReferences(Thread* self) REQUIRES(!Locks::reference_processor_lock_);
+  void StopPreservingReferences(Thread* self) REQUIRES(!Locks::reference_processor_lock_);
   // Wait until reference processing is done.
   void WaitUntilDoneProcessingReferences(Thread* self)
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(Locks::reference_processor_lock_);
   // Collector which is clearing references, used by the GetReferent to return referents which are
-  // already marked. Only updated by thread currently running GC.
-  // Guarded by reference_processor_lock_ when not read by collector. Only the collector changes
-  // it.
-  collector::GarbageCollector* collector_;
-  // Reference processor state. Only valid while weak reference processing is suspended.
-  // Used by GetReferent and friends to return early.
-  enum class RpState : uint8_t { kStarting, kInitMarkingDone, kInitClearingDone };
-  RpState rp_state_ GUARDED_BY(Locks::reference_processor_lock_);
-  bool concurrent_;  // Running concurrently with mutator? Only used by GC thread.
-  bool clear_soft_references_;  // Only used by GC thread.
-
+  // already marked.
+  collector::GarbageCollector* collector_ GUARDED_BY(Locks::reference_processor_lock_);
+  // Boolean for whether or not we are preserving references (either soft references or finalizers).
+  // If this is true, then we cannot return a referent (see comment in GetReferent).
+  bool preserving_references_ GUARDED_BY(Locks::reference_processor_lock_);
   // Condition that people wait on if they attempt to get the referent of a reference while
   // processing is in progress. Broadcast when an empty checkpoint is requested, but not for other
   // checkpoints or thread suspensions. See mutator_gc_coord.md.
diff --git a/runtime/gc/reference_queue.cc b/runtime/gc/reference_queue.cc
index 6bdacaf..568ca04 100644
--- a/runtime/gc/reference_queue.cc
+++ b/runtime/gc/reference_queue.cc
@@ -131,8 +131,7 @@
 }
 
 void ReferenceQueue::ClearWhiteReferences(ReferenceQueue* cleared_references,
-                                          collector::GarbageCollector* collector,
-                                          bool report_cleared) {
+                                          collector::GarbageCollector* collector) {
   while (!IsEmpty()) {
     ObjPtr<mirror::Reference> ref = DequeuePendingReference();
     mirror::HeapReference<mirror::Object>* referent_addr = ref->GetReferentReferenceAddr();
@@ -146,15 +145,6 @@
         ref->ClearReferent<false>();
       }
       cleared_references->EnqueueReference(ref);
-      if (report_cleared) {
-        static bool already_reported = false;
-        if (!already_reported) {
-          // TODO: Maybe do this only if the queue is non-null?
-          LOG(WARNING)
-              << "Cleared Reference was only reachable from finalizer (only reported once)";
-          already_reported = true;
-        }
-      }
     }
     // Delay disabling the read barrier until here so that the ClearReferent call above in
     // transaction mode will trigger the read barrier.
@@ -192,33 +182,22 @@
 }
 
 uint32_t ReferenceQueue::ForwardSoftReferences(MarkObjectVisitor* visitor) {
+  if (UNLIKELY(IsEmpty())) {
+    return 0;
+  }
   uint32_t num_refs(0);
-  Thread* self = Thread::Current();
-  static constexpr int SR_BUF_SIZE = 32;
-  ObjPtr<mirror::Reference> buf[SR_BUF_SIZE];
-  int n_entries;
-  bool empty;
+  const ObjPtr<mirror::Reference> head = list_;
+  ObjPtr<mirror::Reference> ref = head;
   do {
-    {
-      // Acquire lock only a few times and hold it as briefly as possible.
-      MutexLock mu(self, *lock_);
-      empty = IsEmpty();
-      for (n_entries = 0; n_entries < SR_BUF_SIZE && !empty; ++n_entries) {
-        // Dequeuing the Reference here means it could possibly be enqueued again during this GC.
-        // That's unlikely and benign.
-        buf[n_entries] = DequeuePendingReference();
-        empty = IsEmpty();
-      }
+    mirror::HeapReference<mirror::Object>* referent_addr = ref->GetReferentReferenceAddr();
+    if (referent_addr->AsMirrorPtr() != nullptr) {
+      // do_atomic_update is false because mutators can't access the referent due to the weak ref
+      // access blocking.
+      visitor->MarkHeapReference(referent_addr, /*do_atomic_update=*/ false);
+      ++num_refs;
     }
-    for (int i = 0; i < n_entries; ++i) {
-      mirror::HeapReference<mirror::Object>* referent_addr = buf[i]->GetReferentReferenceAddr();
-      if (referent_addr->AsMirrorPtr() != nullptr) {
-        visitor->MarkHeapReference(referent_addr, /*do_atomic_update=*/ true);
-        ++num_refs;
-      }
-      DisableReadBarrierForReference(buf[i]->AsReference());
-    }
-  } while (!empty);
+    ref = ref->GetPendingNext();
+  } while (LIKELY(ref != head));
   return num_refs;
 }
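
The restored ForwardSoftReferences() above walks the circular pendingNext list in place instead
of dequeuing batches under a lock. A self-contained toy of that traversal pattern (simplified
node type, not mirror::Reference; counting stands in for marking):

```cpp
#include <cstdio>

struct Node {
  int referent;        // 0 stands in for a null referent.
  Node* pending_next;  // Singly-linked circular list, like the pendingNext field.
};

int CountNonNullReferents(Node* list) {
  if (list == nullptr) {
    return 0;                    // Matches the IsEmpty() early-out.
  }
  int num_refs = 0;
  Node* const head = list;
  Node* ref = head;
  do {
    if (ref->referent != 0) {
      ++num_refs;                // The real code marks the referent here instead.
    }
    ref = ref->pending_next;
  } while (ref != head);         // Stop once we are back at the head of the circle.
  return num_refs;
}

int main() {
  Node c{0, nullptr}, b{7, &c}, a{3, &b};
  c.pending_next = &a;           // Close the circle: a -> b -> c -> a.
  std::printf("%d\n", CountNonNullReferents(&a));  // Prints 2.
}
```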
 
diff --git a/runtime/gc/reference_queue.h b/runtime/gc/reference_queue.h
index 3fda716..06243c4 100644
--- a/runtime/gc/reference_queue.h
+++ b/runtime/gc/reference_queue.h
@@ -90,20 +90,17 @@
                                   collector::GarbageCollector* collector)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Walks the reference list marking and dequeuing any references subject to the reference
-  // clearing policy.  References with a black referent are removed from the list.  References
-  // with white referents biased toward saving are blackened and also removed from the list.
-  // Returns the number of non-null soft references. May be called concurrently with
-  // AtomicEnqueueIfNotEnqueued().
+  // Walks the reference list marking any references subject to the reference clearing policy.
+  // References with a black referent are removed from the list.  References with white referents
+  // biased toward saving are blackened and also removed from the list.
+  // Returns the number of non-null soft references.
   uint32_t ForwardSoftReferences(MarkObjectVisitor* visitor)
-      REQUIRES(!*lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Unlink the reference list clearing references objects with white referents. Cleared references
   // registered to a reference queue are scheduled for appending by the heap worker thread.
   void ClearWhiteReferences(ReferenceQueue* cleared_references,
-                            collector::GarbageCollector* collector,
-                            bool report_cleared = false)
+                            collector::GarbageCollector* collector)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   void Dump(std::ostream& os) const REQUIRES_SHARED(Locks::mutator_lock_);
@@ -112,12 +109,9 @@
   bool IsEmpty() const {
     return list_ == nullptr;
   }
-
-  // Clear this queue. Only safe after handing off the contents elsewhere for further processing.
   void Clear() {
     list_ = nullptr;
   }
-
   mirror::Reference* GetList() REQUIRES_SHARED(Locks::mutator_lock_) {
     return list_;
   }
@@ -130,10 +124,8 @@
   // Lock, used for parallel GC reference enqueuing. It allows for multiple threads simultaneously
   // calling AtomicEnqueueIfNotEnqueued.
   Mutex* const lock_;
-  // The actual reference list. Only a root for the mark compact GC since it
-  // will be null during root marking for other GC types. Not an ObjPtr since it
-  // is accessed from multiple threads.  Points to a singly-linked circular list
-  // using the pendingNext field.
+  // The actual reference list. Only a root for the mark compact GC since it will be null for other
+  // GC types. Not an ObjPtr since it is accessed from multiple threads.
   mirror::Reference* list_;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(ReferenceQueue);
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 5d4c507..25c45cb 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -32,7 +32,6 @@
 #include "object_callbacks.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread.h"
-#include "thread-inl.h"
 
 namespace art {
 
diff --git a/runtime/intern_table.h b/runtime/intern_table.h
index 9d1720f..db6016e 100644
--- a/runtime/intern_table.h
+++ b/runtime/intern_table.h
@@ -116,7 +116,7 @@
       REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
 
   // Only used by image writer. Special version that may not cause thread suspension since the GC
-  // cannot be running while we are doing image writing. Maybe be called while holding a
+  // cannot be running while we are doing image writing. May be called while holding a
   // lock since there will not be thread suspension.
   ObjPtr<mirror::String> InternStrongImageString(ObjPtr<mirror::String> s)
       REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index e9b78f3..1809227 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -59,7 +59,6 @@
 #include "obj_ptr.h"
 #include "stack.h"
 #include "thread.h"
-#include "thread-inl.h"
 #include "unstarted_runtime.h"
 #include "verifier/method_verifier.h"
 #include "well_known_classes.h"
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index ad44ac2..34ddfc4 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -57,7 +57,6 @@
 #include "scoped_thread_state_change-inl.h"
 #include "stack.h"
 #include "thread-current-inl.h"
-#include "thread-inl.h"
 #include "thread_list.h"
 
 namespace art {
diff --git a/runtime/jni/java_vm_ext.cc b/runtime/jni/java_vm_ext.cc
index 94108c6..12e6d22 100644
--- a/runtime/jni/java_vm_ext.cc
+++ b/runtime/jni/java_vm_ext.cc
@@ -864,10 +864,6 @@
   if (kDebugLocking) {
     Locks::jni_weak_globals_lock_->AssertHeld(self);
   }
-  // TODO: Handle the already null case without waiting.
-  // TODO: Otherwise we should just wait for kInitMarkingDone, and track which weak globals were
-  // marked at that point. We would only need one mark bit per entry in the weak_globals_ table,
-  // and a quick pass over that early on during reference processing.
   WaitForWeakGlobalsAccess(self);
   return weak_globals_.Get(ref);
 }
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index c0dcf88..3c1e7a0 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -372,25 +372,6 @@
   return false;
 }
 
-inline bool Thread::GetWeakRefAccessEnabled() const {
-  CHECK(kUseReadBarrier);
-  DCHECK(this == Thread::Current());
-  WeakRefAccessState s = tls32_.weak_ref_access_enabled.load(std::memory_order_acquire);
-  if (s == WeakRefAccessState::kVisiblyEnabled) {
-    return true;
-  } else if (s == WeakRefAccessState::kDisabled) {
-    return false;
-  }
-  DCHECK(s == WeakRefAccessState::kEnabled)
-      << "state = " << static_cast<std::underlying_type_t<WeakRefAccessState>>(s);
-  // The state is only changed back to DISABLED during a checkpoint. Thus no other thread can
-  // change the value concurrently here. No other thread reads the value we store here, so there
-  // is no need for a release store.
-  tls32_.weak_ref_access_enabled.store(WeakRefAccessState::kVisiblyEnabled,
-                                       std::memory_order_relaxed);
-  return true;
-}
-
 inline void Thread::SetThreadLocalAllocationStack(StackReference<mirror::Object>* start,
                                                   StackReference<mirror::Object>* end) {
   DCHECK(Thread::Current() == this) << "Should be called by self";
diff --git a/runtime/thread.h b/runtime/thread.h
index 7eb57e9..1085a56 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -171,20 +171,6 @@
   kDefault     // dex pc may or may not advance depending on other conditions.
 };
 
-// For the CC colector, normal weak reference access can be disabled on a per-thread basis, while
-// processing references.  After finishing, the reference processor asynchronously sets the
-// per-thread flags back to kEnabled with release memory ordering semantics. Each mutator thread
-// should check its flag with acquire semantics before assuming that it is enabled. However,
-// that is often too expensive, so the reading thread sets it to kVisiblyEnabled after seeing it
-// kEnabled.  The Reference.get() intrinsic can thus read it in relaxed mode, and reread (by
-// resorting to the slow path) with acquire semantics if it sees a value of kEnabled rather than
-// kVisiblyEnabled.
-enum class WeakRefAccessState : int32_t {
-  kVisiblyEnabled = 0,  // Enabled, and previously read with acquire load by this thread.
-  kEnabled,
-  kDisabled
-};
-
 // This should match RosAlloc::kNumThreadLocalSizeBrackets.
 static constexpr size_t kNumRosAllocThreadLocalSizeBracketsInThread = 16;
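
For reference, the three-state protocol deleted by this revert, condensed from the removed
Thread::GetWeakRefAccessEnabled() and the WeakRefAccessState comment above into a standalone
sketch (a free function over a plain atomic; the real code operates on the tls32_ member of the
current thread):

```cpp
#include <atomic>
#include <cstdint>

enum class WeakRefAccessState : int32_t { kVisiblyEnabled = 0, kEnabled, kDisabled };

bool GetWeakRefAccessEnabled(std::atomic<WeakRefAccessState>& flag) {
  WeakRefAccessState s = flag.load(std::memory_order_acquire);
  if (s == WeakRefAccessState::kVisiblyEnabled) {
    return true;                              // Fast path: already made visible to this thread.
  }
  if (s == WeakRefAccessState::kDisabled) {
    return false;
  }
  // kEnabled: the GC re-enabled access with a release store. After this acquire load the thread
  // downgrades to kVisiblyEnabled so later checks (including the compiled intrinsic) can use a
  // relaxed load and a plain compare against zero.
  flag.store(WeakRefAccessState::kVisiblyEnabled, std::memory_order_relaxed);
  return true;
}
```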
 
@@ -1008,13 +994,14 @@
 
   void SetIsGcMarkingAndUpdateEntrypoints(bool is_marking);
 
-  bool GetWeakRefAccessEnabled() const;  // Only safe for current thread.
+  bool GetWeakRefAccessEnabled() const {
+    CHECK(kUseReadBarrier);
+    return tls32_.weak_ref_access_enabled;
+  }
 
   void SetWeakRefAccessEnabled(bool enabled) {
     CHECK(kUseReadBarrier);
-    WeakRefAccessState new_state = enabled ?
-        WeakRefAccessState::kEnabled : WeakRefAccessState::kDisabled;
-    tls32_.weak_ref_access_enabled.store(new_state, std::memory_order_release);
+    tls32_.weak_ref_access_enabled = enabled;
   }
 
   uint32_t GetDisableThreadFlipCount() const {
@@ -1685,7 +1672,7 @@
           thread_exit_check_count(0),
           is_transitioning_to_runnable(false),
           is_gc_marking(false),
-          weak_ref_access_enabled(WeakRefAccessState::kVisiblyEnabled),
+          weak_ref_access_enabled(true),
           disable_thread_flip_count(0),
           user_code_suspend_count(0),
           force_interpreter_count(0),
@@ -1741,17 +1728,14 @@
 
     AtomicInteger park_state_;
 
-    // Determines whether the thread is allowed to directly access a weak ref
-    // (Reference::GetReferent() and system weaks) and to potentially mark an object alive/gray.
-    // This is used for concurrent reference processing of the CC collector only. This is thread
-    // local so that we can enable/disable weak ref access by using a checkpoint and avoid a race
-    // around the time weak ref access gets disabled and concurrent reference processing begins
-    // (if weak ref access is disabled during a pause, this is not an issue.) Other collectors use
-    // Runtime::DisallowNewSystemWeaks() and ReferenceProcessor::EnableSlowPath().  Can be
-    // concurrently accessed by GetReferent() and set (by iterating over threads).
-    // Can be changed from kEnabled to kVisiblyEnabled by readers. No other concurrent access is
-    // possible when that happens.
-    mutable std::atomic<WeakRefAccessState> weak_ref_access_enabled;
+    // True if the thread is allowed to access a weak ref (Reference::GetReferent() and system
+    // weaks) and to potentially mark an object alive/gray. This is used for concurrent reference
+    // processing of the CC collector only. This is thread local so that we can enable/disable weak
+    // ref access by using a checkpoint and avoid a race around the time weak ref access gets
+    // disabled and concurrent reference processing begins (if weak ref access is disabled during a
+    // pause, this is not an issue.) Other collectors use Runtime::DisallowNewSystemWeaks() and
+    // ReferenceProcessor::EnableSlowPath().
+    bool32_t weak_ref_access_enabled;
 
     // A thread local version of Heap::disable_thread_flip_count_. This keeps track of how many
     // levels of (nested) JNI critical sections the thread is in and is used to detect a nested JNI