Thread-local mark stacks for the CC collector.

Thread-local mark stacks are assigned to mutators, which push
references onto them in read barriers, to reduce the CAS
synchronization cost of a shared, global mark stack/queue.
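
For illustration, here is a minimal, self-contained sketch of the
difference (the types below are made up for this example; they are not
the runtime's actual mark stack classes): a push onto a shared stack
has to reserve a slot with a CAS, while a push onto a thread-local
stack is a plain store.

  // Illustrative sketch only; simplified, made-up types.
  #include <atomic>
  #include <cstddef>

  namespace mirror { class Object; }

  constexpr std::size_t kCapacity = 1024;

  // Shared stack: every push pays a CAS and may retry under contention.
  struct SharedMarkStackSketch {
    std::atomic<std::size_t> top{0};
    std::atomic<mirror::Object*> slots[kCapacity];

    bool Push(mirror::Object* ref) {
      std::size_t cur = top.load(std::memory_order_relaxed);
      do {
        if (cur == kCapacity) {
          return false;  // Full.
        }
      } while (!top.compare_exchange_weak(cur, cur + 1,
                                          std::memory_order_acq_rel));
      slots[cur].store(ref, std::memory_order_release);
      return true;
    }
  };

  // Thread-local stack: only the owning mutator touches it, so a plain
  // store and increment suffice; no CAS, no cache-line contention.
  struct ThreadLocalMarkStackSketch {
    std::size_t top = 0;
    mirror::Object* slots[kCapacity];

    bool Push(mirror::Object* ref) {
      if (top == kCapacity) {
        return false;  // Full; would be handed back to the GC.
      }
      slots[top++] = ref;
      return true;
    }
  };

The CAS-per-push pattern is what the removed MarkQueue below pays on
every read-barrier push; this change keeps it off the mutator fast path.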

To perform the marking phase safely, we step through three mark stack
modes (thread-local, shared, GC-exclusive) and use per-thread flags,
instead of the existing global one, to disable/enable system weak
accesses (only for the CC collector). The reasons are that 1) the
mutators' thread-local mark stacks need to be revoked via a checkpoint
to avoid a race that would incorrectly leave a reference on a mark
stack when marking terminates, and 2) we can't run a checkpoint while
system weak accesses are disabled (it would deadlock). More details
are in the code comments.
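
As a rough, self-contained model of the mode-dependent push path
(stand-in types only; the authoritative control flow and its comments
are in concurrent_copying.cc):

  // Simplified model of the mode-dependent push; stand-in types, not ART's.
  #include <cassert>
  #include <mutex>
  #include <vector>

  namespace mirror { class Object; }

  enum MarkStackMode {
    kMarkStackModeOff = 0,
    kMarkStackModeThreadLocal,
    kMarkStackModeShared,
    kMarkStackModeGcExclusive
  };

  struct Thread {
    std::vector<mirror::Object*> tl_mark_stack;  // Stand-in for the real one.
  };

  struct CollectorSketch {
    MarkStackMode mark_stack_mode = kMarkStackModeOff;
    Thread* thread_running_gc = nullptr;
    std::vector<mirror::Object*> gc_mark_stack;
    std::mutex mark_stack_lock;

    void PushOntoMarkStack(Thread* self, mirror::Object* to_ref) {
      switch (mark_stack_mode) {
        case kMarkStackModeThreadLocal:
          if (self == thread_running_gc) {
            gc_mark_stack.push_back(to_ref);        // GC thread: no lock.
          } else {
            self->tl_mark_stack.push_back(to_ref);  // Mutator: local, no CAS.
          }
          break;
        case kMarkStackModeShared: {
          // Marking termination: all threads share the GC stack under a lock.
          std::lock_guard<std::mutex> lock(mark_stack_lock);
          gc_mark_stack.push_back(to_ref);
          break;
        }
        case kMarkStackModeGcExclusive:
          assert(self == thread_running_gc);  // Only the GC thread pushes now.
          gc_mark_stack.push_back(to_ref);
          break;
        case kMarkStackModeOff:
          assert(false && "no pushes expected while the mark stack is off");
          break;
      }
    }
  };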

Performance improvements in Ritzperf EAAC: a ~2.8% improvement
(13290->12918) in run time and a ~23% improvement (51.6s->39.8s) in
the total GC time on N5.

Bug: 12687968
Change-Id: I5d234d7e48bf115cd773d38bdb62ad24ce9116c7
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index b1897b8..1fb4703 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -49,89 +49,6 @@
 
 namespace collector {
 
-// Concurrent queue. Used as the mark stack. TODO: use a concurrent
-// stack for locality.
-class MarkQueue {
- public:
-  explicit MarkQueue(size_t size) : size_(size) {
-    CHECK(IsPowerOfTwo(size_));
-    buf_.reset(new Atomic<mirror::Object*>[size_]);
-    CHECK(buf_.get() != nullptr);
-    Clear();
-  }
-
-  ALWAYS_INLINE Atomic<mirror::Object*>* GetSlotAddr(size_t index) {
-    return &(buf_.get()[index & (size_ - 1)]);
-  }
-
-  // Multiple-proceducer enqueue.
-  bool Enqueue(mirror::Object* to_ref) {
-    size_t t;
-    do {
-      t = tail_.LoadRelaxed();
-      size_t h = head_.LoadSequentiallyConsistent();
-      if (t + size_ == h) {
-        // It's full.
-        return false;
-      }
-    } while (!tail_.CompareExchangeWeakSequentiallyConsistent(t, t + 1));
-    // We got a slot but its content has not been filled yet at this point.
-    GetSlotAddr(t)->StoreSequentiallyConsistent(to_ref);
-    return true;
-  }
-
-  // Thread-unsafe.
-  bool EnqueueThreadUnsafe(mirror::Object* to_ref) {
-    size_t t = tail_.LoadRelaxed();
-    size_t h = head_.LoadRelaxed();
-    if (t + size_ == h) {
-      // It's full.
-      return false;
-    }
-    GetSlotAddr(t)->StoreRelaxed(to_ref);
-    tail_.StoreRelaxed(t + 1);
-    return true;
-  }
-
-  // Single-consumer dequeue.
-  mirror::Object* Dequeue() {
-    size_t h = head_.LoadRelaxed();
-    size_t t = tail_.LoadSequentiallyConsistent();
-    if (h == t) {
-      // it's empty.
-      return nullptr;
-    }
-    Atomic<mirror::Object*>* slot = GetSlotAddr(h);
-    mirror::Object* ref = slot->LoadSequentiallyConsistent();
-    while (ref == nullptr) {
-      // Wait until the slot content becomes visible.
-      ref = slot->LoadSequentiallyConsistent();
-    }
-    slot->StoreRelaxed(nullptr);
-    head_.StoreSequentiallyConsistent(h + 1);
-    return ref;
-  }
-
-  bool IsEmpty() {
-    size_t h = head_.LoadSequentiallyConsistent();
-    size_t t = tail_.LoadSequentiallyConsistent();
-    return h == t;
-  }
-
-  void Clear() {
-    head_.StoreRelaxed(0);
-    tail_.StoreRelaxed(0);
-    memset(buf_.get(), 0, size_ * sizeof(Atomic<mirror::Object*>));
-  }
-
- private:
-  Atomic<size_t> head_;
-  Atomic<size_t> tail_;
-
-  size_t size_;
-  std::unique_ptr<Atomic<mirror::Object*>[]> buf_;
-};
-
 class ConcurrentCopying : public GarbageCollector {
  public:
   // TODO: disable thse flags for production use.
@@ -185,10 +102,12 @@
   Barrier& GetBarrier() {
     return *gc_barrier_;
   }
+  bool IsWeakRefAccessEnabled() {
+    return weak_ref_access_enabled_.LoadRelaxed();
+  }
+  void RevokeThreadLocalMarkStack(Thread* thread) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
  private:
-  mirror::Object* PopOffMarkStack();
-  template<bool kThreadSafe>
   void PushOntoMarkStack(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* Copy(mirror::Object* from_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void Scan(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -202,11 +121,18 @@
   void VerifyNoFromSpaceReferences() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   accounting::ObjectStack* GetAllocationStack();
   accounting::ObjectStack* GetLiveStack();
-  bool ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  bool ProcessMarkStackOnce() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ProcessMarkStackRef(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  size_t ProcessThreadLocalMarkStacks(bool disable_weak_ref_access)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void RevokeThreadLocalMarkStacks(bool disable_weak_ref_access)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SwitchToSharedMarkStackMode() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void SwitchToGcExclusiveMarkStackMode() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void ProcessReferences(Thread* self, bool concurrent)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ProcessReferences(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* IsMarked(mirror::Object* from_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static mirror::Object* MarkCallback(mirror::Object* from_ref, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -229,7 +155,7 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* AllocateInSkippedBlock(size_t alloc_size)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  void CheckEmptyMarkQueue() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void CheckEmptyMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void IssueEmptyCheckpoint() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   bool IsOnAllocStack(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   mirror::Object* GetFwdPtr(mirror::Object* from_ref)
@@ -242,10 +168,19 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   void AssertToSpaceInvariantInNonMovingSpace(mirror::Object* obj, mirror::Object* ref)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void ReenableWeakRefAccess(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   space::RegionSpace* region_space_;      // The underlying region space.
   std::unique_ptr<Barrier> gc_barrier_;
-  MarkQueue mark_queue_;
+  std::unique_ptr<accounting::ObjectStack> gc_mark_stack_;
+  Mutex mark_stack_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  std::vector<accounting::ObjectStack*> revoked_mark_stacks_
+      GUARDED_BY(mark_stack_lock_);
+  static constexpr size_t kMarkStackSize = kPageSize;
+  static constexpr size_t kMarkStackPoolSize = 256;
+  std::vector<accounting::ObjectStack*> pooled_mark_stacks_
+      GUARDED_BY(mark_stack_lock_);
+  Thread* thread_running_gc_;
   bool is_marking_;                       // True while marking is ongoing.
   bool is_active_;                        // True while the collection is ongoing.
   bool is_asserting_to_space_invariant_;  // True while asserting the to-space invariant.
@@ -258,7 +193,18 @@
   size_t live_stack_freeze_size_;
   size_t from_space_num_objects_at_first_pause_;
   size_t from_space_num_bytes_at_first_pause_;
-  Atomic<int> is_mark_queue_push_disallowed_;
+  Atomic<int> is_mark_stack_push_disallowed_;
+  enum MarkStackMode {
+    kMarkStackModeOff = 0,      // Mark stack is off.
+    kMarkStackModeThreadLocal,  // All threads except for the GC-running thread push refs onto
+                                // thread-local mark stacks. The GC-running thread pushes onto and
+                                // pops off the GC mark stack without a lock.
+    kMarkStackModeShared,       // All threads share the GC mark stack with a lock.
+    kMarkStackModeGcExclusive   // The GC-running thread pushes onto and pops from the GC mark stack
+                                // without a lock. Other threads won't access the mark stack.
+  };
+  Atomic<MarkStackMode> mark_stack_mode_;
+  Atomic<bool> weak_ref_access_enabled_;
 
   // How many objects and bytes we moved. Used for accounting.
   Atomic<size_t> bytes_moved_;
@@ -284,6 +230,7 @@
   friend class ThreadFlipVisitor;
   friend class FlipCallback;
   friend class ConcurrentCopyingComputeUnevacFromSpaceLiveRatioVisitor;
+  friend class RevokeThreadLocalMarkStackCheckpoint;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(ConcurrentCopying);
 };
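
For context on how the new members above (mark_stack_lock_,
revoked_mark_stacks_, pooled_mark_stacks_) are intended to interact,
here is a self-contained sketch with stand-in types; the checkpoint
mechanics and the weak-ref-access handling are in
concurrent_copying.cc, and the helper names below are made up for
illustration only.

  // Sketch of the revoke/reuse flow for thread-local mark stacks.
  #include <mutex>
  #include <vector>

  namespace mirror { class Object; }
  using ObjectStack = std::vector<mirror::Object*>;  // Stand-in type.

  struct MarkStackBookkeeping {
    std::mutex mark_stack_lock;
    std::vector<ObjectStack*> pooled_mark_stacks;   // Free stacks for mutators.
    std::vector<ObjectStack*> revoked_mark_stacks;  // Filled stacks for the GC.

    // Run for each mutator by the revocation checkpoint: hand the
    // thread-local stack over to the GC and stop using it.
    void RevokeThreadLocalMarkStack(ObjectStack** tl_mark_stack) {
      std::lock_guard<std::mutex> lock(mark_stack_lock);
      if (*tl_mark_stack != nullptr) {
        revoked_mark_stacks.push_back(*tl_mark_stack);
        *tl_mark_stack = nullptr;  // Owner must take a new stack to push again.
      }
    }

    // Called by the GC after draining a revoked stack: return it to the pool.
    void RecycleMarkStack(ObjectStack* stack) {
      stack->clear();
      std::lock_guard<std::mutex> lock(mark_stack_lock);
      pooled_mark_stacks.push_back(stack);
    }
  };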