Thread-local mark stacks for the CC collector.
Thread-local mark stacks are assigned to mutators, which push references
onto them in the read barrier, to reduce the (CAS) synchronization cost
of a global mark stack/queue.
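
The following is only an illustrative, standalone C++ sketch (not ART
code; Object, ThreadLocalMarkStack, SharedMarkStack, and
PushInReadBarrier are made-up names) of why a per-thread stack removes
the lock/CAS from the common read-barrier push path:

  #include <cstddef>
  #include <mutex>
  #include <vector>

  struct Object;  // placeholder for mirror::Object

  // A shared stack needs synchronization on every push.
  struct SharedMarkStack {
    std::mutex lock;            // stands in for something like mark_stack_lock_
    std::vector<Object*> refs;
    void Push(Object* ref) {
      std::lock_guard<std::mutex> guard(lock);
      refs.push_back(ref);
    }
  };

  // A stack only its owning mutator touches: plain, unsynchronized pushes.
  struct ThreadLocalMarkStack {
    static constexpr std::size_t kCapacity = 4096 / sizeof(Object*);  // roughly a page
    std::vector<Object*> refs;
    bool TryPush(Object* ref) {
      if (refs.size() >= kCapacity) {
        return false;           // full: the caller falls back to the shared stack
      }
      refs.push_back(ref);
      return true;
    }
  };

  // What the read-barrier slow path of the owning thread would do.
  void PushInReadBarrier(ThreadLocalMarkStack& tl_stack, SharedMarkStack& shared,
                         Object* to_ref) {
    if (!tl_stack.TryPush(to_ref)) {
      shared.Push(to_ref);      // contended path, taken only when the local stack is full
    }
  }
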
We step through three mark stack modes (thread-local, shared,
GC-exclusive) and use per-thread flags, instead of the existing global
one, to disable/enable system weak accesses (only for the CC collector)
so that the marking phase can terminate safely. The reasons are that
1) the mutators' thread-local mark stacks must be revoked with a
checkpoint to avoid races (incorrectly leaving a reference on a mark
stack) when terminating marking, and 2) a checkpoint can't be used
while system weak accesses are disabled (it would deadlock). More
details are in the code comments.
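
As a rough illustration of that sequencing, here is another standalone,
hypothetical sketch (not the collector's actual control flow; all names
are invented, and the real ordering and synchronization are more
involved) of stepping through the three modes while revoking
thread-local stacks via a checkpoint before weak reference access is
disabled:

  #include <atomic>

  enum MarkStackMode { kOff, kThreadLocal, kShared, kGcExclusive };

  struct MarkingTerminationSketch {
    std::atomic<MarkStackMode> mark_stack_mode{kThreadLocal};
    std::atomic<bool> weak_ref_access_enabled{true};

    // Stands in for a checkpoint that makes each mutator hand over
    // (revoke) its thread-local mark stack, so no reference is left
    // behind on a stale stack when marking terminates.
    void CheckpointRevokeThreadLocalMarkStacks(bool disable_weak_ref_access) {
      if (disable_weak_ref_access) {
        // Per-thread in the real code; a checkpoint is still usable here
        // because weak reference access has not been disabled yet.
        weak_ref_access_enabled.store(false);
      }
    }

    void TerminateMarking() {
      // 1) Drain while mutators still push onto thread-local stacks, then
      //    revoke them with a checkpoint. The same checkpoint disables
      //    weak reference access, since no checkpoint can be issued after
      //    that point (it would deadlock).
      CheckpointRevokeThreadLocalMarkStacks(/*disable_weak_ref_access=*/true);

      // 2) Mutators now push onto the shared GC mark stack under a lock;
      //    drain it and process references/system weaks.
      mark_stack_mode.store(kShared);

      // 3) Only the GC-running thread touches the mark stack from here on;
      //    the final drain needs no lock.
      mark_stack_mode.store(kGcExclusive);
    }
  };
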
Performance improvements in Ritzperf EAAC on N5: a ~2.8% improvement in
run time (13290->12918) and a ~23% improvement in the total GC time
(51.6s->39.8s).
Bug: 12687968
Change-Id: I5d234d7e48bf115cd773d38bdb62ad24ce9116c7
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index b1897b8..1fb4703 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -49,89 +49,6 @@
namespace collector {
-// Concurrent queue. Used as the mark stack. TODO: use a concurrent
-// stack for locality.
-class MarkQueue {
- public:
- explicit MarkQueue(size_t size) : size_(size) {
- CHECK(IsPowerOfTwo(size_));
- buf_.reset(new Atomic<mirror::Object*>[size_]);
- CHECK(buf_.get() != nullptr);
- Clear();
- }
-
- ALWAYS_INLINE Atomic<mirror::Object*>* GetSlotAddr(size_t index) {
- return &(buf_.get()[index & (size_ - 1)]);
- }
-
- // Multiple-proceducer enqueue.
- bool Enqueue(mirror::Object* to_ref) {
- size_t t;
- do {
- t = tail_.LoadRelaxed();
- size_t h = head_.LoadSequentiallyConsistent();
- if (t + size_ == h) {
- // It's full.
- return false;
- }
- } while (!tail_.CompareExchangeWeakSequentiallyConsistent(t, t + 1));
- // We got a slot but its content has not been filled yet at this point.
- GetSlotAddr(t)->StoreSequentiallyConsistent(to_ref);
- return true;
- }
-
- // Thread-unsafe.
- bool EnqueueThreadUnsafe(mirror::Object* to_ref) {
- size_t t = tail_.LoadRelaxed();
- size_t h = head_.LoadRelaxed();
- if (t + size_ == h) {
- // It's full.
- return false;
- }
- GetSlotAddr(t)->StoreRelaxed(to_ref);
- tail_.StoreRelaxed(t + 1);
- return true;
- }
-
- // Single-consumer dequeue.
- mirror::Object* Dequeue() {
- size_t h = head_.LoadRelaxed();
- size_t t = tail_.LoadSequentiallyConsistent();
- if (h == t) {
- // it's empty.
- return nullptr;
- }
- Atomic<mirror::Object*>* slot = GetSlotAddr(h);
- mirror::Object* ref = slot->LoadSequentiallyConsistent();
- while (ref == nullptr) {
- // Wait until the slot content becomes visible.
- ref = slot->LoadSequentiallyConsistent();
- }
- slot->StoreRelaxed(nullptr);
- head_.StoreSequentiallyConsistent(h + 1);
- return ref;
- }
-
- bool IsEmpty() {
- size_t h = head_.LoadSequentiallyConsistent();
- size_t t = tail_.LoadSequentiallyConsistent();
- return h == t;
- }
-
- void Clear() {
- head_.StoreRelaxed(0);
- tail_.StoreRelaxed(0);
- memset(buf_.get(), 0, size_ * sizeof(Atomic<mirror::Object*>));
- }
-
- private:
- Atomic<size_t> head_;
- Atomic<size_t> tail_;
-
- size_t size_;
- std::unique_ptr<Atomic<mirror::Object*>[]> buf_;
-};
-
class ConcurrentCopying : public GarbageCollector {
public:
// TODO: disable these flags for production use.
@@ -185,10 +102,12 @@
Barrier& GetBarrier() {
return *gc_barrier_;
}
+ bool IsWeakRefAccessEnabled() {
+ return weak_ref_access_enabled_.LoadRelaxed();
+ }
+ void RevokeThreadLocalMarkStack(Thread* thread) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
private:
- mirror::Object* PopOffMarkStack();
- template<bool kThreadSafe>
void PushOntoMarkStack(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
mirror::Object* Copy(mirror::Object* from_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
void Scan(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -202,11 +121,18 @@
void VerifyNoFromSpaceReferences() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
accounting::ObjectStack* GetAllocationStack();
accounting::ObjectStack* GetLiveStack();
- bool ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void ProcessMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ bool ProcessMarkStackOnce() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void ProcessMarkStackRef(mirror::Object* to_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ size_t ProcessThreadLocalMarkStacks(bool disable_weak_ref_access)
+ SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void RevokeThreadLocalMarkStacks(bool disable_weak_ref_access)
+ SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void SwitchToSharedMarkStackMode() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void SwitchToGcExclusiveMarkStackMode() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
void DelayReferenceReferent(mirror::Class* klass, mirror::Reference* reference)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- void ProcessReferences(Thread* self, bool concurrent)
- SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void ProcessReferences(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
mirror::Object* IsMarked(mirror::Object* from_ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
static mirror::Object* MarkCallback(mirror::Object* from_ref, void* arg)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -229,7 +155,7 @@
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
mirror::Object* AllocateInSkippedBlock(size_t alloc_size)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
- void CheckEmptyMarkQueue() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void CheckEmptyMarkStack() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
void IssueEmptyCheckpoint() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
bool IsOnAllocStack(mirror::Object* ref) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
mirror::Object* GetFwdPtr(mirror::Object* from_ref)
@@ -242,10 +168,19 @@
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
void AssertToSpaceInvariantInNonMovingSpace(mirror::Object* obj, mirror::Object* ref)
SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+ void ReenableWeakRefAccess(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
space::RegionSpace* region_space_; // The underlying region space.
std::unique_ptr<Barrier> gc_barrier_;
- MarkQueue mark_queue_;
+ std::unique_ptr<accounting::ObjectStack> gc_mark_stack_;
+ Mutex mark_stack_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+ std::vector<accounting::ObjectStack*> revoked_mark_stacks_
+ GUARDED_BY(mark_stack_lock_);
+ static constexpr size_t kMarkStackSize = kPageSize;
+ static constexpr size_t kMarkStackPoolSize = 256;
+ std::vector<accounting::ObjectStack*> pooled_mark_stacks_
+ GUARDED_BY(mark_stack_lock_);
+ Thread* thread_running_gc_;
bool is_marking_; // True while marking is ongoing.
bool is_active_; // True while the collection is ongoing.
bool is_asserting_to_space_invariant_; // True while asserting the to-space invariant.
@@ -258,7 +193,18 @@
size_t live_stack_freeze_size_;
size_t from_space_num_objects_at_first_pause_;
size_t from_space_num_bytes_at_first_pause_;
- Atomic<int> is_mark_queue_push_disallowed_;
+ Atomic<int> is_mark_stack_push_disallowed_;
+ enum MarkStackMode {
+ kMarkStackModeOff = 0, // Mark stack is off.
+ kMarkStackModeThreadLocal, // All threads except for the GC-running thread push refs onto
+ // thread-local mark stacks. The GC-running thread pushes onto and
+ // pops off the GC mark stack without a lock.
+ kMarkStackModeShared, // All threads share the GC mark stack with a lock.
+ kMarkStackModeGcExclusive // The GC-running thread pushes onto and pops from the GC mark stack
+ // without a lock. Other threads won't access the mark stack.
+ };
+ Atomic<MarkStackMode> mark_stack_mode_;
+ Atomic<bool> weak_ref_access_enabled_;
// How many objects and bytes we moved. Used for accounting.
Atomic<size_t> bytes_moved_;
@@ -284,6 +230,7 @@
friend class ThreadFlipVisitor;
friend class FlipCallback;
friend class ConcurrentCopyingComputeUnevacFromSpaceLiveRatioVisitor;
+ friend class RevokeThreadLocalMarkStackCheckpoint;
DISALLOW_IMPLICIT_CONSTRUCTORS(ConcurrentCopying);
};