author      2012-10-26 13:51:26 -0700
committer   2012-11-06 16:07:36 -0800
commit      02b6a78038f12c109f95eb31713cfc747f5512f1
tree        c36841561a47b2ce3cf15b00fdae822e5a6c5b64
parent      bcc2926b9721f94c17ed98fae5264cc98f0e066f
Parallel mark stack processing
Enabled parallel mark stack processing by using a thread pool.
Optimized object scanning by removing dependent loads for IsClass.
Performance:
Prime: ~10% speedup of partial GC.
Nakasi: ~50% speedup of partial GC.
Change-Id: I43256a068efc47cb52d93108458ea18d4e02fccc
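The heart of the change is ProcessMarkStackParallel (src/gc/mark_sweep.cc below): the mark stack is split into chunks of at most MarkStackChunk::max_size entries, one MarkStackChunk task is queued per chunk on the heap's thread pool, and the GC thread then starts the workers and waits for them. Below is a minimal, self-contained C++ sketch of just the chunk-splitting arithmetic; SplitIntoChunks is a hypothetical helper, not part of the patch, and it returns [begin, end) index pairs instead of the Object** ranges the real code hands to each task.

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // Hypothetical helper, not part of the patch: divide stack_size entries among
    // num_threads workers, capping each chunk at max_chunk entries (the patch caps
    // at MarkStackChunk::max_size = 1 * KB entries) so work stays balanced.
    std::vector<std::pair<size_t, size_t>> SplitIntoChunks(size_t stack_size,
                                                           size_t num_threads,
                                                           size_t max_chunk) {
      num_threads = std::max<size_t>(num_threads, 1);  // guard against an empty pool
      max_chunk = std::max<size_t>(max_chunk, 1);
      const size_t chunk_size =
          std::min((stack_size + num_threads - 1) / num_threads, max_chunk);
      std::vector<std::pair<size_t, size_t>> chunks;   // [begin, end) index ranges
      for (size_t index = 0; index < stack_size; index += chunk_size) {
        chunks.push_back({index, std::min(stack_size, index + chunk_size)});
      }
      return chunks;
    }

Note that the loop in the patch iterates at least num_threads times, so it may queue empty chunks when the stack is small; the sketch above simply stops once stack_size is covered.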
-rw-r--r--  src/atomic_integer.h     |   4
-rw-r--r--  src/barrier_test.cc      |  18
-rw-r--r--  src/class_linker.cc      |  14
-rw-r--r--  src/common_test.h        |   4
-rw-r--r--  src/compiler.cc          |  10
-rw-r--r--  src/gc/heap_bitmap.h     |   4
-rw-r--r--  src/gc/mark_sweep.cc     | 499
-rw-r--r--  src/gc/mark_sweep.h      | 177
-rw-r--r--  src/gc/space_bitmap.h    |  45
-rw-r--r--  src/heap.cc              |  81
-rw-r--r--  src/heap.h               |  14
-rw-r--r--  src/mutex.cc             |   2
-rw-r--r--  src/object.cc            |  13
-rw-r--r--  src/object.h             |  16
-rw-r--r--  src/runtime.cc           |   7
-rw-r--r--  src/thread_pool.cc       | 178
-rw-r--r--  src/thread_pool.h        |  96
-rw-r--r--  src/thread_pool_test.cc  |  26
18 files changed, 848 insertions, 360 deletions
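Parallel marking means several workers can race to set the same bit in the mark bitmap, which is why the patch adds SpaceBitmap::AtomicTestAndSet (src/gc/space_bitmap.h below) and has MarkObjectParallel push an object onto a work chunk only when the bit was not previously set. A self-contained sketch of that pattern, using std::atomic in place of the android_atomic_cas call that the patch uses:

    #include <atomic>
    #include <cstdint>

    using word = uintptr_t;

    // Set `mask` in the word at `address` and report whether it was already set.
    // Returning true means another thread (or an earlier mark) already claimed the
    // object, so the caller must not queue it for scanning a second time.
    bool AtomicTestAndSet(std::atomic<word>* address, word mask) {
      word old_word = address->load(std::memory_order_relaxed);
      do {
        if ((old_word & mask) != 0) {
          return true;  // Fast path: the bit is already set.
        }
        // compare_exchange_weak reloads old_word on failure, so the loop retries
        // with the freshly observed word.
      } while (!address->compare_exchange_weak(old_word, old_word | mask));
      return false;  // We set the bit; the object still needs to be scanned.
    }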
diff --git a/src/atomic_integer.h b/src/atomic_integer.h index adf3e774c4..22cc7b41ed 100644 --- a/src/atomic_integer.h +++ b/src/atomic_integer.h @@ -71,6 +71,10 @@ class AtomicInteger { int32_t operator -- () { return android_atomic_dec(&value_) - 1; } + + int CompareAndSwap(int expected_value, int new_value) { + return android_atomic_cas(expected_value, new_value, &value_); + } private: int32_t value_; }; diff --git a/src/barrier_test.cc b/src/barrier_test.cc index 43b279e047..7b31e29111 100644 --- a/src/barrier_test.cc +++ b/src/barrier_test.cc @@ -24,9 +24,9 @@ #include "UniquePtr.h" namespace art { -class CheckWaitClosure : public Closure { +class CheckWaitTask : public Task { public: - CheckWaitClosure(Barrier* barrier, AtomicInteger* count1, AtomicInteger* count2, + CheckWaitTask(Barrier* barrier, AtomicInteger* count1, AtomicInteger* count2, AtomicInteger* count3) : barrier_(barrier), count1_(count1), @@ -44,6 +44,9 @@ class CheckWaitClosure : public Closure { barrier_->Wait(self); ++*count3_; LOG(INFO) << "After barrier 2 " << self; + } + + virtual void Finalize() { delete this; } private: @@ -69,7 +72,7 @@ TEST_F(BarrierTest, CheckWait) { AtomicInteger count2 = 0; AtomicInteger count3 = 0; for (int32_t i = 0; i < num_threads; ++i) { - thread_pool.AddTask(self, new CheckWaitClosure(&barrier, &count1, &count2, &count3)); + thread_pool.AddTask(self, new CheckWaitTask(&barrier, &count1, &count2, &count3)); } thread_pool.StartWorkers(self); barrier.Increment(self, num_threads); @@ -91,9 +94,9 @@ TEST_F(BarrierTest, CheckWait) { EXPECT_EQ(num_threads, count3); } -class CheckPassClosure : public Closure { +class CheckPassTask : public Task { public: - CheckPassClosure(Barrier* barrier, AtomicInteger* count, size_t subtasks) + CheckPassTask(Barrier* barrier, AtomicInteger* count, size_t subtasks) : barrier_(barrier), count_(count), subtasks_(subtasks) { @@ -106,6 +109,9 @@ class CheckPassClosure : public Closure { // Pass through to next subtask. 
barrier_->Pass(self); } + } + + void Finalize() { delete this; } private: @@ -123,7 +129,7 @@ TEST_F(BarrierTest, CheckPass) { const int32_t num_tasks = num_threads * 4; const int32_t num_sub_tasks = 128; for (int32_t i = 0; i < num_tasks; ++i) { - thread_pool.AddTask(self, new CheckPassClosure(&barrier, &count, num_sub_tasks)); + thread_pool.AddTask(self, new CheckPassTask(&barrier, &count, num_sub_tasks)); } thread_pool.StartWorkers(self); const int32_t expected_total_tasks = num_sub_tasks * num_tasks; diff --git a/src/class_linker.cc b/src/class_linker.cc index dc86aed4a1..ce9b37bb7c 100644 --- a/src/class_linker.cc +++ b/src/class_linker.cc @@ -240,6 +240,7 @@ void ClassLinker::InitFromCompiler(const std::vector<const DexFile*>& boot_class SirtRef<Class> java_lang_Class(self, down_cast<Class*>(heap->AllocObject(self, NULL, sizeof(ClassClass)))); CHECK(java_lang_Class.get() != NULL); + Class::SetClassClass(java_lang_Class.get()); java_lang_Class->SetClass(java_lang_Class.get()); java_lang_Class->SetClassSize(sizeof(ClassClass)); // AllocClass(Class*) can now be used @@ -972,11 +973,12 @@ void ClassLinker::InitFromImage() { Object* dex_caches_object = space->GetImageHeader().GetImageRoot(ImageHeader::kDexCaches); ObjectArray<DexCache>* dex_caches = dex_caches_object->AsObjectArray<DexCache>(); + ObjectArray<Class>* class_roots = + space->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots)->AsObjectArray<Class>(); + // Special case of setting up the String class early so that we can test arbitrary objects // as being Strings or not - Class* java_lang_String = space->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots) - ->AsObjectArray<Class>()->Get(kJavaLangString); - String::SetClass(java_lang_String); + String::SetClass(class_roots->Get(kJavaLangString)); CHECK_EQ(oat_file->GetOatHeader().GetDexFileCount(), static_cast<uint32_t>(dex_caches->GetLength())); @@ -1004,9 +1006,8 @@ void ClassLinker::InitFromImage() { } // reinit class_roots_ - Object* class_roots_object = - heap->GetImageSpace()->GetImageHeader().GetImageRoot(ImageHeader::kClassRoots); - class_roots_ = class_roots_object->AsObjectArray<Class>(); + Class::SetClassClass(class_roots->Get(kJavaLangClass)); + class_roots_ = class_roots; // reinit array_iftable_ from any array class instance, they should be == array_iftable_ = GetClassRoot(kObjectArrayClass)->GetIfTable(); @@ -1112,6 +1113,7 @@ void ClassLinker::VisitClassesWithoutClassesLock(ClassVisitor* visitor, void* ar ClassLinker::~ClassLinker() { + Class::ResetClass(); String::ResetClass(); Field::ResetClass(); AbstractMethod::ResetClasses(); diff --git a/src/common_test.h b/src/common_test.h index 67d2266da1..f564bbdb14 100644 --- a/src/common_test.h +++ b/src/common_test.h @@ -388,6 +388,10 @@ class CommonTest : public testing::Test { compiler_.reset(new Compiler(compiler_backend, instruction_set, true, 2, false, image_classes_.get(), true, true)); + // Create the heap thread pool so that the GC runs in parallel for tests. Normally, the thread + // pool is created by the runtime. 
+ runtime_->GetHeap()->CreateThreadPool(); + runtime_->GetHeap()->VerifyHeap(); // Check for heap corruption before the test } diff --git a/src/compiler.cc b/src/compiler.cc index b09691211f..4c9860cecb 100644 --- a/src/compiler.cc +++ b/src/compiler.cc @@ -993,7 +993,7 @@ class CompilationContext { self->AssertNoPendingException(); CHECK_GT(work_units, 0U); - std::vector<Closure*> closures(work_units); + std::vector<ForAllClosure*> closures(work_units); for (size_t i = 0; i < work_units; ++i) { closures[i] = new ForAllClosure(this, begin + i, end, callback, work_units); thread_pool_->AddTask(self, closures[i]); @@ -1006,13 +1006,11 @@ class CompilationContext { // Wait for all the worker threads to finish. thread_pool_->Wait(self); - - STLDeleteElements(&closures); } private: - class ForAllClosure : public Closure { + class ForAllClosure : public Task { public: ForAllClosure(CompilationContext* context, size_t begin, size_t end, Callback* callback, size_t stripe) @@ -1031,6 +1029,10 @@ class CompilationContext { self->AssertNoPendingException(); } } + + virtual void Finalize() { + delete this; + } private: CompilationContext* const context_; const size_t begin_; diff --git a/src/gc/heap_bitmap.h b/src/gc/heap_bitmap.h index 1610df8fe2..666fcc7dd9 100644 --- a/src/gc/heap_bitmap.h +++ b/src/gc/heap_bitmap.h @@ -38,9 +38,9 @@ namespace art { EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) { SpaceBitmap* bitmap = GetSpaceBitmap(obj); if (LIKELY(bitmap != NULL)) { - return bitmap->Clear(obj); + bitmap->Clear(obj); } else { - return large_objects_->Clear(obj); + large_objects_->Clear(obj); } } diff --git a/src/gc/mark_sweep.cc b/src/gc/mark_sweep.cc index e93eb1af98..4662cf6901 100644 --- a/src/gc/mark_sweep.cc +++ b/src/gc/mark_sweep.cc @@ -41,8 +41,17 @@ namespace art { +// Performance options. +static const bool kParallelMarkStack = true; +static const bool kDisableFinger = true; static const bool kUseMarkStackPrefetch = true; +// Profiling and information flags. +static const bool kCountClassesMarked = false; +static const bool kProfileLargeObjects = false; +static const bool kMeasureOverhead = false; +static const bool kCountTasks = false; + class SetFingerVisitor { public: SetFingerVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) { @@ -71,17 +80,20 @@ MarkSweep::MarkSweep(ObjectStack* mark_stack) cleared_reference_list_(NULL), freed_bytes_(0), freed_objects_(0), class_count_(0), array_count_(0), other_count_(0), - gc_barrier_(new Barrier) { + large_object_test_(0), large_object_mark_(0), + classes_marked_(0), overhead_time_(0), + work_chunks_created_(0), work_chunks_deleted_(0), + gc_barrier_(new Barrier), + large_object_lock_("large object lock") { DCHECK(mark_stack_ != NULL); } void MarkSweep::Init() { + java_lang_Class_ = Class::GetJavaLangClass(); + CHECK(java_lang_Class_ != NULL); heap_ = Runtime::Current()->GetHeap(); mark_stack_->Reset(); - // TODO: C++0x auto FindDefaultMarkBitmap(); - // TODO: if concurrent, enable card marking in compiler - // TODO: check that the mark bitmap is entirely clear. // Mark any concurrent roots as dirty since we need to scan them at least once during this GC. 
Runtime::Current()->DirtyRoots(); } @@ -99,7 +111,7 @@ void MarkSweep::FindDefaultMarkBitmap() { LOG(FATAL) << "Could not find a default mark bitmap"; } -inline void MarkSweep::MarkObject0(const Object* obj, bool check_finger) { +inline void MarkSweep::MarkObjectNonNull(const Object* obj, bool check_finger) { DCHECK(obj != NULL); if (obj >= immune_begin_ && obj < immune_end_) { @@ -109,32 +121,21 @@ inline void MarkSweep::MarkObject0(const Object* obj, bool check_finger) { // Try to take advantage of locality of references within a space, failing this find the space // the hard way. - if (UNLIKELY(!current_mark_bitmap_->HasAddress(obj))) { + SpaceBitmap* object_bitmap = current_mark_bitmap_; + if (UNLIKELY(!object_bitmap->HasAddress(obj))) { SpaceBitmap* new_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(obj); if (new_bitmap != NULL) { - current_mark_bitmap_ = new_bitmap; + object_bitmap = new_bitmap; } else { - LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace(); - SpaceSetMap* large_objects = large_object_space->GetMarkObjects(); - if (!large_objects->Test(obj)) { - if (!large_object_space->Contains(obj)) { - LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces"; - LOG(ERROR) << "Attempting see if it's a bad root"; - VerifyRoots(); - LOG(FATAL) << "Can't mark bad root"; - } - large_objects->Set(obj); - // Don't need to check finger since large objects never have any object references. - } - // TODO: Improve clarity of control flow in this function? + MarkLargeObject(obj); return; } } // This object was not previously marked. - if (!current_mark_bitmap_->Test(obj)) { - current_mark_bitmap_->Set(obj); - if (check_finger && obj < finger_) { + if (!object_bitmap->Test(obj)) { + object_bitmap->Set(obj); + if (kDisableFinger || (check_finger && obj < finger_)) { // Do we need to expand the mark stack? if (UNLIKELY(mark_stack_->Size() >= mark_stack_->Capacity())) { std::vector<Object*> temp; @@ -150,6 +151,57 @@ inline void MarkSweep::MarkObject0(const Object* obj, bool check_finger) { } } +// Rare case, probably not worth inlining since it will increase instruction cache miss rate. +bool MarkSweep::MarkLargeObject(const Object* obj) { + LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace(); + SpaceSetMap* large_objects = large_object_space->GetMarkObjects(); + if (kProfileLargeObjects) { + ++large_object_test_; + } + if (UNLIKELY(!large_objects->Test(obj))) { + if (!large_object_space->Contains(obj)) { + LOG(ERROR) << "Tried to mark " << obj << " not contained by any spaces"; + LOG(ERROR) << "Attempting see if it's a bad root"; + VerifyRoots(); + LOG(FATAL) << "Can't mark bad root"; + } + if (kProfileLargeObjects) { + ++large_object_mark_; + } + large_objects->Set(obj); + // Don't need to check finger since large objects never have any object references. + return true; + } + return false; +} + +inline bool MarkSweep::MarkObjectParallel(const Object* obj) { + DCHECK(obj != NULL); + + if (obj >= immune_begin_ && obj < immune_end_) { + DCHECK(IsMarked(obj)); + return false; + } + + // Try to take advantage of locality of references within a space, failing this find the space + // the hard way. + SpaceBitmap* object_bitmap = current_mark_bitmap_; + if (UNLIKELY(!object_bitmap->HasAddress(obj))) { + SpaceBitmap* new_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(obj); + if (new_bitmap != NULL) { + object_bitmap = new_bitmap; + } else { + // TODO: Remove the Thread::Current here? + // TODO: Convert this to some kind of atomic marking? 
+ MutexLock mu(Thread::Current(), large_object_lock_); + return MarkLargeObject(obj); + } + } + + // Return true if the object was not previously marked. + return !object_bitmap->AtomicTestAndSet(obj); +} + // Used to mark objects when recursing. Recursion is done by moving // the finger across the bitmaps in address order and marking child // objects. Any newly-marked objects whose addresses are lower than @@ -157,22 +209,22 @@ inline void MarkSweep::MarkObject0(const Object* obj, bool check_finger) { // need to be added to the mark stack. void MarkSweep::MarkObject(const Object* obj) { if (obj != NULL) { - MarkObject0(obj, true); + MarkObjectNonNull(obj, true); } } -void MarkSweep::MarkObjectVisitor(const Object* root, void* arg) { +void MarkSweep::MarkObjectCallback(const Object* root, void* arg) { DCHECK(root != NULL); DCHECK(arg != NULL); MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg); - mark_sweep->MarkObject0(root, false); + mark_sweep->MarkObjectNonNull(root, false); } void MarkSweep::ReMarkObjectVisitor(const Object* root, void* arg) { DCHECK(root != NULL); DCHECK(arg != NULL); MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg); - mark_sweep->MarkObject0(root, true); + mark_sweep->MarkObjectNonNull(root, true); } void MarkSweep::VerifyRootCallback(const Object* root, void* arg, size_t vreg, @@ -201,15 +253,15 @@ void MarkSweep::VerifyRoots() { // Marks all objects in the root set. void MarkSweep::MarkRoots() { - Runtime::Current()->VisitNonConcurrentRoots(MarkObjectVisitor, this); + Runtime::Current()->VisitNonConcurrentRoots(MarkObjectCallback, this); } void MarkSweep::MarkNonThreadRoots() { - Runtime::Current()->VisitNonThreadRoots(MarkObjectVisitor, this); + Runtime::Current()->VisitNonThreadRoots(MarkObjectCallback, this); } void MarkSweep::MarkConcurrentRoots() { - Runtime::Current()->VisitConcurrentRoots(MarkObjectVisitor, this); + Runtime::Current()->VisitConcurrentRoots(MarkObjectCallback, this); } class CheckObjectVisitor { @@ -259,26 +311,26 @@ void MarkSweep::BindLiveToMarkBitmap(ContinuousSpace* space) { alloc_space->mark_bitmap_.reset(live_bitmap); } -class ScanImageRootVisitor { +class ScanObjectVisitor { public: - ScanImageRootVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) { + ScanObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) { + } - void operator ()(const Object* root) const + void operator ()(const Object* obj) const EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - DCHECK(root != NULL); - mark_sweep_->ScanObject(root); + mark_sweep_->ScanObject(obj); } private: MarkSweep* const mark_sweep_; }; -void MarkSweep::ScanGrayObjects(bool update_finger) { +void MarkSweep::ScanGrayObjects() { const Spaces& spaces = heap_->GetSpaces(); CardTable* card_table = heap_->GetCardTable(); - ScanImageRootVisitor image_root_visitor(this); + ScanObjectVisitor visitor(this); SetFingerVisitor finger_visitor(this); // TODO: C++ 0x auto for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) { @@ -287,11 +339,7 @@ void MarkSweep::ScanGrayObjects(bool update_finger) { byte* end = space->End(); // Image spaces are handled properly since live == marked for them. 
SpaceBitmap* mark_bitmap = space->GetMarkBitmap(); - if (update_finger) { - card_table->Scan(mark_bitmap, begin, end, image_root_visitor, finger_visitor); - } else { - card_table->Scan(mark_bitmap, begin, end, image_root_visitor, IdentityFunctor()); - } + card_table->Scan(mark_bitmap, begin, end, visitor, IdentityFunctor()); } } @@ -330,22 +378,6 @@ void MarkSweep::VerifyImageRoots() { } } -class ScanObjectVisitor { - public: - ScanObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) { - - } - - void operator ()(const Object* obj) const - EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { - mark_sweep_->ScanObject(obj); - } - - private: - MarkSweep* const mark_sweep_; -}; - // Populates the mark stack based on the set of marked objects and // recursively marks until the mark stack is emptied. void MarkSweep::RecursiveMark(bool partial, TimingLogger& timings) { @@ -358,12 +390,11 @@ void MarkSweep::RecursiveMark(bool partial, TimingLogger& timings) { CHECK(cleared_reference_list_ == NULL); const Spaces& spaces = heap_->GetSpaces(); - SetFingerVisitor set_finger_visitor(this); ScanObjectVisitor scan_visitor(this); for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) { ContinuousSpace* space = *it; - if (space->GetGcRetentionPolicy() == kGcRetentionPolicyAlwaysCollect || + if ((!kDisableFinger && space->GetGcRetentionPolicy() == kGcRetentionPolicyAlwaysCollect) || (!partial && space->GetGcRetentionPolicy() == kGcRetentionPolicyFullCollect) ) { current_mark_bitmap_ = space->GetMarkBitmap(); @@ -386,7 +417,7 @@ void MarkSweep::RecursiveMark(bool partial, TimingLogger& timings) { void MarkSweep::RecursiveMarkCards(CardTable* card_table, const std::vector<byte*>& cards, TimingLogger& timings) { - ScanImageRootVisitor image_root_visitor(this); + ScanObjectVisitor image_root_visitor(this); SetFingerVisitor finger_visitor(this); const size_t card_count = cards.size(); SpaceBitmap* active_bitmap = NULL; @@ -399,14 +430,16 @@ void MarkSweep::RecursiveMarkCards(CardTable* card_table, const std::vector<byte } if (active_bitmap == NULL || !active_bitmap->HasAddress(start_obj)) { active_bitmap = heap_->GetMarkBitmap()->GetSpaceBitmap(start_obj); -#ifndef NDEBUG - if (active_bitmap == NULL) { + if (kIsDebugBuild && active_bitmap == NULL) { GetHeap()->DumpSpaces(); LOG(FATAL) << "Object " << reinterpret_cast<const void*>(start_obj); } -#endif } - active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, finger_visitor); + if (kDisableFinger) { + active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, IdentityFunctor()); + } else { + active_bitmap->VisitMarkedRange(begin, end, image_root_visitor, finger_visitor); + } } timings.AddSplit("RecursiveMarkCards"); ProcessMarkStack(); @@ -419,8 +452,8 @@ bool MarkSweep::IsMarkedCallback(const Object* object, void* arg) { !reinterpret_cast<MarkSweep*>(arg)->GetHeap()->GetLiveBitmap()->Test(object); } -void MarkSweep::RecursiveMarkDirtyObjects(bool update_finger) { - ScanGrayObjects(update_finger); +void MarkSweep::RecursiveMarkDirtyObjects() { + ScanGrayObjects(); ProcessMarkStack(); } @@ -539,7 +572,7 @@ class CheckpointMarkThreadRoots : public Closure { DCHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc) << thread->GetState(); WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); - thread->VisitRoots(MarkSweep::MarkObjectVisitor, mark_sweep_); + thread->VisitRoots(MarkSweep::MarkObjectCallback, mark_sweep_); 
mark_sweep_->GetBarrier().Pass(self); } @@ -561,8 +594,6 @@ void MarkSweep::MarkRootsCheckpoint() { } void MarkSweep::SweepCallback(size_t num_ptrs, Object** ptrs, void* arg) { - size_t freed_objects = num_ptrs; - size_t freed_bytes = 0; SweepCallbackContext* context = static_cast<SweepCallbackContext*>(arg); MarkSweep* mark_sweep = context->mark_sweep; Heap* heap = mark_sweep->GetHeap(); @@ -572,17 +603,9 @@ void MarkSweep::SweepCallback(size_t num_ptrs, Object** ptrs, void* arg) { // Use a bulk free, that merges consecutive objects before freeing or free per object? // Documentation suggests better free performance with merging, but this may be at the expensive // of allocation. - // TODO: investigate performance - static const bool kUseFreeList = true; - if (kUseFreeList) { - // AllocSpace::FreeList clears the value in ptrs, so perform after clearing the live bit - freed_bytes += space->FreeList(self, num_ptrs, ptrs); - } else { - for (size_t i = 0; i < num_ptrs; ++i) { - freed_bytes += space->Free(self, ptrs[i]); - } - } - + size_t freed_objects = num_ptrs; + // AllocSpace::FreeList clears the value in ptrs, so perform after clearing the live bit + size_t freed_bytes = space->FreeList(self, num_ptrs, ptrs); heap->RecordFree(freed_objects, freed_bytes); mark_sweep->freed_objects_ += freed_objects; mark_sweep->freed_bytes_ += freed_bytes; @@ -606,7 +629,6 @@ void MarkSweep::SweepArray(TimingLogger& logger, ObjectStack* allocations, bool // If we don't swap bitmaps then newly allocated Weaks go into the live bitmap but not mark // bitmap, resulting in occasional frees of Weaks which are still in use. - // TODO: Fix when sweeping weaks works properly with mutators unpaused + allocation list. SweepSystemWeaksArray(allocations); logger.AddSplit("SweepSystemWeaks"); @@ -656,12 +678,13 @@ void MarkSweep::SweepArray(TimingLogger& logger, ObjectStack* allocations, bool logger.AddSplit("Reset stack"); } -void MarkSweep::Sweep(bool partial, bool swap_bitmaps) { +void MarkSweep::Sweep(TimingLogger& timings, bool partial, bool swap_bitmaps) { DCHECK(mark_stack_->IsEmpty()); // If we don't swap bitmaps then newly allocated Weaks go into the live bitmap but not mark // bitmap, resulting in occasional frees of Weaks which are still in use. SweepSystemWeaks(); + timings.AddSplit("SweepSystemWeaks"); const Spaces& spaces = heap_->GetSpaces(); SweepCallbackContext scc; @@ -693,6 +716,7 @@ void MarkSweep::Sweep(bool partial, bool swap_bitmaps) { } } } + timings.AddSplit("Sweep"); } void MarkSweep::SweepLargeObjects(bool swap_bitmaps) { @@ -721,53 +745,6 @@ void MarkSweep::SweepLargeObjects(bool swap_bitmaps) { GetHeap()->RecordFree(freed_objects, freed_bytes); } -// Scans instance fields. -inline void MarkSweep::ScanInstanceFields(const Object* obj) { - DCHECK(obj != NULL); - Class* klass = obj->GetClass(); - DCHECK(klass != NULL); - ScanFields(obj, klass->GetReferenceInstanceOffsets(), false); -} - -// Scans static storage on a Class. -inline void MarkSweep::ScanStaticFields(const Class* klass) { - DCHECK(klass != NULL); - ScanFields(klass, klass->GetReferenceStaticOffsets(), true); -} - -inline void MarkSweep::ScanFields(const Object* obj, uint32_t ref_offsets, bool is_static) { - if (ref_offsets != CLASS_WALK_SUPER) { - // Found a reference offset bitmap. Mark the specified offsets. 
- while (ref_offsets != 0) { - const size_t right_shift = CLZ(ref_offsets); - MemberOffset byte_offset = CLASS_OFFSET_FROM_CLZ(right_shift); - const Object* ref = obj->GetFieldObject<const Object*>(byte_offset, false); - MarkObject(ref); - ref_offsets ^= CLASS_HIGH_BIT >> right_shift; - } - } else { - // There is no reference offset bitmap. In the non-static case, - // walk up the class inheritance hierarchy and find reference - // offsets the hard way. In the static case, just consider this - // class. - for (const Class* klass = is_static ? obj->AsClass() : obj->GetClass(); - klass != NULL; - klass = is_static ? NULL : klass->GetSuperClass()) { - size_t num_reference_fields = (is_static - ? klass->NumReferenceStaticFields() - : klass->NumReferenceInstanceFields()); - for (size_t i = 0; i < num_reference_fields; ++i) { - Field* field = (is_static - ? klass->GetStaticField(i) - : klass->GetInstanceField(i)); - MemberOffset field_offset = field->GetOffset(); - const Object* ref = obj->GetFieldObject<const Object*>(field_offset, false); - MarkObject(ref); - } - } - } -} - void MarkSweep::CheckReference(const Object* obj, const Object* ref, MemberOffset offset, bool is_static) { const Spaces& spaces = heap_->GetSpaces(); // TODO: C++0x auto @@ -812,32 +789,6 @@ void MarkSweep::CheckReference(const Object* obj, const Object* ref, MemberOffse } } -// Scans the header, static field references, and interface pointers -// of a class object. -inline void MarkSweep::ScanClass(const Object* obj) { -#ifndef NDEBUG - ++class_count_; -#endif - ScanInstanceFields(obj); - ScanStaticFields(obj->AsClass()); -} - -// Scans the header of all array objects. If the array object is -// specialized to a reference type, scans the array data as well. -inline void MarkSweep::ScanArray(const Object* obj) { -#ifndef NDEBUG - ++array_count_; -#endif - MarkObject(obj->GetClass()); - if (obj->IsObjectArray()) { - const ObjectArray<Object>* array = obj->AsObjectArray<Object>(); - for (int32_t i = 0; i < array->GetLength(); ++i) { - const Object* element = array->GetWithoutChecks(i); - MarkObject(element); - } - } -} - // Process the "referent" field in a java.lang.ref.Reference. If the // referent has not yet been marked, put it on the appropriate list in // the gcHeap for later processing. @@ -860,49 +811,205 @@ void MarkSweep::DelayReferenceReferent(Object* obj) { list = &phantom_reference_list_; } DCHECK(list != NULL) << PrettyClass(klass) << " " << std::hex << klass->GetAccessFlags(); + // TODO: One lock per list? heap_->EnqueuePendingReference(obj, list); } } -// Scans the header and field references of a data object. If the -// scanned object is a reference subclass, it is scheduled for later -// processing. -inline void MarkSweep::ScanOther(const Object* obj) { -#ifndef NDEBUG - ++other_count_; -#endif - ScanInstanceFields(obj); - if (obj->GetClass()->IsReferenceClass()) { - DelayReferenceReferent(const_cast<Object*>(obj)); - } -} - void MarkSweep::ScanRoot(const Object* obj) { ScanObject(obj); } +class MarkObjectVisitor { + public: + MarkObjectVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) { + } + + void operator ()(const Object* /* obj */, const Object* ref, const MemberOffset& /* offset */, + bool /* is_static */) const + EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { + mark_sweep_->MarkObject(ref); + } + + private: + MarkSweep* const mark_sweep_; +}; + // Scans an object reference. 
Determines the type of the reference // and dispatches to a specialized scanning routine. void MarkSweep::ScanObject(const Object* obj) { - DCHECK(obj != NULL); - DCHECK(obj->GetClass() != NULL); -#ifndef NDEBUG - if (!IsMarked(obj)) { - heap_->DumpSpaces(); - LOG(FATAL) << "Scanning unmarked object " << reinterpret_cast<const void*>(obj); - } -#endif - if (obj->IsClass()) { - ScanClass(obj); - } else if (obj->IsArrayInstance()) { - ScanArray(obj); - } else { - ScanOther(obj); + MarkObjectVisitor visitor(this); + ScanObjectVisit(obj, visitor); +} + +class MarkStackChunk : public Task { +public: + MarkStackChunk(ThreadPool* thread_pool, MarkSweep* mark_sweep, Object** begin, Object** end) + : mark_sweep_(mark_sweep), + thread_pool_(thread_pool), + index_(0), + length_(0), + output_(NULL) { + length_ = end - begin; + if (begin != end) { + // Cost not significant since we only do this for the initial set of mark stack chunks. + memcpy(data_, begin, length_ * sizeof(*begin)); + } + if (kCountTasks) { + ++mark_sweep_->work_chunks_created_; + } + } + + ~MarkStackChunk() { + DCHECK(output_ == NULL || output_->length_ == 0); + DCHECK_GE(index_, length_); + delete output_; + if (kCountTasks) { + ++mark_sweep_->work_chunks_deleted_; + } + } + + MarkSweep* const mark_sweep_; + ThreadPool* const thread_pool_; + static const size_t max_size = 1 * KB; + // Index of which object we are scanning. Only needs to be atomic if we are doing work stealing. + size_t index_; + // Input / output mark stack. We add newly marked references to data_ until length reaches + // max_size. This is an optimization so that less tasks are created. + // TODO: Investigate using a bounded buffer FIFO. + Object* data_[max_size]; + // How many elements in data_ we need to scan. + size_t length_; + // Output block, newly marked references get added to the ouput block so that another thread can + // scan them. + MarkStackChunk* output_; + + class MarkObjectParallelVisitor { + public: + MarkObjectParallelVisitor(MarkStackChunk* chunk_task) : chunk_task_(chunk_task) { + + } + + void operator ()(const Object* /* obj */, const Object* ref, + const MemberOffset& /* offset */, bool /* is_static */) const { + if (ref != NULL && chunk_task_->mark_sweep_->MarkObjectParallel(ref)) { + chunk_task_->MarkStackPush(ref); + } + } + + private: + MarkStackChunk* const chunk_task_; + }; + + // Push an object into the block. + // Don't need to use atomic ++ since we only one thread is writing to an output block at any + // given time. + void Push(Object* obj) { + data_[length_++] = obj; } + + void MarkStackPush(const Object* obj) { + if (static_cast<size_t>(length_) < max_size) { + Push(const_cast<Object*>(obj)); + } else { + // Internal buffer is full, push to a new buffer instead. + if (UNLIKELY(output_ == NULL)) { + AllocateOutputChunk(); + } else if (UNLIKELY(static_cast<size_t>(output_->length_) == max_size)) { + // Output block is full, queue it up for processing and obtain a new block. 
+ EnqueueOutput(); + AllocateOutputChunk(); + } + output_->Push(const_cast<Object*>(obj)); + } + } + + void ScanObject(Object* obj) { + mark_sweep_->ScanObjectVisit(obj, MarkObjectParallelVisitor(this)); + } + + void EnqueueOutput() { + if (output_ != NULL) { + uint64_t start = 0; + if (kMeasureOverhead) { + start = NanoTime(); + } + thread_pool_->AddTask(Thread::Current(), output_); + output_ = NULL; + if (kMeasureOverhead) { + mark_sweep_->overhead_time_ += NanoTime() - start; + } + } + } + + void AllocateOutputChunk() { + uint64_t start = 0; + if (kMeasureOverhead) { + start = NanoTime(); + } + output_ = new MarkStackChunk(thread_pool_, mark_sweep_, NULL, NULL); + if (kMeasureOverhead) { + mark_sweep_->overhead_time_ += NanoTime() - start; + } + } + + void Finalize() { + EnqueueOutput(); + delete this; + } + + // Scans all of the objects + virtual void Run(Thread* self) { + int index; + while ((index = index_++) < length_) { + static const size_t prefetch_look_ahead = 4; + if (kUseMarkStackPrefetch) { + if (index + prefetch_look_ahead < length_) { + __builtin_prefetch(&data_[index + prefetch_look_ahead]); + } else { + __builtin_prefetch(&data_[length_ - 1]); + } + } + Object* obj = data_[index]; + DCHECK(obj != NULL); + ScanObject(obj); + } + } +}; + +void MarkSweep::ProcessMarkStackParallel() { + CHECK(kDisableFinger) << "parallel mark stack processing cannot work when finger is enabled"; + Thread* self = Thread::Current(); + ThreadPool* thread_pool = GetHeap()->GetThreadPool(); + // Split the current mark stack up into work tasks. + const size_t num_threads = thread_pool->GetThreadCount(); + const size_t stack_size = mark_stack_->Size(); + const size_t chunk_size = + std::min((stack_size + num_threads - 1) / num_threads, + static_cast<size_t>(MarkStackChunk::max_size)); + size_t index = 0; + for (size_t i = 0; i < num_threads || index < stack_size; ++i) { + Object** begin = &mark_stack_->Begin()[std::min(stack_size, index)]; + Object** end = &mark_stack_->Begin()[std::min(stack_size, index + chunk_size)]; + index += chunk_size; + thread_pool->AddTask(self, new MarkStackChunk(thread_pool, this, begin, end)); + } + thread_pool->StartWorkers(self); + mark_stack_->Reset(); + thread_pool->Wait(self, true); + //LOG(INFO) << "Idle wait time " << PrettyDuration(thread_pool->GetWaitTime()); + CHECK_EQ(work_chunks_created_, work_chunks_deleted_) << " some of the work chunks were leaked"; } // Scan anything that's on the mark stack. 
void MarkSweep::ProcessMarkStack() { + ThreadPool* thread_pool = GetHeap()->GetThreadPool(); + if (kParallelMarkStack && thread_pool != NULL && thread_pool->GetThreadCount() > 0) { + ProcessMarkStackParallel(); + return; + } + if (kUseMarkStackPrefetch) { const size_t fifo_size = 4; const size_t fifo_mask = fifo_size - 1; @@ -1096,13 +1203,31 @@ void MarkSweep::UnBindBitmaps() { } MarkSweep::~MarkSweep() { -#ifndef NDEBUG - VLOG(heap) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_ << " other=" << other_count_; -#endif + if (class_count_ != 0 || array_count_ != 0 || other_count_ != 0) { + LOG(INFO) << "MarkSweep scanned classes=" << class_count_ << " arrays=" << array_count_ + << " other=" << other_count_; + } + + if (kCountTasks) { + LOG(INFO) << "Total number of work chunks allocated: " << work_chunks_created_; + } + + if (kMeasureOverhead) { + LOG(INFO) << "Overhead time " << PrettyDuration(overhead_time_); + } + + if (kProfileLargeObjects) { + LOG(INFO) << "Large objects tested " << large_object_test_ << " marked " << large_object_mark_; + } + + if (kCountClassesMarked) { + LOG(INFO) << "Classes marked " << classes_marked_; + } + // Ensure that the mark stack is empty. CHECK(mark_stack_->IsEmpty()); - // Clear all of the alloc spaces' mark bitmaps. + // Clear all of the spaces' mark bitmaps. const Spaces& spaces = heap_->GetSpaces(); // TODO: C++0x auto for (Spaces::const_iterator it = spaces.begin(); it != spaces.end(); ++it) { diff --git a/src/gc/mark_sweep.h b/src/gc/mark_sweep.h index d64439f4e3..f510943254 100644 --- a/src/gc/mark_sweep.h +++ b/src/gc/mark_sweep.h @@ -35,6 +35,7 @@ class ModUnionVisitor; class ModUnionTableBitmap; class Object; class TimingLogger; +class MarkStackChunk; class MarkSweep { public: @@ -80,9 +81,8 @@ class MarkSweep { void UnBindBitmaps() EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); - // Builds a mark stack with objects on dirty cards and recursively mark - // until it empties. - void RecursiveMarkDirtyObjects(bool update_finger) + // Builds a mark stack with objects on dirty cards and recursively mark until it empties. + void RecursiveMarkDirtyObjects() EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); @@ -111,7 +111,7 @@ class MarkSweep { } // Sweeps unmarked objects to complete the garbage collection. - void Sweep(bool partial, bool swap_bitmaps) + void Sweep(TimingLogger& timings, bool partial, bool swap_bitmaps) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); // Sweeps unmarked objects to complete the garbage collection. 
@@ -136,6 +136,41 @@ class MarkSweep { EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + template <typename MarkVisitor> + void ScanObjectVisit(const Object* obj, const MarkVisitor& visitor) + NO_THREAD_SAFETY_ANALYSIS { + DCHECK(obj != NULL); + if (kIsDebugBuild && !IsMarked(obj)) { + heap_->DumpSpaces(); + LOG(FATAL) << "Scanning unmarked object " << obj; + } + Class* klass = obj->GetClass(); + DCHECK(klass != NULL); + if (klass == java_lang_Class_) { + DCHECK_EQ(klass->GetClass(), java_lang_Class_); + if (kCountScannedTypes) { + ++class_count_; + } + VisitClassReferences(klass, obj, visitor); + } else if (klass->IsArrayClass()) { + if (kCountScannedTypes) { + ++array_count_; + } + visitor(obj, klass, Object::ClassOffset(), false); + if (klass->IsObjectArrayClass()) { + VisitObjectArrayReferences(obj->AsObjectArray<Object>(), visitor); + } + } else { + if (kCountScannedTypes) { + ++other_count_; + } + VisitOtherReferences(klass, obj, visitor); + if (klass->IsReferenceClass()) { + DelayReferenceReferent(const_cast<Object*>(obj)); + } + } + } + void SetFinger(Object* new_finger) { finger_ = new_finger; } @@ -181,16 +216,29 @@ class MarkSweep { Locks::mutator_lock_) { DCHECK(obj != NULL); DCHECK(obj->GetClass() != NULL); - if (obj->IsClass()) { - VisitClassReferences(obj, visitor); - } else if (obj->IsArrayInstance()) { - VisitArrayReferences(obj, visitor); + + Class* klass = obj->GetClass(); + DCHECK(klass != NULL); + if (klass == Class::GetJavaLangClass()) { + DCHECK_EQ(klass->GetClass(), Class::GetJavaLangClass()); + VisitClassReferences(klass, obj, visitor); } else { - VisitOtherReferences(obj, visitor); + if (klass->IsArrayClass()) { + visitor(obj, klass, Object::ClassOffset(), false); + if (klass->IsObjectArrayClass()) { + VisitObjectArrayReferences(obj->AsObjectArray<Object>(), visitor); + } + } else { + VisitOtherReferences(klass, obj, visitor); + } } } - static void MarkObjectVisitor(const Object* root, void* arg) + static void MarkObjectCallback(const Object* root, void* arg) + EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); + + // Marks an object. + void MarkObject(const Object* obj) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); Barrier& GetBarrier(); @@ -216,14 +264,15 @@ class MarkSweep { EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - // Marks an object. - void MarkObject(const Object* obj) + void MarkObjectNonNull(const Object* obj, bool check_finger) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); - // Yuck. - void MarkObject0(const Object* obj, bool check_finger) + bool MarkLargeObject(const Object* obj) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); + // Returns true if we need to add obj to a mark stack. + bool MarkObjectParallel(const Object* obj) NO_THREAD_SAFETY_ANALYSIS; + static void ScanBitmapCallback(Object* obj, void* finger, void* arg) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); @@ -241,11 +290,6 @@ class MarkSweep { void CheckObject(const Object* obj) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_); - // Grays references in instance fields. - void ScanInstanceFields(const Object* obj) - EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - // Verify the roots of the heap and print out information related to any invalid roots. // Called in MarkObject, so may we may not hold the mutator lock. 
void VerifyRoots() @@ -258,32 +302,23 @@ class MarkSweep { NO_THREAD_SAFETY_ANALYSIS; template <typename Visitor> - static void VisitInstanceFieldsReferences(const Object* obj, const Visitor& visitor) + static void VisitInstanceFieldsReferences(const Class* klass, const Object* obj, + const Visitor& visitor) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) { DCHECK(obj != NULL); - Class* klass = obj->GetClass(); DCHECK(klass != NULL); VisitFieldsReferences(obj, klass->GetReferenceInstanceOffsets(), false, visitor); } - // Blackens a class object. - void ScanClass(const Object* obj) - EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - - + // Visit the header, static field references, and interface pointers of a class object. template <typename Visitor> - static void VisitClassReferences(const Object* obj, const Visitor& visitor) + static void VisitClassReferences(const Class* klass, const Object* obj, + const Visitor& visitor) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) { - VisitInstanceFieldsReferences(obj, visitor); + VisitInstanceFieldsReferences(klass, obj, visitor); VisitStaticFieldsReferences(obj->AsClass(), visitor); } - // Grays references in static fields. - void ScanStaticFields(const Class* klass) - EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - template <typename Visitor> static void VisitStaticFieldsReferences(const Class* klass, const Visitor& visitor) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) { @@ -291,11 +326,6 @@ class MarkSweep { VisitFieldsReferences(klass, klass->GetReferenceStaticOffsets(), true, visitor); } - // Used by ScanInstanceFields and ScanStaticFields - void ScanFields(const Object* obj, uint32_t ref_offsets, bool is_static) - EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - template <typename Visitor> static void VisitFieldsReferences(const Object* obj, uint32_t ref_offsets, bool is_static, const Visitor& visitor) @@ -333,37 +363,30 @@ class MarkSweep { } } - // Grays references in an array. - void ScanArray(const Object* obj) - EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - + // Visit all of the references in an object array. 
template <typename Visitor> - static void VisitArrayReferences(const Object* obj, const Visitor& visitor) + static void VisitObjectArrayReferences(const ObjectArray<Object>* array, + const Visitor& visitor) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) { - visitor(obj, obj->GetClass(), Object::ClassOffset(), false); - if (obj->IsObjectArray()) { - const ObjectArray<Object>* array = obj->AsObjectArray<Object>(); - for (int32_t i = 0; i < array->GetLength(); ++i) { - const Object* element = array->GetWithoutChecks(i); - size_t width = sizeof(Object*); - visitor(obj, element, MemberOffset(i * width + Array::DataOffset(width).Int32Value()), false); - } + const int32_t length = array->GetLength(); + for (int32_t i = 0; i < length; ++i) { + const Object* element = array->GetWithoutChecks(i); + const size_t width = sizeof(Object*); + MemberOffset offset = MemberOffset(i * width + Array::DataOffset(width).Int32Value()); + visitor(array, element, offset, false); } } - void ScanOther(const Object* obj) - EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) - SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); - + // Visits the header and field references of a data object. template <typename Visitor> - static void VisitOtherReferences(const Object* obj, const Visitor& visitor) + static void VisitOtherReferences(const Class* klass, const Object* obj, + const Visitor& visitor) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_, Locks::mutator_lock_) { - return VisitInstanceFieldsReferences(obj, visitor); + return VisitInstanceFieldsReferences(klass, obj, visitor); } // Blackens objects grayed during a garbage collection. - void ScanGrayObjects(bool update_finger) + void ScanGrayObjects() EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); // Schedules an unmarked object for reference processing. @@ -375,6 +398,10 @@ class MarkSweep { EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + void ProcessMarkStackParallel() + EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) + SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); + void EnqueueFinalizerReferences(Object** ref) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); @@ -396,9 +423,15 @@ class MarkSweep { void SweepJniWeakGlobals(Heap::IsMarkedTester is_marked, void* arg) SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_); + // Whether or not we count how many of each type of object were scanned. + static const bool kCountScannedTypes = false; + // Current space, we check this space first to avoid searching for the appropriate space for an object. SpaceBitmap* current_mark_bitmap_; + // Cache java.lang.Class for optimization. 
+ Class* java_lang_Class_; + ObjectStack* mark_stack_; Heap* heap_; @@ -410,23 +443,25 @@ class MarkSweep { Object* immune_end_; Object* soft_reference_list_; - Object* weak_reference_list_; - Object* finalizer_reference_list_; - Object* phantom_reference_list_; - Object* cleared_reference_list_; - size_t freed_bytes_; - size_t freed_objects_; - - size_t class_count_; - size_t array_count_; - size_t other_count_; + AtomicInteger freed_bytes_; + AtomicInteger freed_objects_; + AtomicInteger class_count_; + AtomicInteger array_count_; + AtomicInteger other_count_; + AtomicInteger large_object_test_; + AtomicInteger large_object_mark_; + AtomicInteger classes_marked_; + AtomicInteger overhead_time_; + AtomicInteger work_chunks_created_; + AtomicInteger work_chunks_deleted_; UniquePtr<Barrier> gc_barrier_; + Mutex large_object_lock_; friend class AddIfReachesAllocSpaceVisitor; // Used by mod-union table. friend class CheckBitmapVisitor; @@ -443,6 +478,8 @@ class MarkSweep { friend class ModUnionScanImageRootVisitor; friend class ScanBitmapVisitor; friend class ScanImageRootVisitor; + friend class MarkStackChunk; + friend class FifoMarkStackChunk; DISALLOW_COPY_AND_ASSIGN(MarkSweep); }; diff --git a/src/gc/space_bitmap.h b/src/gc/space_bitmap.h index 885491fac0..25fd538f22 100644 --- a/src/gc/space_bitmap.h +++ b/src/gc/space_bitmap.h @@ -22,6 +22,8 @@ #include <stdint.h> #include <vector> +#include "cutils/atomic.h" +#include "cutils/atomic-inline.h" #include "UniquePtr.h" #include "globals.h" #include "logging.h" @@ -64,12 +66,32 @@ class SpaceBitmap { return static_cast<uintptr_t>(kWordHighBitMask) >> ((offset_ / kAlignment) % kBitsPerWord); } - inline void Set(const Object* obj) { - Modify(obj, true); + inline bool Set(const Object* obj) { + return Modify(obj, true); } - inline void Clear(const Object* obj) { - Modify(obj, false); + inline bool Clear(const Object* obj) { + return Modify(obj, false); + } + + // Returns true if the object was previously marked. + inline bool AtomicTestAndSet(const Object* obj) { + uintptr_t addr = reinterpret_cast<uintptr_t>(obj); + DCHECK_GE(addr, heap_begin_); + const uintptr_t offset = addr - heap_begin_; + const size_t index = OffsetToIndex(offset); + const word mask = OffsetToMask(offset); + word* const address = &bitmap_begin_[index]; + DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_; + word old_word; + do { + old_word = *address; + // Fast path: The bit is already set. 
+ if ((old_word & mask) != 0) { + return true; + } + } while (UNLIKELY(android_atomic_cas(old_word, old_word | mask, address) != 0)); + return false; } void Clear(); @@ -229,6 +251,12 @@ class SpaceBitmap { std::string GetName() const; void SetName(const std::string& name); + const void* GetObjectWordAddress(const Object* obj) const { + uintptr_t addr = reinterpret_cast<uintptr_t>(obj); + const uintptr_t offset = addr - heap_begin_; + const size_t index = OffsetToIndex(offset); + return &bitmap_begin_[index]; + } private: // TODO: heap_end_ is initialized so that the heap bitmap is empty, this doesn't require the -1, // however, we document that this is expected on heap_end_ @@ -237,18 +265,21 @@ class SpaceBitmap { heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)), name_(name) {} - inline void Modify(const Object* obj, bool do_set) { + inline bool Modify(const Object* obj, bool do_set) { uintptr_t addr = reinterpret_cast<uintptr_t>(obj); DCHECK_GE(addr, heap_begin_); const uintptr_t offset = addr - heap_begin_; const size_t index = OffsetToIndex(offset); const word mask = OffsetToMask(offset); DCHECK_LT(index, bitmap_size_ / kWordSize) << " bitmap_size_ = " << bitmap_size_; + word* address = &bitmap_begin_[index]; + word old_word = *address; if (do_set) { - bitmap_begin_[index] |= mask; + *address = old_word | mask; } else { - bitmap_begin_[index] &= ~mask; + *address = old_word & ~mask; } + return (old_word & mask) != 0; } // Backing storage for bitmap. diff --git a/src/heap.cc b/src/heap.cc index d12d20e5b8..b4cf4a9af5 100644 --- a/src/heap.cc +++ b/src/heap.cc @@ -45,6 +45,7 @@ namespace art { +static const bool kDumpGcPerformanceOnShutdown = false; const double Heap::kDefaultTargetUtilization = 0.5; static bool GenerateImage(const std::string& image_file_name) { @@ -282,6 +283,11 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max gc_complete_cond_.reset(new ConditionVariable("GC complete condition variable", *gc_complete_lock_)); + // Create the reference queue lock, this is required so for parrallel object scanning in the GC. + reference_queue_lock_.reset(new Mutex("reference queue lock")); + + CHECK(max_allowed_footprint_ != 0); + // Set up the cumulative timing loggers. for (size_t i = static_cast<size_t>(kGcTypeSticky); i < static_cast<size_t>(kGcTypeMax); ++i) { @@ -296,6 +302,17 @@ Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max } } +void Heap::CreateThreadPool() { + // TODO: Make sysconf(_SC_NPROCESSORS_CONF) be a helper function? + // Use the number of processors - 1 since the thread doing the GC does work while its waiting for + // workers to complete. + thread_pool_.reset(new ThreadPool(sysconf(_SC_NPROCESSORS_CONF) - 1)); +} + +void Heap::DeleteThreadPool() { + thread_pool_.reset(NULL); +} + // Sort spaces based on begin address struct SpaceSorter { bool operator ()(const ContinuousSpace* a, const ContinuousSpace* b) const { @@ -369,6 +386,10 @@ void Heap::DumpGcPerformanceInfo() { } Heap::~Heap() { + if (kDumpGcPerformanceOnShutdown) { + DumpGcPerformanceInfo(); + } + // If we don't reset then the mark stack complains in it's destructor. allocation_stack_->Reset(); live_stack_->Reset(); @@ -947,12 +968,11 @@ GcType Heap::CollectGarbageInternal(GcType gc_type, GcCause gc_cause, bool clear // We need to do partial GCs every now and then to avoid the heap growing too much and // fragmenting. 
if (gc_type == kGcTypeSticky && ++sticky_gc_count_ > partial_gc_frequency_) { - gc_type = kGcTypePartial; + gc_type = have_zygote_space_ ? kGcTypePartial : kGcTypeFull; } if (gc_type != kGcTypeSticky) { sticky_gc_count_ = 0; } - if (concurrent_gc_) { CollectGarbageConcurrentMarkSweepPlan(self, gc_type, gc_cause, clear_soft_references); } else { @@ -1049,9 +1069,6 @@ void Heap::CollectGarbageMarkSweepPlan(Thread* self, GcType gc_type, GcCause gc_ mark_sweep.MarkConcurrentRoots(); timings.AddSplit("MarkRoots"); - // Roots are marked on the bitmap and the mark_stack is empty. - DCHECK(mark_stack_->IsEmpty()); - UpdateAndMarkModUnion(&mark_sweep, timings, gc_type); if (gc_type != kGcTypeSticky) { @@ -1079,15 +1096,14 @@ void Heap::CollectGarbageMarkSweepPlan(Thread* self, GcType gc_type, GcCause gc_ mark_sweep.ProcessReferences(clear_soft_references); timings.AddSplit("ProcessReferences"); -#ifndef NDEBUG - // Verify that we only reach marked objects from the image space - mark_sweep.VerifyImageRoots(); - timings.AddSplit("VerifyImageRoots"); -#endif + if (kIsDebugBuild) { + // Verify that we only reach marked objects from the image space + mark_sweep.VerifyImageRoots(); + timings.AddSplit("VerifyImageRoots"); + } if (gc_type != kGcTypeSticky) { - mark_sweep.Sweep(gc_type == kGcTypePartial, false); - timings.AddSplit("Sweep"); + mark_sweep.Sweep(timings, gc_type == kGcTypePartial, false); mark_sweep.SweepLargeObjects(false); timings.AddSplit("SweepLargeObjects"); } else { @@ -1098,15 +1114,12 @@ void Heap::CollectGarbageMarkSweepPlan(Thread* self, GcType gc_type, GcCause gc_ // Unbind the live and mark bitmaps. mark_sweep.UnBindBitmaps(); - - const bool swap = true; - if (swap) { - if (gc_type == kGcTypeSticky) { - SwapLargeObjects(); - } else { - SwapBitmaps(gc_type); - } + if (gc_type == kGcTypeSticky) { + SwapLargeObjects(); + } else { + SwapBitmaps(gc_type); } + timings.AddSplit("SwapBitmaps"); if (verify_system_weaks_) { mark_sweep.VerifySystemWeaks(); @@ -1600,9 +1613,6 @@ void Heap::CollectGarbageConcurrentMarkSweepPlan(Thread* self, GcType gc_type, G ClearCards(timings); } - // Roots are marked on the bitmap and the mark_stack is empty. - DCHECK(mark_stack_->IsEmpty()); - if (verify_mod_union_table_) { ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_); zygote_mod_union_table_->Update(); @@ -1657,7 +1667,7 @@ void Heap::CollectGarbageConcurrentMarkSweepPlan(Thread* self, GcType gc_type, G timings.AddSplit("ReMarkRoots"); // Scan dirty objects, this is only required if we are not doing concurrent GC. - mark_sweep.RecursiveMarkDirtyObjects(false); + mark_sweep.RecursiveMarkDirtyObjects(); timings.AddSplit("RecursiveMarkDirtyObjects"); } @@ -1721,8 +1731,7 @@ void Heap::CollectGarbageConcurrentMarkSweepPlan(Thread* self, GcType gc_type, G // TODO: this lock shouldn't be necessary (it's why we did the bitmap flip above). if (gc_type != kGcTypeSticky) { WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); - mark_sweep.Sweep(gc_type == kGcTypePartial, false); - timings.AddSplit("Sweep"); + mark_sweep.Sweep(timings, gc_type == kGcTypePartial, false); mark_sweep.SweepLargeObjects(false); timings.AddSplit("SweepLargeObjects"); } else { @@ -1740,14 +1749,12 @@ void Heap::CollectGarbageConcurrentMarkSweepPlan(Thread* self, GcType gc_type, G // Swap the live and mark bitmaps for each space which we modified space. This is an // optimization that enables us to not clear live bits inside of the sweep. 
- const bool swap = true; - if (swap) { - if (gc_type == kGcTypeSticky) { - SwapLargeObjects(); - } else { - SwapBitmaps(gc_type); - } + if (gc_type == kGcTypeSticky) { + SwapLargeObjects(); + } else { + SwapBitmaps(gc_type); } + timings.AddSplit("SwapBitmaps"); } if (verify_system_weaks_) { @@ -1850,7 +1857,7 @@ void Heap::SetIdealFootprint(size_t max_allowed_footprint) { void Heap::GrowForUtilization() { // We know what our utilization is at this moment. // This doesn't actually resize any memory. It just lets the heap grow more when necessary. - size_t target_size = num_bytes_allocated_ / Heap::GetTargetHeapUtilization(); + size_t target_size = num_bytes_allocated_ / GetTargetHeapUtilization(); if (target_size > num_bytes_allocated_ + max_free_) { target_size = num_bytes_allocated_ + max_free_; } else if (target_size < num_bytes_allocated_ + min_free_) { @@ -1870,7 +1877,6 @@ void Heap::GrowForUtilization() { } void Heap::ClearGrowthLimit() { - WaitForConcurrentGcToComplete(Thread::Current()); alloc_space_->ClearGrowthLimit(); } @@ -1922,6 +1928,8 @@ void Heap::EnqueuePendingReference(Object* ref, Object** list) { DCHECK(ref != NULL); DCHECK(list != NULL); + // TODO: Remove this lock, use atomic stacks for storing references. + MutexLock mu(Thread::Current(), *reference_queue_lock_); if (*list == NULL) { ref->SetFieldObject(reference_pendingNext_offset_, ref, false); *list = ref; @@ -1937,6 +1945,9 @@ Object* Heap::DequeuePendingReference(Object** list) { DCHECK(*list != NULL); Object* head = (*list)->GetFieldObject<Object*>(reference_pendingNext_offset_, false); Object* ref; + + // TODO: Remove this lock, use atomic stacks for storing references. + MutexLock mu(Thread::Current(), *reference_queue_lock_); if (*list == head) { ref = *list; *list = NULL; diff --git a/src/heap.h b/src/heap.h index 8ed5881624..6c4c38bd7a 100644 --- a/src/heap.h +++ b/src/heap.h @@ -31,6 +31,7 @@ #include "offsets.h" #include "safe_map.h" #include "timing_logger.h" +#include "thread_pool.h" #define VERIFY_OBJECT_ENABLED 0 @@ -312,6 +313,13 @@ class Heap { // GC performance measuring void DumpGcPerformanceInfo(); + // Thread pool. + void CreateThreadPool(); + void DeleteThreadPool(); + ThreadPool* GetThreadPool() { + return thread_pool_.get(); + } + private: // Allocates uninitialized storage. Passing in a null space tries to place the object in the // large object space. @@ -408,6 +416,9 @@ class Heap { Mutex* gc_complete_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; UniquePtr<ConditionVariable> gc_complete_cond_ GUARDED_BY(gc_complete_lock_); + // Reference queue lock + UniquePtr<Mutex> reference_queue_lock_; + // True while the garbage collector is running. volatile bool is_gc_running_ GUARDED_BY(gc_complete_lock_); @@ -450,6 +461,9 @@ class Heap { const bool verify_post_gc_heap_; const bool verify_mod_union_table_; + // Parallel GC data structures. + UniquePtr<ThreadPool> thread_pool_; + // After how many GCs we force to do a partial GC instead of sticky mark bits GC. 
const size_t partial_gc_frequency_; diff --git a/src/mutex.cc b/src/mutex.cc index e2044d6395..1cdbc4d368 100644 --- a/src/mutex.cc +++ b/src/mutex.cc @@ -758,7 +758,6 @@ void ConditionVariable::Signal(Thread* self) { void ConditionVariable::Wait(Thread* self) { DCHECK(self == NULL || self == Thread::Current()); guard_.AssertExclusiveHeld(self); - guard_.CheckSafeToWait(self); unsigned int old_recursion_count = guard_.recursion_count_; #if ART_USE_FUTEXES int32_t cur_state = state_; @@ -794,7 +793,6 @@ void ConditionVariable::Wait(Thread* self) { void ConditionVariable::TimedWait(Thread* self, int64_t ms, int32_t ns) { DCHECK(self == NULL || self == Thread::Current()); guard_.AssertExclusiveHeld(self); - guard_.CheckSafeToWait(self); unsigned int old_recursion_count = guard_.recursion_count_; #if ART_USE_FUTEXES // Record the original end time so that if the futex call fails we can recompute the appropriate diff --git a/src/object.cc b/src/object.cc index 9a4588a74b..cebbb2a06d 100644 --- a/src/object.cc +++ b/src/object.cc @@ -733,6 +733,19 @@ void AbstractMethod::UnregisterNative(Thread* self) { RegisterNative(self, Runtime::Current()->GetJniDlsymLookupStub()->GetData()); } +Class* Class::java_lang_Class_ = NULL; + +void Class::SetClassClass(Class* java_lang_Class) { + CHECK(java_lang_Class_ == NULL) << java_lang_Class_ << " " << java_lang_Class; + CHECK(java_lang_Class != NULL); + java_lang_Class_ = java_lang_Class; +} + +void Class::ResetClass() { + CHECK(java_lang_Class_ != NULL); + java_lang_Class_ = NULL; +} + void Class::SetStatus(Status new_status) { CHECK(new_status > GetStatus() || new_status == kStatusError || !Runtime::Current()->IsStarted()) << PrettyClass(this) << " " << GetStatus() << " -> " << new_status; diff --git a/src/object.h b/src/object.h index 79df4e217d..efd456e766 100644 --- a/src/object.h +++ b/src/object.h @@ -1521,6 +1521,10 @@ class MANAGED Class : public StaticStorageBase { return !IsPrimitive() && !IsInterface() && !IsAbstract(); } + bool IsObjectArrayClass() const { + return GetComponentType() != NULL && !GetComponentType()->IsPrimitive(); + } + // Creates a raw object instance but does not invoke the default constructor. Object* AllocObject(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_); @@ -2008,6 +2012,15 @@ class MANAGED Class : public StaticStorageBase { SetField32(OFFSET_OF_OBJECT_MEMBER(Class, dex_type_idx_), type_idx, false); } + static Class* GetJavaLangClass() { + DCHECK(java_lang_Class_ != NULL); + return java_lang_Class_; + } + + // Can't call this SetClass or else gets called instead of Object::SetClass in places. + static void SetClassClass(Class* java_lang_Class); + static void ResetClass(); + private: void SetVerifyErrorClass(Class* klass) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) { CHECK(klass != NULL) << PrettyClass(this); @@ -2127,6 +2140,9 @@ class MANAGED Class : public StaticStorageBase { // Location of first static field. 
uint32_t fields_[0]; + // java.lang.Class + static Class* java_lang_Class_; + friend struct ClassOffsets; // for verifying offset information DISALLOW_IMPLICIT_CONSTRUCTORS(Class); }; diff --git a/src/runtime.cc b/src/runtime.cc index 79d1fb2604..3fa31237f6 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -115,6 +115,8 @@ Runtime::Runtime() } Runtime::~Runtime() { + heap_->DeleteThreadPool(); + Thread* self = Thread::Current(); { MutexLock mu(self, *Locks::runtime_shutdown_lock_); @@ -696,6 +698,9 @@ void Runtime::EndThreadBirth() EXCLUSIVE_LOCKS_REQUIRED(Locks::runtime_shutdown_ void Runtime::DidForkFromZygote() { is_zygote_ = false; + // Create the thread pool. + heap_->CreateThreadPool(); + StartSignalCatcher(); // Start the JDWP thread. If the command-line debugger flags specified "suspend=y", @@ -1030,7 +1035,9 @@ void Runtime::VisitNonConcurrentRoots(Heap::RootVisitor* visitor, void* arg) { } void Runtime::DirtyRoots() { + CHECK(intern_table_ != NULL); intern_table_->Dirty(); + CHECK(class_linker_ != NULL); class_linker_->Dirty(); } diff --git a/src/thread_pool.cc b/src/thread_pool.cc index ba531132f9..26c83d230d 100644 --- a/src/thread_pool.cc +++ b/src/thread_pool.cc @@ -1,3 +1,4 @@ +#include "casts.h" #include "runtime.h" #include "stl_util.h" #include "thread.h" @@ -24,9 +25,10 @@ ThreadPoolWorker::~ThreadPoolWorker() { void ThreadPoolWorker::Run() { Thread* self = Thread::Current(); - Closure* task = NULL; + Task* task = NULL; while ((task = thread_pool_->GetTask(self)) != NULL) { task->Run(self); + task->Finalize(); } } @@ -40,7 +42,7 @@ void* ThreadPoolWorker::Callback(void* arg) { return NULL; } -void ThreadPool::AddTask(Thread* self, Closure* task){ +void ThreadPool::AddTask(Thread* self, Task* task){ MutexLock mu(self, task_queue_lock_); tasks_.push_back(task); // If we have any waiters, signal one. @@ -49,14 +51,6 @@ void ThreadPool::AddTask(Thread* self, Closure* task){ } } -void ThreadPool::AddThread(size_t stack_size) { - threads_.push_back( - new ThreadPoolWorker( - this, - StringPrintf("Thread pool worker %d", static_cast<int>(GetThreadCount())), - stack_size)); -} - ThreadPool::ThreadPool(size_t num_threads) : task_queue_lock_("task queue lock"), task_queue_condition_("task queue condition", task_queue_lock_), @@ -65,7 +59,8 @@ ThreadPool::ThreadPool(size_t num_threads) shutting_down_(false), waiting_count_(0) { while (GetThreadCount() < num_threads) { - AddThread(ThreadPoolWorker::kDefaultStackSize); + const std::string name = StringPrintf("Thread pool worker %zu", GetThreadCount()); + threads_.push_back(new ThreadPoolWorker(this, name, ThreadPoolWorker::kDefaultStackSize)); } } @@ -75,9 +70,9 @@ ThreadPool::~ThreadPool() { MutexLock mu(self, task_queue_lock_); // Tell any remaining workers to shut down. shutting_down_ = true; - android_memory_barrier(); // Broadcast to everyone waiting. task_queue_condition_.Broadcast(self); + completion_condition_.Broadcast(self); } // Wait for the threads to finish. 
STLDeleteElements(&threads_); @@ -86,22 +81,21 @@ ThreadPool::~ThreadPool() { void ThreadPool::StartWorkers(Thread* self) { MutexLock mu(self, task_queue_lock_); started_ = true; - android_memory_barrier(); task_queue_condition_.Broadcast(self); + start_time_ = NanoTime(); + total_wait_time_ = 0; } void ThreadPool::StopWorkers(Thread* self) { MutexLock mu(self, task_queue_lock_); started_ = false; - android_memory_barrier(); } -Closure* ThreadPool::GetTask(Thread* self) { +Task* ThreadPool::GetTask(Thread* self) { MutexLock mu(self, task_queue_lock_); - while (!shutting_down_) { - if (started_ && !tasks_.empty()) { - Closure* task = tasks_.front(); - tasks_.pop_front(); + while (!IsShuttingDown()) { + Task* task = TryGetTaskLocked(self); + if (task != NULL) { return task; } @@ -110,7 +104,10 @@ Closure* ThreadPool::GetTask(Thread* self) { // We may be done, lets broadcast to the completion condition. completion_condition_.Broadcast(self); } + const uint64_t wait_start = NanoTime(); task_queue_condition_.Wait(self); + const uint64_t wait_end = NanoTime(); + total_wait_time_ += wait_end - std::max(wait_start, start_time_); waiting_count_--; } @@ -118,12 +115,151 @@ Closure* ThreadPool::GetTask(Thread* self) { return NULL; } -void ThreadPool::Wait(Thread* self) { +Task* ThreadPool::TryGetTask(Thread* self) { MutexLock mu(self, task_queue_lock_); + return TryGetTaskLocked(self); +} + +Task* ThreadPool::TryGetTaskLocked(Thread* self) { + if (started_ && !tasks_.empty()) { + Task* task = tasks_.front(); + tasks_.pop_front(); + return task; + } + return NULL; +} + +void ThreadPool::Wait(Thread* self, bool do_work) { + Task* task = NULL; + while ((task = TryGetTask(self)) != NULL) { + task->Run(self); + task->Finalize(); + } + // Wait until each thread is waiting and the task list is empty. - while (waiting_count_ != GetThreadCount() || !tasks_.empty()) { + MutexLock mu(self, task_queue_lock_); + while (!shutting_down_ && (waiting_count_ != GetThreadCount() || !tasks_.empty())) { completion_condition_.Wait(self); } } +size_t ThreadPool::GetTaskCount(Thread* self){ + MutexLock mu(self, task_queue_lock_); + return tasks_.size(); +} + +WorkStealingWorker::WorkStealingWorker(ThreadPool* thread_pool, const std::string& name, + size_t stack_size) + : ThreadPoolWorker(thread_pool, name, stack_size), + task_(NULL) { + +} + +void WorkStealingWorker::Run() { + Thread* self = Thread::Current(); + Task* task = NULL; + WorkStealingThreadPool* thread_pool = down_cast<WorkStealingThreadPool*>(thread_pool_); + while ((task = thread_pool_->GetTask(self)) != NULL) { + WorkStealingTask* stealing_task = down_cast<WorkStealingTask*>(task); + + { + CHECK(task_ == NULL); + MutexLock mu(self, thread_pool->work_steal_lock_); + // Register that we are running the task + ++stealing_task->ref_count_; + task_ = stealing_task; + } + stealing_task->Run(self); + // Mark ourselves as not running a task so that nobody tries to steal from us. + // There is a race condition that someone starts stealing from us at this point. This is okay + // due to the reference counting. + task_ = NULL; + + bool finalize; + + // Steal work from tasks until there is none left to steal. Note: There is a race, but + // all that happens when the race occurs is that we steal some work instead of processing a + // task from the queue. + while (thread_pool->GetTaskCount(self) == 0) { + WorkStealingTask* steal_from_task = NULL; + + { + MutexLock mu(self, thread_pool->work_steal_lock_); + // Try finding a task to steal from. 
+ steal_from_task = thread_pool->FindTaskToStealFrom(self); + if (steal_from_task != NULL) { + CHECK_NE(stealing_task, steal_from_task) + << "Attempting to steal from completed self task"; + steal_from_task->ref_count_++; + } else { + break; + } + } + + if (steal_from_task != NULL) { + // Task which completed earlier is going to steal some work. + stealing_task->StealFrom(self, steal_from_task); + + { + // We are done stealing from the task, lets decrement its reference count. + MutexLock mu(self, thread_pool->work_steal_lock_); + finalize = !--steal_from_task->ref_count_; + } + + if (finalize) { + steal_from_task->Finalize(); + } + } + } + + { + MutexLock mu(self, thread_pool->work_steal_lock_); + // If nobody is still referencing task_ we can finalize it. + finalize = !--stealing_task->ref_count_; + } + + if (finalize) { + stealing_task->Finalize(); + } + } +} + +WorkStealingWorker::~WorkStealingWorker() { + +} + +WorkStealingThreadPool::WorkStealingThreadPool(size_t num_threads) + : ThreadPool(0), + work_steal_lock_("work stealing lock"), + steal_index_(0) { + while (GetThreadCount() < num_threads) { + const std::string name = StringPrintf("Work stealing worker %zu", GetThreadCount()); + threads_.push_back(new WorkStealingWorker(this, name, ThreadPoolWorker::kDefaultStackSize)); + } +} + +WorkStealingTask* WorkStealingThreadPool::FindTaskToStealFrom(Thread* self) { + const size_t thread_count = GetThreadCount(); + for (size_t i = 0; i < thread_count; ++i) { + // TODO: Use CAS instead of lock. + ++steal_index_; + if (steal_index_ >= thread_count) { + steal_index_-= thread_count; + } + + WorkStealingWorker* worker = down_cast<WorkStealingWorker*>(threads_[steal_index_]); + WorkStealingTask* task = worker->task_; + if (task) { + // Not null, we can probably steal from this worker. + return task; + } + } + // Couldn't find something to steal. + return NULL; +} + +WorkStealingThreadPool::~WorkStealingThreadPool() { + +} + } // namespace art diff --git a/src/thread_pool.h b/src/thread_pool.h index 1d0f85deea..c8f6056f66 100644 --- a/src/thread_pool.h +++ b/src/thread_pool.h @@ -20,14 +20,20 @@ #include <deque> #include <vector> +#include "closure.h" #include "locks.h" #include "../src/mutex.h" namespace art { -class Closure; class ThreadPool; +class Task : public Closure { +public: + // Called when references reaches 0. + virtual void Finalize() { } +}; + class ThreadPoolWorker { public: static const size_t kDefaultStackSize = 1 * MB; @@ -38,10 +44,10 @@ class ThreadPoolWorker { virtual ~ThreadPoolWorker(); - private: + protected: ThreadPoolWorker(ThreadPool* thread_pool, const std::string& name, size_t stack_size); static void* Callback(void* arg) LOCKS_EXCLUDED(Locks::mutator_lock_); - void Run(); + virtual void Run(); ThreadPool* thread_pool_; const std::string name_; @@ -67,20 +73,33 @@ class ThreadPool { // Add a new task, the first available started worker will process it. Does not delete the task // after running it, it is the caller's responsibility. - void AddTask(Thread* self, Closure* task); + void AddTask(Thread* self, Task* task); ThreadPool(size_t num_threads); virtual ~ThreadPool(); // Wait for all tasks currently on queue to get completed. - void Wait(Thread* self); + void Wait(Thread* self, bool do_work = true); - private: - // Add a new task. - void AddThread(size_t stack_size); + size_t GetTaskCount(Thread* self); + // Returns the total amount of workers waited for tasks. 
+ uint64_t GetWaitTime() const { + return total_wait_time_; + } + + protected: // Get a task to run, blocks if there are no tasks left - Closure* GetTask(Thread* self); + virtual Task* GetTask(Thread* self); + + // Try to get a task, returning NULL if there is none available. + Task* TryGetTask(Thread* self); + Task* TryGetTaskLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(task_queue_lock_); + + // Are we shutting down? + bool IsShuttingDown() const EXCLUSIVE_LOCKS_REQUIRED(task_queue_lock_) { + return shutting_down_; + } Mutex task_queue_lock_; ConditionVariable task_queue_condition_ GUARDED_BY(task_queue_lock_); @@ -89,14 +108,71 @@ class ThreadPool { volatile bool shutting_down_ GUARDED_BY(task_queue_lock_); // How many worker threads are waiting on the condition. volatile size_t waiting_count_ GUARDED_BY(task_queue_lock_); - std::deque<Closure*> tasks_ GUARDED_BY(task_queue_lock_); + std::deque<Task*> tasks_ GUARDED_BY(task_queue_lock_); // TODO: make this immutable/const? std::vector<ThreadPoolWorker*> threads_; + // Work balance detection. + uint64_t start_time_ GUARDED_BY(task_queue_lock_); + uint64_t total_wait_time_; friend class ThreadPoolWorker; + friend class WorkStealingWorker; DISALLOW_COPY_AND_ASSIGN(ThreadPool); }; +class WorkStealingTask : public Task { + public: + WorkStealingTask() : ref_count_(0) { + + } + + size_t GetRefCount() const { + return ref_count_; + } + + virtual void StealFrom(Thread* self, WorkStealingTask* source) = 0; + + private: + // How many people are referencing this task. + size_t ref_count_; + + friend class WorkStealingWorker; +}; + +class WorkStealingWorker : public ThreadPoolWorker { + public: + virtual ~WorkStealingWorker(); + + bool IsRunningTask() const { + return task_ != NULL; + } + + protected: + WorkStealingTask* task_; + + WorkStealingWorker(ThreadPool* thread_pool, const std::string& name, size_t stack_size); + virtual void Run(); + + friend class WorkStealingThreadPool; + DISALLOW_COPY_AND_ASSIGN(WorkStealingWorker); +}; + +class WorkStealingThreadPool : public ThreadPool { + public: + WorkStealingThreadPool(size_t num_threads); + virtual ~WorkStealingThreadPool(); + + private: + Mutex work_steal_lock_; + // Which thread we are stealing from (round robin). + size_t steal_index_; + + // Find a task to steal from + WorkStealingTask* FindTaskToStealFrom(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(work_steal_lock_); + + friend class WorkStealingWorker; +}; + } // namespace art #endif // ART_SRC_THREAD_POOL_H_ diff --git a/src/thread_pool_test.cc b/src/thread_pool_test.cc index 783f786bef..bac60021c1 100644 --- a/src/thread_pool_test.cc +++ b/src/thread_pool_test.cc @@ -23,9 +23,9 @@ namespace art { -class CountClosure : public Closure { +class CountTask : public Task { public: - CountClosure(AtomicInteger* count) : count_(count) { + CountTask(AtomicInteger* count) : count_(count) { } @@ -34,6 +34,9 @@ class CountClosure : public Closure { usleep(100); // Increment the counter which keeps track of work completed. ++*count_; + } + + void Finalize() { delete this; } @@ -55,7 +58,7 @@ TEST_F(ThreadPoolTest, CheckRun) { AtomicInteger count = 0; static const int32_t num_tasks = num_threads * 4; for (int32_t i = 0; i < num_tasks; ++i) { - thread_pool.AddTask(self, new CountClosure(&count)); + thread_pool.AddTask(self, new CountTask(&count)); } thread_pool.StartWorkers(self); // Wait for tasks to complete. 
@@ -70,7 +73,7 @@ TEST_F(ThreadPoolTest, StopStart) { AtomicInteger count = 0; static const int32_t num_tasks = num_threads * 4; for (int32_t i = 0; i < num_tasks; ++i) { - thread_pool.AddTask(self, new CountClosure(&count)); + thread_pool.AddTask(self, new CountTask(&count)); } usleep(200); // Check that no threads started prematurely. @@ -80,15 +83,15 @@ TEST_F(ThreadPoolTest, StopStart) { usleep(200); thread_pool.StopWorkers(self); AtomicInteger bad_count = 0; - thread_pool.AddTask(self, new CountClosure(&bad_count)); + thread_pool.AddTask(self, new CountTask(&bad_count)); usleep(200); // Ensure that the task added after the workers were stopped doesn't get run. EXPECT_EQ(0, bad_count); } -class TreeClosure : public Closure { +class TreeTask : public Task { public: - TreeClosure(ThreadPool* const thread_pool, AtomicInteger* count, int depth) + TreeTask(ThreadPool* const thread_pool, AtomicInteger* count, int depth) : thread_pool_(thread_pool), count_(count), depth_(depth) { @@ -97,11 +100,14 @@ class TreeClosure : public Closure { void Run(Thread* self) { if (depth_ > 1) { - thread_pool_->AddTask(self, new TreeClosure(thread_pool_, count_, depth_ - 1)); - thread_pool_->AddTask(self, new TreeClosure(thread_pool_, count_, depth_ - 1)); + thread_pool_->AddTask(self, new TreeTask(thread_pool_, count_, depth_ - 1)); + thread_pool_->AddTask(self, new TreeTask(thread_pool_, count_, depth_ - 1)); } // Increment the counter which keeps track of work completed. ++*count_; + } + + void Finalize() { delete this; } @@ -117,7 +123,7 @@ TEST_F(ThreadPoolTest, RecursiveTest) { ThreadPool thread_pool(num_threads); AtomicInteger count = 0; static const int depth = 8; - thread_pool.AddTask(self, new TreeClosure(&thread_pool, &count, depth)); + thread_pool.AddTask(self, new TreeTask(&thread_pool, &count, depth)); thread_pool.StartWorkers(self); thread_pool.Wait(self); EXPECT_EQ((1 << depth) - 1, count); |
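For readers of this change, the calling pattern the new Task/ThreadPool API expects looks roughly like the sketch below. It is illustrative only: IncrementTask, RunExample and the worker/task counts are invented for the example, while the signatures (Task::Run, Task::Finalize, ThreadPool::AddTask, StartWorkers, Wait) are the ones introduced by this patch.

// Illustrative sketch, not part of the patch.
class IncrementTask : public Task {
 public:
  explicit IncrementTask(AtomicInteger* count) : count_(count) {
  }

  virtual void Run(Thread* self) {
    ++*count_;  // The unit of work: bump the shared counter.
  }

  virtual void Finalize() {
    delete this;  // Workers call Finalize() after Run(); the task owns its own lifetime.
  }

 private:
  AtomicInteger* const count_;
};

// Hypothetical driver, mirroring the new thread_pool_test.cc tests.
void RunExample(Thread* self) {
  ThreadPool thread_pool(4);  // Worker count chosen arbitrarily for the example.
  AtomicInteger count = 0;
  for (int32_t i = 0; i < 64; ++i) {
    thread_pool.AddTask(self, new IncrementTask(&count));
  }
  thread_pool.StartWorkers(self);
  // Wait() defaults to do_work = true, so the calling thread also drains queued tasks
  // before blocking on the completion condition.
  thread_pool.Wait(self);
}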
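The work-stealing variant only manages task lifetime: WorkStealingWorker::Run bumps a reference count around each steal and calls Finalize() once the count drops to zero, so making the actual hand-off of work thread-safe is left to each WorkStealingTask. A minimal sketch of such a task follows, using only primitives visible in this patch (Mutex, MutexLock, down_cast); RangeStealingTask, Drain and ProcessIndex are assumptions made for the example, not part of the change.

// Illustrative sketch, not part of the patch: a task that drains an index range
// and lets thieves pull indices out of the same range under a per-task lock.
class RangeStealingTask : public WorkStealingTask {
 public:
  RangeStealingTask(size_t begin, size_t end)
      : lock_("range stealing task lock"),
        begin_(begin),
        end_(end) {
  }

  virtual void Run(Thread* self) {
    Drain(self, this);
  }

  // Called on a task that has already finished Run(); 'source' is the still-registered victim.
  virtual void StealFrom(Thread* self, WorkStealingTask* source) {
    Drain(self, down_cast<RangeStealingTask*>(source));
  }

  virtual void Finalize() {
    delete this;  // Runs once the last reference (owner or thief) drops.
  }

 private:
  static void Drain(Thread* self, RangeStealingTask* task) {
    for (;;) {
      size_t index;
      {
        MutexLock mu(self, task->lock_);
        if (task->begin_ == task->end_) {
          return;
        }
        index = task->begin_++;
      }
      ProcessIndex(self, index);
    }
  }

  static void ProcessIndex(Thread* self, size_t index) {
    // Placeholder for the real per-element work (e.g. scanning an object).
  }

  Mutex lock_;
  size_t begin_;  // Guarded by lock_.
  size_t end_;    // Guarded by lock_.
};

Having the thief process the victim's indices in place keeps StealFrom() trivial, at the cost of taking the victim's lock once per element; a coarser hand-off that moves a block of indices into the thief would reduce contention.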