author      2019-02-27 16:21:16 -0800
committer   2019-03-07 17:56:07 +0000
commit      b4dad1bc139e46f25afeac21c9959f2787e4937a (patch)
tree        5f09fa649df56eb8cd217fc9a32de9046f312e28
parent      85dd9e35193a886d76734387d6764045bb4bb2d5 (diff)
Add peak RSS stats to GC perf dump
Use mincore() to extract resident memory info and compute RSS specific
to GC data structures and heaps. It must be computed at a time in the GC
cycle when the physical memory consumption is at its peak. For instance,
for CC it is immediately after marking is finished and before regions
are cleared.
Test: art/test/testrunner/testrunner.py --target --runtime-option=-XX:DumpGCPerformanceOnShutdown
Bug: b/112187497
Change-Id: I92cf006524cf6c91ba1e96aa7c5303c578e6db54
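For illustration, here is a minimal standalone sketch (not ART code) of the mincore()-based residency counting described above: the kernel fills one status byte per page of the queried range, and bit 0 of each byte says whether that page is resident. The mapping size, the number of pages touched, and the use of sysconf(_SC_PAGESIZE) are assumptions made only for this example.

```cpp
#include <sys/mman.h>
#include <unistd.h>

#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
  const size_t length = 64 * page_size;

  // Reserve 64 pages of anonymous memory; nothing is resident yet.
  void* start = mmap(nullptr, length, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (start == MAP_FAILED) {
    std::perror("mmap");
    return 1;
  }
  // Touch the first 16 pages so the kernel backs them with physical frames.
  std::memset(start, 1, 16 * page_size);

  // mincore() fills one status byte per page; bit 0 means "resident".
  std::vector<unsigned char> vec(length / page_size);
  if (mincore(start, length, vec.data()) != 0) {
    std::perror("mincore");
    munmap(start, length);
    return 1;
  }
  size_t resident_pages = 0;
  for (unsigned char status : vec) {
    resident_pages += status & 0x1;
  }
  std::printf("resident: %zu of %zu pages (%zu bytes)\n",
              resident_pages, length / page_size, resident_pages * page_size);

  munmap(start, length);
  return 0;
}
```

Running this typically reports about 16 of the 64 pages resident, i.e. only the touched portion counts toward RSS; the patch below applies the same idea to the GC's own mappings (region space, bitmaps, card table) and feeds the resulting byte count into a per-collector histogram.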
-rw-r--r-- | runtime/gc/accounting/card_table.h | 8
-rw-r--r-- | runtime/gc/collector/concurrent_copying.cc | 74
-rw-r--r-- | runtime/gc/collector/concurrent_copying.h | 1
-rw-r--r-- | runtime/gc/collector/garbage_collector.cc | 66
-rw-r--r-- | runtime/gc/collector/garbage_collector.h | 6
-rw-r--r-- | runtime/gc/space/large_object_space.cc | 13
-rw-r--r-- | runtime/gc/space/large_object_space.h | 5
-rw-r--r-- | runtime/gc/space/space.h | 4
-rw-r--r-- | runtime/runtime.h | 4
9 files changed, 178 insertions, 3 deletions
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 30c438614b..5f4675d1cf 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -94,6 +94,14 @@ class CardTable {
    return biased_begin_;
  }

+  void* MemMapBegin() const {
+    return mem_map_.BaseBegin();
+  }
+
+  size_t MemMapSize() const {
+    return mem_map_.BaseSize();
+  }
+
  /*
   * Modify cards in the range from scan_begin (inclusive) to scan_end (exclusive). Each card
   * value v is replaced by visitor(v). Visitor() should not have side-effects.
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 642b12e9b7..5483364600 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -2528,6 +2528,72 @@ void ConcurrentCopying::SweepLargeObjects(bool swap_bitmaps) {
  }
}

+void ConcurrentCopying::CaptureRssAtPeak() {
+  using range_t = std::pair<void*, void*>;
+  // This operation is expensive as several calls to mincore() are performed.
+  // Also, this must be called before clearing regions in ReclaimPhase().
+  // Therefore, we make it conditional on the flag that enables dumping GC
+  // performance info on shutdown.
+  if (Runtime::Current()->GetDumpGCPerformanceOnShutdown()) {
+    std::list<range_t> gc_ranges;
+    auto add_gc_range = [&gc_ranges](void* start, size_t size) {
+      void* end = static_cast<char*>(start) + RoundUp(size, kPageSize);
+      gc_ranges.emplace_back(range_t(start, end));
+    };
+
+    // region space
+    DCHECK(IsAligned<kPageSize>(region_space_->Limit()));
+    gc_ranges.emplace_back(range_t(region_space_->Begin(), region_space_->Limit()));
+    // mark bitmap
+    add_gc_range(region_space_bitmap_->Begin(), region_space_bitmap_->Size());
+
+    // non-moving space
+    {
+      DCHECK(IsAligned<kPageSize>(heap_->non_moving_space_->Limit()));
+      gc_ranges.emplace_back(range_t(heap_->non_moving_space_->Begin(),
+                                     heap_->non_moving_space_->Limit()));
+      // mark bitmap
+      accounting::ContinuousSpaceBitmap *bitmap = heap_->non_moving_space_->GetMarkBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+      // live bitmap. Deal with bound bitmaps.
+      ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+      if (heap_->non_moving_space_->HasBoundBitmaps()) {
+        DCHECK_EQ(bitmap, heap_->non_moving_space_->GetLiveBitmap());
+        bitmap = heap_->non_moving_space_->GetTempBitmap();
+      } else {
+        bitmap = heap_->non_moving_space_->GetLiveBitmap();
+      }
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+    }
+    // large-object space
+    if (heap_->GetLargeObjectsSpace()) {
+      heap_->GetLargeObjectsSpace()->ForEachMemMap([&add_gc_range](const MemMap& map) {
+        DCHECK(IsAligned<kPageSize>(map.BaseSize()));
+        add_gc_range(map.BaseBegin(), map.BaseSize());
+      });
+      // mark bitmap
+      accounting::LargeObjectBitmap* bitmap = heap_->GetLargeObjectsSpace()->GetMarkBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+      // live bitmap
+      bitmap = heap_->GetLargeObjectsSpace()->GetLiveBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+    }
+    // card table
+    add_gc_range(heap_->GetCardTable()->MemMapBegin(), heap_->GetCardTable()->MemMapSize());
+    // inter-region refs
+    if (use_generational_cc_ && !young_gen_) {
+      // region space
+      add_gc_range(region_space_inter_region_bitmap_->Begin(),
+                   region_space_inter_region_bitmap_->Size());
+      // non-moving space
+      add_gc_range(non_moving_space_inter_region_bitmap_->Begin(),
+                   non_moving_space_inter_region_bitmap_->Size());
+    }
+    // Extract RSS using mincore(). Updates the cummulative RSS counter.
+    ExtractRssFromMincore(&gc_ranges);
+  }
+}
+
void ConcurrentCopying::ReclaimPhase() {
  TimingLogger::ScopedTiming split("ReclaimPhase", GetTimings());
  if (kVerboseMode) {
@@ -2552,6 +2618,14 @@ void ConcurrentCopying::ReclaimPhase() {
    CheckEmptyMarkStack();
  }

+  // Capture RSS at the time when memory usage is at its peak. All GC related
+  // memory ranges like java heap, card table, bitmap etc. are taken into
+  // account.
+  // TODO: We can fetch resident memory for region space directly by going
+  // through list of allocated regions. This way we can avoid calling mincore on
+  // the biggest memory range, thereby reducing the cost of this function.
+  CaptureRssAtPeak();
+
  {
    // Record freed objects.
    TimingLogger::ScopedTiming split2("RecordFree", GetTimings());
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 124713c17c..44ee7c2023 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -88,6 +88,7 @@ class ConcurrentCopying : public GarbageCollector {
                            !rb_slow_path_histogram_lock_,
                            !skipped_blocks_lock_);
+  void CaptureRssAtPeak() REQUIRES(!mark_stack_lock_);
  void BindBitmaps() REQUIRES_SHARED(Locks::mutator_lock_)
      REQUIRES(!Locks::heap_bitmap_lock_);

  GcType GetGcType() const override {
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index b8ad62410a..54daf0d845 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -15,6 +15,8 @@
 */

#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>

#include "garbage_collector.h"
@@ -65,6 +67,9 @@ GarbageCollector::GarbageCollector(Heap* heap, const std::string& name)
    : heap_(heap),
      name_(name),
      pause_histogram_((name_ + " paused").c_str(), kPauseBucketSize, kPauseBucketCount),
+      rss_histogram_((name_ + " peak-rss").c_str(),
+                     /*initial_bucket_width=*/ 10,
+                     /*max_buckets=*/ 20),
      cumulative_timings_(name),
      pause_histogram_lock_("pause histogram lock", kDefaultMutexLevel, true),
      is_transaction_active_(false) {
@@ -81,10 +86,61 @@ void GarbageCollector::ResetCumulativeStatistics() {
  total_time_ns_ = 0u;
  total_freed_objects_ = 0u;
  total_freed_bytes_ = 0;
+  rss_histogram_.Reset();
  MutexLock mu(Thread::Current(), pause_histogram_lock_);
  pause_histogram_.Reset();
}

+uint64_t GarbageCollector::ExtractRssFromMincore(
+    std::list<std::pair<void*, void*>>* gc_ranges) {
+  using range_t = std::pair<void*, void*>;
+  if (gc_ranges->empty()) {
+    return 0;
+  }
+  // Sort gc_ranges
+  gc_ranges->sort([](const range_t& a, const range_t& b) {
+    return std::less()(a.first, b.first);
+  });
+  // Merge gc_ranges. It's necessary because the kernel may merge contiguous
+  // regions if their properties match. This is sufficient as kernel doesn't
+  // merge those adjoining ranges which differ only in name.
+  size_t vec_len = 0;
+  for (auto it = gc_ranges->begin(); it != gc_ranges->end(); it++) {
+    auto next_it = it;
+    next_it++;
+    while (next_it != gc_ranges->end()) {
+      if (it->second == next_it->first) {
+        it->second = next_it->second;
+        next_it = gc_ranges->erase(next_it);
+      } else {
+        break;
+      }
+    }
+    size_t length = static_cast<uint8_t*>(it->second) - static_cast<uint8_t*>(it->first);
+    // Compute max length for vector allocation later.
+    vec_len = std::max(vec_len, length / kPageSize);
+  }
+  unsigned char *vec = new unsigned char[vec_len];
+  uint64_t rss = 0;
+  for (const auto it : *gc_ranges) {
+    size_t length = static_cast<uint8_t*>(it.second) - static_cast<uint8_t*>(it.first);
+    if (mincore(it.first, length, vec) == 0) {
+      for (size_t i = 0; i < length / kPageSize; i++) {
+        // Least significant bit represents residency of a page. Other bits are
+        // reserved.
+        rss += vec[i] & 0x1;
+      }
+    } else {
+      LOG(WARNING) << "Call to mincore() on memory range [0x" << std::hex << it.first
+                   << ", 0x" << it.second << std::dec << ") failed: " << strerror(errno);
+    }
+  }
+  delete[] vec;
+  rss *= kPageSize;
+  rss_histogram_.AddValue(rss / KB);
+  return rss;
+}
+
void GarbageCollector::Run(GcCause gc_cause, bool clear_soft_references) {
  ScopedTrace trace(android::base::StringPrintf("%s %s GC", PrettyCause(gc_cause), GetName()));
  Thread* self = Thread::Current();
@@ -165,6 +221,7 @@ void GarbageCollector::ResetMeasurements() {
    pause_histogram_.Reset();
  }
  cumulative_timings_.Reset();
+  rss_histogram_.Reset();
  total_thread_cpu_time_ns_ = 0u;
  total_time_ns_ = 0u;
  total_freed_objects_ = 0u;
@@ -236,6 +293,15 @@ void GarbageCollector::DumpPerformanceInfo(std::ostream& os) {
      pause_histogram_.PrintConfidenceIntervals(os, 0.99, cumulative_data);
    }
  }
+  if (rss_histogram_.SampleSize() > 0) {
+    os << rss_histogram_.Name()
+       << ": Avg: " << PrettySize(rss_histogram_.Mean() * KB)
+       << " Max: " << PrettySize(rss_histogram_.Max() * KB)
+       << " Min: " << PrettySize(rss_histogram_.Min() * KB) << "\n";
+    os << "Peak-rss Histogram: ";
+    rss_histogram_.DumpBins(os);
+    os << "\n";
+  }
  double cpu_seconds = NsToMs(GetTotalCpuTime()) / 1000.0;
  os << GetName() << " total time: " << PrettyDuration(total_ns)
     << " mean time: " << PrettyDuration(total_ns / iterations) << "\n"
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 2857881456..a5a3938e70 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -18,7 +18,7 @@
#define ART_RUNTIME_GC_COLLECTOR_GARBAGE_COLLECTOR_H_

#include <stdint.h>
-#include <vector>
+#include <list>

#include "base/histogram.h"
#include "base/mutex.h"
@@ -111,6 +111,9 @@ class GarbageCollector : public RootVisitor, public IsMarkedVisitor, public Mark
  void RecordFreeLOS(const ObjectBytePair& freed);
  virtual void DumpPerformanceInfo(std::ostream& os) REQUIRES(!pause_histogram_lock_);

+  // Extract RSS for GC-specific memory ranges using mincore().
+  uint64_t ExtractRssFromMincore(std::list<std::pair<void*, void*>>* gc_ranges);
+
  // Helper functions for querying if objects are marked. These are used for processing references,
  // and will be used for reading system weaks while the GC is running.
  virtual mirror::Object* IsMarked(mirror::Object* obj)
@@ -149,6 +152,7 @@ class GarbageCollector : public RootVisitor, public IsMarkedVisitor, public Mark
  std::string name_;
  // Cumulative statistics.
  Histogram<uint64_t> pause_histogram_ GUARDED_BY(pause_histogram_lock_);
+  Histogram<uint64_t> rss_histogram_;
  uint64_t total_thread_cpu_time_ns_;
  uint64_t total_time_ns_;
  uint64_t total_freed_objects_;
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 1658dba413..2c18888c5f 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -231,6 +231,13 @@ void LargeObjectMapSpace::Walk(DlMallocSpace::WalkCallback callback, void* arg)
  }
}

+void LargeObjectMapSpace::ForEachMemMap(std::function<void(const MemMap&)> func) const {
+  MutexLock mu(Thread::Current(), lock_);
+  for (auto& pair : large_objects_) {
+    func(pair.second.mem_map);
+  }
+}
+
bool LargeObjectMapSpace::Contains(const mirror::Object* obj) const {
  Thread* self = Thread::Current();
  if (lock_.IsExclusiveHeld(self)) {
@@ -398,6 +405,12 @@ void FreeListSpace::Walk(DlMallocSpace::WalkCallback callback, void* arg) {
  CHECK_EQ(cur_info, end_info);
}

+void FreeListSpace::ForEachMemMap(std::function<void(const MemMap&)> func) const {
+  MutexLock mu(Thread::Current(), lock_);
+  func(allocation_info_map_);
+  func(mem_map_);
+}
+
void FreeListSpace::RemoveFreePrev(AllocationInfo* info) {
  CHECK_GT(info->GetPrevFree(), 0U);
  auto it = free_blocks_.lower_bound(info);
diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h
index a4d6a24263..86ecd85c83 100644
--- a/runtime/gc/space/large_object_space.h
+++ b/runtime/gc/space/large_object_space.h
@@ -110,6 +110,7 @@ class LargeObjectSpace : public DiscontinuousSpace, public AllocSpace {
  // objects.
  virtual void SetAllLargeObjectsAsZygoteObjects(Thread* self) = 0;

+  virtual void ForEachMemMap(std::function<void(const MemMap&)> func) const = 0;
  // GetRangeAtomic returns Begin() and End() atomically, that is, it never returns Begin() and
  // End() from different allocations.
  virtual std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const = 0;
@@ -160,7 +161,7 @@ class LargeObjectMapSpace : public LargeObjectSpace {
  void Walk(DlMallocSpace::WalkCallback, void* arg) override REQUIRES(!lock_);
  // TODO: disabling thread safety analysis as this may be called when we already hold lock_.
  bool Contains(const mirror::Object* obj) const NO_THREAD_SAFETY_ANALYSIS;
-
+  void ForEachMemMap(std::function<void(const MemMap&)> func) const override REQUIRES(!lock_);
  std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const override REQUIRES(!lock_);

 protected:
@@ -193,7 +194,7 @@ class FreeListSpace final : public LargeObjectSpace {
  size_t Free(Thread* self, mirror::Object* obj) override REQUIRES(!lock_);
  void Walk(DlMallocSpace::WalkCallback callback, void* arg) override REQUIRES(!lock_);
  void Dump(std::ostream& os) const REQUIRES(!lock_);
-
+  void ForEachMemMap(std::function<void(const MemMap&)> func) const override REQUIRES(!lock_);
  std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const override REQUIRES(!lock_);

 protected:
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 903263f26a..3a42f9847c 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -443,6 +443,10 @@ class ContinuousMemMapAllocSpace : public MemMapSpace, public AllocSpace {
    return mark_bitmap_.get();
  }

+  accounting::ContinuousSpaceBitmap* GetTempBitmap() const {
+    return temp_bitmap_.get();
+  }
+
  collector::ObjectBytePair Sweep(bool swap_bitmaps);

  virtual accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() = 0;
diff --git a/runtime/runtime.h b/runtime/runtime.h
index ace0eea139..49ebbdc4ef 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -775,6 +775,10 @@ class Runtime {
    dump_gc_performance_on_shutdown_ = value;
  }

+  bool GetDumpGCPerformanceOnShutdown() const {
+    return dump_gc_performance_on_shutdown_;
+  }
+
  void IncrementDeoptimizationCount(DeoptimizationKind kind) {
    DCHECK_LE(kind, DeoptimizationKind::kLast);
    deoptimization_counts_[static_cast<size_t>(kind)]++;
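As a further illustration, here is a standalone sketch (not ART code) of the sort-and-merge step that GarbageCollector::ExtractRssFromMincore() above performs before querying mincore(): adjoining [begin, end) ranges are coalesced so that each contiguous region is queried once, mirroring how the kernel merges adjacent VMAs. The buffer, the offsets, and the helper name MergeAdjacent are invented for this example.

```cpp
#include <cstdio>
#include <functional>
#include <iterator>
#include <list>
#include <utility>

using Range = std::pair<char*, char*>;  // [begin, end), end exclusive.

// Sort ranges by start address, then fold each range into its neighbour
// whenever the current range ends exactly where the next one begins.
void MergeAdjacent(std::list<Range>* ranges) {
  ranges->sort([](const Range& a, const Range& b) {
    return std::less<char*>()(a.first, b.first);
  });
  for (auto it = ranges->begin(); it != ranges->end(); ++it) {
    auto next_it = std::next(it);
    while (next_it != ranges->end() && it->second == next_it->first) {
      it->second = next_it->second;      // Extend the current range.
      next_it = ranges->erase(next_it);  // Drop the absorbed range.
    }
  }
}

int main() {
  static char buffer[0x10000];  // Stand-in for a GC mapping.
  char* base = buffer;
  std::list<Range> ranges = {
      {base + 0x3000, base + 0x4000},  // Starts where the next entry ends...
      {base, base + 0x3000},           // ...so the two merge after sorting.
      {base + 0x8000, base + 0x9000},  // Disjoint; stays separate.
  };
  MergeAdjacent(&ranges);
  // Prints two ranges: [base, base+0x4000) and [base+0x8000, base+0x9000).
  for (const Range& r : ranges) {
    std::printf("[%p, %p)\n", static_cast<void*>(r.first), static_cast<void*>(r.second));
  }
  return 0;
}
```

The payoff of merging is fewer mincore() calls and a single status vector sized for the largest merged range, which is why the patch computes vec_len while merging.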