Revert^2 "Add peak RSS stats to GC perf dump"
This reverts commit cc292c611af7cdea6a2d9196fc347468b9233f71.
Reason for revert: The RSS code is now enabled only on Linux.
Test: art/test/testrunner/testrunner.py --target --runtime-option=-XX:DumpGCPerformanceOnShutdown
Bug: b/112187497
Change-Id: Iea5926d3dd4f6248f85422627b6ee0da559beb39
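
For context before the diff: the change samples residency with the Linux mincore() syscall, which fills one byte per page of a mapped range and sets bit 0 of each byte when that page is resident. A minimal standalone sketch of that technique (not part of this patch; CountResidentBytes is a hypothetical name, and the range is assumed to be page-aligned):

  #include <sys/mman.h>
  #include <unistd.h>
  #include <cstdint>
  #include <memory>

  // Hypothetical helper: returns the resident (RSS) byte count of the
  // page-aligned mapping [begin, begin + length).
  static uint64_t CountResidentBytes(void* begin, size_t length) {
    const size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
    const size_t num_pages = length / page_size;
    std::unique_ptr<unsigned char[]> vec(new unsigned char[num_pages]);
    uint64_t resident_pages = 0;
    if (mincore(begin, length, vec.get()) == 0) {
      for (size_t i = 0; i < num_pages; ++i) {
        resident_pages += vec[i] & 0x1;  // Bit 0: page is resident in memory.
      }
    }
    return resident_pages * page_size;
  }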
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 30c4386..5f4675d 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -94,6 +94,14 @@
return biased_begin_;
}
+ void* MemMapBegin() const {
+ return mem_map_.BaseBegin();
+ }
+
+ size_t MemMapSize() const {
+ return mem_map_.BaseSize();
+ }
+
/*
* Modify cards in the range from scan_begin (inclusive) to scan_end (exclusive). Each card
* value v is replaced by visitor(v). Visitor() should not have side-effects.
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 642b12e..5483364 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -2528,6 +2528,72 @@
}
}
+void ConcurrentCopying::CaptureRssAtPeak() {
+ using range_t = std::pair<void*, void*>;
+ // This operation is expensive as several calls to mincore() are performed.
+ // Also, this must be called before clearing regions in ReclaimPhase().
+ // Therefore, we make it conditional on the flag that enables dumping GC
+ // performance info on shutdown.
+ if (Runtime::Current()->GetDumpGCPerformanceOnShutdown()) {
+ std::list<range_t> gc_ranges;
+ auto add_gc_range = [&gc_ranges](void* start, size_t size) {
+ void* end = static_cast<char*>(start) + RoundUp(size, kPageSize);
+ gc_ranges.emplace_back(range_t(start, end));
+ };
+
+ // region space
+ DCHECK(IsAligned<kPageSize>(region_space_->Limit()));
+ gc_ranges.emplace_back(range_t(region_space_->Begin(), region_space_->Limit()));
+ // mark bitmap
+ add_gc_range(region_space_bitmap_->Begin(), region_space_bitmap_->Size());
+
+ // non-moving space
+ {
+ DCHECK(IsAligned<kPageSize>(heap_->non_moving_space_->Limit()));
+ gc_ranges.emplace_back(range_t(heap_->non_moving_space_->Begin(),
+ heap_->non_moving_space_->Limit()));
+ // mark bitmap
+ accounting::ContinuousSpaceBitmap *bitmap = heap_->non_moving_space_->GetMarkBitmap();
+ add_gc_range(bitmap->Begin(), bitmap->Size());
+ // live bitmap. Deal with bound bitmaps.
+ ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+ if (heap_->non_moving_space_->HasBoundBitmaps()) {
+ DCHECK_EQ(bitmap, heap_->non_moving_space_->GetLiveBitmap());
+ bitmap = heap_->non_moving_space_->GetTempBitmap();
+ } else {
+ bitmap = heap_->non_moving_space_->GetLiveBitmap();
+ }
+ add_gc_range(bitmap->Begin(), bitmap->Size());
+ }
+ // large-object space
+ if (heap_->GetLargeObjectsSpace()) {
+ heap_->GetLargeObjectsSpace()->ForEachMemMap([&add_gc_range](const MemMap& map) {
+ DCHECK(IsAligned<kPageSize>(map.BaseSize()));
+ add_gc_range(map.BaseBegin(), map.BaseSize());
+ });
+ // mark bitmap
+ accounting::LargeObjectBitmap* bitmap = heap_->GetLargeObjectsSpace()->GetMarkBitmap();
+ add_gc_range(bitmap->Begin(), bitmap->Size());
+ // live bitmap
+ bitmap = heap_->GetLargeObjectsSpace()->GetLiveBitmap();
+ add_gc_range(bitmap->Begin(), bitmap->Size());
+ }
+ // card table
+ add_gc_range(heap_->GetCardTable()->MemMapBegin(), heap_->GetCardTable()->MemMapSize());
+ // inter-region refs
+ if (use_generational_cc_ && !young_gen_) {
+ // region space
+ add_gc_range(region_space_inter_region_bitmap_->Begin(),
+ region_space_inter_region_bitmap_->Size());
+ // non-moving space
+ add_gc_range(non_moving_space_inter_region_bitmap_->Begin(),
+ non_moving_space_inter_region_bitmap_->Size());
+ }
+ // Extract RSS using mincore(). Updates the cumulative RSS counter.
+ ExtractRssFromMincore(&gc_ranges);
+ }
+}
+
void ConcurrentCopying::ReclaimPhase() {
TimingLogger::ScopedTiming split("ReclaimPhase", GetTimings());
if (kVerboseMode) {
@@ -2552,6 +2618,14 @@
CheckEmptyMarkStack();
}
+ // Capture RSS at the time when memory usage is at its peak. All GC-related
+ // memory ranges (Java heap, card table, bitmaps, etc.) are taken into
+ // account.
+ // TODO: We can fetch resident memory for region space directly by going
+ // through list of allocated regions. This way we can avoid calling mincore on
+ // the biggest memory range, thereby reducing the cost of this function.
+ CaptureRssAtPeak();
+
{
// Record freed objects.
TimingLogger::ScopedTiming split2("RecordFree", GetTimings());
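
For scale (an illustrative calculation, not figures from this change): assuming 4 KiB pages, a 256 MiB region space alone spans 65,536 pages, so a single mincore() call over it must fill a 64 KiB vector and inspect that many page-table entries on every full GC. That cost is why the capture is gated on -XX:DumpGCPerformanceOnShutdown and why the TODO above suggests walking only the allocated regions instead.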
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 124713c..44ee7c2 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -88,6 +88,7 @@
!rb_slow_path_histogram_lock_,
!skipped_blocks_lock_);
+ void CaptureRssAtPeak() REQUIRES(!mark_stack_lock_);
void BindBitmaps() REQUIRES_SHARED(Locks::mutator_lock_)
REQUIRES(!Locks::heap_bitmap_lock_);
GcType GetGcType() const override {
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index ec0ac6f..e4ae10a 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -15,6 +15,8 @@
*/
#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
#include "garbage_collector.h"
@@ -65,6 +67,9 @@
: heap_(heap),
name_(name),
pause_histogram_((name_ + " paused").c_str(), kPauseBucketSize, kPauseBucketCount),
+ rss_histogram_((name_ + " peak-rss").c_str(),
+ /*initial_bucket_width=*/ 10,
+ /*max_buckets=*/ 20),
freed_bytes_histogram_((name_ + " freed-bytes").c_str(),
/*initial_bucket_width=*/ 10,
/*max_buckets=*/ 20),
@@ -84,11 +89,64 @@
total_time_ns_ = 0u;
total_freed_objects_ = 0u;
total_freed_bytes_ = 0;
+ rss_histogram_.Reset();
freed_bytes_histogram_.Reset();
MutexLock mu(Thread::Current(), pause_histogram_lock_);
pause_histogram_.Reset();
}
+uint64_t GarbageCollector::ExtractRssFromMincore(
+ std::list<std::pair<void*, void*>>* gc_ranges) {
+ using range_t = std::pair<void*, void*>;
+ uint64_t rss = 0;
+ if (gc_ranges->empty()) {
+ return 0;
+ }
+ // mincore() is a Linux-specific syscall.
+#if defined(__linux__)
+ // Sort gc_ranges
+ gc_ranges->sort([](const range_t& a, const range_t& b) {
+ return std::less()(a.first, b.first);
+ });
+ // Merge gc_ranges. This is necessary because the kernel may merge contiguous
+ // regions whose properties match. It is sufficient because the kernel does not
+ // merge adjoining ranges that differ only in name.
+ size_t vec_len = 0;
+ for (auto it = gc_ranges->begin(); it != gc_ranges->end(); it++) {
+ auto next_it = it;
+ next_it++;
+ while (next_it != gc_ranges->end()) {
+ if (it->second == next_it->first) {
+ it->second = next_it->second;
+ next_it = gc_ranges->erase(next_it);
+ } else {
+ break;
+ }
+ }
+ size_t length = static_cast<uint8_t*>(it->second) - static_cast<uint8_t*>(it->first);
+ // Compute max length for vector allocation later.
+ vec_len = std::max(vec_len, length / kPageSize);
+ }
+ std::unique_ptr<unsigned char[]> vec(new unsigned char[vec_len]);
+ for (const auto it : *gc_ranges) {
+ size_t length = static_cast<uint8_t*>(it.second) - static_cast<uint8_t*>(it.first);
+ if (mincore(it.first, length, vec.get()) == 0) {
+ for (size_t i = 0; i < length / kPageSize; i++) {
+ // Least significant bit represents residency of a page. Other bits are
+ // reserved.
+ rss += vec[i] & 0x1;
+ }
+ } else {
+ LOG(WARNING) << "Call to mincore() on memory range [0x" << std::hex << it.first
+ << ", 0x" << it.second << std::dec << ") failed: " << strerror(errno);
+ }
+ }
+ rss *= kPageSize;
+ rss_histogram_.AddValue(rss / KB);
+#endif
+ return rss;
+}
+
void GarbageCollector::Run(GcCause gc_cause, bool clear_soft_references) {
ScopedTrace trace(android::base::StringPrintf("%s %s GC", PrettyCause(gc_cause), GetName()));
Thread* self = Thread::Current();
@@ -171,6 +229,7 @@
pause_histogram_.Reset();
}
cumulative_timings_.Reset();
+ rss_histogram_.Reset();
freed_bytes_histogram_.Reset();
total_thread_cpu_time_ns_ = 0u;
total_time_ns_ = 0u;
@@ -243,6 +302,17 @@
pause_histogram_.PrintConfidenceIntervals(os, 0.99, cumulative_data);
}
}
+#if defined(__linux__)
+ if (rss_histogram_.SampleSize() > 0) {
+ os << rss_histogram_.Name()
+ << ": Avg: " << PrettySize(rss_histogram_.Mean() * KB)
+ << " Max: " << PrettySize(rss_histogram_.Max() * KB)
+ << " Min: " << PrettySize(rss_histogram_.Min() * KB) << "\n";
+ os << "Peak-rss Histogram: ";
+ rss_histogram_.DumpBins(os);
+ os << "\n";
+ }
+#endif
if (freed_bytes_histogram_.SampleSize() > 0) {
os << freed_bytes_histogram_.Name()
<< ": Avg: " << PrettySize(freed_bytes_histogram_.Mean() * KB)
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 22afac6..a4f9467 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -18,7 +18,7 @@
#define ART_RUNTIME_GC_COLLECTOR_GARBAGE_COLLECTOR_H_
#include <stdint.h>
-#include <vector>
+#include <list>
#include "base/histogram.h"
#include "base/mutex.h"
@@ -111,6 +111,9 @@
void RecordFreeLOS(const ObjectBytePair& freed);
virtual void DumpPerformanceInfo(std::ostream& os) REQUIRES(!pause_histogram_lock_);
+ // Extract RSS for GC-specific memory ranges using mincore().
+ uint64_t ExtractRssFromMincore(std::list<std::pair<void*, void*>>* gc_ranges);
+
// Helper functions for querying if objects are marked. These are used for processing references,
// and will be used for reading system weaks while the GC is running.
virtual mirror::Object* IsMarked(mirror::Object* obj)
@@ -149,6 +152,7 @@
std::string name_;
// Cumulative statistics.
Histogram<uint64_t> pause_histogram_ GUARDED_BY(pause_histogram_lock_);
+ Histogram<uint64_t> rss_histogram_;
Histogram<size_t> freed_bytes_histogram_;
uint64_t total_thread_cpu_time_ns_;
uint64_t total_time_ns_;
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 1658dba..2c18888 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -231,6 +231,13 @@
}
}
+void LargeObjectMapSpace::ForEachMemMap(std::function<void(const MemMap&)> func) const {
+ MutexLock mu(Thread::Current(), lock_);
+ for (auto& pair : large_objects_) {
+ func(pair.second.mem_map);
+ }
+}
+
bool LargeObjectMapSpace::Contains(const mirror::Object* obj) const {
Thread* self = Thread::Current();
if (lock_.IsExclusiveHeld(self)) {
@@ -398,6 +405,12 @@
CHECK_EQ(cur_info, end_info);
}
+void FreeListSpace::ForEachMemMap(std::function<void(const MemMap&)> func) const {
+ MutexLock mu(Thread::Current(), lock_);
+ func(allocation_info_map_);
+ func(mem_map_);
+}
+
void FreeListSpace::RemoveFreePrev(AllocationInfo* info) {
CHECK_GT(info->GetPrevFree(), 0U);
auto it = free_blocks_.lower_bound(info);
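
The new ForEachMemMap() hook accepts any callable taking a const MemMap&. A non-compilable fragment of the caller side, mirroring its use in CaptureRssAtPeak() above (los and total_bytes are hypothetical names):

  // Sketch only: sums the mapped sizes of all large-object memory maps.
  size_t total_bytes = 0;
  los->ForEachMemMap([&total_bytes](const MemMap& map) {
    total_bytes += map.BaseSize();
  });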
diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h
index a4d6a24..86ecd85 100644
--- a/runtime/gc/space/large_object_space.h
+++ b/runtime/gc/space/large_object_space.h
@@ -110,6 +110,7 @@
// objects.
virtual void SetAllLargeObjectsAsZygoteObjects(Thread* self) = 0;
+ virtual void ForEachMemMap(std::function<void(const MemMap&)> func) const = 0;
// GetRangeAtomic returns Begin() and End() atomically, that is, it never returns Begin() and
// End() from different allocations.
virtual std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const = 0;
@@ -160,7 +161,7 @@
void Walk(DlMallocSpace::WalkCallback, void* arg) override REQUIRES(!lock_);
// TODO: disabling thread safety analysis as this may be called when we already hold lock_.
bool Contains(const mirror::Object* obj) const NO_THREAD_SAFETY_ANALYSIS;
-
+ void ForEachMemMap(std::function<void(const MemMap&)> func) const override REQUIRES(!lock_);
std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const override REQUIRES(!lock_);
protected:
@@ -193,7 +194,7 @@
size_t Free(Thread* self, mirror::Object* obj) override REQUIRES(!lock_);
void Walk(DlMallocSpace::WalkCallback callback, void* arg) override REQUIRES(!lock_);
void Dump(std::ostream& os) const REQUIRES(!lock_);
-
+ void ForEachMemMap(std::function<void(const MemMap&)> func) const override REQUIRES(!lock_);
std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const override REQUIRES(!lock_);
protected:
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 903263f..3a42f98 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -443,6 +443,10 @@
return mark_bitmap_.get();
}
+ accounting::ContinuousSpaceBitmap* GetTempBitmap() const {
+ return temp_bitmap_.get();
+ }
+
collector::ObjectBytePair Sweep(bool swap_bitmaps);
virtual accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() = 0;
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 564a1b9..6a9e1e9 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -775,6 +775,10 @@
dump_gc_performance_on_shutdown_ = value;
}
+ bool GetDumpGCPerformanceOnShutdown() const {
+ return dump_gc_performance_on_shutdown_;
+ }
+
void IncrementDeoptimizationCount(DeoptimizationKind kind) {
DCHECK_LE(kind, DeoptimizationKind::kLast);
deoptimization_counts_[static_cast<size_t>(kind)]++;
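
With this change, the shutdown dump (enabled via -XX:DumpGCPerformanceOnShutdown, as in the Test: line above) gains peak-RSS lines on Linux. An illustrative shape of the output, with made-up values; the histogram name is "<collector name> peak-rss" per the constructor change in garbage_collector.cc:

  <collector name> peak-rss: Avg: 42MB Max: 64MB Min: 13MB
  Peak-rss Histogram: ...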