Add peak RSS stats to GC perf dump

Use mincore() to extract resident memory info and compute the RSS of
GC data structures and heaps. The RSS must be sampled at the point in
the GC cycle when physical memory consumption is at its peak. For the
concurrent copying (CC) collector, for instance, that is immediately
after marking finishes and before regions are cleared.
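
As a rough, standalone sketch of the mechanism (not part of this patch,
and with an illustrative helper name): mincore() fills one byte per page
of a page-aligned range, and bit 0 of each byte reports whether that
page is resident, so resident bytes can be counted like this:

  #include <sys/mman.h>
  #include <unistd.h>
  #include <cstddef>
  #include <vector>

  // Returns the resident bytes of a page-aligned, mapped range, or 0 if
  // mincore() fails (e.g. part of the range is not mapped).
  size_t CountResidentBytes(void* begin, size_t length) {
    const size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
    std::vector<unsigned char> vec(length / page_size);
    if (mincore(begin, length, vec.data()) != 0) {
      return 0;
    }
    size_t resident_pages = 0;
    for (unsigned char byte : vec) {
      resident_pages += byte & 0x1;  // Other bits are reserved.
    }
    return resident_pages * page_size;
  }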

Test: art/test/testrunner/testrunner.py --target --runtime-option=-XX:DumpGCPerformanceOnShutdown
Bug: b/112187497
Change-Id: I92cf006524cf6c91ba1e96aa7c5303c578e6db54
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 30c4386..5f4675d 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -94,6 +94,14 @@
     return biased_begin_;
   }
 
+  void* MemMapBegin() const {
+    return mem_map_.BaseBegin();
+  }
+
+  size_t MemMapSize() const {
+    return mem_map_.BaseSize();
+  }
+
   /*
    * Modify cards in the range from scan_begin (inclusive) to scan_end (exclusive). Each card
    * value v is replaced by visitor(v). Visitor() should not have side-effects.
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 642b12e..5483364 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -2528,6 +2528,72 @@
   }
 }
 
+void ConcurrentCopying::CaptureRssAtPeak() {
+  using range_t = std::pair<void*, void*>;
+  // This operation is expensive as several calls to mincore() are performed.
+  // Also, this must be called before clearing regions in ReclaimPhase().
+  // Therefore, we make it conditional on the flag that enables dumping GC
+  // performance info on shutdown.
+  if (Runtime::Current()->GetDumpGCPerformanceOnShutdown()) {
+    std::list<range_t> gc_ranges;
+    auto add_gc_range = [&gc_ranges](void* start, size_t size) {
+      void* end = static_cast<char*>(start) + RoundUp(size, kPageSize);
+      gc_ranges.emplace_back(range_t(start, end));
+    };
+
+    // region space
+    DCHECK(IsAligned<kPageSize>(region_space_->Limit()));
+    gc_ranges.emplace_back(range_t(region_space_->Begin(), region_space_->Limit()));
+    // mark bitmap
+    add_gc_range(region_space_bitmap_->Begin(), region_space_bitmap_->Size());
+
+    // non-moving space
+    {
+      DCHECK(IsAligned<kPageSize>(heap_->non_moving_space_->Limit()));
+      gc_ranges.emplace_back(range_t(heap_->non_moving_space_->Begin(),
+                                     heap_->non_moving_space_->Limit()));
+      // mark bitmap
+      accounting::ContinuousSpaceBitmap* bitmap = heap_->non_moving_space_->GetMarkBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+      // live bitmap. Deal with bound bitmaps.
+      ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+      if (heap_->non_moving_space_->HasBoundBitmaps()) {
+        DCHECK_EQ(bitmap, heap_->non_moving_space_->GetLiveBitmap());
+        bitmap = heap_->non_moving_space_->GetTempBitmap();
+      } else {
+        bitmap = heap_->non_moving_space_->GetLiveBitmap();
+      }
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+    }
+    // large-object space
+    if (heap_->GetLargeObjectsSpace()) {
+      heap_->GetLargeObjectsSpace()->ForEachMemMap([&add_gc_range](const MemMap& map) {
+        DCHECK(IsAligned<kPageSize>(map.BaseSize()));
+        add_gc_range(map.BaseBegin(), map.BaseSize());
+      });
+      // mark bitmap
+      accounting::LargeObjectBitmap* bitmap = heap_->GetLargeObjectsSpace()->GetMarkBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+      // live bitmap
+      bitmap = heap_->GetLargeObjectsSpace()->GetLiveBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+    }
+    // card table
+    add_gc_range(heap_->GetCardTable()->MemMapBegin(), heap_->GetCardTable()->MemMapSize());
+    // inter-region refs
+    if (use_generational_cc_ && !young_gen_) {
+      // region space
+      add_gc_range(region_space_inter_region_bitmap_->Begin(),
+                   region_space_inter_region_bitmap_->Size());
+      // non-moving space
+      add_gc_range(non_moving_space_inter_region_bitmap_->Begin(),
+                   non_moving_space_inter_region_bitmap_->Size());
+    }
+    // Extract RSS using mincore(). Updates the cumulative RSS histogram.
+    ExtractRssFromMincore(&gc_ranges);
+  }
+}
+
 void ConcurrentCopying::ReclaimPhase() {
   TimingLogger::ScopedTiming split("ReclaimPhase", GetTimings());
   if (kVerboseMode) {
@@ -2552,6 +2618,14 @@
     CheckEmptyMarkStack();
   }
 
+  // Capture RSS at the point in the cycle when memory usage is at its peak.
+  // All GC-related memory ranges, such as the Java heap, card table, and
+  // bitmaps, are taken into account.
+  // TODO: We can fetch resident memory for region space directly by going
+  // through list of allocated regions. This way we can avoid calling mincore on
+  // the biggest memory range, thereby reducing the cost of this function.
+  CaptureRssAtPeak();
+
   {
     // Record freed objects.
     TimingLogger::ScopedTiming split2("RecordFree", GetTimings());
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 124713c..44ee7c2 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -88,6 +88,7 @@
                               !rb_slow_path_histogram_lock_,
                               !skipped_blocks_lock_);
 
+  void CaptureRssAtPeak() REQUIRES(!mark_stack_lock_);
   void BindBitmaps() REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::heap_bitmap_lock_);
   GcType GetGcType() const override {
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index b8ad624..54daf0d 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -15,6 +15,8 @@
  */
 
 #include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
 
 #include "garbage_collector.h"
 
@@ -65,6 +67,9 @@
     : heap_(heap),
       name_(name),
       pause_histogram_((name_ + " paused").c_str(), kPauseBucketSize, kPauseBucketCount),
+      rss_histogram_((name_ + " peak-rss").c_str(),
+                     /*initial_bucket_width=*/ 10,
+                     /*max_buckets=*/ 20),
       cumulative_timings_(name),
       pause_histogram_lock_("pause histogram lock", kDefaultMutexLevel, true),
       is_transaction_active_(false) {
@@ -81,10 +86,61 @@
   total_time_ns_ = 0u;
   total_freed_objects_ = 0u;
   total_freed_bytes_ = 0;
+  rss_histogram_.Reset();
   MutexLock mu(Thread::Current(), pause_histogram_lock_);
   pause_histogram_.Reset();
 }
 
+uint64_t GarbageCollector::ExtractRssFromMincore(
+    std::list<std::pair<void*, void*>>* gc_ranges) {
+  using range_t = std::pair<void*, void*>;
+  if (gc_ranges->empty()) {
+    return 0;
+  }
+  // Sort gc_ranges
+  gc_ranges->sort([](const range_t& a, const range_t& b) {
+    return std::less()(a.first, b.first);
+  });
+  // Merge gc_ranges. This is necessary because the kernel may merge contiguous
+  // regions if their properties match. It is sufficient because the kernel
+  // doesn't merge adjoining ranges that differ only in name.
+  size_t vec_len = 0;
+  for (auto it = gc_ranges->begin(); it != gc_ranges->end(); it++) {
+    auto next_it = it;
+    next_it++;
+    while (next_it != gc_ranges->end()) {
+      if (it->second == next_it->first) {
+        it->second = next_it->second;
+        next_it = gc_ranges->erase(next_it);
+      } else {
+        break;
+      }
+    }
+    size_t length = static_cast<uint8_t*>(it->second) - static_cast<uint8_t*>(it->first);
+    // Compute max length for vector allocation later.
+    vec_len = std::max(vec_len, length / kPageSize);
+  }
+  unsigned char* vec = new unsigned char[vec_len];
+  uint64_t rss = 0;
+  for (const auto& it : *gc_ranges) {
+    size_t length = static_cast<uint8_t*>(it.second) - static_cast<uint8_t*>(it.first);
+    if (mincore(it.first, length, vec) == 0) {
+      for (size_t i = 0; i < length / kPageSize; i++) {
+        // Least significant bit represents residency of a page. Other bits are
+        // reserved.
+        rss += vec[i] & 0x1;
+      }
+    } else {
+      LOG(WARNING) << "Call to mincore() on memory range [0x" << std::hex << it.first
+                   << ", 0x" << it.second << std::dec << ") failed: " << strerror(errno);
+    }
+  }
+  delete[] vec;
+  rss *= kPageSize;
+  rss_histogram_.AddValue(rss / KB);
+  return rss;
+}
+
 void GarbageCollector::Run(GcCause gc_cause, bool clear_soft_references) {
   ScopedTrace trace(android::base::StringPrintf("%s %s GC", PrettyCause(gc_cause), GetName()));
   Thread* self = Thread::Current();
@@ -165,6 +221,7 @@
     pause_histogram_.Reset();
   }
   cumulative_timings_.Reset();
+  rss_histogram_.Reset();
   total_thread_cpu_time_ns_ = 0u;
   total_time_ns_ = 0u;
   total_freed_objects_ = 0u;
@@ -236,6 +293,15 @@
       pause_histogram_.PrintConfidenceIntervals(os, 0.99, cumulative_data);
     }
   }
+  if (rss_histogram_.SampleSize() > 0) {
+    os << rss_histogram_.Name()
+       << ": Avg: " << PrettySize(rss_histogram_.Mean() * KB)
+       << " Max: " << PrettySize(rss_histogram_.Max() * KB)
+       << " Min: " << PrettySize(rss_histogram_.Min() * KB) << "\n";
+    os << "Peak-rss Histogram: ";
+    rss_histogram_.DumpBins(os);
+    os << "\n";
+  }
   double cpu_seconds = NsToMs(GetTotalCpuTime()) / 1000.0;
   os << GetName() << " total time: " << PrettyDuration(total_ns)
      << " mean time: " << PrettyDuration(total_ns / iterations) << "\n"
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 2857881..a5a3938 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -18,7 +18,7 @@
 #define ART_RUNTIME_GC_COLLECTOR_GARBAGE_COLLECTOR_H_
 
 #include <stdint.h>
-#include <vector>
+#include <list>
 
 #include "base/histogram.h"
 #include "base/mutex.h"
@@ -111,6 +111,9 @@
   void RecordFreeLOS(const ObjectBytePair& freed);
   virtual void DumpPerformanceInfo(std::ostream& os) REQUIRES(!pause_histogram_lock_);
 
+  // Extract RSS for GC-specific memory ranges using mincore().
+  uint64_t ExtractRssFromMincore(std::list<std::pair<void*, void*>>* gc_ranges);
+
   // Helper functions for querying if objects are marked. These are used for processing references,
   // and will be used for reading system weaks while the GC is running.
   virtual mirror::Object* IsMarked(mirror::Object* obj)
@@ -149,6 +152,7 @@
   std::string name_;
   // Cumulative statistics.
   Histogram<uint64_t> pause_histogram_ GUARDED_BY(pause_histogram_lock_);
+  Histogram<uint64_t> rss_histogram_;
   uint64_t total_thread_cpu_time_ns_;
   uint64_t total_time_ns_;
   uint64_t total_freed_objects_;
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 1658dba..2c18888 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -231,6 +231,13 @@
   }
 }
 
+void LargeObjectMapSpace::ForEachMemMap(std::function<void(const MemMap&)> func) const {
+  MutexLock mu(Thread::Current(), lock_);
+  for (auto& pair : large_objects_) {
+    func(pair.second.mem_map);
+  }
+}
+
 bool LargeObjectMapSpace::Contains(const mirror::Object* obj) const {
   Thread* self = Thread::Current();
   if (lock_.IsExclusiveHeld(self)) {
@@ -398,6 +405,12 @@
   CHECK_EQ(cur_info, end_info);
 }
 
+void FreeListSpace::ForEachMemMap(std::function<void(const MemMap&)> func) const {
+  MutexLock mu(Thread::Current(), lock_);
+  func(allocation_info_map_);
+  func(mem_map_);
+}
+
 void FreeListSpace::RemoveFreePrev(AllocationInfo* info) {
   CHECK_GT(info->GetPrevFree(), 0U);
   auto it = free_blocks_.lower_bound(info);
diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h
index a4d6a24..86ecd85 100644
--- a/runtime/gc/space/large_object_space.h
+++ b/runtime/gc/space/large_object_space.h
@@ -110,6 +110,7 @@
   // objects.
   virtual void SetAllLargeObjectsAsZygoteObjects(Thread* self) = 0;
 
+  virtual void ForEachMemMap(std::function<void(const MemMap&)> func) const = 0;
   // GetRangeAtomic returns Begin() and End() atomically, that is, it never returns Begin() and
   // End() from different allocations.
   virtual std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const = 0;
@@ -160,7 +161,7 @@
   void Walk(DlMallocSpace::WalkCallback, void* arg) override REQUIRES(!lock_);
   // TODO: disabling thread safety analysis as this may be called when we already hold lock_.
   bool Contains(const mirror::Object* obj) const NO_THREAD_SAFETY_ANALYSIS;
-
+  void ForEachMemMap(std::function<void(const MemMap&)> func) const override REQUIRES(!lock_);
   std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const override REQUIRES(!lock_);
 
  protected:
@@ -193,7 +194,7 @@
   size_t Free(Thread* self, mirror::Object* obj) override REQUIRES(!lock_);
   void Walk(DlMallocSpace::WalkCallback callback, void* arg) override REQUIRES(!lock_);
   void Dump(std::ostream& os) const REQUIRES(!lock_);
-
+  void ForEachMemMap(std::function<void(const MemMap&)> func) const override REQUIRES(!lock_);
   std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const override REQUIRES(!lock_);
 
  protected:
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 903263f..3a42f98 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -443,6 +443,10 @@
     return mark_bitmap_.get();
   }
 
+  accounting::ContinuousSpaceBitmap* GetTempBitmap() const {
+    return temp_bitmap_.get();
+  }
+
   collector::ObjectBytePair Sweep(bool swap_bitmaps);
   virtual accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() = 0;
 
diff --git a/runtime/runtime.h b/runtime/runtime.h
index ace0eea..49ebbdc 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -775,6 +775,10 @@
     dump_gc_performance_on_shutdown_ = value;
   }
 
+  bool GetDumpGCPerformanceOnShutdown() const {
+    return dump_gc_performance_on_shutdown_;
+  }
+
   void IncrementDeoptimizationCount(DeoptimizationKind kind) {
     DCHECK_LE(kind, DeoptimizationKind::kLast);
     deoptimization_counts_[static_cast<size_t>(kind)]++;