Revert^2 "Add peak RSS stats to GC perf dump"

This reverts commit cc292c611af7cdea6a2d9196fc347468b9233f71.

Reason for revert: The RSS code is now enabled only on Linux (guarded by #if defined(__linux__)).

Test: art/test/testrunner/testrunner.py --target --runtime-option=-XX:DumpGCPerformanceOnShutdown
Bug: b/112187497
Change-Id: Iea5926d3dd4f6248f85422627b6ee0da559beb39
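
For reference, a minimal standalone sketch of the mincore()-based residency
sampling that this change reintroduces. This is an illustration only, not ART
code: CountResidentBytes and its parameter are hypothetical, and it assumes all
ranges are page-aligned and mapped, while the sort/merge/mincore structure
mirrors GarbageCollector::ExtractRssFromMincore() in the diff below. Linux-only,
since mincore() is a Linux syscall.

  // Count resident bytes for a set of page-aligned [begin, end) ranges.
  #include <sys/mman.h>   // mincore()
  #include <unistd.h>     // sysconf()
  #include <algorithm>
  #include <cstdint>
  #include <functional>
  #include <iterator>
  #include <list>
  #include <memory>
  #include <utility>

  uint64_t CountResidentBytes(std::list<std::pair<void*, void*>>* ranges) {
    const size_t page_size = sysconf(_SC_PAGESIZE);
    // Sort by start address so adjoining ranges become neighbors in the list.
    ranges->sort([](const std::pair<void*, void*>& a, const std::pair<void*, void*>& b) {
      return std::less<void*>()(a.first, b.first);
    });
    // Merge adjoining ranges and track the largest range (in pages) so a single
    // mincore() vector can be reused for every call.
    size_t max_pages = 0;
    for (auto it = ranges->begin(); it != ranges->end(); ++it) {
      auto next = std::next(it);
      while (next != ranges->end() && it->second == next->first) {
        it->second = next->second;
        next = ranges->erase(next);
      }
      size_t len = static_cast<uint8_t*>(it->second) - static_cast<uint8_t*>(it->first);
      max_pages = std::max(max_pages, len / page_size);
    }
    std::unique_ptr<unsigned char[]> vec(new unsigned char[max_pages]);
    uint64_t resident_pages = 0;
    for (const auto& range : *ranges) {
      size_t len = static_cast<uint8_t*>(range.second) - static_cast<uint8_t*>(range.first);
      if (mincore(range.first, len, vec.get()) == 0) {
        for (size_t i = 0; i < len / page_size; ++i) {
          resident_pages += vec[i] & 0x1;  // Bit 0: page is resident in RAM.
        }
      }
    }
    return resident_pages * page_size;
  }
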
diff --git a/runtime/gc/accounting/card_table.h b/runtime/gc/accounting/card_table.h
index 30c4386..5f4675d 100644
--- a/runtime/gc/accounting/card_table.h
+++ b/runtime/gc/accounting/card_table.h
@@ -94,6 +94,14 @@
     return biased_begin_;
   }
 
+  void* MemMapBegin() const {
+    return mem_map_.BaseBegin();
+  }
+
+  size_t MemMapSize() const {
+    return mem_map_.BaseSize();
+  }
+
   /*
    * Modify cards in the range from scan_begin (inclusive) to scan_end (exclusive). Each card
    * value v is replaced by visitor(v). Visitor() should not have side-effects.
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 642b12e..5483364 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -2528,6 +2528,72 @@
   }
 }
 
+void ConcurrentCopying::CaptureRssAtPeak() {
+  using range_t = std::pair<void*, void*>;
+  // This operation is expensive as several calls to mincore() are performed.
+  // Also, this must be called before clearing regions in ReclaimPhase().
+  // Therefore, we make it conditional on the flag that enables dumping GC
+  // performance info on shutdown.
+  if (Runtime::Current()->GetDumpGCPerformanceOnShutdown()) {
+    std::list<range_t> gc_ranges;
+    auto add_gc_range = [&gc_ranges](void* start, size_t size) {
+      void* end = static_cast<char*>(start) + RoundUp(size, kPageSize);
+      gc_ranges.emplace_back(range_t(start, end));
+    };
+
+    // region space
+    DCHECK(IsAligned<kPageSize>(region_space_->Limit()));
+    gc_ranges.emplace_back(range_t(region_space_->Begin(), region_space_->Limit()));
+    // mark bitmap
+    add_gc_range(region_space_bitmap_->Begin(), region_space_bitmap_->Size());
+
+    // non-moving space
+    {
+      DCHECK(IsAligned<kPageSize>(heap_->non_moving_space_->Limit()));
+      gc_ranges.emplace_back(range_t(heap_->non_moving_space_->Begin(),
+                                     heap_->non_moving_space_->Limit()));
+      // mark bitmap
+      accounting::ContinuousSpaceBitmap *bitmap = heap_->non_moving_space_->GetMarkBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+      // live bitmap. Deal with bound bitmaps.
+      ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
+      if (heap_->non_moving_space_->HasBoundBitmaps()) {
+        DCHECK_EQ(bitmap, heap_->non_moving_space_->GetLiveBitmap());
+        bitmap = heap_->non_moving_space_->GetTempBitmap();
+      } else {
+        bitmap = heap_->non_moving_space_->GetLiveBitmap();
+      }
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+    }
+    // large-object space
+    if (heap_->GetLargeObjectsSpace()) {
+      heap_->GetLargeObjectsSpace()->ForEachMemMap([&add_gc_range](const MemMap& map) {
+        DCHECK(IsAligned<kPageSize>(map.BaseSize()));
+        add_gc_range(map.BaseBegin(), map.BaseSize());
+      });
+      // mark bitmap
+      accounting::LargeObjectBitmap* bitmap = heap_->GetLargeObjectsSpace()->GetMarkBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+      // live bitmap
+      bitmap = heap_->GetLargeObjectsSpace()->GetLiveBitmap();
+      add_gc_range(bitmap->Begin(), bitmap->Size());
+    }
+    // card table
+    add_gc_range(heap_->GetCardTable()->MemMapBegin(), heap_->GetCardTable()->MemMapSize());
+    // inter-region refs
+    if (use_generational_cc_ && !young_gen_) {
+      // region space
+      add_gc_range(region_space_inter_region_bitmap_->Begin(),
+                   region_space_inter_region_bitmap_->Size());
+      // non-moving space
+      add_gc_range(non_moving_space_inter_region_bitmap_->Begin(),
+                   non_moving_space_inter_region_bitmap_->Size());
+    }
+    // Extract RSS using mincore(). Updates the cumulative RSS counter.
+    ExtractRssFromMincore(&gc_ranges);
+  }
+}
+
 void ConcurrentCopying::ReclaimPhase() {
   TimingLogger::ScopedTiming split("ReclaimPhase", GetTimings());
   if (kVerboseMode) {
@@ -2552,6 +2618,14 @@
     CheckEmptyMarkStack();
   }
 
+  // Capture RSS at the time when memory usage is at its peak. All GC-related
+  // memory ranges, such as the Java heap, card table, and bitmaps, are taken
+  // into account.
+  // TODO: We can fetch resident memory for region space directly by going
+  // through list of allocated regions. This way we can avoid calling mincore on
+  // the biggest memory range, thereby reducing the cost of this function.
+  CaptureRssAtPeak();
+
   {
     // Record freed objects.
     TimingLogger::ScopedTiming split2("RecordFree", GetTimings());
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 124713c..44ee7c2 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -88,6 +88,7 @@
                               !rb_slow_path_histogram_lock_,
                               !skipped_blocks_lock_);
 
+  void CaptureRssAtPeak() REQUIRES(!mark_stack_lock_);
   void BindBitmaps() REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!Locks::heap_bitmap_lock_);
   GcType GetGcType() const override {
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index ec0ac6f..e4ae10a 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -15,6 +15,8 @@
  */
 
 #include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
 
 #include "garbage_collector.h"
 
@@ -65,6 +67,9 @@
     : heap_(heap),
       name_(name),
       pause_histogram_((name_ + " paused").c_str(), kPauseBucketSize, kPauseBucketCount),
+      rss_histogram_((name_ + " peak-rss").c_str(),
+                     /*initial_bucket_width=*/ 10,
+                     /*max_buckets=*/ 20),
       freed_bytes_histogram_((name_ + " freed-bytes").c_str(),
                              /*initial_bucket_width=*/ 10,
                              /*max_buckets=*/ 20),
@@ -84,11 +89,64 @@
   total_time_ns_ = 0u;
   total_freed_objects_ = 0u;
   total_freed_bytes_ = 0;
+  rss_histogram_.Reset();
   freed_bytes_histogram_.Reset();
   MutexLock mu(Thread::Current(), pause_histogram_lock_);
   pause_histogram_.Reset();
 }
 
+uint64_t GarbageCollector::ExtractRssFromMincore(
+    std::list<std::pair<void*, void*>>* gc_ranges) {
+  using range_t = std::pair<void*, void*>;
+  uint64_t rss = 0;
+  if (gc_ranges->empty()) {
+    return 0;
+  }
+  // mincore() is a Linux-specific syscall.
+#if defined(__linux__)
+  // Sort gc_ranges
+  gc_ranges->sort([](const range_t& a, const range_t& b) {
+    return std::less()(a.first, b.first);
+  });
+  // Merge gc_ranges. It's necessary because the kernel may merge contiguous
+  // regions whose properties match. This is sufficient as the kernel doesn't
+  // merge adjoining ranges that differ only in name.
+  size_t vec_len = 0;
+  for (auto it = gc_ranges->begin(); it != gc_ranges->end(); it++) {
+    auto next_it = it;
+    next_it++;
+    while (next_it != gc_ranges->end()) {
+      if (it->second == next_it->first) {
+        it->second = next_it->second;
+        next_it = gc_ranges->erase(next_it);
+      } else {
+        break;
+      }
+    }
+    size_t length = static_cast<uint8_t*>(it->second) - static_cast<uint8_t*>(it->first);
+    // Compute max length for vector allocation later.
+    vec_len = std::max(vec_len, length / kPageSize);
+  }
+  std::unique_ptr<unsigned char[]> vec(new unsigned char[vec_len]);
+  for (const auto it : *gc_ranges) {
+    size_t length = static_cast<uint8_t*>(it.second) - static_cast<uint8_t*>(it.first);
+    if (mincore(it.first, length, vec.get()) == 0) {
+      for (size_t i = 0; i < length / kPageSize; i++) {
+        // Least significant bit represents residency of a page. Other bits are
+        // reserved.
+        rss += vec[i] & 0x1;
+      }
+    } else {
+      LOG(WARNING) << "Call to mincore() on memory range [0x" << std::hex << it.first
+                   << ", 0x" << it.second << std::dec << ") failed: " << strerror(errno);
+    }
+  }
+  rss *= kPageSize;
+  rss_histogram_.AddValue(rss / KB);
+#endif
+  return rss;
+}
+
 void GarbageCollector::Run(GcCause gc_cause, bool clear_soft_references) {
   ScopedTrace trace(android::base::StringPrintf("%s %s GC", PrettyCause(gc_cause), GetName()));
   Thread* self = Thread::Current();
@@ -171,6 +229,7 @@
     pause_histogram_.Reset();
   }
   cumulative_timings_.Reset();
+  rss_histogram_.Reset();
   freed_bytes_histogram_.Reset();
   total_thread_cpu_time_ns_ = 0u;
   total_time_ns_ = 0u;
@@ -243,6 +302,17 @@
       pause_histogram_.PrintConfidenceIntervals(os, 0.99, cumulative_data);
     }
   }
+#if defined(__linux__)
+  if (rss_histogram_.SampleSize() > 0) {
+    os << rss_histogram_.Name()
+       << ": Avg: " << PrettySize(rss_histogram_.Mean() * KB)
+       << " Max: " << PrettySize(rss_histogram_.Max() * KB)
+       << " Min: " << PrettySize(rss_histogram_.Min() * KB) << "\n";
+    os << "Peak-rss Histogram: ";
+    rss_histogram_.DumpBins(os);
+    os << "\n";
+  }
+#endif
   if (freed_bytes_histogram_.SampleSize() > 0) {
     os << freed_bytes_histogram_.Name()
        << ": Avg: " << PrettySize(freed_bytes_histogram_.Mean() * KB)
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 22afac6..a4f9467 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -18,7 +18,7 @@
 #define ART_RUNTIME_GC_COLLECTOR_GARBAGE_COLLECTOR_H_
 
 #include <stdint.h>
-#include <vector>
+#include <list>
 
 #include "base/histogram.h"
 #include "base/mutex.h"
@@ -111,6 +111,9 @@
   void RecordFreeLOS(const ObjectBytePair& freed);
   virtual void DumpPerformanceInfo(std::ostream& os) REQUIRES(!pause_histogram_lock_);
 
+  // Extract RSS for GC-specific memory ranges using mincore().
+  uint64_t ExtractRssFromMincore(std::list<std::pair<void*, void*>>* gc_ranges);
+
   // Helper functions for querying if objects are marked. These are used for processing references,
   // and will be used for reading system weaks while the GC is running.
   virtual mirror::Object* IsMarked(mirror::Object* obj)
@@ -149,6 +152,7 @@
   std::string name_;
   // Cumulative statistics.
   Histogram<uint64_t> pause_histogram_ GUARDED_BY(pause_histogram_lock_);
+  Histogram<uint64_t> rss_histogram_;
   Histogram<size_t> freed_bytes_histogram_;
   uint64_t total_thread_cpu_time_ns_;
   uint64_t total_time_ns_;
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 1658dba..2c18888 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -231,6 +231,13 @@
   }
 }
 
+void LargeObjectMapSpace::ForEachMemMap(std::function<void(const MemMap&)> func) const {
+  MutexLock mu(Thread::Current(), lock_);
+  for (auto& pair : large_objects_) {
+    func(pair.second.mem_map);
+  }
+}
+
 bool LargeObjectMapSpace::Contains(const mirror::Object* obj) const {
   Thread* self = Thread::Current();
   if (lock_.IsExclusiveHeld(self)) {
@@ -398,6 +405,12 @@
   CHECK_EQ(cur_info, end_info);
 }
 
+void FreeListSpace::ForEachMemMap(std::function<void(const MemMap&)> func) const {
+  MutexLock mu(Thread::Current(), lock_);
+  func(allocation_info_map_);
+  func(mem_map_);
+}
+
 void FreeListSpace::RemoveFreePrev(AllocationInfo* info) {
   CHECK_GT(info->GetPrevFree(), 0U);
   auto it = free_blocks_.lower_bound(info);
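
The new ForEachMemMap() hook is what lets CaptureRssAtPeak() above feed every
large-object mapping into the mincore() range list. A hedged usage sketch,
assuming "los" is a LargeObjectSpace*; MemMap::BaseBegin()/BaseSize() are the
accessors used by this change, and mapped_bytes is an illustrative local:

  // Illustrative only: total the virtual size of all maps backing the
  // large-object space via the new ForEachMemMap() visitor.
  size_t mapped_bytes = 0;
  los->ForEachMemMap([&mapped_bytes](const MemMap& map) {
    mapped_bytes += map.BaseSize();
  });
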
diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h
index a4d6a24..86ecd85 100644
--- a/runtime/gc/space/large_object_space.h
+++ b/runtime/gc/space/large_object_space.h
@@ -110,6 +110,7 @@
   // objects.
   virtual void SetAllLargeObjectsAsZygoteObjects(Thread* self) = 0;
 
+  virtual void ForEachMemMap(std::function<void(const MemMap&)> func) const = 0;
   // GetRangeAtomic returns Begin() and End() atomically, that is, it never returns Begin() and
   // End() from different allocations.
   virtual std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const = 0;
@@ -160,7 +161,7 @@
   void Walk(DlMallocSpace::WalkCallback, void* arg) override REQUIRES(!lock_);
   // TODO: disabling thread safety analysis as this may be called when we already hold lock_.
   bool Contains(const mirror::Object* obj) const NO_THREAD_SAFETY_ANALYSIS;
-
+  void ForEachMemMap(std::function<void(const MemMap&)> func) const override REQUIRES(!lock_);
   std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const override REQUIRES(!lock_);
 
  protected:
@@ -193,7 +194,7 @@
   size_t Free(Thread* self, mirror::Object* obj) override REQUIRES(!lock_);
   void Walk(DlMallocSpace::WalkCallback callback, void* arg) override REQUIRES(!lock_);
   void Dump(std::ostream& os) const REQUIRES(!lock_);
-
+  void ForEachMemMap(std::function<void(const MemMap&)> func) const override REQUIRES(!lock_);
   std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const override REQUIRES(!lock_);
 
  protected:
diff --git a/runtime/gc/space/space.h b/runtime/gc/space/space.h
index 903263f..3a42f98 100644
--- a/runtime/gc/space/space.h
+++ b/runtime/gc/space/space.h
@@ -443,6 +443,10 @@
     return mark_bitmap_.get();
   }
 
+  accounting::ContinuousSpaceBitmap* GetTempBitmap() const {
+    return temp_bitmap_.get();
+  }
+
   collector::ObjectBytePair Sweep(bool swap_bitmaps);
   virtual accounting::ContinuousSpaceBitmap::SweepCallback* GetSweepCallback() = 0;
 
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 564a1b9..6a9e1e9 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -775,6 +775,10 @@
     dump_gc_performance_on_shutdown_ = value;
   }
 
+  bool GetDumpGCPerformanceOnShutdown() const {
+    return dump_gc_performance_on_shutdown_;
+  }
+
   void IncrementDeoptimizationCount(DeoptimizationKind kind) {
     DCHECK_LE(kind, DeoptimizationKind::kLast);
     deoptimization_counts_[static_cast<size_t>(kind)]++;
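
For completeness, the new GetDumpGCPerformanceOnShutdown() getter mirrors the
existing setter and is what CaptureRssAtPeak() checks before paying for the
mincore() walk. A hedged sketch of the gating pattern (ExpensiveAccounting()
is a hypothetical placeholder, not part of this change):

  // Only collect the extra statistics when the user asked for the GC
  // performance dump, e.g. with -XX:DumpGCPerformanceOnShutdown.
  if (Runtime::Current()->GetDumpGCPerformanceOnShutdown()) {
    ExpensiveAccounting();  // Hypothetical helper for illustration.
  }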