ART: add reclaim bytes ratio metrics to GC performance

Record the reclaimed bytes ratio of each GC cycle (bytes reclaimed by the
GC divided by the bytes allocated before it) and print the average if
DumpGCPerformanceOnShutdown is set.

Test: Run art with -XX:DumpGCPerformanceOnShutdown on some benchmarks.
Bug: 112187497
Change-Id: I306e86c52102ab06d5279705ebc9e35b22b6b748
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index aba1c5a..fefe9ab 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -95,6 +95,7 @@
       weak_ref_access_enabled_(true),
       copied_live_bytes_ratio_sum_(0.f),
       gc_count_(0),
+      reclaimed_bytes_ratio_sum_(0.f),
       young_gen_(young_gen),
       skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
       measure_read_barrier_slow_path_(measure_read_barrier_slow_path),
@@ -110,7 +111,8 @@
       force_evacuate_all_(false),
       gc_grays_immune_objects_(false),
       immune_gray_stack_lock_("concurrent copying immune gray stack lock",
-                              kMarkSweepMarkStackLock) {
+                              kMarkSweepMarkStackLock),
+      num_bytes_allocated_before_gc_(0) {
   static_assert(space::RegionSpace::kRegionSize == accounting::ReadBarrierTable::kRegionSize,
                 "The region space size and the read barrier table region size must match");
   CHECK(kEnableGenerationalConcurrentCopyingCollection || !young_gen_);
@@ -323,6 +325,7 @@
 
 void ConcurrentCopying::InitializePhase() {
   TimingLogger::ScopedTiming split("InitializePhase", GetTimings());
+  num_bytes_allocated_before_gc_ = static_cast<int64_t>(heap_->GetBytesAllocated());
   if (kVerboseMode) {
     LOG(INFO) << "GC InitializePhase";
     LOG(INFO) << "Region-space : " << reinterpret_cast<void*>(region_space_->Begin()) << "-"
@@ -2091,6 +2094,11 @@
 
   CheckEmptyMarkStack();
 
+  int64_t num_bytes_allocated_after_gc = static_cast<int64_t>(heap_->GetBytesAllocated());
+  int64_t diff = num_bytes_allocated_before_gc_ - num_bytes_allocated_after_gc;
+  auto ratio = static_cast<float>(diff) / num_bytes_allocated_before_gc_;
+  reclaimed_bytes_ratio_sum_ += ratio;
+
   if (kVerboseMode) {
     LOG(INFO) << "GC end of ReclaimPhase";
   }
@@ -3199,6 +3207,7 @@
 
 void ConcurrentCopying::DumpPerformanceInfo(std::ostream& os) {
   GarbageCollector::DumpPerformanceInfo(os);
+  size_t num_gc_cycles = GetCumulativeTimings().GetIterations();
   MutexLock mu(Thread::Current(), rb_slow_path_histogram_lock_);
   if (rb_slow_path_time_histogram_.SampleSize() > 0) {
     Histogram<uint64_t>::CumulativeData cumulative_data;
@@ -3211,15 +3220,15 @@
   if (rb_slow_path_count_gc_total_ > 0) {
     os << "GC slow path count " << rb_slow_path_count_gc_total_ << "\n";
   }
-  float average_ratio = copied_live_bytes_ratio_sum_ / gc_count_;
 
-  if (young_gen_) {
-    os << "Average minor GC copied live bytes ratio "
-       << average_ratio << " over " << gc_count_ << " minor GCs\n";
-  } else {
-    os << "Average major GC copied live bytes ratio "
-       << average_ratio << " over " << gc_count_ << " major GCs\n";
-  }
+  os << "Average " << (young_gen_ ? "minor" : "major") << " GC reclaim bytes ratio "
+     << (reclaimed_bytes_ratio_sum_ / num_gc_cycles) << " over " << num_gc_cycles
+     << " GC cycles\n";
+
+  os << "Average " << (young_gen_ ? "minor" : "major") << " GC copied live bytes ratio "
+     << (copied_live_bytes_ratio_sum_ / gc_count_) << " over " << gc_count_
+     << " " << (young_gen_ ? "minor" : "major") << " GCs\n";
+
   os << "Cumulative bytes moved "
      << cumulative_bytes_moved_.load(std::memory_order_relaxed) << "\n";
   os << "Cumulative objects moved "
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index cd086c4..6535b11 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -359,10 +359,12 @@
   Atomic<uint64_t> cumulative_bytes_moved_;
   Atomic<uint64_t> cumulative_objects_moved_;
 
-  // copied_live_bytes_ratio_sum_ and gc_count_ are read and written by CC per
-  // GC, in ReclaimPhase, and are read by DumpPerformanceInfo (potentially from
-  // another thread). However, at present, DumpPerformanceInfo is only called
-  // when the runtime shuts down, so no concurrent access.
+  // copied_live_bytes_ratio_sum_ is read and written by CC per GC, in
+  // ReclaimPhase, and is read by DumpPerformanceInfo (potentially from another
+  // thread). However, at present, DumpPerformanceInfo is only called when the
+  // runtime shuts down, so there is no concurrent access. The same reasoning
+  // applies to gc_count_ and reclaimed_bytes_ratio_sum_.
+
   // The sum of of all copied live bytes ratio (to_bytes/from_bytes)
   float copied_live_bytes_ratio_sum_;
   // The number of GC counts, used to calculate the average above. (It doesn't
@@ -371,6 +373,9 @@
   // space.)
   size_t gc_count_;
 
+  // Sum over GC cycles of reclaimed_bytes / num_bytes_allocated_before_gc.
+  float reclaimed_bytes_ratio_sum_;
+
   // Generational "sticky", only trace through dirty objects in region space.
   const bool young_gen_;
   // If true, the GC thread is done scanning marked objects on dirty and aged
@@ -416,6 +421,9 @@
   // ConcurrentCopying::SweepArray).
   MemMap sweep_array_free_buffer_mem_map_;
 
+  // Bytes allocated at GC start (InitializePhase); signed since after-GC may be larger.
+  int64_t num_bytes_allocated_before_gc_;
+
   class ActivateReadBarrierEntrypointsCallback;
   class ActivateReadBarrierEntrypointsCheckpoint;
   class AssertToSpaceInvariantFieldVisitor;