[metrics] Add GC-work throughput metrics am: 7f0473851d am: 0ff602aeaa

Original change: https://android-review.googlesource.com/c/platform/art/+/1740166

Change-Id: I24520a356605804fcd1ac9ecaa89f5935bcc4c97
diff --git a/libartbase/base/metrics/metrics.h b/libartbase/base/metrics/metrics.h
index 72d8365..78a6387 100644
--- a/libartbase/base/metrics/metrics.h
+++ b/libartbase/base/metrics/metrics.h
@@ -49,8 +49,10 @@
   METRIC(JitMethodCompileCount, MetricsCounter)                    \
   METRIC(YoungGcCollectionTime, MetricsHistogram, 15, 0, 60'000)   \
   METRIC(FullGcCollectionTime, MetricsHistogram, 15, 0, 60'000)    \
-  METRIC(YoungGcThroughput, MetricsHistogram, 15, 0, 1'000)        \
-  METRIC(FullGcThroughput, MetricsHistogram, 15, 0, 1'000)
+  METRIC(YoungGcThroughput, MetricsHistogram, 15, 0, 10'000)       \
+  METRIC(FullGcThroughput, MetricsHistogram, 15, 0, 10'000)        \
+  METRIC(YoungGcTracingThroughput, MetricsHistogram, 15, 0, 10'000)   \
+  METRIC(FullGcTracingThroughput, MetricsHistogram, 15, 0, 10'000)
 
 // A lot of the metrics implementation code is generated by passing one-off macros into ART_COUNTERS
 // and ART_HISTOGRAMS. This means metrics.h and metrics.cc are very #define-heavy, which can be
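Note on the histogram parameters: each MetricsHistogram METRIC takes a bucket count followed by the low and high ends of the value range, so the throughput histograms above now span 0..10'000 MB/s in 15 buckets instead of 0..1'000. The authoritative bucketing lives in MetricsHistogram itself; the sketch below (kBuckets, kMin, kMax and BucketFor are illustrative names, not ART code) only shows how a sample would land in a bucket under the straightforward linear scheme those parameters suggest.

#include <algorithm>
#include <cstdint>

constexpr int64_t kBuckets = 15;
constexpr int64_t kMin = 0;        // MB/s
constexpr int64_t kMax = 10'000;   // MB/s, the new upper bound

constexpr size_t BucketFor(int64_t mb_per_s) {
  // Clamp out-of-range samples into the first/last bucket, then map linearly.
  const int64_t clamped = std::clamp(mb_per_s, kMin, kMax - 1);
  return static_cast<size_t>((clamped - kMin) * kBuckets / (kMax - kMin));
}

static_assert(BucketFor(0) == 0);        // idle collections land in the lowest bucket
static_assert(BucketFor(9'999) == 14);   // anything near the cap lands in the last bucket
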
diff --git a/libartbase/base/time_utils.h b/libartbase/base/time_utils.h
index fbf3e94..dd73b1c 100644
--- a/libartbase/base/time_utils.h
+++ b/libartbase/base/time_utils.h
@@ -77,6 +77,11 @@
   return ns / 1000 / 1000;
 }
 
+// Converts the given number of nanoseconds to microseconds.
+static constexpr uint64_t NsToUs(uint64_t ns) {
+  return ns / 1000;
+}
+
 // Converts the given number of milliseconds to nanoseconds
 static constexpr uint64_t MsToNs(uint64_t ms) {
   return ms * 1000 * 1000;
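The new NsToUs helper mirrors the existing NsToMs/MsToNs conversions and truncates toward zero. A couple of illustrative checks (not part of the patch), assuming time_utils.h is included:

static_assert(NsToUs(2'500'000) == 2'500);  // 2.5 ms == 2'500 us
static_assert(NsToUs(999) == 0);            // sub-microsecond values truncate to 0
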
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index f58224d..96c792b 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -102,6 +102,8 @@
       copied_live_bytes_ratio_sum_(0.f),
       gc_count_(0),
       reclaimed_bytes_ratio_sum_(0.f),
+      cumulative_bytes_moved_(0),
+      cumulative_objects_moved_(0),
       skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
       measure_read_barrier_slow_path_(measure_read_barrier_slow_path),
       mark_from_read_barrier_measurements_(false),
@@ -158,10 +160,12 @@
     gc_time_histogram_ = metrics->YoungGcCollectionTime();
     metrics_gc_count_ = metrics->YoungGcCount();
     gc_throughput_histogram_ = metrics->YoungGcThroughput();
+    gc_tracing_throughput_hist_ = metrics->YoungGcTracingThroughput();
   } else {
     gc_time_histogram_ = metrics->FullGcCollectionTime();
     metrics_gc_count_ = metrics->FullGcCount();
     gc_throughput_histogram_ = metrics->FullGcThroughput();
+    gc_tracing_throughput_hist_ = metrics->FullGcTracingThroughput();
   }
 }
 
@@ -401,6 +405,7 @@
   objects_moved_.store(0, std::memory_order_relaxed);
   bytes_moved_gc_thread_ = 0;
   objects_moved_gc_thread_ = 0;
+  bytes_scanned_ = 0;
   GcCause gc_cause = GetCurrentIteration()->GetGcCause();
 
   force_evacuate_all_ = false;
@@ -2196,6 +2201,7 @@
 
 inline void ConcurrentCopying::ProcessMarkStackRef(mirror::Object* to_ref) {
   DCHECK(!region_space_->IsInFromSpace(to_ref));
+  size_t obj_size = 0;
   space::RegionSpace::RegionType rtype = region_space_->GetRegionType(to_ref);
   if (kUseBakerReadBarrier) {
     DCHECK(to_ref->GetReadBarrierState() == ReadBarrier::GrayState())
@@ -2282,10 +2288,11 @@
       }
   }
   if (perform_scan) {
+    obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
     if (use_generational_cc_ && young_gen_) {
-      Scan<true>(to_ref);
+      Scan<true>(to_ref, obj_size);
     } else {
-      Scan<false>(to_ref);
+      Scan<false>(to_ref, obj_size);
     }
   }
   if (kUseBakerReadBarrier) {
@@ -2329,9 +2336,10 @@
     // Add to the live bytes per unevacuated from-space. Note this code is always run by the
     // GC-running thread (no synchronization required).
     DCHECK(region_space_bitmap_->Test(to_ref));
-    size_t obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
-    size_t alloc_size = RoundUp(obj_size, space::RegionSpace::kAlignment);
-    region_space_->AddLiveBytes(to_ref, alloc_size);
+    if (obj_size == 0) {
+      obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
+    }
+    region_space_->AddLiveBytes(to_ref, RoundUp(obj_size, space::RegionSpace::kAlignment));
   }
   if (ReadBarrier::kEnableToSpaceInvariantChecks) {
     CHECK(to_ref != nullptr);
@@ -2713,9 +2721,9 @@
     const uint64_t unevac_from_bytes = region_space_->GetBytesAllocatedInUnevacFromSpace();
     const uint64_t unevac_from_objects = region_space_->GetObjectsAllocatedInUnevacFromSpace();
     uint64_t to_bytes = bytes_moved_.load(std::memory_order_relaxed) + bytes_moved_gc_thread_;
-    cumulative_bytes_moved_.fetch_add(to_bytes, std::memory_order_relaxed);
+    cumulative_bytes_moved_ += to_bytes;
     uint64_t to_objects = objects_moved_.load(std::memory_order_relaxed) + objects_moved_gc_thread_;
-    cumulative_objects_moved_.fetch_add(to_objects, std::memory_order_relaxed);
+    cumulative_objects_moved_ += to_objects;
     if (kEnableFromSpaceAccountingCheck) {
       CHECK_EQ(from_space_num_objects_at_first_pause_, from_objects + unevac_from_objects);
       CHECK_EQ(from_space_num_bytes_at_first_pause_, from_bytes + unevac_from_bytes);
@@ -2757,6 +2765,7 @@
                 << heap_->num_bytes_allocated_.load();
     }
     RecordFree(ObjectBytePair(freed_objects, freed_bytes));
+    GetCurrentIteration()->SetScannedBytes(bytes_scanned_);
     if (kVerboseMode) {
       LOG(INFO) << "(after) num_bytes_allocated="
                 << heap_->num_bytes_allocated_.load();
@@ -3135,7 +3144,7 @@
 };
 
 template <bool kNoUnEvac>
-inline void ConcurrentCopying::Scan(mirror::Object* to_ref) {
+inline void ConcurrentCopying::Scan(mirror::Object* to_ref, size_t obj_size) {
   // Cannot have `kNoUnEvac` when Generational CC collection is disabled.
   DCHECK(!kNoUnEvac || use_generational_cc_);
   if (kDisallowReadBarrierDuringScan && !Runtime::Current()->IsActiveTransaction()) {
@@ -3144,6 +3153,11 @@
     // trigger read barriers.
     Thread::Current()->ModifyDebugDisallowReadBarrier(1);
   }
+  if (obj_size == 0) {
+    obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
+  }
+  bytes_scanned_ += obj_size;
+
   DCHECK(!region_space_->IsInFromSpace(to_ref));
   DCHECK_EQ(Thread::Current(), thread_running_gc_);
   RefFieldsVisitor<kNoUnEvac> visitor(this, thread_running_gc_);
@@ -3841,10 +3855,8 @@
      << (copied_live_bytes_ratio_sum_ / gc_count_) << " over " << gc_count_
      << " " << (young_gen_ ? "minor" : "major") << " GCs\n";
 
-  os << "Cumulative bytes moved "
-     << cumulative_bytes_moved_.load(std::memory_order_relaxed) << "\n";
-  os << "Cumulative objects moved "
-     << cumulative_objects_moved_.load(std::memory_order_relaxed) << "\n";
+  os << "Cumulative bytes moved " << cumulative_bytes_moved_ << "\n";
+  os << "Cumulative objects moved " << cumulative_objects_moved_ << "\n";
 
   os << "Peak regions allocated "
      << region_space_->GetMaxPeakNumNonFreeRegions() << " ("
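The bytes-scanned accounting above follows a compute-once pattern: ProcessMarkStackRef fetches SizeOf() only when it actually scans the object, hands that size to Scan() (which falls back to SizeOf() when called without one), and the live-bytes path recomputes it only if the scan branch was skipped. A stripped-down sketch of the same flow, using stand-in types rather than ART's mirror::Object and RegionSpace:

#include <cstddef>

struct Obj {
  size_t SizeOf() const { return 48; }  // stand-in for mirror::Object::SizeOf()
};

size_t bytes_scanned = 0;
size_t live_bytes = 0;

void Scan(const Obj& o, size_t obj_size = 0) {
  if (obj_size == 0) {
    obj_size = o.SizeOf();  // pay for SizeOf() only if the caller did not
  }
  bytes_scanned += obj_size;
  // ... visit reference fields ...
}

void ProcessRef(const Obj& o, bool perform_scan, bool add_live_bytes) {
  size_t obj_size = 0;
  if (perform_scan) {
    obj_size = o.SizeOf();  // computed once here ...
    Scan(o, obj_size);      // ... and reused inside Scan()
  }
  if (add_live_bytes) {
    if (obj_size == 0) {
      obj_size = o.SizeOf();  // recompute only when the scan branch was skipped
    }
    live_bytes += obj_size;   // the real code rounds up to RegionSpace::kAlignment first
  }
}
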
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 582328f..72d460d 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -177,7 +177,7 @@
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   // Scan the reference fields of object `to_ref`.
   template <bool kNoUnEvac>
-  void Scan(mirror::Object* to_ref) REQUIRES_SHARED(Locks::mutator_lock_)
+  void Scan(mirror::Object* to_ref, size_t obj_size = 0) REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_);
   // Scan the reference fields of object 'obj' in the dirty cards during
   // card-table scan. In addition to visiting the references, it also sets the
@@ -410,10 +410,6 @@
   // bytes_moved_gc_thread_ are critical for GC triggering; the others are just informative.
   Atomic<size_t> bytes_moved_;  // Used by mutators
   Atomic<size_t> objects_moved_;  // Used by mutators
-  size_t bytes_moved_gc_thread_;  // Used by GC
-  size_t objects_moved_gc_thread_;  // Used by GC
-  Atomic<uint64_t> cumulative_bytes_moved_;
-  Atomic<uint64_t> cumulative_objects_moved_;
 
   // copied_live_bytes_ratio_sum_ is read and written by CC per GC, in
   // ReclaimPhase, and is read by DumpPerformanceInfo (potentially from another
@@ -436,6 +432,15 @@
   // reclaimed_bytes_ratio = reclaimed_bytes/num_allocated_bytes per GC cycle
   float reclaimed_bytes_ratio_sum_;
 
+  // Used only by the GC thread, so these need not be atomic. Also, they should
+  // be kept in a different cacheline than bytes/objects_moved_ (above) to avoid
+  // false sharing.
+  size_t bytes_moved_gc_thread_;
+  size_t objects_moved_gc_thread_;
+  uint64_t bytes_scanned_;
+  uint64_t cumulative_bytes_moved_;
+  uint64_t cumulative_objects_moved_;
+
   // The skipped blocks are memory blocks/chunks that were copies of
   // objects that were unused due to lost races (cas failures) at
   // object copy/forward pointer install. They may be reused.
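On the "different cacheline" comment above: the patch relies on declaration order to keep the GC-thread-only counters away from the mutator-updated Atomic<> members. For illustration only (this is not what the patch does), the same intent can be made explicit with cache-line alignment:

#include <atomic>
#include <cstddef>
#include <cstdint>

constexpr size_t kCacheLineSize = 64;  // typical; std::hardware_destructive_interference_size where available

struct Counters {
  // Written concurrently by mutator threads.
  alignas(kCacheLineSize) std::atomic<size_t> bytes_moved{0};
  std::atomic<size_t> objects_moved{0};

  // Written only by the GC thread; starting a new cache line means mutator
  // stores to the atomics above do not keep invalidating it (no false sharing).
  alignas(kCacheLineSize) uint64_t bytes_scanned = 0;
  uint64_t cumulative_bytes_moved = 0;
};
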
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index f6bd996..70aec9a 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -51,6 +51,7 @@
   timings_.Reset();
   pause_times_.clear();
   duration_ns_ = 0;
+  bytes_scanned_ = 0;
   clear_soft_references_ = clear_soft_references;
   gc_cause_ = gc_cause;
   freed_ = ObjectBytePair();
@@ -72,6 +73,7 @@
       gc_time_histogram_(nullptr),
       metrics_gc_count_(nullptr),
       gc_throughput_histogram_(nullptr),
+      gc_tracing_throughput_hist_(nullptr),
       cumulative_timings_(name),
       pause_histogram_lock_("pause histogram lock", kDefaultMutexLevel, true),
       is_transaction_active_(false) {
@@ -88,6 +90,7 @@
   total_time_ns_ = 0u;
   total_freed_objects_ = 0u;
   total_freed_bytes_ = 0;
+  total_scanned_bytes_ = 0;
   rss_histogram_.Reset();
   freed_bytes_histogram_.Reset();
   MutexLock mu(Thread::Current(), pause_histogram_lock_);
@@ -169,6 +172,7 @@
   // Update cumulative statistics with how many bytes the GC iteration freed.
   total_freed_objects_ += current_iteration->GetFreedObjects() +
       current_iteration->GetFreedLargeObjects();
+  total_scanned_bytes_ += current_iteration->GetScannedBytes();
   int64_t freed_bytes = current_iteration->GetFreedBytes() +
       current_iteration->GetFreedLargeObjectBytes();
   total_freed_bytes_ += freed_bytes;
@@ -177,17 +181,18 @@
   uint64_t end_time = NanoTime();
   uint64_t thread_cpu_end_time = ThreadCpuNanoTime();
   total_thread_cpu_time_ns_ += thread_cpu_end_time - thread_cpu_start_time;
-  current_iteration->SetDurationNs(end_time - start_time);
+  uint64_t duration_ns = end_time - start_time;
+  current_iteration->SetDurationNs(duration_ns);
   if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
     // The entire GC was paused, clear the fake pauses which might be in the pause times and add
     // the whole GC duration.
     current_iteration->pause_times_.clear();
-    RegisterPause(current_iteration->GetDurationNs());
+    RegisterPause(duration_ns);
   }
-  total_time_ns_ += current_iteration->GetDurationNs();
+  total_time_ns_ += duration_ns;
   if (gc_time_histogram_ != nullptr) {
     // Report GC time in milliseconds.
-    gc_time_histogram_->Add(NsToMs(current_iteration->GetDurationNs()));
+    gc_time_histogram_->Add(NsToMs(duration_ns));
   }
   uint64_t total_pause_time = 0;
   for (uint64_t pause_time : current_iteration->GetPauseTimes()) {
@@ -201,6 +206,15 @@
   if (metrics_gc_count_ != nullptr) {
     metrics_gc_count_->Add(1);
   }
+
+  if (gc_tracing_throughput_hist_ != nullptr) {
+    // Throughput in bytes/s. Add 1us to prevent possible division by 0.
+    uint64_t throughput = (current_iteration->GetScannedBytes() * 1'000'000)
+            / (NsToUs(duration_ns) + 1);
+    // Report in MB/s.
+    gc_tracing_throughput_hist_->Add(throughput / MB);
+  }
+
   if (gc_throughput_histogram_ != nullptr) {
     // Report GC throughput in MB/s.
     gc_throughput_histogram_->Add(current_iteration->GetEstimatedThroughput() / MB);
@@ -247,6 +261,7 @@
   total_time_ns_ = 0u;
   total_freed_objects_ = 0u;
   total_freed_bytes_ = 0;
+  total_scanned_bytes_ = 0u;
 }
 
 GarbageCollector::ScopedPause::ScopedPause(GarbageCollector* collector, bool with_reporting)
@@ -306,6 +321,7 @@
   const double seconds = NsToMs(total_ns) / 1000.0;
   const uint64_t freed_bytes = GetTotalFreedBytes();
   const uint64_t freed_objects = GetTotalFreedObjects();
+  const uint64_t scanned_bytes = GetTotalScannedBytes();
   {
     MutexLock mu(Thread::Current(), pause_histogram_lock_);
     if (pause_histogram_.SampleSize() > 0) {
@@ -343,7 +359,11 @@
      << PrettySize(freed_bytes / seconds) << "/s"
      << "  per cpu-time: "
      << static_cast<uint64_t>(freed_bytes / cpu_seconds) << "/s / "
-     << PrettySize(freed_bytes / cpu_seconds) << "/s\n";
+     << PrettySize(freed_bytes / cpu_seconds) << "/s\n"
+     << GetName() << " tracing throughput: "
+     << PrettySize(scanned_bytes / seconds) << "/s "
+     << " per cpu-time: "
+     << PrettySize(scanned_bytes / cpu_seconds) << "/s\n";
 }
 
 }  // namespace collector
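The tracing throughput reported above is bytes scanned per second of wall-clock GC duration, computed in integer arithmetic with a one-microsecond guard against dividing by zero, then scaled to MB/s for the histogram. A worked example with illustrative numbers (MB here is ART's 1 MiB constant; the helper name is made up for the sketch):

#include <cstdint>

constexpr uint64_t MB = 1024 * 1024;
constexpr uint64_t NsToUs(uint64_t ns) { return ns / 1000; }

constexpr uint64_t TracingThroughputMBps(uint64_t scanned_bytes, uint64_t duration_ns) {
  // scanned_bytes * 1'000'000 / duration_us == bytes per second; +1 guards against a zero duration.
  const uint64_t bytes_per_sec = (scanned_bytes * 1'000'000) / (NsToUs(duration_ns) + 1);
  return bytes_per_sec / MB;
}

// Scanning 64 MiB of objects in a 40 ms collection reports roughly 1.6 GB/s:
static_assert(TracingThroughputMBps(64 * MB, 40'000'000) == 1'599);
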
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index b42f189..d2b1170 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -92,6 +92,9 @@
   uint64_t GetTotalFreedObjects() const {
     return total_freed_objects_;
   }
+  uint64_t GetTotalScannedBytes() const {
+    return total_scanned_bytes_;
+  }
   // Reset the cumulative timings and pause histogram.
   void ResetMeasurements() REQUIRES(!pause_histogram_lock_);
   // Returns the estimated throughput in bytes / second.
@@ -160,10 +163,12 @@
   metrics::MetricsBase<int64_t>* gc_time_histogram_;
   metrics::MetricsBase<uint64_t>* metrics_gc_count_;
   metrics::MetricsBase<int64_t>* gc_throughput_histogram_;
+  metrics::MetricsBase<int64_t>* gc_tracing_throughput_hist_;
   uint64_t total_thread_cpu_time_ns_;
   uint64_t total_time_ns_;
   uint64_t total_freed_objects_;
   int64_t total_freed_bytes_;
+  uint64_t total_scanned_bytes_;
   CumulativeLogger cumulative_timings_;
   mutable Mutex pause_histogram_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   bool is_transaction_active_;
diff --git a/runtime/gc/collector/iteration.h b/runtime/gc/collector/iteration.h
index 363459a..348f49d 100644
--- a/runtime/gc/collector/iteration.h
+++ b/runtime/gc/collector/iteration.h
@@ -60,6 +60,12 @@
   uint64_t GetFreedRevokeBytes() const {
     return freed_bytes_revoke_;
   }
+  uint64_t GetScannedBytes() const {
+    return bytes_scanned_;
+  }
+  void SetScannedBytes(uint64_t bytes) {
+    bytes_scanned_ = bytes;
+  }
   void SetFreedRevoke(uint64_t freed) {
     freed_bytes_revoke_ = freed;
   }
@@ -84,6 +90,7 @@
   GcCause gc_cause_;
   bool clear_soft_references_;
   uint64_t duration_ns_;
+  uint64_t bytes_scanned_;
   TimingLogger timings_;
   ObjectBytePair freed_;
   ObjectBytePair freed_los_;
diff --git a/runtime/metrics/statsd.cc b/runtime/metrics/statsd.cc
index dac41b9..7e74cf6 100644
--- a/runtime/metrics/statsd.cc
+++ b/runtime/metrics/statsd.cc
@@ -31,6 +31,7 @@
 // EncodeDatumId returns a std::optional that provides an enum value from atoms.proto if the datum is
 // one that we support logging to statsd. The list of datums that ART collects is a superset of what
 // we report to statsd. Therefore, we only have mappings for the DatumIds that statsd recognizes.
+// Note that histograms are not yet handled by statsd.
 //
 // Other code can use whether the result of this function has a value to decide whether to report
 // the atom to statsd.
@@ -76,14 +77,22 @@
     case DatumId::kYoungGcThroughput:
       return std::make_optional(
           statsd::
-              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_THROUGHPUT_MB_PER_SEC);
+              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_THROUGHPUT_HISTO_MB_PER_SEC);
     case DatumId::kFullGcThroughput:
       return std::make_optional(
           statsd::
-              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_THROUGHPUT_MB_PER_SEC);
+              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_THROUGHPUT_HISTO_MB_PER_SEC);
     case DatumId::kJitMethodCompileCount:
       return std::make_optional(
           statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_JIT_METHOD_COMPILE_COUNT);
+    case DatumId::kYoungGcTracingThroughput:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_TRACING_THROUGHPUT_HISTO_MB_PER_SEC);
+    case DatumId::kFullGcTracingThroughput:
+      return std::make_optional(
+          statsd::
+              ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_TRACING_THROUGHPUT_HISTO_MB_PER_SEC);
   }
 }
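
For context on how the new enum cases are consumed: per the comment at the top of EncodeDatumId, callers use the presence of a value to decide whether a datum is forwarded to statsd at all. A sketch of that call-site pattern (MaybeReport and the value parameter are illustrative, not the actual reporter code):

#include <cstdint>

void MaybeReport(DatumId datum_id, int64_t value) {
  auto atom = EncodeDatumId(datum_id);  // std::optional; empty for datums statsd does not know
  if (!atom.has_value()) {
    return;  // ART collects a superset of what statsd accepts; skip the rest.
  }
  // Forward atom.value() and `value` to the statsd write call here.
}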