[metrics] Add GC-work throughput metrics am: 7f0473851d am: 0ff602aeaa
Original change: https://android-review.googlesource.com/c/platform/art/+/1740166
Change-Id: I24520a356605804fcd1ac9ecaa89f5935bcc4c97
diff --git a/libartbase/base/metrics/metrics.h b/libartbase/base/metrics/metrics.h
index 72d8365..78a6387 100644
--- a/libartbase/base/metrics/metrics.h
+++ b/libartbase/base/metrics/metrics.h
@@ -49,8 +49,10 @@
METRIC(JitMethodCompileCount, MetricsCounter) \
METRIC(YoungGcCollectionTime, MetricsHistogram, 15, 0, 60'000) \
METRIC(FullGcCollectionTime, MetricsHistogram, 15, 0, 60'000) \
- METRIC(YoungGcThroughput, MetricsHistogram, 15, 0, 1'000) \
- METRIC(FullGcThroughput, MetricsHistogram, 15, 0, 1'000)
+ METRIC(YoungGcThroughput, MetricsHistogram, 15, 0, 10'000) \
+ METRIC(FullGcThroughput, MetricsHistogram, 15, 0, 10'000) \
+ METRIC(YoungGcTracingThroughput, MetricsHistogram, 15, 0, 10'000) \
+ METRIC(FullGcTracingThroughput, MetricsHistogram, 15, 0, 10'000)
// A lot of the metrics implementation code is generated by passing one-off macros into ART_COUNTERS
// and ART_HISTOGRAMS. This means metrics.h and metrics.cc are very #define-heavy, which can be
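For context on the histogram parameters above: assuming the three trailing METRIC arguments are the bucket count followed by the value range, as the existing GC-time entries suggest, each throughput histogram now splits 0-10'000 MB/s into 15 buckets. A rough, illustrative sketch of the resulting resolution:

    // Illustrative only: bucket resolution implied by METRIC(..., 15, 0, 10'000),
    // assuming (num_buckets, minimum_value, maximum_value) semantics.
    #include <cstdint>

    constexpr uint64_t kNumBuckets = 15;
    constexpr uint64_t kMinValueMbPerSec = 0;
    constexpr uint64_t kMaxValueMbPerSec = 10'000;
    constexpr uint64_t kBucketWidth =
        (kMaxValueMbPerSec - kMinValueMbPerSec) / kNumBuckets;
    static_assert(kBucketWidth == 666, "each bucket spans roughly 666 MB/s");

The wider 0-10'000 range presumably keeps fast devices from piling into the top bucket, at the cost of coarser per-bucket resolution.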
diff --git a/libartbase/base/time_utils.h b/libartbase/base/time_utils.h
index fbf3e94..dd73b1c 100644
--- a/libartbase/base/time_utils.h
+++ b/libartbase/base/time_utils.h
@@ -77,6 +77,11 @@
return ns / 1000 / 1000;
}
+// Converts the given number of nanoseconds to microseconds.
+static constexpr uint64_t NsToUs(uint64_t ns) {
+ return ns / 1000;
+}
+
// Converts the given number of milliseconds to nanoseconds
static constexpr uint64_t MsToNs(uint64_t ms) {
return ms * 1000 * 1000;
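The new NsToUs() helper follows the pattern of the neighboring conversions; a quick, self-contained sanity check (illustrative, not part of the change):

    #include <cstdint>

    // Same definition as the helper added above.
    static constexpr uint64_t NsToUs(uint64_t ns) { return ns / 1000; }

    static_assert(NsToUs(1'500'000) == 1'500, "1.5 ms is 1500 us");
    static_assert(NsToUs(999) == 0, "sub-microsecond durations truncate to zero");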
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index f58224d..96c792b 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -102,6 +102,8 @@
copied_live_bytes_ratio_sum_(0.f),
gc_count_(0),
reclaimed_bytes_ratio_sum_(0.f),
+ cumulative_bytes_moved_(0),
+ cumulative_objects_moved_(0),
skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
measure_read_barrier_slow_path_(measure_read_barrier_slow_path),
mark_from_read_barrier_measurements_(false),
@@ -158,10 +160,12 @@
gc_time_histogram_ = metrics->YoungGcCollectionTime();
metrics_gc_count_ = metrics->YoungGcCount();
gc_throughput_histogram_ = metrics->YoungGcThroughput();
+ gc_tracing_throughput_hist_ = metrics->YoungGcTracingThroughput();
} else {
gc_time_histogram_ = metrics->FullGcCollectionTime();
metrics_gc_count_ = metrics->FullGcCount();
gc_throughput_histogram_ = metrics->FullGcThroughput();
+ gc_tracing_throughput_hist_ = metrics->FullGcTracingThroughput();
}
}
@@ -401,6 +405,7 @@
objects_moved_.store(0, std::memory_order_relaxed);
bytes_moved_gc_thread_ = 0;
objects_moved_gc_thread_ = 0;
+ bytes_scanned_ = 0;
GcCause gc_cause = GetCurrentIteration()->GetGcCause();
force_evacuate_all_ = false;
@@ -2196,6 +2201,7 @@
inline void ConcurrentCopying::ProcessMarkStackRef(mirror::Object* to_ref) {
DCHECK(!region_space_->IsInFromSpace(to_ref));
+ size_t obj_size = 0;
space::RegionSpace::RegionType rtype = region_space_->GetRegionType(to_ref);
if (kUseBakerReadBarrier) {
DCHECK(to_ref->GetReadBarrierState() == ReadBarrier::GrayState())
@@ -2282,10 +2288,11 @@
}
}
if (perform_scan) {
+ obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
if (use_generational_cc_ && young_gen_) {
- Scan<true>(to_ref);
+ Scan<true>(to_ref, obj_size);
} else {
- Scan<false>(to_ref);
+ Scan<false>(to_ref, obj_size);
}
}
if (kUseBakerReadBarrier) {
@@ -2329,9 +2336,10 @@
// Add to the live bytes per unevacuated from-space. Note this code is always run by the
// GC-running thread (no synchronization required).
DCHECK(region_space_bitmap_->Test(to_ref));
- size_t obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
- size_t alloc_size = RoundUp(obj_size, space::RegionSpace::kAlignment);
- region_space_->AddLiveBytes(to_ref, alloc_size);
+ if (obj_size == 0) {
+ obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
+ }
+ region_space_->AddLiveBytes(to_ref, RoundUp(obj_size, space::RegionSpace::kAlignment));
}
if (ReadBarrier::kEnableToSpaceInvariantChecks) {
CHECK(to_ref != nullptr);
@@ -2713,9 +2721,9 @@
const uint64_t unevac_from_bytes = region_space_->GetBytesAllocatedInUnevacFromSpace();
const uint64_t unevac_from_objects = region_space_->GetObjectsAllocatedInUnevacFromSpace();
uint64_t to_bytes = bytes_moved_.load(std::memory_order_relaxed) + bytes_moved_gc_thread_;
- cumulative_bytes_moved_.fetch_add(to_bytes, std::memory_order_relaxed);
+ cumulative_bytes_moved_ += to_bytes;
uint64_t to_objects = objects_moved_.load(std::memory_order_relaxed) + objects_moved_gc_thread_;
- cumulative_objects_moved_.fetch_add(to_objects, std::memory_order_relaxed);
+ cumulative_objects_moved_ += to_objects;
if (kEnableFromSpaceAccountingCheck) {
CHECK_EQ(from_space_num_objects_at_first_pause_, from_objects + unevac_from_objects);
CHECK_EQ(from_space_num_bytes_at_first_pause_, from_bytes + unevac_from_bytes);
@@ -2757,6 +2765,7 @@
<< heap_->num_bytes_allocated_.load();
}
RecordFree(ObjectBytePair(freed_objects, freed_bytes));
+ GetCurrentIteration()->SetScannedBytes(bytes_scanned_);
if (kVerboseMode) {
LOG(INFO) << "(after) num_bytes_allocated="
<< heap_->num_bytes_allocated_.load();
@@ -3135,7 +3144,7 @@
};
template <bool kNoUnEvac>
-inline void ConcurrentCopying::Scan(mirror::Object* to_ref) {
+inline void ConcurrentCopying::Scan(mirror::Object* to_ref, size_t obj_size) {
// Cannot have `kNoUnEvac` when Generational CC collection is disabled.
DCHECK(!kNoUnEvac || use_generational_cc_);
if (kDisallowReadBarrierDuringScan && !Runtime::Current()->IsActiveTransaction()) {
@@ -3144,6 +3153,11 @@
// trigger read barriers.
Thread::Current()->ModifyDebugDisallowReadBarrier(1);
}
+ if (obj_size == 0) {
+ obj_size = to_ref->SizeOf<kDefaultVerifyFlags>();
+ }
+ bytes_scanned_ += obj_size;
+
DCHECK(!region_space_->IsInFromSpace(to_ref));
DCHECK_EQ(Thread::Current(), thread_running_gc_);
RefFieldsVisitor<kNoUnEvac> visitor(this, thread_running_gc_);
@@ -3841,10 +3855,8 @@
<< (copied_live_bytes_ratio_sum_ / gc_count_) << " over " << gc_count_
<< " " << (young_gen_ ? "minor" : "major") << " GCs\n";
- os << "Cumulative bytes moved "
- << cumulative_bytes_moved_.load(std::memory_order_relaxed) << "\n";
- os << "Cumulative objects moved "
- << cumulative_objects_moved_.load(std::memory_order_relaxed) << "\n";
+ os << "Cumulative bytes moved " << cumulative_bytes_moved_ << "\n";
+ os << "Cumulative objects moved " << cumulative_objects_moved_ << "\n";
os << "Peak regions allocated "
<< region_space_->GetMaxPeakNumNonFreeRegions() << " ("
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 582328f..72d460d 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -177,7 +177,7 @@
REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
// Scan the reference fields of object `to_ref`.
template <bool kNoUnEvac>
- void Scan(mirror::Object* to_ref) REQUIRES_SHARED(Locks::mutator_lock_)
+ void Scan(mirror::Object* to_ref, size_t obj_size = 0) REQUIRES_SHARED(Locks::mutator_lock_)
REQUIRES(!mark_stack_lock_);
// Scan the reference fields of object 'obj' in the dirty cards during
// card-table scan. In addition to visiting the references, it also sets the
@@ -410,10 +410,6 @@
// bytes_moved_gc_thread_ are critical for GC triggering; the others are just informative.
Atomic<size_t> bytes_moved_; // Used by mutators
Atomic<size_t> objects_moved_; // Used by mutators
- size_t bytes_moved_gc_thread_; // Used by GC
- size_t objects_moved_gc_thread_; // Used by GC
- Atomic<uint64_t> cumulative_bytes_moved_;
- Atomic<uint64_t> cumulative_objects_moved_;
// copied_live_bytes_ratio_sum_ is read and written by CC per GC, in
// ReclaimPhase, and is read by DumpPerformanceInfo (potentially from another
@@ -436,6 +432,15 @@
// reclaimed_bytes_ratio = reclaimed_bytes/num_allocated_bytes per GC cycle
float reclaimed_bytes_ratio_sum_;
+ // Used only by the GC thread, so these need not be atomic. They should also be kept
+ // on a different cache line than bytes_moved_/objects_moved_ (above) to avoid
+ // false sharing.
+ size_t bytes_moved_gc_thread_;
+ size_t objects_moved_gc_thread_;
+ uint64_t bytes_scanned_;
+ uint64_t cumulative_bytes_moved_;
+ uint64_t cumulative_objects_moved_;
+
// The skipped blocks are memory blocks/chunks that were copies of
// objects that were unused due to lost races (cas failures) at
// object copy/forward pointer install. They may be reused.
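The Scan() signature change above lets callers that already know the object's size pass it in, so SizeOf() is evaluated at most once per object while bytes_scanned_ accumulates the tracing workload. A minimal, self-contained sketch of that lazy-size pattern (the types and names below are illustrative stand-ins, not ART's):

    #include <cstddef>
    #include <cstdint>

    struct FakeObject {
      size_t SizeOf() const { return 48; }  // stand-in for mirror::Object::SizeOf()
    };

    struct Collector {
      uint64_t bytes_scanned_ = 0;

      // obj_size == 0 means "not computed yet"; a caller such as
      // ProcessMarkStackRef that has already computed the size passes it in,
      // so the SizeOf() call is not repeated for the same object.
      void Scan(const FakeObject* obj, size_t obj_size = 0) {
        if (obj_size == 0) {
          obj_size = obj->SizeOf();
        }
        bytes_scanned_ += obj_size;
        // ... visit the object's reference fields ...
      }
    };

Using 0 as the "not yet computed" sentinel is safe here because heap objects always have a non-zero size, so a real size is never mistaken for the sentinel.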
diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc
index f6bd996..70aec9a 100644
--- a/runtime/gc/collector/garbage_collector.cc
+++ b/runtime/gc/collector/garbage_collector.cc
@@ -51,6 +51,7 @@
timings_.Reset();
pause_times_.clear();
duration_ns_ = 0;
+ bytes_scanned_ = 0;
clear_soft_references_ = clear_soft_references;
gc_cause_ = gc_cause;
freed_ = ObjectBytePair();
@@ -72,6 +73,7 @@
gc_time_histogram_(nullptr),
metrics_gc_count_(nullptr),
gc_throughput_histogram_(nullptr),
+ gc_tracing_throughput_hist_(nullptr),
cumulative_timings_(name),
pause_histogram_lock_("pause histogram lock", kDefaultMutexLevel, true),
is_transaction_active_(false) {
@@ -88,6 +90,7 @@
total_time_ns_ = 0u;
total_freed_objects_ = 0u;
total_freed_bytes_ = 0;
+ total_scanned_bytes_ = 0;
rss_histogram_.Reset();
freed_bytes_histogram_.Reset();
MutexLock mu(Thread::Current(), pause_histogram_lock_);
@@ -169,6 +172,7 @@
// Update cumulative statistics with how many bytes the GC iteration freed.
total_freed_objects_ += current_iteration->GetFreedObjects() +
current_iteration->GetFreedLargeObjects();
+ total_scanned_bytes_ += current_iteration->GetScannedBytes();
int64_t freed_bytes = current_iteration->GetFreedBytes() +
current_iteration->GetFreedLargeObjectBytes();
total_freed_bytes_ += freed_bytes;
@@ -177,17 +181,18 @@
uint64_t end_time = NanoTime();
uint64_t thread_cpu_end_time = ThreadCpuNanoTime();
total_thread_cpu_time_ns_ += thread_cpu_end_time - thread_cpu_start_time;
- current_iteration->SetDurationNs(end_time - start_time);
+ uint64_t duration_ns = end_time - start_time;
+ current_iteration->SetDurationNs(duration_ns);
if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
// The entire GC was paused, clear the fake pauses which might be in the pause times and add
// the whole GC duration.
current_iteration->pause_times_.clear();
- RegisterPause(current_iteration->GetDurationNs());
+ RegisterPause(duration_ns);
}
- total_time_ns_ += current_iteration->GetDurationNs();
+ total_time_ns_ += duration_ns;
if (gc_time_histogram_ != nullptr) {
// Report GC time in milliseconds.
- gc_time_histogram_->Add(NsToMs(current_iteration->GetDurationNs()));
+ gc_time_histogram_->Add(NsToMs(duration_ns));
}
uint64_t total_pause_time = 0;
for (uint64_t pause_time : current_iteration->GetPauseTimes()) {
@@ -201,6 +206,15 @@
if (metrics_gc_count_ != nullptr) {
metrics_gc_count_->Add(1);
}
+
+ if (gc_tracing_throughput_hist_ != nullptr) {
+ // Throughput in bytes/s. Add 1us to prevent possible division by 0.
+ uint64_t throughput = (current_iteration->GetScannedBytes() * 1'000'000)
+ / (NsToUs(duration_ns) + 1);
+ // Report in MB/s.
+ gc_tracing_throughput_hist_->Add(throughput / MB);
+ }
+
if (gc_throughput_histogram_ != nullptr) {
// Report GC throughput in MB/s.
gc_throughput_histogram_->Add(current_iteration->GetEstimatedThroughput() / MB);
@@ -247,6 +261,7 @@
total_time_ns_ = 0u;
total_freed_objects_ = 0u;
total_freed_bytes_ = 0;
+ total_scanned_bytes_ = 0u;
}
GarbageCollector::ScopedPause::ScopedPause(GarbageCollector* collector, bool with_reporting)
@@ -306,6 +321,7 @@
const double seconds = NsToMs(total_ns) / 1000.0;
const uint64_t freed_bytes = GetTotalFreedBytes();
const uint64_t freed_objects = GetTotalFreedObjects();
+ const uint64_t scanned_bytes = GetTotalScannedBytes();
{
MutexLock mu(Thread::Current(), pause_histogram_lock_);
if (pause_histogram_.SampleSize() > 0) {
@@ -343,7 +359,11 @@
<< PrettySize(freed_bytes / seconds) << "/s"
<< " per cpu-time: "
<< static_cast<uint64_t>(freed_bytes / cpu_seconds) << "/s / "
- << PrettySize(freed_bytes / cpu_seconds) << "/s\n";
+ << PrettySize(freed_bytes / cpu_seconds) << "/s\n"
+ << GetName() << " tracing throughput: "
+ << PrettySize(scanned_bytes / seconds) << "/s"
+ << " per cpu-time: "
+ << PrettySize(scanned_bytes / cpu_seconds) << "/s\n";
}
} // namespace collector
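Putting the FinishGC() pieces above together: the byte count recorded via SetScannedBytes() is divided by the iteration's wall-clock duration, with 1 us added to the denominator to avoid dividing by zero, and the result is reported in MB/s. A self-contained sketch of that arithmetic (MB below is a local stand-in for ART's megabyte constant):

    #include <cstdint>
    #include <cstdio>

    static constexpr uint64_t MB = 1024 * 1024;  // stand-in for ART's MB constant
    static constexpr uint64_t NsToUs(uint64_t ns) { return ns / 1000; }

    int main() {
      // Example iteration: 512 MiB of objects scanned over a 200 ms collection.
      const uint64_t scanned_bytes = 512ull * MB;
      const uint64_t duration_ns = 200ull * 1000 * 1000;
      // Same formula as in FinishGC(): bytes/s, guarded against a zero duration.
      const uint64_t throughput =
          (scanned_bytes * 1'000'000) / (NsToUs(duration_ns) + 1);
      std::printf("tracing throughput: %llu MB/s\n",
                  static_cast<unsigned long long>(throughput / MB));  // ~2559 MB/s
      return 0;
    }

Unlike the existing throughput figures, which are based on bytes freed (see the freed_bytes / seconds lines in Dump above), this metric measures how much live data the collector had to trace, so the two histograms capture different aspects of GC cost.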
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index b42f189..d2b1170 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -92,6 +92,9 @@
uint64_t GetTotalFreedObjects() const {
return total_freed_objects_;
}
+ uint64_t GetTotalScannedBytes() const {
+ return total_scanned_bytes_;
+ }
// Reset the cumulative timings and pause histogram.
void ResetMeasurements() REQUIRES(!pause_histogram_lock_);
// Returns the estimated throughput in bytes / second.
@@ -160,10 +163,12 @@
metrics::MetricsBase<int64_t>* gc_time_histogram_;
metrics::MetricsBase<uint64_t>* metrics_gc_count_;
metrics::MetricsBase<int64_t>* gc_throughput_histogram_;
+ metrics::MetricsBase<int64_t>* gc_tracing_throughput_hist_;
uint64_t total_thread_cpu_time_ns_;
uint64_t total_time_ns_;
uint64_t total_freed_objects_;
int64_t total_freed_bytes_;
+ uint64_t total_scanned_bytes_;
CumulativeLogger cumulative_timings_;
mutable Mutex pause_histogram_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
bool is_transaction_active_;
diff --git a/runtime/gc/collector/iteration.h b/runtime/gc/collector/iteration.h
index 363459a..348f49d 100644
--- a/runtime/gc/collector/iteration.h
+++ b/runtime/gc/collector/iteration.h
@@ -60,6 +60,12 @@
uint64_t GetFreedRevokeBytes() const {
return freed_bytes_revoke_;
}
+ uint64_t GetScannedBytes() const {
+ return bytes_scanned_;
+ }
+ void SetScannedBytes(uint64_t bytes) {
+ bytes_scanned_ = bytes;
+ }
void SetFreedRevoke(uint64_t freed) {
freed_bytes_revoke_ = freed;
}
@@ -84,6 +90,7 @@
GcCause gc_cause_;
bool clear_soft_references_;
uint64_t duration_ns_;
+ uint64_t bytes_scanned_;
TimingLogger timings_;
ObjectBytePair freed_;
ObjectBytePair freed_los_;
diff --git a/runtime/metrics/statsd.cc b/runtime/metrics/statsd.cc
index dac41b9..7e74cf6 100644
--- a/runtime/metrics/statsd.cc
+++ b/runtime/metrics/statsd.cc
@@ -31,6 +31,7 @@
// EncodeDatumId returns a std::optional that provides an enum value from atoms.proto if the datum is
// one that we support logging to statsd. The list of datums that ART collects is a superset of what
// we report to statsd. Therefore, we only have mappings for the DatumIds that statsd recognizes.
+// Note that histograms are not yet handled by statsd.
//
// Other code can use whether the result of this function has a value to decide whether to report
// the atom to statsd.
@@ -76,14 +77,22 @@
case DatumId::kYoungGcThroughput:
return std::make_optional(
statsd::
- ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_THROUGHPUT_MB_PER_SEC);
+ ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_COLLECTION_THROUGHPUT_HISTO_MB_PER_SEC);
case DatumId::kFullGcThroughput:
return std::make_optional(
statsd::
- ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_THROUGHPUT_MB_PER_SEC);
+ ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_COLLECTION_THROUGHPUT_HISTO_MB_PER_SEC);
case DatumId::kJitMethodCompileCount:
return std::make_optional(
statsd::ART_DATUM_REPORTED__KIND__ART_DATUM_JIT_METHOD_COMPILE_COUNT);
+ case DatumId::kYoungGcTracingThroughput:
+ return std::make_optional(
+ statsd::
+ ART_DATUM_REPORTED__KIND__ART_DATUM_GC_YOUNG_GENERATION_TRACING_THROUGHPUT_HISTO_MB_PER_SEC);
+ case DatumId::kFullGcTracingThroughput:
+ return std::make_optional(
+ statsd::
+ ART_DATUM_REPORTED__KIND__ART_DATUM_GC_FULL_HEAP_TRACING_THROUGHPUT_HISTO_MB_PER_SEC);
}
}