Fix concurrent GC ergonomics
Fixed a race on the gc_request_pending_ boolean which, in most
cases, caused two concurrent GCs to start back to back. This broke
sticky CMS ergonomics: the second GC was a sticky CMS which started
far too early and therefore had low throughput, and the low
throughput made the ergonomics switch to a partial / full GC for the
next iteration.
The race happened as follows: an allocating thread would request a
concurrent GC, which woke up the daemon thread. The daemon thread
cleared the gc_request_pending_ boolean, but before it could set
concurrent_start_bytes_ to max to prevent further requests, the
allocating thread would call RequestConcurrentGC again. This caused
the next WaitForConcurrentGCRequest to return right away and a
concurrent GC to occur earlier than necessary.
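Roughly, the fixed handshake looks like this (a standalone sketch
with simplified locking; only gc_request_pending_ and
conc_gc_running_ correspond to real fields, the rest is
illustrative):

    #include <condition_variable>
    #include <mutex>

    // Simplified model of the daemon-thread handshake. The new
    // conc_gc_running_ flag turns redundant requests into no-ops while
    // the daemon thread is between clearing gc_request_pending_ and
    // raising concurrent_start_bytes_.
    class ConcurrentGCRequest {
     public:
      // Called by the GC daemon thread before running a concurrent GC.
      void WaitForRequest() {
        std::unique_lock<std::mutex> lock(lock_);
        conc_gc_running_ = false;
        cond_.wait(lock, [this] { return gc_request_pending_; });
        gc_request_pending_ = false;
        conc_gc_running_ = true;  // Ignore requests until this GC is done.
      }

      // Called by allocating threads once they cross concurrent_start_bytes_.
      void Notify() {
        std::lock_guard<std::mutex> lock(lock_);
        // Previously unconditional: a second request could slip in right
        // after the daemon thread cleared the pending flag.
        if (!conc_gc_running_) {
          gc_request_pending_ = true;
          cond_.notify_one();
        }
      }

     private:
      std::mutex lock_;
      std::condition_variable cond_;
      bool gc_request_pending_ = false;
      bool conc_gc_running_ = false;
    };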
Changed the allocation rate ergonomics to use the allocation rate
measured during the GC instead of the rate between GCs. This is a
better estimate because the allocation rate drops while the GC
steals mutator time; using the higher between-GC rate caused
concurrent GCs to start a bit earlier than they needed to.
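A simplified sketch of the new estimate (the actual change in
GrowForUtilization below also scales the estimate by the measured GC
duration; the constants and helper names here are illustrative):

    #include <algorithm>
    #include <cstdint>

    // Illustrative stand-ins for kMin/kMaxConcurrentRemainingBytes.
    constexpr uint64_t kMinRemainingBytes = 128 * 1024;
    constexpr uint64_t kMaxRemainingBytes = 512 * 1024;

    // Bytes allocated while the GC ran: the allocated count shrank by
    // freed_bytes during the GC, so add them back before subtracting the
    // pre-GC count.
    uint64_t BytesAllocatedDuringGc(uint64_t bytes_allocated_after_gc,
                                    uint64_t freed_bytes,
                                    uint64_t bytes_allocated_before_gc) {
      return bytes_allocated_after_gc + freed_bytes - bytes_allocated_before_gc;
    }

    // Pick the allocation level at which to kick off the next concurrent GC
    // so it can finish before the mutators reach the footprint limit.
    uint64_t ConcurrentStartBytes(uint64_t max_allowed_footprint,
                                  uint64_t bytes_allocated,
                                  uint64_t remaining_bytes_estimate) {
      uint64_t remaining = std::min(remaining_bytes_estimate, kMaxRemainingBytes);
      remaining = std::max(remaining, kMinRemainingBytes);
      if (remaining > max_allowed_footprint) {
        remaining = kMinRemainingBytes;
      }
      return std::max(max_allowed_footprint - remaining, bytes_allocated);
    }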
Fixed a bug in GrowForUtilization where we didn't use the adjusted
max_free when shrinking the heap, which occasionally caused the
sticky CMS to shrink the heap more than necessary.
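The shrink path, sketched standalone (the multiplier and field names
mirror the patch below; the free function itself is illustrative):

    #include <algorithm>
    #include <cstdint>

    // Shrink the footprint after a sticky GC, leaving adjusted_max_free of
    // headroom rather than the raw max_free, so a foreground app (where the
    // multiplier is typically > 1) is not shrunk too aggressively.
    uint64_t ShrinkTargetSize(uint64_t bytes_allocated,
                              uint64_t max_allowed_footprint,
                              uint64_t max_free,
                              double heap_growth_multiplier) {
      const uint64_t adjusted_max_free =
          static_cast<uint64_t>(max_free * heap_growth_multiplier);
      if (bytes_allocated + adjusted_max_free < max_allowed_footprint) {
        return bytes_allocated + adjusted_max_free;  // Was the raw max_free before.
      }
      return std::max(bytes_allocated, max_allowed_footprint);
    }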
EvaluateAndApplyChanges:
Before: ~12.6s GC time
After: ~7.75s GC time
Change-Id: I354bc825b3c44ccfbfe867af0d437b17fe1fe022
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 0587bb5..f738c6e 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -143,6 +143,7 @@
zygote_space_(nullptr),
large_object_threshold_(large_object_threshold),
gc_request_pending_(false),
+ conc_gc_running_(false),
collector_type_running_(kCollectorTypeNone),
last_gc_type_(collector::kGcTypeNone),
next_gc_type_(collector::kGcTypePartial),
@@ -167,8 +168,6 @@
verify_pre_gc_rosalloc_(verify_pre_gc_rosalloc),
verify_pre_sweeping_rosalloc_(verify_pre_sweeping_rosalloc),
verify_post_gc_rosalloc_(verify_post_gc_rosalloc),
- last_gc_time_ns_(NanoTime()),
- allocation_rate_(0),
/* For GC a lot mode, we limit the allocations stacks to be kGcAlotInterval allocations. This
* causes a lot of GC since we do a GC for alloc whenever the stack is full. When heap
* verification is enabled, we limit the size of allocation stacks to speed up their
@@ -413,7 +412,6 @@
gc_request_lock_ = new Mutex("GC request lock");
gc_request_cond_.reset(new ConditionVariable("GC request condition variable", *gc_request_lock_));
heap_trim_request_lock_ = new Mutex("Heap trim request lock");
- last_gc_size_ = GetBytesAllocated();
if (ignore_max_footprint_) {
SetIdealFootprint(std::numeric_limits<size_t>::max());
concurrent_start_bytes_ = std::numeric_limits<size_t>::max();
@@ -2154,16 +2152,9 @@
++runtime->GetStats()->gc_for_alloc_count;
++self->GetStats()->gc_for_alloc_count;
}
- uint64_t gc_start_time_ns = NanoTime();
- uint64_t gc_start_size = GetBytesAllocated();
- // Approximate allocation rate in bytes / second.
- uint64_t ms_delta = NsToMs(gc_start_time_ns - last_gc_time_ns_);
- // Back to back GCs can cause 0 ms of wait time in between GC invocations.
- if (LIKELY(ms_delta != 0)) {
- allocation_rate_ = ((gc_start_size - last_gc_size_) * 1000) / ms_delta;
- ATRACE_INT("Allocation rate KB/s", allocation_rate_ / KB);
- VLOG(heap) << "Allocation rate: " << PrettySize(allocation_rate_) << "/s";
- }
+ const uint64_t bytes_allocated_before_gc = GetBytesAllocated();
+ // Approximate heap size.
+ ATRACE_INT("Heap size (KB)", bytes_allocated_before_gc / KB);
DCHECK_LT(gc_type, collector::kGcTypeMax);
DCHECK_NE(gc_type, collector::kGcTypeNone);
@@ -2220,7 +2211,7 @@
// Enqueue cleared references.
reference_processor_.EnqueueClearedReferences(self);
// Grow the heap so that we know when to perform the next GC.
- GrowForUtilization(collector);
+ GrowForUtilization(collector, bytes_allocated_before_gc);
const size_t duration = GetCurrentGcIteration()->GetDurationNs();
const std::vector<uint64_t>& pause_times = GetCurrentGcIteration()->GetPauseTimes();
// Print the GC if it is an explicit GC (e.g. Runtime.gc()) or a slow GC
@@ -2930,25 +2921,24 @@
return foreground_heap_growth_multiplier_;
}
-void Heap::GrowForUtilization(collector::GarbageCollector* collector_ran) {
+void Heap::GrowForUtilization(collector::GarbageCollector* collector_ran,
+ uint64_t bytes_allocated_before_gc) {
// We know what our utilization is at this moment.
// This doesn't actually resize any memory. It just lets the heap grow more when necessary.
const uint64_t bytes_allocated = GetBytesAllocated();
- last_gc_size_ = bytes_allocated;
- last_gc_time_ns_ = NanoTime();
uint64_t target_size;
collector::GcType gc_type = collector_ran->GetGcType();
+ const double multiplier = HeapGrowthMultiplier(); // Use the multiplier to grow more for
+ // foreground.
+ const uint64_t adjusted_min_free = static_cast<uint64_t>(min_free_ * multiplier);
+ const uint64_t adjusted_max_free = static_cast<uint64_t>(max_free_ * multiplier);
if (gc_type != collector::kGcTypeSticky) {
// Grow the heap for non sticky GC.
- const float multiplier = HeapGrowthMultiplier(); // Use the multiplier to grow more for
- // foreground.
- intptr_t delta = bytes_allocated / GetTargetHeapUtilization() - bytes_allocated;
+ ssize_t delta = bytes_allocated / GetTargetHeapUtilization() - bytes_allocated;
CHECK_GE(delta, 0);
target_size = bytes_allocated + delta * multiplier;
- target_size = std::min(target_size,
- bytes_allocated + static_cast<uint64_t>(max_free_ * multiplier));
- target_size = std::max(target_size,
- bytes_allocated + static_cast<uint64_t>(min_free_ * multiplier));
+ target_size = std::min(target_size, bytes_allocated + adjusted_max_free);
+ target_size = std::max(target_size, bytes_allocated + adjusted_min_free);
native_need_to_run_finalization_ = true;
next_gc_type_ = collector::kGcTypeSticky;
} else {
@@ -2970,8 +2960,8 @@
next_gc_type_ = non_sticky_gc_type;
}
// If we have freed enough memory, shrink the heap back down.
- if (bytes_allocated + max_free_ < max_allowed_footprint_) {
- target_size = bytes_allocated + max_free_;
+ if (bytes_allocated + adjusted_max_free < max_allowed_footprint_) {
+ target_size = bytes_allocated + adjusted_max_free;
} else {
target_size = std::max(bytes_allocated, static_cast<uint64_t>(max_allowed_footprint_));
}
@@ -2979,11 +2969,18 @@
if (!ignore_max_footprint_) {
SetIdealFootprint(target_size);
if (IsGcConcurrent()) {
+ const uint64_t freed_bytes = current_gc_iteration_.GetFreedBytes() +
+ current_gc_iteration_.GetFreedLargeObjectBytes();
+ // Bytes allocated will shrink by freed_bytes after the GC runs, so if we want to figure out
+ // how many bytes were allocated during the GC we need to add freed_bytes back on.
+ CHECK_GE(bytes_allocated + freed_bytes, bytes_allocated_before_gc);
+ const uint64_t bytes_allocated_during_gc = bytes_allocated + freed_bytes -
+ bytes_allocated_before_gc;
// Calculate when to perform the next ConcurrentGC.
// Calculate the estimated GC duration.
const double gc_duration_seconds = NsToMs(current_gc_iteration_.GetDurationNs()) / 1000.0;
// Estimate how many remaining bytes we will have when we need to start the next GC.
- size_t remaining_bytes = allocation_rate_ * gc_duration_seconds;
+ size_t remaining_bytes = bytes_allocated_during_gc * gc_duration_seconds;
remaining_bytes = std::min(remaining_bytes, kMaxConcurrentRemainingBytes);
remaining_bytes = std::max(remaining_bytes, kMinConcurrentRemainingBytes);
if (UNLIKELY(remaining_bytes > max_allowed_footprint_)) {
@@ -3278,17 +3275,21 @@
void Heap::WaitForConcurrentGCRequest(Thread* self) {
ScopedThreadStateChange tsc(self, kBlocked);
MutexLock mu(self, *gc_request_lock_);
+ conc_gc_running_ = false;
while (!gc_request_pending_) {
gc_request_cond_->Wait(self);
}
gc_request_pending_ = false;
+ conc_gc_running_ = true;
}
void Heap::NotifyConcurrentGCRequest(Thread* self) {
ScopedThreadStateChange tsc(self, kBlocked);
MutexLock mu(self, *gc_request_lock_);
- gc_request_pending_ = true;
- gc_request_cond_->Signal(self);
+ if (!conc_gc_running_) {
+ gc_request_pending_ = true;
+ gc_request_cond_->Signal(self);
+ }
}
} // namespace gc
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 529af95..cf94eb6 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -755,8 +755,10 @@
// Given the current contents of the alloc space, increase the allowed heap footprint to match
// the target utilization ratio. This should only be called immediately after a full garbage
- // collection.
- void GrowForUtilization(collector::GarbageCollector* collector_ran);
+ // collection. bytes_allocated_before_gc is used to measure bytes / second for the period which
+ // the GC was run.
+ void GrowForUtilization(collector::GarbageCollector* collector_ran,
+ uint64_t bytes_allocated_before_gc = 0);
size_t GetPercentFree();
@@ -881,6 +883,7 @@
Mutex* gc_request_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
std::unique_ptr<ConditionVariable> gc_request_cond_ GUARDED_BY(gc_request_lock_);
bool gc_request_pending_ GUARDED_BY(gc_request_lock_);
+ bool conc_gc_running_ GUARDED_BY(gc_request_lock_);
// Reference processor;
ReferenceProcessor reference_processor_;
@@ -971,12 +974,6 @@
// Parallel GC data structures.
std::unique_ptr<ThreadPool> thread_pool_;
- // The nanosecond time at which the last GC ended.
- uint64_t last_gc_time_ns_;
-
- // How many bytes were allocated at the end of the last GC.
- uint64_t last_gc_size_;
-
// Estimated allocation rate (bytes / second). Computed between the time of the last GC cycle
// and the start of the current one.
uint64_t allocation_rate_;