 runtime/gc/heap-inl.h                     |  68
 runtime/gc/heap.cc                        | 125
 runtime/gc/heap.h                         |  58
 runtime/native/dalvik_system_VMRuntime.cc |  13
 4 files changed, 164 insertions, 100 deletions
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 61378c90a2..4cfbe2d4bd 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -81,7 +81,12 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
   size_t bytes_allocated;
   size_t usable_size;
   size_t new_num_bytes_allocated = 0;
+  bool need_gc = false;
+  uint32_t starting_gc_num;  // o.w. GC number at which we observed need for GC.
   {
+    // Bytes allocated that includes bulk thread-local buffer allocations in addition to direct
+    // non-TLAB object allocations. Only set for non-thread-local allocation,
+    size_t bytes_tl_bulk_allocated = 0u;
     // Do the initial pre-alloc
     pre_object_allocated();
     ScopedAssertNoThreadSuspension ants("Called PreObjectAllocated, no suspend until alloc");
@@ -132,9 +137,6 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
       no_suspend_pre_fence_visitor(obj, usable_size);
       QuasiAtomic::ThreadFenceForConstructor();
     } else {
-      // Bytes allocated that includes bulk thread-local buffer allocations in addition to direct
-      // non-TLAB object allocations.
-      size_t bytes_tl_bulk_allocated = 0u;
       obj = TryToAllocate<kInstrumented, false>(self, allocator, byte_count, &bytes_allocated,
                                                 &usable_size, &bytes_tl_bulk_allocated);
       if (UNLIKELY(obj == nullptr)) {
@@ -180,22 +182,32 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
       }
       no_suspend_pre_fence_visitor(obj, usable_size);
       QuasiAtomic::ThreadFenceForConstructor();
-      if (bytes_tl_bulk_allocated > 0) {
-        size_t num_bytes_allocated_before =
-            num_bytes_allocated_.fetch_add(bytes_tl_bulk_allocated, std::memory_order_relaxed);
-        new_num_bytes_allocated = num_bytes_allocated_before + bytes_tl_bulk_allocated;
-        // Only trace when we get an increase in the number of bytes allocated. This happens when
-        // obtaining a new TLAB and isn't often enough to hurt performance according to golem.
-        if (region_space_) {
-          // With CC collector, during a GC cycle, the heap usage increases as
-          // there are two copies of evacuated objects. Therefore, add evac-bytes
-          // to the heap size. When the GC cycle is not running, evac-bytes
-          // are 0, as required.
-          TraceHeapSize(new_num_bytes_allocated + region_space_->EvacBytes());
-        } else {
-          TraceHeapSize(new_num_bytes_allocated);
-        }
+    }
+    if (bytes_tl_bulk_allocated > 0) {
+      starting_gc_num = GetCurrentGcNum();
+      size_t num_bytes_allocated_before =
+          num_bytes_allocated_.fetch_add(bytes_tl_bulk_allocated, std::memory_order_relaxed);
+      new_num_bytes_allocated = num_bytes_allocated_before + bytes_tl_bulk_allocated;
+      // Only trace when we get an increase in the number of bytes allocated. This happens when
+      // obtaining a new TLAB and isn't often enough to hurt performance according to golem.
+      if (region_space_) {
+        // With CC collector, during a GC cycle, the heap usage increases as
+        // there are two copies of evacuated objects. Therefore, add evac-bytes
+        // to the heap size. When the GC cycle is not running, evac-bytes
+        // are 0, as required.
+        TraceHeapSize(new_num_bytes_allocated + region_space_->EvacBytes());
+      } else {
+        TraceHeapSize(new_num_bytes_allocated);
+      }
+      // IsGcConcurrent() isn't known at compile time so we can optimize by not checking it for the
+      // BumpPointer or TLAB allocators. This is nice since it allows the entire if statement to be
+      // optimized out. And for the other allocators, AllocatorMayHaveConcurrentGC is a constant
+      // since the allocator_type should be constant propagated.
+      if (AllocatorMayHaveConcurrentGC(allocator) && IsGcConcurrent()
+          && UNLIKELY(ShouldConcurrentGCForJava(new_num_bytes_allocated))) {
+        need_gc = true;
       }
+      GetMetrics()->TotalBytesAllocated()->Add(bytes_tl_bulk_allocated);
     }
   }
   if (kIsDebugBuild && Runtime::Current()->IsStarted()) {
@@ -214,7 +226,6 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
   } else {
     DCHECK(!Runtime::Current()->HasStatsEnabled());
   }
-  GetMetrics()->TotalBytesAllocated()->Add(bytes_allocated);
   if (kInstrumented) {
     if (IsAllocTrackingEnabled()) {
       // allocation_records_ is not null since it never becomes null after allocation tracking is
@@ -241,14 +252,9 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
   } else {
     DCHECK(!gc_stress_mode_);
   }
-  // IsGcConcurrent() isn't known at compile time so we can optimize by not checking it for
-  // the BumpPointer or TLAB allocators. This is nice since it allows the entire if statement to be
-  // optimized out. And for the other allocators, AllocatorMayHaveConcurrentGC is a constant since
-  // the allocator_type should be constant propagated.
-  if (AllocatorMayHaveConcurrentGC(allocator) && IsGcConcurrent()) {
-    // New_num_bytes_allocated is zero if we didn't update num_bytes_allocated_.
-    // That's fine.
-    CheckConcurrentGCForJava(self, new_num_bytes_allocated, &obj);
+  if (need_gc) {
+    // Do this only once thread suspension is allowed again, and we're done with kInstrumented.
+    RequestConcurrentGCAndSaveObject(self, /*force_full=*/ false, starting_gc_num, &obj);
   }
   VerifyObject(obj);
   self->VerifyStack();
@@ -464,14 +470,6 @@ inline bool Heap::ShouldConcurrentGCForJava(size_t new_num_bytes_allocated) {
   return new_num_bytes_allocated >= concurrent_start_bytes_;
 }
 
-inline void Heap::CheckConcurrentGCForJava(Thread* self,
-                                           size_t new_num_bytes_allocated,
-                                           ObjPtr<mirror::Object>* obj) {
-  if (UNLIKELY(ShouldConcurrentGCForJava(new_num_bytes_allocated))) {
-    RequestConcurrentGCAndSaveObject(self, false /* force_full */, obj);
-  }
-}
-
 }  // namespace gc
 }  // namespace art
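
The heap-inl.h hunks above split the decision from the action: inside the region where thread suspension is forbidden, the allocation path only records that a concurrent GC is needed (need_gc) together with the GC number it observed at that point (starting_gc_num); the actual RequestConcurrentGCAndSaveObject call is issued later, once suspension is allowed again. Below is a minimal standalone sketch of that defer-then-request pattern; the names (AllocateBulk, kConcurrentStartBytes, the plain std::atomic counters) are illustrative stand-ins, not ART's Heap API.

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the real heap state; names here are illustrative only.
    std::atomic<uint32_t> gcs_completed{0};      // bumped when a GC finishes
    std::atomic<size_t> num_bytes_allocated{0};
    constexpr size_t kConcurrentStartBytes = 1024;

    void RequestConcurrentGC(uint32_t observed_gc_num) {
      std::printf("requesting GC after #%u\n", observed_gc_num);
    }

    void AllocateBulk(size_t bytes) {
      bool need_gc = false;
      uint32_t starting_gc_num = 0;
      {
        // "No suspension allowed" region: decide, but do not act.
        starting_gc_num = gcs_completed.load(std::memory_order_acquire);
        size_t new_total =
            num_bytes_allocated.fetch_add(bytes, std::memory_order_relaxed) + bytes;
        need_gc = new_total >= kConcurrentStartBytes;
      }
      // Only now, outside the restricted region, issue the request.
      if (need_gc) {
        RequestConcurrentGC(starting_gc_num);
      }
    }

    int main() {
      AllocateBulk(2048);
    }
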
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 7cfa0fae3d..a55df4631d 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -366,6 +366,8 @@ Heap::Heap(size_t initial_size,
       min_interval_homogeneous_space_compaction_by_oom_(
           min_interval_homogeneous_space_compaction_by_oom),
       last_time_homogeneous_space_compaction_by_oom_(NanoTime()),
+      gcs_completed_(0u),
+      gcs_requested_(0u),
       pending_collector_transition_(nullptr),
       pending_heap_trim_(nullptr),
       use_homogeneous_space_compaction_for_oom_(use_homogeneous_space_compaction_for_oom),
@@ -1465,7 +1467,7 @@ void Heap::DoPendingCollectorTransition() {
       // Invoke CC full compaction.
       CollectGarbageInternal(collector::kGcTypeFull,
                              kGcCauseCollectorTransition,
-                             /*clear_soft_references=*/false);
+                             /*clear_soft_references=*/false, GC_NUM_ANY);
     } else {
       VLOG(gc) << "CC background compaction ignored due to jank perceptible process state";
     }
@@ -1824,6 +1826,7 @@ mirror::Object* Heap::AllocateInternalWithGc(Thread* self,
       (!instrumented && EntrypointsInstrumented())) {
     return nullptr;
   }
+  uint32_t starting_gc_num = GetCurrentGcNum();
   if (last_gc != collector::kGcTypeNone) {
     // A GC was in progress and we blocked, retry allocation now that memory has been freed.
     mirror::Object* ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated,
@@ -1848,7 +1851,8 @@ mirror::Object* Heap::AllocateInternalWithGc(Thread* self,
 
   collector::GcType tried_type = next_gc_type_;
   if (last_gc < tried_type) {
     const bool gc_ran = PERFORM_SUSPENDING_OPERATION(
-        CollectGarbageInternal(tried_type, kGcCauseForAlloc, false) != collector::kGcTypeNone);
+        CollectGarbageInternal(tried_type, kGcCauseForAlloc, false, starting_gc_num + 1)
+        != collector::kGcTypeNone);
     if ((was_default_allocator && allocator != GetCurrentAllocator()) ||
         (!instrumented && EntrypointsInstrumented())) {
@@ -1871,8 +1875,11 @@ mirror::Object* Heap::AllocateInternalWithGc(Thread* self,
              << " allocation";
   // TODO: Run finalization, but this may cause more allocations to occur.
   // We don't need a WaitForGcToComplete here either.
+  // TODO: Should check whether another thread already just ran a GC with soft
+  // references.
   DCHECK(!gc_plan_.empty());
-  PERFORM_SUSPENDING_OPERATION(CollectGarbageInternal(gc_plan_.back(), kGcCauseForAlloc, true));
+  PERFORM_SUSPENDING_OPERATION(
+      CollectGarbageInternal(gc_plan_.back(), kGcCauseForAlloc, true, GC_NUM_ANY));
   if ((was_default_allocator && allocator != GetCurrentAllocator()) ||
       (!instrumented && EntrypointsInstrumented())) {
     return nullptr;
@@ -2028,7 +2035,7 @@ void Heap::CountInstances(const std::vector<Handle<mirror::Class>>& classes,
 void Heap::CollectGarbage(bool clear_soft_references, GcCause cause) {
   // Even if we waited for a GC we still need to do another GC since weaks allocated during the
   // last GC will not have necessarily been cleared.
-  CollectGarbageInternal(gc_plan_.back(), cause, clear_soft_references);
+  CollectGarbageInternal(gc_plan_.back(), cause, clear_soft_references, GC_NUM_ANY);
 }
 
 bool Heap::SupportHomogeneousSpaceCompactAndCollectorTransitions() const {
@@ -2295,7 +2302,7 @@ void Heap::PreZygoteFork() {
   if (!HasZygoteSpace()) {
     // We still want to GC in case there is some unreachable non moving objects that could cause a
     // suboptimal bin packing when we compact the zygote space.
-    CollectGarbageInternal(collector::kGcTypeFull, kGcCauseBackground, false);
+    CollectGarbageInternal(collector::kGcTypeFull, kGcCauseBackground, false, GC_NUM_ANY);
     // Trim the pages at the end of the non moving space. Trim while not holding zygote lock since
     // the trim process may require locking the mutator lock.
     non_moving_space_->Trim();
@@ -2552,9 +2559,17 @@ size_t Heap::GetNativeBytes() {
   // other things. It seems risky to trigger GCs as a result of such changes.
 }
 
+static inline bool GCNumberLt(uint32_t gcs_completed, uint32_t gcs_requested) {
+  uint32_t difference = gcs_requested - gcs_completed;
+  bool completed_more_than_requested = difference > 0x80000000;
+  return difference > 0 && !completed_more_than_requested;
+}
+
+
 collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type,
                                                GcCause gc_cause,
-                                               bool clear_soft_references) {
+                                               bool clear_soft_references,
+                                               uint32_t requested_gc_num) {
   Thread* self = Thread::Current();
   Runtime* runtime = Runtime::Current();
   // If the heap can't run the GC, silently fail and return that no GC was run.
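
The new GCNumberLt() helper above is what keeps the 32-bit GC sequence numbers meaningful if they ever wrap: it compares the two counters modulo 2^32 and treats a difference larger than 2^31 as "completed is actually ahead of requested". A small self-checking sketch follows, with the helper copied from the hunk above and a main() added only to exercise it:

    #include <cassert>
    #include <cstdint>

    // Same comparison as the new GCNumberLt(): "completed < requested" on a
    // wrapping 32-bit sequence, so the ordering stays correct even past UINT32_MAX.
    static bool GCNumberLt(uint32_t gcs_completed, uint32_t gcs_requested) {
      uint32_t difference = gcs_requested - gcs_completed;  // well-defined modular subtraction
      bool completed_more_than_requested = difference > 0x80000000;
      return difference > 0 && !completed_more_than_requested;
    }

    int main() {
      assert(GCNumberLt(5, 6));            // GC #6 not yet completed
      assert(!GCNumberLt(6, 6));           // already there
      assert(!GCNumberLt(7, 6));           // request is stale
      assert(GCNumberLt(0xFFFFFFFFu, 0));  // still "less than" across the wrap
    }
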
@@ -2584,6 +2599,10 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type,
     MutexLock mu(self, *gc_complete_lock_);
     // Ensure there is only one GC at a time.
     WaitForGcToCompleteLocked(gc_cause, self);
+    if (requested_gc_num != GC_NUM_ANY && !GCNumberLt(GetCurrentGcNum(), requested_gc_num)) {
+      // The appropriate GC was already triggered elsewhere.
+      return collector::kGcTypeNone;
+    }
     compacting_gc = IsMovingGc(collector_type_);
     // GC can be disabled if someone has a used GetPrimitiveArrayCritical.
     if (compacting_gc && disable_moving_gc_count_ != 0) {
@@ -2594,6 +2613,7 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type,
       return collector::kGcTypeNone;
     }
     collector_type_running_ = collector_type_;
+    last_gc_cause_ = gc_cause;
   }
   if (gc_cause == kGcCauseForAlloc && runtime->HasStatsEnabled()) {
     ++runtime->GetStats()->gc_for_alloc_count;
@@ -2662,6 +2682,7 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type,
   SelfDeletingTask* clear = reference_processor_->CollectClearedReferences(self);
   // Grow the heap so that we know when to perform the next GC.
   GrowForUtilization(collector, bytes_allocated_before_gc);
+  old_native_bytes_allocated_.store(GetNativeBytes());
   LogGC(gc_cause, collector);
   FinishGC(self, gc_type);
   // Actually enqueue all cleared references. Do this after the GC has officially finished since
@@ -2671,8 +2692,6 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type,
   // Inform DDMS that a GC completed.
   Dbg::GcDidFinish();
 
-  old_native_bytes_allocated_.store(GetNativeBytes());
-
   // Unload native libraries for class unloading. We do this after calling FinishGC to prevent
   // deadlocks in case the JNI_OnUnload function does allocations.
   {
@@ -2738,6 +2757,9 @@ void Heap::FinishGC(Thread* self, collector::GcType gc_type) {
   // Reset.
   running_collection_is_blocking_ = false;
   thread_running_gc_ = nullptr;
+  if (gc_type != collector::kGcTypeNone) {
+    gcs_completed_.fetch_add(1, std::memory_order_release);
+  }
   // Wake anyone who may have been waiting for the GC to complete.
   gc_complete_cond_->Broadcast(self);
 }
@@ -3000,7 +3022,10 @@ void Heap::PushOnAllocationStackWithInternalGC(Thread* self, ObjPtr<mirror::Obje
     // to heap verification requiring that roots are live (either in the live bitmap or in the
     // allocation stack).
     CHECK(allocation_stack_->AtomicPushBackIgnoreGrowthLimit(obj->Ptr()));
-    CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+    CollectGarbageInternal(collector::kGcTypeSticky,
+                           kGcCauseForAlloc,
+                           false,
+                           GetCurrentGcNum() + 1);
   } while (!allocation_stack_->AtomicPushBack(obj->Ptr()));
 }
 
@@ -3020,7 +3045,10 @@ void Heap::PushOnThreadLocalAllocationStackWithInternalGC(Thread* self,
     // allocation stack).
     CHECK(allocation_stack_->AtomicPushBackIgnoreGrowthLimit(obj->Ptr()));
     // Push into the reserve allocation stack.
-    CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+    CollectGarbageInternal(collector::kGcTypeSticky,
+                           kGcCauseForAlloc,
+                           false,
+                           GetCurrentGcNum() + 1);
   }
   self->SetThreadLocalAllocationStack(start_address, end_address);
   // Retry on the new thread-local allocation stack.
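
Several call sites above now pass GetCurrentGcNum() + 1, i.e. "any GC that completes after the ones I have already observed satisfies me"; CollectGarbageInternal() bails out under gc_complete_lock_ if that number has already been reached, and FinishGC() bumps gcs_completed_ once a collection really ran. Below is a condensed model of that check-then-run-then-publish sequence; the sentinel, the lock, and the counter are plain standard-library stand-ins, not ART's types.

    #include <atomic>
    #include <cstdint>
    #include <limits>
    #include <mutex>

    constexpr uint32_t kGcNumAny = std::numeric_limits<uint32_t>::max();

    std::mutex gc_complete_lock;
    std::atomic<uint32_t> gcs_completed{0};

    static bool GCNumberLt(uint32_t completed, uint32_t requested) {
      uint32_t difference = requested - completed;
      return difference > 0 && difference <= 0x80000000;
    }

    // Returns true if a collection was actually run.
    bool CollectGarbageInternal(uint32_t requested_gc_num) {
      std::unique_lock<std::mutex> lock(gc_complete_lock);
      uint32_t current = gcs_completed.load(std::memory_order_acquire);
      if (requested_gc_num != kGcNumAny && !GCNumberLt(current, requested_gc_num)) {
        return false;  // the GC we needed already happened; skip the redundant run
      }
      // ... the collection itself would run here ...
      gcs_completed.fetch_add(1, std::memory_order_release);  // as in FinishGC()
      return true;
    }

    int main() {
      uint32_t observed = gcs_completed.load(std::memory_order_acquire);
      CollectGarbageInternal(observed + 1);  // runs
      CollectGarbageInternal(observed + 1);  // pruned: GC #observed+1 already completed
    }
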
@@ -3441,7 +3469,6 @@ collector::GcType Heap::WaitForGcToCompleteLocked(GcCause cause, Thread* self) {
       // Don't log fake "GC" types that are only used for debugger or hidden APIs. If we log these,
       // it results in log spam. kGcCauseExplicit is already logged in LogGC, so avoid it here too.
       if (cause == kGcCauseForAlloc ||
-          cause == kGcCauseForNativeAlloc ||
           cause == kGcCauseDisableMovingGc) {
         VLOG(gc) << "Starting a blocking GC " << cause;
       }
@@ -3656,25 +3683,26 @@ void Heap::AddFinalizerReference(Thread* self, ObjPtr<mirror::Object>* object) {
 
 void Heap::RequestConcurrentGCAndSaveObject(Thread* self,
                                             bool force_full,
+                                            uint32_t observed_gc_num,
                                             ObjPtr<mirror::Object>* obj) {
   StackHandleScope<1> hs(self);
   HandleWrapperObjPtr<mirror::Object> wrapper(hs.NewHandleWrapper(obj));
-  RequestConcurrentGC(self, kGcCauseBackground, force_full);
+  RequestConcurrentGC(self, kGcCauseBackground, force_full, observed_gc_num);
 }
 
 class Heap::ConcurrentGCTask : public HeapTask {
  public:
-  ConcurrentGCTask(uint64_t target_time, GcCause cause, bool force_full)
-      : HeapTask(target_time), cause_(cause), force_full_(force_full) {}
+  ConcurrentGCTask(uint64_t target_time, GcCause cause, bool force_full, uint32_t gc_num)
+      : HeapTask(target_time), cause_(cause), force_full_(force_full), my_gc_num_(gc_num) {}
   void Run(Thread* self) override {
     gc::Heap* heap = Runtime::Current()->GetHeap();
-    heap->ConcurrentGC(self, cause_, force_full_);
-    heap->ClearConcurrentGCRequest();
+    heap->ConcurrentGC(self, cause_, force_full_, my_gc_num_);
   }
 
  private:
  const GcCause cause_;
  const bool force_full_;  // If true, force full (or partial) collection.
+  const uint32_t my_gc_num_;  // Sequence number of requested GC.
 };
 
 static bool CanAddHeapTask(Thread* self) REQUIRES(!Locks::runtime_shutdown_lock_) {
@@ -3683,20 +3711,24 @@ static bool CanAddHeapTask(Thread* self) REQUIRES(!Locks::runtime_shutdown_lock_
       !self->IsHandlingStackOverflow();
 }
 
-void Heap::ClearConcurrentGCRequest() {
-  concurrent_gc_pending_.store(false, std::memory_order_relaxed);
-}
-
-void Heap::RequestConcurrentGC(Thread* self, GcCause cause, bool force_full) {
-  if (CanAddHeapTask(self) &&
-      concurrent_gc_pending_.CompareAndSetStrongSequentiallyConsistent(false, true)) {
-    task_processor_->AddTask(self, new ConcurrentGCTask(NanoTime(),  // Start straight away.
-                                                        cause,
-                                                        force_full));
+void Heap::RequestConcurrentGC(Thread* self,
+                               GcCause cause,
+                               bool force_full,
+                               uint32_t observed_gc_num) {
+  uint32_t gcs_requested = gcs_requested_.load(std::memory_order_relaxed);
+  if (!GCNumberLt(observed_gc_num, gcs_requested)) {
+    // Nobody beat us to requesting the next gc after observed_gc_num.
+    if (CanAddHeapTask(self)
+        && gcs_requested_.CompareAndSetStrongRelaxed(gcs_requested, observed_gc_num + 1)) {
+      task_processor_->AddTask(self, new ConcurrentGCTask(NanoTime(),  // Start straight away.
+                                                          cause,
+                                                          force_full,
+                                                          observed_gc_num + 1));
+    }
   }
 }
 
-void Heap::ConcurrentGC(Thread* self, GcCause cause, bool force_full) {
+void Heap::ConcurrentGC(Thread* self, GcCause cause, bool force_full, uint32_t requested_gc_num) {
   if (!Runtime::Current()->IsShuttingDown(self)) {
     // Wait for any GCs currently running to finish.
     if (WaitForGcToComplete(cause, self) == collector::kGcTypeNone) {
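
RequestConcurrentGC() above replaces the old concurrent_gc_pending_ boolean with a compare-and-set on gcs_requested_, so that of several nearly simultaneous requesters only one enqueues a ConcurrentGCTask and clearly redundant requests are dropped. A standalone model of that pruning step, using std::atomic::compare_exchange_strong in place of ART's Atomic wrapper and a printf in place of the task processor:

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    std::atomic<uint32_t> gcs_requested{0};

    static bool GCNumberLt(uint32_t a, uint32_t b) {
      uint32_t difference = b - a;
      return difference > 0 && difference <= 0x80000000;
    }

    void EnqueueConcurrentGCTask(uint32_t gc_num) {
      std::printf("scheduling GC #%u\n", gc_num);
    }

    void RequestConcurrentGC(uint32_t observed_gc_num) {
      uint32_t requested = gcs_requested.load(std::memory_order_relaxed);
      if (!GCNumberLt(observed_gc_num, requested)) {
        // Nobody has requested a GC newer than what we observed; try to claim it.
        if (gcs_requested.compare_exchange_strong(requested, observed_gc_num + 1,
                                                  std::memory_order_relaxed)) {
          EnqueueConcurrentGCTask(observed_gc_num + 1);
        }
        // On CAS failure another thread won the race; its request covers ours.
      }
    }

    int main() {
      RequestConcurrentGC(0);  // schedules GC #1
      RequestConcurrentGC(0);  // pruned: GC #1 is already requested
    }
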
@@ -3707,11 +3739,18 @@ void Heap::ConcurrentGC(Thread* self, GcCause cause, bool force_full) {
       if (force_full && next_gc_type == collector::kGcTypeSticky) {
         next_gc_type = NonStickyGcType();
       }
-      if (CollectGarbageInternal(next_gc_type, cause, false) == collector::kGcTypeNone) {
+      if (CollectGarbageInternal(next_gc_type, cause, false, requested_gc_num)
+          == collector::kGcTypeNone) {
         for (collector::GcType gc_type : gc_plan_) {
           // Attempt to run the collector, if we succeed, we are done.
+          uint32_t gcs_completed = GetCurrentGcNum();
+          if (!GCNumberLt(gcs_completed, requested_gc_num)) {
+            // Somebody did it for us.
+            break;
+          }
           if (gc_type > next_gc_type &&
-              CollectGarbageInternal(gc_type, cause, false) != collector::kGcTypeNone) {
+              CollectGarbageInternal(gc_type, cause, false, requested_gc_num)
+                  != collector::kGcTypeNone) {
             break;
           }
         }
@@ -3752,7 +3791,7 @@ void Heap::RequestCollectorTransition(CollectorType desired_collector_type, uint
   const uint64_t target_time = NanoTime() + delta_time;
   {
     MutexLock mu(self, *pending_task_lock_);
-    // If we have an existing collector transition, update the targe time to be the new target.
+    // If we have an existing collector transition, update the target time to be the new target.
     if (pending_collector_transition_ != nullptr) {
       task_processor_->UpdateTargetRunTime(self, pending_collector_transition_, target_time);
       return;
@@ -3856,10 +3895,6 @@ void Heap::RevokeAllThreadLocalBuffers() {
   }
 }
 
-bool Heap::IsGCRequestPending() const {
-  return concurrent_gc_pending_.load(std::memory_order_relaxed);
-}
-
 void Heap::RunFinalization(JNIEnv* env, uint64_t timeout) {
   env->CallStaticVoidMethod(WellKnownClasses::dalvik_system_VMRuntime,
                             WellKnownClasses::dalvik_system_VMRuntime_runFinalization,
@@ -3913,21 +3948,35 @@ inline float Heap::NativeMemoryOverTarget(size_t current_native_bytes, bool is_g
 
 inline void Heap::CheckGCForNative(Thread* self) {
   bool is_gc_concurrent = IsGcConcurrent();
+  uint32_t starting_gc_num = GetCurrentGcNum();
   size_t current_native_bytes = GetNativeBytes();
   float gc_urgency = NativeMemoryOverTarget(current_native_bytes, is_gc_concurrent);
   if (UNLIKELY(gc_urgency >= 1.0)) {
     if (is_gc_concurrent) {
-      RequestConcurrentGC(self, kGcCauseForNativeAlloc, /*force_full=*/true);
+      RequestConcurrentGC(self, kGcCauseForNativeAlloc, /*force_full=*/true, starting_gc_num);
       if (gc_urgency > kStopForNativeFactor
           && current_native_bytes > stop_for_native_allocs_) {
         // We're in danger of running out of memory due to rampant native allocation.
         if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
           LOG(INFO) << "Stopping for native allocation, urgency: " << gc_urgency;
         }
-        WaitForGcToComplete(kGcCauseForNativeAlloc, self);
+        // Count how many times we do this, so we can warn if this becomes excessive.
+        // Stop after a while out of excessive caution.
+        static constexpr int kGcWaitIters = 50;
+        for (int i = 1; i <= kGcWaitIters; ++i) {
+          if (GCNumberLt(starting_gc_num, GetCurrentGcNum())
+              || WaitForGcToComplete(kGcCauseForNativeAlloc, self) != collector::kGcTypeNone) {
+            break;
+          }
+          if (i % 10 == 0) {
+            LOG(WARNING) << "Slept " << i << " times in native allocation, waiting for GC";
+          }
+          static constexpr int kGcWaitSleepMicros = 2000;
+          usleep(kGcWaitSleepMicros);  // Encourage our requested GC to start.
+        }
       }
     } else {
-      CollectGarbageInternal(NonStickyGcType(), kGcCauseForNativeAlloc, false);
+      CollectGarbageInternal(NonStickyGcType(), kGcCauseForNativeAlloc, false, starting_gc_num + 1);
     }
   }
 }
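
The native-allocation stall in CheckGCForNative() above is now a bounded polling loop rather than a single WaitForGcToComplete(): it sleeps in 2 ms steps, stops as soon as a GC newer than the one observed at entry has completed (or the wait reports one), warns every ten iterations, and gives up after fifty. The sketch below models only the sequence-number half of that loop (the WaitForGcToComplete() leg is omitted); StallForPendingGC and the atomic counter are illustrative stand-ins.

    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <cstdio>
    #include <thread>

    std::atomic<uint32_t> gcs_completed{0};

    static bool GCNumberLt(uint32_t a, uint32_t b) {
      uint32_t difference = b - a;
      return difference > 0 && difference <= 0x80000000;
    }

    // Bounded stall: poll until a GC newer than `starting_gc_num` has finished,
    // warn periodically, and bail out after a fixed number of iterations.
    void StallForPendingGC(uint32_t starting_gc_num) {
      constexpr int kGcWaitIters = 50;
      constexpr auto kGcWaitSleep = std::chrono::milliseconds(2);
      for (int i = 1; i <= kGcWaitIters; ++i) {
        if (GCNumberLt(starting_gc_num, gcs_completed.load(std::memory_order_acquire))) {
          break;  // the GC we were waiting for (or a later one) has completed
        }
        if (i % 10 == 0) {
          std::fprintf(stderr, "Slept %d times waiting for GC\n", i);
        }
        std::this_thread::sleep_for(kGcWaitSleep);  // give the requested GC time to run
      }
    }

    int main() {
      uint32_t observed = gcs_completed.load(std::memory_order_acquire);
      std::thread gc([] { gcs_completed.fetch_add(1, std::memory_order_release); });
      StallForPendingGC(observed);
      gc.join();
    }
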
@@ -4372,7 +4421,7 @@ class Heap::TriggerPostForkCCGcTask : public HeapTask {
     // Trigger a GC, if not already done. The first GC after fork, whenever it
     // takes place, will adjust the thresholds to normal levels.
     if (heap->target_footprint_.load(std::memory_order_relaxed) == heap->growth_limit_) {
-      heap->RequestConcurrentGC(self, kGcCauseBackground, false);
+      heap->RequestConcurrentGC(self, kGcCauseBackground, false, heap->GetCurrentGcNum());
     }
   }
 };
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 35dce5ef81..450f38732f 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -376,13 +376,14 @@ class Heap {
       REQUIRES(Locks::heap_bitmap_lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Initiates an explicit garbage collection.
+  // Initiates an explicit garbage collection. Guarantees that a GC started after this call has
+  // completed.
   void CollectGarbage(bool clear_soft_references, GcCause cause = kGcCauseExplicit)
       REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !process_state_update_lock_);
 
-  // Does a concurrent GC, should only be called by the GC daemon thread
-  // through runtime.
-  void ConcurrentGC(Thread* self, GcCause cause, bool force_full)
+  // Does a concurrent GC, provided the GC numbered requested_gc_num has not already been
+  // completed. Should only be called by the GC daemon thread through runtime.
+  void ConcurrentGC(Thread* self, GcCause cause, bool force_full, uint32_t requested_gc_num)
       REQUIRES(!Locks::runtime_shutdown_lock_, !*gc_complete_lock_, !*pending_task_lock_,
                !process_state_update_lock_);
 
@@ -452,7 +453,8 @@ class Heap {
   void SetIdealFootprint(size_t max_allowed_footprint);
 
   // Blocks the caller until the garbage collector becomes idle and returns the type of GC we
-  // waited for.
+  // waited for. Only waits for running collections, ignoring a requested but unstarted GC. Only
+  // heuristic, since a new GC may have started by the time we return.
   collector::GcType WaitForGcToComplete(GcCause cause, Thread* self) REQUIRES(!*gc_complete_lock_);
 
   // Update the heap's process state to a new value, may cause compaction to occur.
@@ -815,8 +817,17 @@ class Heap {
   // Request an asynchronous trim.
   void RequestTrim(Thread* self) REQUIRES(!*pending_task_lock_);
 
-  // Request asynchronous GC.
-  void RequestConcurrentGC(Thread* self, GcCause cause, bool force_full)
+  // Retrieve the current GC number, i.e. the number n such that we completed n GCs so far.
+  // Provides acquire ordering, so that if we read this first, and then check whether a GC is
+  // required, we know that the GC number read actually preceded the test.
+  uint32_t GetCurrentGcNum() {
+    return gcs_completed_.load(std::memory_order_acquire);
+  }
+
+  // Request asynchronous GC. Observed_gc_num is the value of GetCurrentGcNum() when we started to
+  // evaluate the GC triggering condition. If a GC has been completed since then, we consider our
+  // job done.
+  void RequestConcurrentGC(Thread* self, GcCause cause, bool force_full, uint32_t observed_gc_num)
       REQUIRES(!*pending_task_lock_);
 
   // Whether or not we may use a garbage collector, used so that we only create collectors we need.
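
The heap.h comment above ties GetCurrentGcNum() to acquire ordering, pairing it with the release fetch_add in FinishGC(): a thread that observes GC n as completed also observes everything that collection published before bumping the counter. A minimal two-thread demonstration of that release/acquire pairing, with stand-in names rather than ART's classes:

    #include <atomic>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <thread>

    // FinishGC() publishes the new count with a release increment; GetCurrentGcNum()
    // reads it with acquire, so a requester that sees GC #1 completed also sees the
    // data that collection wrote beforehand.
    std::atomic<uint32_t> gcs_completed{0};
    size_t freed_bytes = 0;  // ordinary data published by the "GC"

    void GcThread() {
      freed_bytes = 4096;                                     // work done by the collection
      gcs_completed.fetch_add(1, std::memory_order_release);  // as in FinishGC()
    }

    void RequesterThread() {
      while (gcs_completed.load(std::memory_order_acquire) == 0) {
        // spin until we observe GC #1 as completed
      }
      assert(freed_bytes == 4096);  // guaranteed visible by the acquire/release pair
    }

    int main() {
      std::thread gc(GcThread);
      std::thread requester(RequesterThread);
      gc.join();
      requester.join();
    }
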
@@ -1002,11 +1013,6 @@ class Heap {
   // Checks whether we should garbage collect:
   ALWAYS_INLINE bool ShouldConcurrentGCForJava(size_t new_num_bytes_allocated);
   float NativeMemoryOverTarget(size_t current_native_bytes, bool is_gc_concurrent);
-  ALWAYS_INLINE void CheckConcurrentGCForJava(Thread* self,
-                                              size_t new_num_bytes_allocated,
-                                              ObjPtr<mirror::Object>* obj)
-      REQUIRES_SHARED(Locks::mutator_lock_)
-      REQUIRES(!*pending_task_lock_, !*gc_complete_lock_);
   void CheckGCForNative(Thread* self)
       REQUIRES(!*pending_task_lock_, !*gc_complete_lock_, !process_state_update_lock_);
 
@@ -1092,16 +1098,25 @@ class Heap {
   void RequestCollectorTransition(CollectorType desired_collector_type, uint64_t delta_time)
       REQUIRES(!*pending_task_lock_);
 
-  void RequestConcurrentGCAndSaveObject(Thread* self, bool force_full, ObjPtr<mirror::Object>* obj)
+  void RequestConcurrentGCAndSaveObject(Thread* self,
+                                        bool force_full,
+                                        uint32_t observed_gc_num,
+                                        ObjPtr<mirror::Object>* obj)
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!*pending_task_lock_);
-  bool IsGCRequestPending() const;
+
+  static constexpr uint32_t GC_NUM_ANY = std::numeric_limits<uint32_t>::max();
 
   // Sometimes CollectGarbageInternal decides to run a different Gc than you requested. Returns
-  // which type of Gc was actually ran.
+  // which type of Gc was actually run.
+  // We pass in the intended GC sequence number to ensure that multiple approximately concurrent
+  // requests result in a single GC; clearly redundant request will be pruned. A requested_gc_num
+  // of GC_NUM_ANY indicates that we should not prune redundant requests. (In the unlikely case
+  // that gcs_completed_ gets this big, we just accept a potential extra GC or two.)
   collector::GcType CollectGarbageInternal(collector::GcType gc_plan,
                                            GcCause gc_cause,
-                                           bool clear_soft_references)
+                                           bool clear_soft_references,
+                                           uint32_t requested_gc_num)
       REQUIRES(!*gc_complete_lock_, !Locks::heap_bitmap_lock_, !Locks::thread_suspend_count_lock_,
                !*pending_task_lock_, !process_state_update_lock_);
 
@@ -1168,7 +1183,6 @@ class Heap {
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !process_state_update_lock_);
 
-  void ClearConcurrentGCRequest();
   void ClearPendingTrim(Thread* self) REQUIRES(!*pending_task_lock_);
   void ClearPendingCollectorTransition(Thread* self) REQUIRES(!*pending_task_lock_);
 
@@ -1550,8 +1564,14 @@ class Heap {
   // Count for performed homogeneous space compaction.
   Atomic<size_t> count_performed_homogeneous_space_compaction_;
 
-  // Whether or not a concurrent GC is pending.
-  Atomic<bool> concurrent_gc_pending_;
+  // The number of garbage collections (either young or full, not trims or the like) we have
+  // completed since heap creation. We guard against wrapping, though that's unlikely.
+  // Increment is guarded by gc_complete_lock_.
+  Atomic<uint32_t> gcs_completed_;
+
+  // The number of garbage collections we've scheduled. Normally either gcs_complete_ or
+  // gcs_complete + 1.
+  Atomic<uint32_t> gcs_requested_;
 
   // Active tasks which we can modify (change target time, desired collector type, etc..).
   CollectorTransitionTask* pending_collector_transition_ GUARDED_BY(pending_task_lock_);
diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc
index eb1814d687..a483ae714d 100644
--- a/runtime/native/dalvik_system_VMRuntime.cc
+++ b/runtime/native/dalvik_system_VMRuntime.cc
@@ -336,18 +336,16 @@ static void VMRuntime_trimHeap(JNIEnv* env, jobject) {
   Runtime::Current()->GetHeap()->Trim(ThreadForEnv(env));
 }
 
-static void VMRuntime_concurrentGC(JNIEnv* env, jobject) {
-  Runtime::Current()->GetHeap()->ConcurrentGC(ThreadForEnv(env), gc::kGcCauseBackground, true);
-}
-
 static void VMRuntime_requestHeapTrim(JNIEnv* env, jobject) {
   Runtime::Current()->GetHeap()->RequestTrim(ThreadForEnv(env));
 }
 
 static void VMRuntime_requestConcurrentGC(JNIEnv* env, jobject) {
-  Runtime::Current()->GetHeap()->RequestConcurrentGC(ThreadForEnv(env),
-                                                     gc::kGcCauseBackground,
-                                                     true);
+  gc::Heap *heap = Runtime::Current()->GetHeap();
+  heap->RequestConcurrentGC(ThreadForEnv(env),
+                            gc::kGcCauseBackground,
+                            true,
+                            heap->GetCurrentGcNum());
 }
 
 static void VMRuntime_startHeapTaskProcessor(JNIEnv* env, jobject) {
@@ -497,7 +495,6 @@ static JNINativeMethod gMethods[] = {
   NATIVE_METHOD(VMRuntime, clampGrowthLimit, "()V"),
   NATIVE_METHOD(VMRuntime, classPath, "()Ljava/lang/String;"),
   NATIVE_METHOD(VMRuntime, clearGrowthLimit, "()V"),
-  NATIVE_METHOD(VMRuntime, concurrentGC, "()V"),
   NATIVE_METHOD(VMRuntime, setHiddenApiExemptions, "([Ljava/lang/String;)V"),
   NATIVE_METHOD(VMRuntime, setHiddenApiAccessLogSamplingRate, "(I)V"),
   NATIVE_METHOD(VMRuntime, getTargetHeapUtilization, "()F"),
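
Putting the VMRuntime-side change together with the heap-side machinery: dalvik.system.VMRuntime.requestConcurrentGC() now merely records a request tagged with the GC number the caller observed, duplicate requests collapse, and the scheduled task skips its collection if the needed GC already ran. The single-file sketch below strings the earlier pieces together; it is a simplified model under the assumption of a single requester thread, not ART's actual control flow.

    #include <atomic>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    constexpr uint32_t kGcNumAny = std::numeric_limits<uint32_t>::max();
    std::atomic<uint32_t> gcs_completed{0};
    std::atomic<uint32_t> gcs_requested{0};

    static bool GCNumberLt(uint32_t a, uint32_t b) {
      uint32_t d = b - a;
      return d > 0 && d <= 0x80000000;
    }

    void RunGcTask(uint32_t requested_gc_num) {
      if (requested_gc_num != kGcNumAny &&
          !GCNumberLt(gcs_completed.load(std::memory_order_acquire), requested_gc_num)) {
        std::puts("GC skipped: already satisfied");
        return;
      }
      std::puts("GC runs");
      gcs_completed.fetch_add(1, std::memory_order_release);
    }

    // Returns true if this call actually scheduled a GC task.
    bool RequestConcurrentGC(uint32_t observed_gc_num) {
      uint32_t requested = gcs_requested.load(std::memory_order_relaxed);
      if (GCNumberLt(observed_gc_num, requested)) return false;  // someone beat us to it
      return gcs_requested.compare_exchange_strong(requested, observed_gc_num + 1,
                                                   std::memory_order_relaxed);
    }

    int main() {
      uint32_t observed = gcs_completed.load(std::memory_order_acquire);
      bool first = RequestConcurrentGC(observed);   // schedules GC #1
      bool second = RequestConcurrentGC(observed);  // pruned
      std::printf("first=%d second=%d\n", first, second);
      if (first) RunGcTask(observed + 1);           // the single scheduled task runs
    }
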