Card pre-cleaning.

We now pre-clean cards before the pause in the concurrent mark sweep
collectors. This provides a substantial pause time reduction for GC
iterations which have a lot of dirty cards. The only downside is a
slight GC time increase for large heaps.

Benchmark FormulaEvaluationActions.EvaluateAndApplyChanges:

Before:
Partial average pause: 5.47ms
Sticky average pause: 2.91ms
Total GC time: 25.8s

After:
Partial average pause: 1.98ms
Sticky average pause: 1.66ms
Total GC time: 27.0s

Benchmark score difference in the noise.

Change-Id: If9f01f8c1501f122e19432438108d48e723b332e
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index fb797e0..9ab2d2e 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -65,6 +65,7 @@
 constexpr bool kUseRecursiveMark = false;
 constexpr bool kUseMarkStackPrefetch = true;
 constexpr size_t kSweepArrayChunkFreeSize = 1024;
+constexpr bool kPreCleanCards = true;
 
 // Parallelism options.
 constexpr bool kParallelCardScan = true;
@@ -232,6 +233,25 @@
   return is_concurrent_;
 }
 
+void MarkSweep::PreCleanCards() {
+  // Don't do this for non-concurrent GCs since they don't have any dirty cards.
+  if (kPreCleanCards && IsConcurrent()) {
+    Thread* self = Thread::Current();
+    CHECK(!Locks::mutator_lock_->IsExclusiveHeld(self));
+    // Process dirty cards and add dirty cards to mod union tables, also ages cards.
+    heap_->ProcessCards(timings_);
+    // Required so that we see aged cards before we start scanning the cards.
+    MarkThreadRoots(self);
+    // TODO: Only mark the dirty roots.
+    MarkNonThreadRoots();
+    MarkConcurrentRoots();
+    // Process the newly aged cards.
+    RecursiveMarkDirtyObjects(false, accounting::CardTable::kCardDirty - 1);
+    // TODO: Empty allocation stack to reduce the number of objects we need to test / mark as live
+    // in the next GC.
+  }
+}
+
 void MarkSweep::MarkingPhase() {
   TimingLogger::ScopedSplit split("MarkingPhase", &timings_);
   Thread* self = Thread::Current();
@@ -263,6 +283,8 @@
   MarkConcurrentRoots();
   UpdateAndMarkModUnion();
   MarkReachableObjects();
+  // Pre-clean dirtied cards to reduce pauses.
+  PreCleanCards();
 }
 
 void MarkSweep::UpdateAndMarkModUnion() {
@@ -313,20 +335,24 @@
     TimingLogger::ScopedSplit split("UnMarkAllocStack", &timings_);
     WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
     accounting::ObjectStack* allocation_stack = GetHeap()->allocation_stack_.get();
-    // The allocation stack contains things allocated since the start of the GC. These may have been
-    // marked during this GC meaning they won't be eligible for reclaiming in the next sticky GC.
-    // Remove these objects from the mark bitmaps so that they will be eligible for sticky
-    // collection.
-    // There is a race here which is safely handled. Another thread such as the hprof could
-    // have flushed the alloc stack after we resumed the threads. This is safe however, since
-    // reseting the allocation stack zeros it out with madvise. This means that we will either
-    // read NULLs or attempt to unmark a newly allocated object which will not be marked in the
-    // first place.
-    mirror::Object** end = allocation_stack->End();
-    for (mirror::Object** it = allocation_stack->Begin(); it != end; ++it) {
-      const Object* obj = *it;
-      if (obj != NULL) {
-        UnMarkObjectNonNull(obj);
+    if (!kPreCleanCards) {
+      // The allocation stack contains things allocated since the start of the GC. These may have
+      // been marked during this GC meaning they won't be eligible for reclaiming in the next
+      // sticky GC. Unmark these objects so that they are eligible for reclaiming in the next
+      // sticky GC.
+      // There is a race here which is safely handled. Another thread such as the hprof could
+      // have flushed the alloc stack after we resumed the threads. This is safe however, since
+    // resetting the allocation stack zeros it out with madvise. This means that we will either
+      // read NULLs or attempt to unmark a newly allocated object which will not be marked in the
+      // first place.
+      // We can't do this if we pre-clean cards since we will unmark objects which are no longer on
+      // a dirty card since we aged cards during the pre-cleaning process.
+      mirror::Object** end = allocation_stack->End();
+      for (mirror::Object** it = allocation_stack->Begin(); it != end; ++it) {
+        const Object* obj = *it;
+        if (obj != nullptr) {
+          UnMarkObjectNonNull(obj);
+        }
       }
     }
   }
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 963b9ea..8f8a0f0 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -135,6 +135,11 @@
   virtual void UpdateAndMarkModUnion()
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  // Pre-clean cards to reduce how much work is needed in the pause.
+  void PreCleanCards()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // Sweeps unmarked objects to complete the garbage collection.
   virtual void Sweep(bool swap_bitmaps) EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
diff --git a/runtime/gc/collector/sticky_mark_sweep.cc b/runtime/gc/collector/sticky_mark_sweep.cc
index 9e3adb4..450445e 100644
--- a/runtime/gc/collector/sticky_mark_sweep.cc
+++ b/runtime/gc/collector/sticky_mark_sweep.cc
@@ -53,6 +53,8 @@
   // TODO: Not put these objects in the mark stack in the first place.
   mark_stack_->Reset();
   RecursiveMarkDirtyObjects(false, accounting::CardTable::kCardDirty - 1);
+  // Pre-clean dirtied cards to reduce pauses.
+  PreCleanCards();
 }
 
 void StickyMarkSweep::Sweep(bool swap_bitmaps) {