Enable reading page map without lock in RosAlloc::BulkFree

Enabling this flag greatly reduces how much time is spent in the GC.
It was not done previously since it regressed MemAllocTest. With
these RosAlloc changes, the benchmark score no longer regresses once
we enable the flag.

Changed Run::AllocSlot to have only one mode of allocation: finding
the first free bit in the bitmap. This was previously the slow path
but is now the fast path. One optimization that enabled this is
always keeping the alloc bitmap bits which correspond to invalid
slots set to 1. This removes the need for a bounds check since we can
never end up allocating there.
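
As a rough illustration of the first-free-bit technique (a minimal
sketch, not the actual RosAlloc code; the struct layout, word size,
and names are assumptions):

    #include <cstdint>
    #include <cstddef>

    // Hypothetical bitmap-based slot allocator. A set bit means the slot
    // is allocated (or invalid); a clear bit means it is free.
    struct Run {
      static constexpr size_t kBitsPerWord = 32;
      uint32_t alloc_bit_map_[4];  // Bits past num_slots_ are preset to 1.
      size_t num_slots_;

      // Returns the first free slot index, or -1 if the run is full.
      int AllocSlot() {
        for (size_t i = 0; i < sizeof(alloc_bit_map_) / sizeof(uint32_t); ++i) {
          uint32_t free_bits = ~alloc_bit_map_[i];  // 1 bits are free slots.
          if (free_bits != 0) {
            // Lowest free bit; no bounds check is needed because the bits
            // for invalid slots are always 1, i.e. never free.
            int bit = __builtin_ctz(free_bits);
            alloc_bit_map_[i] |= 1u << bit;  // Mark the slot allocated.
            return static_cast<int>(i * kBitsPerWord + bit);
          }
        }
        return -1;  // Full; the caller takes the slow path.
      }
    };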

Changed revoking a thread-local buffer to point the thread at an
invalid run instead. The invalid run is simply a run which always has
all of its allocation bits set to 1. When a thread attempts a
thread-local allocation from it, the allocation always fails and
takes the slow path. This eliminates the need for a null check for
revoked runs.
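
The invalid run is effectively a sentinel object. A minimal sketch of
the idea, reusing the hypothetical Run above (identifiers are
illustrative, not the real RosAlloc ones):

    // A single shared run whose allocation bitmap is permanently full.
    // Revoked threads point at it instead of at nullptr, so the fast path
    // needs no null check: AllocSlot() on this run always returns -1 and
    // the allocation falls through to the slow path.
    Run* GetInvalidRun() {
      static Run invalid_run = [] {
        Run run = {};
        for (uint32_t& word : run.alloc_bit_map_) {
          word = ~0u;  // Every slot looks allocated.
        }
        return run;
      }();
      return &invalid_run;
    }

    void RevokeThreadLocalRun(Run** thread_local_run) {
      *thread_local_run = GetInvalidRun();  // Never nullptr after a revoke.
    }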

Changed zeroing of memory to happen during free, so AllocPages should
always return zeroed memory. Added prefetching of a run's memory when
we allocate the run.
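
A sketch of what prefetching a freshly allocated run might look like
(the cache-line size, stride, and helper name are assumptions):

    // Touch the run's cache lines ahead of use so the first allocations
    // from it don't stall on cold memory. The pages are assumed to already
    // be zeroed, since zeroing was moved to free time.
    void PrefetchRun(void* run_begin, size_t run_size) {
      constexpr size_t kCacheLineSize = 64;  // Assumed line size.
      const char* cur = static_cast<const char*>(run_begin);
      const char* end = cur + run_size;
      for (; cur < end; cur += kCacheLineSize) {
        __builtin_prefetch(cur, /*rw=*/1, /*locality=*/3);  // Prefetch for write.
      }
    }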

Some refactoring to reduce duplicated code.

Ergonomics changes: Changed kStickyGcThroughputAdjustment to 1.0,
which helps reduce GC time.
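
For context, a simplified sketch of how such an adjustment factor can
enter the GC-type decision (not the exact heap.cc logic): at 1.0,
sticky GC is chosen only when its measured throughput actually
matches the non-sticky collector's, rather than getting a 1.25x bonus.

    // Pick the next GC type by comparing estimated throughputs
    // (bytes freed per second); the factor biases the choice toward
    // sticky GC, and at 1.0 the comparison is unweighted.
    enum class GcType { kSticky, kPartial };

    GcType NextGcType(double sticky_throughput, double non_sticky_throughput,
                      double sticky_adjustment) {  // 1.0 after this change.
      if (sticky_adjustment * sticky_throughput >= non_sticky_throughput) {
        return GcType::kSticky;   // Sticky GC still pays off.
      }
      return GcType::kPartial;    // Do a larger collection instead.
    }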

Measurements (3 samples per benchmark):
Before: MemAllocTest scores: 3463, 3445, 3431
EvaluateAndApplyChanges score | total GC time:
Iter 1: 3485, 23.602436s
Iter 2: 3434, 22.499882s
Iter 3: 3483, 23.253274s

After: MemAllocTest scores: 3495, 3417, 3409
EvaluateAndApplyChanges score | total GC time:
Iter 1: 3375, 17.463462s
Iter 2: 3358, 16.185188s
Iter 3: 3367, 15.822312s

Bug: 8788501
Bug: 11790317
Bug: 9986565
Change-Id: Ifd273a054824028dabed27c07c081dde1816f93c
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 517c748..5d72bc1 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -78,9 +78,9 @@
 static constexpr size_t kMinConcurrentRemainingBytes = 128 * KB;
 static constexpr size_t kMaxConcurrentRemainingBytes = 512 * KB;
 // Sticky GC throughput adjustment, divided by 4. Increasing this causes sticky GC to occur more
-// relative to partial/full GC. This is desirable since sticky GCs interfere less with mutator
+// relative to partial/full GC. This may be desirable since sticky GCs interfere less with mutator
 // threads (lower pauses, use less memory bandwidth).
-static constexpr double kStickyGcThroughputAdjustment = 1.25;
+static constexpr double kStickyGcThroughputAdjustment = 1.0;
 // Whether or not we use the free list large object space.
 static constexpr bool kUseFreeListSpaceForLOS = false;
 // Whether or not we compact the zygote in PreZygoteFork.
@@ -595,6 +595,11 @@
       if (continuous_space->IsDlMallocSpace()) {
         dlmalloc_space_ = continuous_space->AsDlMallocSpace();
       } else if (continuous_space->IsRosAllocSpace()) {
+        // If we already have a rosalloc_space_, revoke its thread-local buffers first so that we
+        // don't end up with non-full runs from the previous space during the revoke afterwards.
+        if (rosalloc_space_ != nullptr) {
+          rosalloc_space_->RevokeAllThreadLocalBuffers();
+        }
         rosalloc_space_ = continuous_space->AsRosAllocSpace();
       }
     }
@@ -615,7 +620,7 @@
   }
 }
 
-void Heap::RemoveSpace(space::Space* space) {
+void Heap::RemoveSpace(space::Space* space, bool unset_as_default) {
   DCHECK(space != nullptr);
   WriterMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
   if (space->IsContinuousSpace()) {
@@ -632,17 +637,19 @@
     auto it = std::find(continuous_spaces_.begin(), continuous_spaces_.end(), continuous_space);
     DCHECK(it != continuous_spaces_.end());
     continuous_spaces_.erase(it);
-    if (continuous_space == dlmalloc_space_) {
-      dlmalloc_space_ = nullptr;
-    } else if (continuous_space == rosalloc_space_) {
-      rosalloc_space_ = nullptr;
-    }
-    if (continuous_space == main_space_) {
-      main_space_ = nullptr;
-    } else if (continuous_space == bump_pointer_space_) {
-      bump_pointer_space_ = nullptr;
-    } else if (continuous_space == temp_space_) {
-      temp_space_ = nullptr;
+    if (unset_as_default) {
+      if (continuous_space == dlmalloc_space_) {
+        dlmalloc_space_ = nullptr;
+      } else if (continuous_space == rosalloc_space_) {
+        rosalloc_space_ = nullptr;
+      }
+      if (continuous_space == main_space_) {
+        main_space_ = nullptr;
+      } else if (continuous_space == bump_pointer_space_) {
+        bump_pointer_space_ = nullptr;
+      } else if (continuous_space == temp_space_) {
+        temp_space_ = nullptr;
+      }
     }
   } else {
     DCHECK(space->IsDiscontinuousSpace());
@@ -725,6 +732,7 @@
   os << "Total mutator paused time: " << PrettyDuration(total_paused_time) << "\n";
   os << "Total time waiting for GC to complete: " << PrettyDuration(total_wait_time_) << "\n";
   os << "Approximate GC data structures memory overhead: " << gc_memory_overhead_;
+  BaseMutex::DumpAll(os);
 }
 
 Heap::~Heap() {
@@ -1457,6 +1465,10 @@
         // pointer space last transition it will be protected.
         bump_pointer_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
         Compact(bump_pointer_space_, main_space_);
+        // Remove the main space so that we don't try to trim it; trimming doesn't work for debug
+        // builds since RosAlloc attempts to read the magic number from a protected page.
+        // TODO: Clean this up by getting rid of the remove_as_default parameter.
+        RemoveSpace(main_space_, false);
       }
       break;
     }
@@ -1465,6 +1477,7 @@
     case kCollectorTypeCMS: {
       if (IsMovingGc(collector_type_)) {
         // Compact to the main space from the bump pointer space, don't need to swap semispaces.
+        AddSpace(main_space_, false);
         main_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE);
         Compact(main_space_, bump_pointer_space_);
       }