Add thread-unsafe allocation methods to spaces.

Used by the SS/GSS collectors, since these run with mutators suspended and
only allocate from a single thread. Added AllocThreadUnsafe to
BumpPointerSpace and RosAllocSpace. The RosAlloc version uses the shared
current runs as thread-local runs for thread-unsafe allocation. Also added
code to revoke current runs whose bracket indices overlap those of the
thread-local runs.
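
A minimal sketch of the intended call pattern during a GC pause (the
collector-side code is illustrative only; AllocFromRunThreadUnsafe and the
revoke path are the pieces added by this change):

  // Mutators are suspended; the GC thread is the only allocator.
  Locks::mutator_lock_->AssertExclusiveHeld(self);
  size_t bytes_allocated;
  void* addr = rosalloc->AllocFromRunThreadUnsafe(self, size, &bytes_allocated);
  // When mutators are resumed, RevokeAllThreadLocalRuns() also revokes the
  // current runs that the thread-unsafe path used.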

Changed:
The number of thread-local runs kept per thread now equals the number of
thread-local size brackets in RosAlloc (kNumThreadLocalSizeBrackets)
instead of the total number of size brackets.

Total GC time / time on EvaluateAndApplyChanges.
TLAB SS:
Before: 36.7s / 7254
After: 16.1s / 4837

TLAB GSS:
Before: 6.9s / 3973
After: 5.7s / 3778

Bug: 8981901

Change-Id: Id1d264ade3799f431bf7ebbdcca6146aefbeb632
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index ff59016..f113030 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -67,11 +67,11 @@
              << std::hex << (intptr_t)(base_ + capacity_)
              << ", capacity=" << std::dec << capacity_
              << ", max_capacity=" << std::dec << max_capacity_;
-  memset(current_runs_, 0, sizeof(current_runs_));
   for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
     size_bracket_lock_names[i] =
         StringPrintf("an rosalloc size bracket %d lock", static_cast<int>(i));
     size_bracket_locks_[i] = new Mutex(size_bracket_lock_names[i].c_str(), kRosAllocBracketLock);
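+    // Use the dedicated full run (which always fails allocation) as the initial
+    // current run instead of nullptr, so the allocation paths need no null checks.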
+    current_runs_[i] = dedicated_full_run_;
   }
   DCHECK_EQ(footprint_, capacity_);
   size_t num_of_pages = footprint_ / kPageSize;
@@ -548,7 +548,7 @@
     DCHECK(!new_run->IsThreadLocal());
     DCHECK_EQ(new_run->first_search_vec_idx_, 0U);
     DCHECK(!new_run->to_be_bulk_freed_);
-    if (kUsePrefetchDuringAllocRun && idx <= kMaxThreadLocalSizeBracketIdx) {
+    if (kUsePrefetchDuringAllocRun && idx < kNumThreadLocalSizeBrackets) {
       // Take ownership of the cache lines if we are likely to be thread local run.
       if (kPrefetchNewRunDataByZeroing) {
         // Zeroing the data is sometimes faster than prefetching but it increases memory usage
@@ -584,6 +584,60 @@
   return AllocRun(self, idx);
 }
 
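+// Allocates a slot from current_runs_[idx]. The caller must hold
+// size_bracket_locks_[idx] or otherwise have exclusive access (e.g. hold the
+// exclusive mutator lock, as the thread-unsafe path does).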
+void* RosAlloc::AllocFromCurrentRunUnlocked(Thread* self, size_t idx) {
+  Run* current_run = current_runs_[idx];
+  DCHECK(current_run != nullptr);
+  void* slot_addr = current_run->AllocSlot();
+  if (UNLIKELY(slot_addr == nullptr)) {
+    // The current run got full. Try to refill it.
+    DCHECK(current_run->IsFull());
+    if (kIsDebugBuild && current_run != dedicated_full_run_) {
+      full_runs_[idx].insert(current_run);
+      if (kTraceRosAlloc) {
+        LOG(INFO) << __FUNCTION__ << " : Inserted run 0x" << std::hex << reinterpret_cast<intptr_t>(current_run)
+                  << " into full_runs_[" << std::dec << idx << "]";
+      }
+      DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
+      DCHECK(full_runs_[idx].find(current_run) != full_runs_[idx].end());
+    }
+    current_run = RefillRun(self, idx);
+    if (UNLIKELY(current_run == nullptr)) {
+      // Failed to allocate a new run; reset the current run to the dedicated full run.
+      current_runs_[idx] = dedicated_full_run_;
+      return nullptr;
+    }
+    DCHECK(current_run != nullptr);
+    DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
+    DCHECK(full_runs_[idx].find(current_run) == full_runs_[idx].end());
+    current_run->SetIsThreadLocal(false);
+    current_runs_[idx] = current_run;
+    DCHECK(!current_run->IsFull());
+    slot_addr = current_run->AllocSlot();
+    // Must succeed now with a new run.
+    DCHECK(slot_addr != nullptr);
+  }
+  return slot_addr;
+}
+
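+// Thread-unsafe variant of AllocFromRun(): asserts the exclusive mutator lock
+// instead of taking the bracket locks, so it is only safe while all mutators
+// are suspended (as in the SS/GSS collectors).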
+void* RosAlloc::AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated) {
+  DCHECK_LE(size, kLargeSizeThreshold);
+  size_t bracket_size;
+  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
+  DCHECK_EQ(idx, SizeToIndex(size));
+  DCHECK_EQ(bracket_size, IndexToBracketSize(idx));
+  DCHECK_EQ(bracket_size, bracketSizes[idx]);
+  DCHECK_LE(size, bracket_size);
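+  // For sizes up to 512 bytes, rounding up to the bracket size should waste fewer than 16 bytes.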
+  DCHECK(size > 512 || bracket_size - size < 16);
+  Locks::mutator_lock_->AssertExclusiveHeld(self);
+  void* slot_addr = AllocFromCurrentRunUnlocked(self, idx);
+  if (LIKELY(slot_addr != nullptr)) {
+    DCHECK(bytes_allocated != nullptr);
+    *bytes_allocated = bracket_size;
+    // The caller verifies that the returned memory is zeroed.
+  }
+  return slot_addr;
+}
+
 void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) {
   DCHECK_LE(size, kLargeSizeThreshold);
   size_t bracket_size;
@@ -596,7 +650,7 @@
 
   void* slot_addr;
 
-  if (LIKELY(idx <= kMaxThreadLocalSizeBracketIdx)) {
+  if (LIKELY(idx < kNumThreadLocalSizeBrackets)) {
     // Use a thread-local run.
     Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
     // Allow invalid since this will always fail the allocation.
@@ -631,7 +685,6 @@
         // No slots got freed. Try to refill the thread-local run.
         DCHECK(thread_local_run->IsFull());
         if (thread_local_run != dedicated_full_run_) {
-          self->SetRosAllocRun(idx, dedicated_full_run_);
           thread_local_run->SetIsThreadLocal(false);
           if (kIsDebugBuild) {
             full_runs_[idx].insert(thread_local_run);
@@ -646,8 +699,9 @@
         }
 
         thread_local_run = RefillRun(self, idx);
-        if (UNLIKELY(thread_local_run == NULL)) {
-          return NULL;
+        if (UNLIKELY(thread_local_run == nullptr)) {
+          self->SetRosAllocRun(idx, dedicated_full_run_);
+          return nullptr;
         }
         DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
         DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
@@ -656,12 +710,12 @@
         DCHECK(!thread_local_run->IsFull());
       }
 
-      DCHECK(thread_local_run != NULL);
+      DCHECK(thread_local_run != nullptr);
       DCHECK(!thread_local_run->IsFull());
       DCHECK(thread_local_run->IsThreadLocal());
       slot_addr = thread_local_run->AllocSlot();
       // Must succeed now with a new run.
-      DCHECK(slot_addr != NULL);
+      DCHECK(slot_addr != nullptr);
     }
     if (kTraceRosAlloc) {
       LOG(INFO) << "RosAlloc::AllocFromRun() thread-local : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
@@ -671,48 +725,7 @@
   } else {
     // Use the (shared) current run.
     MutexLock mu(self, *size_bracket_locks_[idx]);
-    Run* current_run = current_runs_[idx];
-    if (UNLIKELY(current_run == NULL)) {
-      current_run = RefillRun(self, idx);
-      if (UNLIKELY(current_run == NULL)) {
-        return NULL;
-      }
-      DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
-      DCHECK(full_runs_[idx].find(current_run) == full_runs_[idx].end());
-      current_run->SetIsThreadLocal(false);
-      current_runs_[idx] = current_run;
-      DCHECK(!current_run->IsFull());
-    }
-    DCHECK(current_run != NULL);
-    slot_addr = current_run->AllocSlot();
-    if (UNLIKELY(slot_addr == NULL)) {
-      // The current run got full. Try to refill it.
-      DCHECK(current_run->IsFull());
-      current_runs_[idx] = NULL;
-      if (kIsDebugBuild) {
-        // Insert it into full_runs and set the current run to NULL.
-        full_runs_[idx].insert(current_run);
-        if (kTraceRosAlloc) {
-          LOG(INFO) << "RosAlloc::AllocFromRun() : Inserted run 0x" << std::hex << reinterpret_cast<intptr_t>(current_run)
-                    << " into full_runs_[" << std::dec << idx << "]";
-        }
-      }
-      DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
-      DCHECK(full_runs_[idx].find(current_run) != full_runs_[idx].end());
-      current_run = RefillRun(self, idx);
-      if (UNLIKELY(current_run == NULL)) {
-        return NULL;
-      }
-      DCHECK(current_run != NULL);
-      DCHECK(non_full_runs_[idx].find(current_run) == non_full_runs_[idx].end());
-      DCHECK(full_runs_[idx].find(current_run) == full_runs_[idx].end());
-      current_run->SetIsThreadLocal(false);
-      current_runs_[idx] = current_run;
-      DCHECK(!current_run->IsFull());
-      slot_addr = current_run->AllocSlot();
-      // Must succeed now with a new run.
-      DCHECK(slot_addr != NULL);
-    }
+    slot_addr = AllocFromCurrentRunUnlocked(self, idx);
     if (kTraceRosAlloc) {
       LOG(INFO) << "RosAlloc::AllocFromRun() : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
                 << "-0x" << (reinterpret_cast<intptr_t>(slot_addr) + bracket_size)
@@ -741,7 +754,7 @@
   }
   if (LIKELY(run->IsThreadLocal())) {
     // It's a thread-local run. Just mark the thread-local free bit map and return.
-    DCHECK_LE(run->size_bracket_idx_, kMaxThreadLocalSizeBracketIdx);
+    DCHECK_LT(run->size_bracket_idx_, kNumThreadLocalSizeBrackets);
     DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
     DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
     run->MarkThreadLocalFreeBitMap(ptr);
@@ -766,7 +779,7 @@
       }
     }
     if (run == current_runs_[idx]) {
-      current_runs_[idx] = NULL;
+      current_runs_[idx] = dedicated_full_run_;
     }
     DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
     DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
@@ -1233,7 +1246,7 @@
     size_t idx = run->size_bracket_idx_;
     MutexLock mu(self, *size_bracket_locks_[idx]);
     if (run->IsThreadLocal()) {
-      DCHECK_LE(run->size_bracket_idx_, kMaxThreadLocalSizeBracketIdx);
+      DCHECK_LT(run->size_bracket_idx_, kNumThreadLocalSizeBrackets);
       DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
       run->UnionBulkFreeBitMapToThreadLocalFreeBitMap();
@@ -1627,7 +1640,7 @@
   Thread* self = Thread::Current();
   // Avoid race conditions on the bulk free bit maps with BulkFree() (GC).
   WriterMutexLock wmu(self, bulk_free_lock_);
-  for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
+  for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; idx++) {
     MutexLock mu(self, *size_bracket_locks_[idx]);
     Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
     CHECK(thread_local_run != nullptr);
@@ -1643,30 +1656,48 @@
       thread_local_run->MergeBulkFreeBitMapIntoAllocBitMap();
       DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
-      if (thread_local_run->IsFull()) {
-        if (kIsDebugBuild) {
-          full_runs_[idx].insert(thread_local_run);
-          DCHECK(full_runs_[idx].find(thread_local_run) != full_runs_[idx].end());
-          if (kTraceRosAlloc) {
-            LOG(INFO) << "RosAlloc::RevokeThreadLocalRuns() : Inserted run 0x" << std::hex
-                      << reinterpret_cast<intptr_t>(thread_local_run)
-                      << " into full_runs_[" << std::dec << idx << "]";
-          }
-        }
-      } else if (thread_local_run->IsAllFree()) {
-        MutexLock mu(self, lock_);
-        thread_local_run->ZeroHeader();
-        FreePages(self, thread_local_run, true);
-      } else {
-        non_full_runs_[idx].insert(thread_local_run);
-        DCHECK(non_full_runs_[idx].find(thread_local_run) != non_full_runs_[idx].end());
-        if (kTraceRosAlloc) {
-          LOG(INFO) << "RosAlloc::RevokeThreadLocalRuns() : Inserted run 0x" << std::hex
-                    << reinterpret_cast<intptr_t>(thread_local_run)
-                    << " into non_full_runs_[" << std::dec << idx << "]";
-        }
+      RevokeRun(self, idx, thread_local_run);
+    }
+  }
+}
+
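+// Moves a run that is no longer current or thread-local into full_runs_
+// (debug builds only) or non_full_runs_, or frees its pages if the run is
+// completely free. The bracket lock for idx must be held.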
+void RosAlloc::RevokeRun(Thread* self, size_t idx, Run* run) {
+  size_bracket_locks_[idx]->AssertHeld(self);
+  DCHECK(run != dedicated_full_run_);
+  if (run->IsFull()) {
+    if (kIsDebugBuild) {
+      full_runs_[idx].insert(run);
+      DCHECK(full_runs_[idx].find(run) != full_runs_[idx].end());
+      if (kTraceRosAlloc) {
+        LOG(INFO) << __FUNCTION__ << " : Inserted run 0x" << std::hex
+                  << reinterpret_cast<intptr_t>(run)
+                  << " into full_runs_[" << std::dec << idx << "]";
       }
     }
+  } else if (run->IsAllFree()) {
+    run->ZeroHeader();
+    MutexLock mu(self, lock_);
+    FreePages(self, run, true);
+  } else {
+    non_full_runs_[idx].insert(run);
+    DCHECK(non_full_runs_[idx].find(run) != non_full_runs_[idx].end());
+    if (kTraceRosAlloc) {
+      LOG(INFO) << __FUNCTION__ << " : Inserted run 0x" << std::hex
+                << reinterpret_cast<intptr_t>(run)
+                << " into non_full_runs_[" << std::dec << idx << "]";
+    }
+  }
+}
+
+void RosAlloc::RevokeThreadUnsafeCurrentRuns() {
+  // Revoke the current runs that share bracket indices with the thread-local runs.
+  Thread* self = Thread::Current();
+  for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; ++idx) {
+    MutexLock mu(self, *size_bracket_locks_[idx]);
+    if (current_runs_[idx] != dedicated_full_run_) {
+      RevokeRun(self, idx, current_runs_[idx]);
+      current_runs_[idx] = dedicated_full_run_;
+    }
   }
 }
 
@@ -1679,6 +1710,7 @@
   for (Thread* thread : thread_list) {
     RevokeThreadLocalRuns(thread);
   }
+  RevokeThreadUnsafeCurrentRuns();
 }
 
 void RosAlloc::AssertThreadLocalRunsAreRevoked(Thread* thread) {
@@ -1686,7 +1718,7 @@
     Thread* self = Thread::Current();
     // Avoid race conditions on the bulk free bit maps with BulkFree() (GC).
     WriterMutexLock wmu(self, bulk_free_lock_);
-    for (size_t idx = 0; idx < kNumOfSizeBrackets; idx++) {
+    for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; idx++) {
       MutexLock mu(self, *size_bracket_locks_[idx]);
       Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
       DCHECK(thread_local_run == nullptr || thread_local_run == dedicated_full_run_);
@@ -1696,18 +1728,21 @@
 
 void RosAlloc::AssertAllThreadLocalRunsAreRevoked() {
   if (kIsDebugBuild) {
-    MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
-    MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
+    Thread* self = Thread::Current();
+    MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+    MutexLock mu2(self, *Locks::thread_list_lock_);
     std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
     for (Thread* t : thread_list) {
       AssertThreadLocalRunsAreRevoked(t);
     }
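+    // The current runs borrowed by the thread-unsafe path must also have been revoked.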
+    for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; ++idx) {
+      MutexLock mu(self, *size_bracket_locks_[idx]);
+      CHECK_EQ(current_runs_[idx], dedicated_full_run_);
+    }
   }
 }
 
 void RosAlloc::Initialize() {
-  // Check the consistency of the number of size brackets.
-  DCHECK_EQ(Thread::kRosAllocNumOfSizeBrackets, kNumOfSizeBrackets);
   // bracketSizes.
   for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
     if (i < kNumOfSizeBrackets - 2) {
@@ -1911,15 +1946,34 @@
           break;
         }
         case kPageMapRunPart:
-          LOG(FATAL) << "Unreachable - page map type: " << pm << std::endl << DumpPageMap();
-          break;
+          // Fall-through.
         default:
           LOG(FATAL) << "Unreachable - page map type: " << pm << std::endl << DumpPageMap();
           break;
       }
     }
   }
-
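+  // Verify that the per-thread thread-local runs and the shared current runs are consistent.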
+  std::list<Thread*> threads = Runtime::Current()->GetThreadList()->GetList();
+  for (Thread* thread : threads) {
+    for (size_t i = 0; i < kNumThreadLocalSizeBrackets; ++i) {
+      MutexLock mu(self, *size_bracket_locks_[i]);
+      Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(i));
+      CHECK(thread_local_run != nullptr);
+      CHECK(thread_local_run->IsThreadLocal());
+      CHECK(thread_local_run == dedicated_full_run_ ||
+            thread_local_run->size_bracket_idx_ == i);
+    }
+  }
+  for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
+    MutexLock mu(self, *size_bracket_locks_[i]);
+    Run* current_run = current_runs_[i];
+    CHECK(current_run != nullptr);
+    if (current_run != dedicated_full_run_) {
+      // The dedicated full run is currently marked as thread local.
+      CHECK(!current_run->IsThreadLocal());
+      CHECK_EQ(current_run->size_bracket_idx_, i);
+    }
+  }
   // Call Verify() here for the lock order.
   for (auto& run : runs) {
     run->Verify(self, this);
@@ -1952,7 +2006,7 @@
     std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
     for (auto it = thread_list.begin(); it != thread_list.end(); ++it) {
       Thread* thread = *it;
-      for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
+      for (size_t i = 0; i < kNumThreadLocalSizeBrackets; i++) {
         MutexLock mu(self, *rosalloc->size_bracket_locks_[i]);
         Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(i));
         if (thread_local_run == this) {