RosAlloc: thread-local allocation path without a CAS.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~29% reduction)
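
The fast path added here lets a thread allocate from its existing
thread-local run without taking a bracket lock or issuing a CAS:
CanAllocFromThreadLocalRun() checks that the size falls into a
thread-local bracket and that the run still has a free slot, and
AllocFromThreadLocalRun() then takes a slot straight from the run's
allocation bitmap. To keep Heap::num_bytes_allocated_ correct without
per-allocation atomics, Alloc() reports bytes_tl_bulk_allocated (the free
slots of a newly assigned run, bounded by MaxBytesBulkAllocatedFor()) and
RevokeThreadLocalRuns() returns the unused bytes so the ahead-of-time
counting can be cancelled out.

Illustrative sketch only (not part of this patch) of how a caller such as
a heap allocation entrypoint might use the new entry points; the function
name TryAllocateRosAlloc and the surrounding plumbing are placeholders, and
the relevant runtime headers (rosalloc.h, thread.h) are assumed:

  // Hypothetical caller; not part of this change.
  void* TryAllocateRosAlloc(art::gc::allocator::RosAlloc* rosalloc,
                            art::Thread* self, size_t alloc_size,
                            size_t* bytes_allocated, size_t* usable_size,
                            size_t* bytes_tl_bulk_allocated) {
    if (rosalloc->CanAllocFromThreadLocalRun(self, alloc_size)) {
      // Fast path: no bracket lock, no CAS; the slot comes straight out of
      // the thread-local run's allocation bitmap.
      void* obj = rosalloc->AllocFromThreadLocalRun(self, alloc_size,
                                                    bytes_allocated);
      if (obj != nullptr) {
        // The run's free slots were already counted when the run was handed
        // to this thread, so there is nothing to add to num_bytes_allocated_.
        *usable_size = *bytes_allocated;
        *bytes_tl_bulk_allocated = 0;
        return obj;
      }
    }
    // Slow path: may take bracket locks; reports the bulk-counted bytes of a
    // newly assigned thread-local run via bytes_tl_bulk_allocated.
    return rosalloc->Alloc<true>(self, alloc_size, bytes_allocated,
                                 usable_size, bytes_tl_bulk_allocated);
  }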

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/allocator/rosalloc-inl.h b/runtime/gc/allocator/rosalloc-inl.h
index f6c9d3c..bba92a1 100644
--- a/runtime/gc/allocator/rosalloc-inl.h
+++ b/runtime/gc/allocator/rosalloc-inl.h
@@ -28,15 +28,19 @@
 }
 
 template<bool kThreadSafe>
-inline ALWAYS_INLINE void* RosAlloc::Alloc(Thread* self, size_t size, size_t* bytes_allocated) {
+inline ALWAYS_INLINE void* RosAlloc::Alloc(Thread* self, size_t size, size_t* bytes_allocated,
+                                           size_t* usable_size,
+                                           size_t* bytes_tl_bulk_allocated) {
   if (UNLIKELY(size > kLargeSizeThreshold)) {
-    return AllocLargeObject(self, size, bytes_allocated);
+    return AllocLargeObject(self, size, bytes_allocated, usable_size,
+                            bytes_tl_bulk_allocated);
   }
   void* m;
   if (kThreadSafe) {
-    m = AllocFromRun(self, size, bytes_allocated);
+    m = AllocFromRun(self, size, bytes_allocated, usable_size, bytes_tl_bulk_allocated);
   } else {
-    m = AllocFromRunThreadUnsafe(self, size, bytes_allocated);
+    m = AllocFromRunThreadUnsafe(self, size, bytes_allocated, usable_size,
+                                 bytes_tl_bulk_allocated);
   }
   // Check if the returned memory is really all zero.
   if (ShouldCheckZeroMemory() && m != nullptr) {
@@ -48,6 +52,115 @@
   return m;
 }
 
+inline bool RosAlloc::Run::IsFull() {
+  const size_t num_vec = NumberOfBitmapVectors();
+  for (size_t v = 0; v < num_vec; ++v) {
+    if (~alloc_bit_map_[v] != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline bool RosAlloc::CanAllocFromThreadLocalRun(Thread* self, size_t size) {
+  if (UNLIKELY(!IsSizeForThreadLocal(size))) {
+    return false;
+  }
+  size_t bracket_size;
+  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
+  DCHECK_EQ(idx, SizeToIndex(size));
+  DCHECK_EQ(bracket_size, IndexToBracketSize(idx));
+  DCHECK_EQ(bracket_size, bracketSizes[idx]);
+  DCHECK_LE(size, bracket_size);
+  DCHECK(size > 512 || bracket_size - size < 16);
+  DCHECK_LT(idx, kNumThreadLocalSizeBrackets);
+  Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
+  if (kIsDebugBuild) {
+    // Need the lock to prevent race conditions.
+    MutexLock mu(self, *size_bracket_locks_[idx]);
+    CHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
+    CHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
+  }
+  DCHECK(thread_local_run != nullptr);
+  DCHECK(thread_local_run->IsThreadLocal() || thread_local_run == dedicated_full_run_);
+  return !thread_local_run->IsFull();
+}
+
+inline void* RosAlloc::AllocFromThreadLocalRun(Thread* self, size_t size,
+                                               size_t* bytes_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  if (UNLIKELY(!IsSizeForThreadLocal(size))) {
+    return nullptr;
+  }
+  size_t bracket_size;
+  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
+  Run* thread_local_run = reinterpret_cast<Run*>(self->GetRosAllocRun(idx));
+  if (kIsDebugBuild) {
+    // Need the lock to prevent race conditions.
+    MutexLock mu(self, *size_bracket_locks_[idx]);
+    CHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
+    CHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
+  }
+  DCHECK(thread_local_run != nullptr);
+  DCHECK(thread_local_run->IsThreadLocal() || thread_local_run == dedicated_full_run_);
+  void* slot_addr = thread_local_run->AllocSlot();
+  if (LIKELY(slot_addr != nullptr)) {
+    *bytes_allocated = bracket_size;
+  }
+  return slot_addr;
+}
+
+inline size_t RosAlloc::MaxBytesBulkAllocatedFor(size_t size) {
+  if (UNLIKELY(!IsSizeForThreadLocal(size))) {
+    return size;
+  }
+  size_t bracket_size;
+  size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
+  return numOfSlots[idx] * bracket_size;
+}
+
+inline void* RosAlloc::Run::AllocSlot() {
+  const size_t idx = size_bracket_idx_;
+  while (true) {
+    if (kIsDebugBuild) {
+      // Make sure that no slots leaked, the bitmap should be full for all previous vectors.
+      for (size_t i = 0; i < first_search_vec_idx_; ++i) {
+        CHECK_EQ(~alloc_bit_map_[i], 0U);
+      }
+    }
+    uint32_t* const alloc_bitmap_ptr = &alloc_bit_map_[first_search_vec_idx_];
+    uint32_t ffz1 = __builtin_ffs(~*alloc_bitmap_ptr);
+    if (LIKELY(ffz1 != 0)) {
+      const uint32_t ffz = ffz1 - 1;
+      const uint32_t slot_idx = ffz +
+          first_search_vec_idx_ * sizeof(*alloc_bitmap_ptr) * kBitsPerByte;
+      const uint32_t mask = 1U << ffz;
+      DCHECK_LT(slot_idx, numOfSlots[idx]) << "out of range";
+      // Found an empty slot. Set the bit.
+      DCHECK_EQ(*alloc_bitmap_ptr & mask, 0U);
+      *alloc_bitmap_ptr |= mask;
+      DCHECK_NE(*alloc_bitmap_ptr & mask, 0U);
+      uint8_t* slot_addr = reinterpret_cast<uint8_t*>(this) +
+          headerSizes[idx] + slot_idx * bracketSizes[idx];
+      if (kTraceRosAlloc) {
+        LOG(INFO) << "RosAlloc::Run::AllocSlot() : 0x" << std::hex
+                  << reinterpret_cast<intptr_t>(slot_addr)
+                  << ", bracket_size=" << std::dec << bracketSizes[idx]
+                  << ", slot_idx=" << slot_idx;
+      }
+      return slot_addr;
+    }
+    const size_t num_words = RoundUp(numOfSlots[idx], 32) / 32;
+    if (first_search_vec_idx_ + 1 >= num_words) {
+      DCHECK(IsFull());
+      // Already at the last word, return null.
+      return nullptr;
+    }
+    // Increase the index to the next word and try again.
+    ++first_search_vec_idx_;
+  }
+}
+
 }  // namespace allocator
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index f51093a..f64a4ff 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -454,7 +454,10 @@
   return byte_size;
 }
 
-void* RosAlloc::AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated) {
+void* RosAlloc::AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated,
+                                 size_t* usable_size, size_t* bytes_tl_bulk_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  DCHECK(usable_size != nullptr);
   DCHECK_GT(size, kLargeSizeThreshold);
   size_t num_pages = RoundUp(size, kPageSize) / kPageSize;
   void* r;
@@ -470,6 +473,8 @@
   }
   const size_t total_bytes = num_pages * kPageSize;
   *bytes_allocated = total_bytes;
+  *usable_size = total_bytes;
+  *bytes_tl_bulk_allocated = total_bytes;
   if (kTraceRosAlloc) {
     LOG(INFO) << "RosAlloc::AllocLargeObject() : 0x" << std::hex << reinterpret_cast<intptr_t>(r)
               << "-0x" << (reinterpret_cast<intptr_t>(r) + num_pages * kPageSize)
@@ -622,7 +627,12 @@
   return slot_addr;
 }
 
-void* RosAlloc::AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated) {
+void* RosAlloc::AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated,
+                                         size_t* usable_size,
+                                         size_t* bytes_tl_bulk_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  DCHECK(usable_size != nullptr);
+  DCHECK(bytes_tl_bulk_allocated != nullptr);
   DCHECK_LE(size, kLargeSizeThreshold);
   size_t bracket_size;
   size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
@@ -634,14 +644,19 @@
   Locks::mutator_lock_->AssertExclusiveHeld(self);
   void* slot_addr = AllocFromCurrentRunUnlocked(self, idx);
   if (LIKELY(slot_addr != nullptr)) {
-    DCHECK(bytes_allocated != nullptr);
     *bytes_allocated = bracket_size;
-    // Caller verifies that it is all 0.
+    *usable_size = bracket_size;
+    *bytes_tl_bulk_allocated = bracket_size;
   }
+  // Caller verifies that it is all 0.
   return slot_addr;
 }
 
-void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated) {
+void* RosAlloc::AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated,
+                             size_t* usable_size, size_t* bytes_tl_bulk_allocated) {
+  DCHECK(bytes_allocated != nullptr);
+  DCHECK(usable_size != nullptr);
+  DCHECK(bytes_tl_bulk_allocated != nullptr);
   DCHECK_LE(size, kLargeSizeThreshold);
   size_t bracket_size;
   size_t idx = SizeToIndexAndBracketSize(size, &bracket_size);
@@ -712,31 +727,43 @@
         self->SetRosAllocRun(idx, thread_local_run);
         DCHECK(!thread_local_run->IsFull());
       }
-
       DCHECK(thread_local_run != nullptr);
       DCHECK(!thread_local_run->IsFull());
       DCHECK(thread_local_run->IsThreadLocal());
+      // Account for all the free slots in the new or refreshed thread local run.
+      *bytes_tl_bulk_allocated = thread_local_run->NumberOfFreeSlots() * bracket_size;
       slot_addr = thread_local_run->AllocSlot();
       // Must succeed now with a new run.
       DCHECK(slot_addr != nullptr);
+    } else {
+      // The slot is already counted. Leave it as is.
+      *bytes_tl_bulk_allocated = 0;
     }
+    DCHECK(slot_addr != nullptr);
     if (kTraceRosAlloc) {
-      LOG(INFO) << "RosAlloc::AllocFromRun() thread-local : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
+      LOG(INFO) << "RosAlloc::AllocFromRun() thread-local : 0x" << std::hex
+                << reinterpret_cast<intptr_t>(slot_addr)
                 << "-0x" << (reinterpret_cast<intptr_t>(slot_addr) + bracket_size)
                 << "(" << std::dec << (bracket_size) << ")";
     }
+    *bytes_allocated = bracket_size;
+    *usable_size = bracket_size;
   } else {
     // Use the (shared) current run.
     MutexLock mu(self, *size_bracket_locks_[idx]);
     slot_addr = AllocFromCurrentRunUnlocked(self, idx);
     if (kTraceRosAlloc) {
-      LOG(INFO) << "RosAlloc::AllocFromRun() : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
+      LOG(INFO) << "RosAlloc::AllocFromRun() : 0x" << std::hex
+                << reinterpret_cast<intptr_t>(slot_addr)
                 << "-0x" << (reinterpret_cast<intptr_t>(slot_addr) + bracket_size)
                 << "(" << std::dec << (bracket_size) << ")";
     }
+    if (LIKELY(slot_addr != nullptr)) {
+      *bytes_allocated = bracket_size;
+      *usable_size = bracket_size;
+      *bytes_tl_bulk_allocated = bracket_size;
+    }
   }
-  DCHECK(bytes_allocated != nullptr);
-  *bytes_allocated = bracket_size;
   // Caller verifies that it is all 0.
   return slot_addr;
 }
@@ -852,44 +879,6 @@
   return stream.str();
 }
 
-inline void* RosAlloc::Run::AllocSlot() {
-  const size_t idx = size_bracket_idx_;
-  while (true) {
-    if (kIsDebugBuild) {
-      // Make sure that no slots leaked, the bitmap should be full for all previous vectors.
-      for (size_t i = 0; i < first_search_vec_idx_; ++i) {
-        CHECK_EQ(~alloc_bit_map_[i], 0U);
-      }
-    }
-    uint32_t* const alloc_bitmap_ptr = &alloc_bit_map_[first_search_vec_idx_];
-    uint32_t ffz1 = __builtin_ffs(~*alloc_bitmap_ptr);
-    if (LIKELY(ffz1 != 0)) {
-      const uint32_t ffz = ffz1 - 1;
-      const uint32_t slot_idx = ffz + first_search_vec_idx_ * sizeof(*alloc_bitmap_ptr) * kBitsPerByte;
-      const uint32_t mask = 1U << ffz;
-      DCHECK_LT(slot_idx, numOfSlots[idx]) << "out of range";
-      // Found an empty slot. Set the bit.
-      DCHECK_EQ(*alloc_bitmap_ptr & mask, 0U);
-      *alloc_bitmap_ptr |= mask;
-      DCHECK_NE(*alloc_bitmap_ptr & mask, 0U);
-      uint8_t* slot_addr = reinterpret_cast<uint8_t*>(this) + headerSizes[idx] + slot_idx * bracketSizes[idx];
-      if (kTraceRosAlloc) {
-        LOG(INFO) << "RosAlloc::Run::AllocSlot() : 0x" << std::hex << reinterpret_cast<intptr_t>(slot_addr)
-                  << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx;
-      }
-      return slot_addr;
-    }
-    const size_t num_words = RoundUp(numOfSlots[idx], 32) / 32;
-    if (first_search_vec_idx_ + 1 >= num_words) {
-      DCHECK(IsFull());
-      // Already at the last word, return null.
-      return nullptr;
-    }
-    // Increase the index to the next word and try again.
-    ++first_search_vec_idx_;
-  }
-}
-
 void RosAlloc::Run::FreeSlot(void* ptr) {
   DCHECK(!IsThreadLocal());
   const uint8_t idx = size_bracket_idx_;
@@ -920,6 +909,25 @@
   }
 }
 
+size_t RosAlloc::Run::NumberOfFreeSlots() {
+  size_t num_alloc_slots = 0;
+  const size_t idx = size_bracket_idx_;
+  const size_t num_slots = numOfSlots[idx];
+  const size_t num_vec = RoundUp(num_slots, 32) / 32;
+  DCHECK_NE(num_vec, 0U);
+  for (size_t v = 0; v < num_vec - 1; v++) {
+    num_alloc_slots += POPCOUNT(alloc_bit_map_[v]);
+  }
+  // Don't count the invalid bits in the last vector.
+  uint32_t last_vec_masked = alloc_bit_map_[num_vec - 1] &
+      ~GetBitmapLastVectorMask(num_slots, num_vec);
+  num_alloc_slots += POPCOUNT(last_vec_masked);
+  size_t num_free_slots = num_slots - num_alloc_slots;
+  DCHECK_LE(num_alloc_slots, num_slots);
+  DCHECK_LE(num_free_slots, num_slots);
+  return num_free_slots;
+}
+
 inline bool RosAlloc::Run::MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_free_after_out) {
   DCHECK(IsThreadLocal());
   // Free slots in the alloc bit map based on the thread local free bit map.
@@ -1055,16 +1063,6 @@
   return alloc_bit_map_[num_vec - 1] == GetBitmapLastVectorMask(num_slots, num_vec);
 }
 
-inline bool RosAlloc::Run::IsFull() {
-  const size_t num_vec = NumberOfBitmapVectors();
-  for (size_t v = 0; v < num_vec; ++v) {
-    if (~alloc_bit_map_[v] != 0) {
-      return false;
-    }
-  }
-  return true;
-}
-
 inline bool RosAlloc::Run::IsBulkFreeBitmapClean() {
   const size_t num_vec = NumberOfBitmapVectors();
   for (size_t v = 0; v < num_vec; v++) {
@@ -1654,10 +1652,11 @@
   }
 }
 
-void RosAlloc::RevokeThreadLocalRuns(Thread* thread) {
+size_t RosAlloc::RevokeThreadLocalRuns(Thread* thread) {
   Thread* self = Thread::Current();
   // Avoid race conditions on the bulk free bit maps with BulkFree() (GC).
   ReaderMutexLock wmu(self, bulk_free_lock_);
+  size_t free_bytes = 0U;
   for (size_t idx = 0; idx < kNumThreadLocalSizeBrackets; idx++) {
     MutexLock mu(self, *size_bracket_locks_[idx]);
     Run* thread_local_run = reinterpret_cast<Run*>(thread->GetRosAllocRun(idx));
@@ -1665,9 +1664,12 @@
     // Invalid means already revoked.
     DCHECK(thread_local_run->IsThreadLocal());
     if (thread_local_run != dedicated_full_run_) {
+      // Note the thread local run may not be full here.
       thread->SetRosAllocRun(idx, dedicated_full_run_);
       DCHECK_EQ(thread_local_run->magic_num_, kMagicNum);
-      // Note the thread local run may not be full here.
+      // Count the number of free slots left.
+      size_t num_free_slots = thread_local_run->NumberOfFreeSlots();
+      free_bytes += num_free_slots * bracketSizes[idx];
       bool dont_care;
       thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&dont_care);
       thread_local_run->SetIsThreadLocal(false);
@@ -1677,6 +1679,7 @@
       RevokeRun(self, idx, thread_local_run);
     }
   }
+  return free_bytes;
 }
 
 void RosAlloc::RevokeRun(Thread* self, size_t idx, Run* run) {
@@ -1719,16 +1722,18 @@
   }
 }
 
-void RosAlloc::RevokeAllThreadLocalRuns() {
+size_t RosAlloc::RevokeAllThreadLocalRuns() {
   // This is called when a mutator thread won't allocate such as at
   // the Zygote creation time or during the GC pause.
   MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
   MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
   std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+  size_t free_bytes = 0U;
   for (Thread* thread : thread_list) {
-    RevokeThreadLocalRuns(thread);
+    free_bytes += RevokeThreadLocalRuns(thread);
   }
   RevokeThreadUnsafeCurrentRuns();
+  return free_bytes;
 }
 
 void RosAlloc::AssertThreadLocalRunsAreRevoked(Thread* thread) {
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 3269e10..d1e7ad9 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -230,8 +230,10 @@
     static uint32_t GetBitmapLastVectorMask(size_t num_slots, size_t num_vec);
     // Returns true if all the slots in the run are not in use.
     bool IsAllFree();
+    // Returns the number of free slots.
+    size_t NumberOfFreeSlots();
     // Returns true if all the slots in the run are in use.
-    bool IsFull();
+    ALWAYS_INLINE bool IsFull();
     // Returns true if the bulk free bit map is clean.
     bool IsBulkFreeBitmapClean();
     // Returns true if the thread local free bit map is clean.
@@ -309,6 +311,15 @@
     DCHECK(bracketSizes[idx] == size);
     return idx;
   }
+  // Returns true if the given allocation size is for a thread local allocation.
+  static bool IsSizeForThreadLocal(size_t size) {
+    DCHECK_GT(kNumThreadLocalSizeBrackets, 0U);
+    size_t max_thread_local_bracket_idx = kNumThreadLocalSizeBrackets - 1;
+    bool is_size_for_thread_local = size <= bracketSizes[max_thread_local_bracket_idx];
+    DCHECK(size > kLargeSizeThreshold ||
+           (is_size_for_thread_local == (SizeToIndex(size) < kNumThreadLocalSizeBrackets)));
+    return is_size_for_thread_local;
+  }
   // Rounds the size up to the nearest bracket size.
   static size_t RoundToBracketSize(size_t size) {
     DCHECK(size <= kLargeSizeThreshold);
@@ -504,11 +515,13 @@
   size_t FreePages(Thread* self, void* ptr, bool already_zero) EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   // Allocate/free a run slot.
-  void* AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated)
+  void* AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated, size_t* usable_size,
+                     size_t* bytes_tl_bulk_allocated)
       LOCKS_EXCLUDED(lock_);
   // Allocate/free a run slot without acquiring locks.
   // TODO: EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-  void* AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated)
+  void* AllocFromRunThreadUnsafe(Thread* self, size_t size, size_t* bytes_allocated,
+                                 size_t* usable_size, size_t* bytes_tl_bulk_allocated)
       LOCKS_EXCLUDED(lock_);
   void* AllocFromCurrentRunUnlocked(Thread* self, size_t idx);
 
@@ -527,7 +540,9 @@
   size_t FreeInternal(Thread* self, void* ptr) LOCKS_EXCLUDED(lock_);
 
   // Allocates large objects.
-  void* AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated) LOCKS_EXCLUDED(lock_);
+  void* AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated,
+                         size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      LOCKS_EXCLUDED(lock_);
 
   // Revoke a run by adding it to non_full_runs_ or freeing the pages.
   void RevokeRun(Thread* self, size_t idx, Run* run);
@@ -551,13 +566,26 @@
   // If kThreadSafe is false, the allocator may avoid acquiring some locks as an optimization.
   // If used, this may cause race conditions if multiple threads are allocating at the same time.
   template<bool kThreadSafe = true>
-  void* Alloc(Thread* self, size_t size, size_t* bytes_allocated)
+  void* Alloc(Thread* self, size_t size, size_t* bytes_allocated, size_t* usable_size,
+              size_t* bytes_tl_bulk_allocated)
       LOCKS_EXCLUDED(lock_);
   size_t Free(Thread* self, void* ptr)
       LOCKS_EXCLUDED(bulk_free_lock_);
   size_t BulkFree(Thread* self, void** ptrs, size_t num_ptrs)
       LOCKS_EXCLUDED(bulk_free_lock_);
 
+  // Returns true if the given allocation request can be allocated in
+  // an existing thread local run without allocating a new run.
+  ALWAYS_INLINE bool CanAllocFromThreadLocalRun(Thread* self, size_t size);
+  // Allocate the given allocation request in an existing thread local
+  // run without allocating a new run.
+  ALWAYS_INLINE void* AllocFromThreadLocalRun(Thread* self, size_t size, size_t* bytes_allocated);
+
+  // Returns the maximum bytes that could be allocated for the given
+  // size in bulk, that is the maximum value for the
+  // bytes_tl_bulk_allocated out param returned by RosAlloc::Alloc().
+  ALWAYS_INLINE size_t MaxBytesBulkAllocatedFor(size_t size);
+
   // Returns the size of the allocated slot for a given allocated memory chunk.
   size_t UsableSize(const void* ptr);
   // Returns the size of the allocated slot for a given size.
@@ -586,9 +614,13 @@
   void SetFootprintLimit(size_t bytes) LOCKS_EXCLUDED(lock_);
 
   // Releases the thread-local runs assigned to the given thread back to the common set of runs.
-  void RevokeThreadLocalRuns(Thread* thread);
+  // Returns the total bytes of free slots in the revoked thread local runs. This is to be
+  // subtracted from Heap::num_bytes_allocated_ to cancel out the ahead-of-time counting.
+  size_t RevokeThreadLocalRuns(Thread* thread);
   // Releases the thread-local runs assigned to all the threads back to the common set of runs.
-  void RevokeAllThreadLocalRuns() LOCKS_EXCLUDED(Locks::thread_list_lock_);
+  // Returns the total bytes of free slots in the revoked thread local runs. This is to be
+  // subtracted from Heap::num_bytes_allocated_ to cancel out the ahead-of-time counting.
+  size_t RevokeAllThreadLocalRuns() LOCKS_EXCLUDED(Locks::thread_list_lock_);
   // Assert the thread local runs of a thread are revoked.
   void AssertThreadLocalRunsAreRevoked(Thread* thread);
   // Assert all the thread local runs are revoked.