Utilize partially used TLABs

Currently, once a mutator tries to allocate an object bigger than the
space remaining in its TLAB, it attempts to acquire another TLAB. The
unused portion of the previous TLAB is never utilized again. This wasted
space leads to blocking GCs when mutators are created and killed very
frequently, as can happen in the case of Zygote.

In this change, we maintain a separate list of partially used TLABs,
ordered by the size of their unused portion, which can be reused whenever
a mutator attempts to allocate a new TLAB from the region space.
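
Below is a standalone sketch of the idea; it is not the ART code in this
change, and names such as sketch::RegionSpace, Tlab, and RevokeTlab are
invented for illustration. It only mirrors the shape of the mechanism:
a multimap ordered by remaining bytes (largest first) that is consulted
before a whole new region is allocated, and refilled when a TLAB is
revoked with enough space left over.

  #include <cstddef>
  #include <cstdint>
  #include <functional>
  #include <iostream>
  #include <map>
  #include <memory>
  #include <vector>

  namespace sketch {

  constexpr size_t kRegionSize = 256 * 1024;      // size of a full region/TLAB
  constexpr size_t kPartialTlabSize = 16 * 1024;  // smallest leftover worth keeping

  struct Region {
    std::vector<uint8_t> storage = std::vector<uint8_t>(kRegionSize);
    uint8_t* Begin() { return storage.data(); }
    uint8_t* End() { return storage.data() + storage.size(); }
  };

  struct Tlab {
    Region* region = nullptr;
    uint8_t* pos = nullptr;  // next free byte
    uint8_t* end = nullptr;  // allocation limit handed to the thread
  };

  class RegionSpace {
   public:
    // Hand out a TLAB of tlab_size bytes, preferring the largest partial region.
    Tlab AllocNewTlab(size_t tlab_size) {
      Region* r = nullptr;
      uint8_t* start = nullptr;
      if (tlab_size < kRegionSize && !partial_tlabs_.empty()) {
        auto largest = partial_tlabs_.begin();  // ordered by decreasing leftover
        if (largest->first >= tlab_size) {
          r = largest->second;
          start = r->End() - largest->first;    // resume where the old TLAB stopped
          partial_tlabs_.erase(largest);
        }
      }
      if (r == nullptr) {  // no suitable leftover: allocate a whole new region
        regions_.push_back(std::make_unique<Region>());
        r = regions_.back().get();
        start = r->Begin();
      }
      return Tlab{r, start, start + tlab_size};
    }

    // Give the TLAB back; remember the region if enough space remains to recycle.
    void RevokeTlab(const Tlab& tlab, bool reuse) {
      size_t remaining = static_cast<size_t>(tlab.region->End() - tlab.pos);
      if (reuse && remaining >= kPartialTlabSize) {
        partial_tlabs_.emplace(remaining, tlab.region);
      }
    }

   private:
    // Largest leftover first, mirroring the multimap used in this change.
    std::multimap<size_t, Region*, std::greater<size_t>> partial_tlabs_;
    std::vector<std::unique_ptr<Region>> regions_;
  };

  }  // namespace sketch

  int main() {
    sketch::RegionSpace space;
    sketch::Tlab a = space.AllocNewTlab(24 * 1024);  // thread A gets a fresh region
    a.pos = a.end;                                   // ...and fills its 24 KiB TLAB
    space.RevokeTlab(a, /*reuse=*/true);             // ~232 KiB leftover is recorded
    sketch::Tlab b = space.AllocNewTlab(32 * 1024);  // thread B reuses the same region
    std::cout << "reused: " << (b.region == a.region) << std::endl;
    return 0;
  }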

Test: forrest hermetic test
Bug: 146706834
Change-Id: I8076663628e49fc10e33f30de937833f6812fdca
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index d15fdad..1f50c27 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -444,15 +444,17 @@
         << thread->GetState() << " thread " << thread << " self " << self;
     thread->SetIsGcMarkingAndUpdateEntrypoints(true);
     if (use_tlab_ && thread->HasTlab()) {
+      // We should not reuse the partially utilized TLABs revoked here as they
+      // are going to be part of from-space.
       if (ConcurrentCopying::kEnableFromSpaceAccountingCheck) {
         // This must come before the revoke.
         size_t thread_local_objects = thread->GetThreadLocalObjectsAllocated();
-        concurrent_copying_->region_space_->RevokeThreadLocalBuffers(thread);
+        concurrent_copying_->region_space_->RevokeThreadLocalBuffers(thread, /*reuse=*/ false);
         reinterpret_cast<Atomic<size_t>*>(
             &concurrent_copying_->from_space_num_objects_at_first_pause_)->
                 fetch_add(thread_local_objects, std::memory_order_relaxed);
       } else {
-        concurrent_copying_->region_space_->RevokeThreadLocalBuffers(thread);
+        concurrent_copying_->region_space_->RevokeThreadLocalBuffers(thread, /*reuse=*/ false);
       }
     }
     if (kUseThreadLocalAllocationStack) {
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index ee9e4a8..be3b7f8 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -147,10 +147,6 @@
 // If true, we log all GCs in the both the foreground and background. Used for debugging.
 static constexpr bool kLogAllGCs = false;
 
-// How much we grow the TLAB if we can do it.
-static constexpr size_t kPartialTlabSize = 16 * KB;
-static constexpr bool kUsePartialTlabs = true;
-
 // Use Max heap for 2 seconds, this is smaller than the usual 5s window since we don't want to leave
 // allocate with relaxed ergonomics for that long.
 static constexpr size_t kPostForkMaxHeapDurationMS = 2000;
@@ -4215,14 +4211,13 @@
             ? std::max(alloc_size, kPartialTlabSize)
             : gc::space::RegionSpace::kRegionSize;
         // Try to allocate a tlab.
-        if (!region_space_->AllocNewTlab(self, new_tlab_size)) {
+        if (!region_space_->AllocNewTlab(self, new_tlab_size, bytes_tl_bulk_allocated)) {
           // Failed to allocate a tlab. Try non-tlab.
           return region_space_->AllocNonvirtual<false>(alloc_size,
                                                        bytes_allocated,
                                                        usable_size,
                                                        bytes_tl_bulk_allocated);
         }
-        *bytes_tl_bulk_allocated = new_tlab_size;
         // Fall-through to using the TLAB below.
       } else {
         // Check OOME for a non-tlab allocation.
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 9d40b93..8f85c7b 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -129,6 +129,10 @@
 
 class Heap {
  public:
+  // How much we grow the TLAB if we can do it.
+  static constexpr size_t kPartialTlabSize = 16 * KB;
+  static constexpr bool kUsePartialTlabs = true;
+
   static constexpr size_t kDefaultStartingSize = kPageSize;
   static constexpr size_t kDefaultInitialSize = 2 * MB;
   static constexpr size_t kDefaultMaximumSize = 256 * MB;
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index 33b72ac..901568e 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -498,7 +498,7 @@
     DCHECK_LE(begin_, Top());
     size_t bytes;
     if (is_a_tlab_) {
-      bytes = thread_->GetThreadLocalBytesAllocated();
+      bytes = thread_->GetTlabEnd() - begin_;
     } else {
       bytes = static_cast<size_t>(Top() - begin_);
     }
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index c8b5669..faeeec0 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -337,6 +337,10 @@
     rb_table->SetAll();
   }
   MutexLock mu(Thread::Current(), region_lock_);
+  // Partially utilized TLABs cannot be reused across a GC. Therefore, discard
+  // them during the thread-flip.
+  partial_tlabs_.clear();
+
   // Counter for the number of expected large tail regions following a large region.
   size_t num_expected_large_tails = 0U;
   // Flag to store whether the previously seen large region has been evacuated.
@@ -833,17 +837,40 @@
   r->objects_allocated_.fetch_add(1, std::memory_order_relaxed);
 }
 
-bool RegionSpace::AllocNewTlab(Thread* self, size_t min_bytes) {
+bool RegionSpace::AllocNewTlab(Thread* self,
+                               const size_t tlab_size,
+                               size_t* bytes_tl_bulk_allocated) {
   MutexLock mu(self, region_lock_);
-  RevokeThreadLocalBuffersLocked(self);
-  // Retain sufficient free regions for full evacuation.
-
-  Region* r = AllocateRegion(/*for_evac=*/ false);
+  RevokeThreadLocalBuffersLocked(self, /*reuse=*/ gc::Heap::kUsePartialTlabs);
+  Region* r = nullptr;
+  uint8_t* pos = nullptr;
+  *bytes_tl_bulk_allocated = tlab_size;
+  // First attempt to get a partially used TLAB, if available.
+  if (tlab_size < kRegionSize) {
+    // Fetch the largest partial TLAB. The multimap is ordered by decreasing
+    // size.
+    auto largest_partial_tlab = partial_tlabs_.begin();
+    if (largest_partial_tlab != partial_tlabs_.end() && largest_partial_tlab->first >= tlab_size) {
+      r = largest_partial_tlab->second;
+      pos = r->End() - largest_partial_tlab->first;
+      partial_tlabs_.erase(largest_partial_tlab);
+      DCHECK_GT(r->End(), pos);
+      DCHECK_LE(r->Begin(), pos);
+      DCHECK_GE(r->Top(), pos);
+      *bytes_tl_bulk_allocated -= r->Top() - pos;
+    }
+  }
+  if (r == nullptr) {
+    // Fall back to allocating an entire region as the TLAB.
+    r = AllocateRegion(/*for_evac=*/ false);
+  }
   if (r != nullptr) {
+    uint8_t* start = pos != nullptr ? pos : r->Begin();
+    DCHECK_ALIGNED(start, kObjectAlignment);
     r->is_a_tlab_ = true;
     r->thread_ = self;
     r->SetTop(r->End());
-    self->SetTlab(r->Begin(), r->Begin() + min_bytes, r->End());
+    self->SetTlab(start, start + tlab_size, r->End());
     return true;
   }
   return false;
@@ -851,22 +878,33 @@
 
 size_t RegionSpace::RevokeThreadLocalBuffers(Thread* thread) {
   MutexLock mu(Thread::Current(), region_lock_);
-  RevokeThreadLocalBuffersLocked(thread);
+  RevokeThreadLocalBuffersLocked(thread, /*reuse=*/ gc::Heap::kUsePartialTlabs);
   return 0U;
 }
 
-void RegionSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
+size_t RegionSpace::RevokeThreadLocalBuffers(Thread* thread, const bool reuse) {
+  MutexLock mu(Thread::Current(), region_lock_);
+  RevokeThreadLocalBuffersLocked(thread, reuse);
+  return 0U;
+}
+
+void RegionSpace::RevokeThreadLocalBuffersLocked(Thread* thread, bool reuse) {
   uint8_t* tlab_start = thread->GetTlabStart();
   DCHECK_EQ(thread->HasTlab(), tlab_start != nullptr);
   if (tlab_start != nullptr) {
-    DCHECK_ALIGNED(tlab_start, kRegionSize);
     Region* r = RefToRegionLocked(reinterpret_cast<mirror::Object*>(tlab_start));
+    r->is_a_tlab_ = false;
+    r->thread_ = nullptr;
     DCHECK(r->IsAllocated());
     DCHECK_LE(thread->GetThreadLocalBytesAllocated(), kRegionSize);
     r->RecordThreadLocalAllocations(thread->GetThreadLocalObjectsAllocated(),
-                                    thread->GetThreadLocalBytesAllocated());
-    r->is_a_tlab_ = false;
-    r->thread_ = nullptr;
+                                    thread->GetTlabEnd() - r->Begin());
+    DCHECK_GE(r->End(), thread->GetTlabPos());
+    DCHECK_LE(r->Begin(), thread->GetTlabPos());
+    size_t remaining_bytes = r->End() - thread->GetTlabPos();
+    if (reuse && remaining_bytes >= gc::Heap::kPartialTlabSize) {
+      partial_tlabs_.insert(std::make_pair(remaining_bytes, r));
+    }
   }
   thread->ResetTlab();
 }
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index 6d654b3..f74abfb 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -22,6 +22,9 @@
 #include "space.h"
 #include "thread.h"
 
+#include <functional>
+#include <map>
+
 namespace art {
 namespace gc {
 
@@ -141,7 +144,7 @@
   void DumpNonFreeRegions(std::ostream& os) REQUIRES(!region_lock_);
 
   size_t RevokeThreadLocalBuffers(Thread* thread) override REQUIRES(!region_lock_);
-  void RevokeThreadLocalBuffersLocked(Thread* thread) REQUIRES(region_lock_);
+  size_t RevokeThreadLocalBuffers(Thread* thread, const bool reuse) REQUIRES(!region_lock_);
   size_t RevokeAllThreadLocalBuffers() override
       REQUIRES(!Locks::runtime_shutdown_lock_, !Locks::thread_list_lock_, !region_lock_);
   void AssertThreadLocalBuffersAreRevoked(Thread* thread) REQUIRES(!region_lock_);
@@ -189,6 +192,9 @@
   size_t GetNumRegions() const {
     return num_regions_;
   }
+  size_t GetNumNonFreeRegions() const NO_THREAD_SAFETY_ANALYSIS {
+    return num_non_free_regions_;
+  }
 
   bool CanMoveObjects() const override {
     return true;
@@ -363,7 +369,8 @@
   // Increment object allocation count for region containing ref.
   void RecordAlloc(mirror::Object* ref) REQUIRES(!region_lock_);
 
-  bool AllocNewTlab(Thread* self, size_t min_bytes) REQUIRES(!region_lock_);
+  bool AllocNewTlab(Thread* self, const size_t tlab_size, size_t* bytes_tl_bulk_allocated)
+      REQUIRES(!region_lock_);
 
   uint32_t Time() {
     return time_;
@@ -591,9 +598,8 @@
 
     void RecordThreadLocalAllocations(size_t num_objects, size_t num_bytes) {
       DCHECK(IsAllocated());
-      DCHECK_EQ(objects_allocated_.load(std::memory_order_relaxed), 0U);
       DCHECK_EQ(Top(), end_);
-      objects_allocated_.store(num_objects, std::memory_order_relaxed);
+      objects_allocated_.fetch_add(num_objects, std::memory_order_relaxed);
       top_.store(begin_ + num_bytes, std::memory_order_relaxed);
       DCHECK_LE(Top(), end_);
     }
@@ -697,6 +703,7 @@
   }
 
   Region* AllocateRegion(bool for_evac) REQUIRES(region_lock_);
+  void RevokeThreadLocalBuffersLocked(Thread* thread, bool reuse) REQUIRES(region_lock_);
 
   // Scan region range [`begin`, `end`) in increasing order to try to
   // allocate a large region having a size of `num_regs_in_large_region`
@@ -745,6 +752,9 @@
   // The pointer to the region array.
   std::unique_ptr<Region[]> regions_ GUARDED_BY(region_lock_);
 
+  // Partially used TLABs that can later be reassigned to threads so that
+  // their unused portion gets utilized.
+  std::multimap<size_t, Region*, std::greater<size_t>> partial_tlabs_ GUARDED_BY(region_lock_);
   // The upper-bound index of the non-free regions. Used to avoid scanning all regions in
   // RegionSpace::SetFromSpace and RegionSpace::ClearFromSpace.
   //
diff --git a/runtime/thread.h b/runtime/thread.h
index 32a620c..34434cf 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1189,7 +1189,9 @@
   uint8_t* GetTlabPos() {
     return tlsPtr_.thread_local_pos;
   }
-
+  uint8_t* GetTlabEnd() {
+    return tlsPtr_.thread_local_end;
+  }
   // Remove the suspend trigger for this thread by making the suspend_trigger_ TLS value
   // equal to a valid pointer.
   // TODO: does this need to atomic?  I don't think so.