 runtime/base/gc_visited_arena_pool.cc | 244
 runtime/base/gc_visited_arena_pool.h  | 207
 runtime/class_linker.cc               |  20
 runtime/class_linker.h                |   5
 runtime/class_table-inl.h             |  36
 runtime/class_table.h                 |  26
 runtime/gc/collector/mark_compact.cc  | 278
 runtime/gc/collector/mark_compact.h   |   5
 runtime/gc/heap.cc                    |   4
 runtime/intern_table.h                |  13
 runtime/runtime.cc                    |  16
 11 files changed, 613 insertions(+), 241 deletions(-)
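The patch below routes ClassTable and InternTable hash-set storage through GcRootArenaAllocator into the GC-visited arena pool, so the userfaultfd mark-compact GC can update their GC roots page by page instead of visiting the tables in the pause. The sketch that follows is a simplified, self-contained illustration of that page-by-page root-update pattern (the real logic is LinearAllocPageUpdater::SingleObjectArena in mark_compact.cc further down); the names UpdateGcRootPage and UpdateCompressedRef, the constant values, and the identity relocation are assumptions made for the sketch, not ART symbols.

```cpp
// Illustrative sketch, not the ART implementation: walk a page of 32-bit
// compressed GC roots and rewrite each reference in place, preserving the
// low bits that ClassTable's TableSlot keeps alongside the reference
// (the diff notes "Least significant bits are used by class-table").
#include <cstddef>
#include <cstdint>

constexpr size_t kPageSize = 4096;          // assumption for the sketch
constexpr uint32_t kObjectAlignment = 8;    // assumption: 8-byte object alignment

// Stand-in for the collector's relocation logic: map an old compressed
// reference to its post-compaction value. Identity here, for illustration.
static uint32_t UpdateCompressedRef(uint32_t compressed_ref) {
  return compressed_ref;
}

// Update every non-null root word in [page_begin, page_begin + page_size).
void UpdateGcRootPage(uint8_t* page_begin, size_t page_size = kPageSize) {
  constexpr uint32_t kMask = kObjectAlignment - 1;
  size_t num_roots = page_size / sizeof(uint32_t);
  uint32_t* root_ptr = reinterpret_cast<uint32_t*>(page_begin);
  for (size_t i = 0; i < num_roots; ++i, ++root_ptr) {
    uint32_t word = *root_ptr;
    if (word != 0) {
      uint32_t lsbs = word & kMask;   // class-table state bits, kept as-is
      uint32_t ref = word & ~kMask;   // the compressed reference itself
      *root_ptr = UpdateCompressedRef(ref) | lsbs;
    }
  }
}
```

Because every word in such a page is either zero or a (possibly tagged) compressed reference, the collector can process these "single object" arenas without per-page first-object bookkeeping, which is why TrackedArena skips building first_obj_array_ for them.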
diff --git a/runtime/base/gc_visited_arena_pool.cc b/runtime/base/gc_visited_arena_pool.cc index 039157a30c..77cc7ae32d 100644 --- a/runtime/base/gc_visited_arena_pool.cc +++ b/runtime/base/gc_visited_arena_pool.cc @@ -27,43 +27,67 @@ namespace art { -TrackedArena::TrackedArena(uint8_t* start, size_t size, bool pre_zygote_fork) - : Arena(), first_obj_array_(nullptr), pre_zygote_fork_(pre_zygote_fork) { +TrackedArena::TrackedArena(uint8_t* start, size_t size, bool pre_zygote_fork, bool single_obj_arena) + : Arena(), + first_obj_array_(nullptr), + pre_zygote_fork_(pre_zygote_fork), + waiting_for_deletion_(false) { static_assert(ArenaAllocator::kArenaAlignment <= kPageSize, "Arena should not need stronger alignment than kPageSize."); - DCHECK_ALIGNED(size, kPageSize); - DCHECK_ALIGNED(start, kPageSize); memory_ = start; size_ = size; - size_t arr_size = size / kPageSize; - first_obj_array_.reset(new uint8_t*[arr_size]); - std::fill_n(first_obj_array_.get(), arr_size, nullptr); + if (single_obj_arena) { + // We have only one object in this arena and it is expected to consume the + // entire arena. + bytes_allocated_ = size; + } else { + DCHECK_ALIGNED(size, kPageSize); + DCHECK_ALIGNED(start, kPageSize); + size_t arr_size = size / kPageSize; + first_obj_array_.reset(new uint8_t*[arr_size]); + std::fill_n(first_obj_array_.get(), arr_size, nullptr); + } +} + +void TrackedArena::ReleasePages(uint8_t* begin, size_t size, bool pre_zygote_fork) { + DCHECK_ALIGNED(begin, kPageSize); + // Userfaultfd GC uses MAP_SHARED mappings for linear-alloc and therefore + // MADV_DONTNEED will not free the pages from page cache. Therefore use + // MADV_REMOVE instead, which is meant for this purpose. + // Arenas allocated pre-zygote fork are private anonymous and hence must be + // released using MADV_DONTNEED. + if (!gUseUserfaultfd || pre_zygote_fork || + (madvise(begin, size, MADV_REMOVE) == -1 && errno == EINVAL)) { + // MADV_REMOVE fails if invoked on anonymous mapping, which could happen + // if the arena is released before userfaultfd-GC starts using memfd. So + // use MADV_DONTNEED. + ZeroAndReleaseMemory(begin, size); + } } void TrackedArena::Release() { if (bytes_allocated_ > 0) { - // Userfaultfd GC uses MAP_SHARED mappings for linear-alloc and therefore - // MADV_DONTNEED will not free the pages from page cache. Therefore use - // MADV_REMOVE instead, which is meant for this purpose. - // Arenas allocated pre-zygote fork are private anonymous and hence must be - // released using MADV_DONTNEED. - if (!gUseUserfaultfd || pre_zygote_fork_ || - (madvise(Begin(), Size(), MADV_REMOVE) == -1 && errno == EINVAL)) { - // MADV_REMOVE fails if invoked on anonymous mapping, which could happen - // if the arena is released before userfaultfd-GC starts using memfd. So - // use MADV_DONTNEED. 
- ZeroAndReleaseMemory(Begin(), Size()); + ReleasePages(Begin(), Size(), pre_zygote_fork_); + if (first_obj_array_.get() != nullptr) { + std::fill_n(first_obj_array_.get(), Size() / kPageSize, nullptr); } - std::fill_n(first_obj_array_.get(), Size() / kPageSize, nullptr); bytes_allocated_ = 0; } } void TrackedArena::SetFirstObject(uint8_t* obj_begin, uint8_t* obj_end) { + DCHECK(first_obj_array_.get() != nullptr); DCHECK_LE(static_cast<void*>(Begin()), static_cast<void*>(obj_end)); DCHECK_LT(static_cast<void*>(obj_begin), static_cast<void*>(obj_end)); + GcVisitedArenaPool* arena_pool = + static_cast<GcVisitedArenaPool*>(Runtime::Current()->GetLinearAllocArenaPool()); size_t idx = static_cast<size_t>(obj_begin - Begin()) / kPageSize; size_t last_byte_idx = static_cast<size_t>(obj_end - 1 - Begin()) / kPageSize; + // Do the update below with arena-pool's lock in shared-mode to serialize with + // the compaction-pause wherein we acquire it exclusively. This is to ensure + // that last-byte read there doesn't change after reading it and before + // userfaultfd registration. + ReaderMutexLock rmu(Thread::Current(), arena_pool->GetLock()); // If the addr is at the beginning of a page, then we set it for that page too. if (IsAligned<kPageSize>(obj_begin)) { first_obj_array_[idx] = obj_begin; @@ -106,7 +130,13 @@ uint8_t* GcVisitedArenaPool::AddMap(size_t min_size) { } GcVisitedArenaPool::GcVisitedArenaPool(bool low_4gb, bool is_zygote, const char* name) - : bytes_allocated_(0), name_(name), low_4gb_(low_4gb), pre_zygote_fork_(is_zygote) {} + : lock_("gc-visited arena-pool", kGenericBottomLock), + bytes_allocated_(0), + unused_arenas_(nullptr), + name_(name), + defer_arena_freeing_(false), + low_4gb_(low_4gb), + pre_zygote_fork_(is_zygote) {} GcVisitedArenaPool::~GcVisitedArenaPool() { for (Chunk* chunk : free_chunks_) { @@ -117,13 +147,12 @@ GcVisitedArenaPool::~GcVisitedArenaPool() { } size_t GcVisitedArenaPool::GetBytesAllocated() const { - std::lock_guard<std::mutex> lock(lock_); + ReaderMutexLock rmu(Thread::Current(), lock_); return bytes_allocated_; } uint8_t* GcVisitedArenaPool::AddPreZygoteForkMap(size_t size) { DCHECK(pre_zygote_fork_); - DCHECK(Runtime::Current()->IsZygote()); std::string pre_fork_name = "Pre-zygote-"; pre_fork_name += name_; std::string err_msg; @@ -137,18 +166,67 @@ uint8_t* GcVisitedArenaPool::AddPreZygoteForkMap(size_t size) { return map.Begin(); } -Arena* GcVisitedArenaPool::AllocArena(size_t size) { +uint8_t* GcVisitedArenaPool::AllocSingleObjArena(size_t size) { + WriterMutexLock wmu(Thread::Current(), lock_); + Arena* arena; + DCHECK(gUseUserfaultfd); + // To minimize private dirty, all class and intern table allocations are + // done outside LinearAlloc range so they are untouched during GC. 
+ if (pre_zygote_fork_) { + uint8_t* begin = static_cast<uint8_t*>(malloc(size)); + auto insert_result = allocated_arenas_.insert( + new TrackedArena(begin, size, /*pre_zygote_fork=*/true, /*single_obj_arena=*/true)); + arena = *insert_result.first; + } else { + arena = AllocArena(size, /*single_obj_arena=*/true); + } + return arena->Begin(); +} + +void GcVisitedArenaPool::FreeSingleObjArena(uint8_t* addr) { + Thread* self = Thread::Current(); + size_t size; + bool zygote_arena; + { + TrackedArena temp_arena(addr); + WriterMutexLock wmu(self, lock_); + auto iter = allocated_arenas_.find(&temp_arena); + DCHECK(iter != allocated_arenas_.end()); + TrackedArena* arena = *iter; + size = arena->Size(); + zygote_arena = arena->IsPreZygoteForkArena(); + DCHECK_EQ(arena->Begin(), addr); + DCHECK(arena->IsSingleObjectArena()); + allocated_arenas_.erase(iter); + if (defer_arena_freeing_) { + arena->SetupForDeferredDeletion(unused_arenas_); + unused_arenas_ = arena; + } else { + delete arena; + } + } + // Refer to the comment in FreeArenaChain() for why the pages are released + // after deleting the arena. + if (zygote_arena) { + free(addr); + } else { + TrackedArena::ReleasePages(addr, size, /*pre_zygote_fork=*/false); + WriterMutexLock wmu(self, lock_); + FreeRangeLocked(addr, size); + } +} + +Arena* GcVisitedArenaPool::AllocArena(size_t size, bool single_obj_arena) { // Return only page aligned sizes so that madvise can be leveraged. size = RoundUp(size, kPageSize); - std::lock_guard<std::mutex> lock(lock_); - if (pre_zygote_fork_) { // The first fork out of zygote hasn't happened yet. Allocate arena in a // private-anonymous mapping to retain clean pages across fork. - DCHECK(Runtime::Current()->IsZygote()); uint8_t* addr = AddPreZygoteForkMap(size); - auto emplace_result = allocated_arenas_.emplace(addr, size, /*pre_zygote_fork=*/true); - return const_cast<TrackedArena*>(&(*emplace_result.first)); + auto insert_result = allocated_arenas_.insert( + new TrackedArena(addr, size, /*pre_zygote_fork=*/true, single_obj_arena)); + DCHECK(insert_result.second); + return *insert_result.first; } Chunk temp_chunk(nullptr, size); @@ -165,19 +243,21 @@ Arena* GcVisitedArenaPool::AllocArena(size_t size) { // if the best-fit chunk < 2x the requested size, then give the whole chunk. if (chunk->size_ < 2 * size) { DCHECK_GE(chunk->size_, size); - auto emplace_result = allocated_arenas_.emplace(chunk->addr_, - chunk->size_, - /*pre_zygote_fork=*/false); - DCHECK(emplace_result.second); + auto insert_result = allocated_arenas_.insert(new TrackedArena(chunk->addr_, + chunk->size_, + /*pre_zygote_fork=*/false, + single_obj_arena)); + DCHECK(insert_result.second); free_chunks_.erase(free_chunks_iter); best_fit_allocs_.erase(best_fit_iter); delete chunk; - return const_cast<TrackedArena*>(&(*emplace_result.first)); + return *insert_result.first; } else { - auto emplace_result = allocated_arenas_.emplace(chunk->addr_, - size, - /*pre_zygote_fork=*/false); - DCHECK(emplace_result.second); + auto insert_result = allocated_arenas_.insert(new TrackedArena(chunk->addr_, + size, + /*pre_zygote_fork=*/false, + single_obj_arena)); + DCHECK(insert_result.second); // Compute next iterators for faster insert later. 
auto next_best_fit_iter = best_fit_iter; next_best_fit_iter++; @@ -190,7 +270,7 @@ Arena* GcVisitedArenaPool::AllocArena(size_t size) { DCHECK_EQ(free_chunks_nh.value()->addr_, chunk->addr_); best_fit_allocs_.insert(next_best_fit_iter, std::move(best_fit_nh)); free_chunks_.insert(next_free_chunks_iter, std::move(free_chunks_nh)); - return const_cast<TrackedArena*>(&(*emplace_result.first)); + return *insert_result.first; } } @@ -266,27 +346,79 @@ void GcVisitedArenaPool::FreeArenaChain(Arena* first) { // TODO: Handle the case when arena_allocator::kArenaAllocatorPreciseTracking // is true. See MemMapArenaPool::FreeArenaChain() for example. CHECK(!arena_allocator::kArenaAllocatorPreciseTracking); + Thread* self = Thread::Current(); + // vector of arena ranges to be freed and whether they are pre-zygote-fork. + std::vector<std::tuple<uint8_t*, size_t, bool>> free_ranges; + + { + WriterMutexLock wmu(self, lock_); + while (first != nullptr) { + TrackedArena* temp = down_cast<TrackedArena*>(first); + DCHECK(!temp->IsSingleObjectArena()); + first = first->Next(); + free_ranges.emplace_back(temp->Begin(), temp->Size(), temp->IsPreZygoteForkArena()); + // In other implementations of ArenaPool this is calculated when asked for, + // thanks to the list of free arenas that is kept around. But in this case, + // we release the freed arena back to the pool and therefore need to + // calculate here. + bytes_allocated_ += temp->GetBytesAllocated(); + auto iter = allocated_arenas_.find(temp); + DCHECK(iter != allocated_arenas_.end()); + allocated_arenas_.erase(iter); + if (defer_arena_freeing_) { + temp->SetupForDeferredDeletion(unused_arenas_); + unused_arenas_ = temp; + } else { + delete temp; + } + } + } - // madvise the arenas before acquiring lock for scalability - for (Arena* temp = first; temp != nullptr; temp = temp->Next()) { - temp->Release(); + // madvise of arenas must be done after the above loop which serializes with + // MarkCompact::ProcessLinearAlloc() so that if it finds an arena to be not + // 'waiting-for-deletion' then it finishes the arena's processing before + // clearing here. Otherwise, we could have a situation wherein arena-pool + // assumes the memory range of the arena(s) to be zero'ed (by madvise), + // whereas GC maps stale arena pages. + for (auto& iter : free_ranges) { + // No need to madvise pre-zygote-fork arenas as they will munmapped below. + if (!std::get<2>(iter)) { + TrackedArena::ReleasePages(std::get<0>(iter), std::get<1>(iter), /*pre_zygote_fork=*/false); + } } - std::lock_guard<std::mutex> lock(lock_); - arenas_freed_ = true; - while (first != nullptr) { - FreeRangeLocked(first->Begin(), first->Size()); - // In other implementations of ArenaPool this is calculated when asked for, - // thanks to the list of free arenas that is kept around. But in this case, - // we release the freed arena back to the pool and therefore need to - // calculate here. - bytes_allocated_ += first->GetBytesAllocated(); - TrackedArena* temp = down_cast<TrackedArena*>(first); - // TODO: Add logic to unmap the maps corresponding to pre-zygote-fork - // arenas, which are expected to be released only during shutdown. 
- first = first->Next(); - size_t erase_count = allocated_arenas_.erase(*temp); - DCHECK_EQ(erase_count, 1u); + WriterMutexLock wmu(self, lock_); + for (auto& iter : free_ranges) { + if (UNLIKELY(std::get<2>(iter))) { + bool found = false; + for (auto map_iter = maps_.begin(); map_iter != maps_.end(); map_iter++) { + if (map_iter->Begin() == std::get<0>(iter)) { + // erase will destruct the MemMap and thereby munmap. But this happens + // very rarely so it's ok to do it with lock acquired. + maps_.erase(map_iter); + found = true; + break; + } + } + CHECK(found); + } else { + FreeRangeLocked(std::get<0>(iter), std::get<1>(iter)); + } + } +} + +void GcVisitedArenaPool::DeleteUnusedArenas() { + TrackedArena* arena; + { + WriterMutexLock wmu(Thread::Current(), lock_); + defer_arena_freeing_ = false; + arena = unused_arenas_; + unused_arenas_ = nullptr; + } + while (arena != nullptr) { + TrackedArena* temp = down_cast<TrackedArena*>(arena->Next()); + delete arena; + arena = temp; } } diff --git a/runtime/base/gc_visited_arena_pool.h b/runtime/base/gc_visited_arena_pool.h index e307147c9e..f3c1abcb5a 100644 --- a/runtime/base/gc_visited_arena_pool.h +++ b/runtime/base/gc_visited_arena_pool.h @@ -17,12 +17,16 @@ #ifndef ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_ #define ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_ -#include "base/casts.h" +#include <set> + +#include "base/allocator.h" #include "base/arena_allocator.h" +#include "base/casts.h" +#include "base/hash_set.h" #include "base/locks.h" #include "base/mem_map.h" - -#include <set> +#include "read_barrier_config.h" +#include "runtime.h" namespace art { @@ -34,27 +38,45 @@ class TrackedArena final : public Arena { public: // Used for searching in maps. Only arena's starting address is relevant. explicit TrackedArena(uint8_t* addr) : pre_zygote_fork_(false) { memory_ = addr; } - TrackedArena(uint8_t* start, size_t size, bool pre_zygote_fork); + TrackedArena(uint8_t* start, size_t size, bool pre_zygote_fork, bool single_obj_arena); template <typename PageVisitor> void VisitRoots(PageVisitor& visitor) const REQUIRES_SHARED(Locks::mutator_lock_) { - DCHECK_ALIGNED(Size(), kPageSize); - DCHECK_ALIGNED(Begin(), kPageSize); - int nr_pages = Size() / kPageSize; uint8_t* page_begin = Begin(); - for (int i = 0; i < nr_pages && first_obj_array_[i] != nullptr; i++, page_begin += kPageSize) { - visitor(page_begin, first_obj_array_[i]); + if (first_obj_array_.get() != nullptr) { + DCHECK_ALIGNED(Size(), kPageSize); + DCHECK_ALIGNED(Begin(), kPageSize); + for (int i = 0, nr_pages = Size() / kPageSize; i < nr_pages; i++, page_begin += kPageSize) { + uint8_t* first = first_obj_array_[i]; + if (first != nullptr) { + visitor(page_begin, first, kPageSize); + } else { + break; + } + } + } else { + size_t page_size = Size(); + while (page_size > kPageSize) { + visitor(page_begin, nullptr, kPageSize); + page_begin += kPageSize; + page_size -= kPageSize; + } + visitor(page_begin, nullptr, page_size); } } // Return the page addr of the first page with first_obj set to nullptr. uint8_t* GetLastUsedByte() const REQUIRES_SHARED(Locks::mutator_lock_) { - DCHECK_ALIGNED(Begin(), kPageSize); - DCHECK_ALIGNED(End(), kPageSize); // Jump past bytes-allocated for arenas which are not currently being used // by arena-allocator. This helps in reducing loop iterations below. 
uint8_t* last_byte = AlignUp(Begin() + GetBytesAllocated(), kPageSize); - DCHECK_LE(last_byte, End()); + if (first_obj_array_.get() != nullptr) { + DCHECK_ALIGNED(Begin(), kPageSize); + DCHECK_ALIGNED(End(), kPageSize); + DCHECK_LE(last_byte, End()); + } else { + DCHECK_EQ(last_byte, End()); + } for (size_t i = (last_byte - Begin()) / kPageSize; last_byte < End() && first_obj_array_[i] != nullptr; last_byte += kPageSize, i++) { @@ -66,21 +88,43 @@ class TrackedArena final : public Arena { uint8_t* GetFirstObject(uint8_t* addr) const REQUIRES_SHARED(Locks::mutator_lock_) { DCHECK_LE(Begin(), addr); DCHECK_GT(End(), addr); - return first_obj_array_[(addr - Begin()) / kPageSize]; + if (first_obj_array_.get() != nullptr) { + return first_obj_array_[(addr - Begin()) / kPageSize]; + } else { + // The pages of this arena contain array of GC-roots. So we don't need + // first-object of any given page of the arena. + // Returning null helps distinguish which visitor is to be called. + return nullptr; + } } // Set 'obj_begin' in first_obj_array_ in every element for which it's the // first object. void SetFirstObject(uint8_t* obj_begin, uint8_t* obj_end); + // Setup the arena for deferred deletion. + void SetupForDeferredDeletion(TrackedArena* next_arena) { + DCHECK(next_arena->waiting_for_deletion_); + DCHECK(!waiting_for_deletion_); + waiting_for_deletion_ = true; + next_ = next_arena; + } + bool IsWaitingForDeletion() const { return waiting_for_deletion_; } + // Madvise the pages in the given range. 'begin' is expected to be page + // aligned. + // TODO: Remove this once we remove the shmem (minor-fault) code in + // userfaultfd GC and directly use ZeroAndReleaseMemory(). + static void ReleasePages(uint8_t* begin, size_t size, bool pre_zygote_fork); void Release() override; bool IsPreZygoteForkArena() const { return pre_zygote_fork_; } + bool IsSingleObjectArena() const { return first_obj_array_.get() == nullptr; } private: // first_obj_array_[i] is the object that overlaps with the ith page's // beginning, i.e. first_obj_array_[i] <= ith page_begin. std::unique_ptr<uint8_t*[]> first_obj_array_; const bool pre_zygote_fork_; + bool waiting_for_deletion_; }; // An arena-pool wherein allocations can be tracked so that the GC can visit all @@ -101,15 +145,24 @@ class GcVisitedArenaPool final : public ArenaPool { bool is_zygote = false, const char* name = "LinearAlloc"); virtual ~GcVisitedArenaPool(); - Arena* AllocArena(size_t size) override; - void FreeArenaChain(Arena* first) override; - size_t GetBytesAllocated() const override; + + Arena* AllocArena(size_t size, bool need_first_obj_arr) REQUIRES(lock_); + // Use by arena allocator. 
+ Arena* AllocArena(size_t size) override REQUIRES(!lock_) { + WriterMutexLock wmu(Thread::Current(), lock_); + return AllocArena(size, /*single_obj_arena=*/false); + } + void FreeArenaChain(Arena* first) override REQUIRES(!lock_); + size_t GetBytesAllocated() const override REQUIRES(!lock_); void ReclaimMemory() override {} void LockReclaimMemory() override {} void TrimMaps() override {} - bool Contains(void* ptr) { - std::lock_guard<std::mutex> lock(lock_); + uint8_t* AllocSingleObjArena(size_t size) REQUIRES(!lock_); + void FreeSingleObjArena(uint8_t* addr) REQUIRES(!lock_); + + bool Contains(void* ptr) REQUIRES(!lock_) { + ReaderMutexLock rmu(Thread::Current(), lock_); for (auto& map : maps_) { if (map.HasAddress(ptr)) { return true; @@ -119,51 +172,43 @@ class GcVisitedArenaPool final : public ArenaPool { } template <typename PageVisitor> - void VisitRoots(PageVisitor& visitor) REQUIRES_SHARED(Locks::mutator_lock_) { - std::lock_guard<std::mutex> lock(lock_); + void VisitRoots(PageVisitor& visitor) REQUIRES_SHARED(Locks::mutator_lock_, lock_) { for (auto& arena : allocated_arenas_) { - arena.VisitRoots(visitor); + arena->VisitRoots(visitor); } } template <typename Callback> - void ForEachAllocatedArena(Callback cb) REQUIRES_SHARED(Locks::mutator_lock_) { - std::lock_guard<std::mutex> lock(lock_); + void ForEachAllocatedArena(Callback cb) REQUIRES_SHARED(Locks::mutator_lock_, lock_) { + // We should not have any unused arenas when calling this function. + CHECK(unused_arenas_ == nullptr); for (auto& arena : allocated_arenas_) { - cb(arena); + cb(*arena); } } // Called in Heap::PreZygoteFork(). All allocations after this are done in // arena-pool which is visited by userfaultfd. - void SetupPostZygoteMode() { - std::lock_guard<std::mutex> lock(lock_); + void SetupPostZygoteMode() REQUIRES(!lock_) { + WriterMutexLock wmu(Thread::Current(), lock_); DCHECK(pre_zygote_fork_); pre_zygote_fork_ = false; } // For userfaultfd GC to be able to acquire the lock to avoid concurrent // release of arenas when it is visiting them. - std::mutex& GetLock() { return lock_; } - - // Find the given arena in allocated_arenas_. The function is called with - // lock_ acquired. - bool FindAllocatedArena(const TrackedArena* arena) const NO_THREAD_SAFETY_ANALYSIS { - for (auto& allocated_arena : allocated_arenas_) { - if (arena == &allocated_arena) { - return true; - } - } - return false; - } + ReaderWriterMutex& GetLock() const RETURN_CAPABILITY(lock_) { return lock_; } - void ClearArenasFreed() { - std::lock_guard<std::mutex> lock(lock_); - arenas_freed_ = false; + // Called in the compaction pause to indicate that all arenas that will be + // freed until compaction is done shouldn't delete the TrackedArena object to + // avoid ABA problem. Called with lock_ acquired. + void DeferArenaFreeing() REQUIRES(lock_) { + CHECK(unused_arenas_ == nullptr); + defer_arena_freeing_ = true; } - // The function is called with lock_ acquired. - bool AreArenasFreed() const NO_THREAD_SAFETY_ANALYSIS { return arenas_freed_; } + // Clear defer_arena_freeing_ and delete all unused arenas. 
+ void DeleteUnusedArenas() REQUIRES(!lock_); private: void FreeRangeLocked(uint8_t* range_begin, size_t range_size) REQUIRES(lock_); @@ -197,31 +242,40 @@ class GcVisitedArenaPool final : public ArenaPool { } }; - class LessByArenaAddr { + class TrackedArenaEquals { public: - bool operator()(const TrackedArena& a, const TrackedArena& b) const { - return std::less<uint8_t*>{}(a.Begin(), b.Begin()); + bool operator()(const TrackedArena* a, const TrackedArena* b) const { + return std::equal_to<uint8_t*>{}(a->Begin(), b->Begin()); } }; - // Use a std::mutex here as Arenas are second-from-the-bottom when using MemMaps, and MemMap - // itself uses std::mutex scoped to within an allocate/free only. - mutable std::mutex lock_; + class TrackedArenaHash { + public: + size_t operator()(const TrackedArena* arena) const { + return std::hash<size_t>{}(reinterpret_cast<uintptr_t>(arena->Begin()) / kPageSize); + } + }; + using AllocatedArenaSet = + HashSet<TrackedArena*, DefaultEmptyFn<TrackedArena*>, TrackedArenaHash, TrackedArenaEquals>; + + mutable ReaderWriterMutex lock_; std::vector<MemMap> maps_ GUARDED_BY(lock_); std::set<Chunk*, LessByChunkSize> best_fit_allocs_ GUARDED_BY(lock_); std::set<Chunk*, LessByChunkAddr> free_chunks_ GUARDED_BY(lock_); // Set of allocated arenas. It's required to be able to find the arena // corresponding to a given address. - // TODO: consider using HashSet, which is more memory efficient. - std::set<TrackedArena, LessByArenaAddr> allocated_arenas_ GUARDED_BY(lock_); + AllocatedArenaSet allocated_arenas_ GUARDED_BY(lock_); // Number of bytes allocated so far. size_t bytes_allocated_ GUARDED_BY(lock_); + // To hold arenas that are freed while GC is happening. These are kept until + // the end of GC to avoid ABA problem. + TrackedArena* unused_arenas_ GUARDED_BY(lock_); const char* name_; // Flag to indicate that some arenas have been freed. This flag is used as an // optimization by GC to know if it needs to find if the arena being visited // has been freed or not. The flag is cleared in the compaction pause and read // when linear-alloc space is concurrently visited updated to update GC roots. - bool arenas_freed_ GUARDED_BY(lock_); + bool defer_arena_freeing_ GUARDED_BY(lock_); const bool low_4gb_; // Set to true in zygote process so that all linear-alloc allocations are in // private-anonymous mappings and not on userfaultfd visited pages. At @@ -232,6 +286,55 @@ class GcVisitedArenaPool final : public ArenaPool { DISALLOW_COPY_AND_ASSIGN(GcVisitedArenaPool); }; +// Allocator for class-table and intern-table hash-sets. It enables updating the +// roots concurrently page-by-page. +template <class T, AllocatorTag kTag> +class GcRootArenaAllocator : public TrackingAllocator<T, kTag> { + public: + using value_type = typename TrackingAllocator<T, kTag>::value_type; + using size_type = typename TrackingAllocator<T, kTag>::size_type; + using difference_type = typename TrackingAllocator<T, kTag>::difference_type; + using pointer = typename TrackingAllocator<T, kTag>::pointer; + using const_pointer = typename TrackingAllocator<T, kTag>::const_pointer; + using reference = typename TrackingAllocator<T, kTag>::reference; + using const_reference = typename TrackingAllocator<T, kTag>::const_reference; + + // Used internally by STL data structures. + template <class U> + explicit GcRootArenaAllocator( + [[maybe_unused]] const GcRootArenaAllocator<U, kTag>& alloc) noexcept {} + // Used internally by STL data structures. 
+ GcRootArenaAllocator() noexcept : TrackingAllocator<T, kTag>() {} + + // Enables an allocator for objects of one type to allocate storage for objects of another type. + // Used internally by STL data structures. + template <class U> + struct rebind { + using other = GcRootArenaAllocator<U, kTag>; + }; + + pointer allocate(size_type n, [[maybe_unused]] const_pointer hint = 0) { + if (!gUseUserfaultfd) { + return TrackingAllocator<T, kTag>::allocate(n); + } + size_t size = n * sizeof(T); + GcVisitedArenaPool* pool = + down_cast<GcVisitedArenaPool*>(Runtime::Current()->GetLinearAllocArenaPool()); + return reinterpret_cast<pointer>(pool->AllocSingleObjArena(size)); + } + + template <typename PT> + void deallocate(PT p, size_type n) { + if (!gUseUserfaultfd) { + TrackingAllocator<T, kTag>::deallocate(p, n); + return; + } + GcVisitedArenaPool* pool = + down_cast<GcVisitedArenaPool*>(Runtime::Current()->GetLinearAllocArenaPool()); + pool->FreeSingleObjArena(reinterpret_cast<uint8_t*>(p)); + } +}; + } // namespace art #endif // ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_ diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc index 9d175cdba5..d4666e5f9c 100644 --- a/runtime/class_linker.cc +++ b/runtime/class_linker.cc @@ -2426,17 +2426,9 @@ void ClassLinker::VisitClassRoots(RootVisitor* visitor, VisitRootFlags flags) { boot_class_table_->VisitRoots(root_visitor); // If tracing is enabled, then mark all the class loaders to prevent unloading. if ((flags & kVisitRootFlagClassLoader) != 0 || tracing_enabled) { - gc::Heap* const heap = Runtime::Current()->GetHeap(); - // Don't visit class-loaders if compacting with userfaultfd GC as these - // weaks are updated using Runtime::SweepSystemWeaks() and the GC doesn't - // tolerate double updates. - if (!heap->IsPerformingUffdCompaction()) { - for (const ClassLoaderData& data : class_loaders_) { - GcRoot<mirror::Object> root(GcRoot<mirror::Object>(self->DecodeJObject(data.weak_root))); - root.VisitRoot(visitor, RootInfo(kRootVMInternal)); - } - } else { - DCHECK_EQ(heap->CurrentCollectorType(), gc::CollectorType::kCollectorTypeCMC); + for (const ClassLoaderData& data : class_loaders_) { + GcRoot<mirror::Object> root(GcRoot<mirror::Object>(self->DecodeJObject(data.weak_root))); + root.VisitRoot(visitor, RootInfo(kRootVMInternal)); } } } else if (!gUseReadBarrier && (flags & kVisitRootFlagNewRoots) != 0) { @@ -2476,9 +2468,11 @@ void ClassLinker::VisitClassRoots(RootVisitor* visitor, VisitRootFlags flags) { // Keep in sync with InitCallback. Anything we visit, we need to // reinit references to when reinitializing a ClassLinker from a // mapped image. -void ClassLinker::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) { +void ClassLinker::VisitRoots(RootVisitor* visitor, VisitRootFlags flags, bool visit_class_roots) { class_roots_.VisitRootIfNonNull(visitor, RootInfo(kRootVMInternal)); - VisitClassRoots(visitor, flags); + if (visit_class_roots) { + VisitClassRoots(visitor, flags); + } // Instead of visiting the find_array_class_cache_ drop it so that it doesn't prevent class // unloading if we are marking roots. 
DropFindArrayClassCache(); diff --git a/runtime/class_linker.h b/runtime/class_linker.h index 3aae0422c1..c81713b116 100644 --- a/runtime/class_linker.h +++ b/runtime/class_linker.h @@ -514,7 +514,7 @@ class ClassLinker { void VisitClassRoots(RootVisitor* visitor, VisitRootFlags flags) REQUIRES(!Locks::classlinker_classes_lock_, !Locks::trace_lock_) REQUIRES_SHARED(Locks::mutator_lock_); - void VisitRoots(RootVisitor* visitor, VisitRootFlags flags) + void VisitRoots(RootVisitor* visitor, VisitRootFlags flags, bool visit_class_roots = true) REQUIRES(!Locks::dex_lock_, !Locks::classlinker_classes_lock_, !Locks::trace_lock_) REQUIRES_SHARED(Locks::mutator_lock_); // Visits all dex-files accessible by any class-loader or the BCP. @@ -866,6 +866,9 @@ class ClassLinker { // Enable or disable public sdk checks. virtual void SetEnablePublicSdkChecks(bool enabled); void RemoveDexFromCaches(const DexFile& dex_file); + ClassTable* GetBootClassTable() REQUIRES_SHARED(Locks::classlinker_classes_lock_) { + return boot_class_table_.get(); + } protected: virtual bool InitializeClass(Thread* self, diff --git a/runtime/class_table-inl.h b/runtime/class_table-inl.h index ecc8a0a620..4ee59a79f7 100644 --- a/runtime/class_table-inl.h +++ b/runtime/class_table-inl.h @@ -68,12 +68,14 @@ inline bool ClassTable::ClassDescriptorEquals::operator()(const TableSlot& a, return a.Read<kWithoutReadBarrier>()->DescriptorEquals(b.first); } -template<class Visitor> -void ClassTable::VisitRoots(Visitor& visitor) { +template <class Visitor> +void ClassTable::VisitRoots(Visitor& visitor, bool skip_classes) { ReaderMutexLock mu(Thread::Current(), lock_); - for (ClassSet& class_set : classes_) { - for (TableSlot& table_slot : class_set) { - table_slot.VisitRoot(visitor); + if (!skip_classes) { + for (ClassSet& class_set : classes_) { + for (TableSlot& table_slot : class_set) { + table_slot.VisitRoot(visitor); + } } } for (GcRoot<mirror::Object>& root : strong_roots_) { @@ -86,12 +88,14 @@ void ClassTable::VisitRoots(Visitor& visitor) { } } -template<class Visitor> -void ClassTable::VisitRoots(const Visitor& visitor) { +template <class Visitor> +void ClassTable::VisitRoots(const Visitor& visitor, bool skip_classes) { ReaderMutexLock mu(Thread::Current(), lock_); - for (ClassSet& class_set : classes_) { - for (TableSlot& table_slot : class_set) { - table_slot.VisitRoot(visitor); + if (!skip_classes) { + for (ClassSet& class_set : classes_) { + for (TableSlot& table_slot : class_set) { + table_slot.VisitRoot(visitor); + } } } for (GcRoot<mirror::Object>& root : strong_roots_) { @@ -104,6 +108,18 @@ void ClassTable::VisitRoots(const Visitor& visitor) { } } +template <class Condition, class Visitor> +void ClassTable::VisitClassesIfConditionMet(Condition& cond, Visitor& visitor) { + ReaderMutexLock mu(Thread::Current(), lock_); + for (ClassSet& class_set : classes_) { + if (cond(class_set)) { + for (TableSlot& table_slot : class_set) { + table_slot.VisitRoot(visitor); + } + } + } +} + template <typename Visitor> class ClassTable::TableSlot::ClassAndRootVisitor { public: diff --git a/runtime/class_table.h b/runtime/class_table.h index 7e263737c3..54e066a18c 100644 --- a/runtime/class_table.h +++ b/runtime/class_table.h @@ -21,7 +21,7 @@ #include <utility> #include <vector> -#include "base/allocator.h" +#include "base/gc_visited_arena_pool.h" #include "base/hash_set.h" #include "base/macros.h" #include "base/mutex.h" @@ -151,7 +151,7 @@ class ClassTable { TableSlotEmptyFn, ClassDescriptorHash, ClassDescriptorEquals, - 
TrackingAllocator<TableSlot, kAllocatorTagClassTable>>; + GcRootArenaAllocator<TableSlot, kAllocatorTagClassTable>>; ClassTable(); @@ -181,7 +181,7 @@ class ClassTable { REQUIRES(!lock_) REQUIRES_SHARED(Locks::mutator_lock_); - // Returns the number of classes in the class table. + // Returns the number of class-sets in the class table. size_t Size() const REQUIRES(!lock_) REQUIRES_SHARED(Locks::mutator_lock_); @@ -194,17 +194,13 @@ class ClassTable { REQUIRES_SHARED(Locks::mutator_lock_); // NO_THREAD_SAFETY_ANALYSIS for object marking requiring heap bitmap lock. - template<class Visitor> - void VisitRoots(Visitor& visitor) - NO_THREAD_SAFETY_ANALYSIS - REQUIRES(!lock_) - REQUIRES_SHARED(Locks::mutator_lock_); + template <class Visitor> + void VisitRoots(Visitor& visitor, bool skip_classes = false) NO_THREAD_SAFETY_ANALYSIS + REQUIRES(!lock_) REQUIRES_SHARED(Locks::mutator_lock_); - template<class Visitor> - void VisitRoots(const Visitor& visitor) - NO_THREAD_SAFETY_ANALYSIS - REQUIRES(!lock_) - REQUIRES_SHARED(Locks::mutator_lock_); + template <class Visitor> + void VisitRoots(const Visitor& visitor, bool skip_classes = false) NO_THREAD_SAFETY_ANALYSIS + REQUIRES(!lock_) REQUIRES_SHARED(Locks::mutator_lock_); template<class Visitor> void VisitClassesAndRoots(Visitor& visitor) @@ -212,6 +208,10 @@ class ClassTable { REQUIRES(!lock_) REQUIRES_SHARED(Locks::mutator_lock_); + // Visit classes in those class-sets which satisfy 'cond'. + template <class Condition, class Visitor> + void VisitClassesIfConditionMet(Condition& cond, Visitor& visitor) REQUIRES(!lock_) + REQUIRES_SHARED(Locks::mutator_lock_); // Stops visit if the visitor returns false. template <ReadBarrierOption kReadBarrierOption = kWithReadBarrier, typename Visitor> bool Visit(Visitor& visitor) diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc index 6509dbe3ef..8434433efc 100644 --- a/runtime/gc/collector/mark_compact.cc +++ b/runtime/gc/collector/mark_compact.cc @@ -303,9 +303,8 @@ static constexpr size_t kMaxNumUffdWorkers = 2; // Number of compaction buffers reserved for mutator threads in SIGBUS feature // case. It's extremely unlikely that we will ever have more than these number // of mutator threads trying to access the moving-space during one compaction -// phase. Using a lower number in debug builds to hopefully catch the issue -// before it becomes a problem on user builds. -static constexpr size_t kMutatorCompactionBufferCount = kIsDebugBuild ? 256 : 2048; +// phase. +static constexpr size_t kMutatorCompactionBufferCount = 2048; // Minimum from-space chunk to be madvised (during concurrent compaction) in one go. static constexpr ssize_t kMinFromSpaceMadviseSize = 1 * MB; // Concurrent compaction termination logic is different (and slightly more efficient) if the @@ -2376,7 +2375,7 @@ void MarkCompact::UpdateNonMovingPage(mirror::Object* first, uint8_t* page) { } void MarkCompact::UpdateNonMovingSpace() { - TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + TimingLogger::ScopedTiming t("(Paused)UpdateNonMovingSpace", GetTimings()); // Iterating in reverse ensures that the class pointer in objects which span // across more than one page gets updated in the end. This is necessary for // VisitRefsForCompaction() to work correctly. 
@@ -2597,7 +2596,8 @@ class MarkCompact::ClassLoaderRootsUpdater : public ClassLoaderVisitor { REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_) { ClassTable* const class_table = class_loader->GetClassTable(); if (class_table != nullptr) { - class_table->VisitRoots(*this); + // Classes are updated concurrently. + class_table->VisitRoots(*this, /*skip_classes=*/true); } } @@ -2621,8 +2621,10 @@ class MarkCompact::LinearAllocPageUpdater { public: explicit LinearAllocPageUpdater(MarkCompact* collector) : collector_(collector) {} - void operator()(uint8_t* page_begin, uint8_t* first_obj) ALWAYS_INLINE + // Update a page in multi-object arena. + void MultiObjectArena(uint8_t* page_begin, uint8_t* first_obj) REQUIRES_SHARED(Locks::mutator_lock_) { + DCHECK(first_obj != nullptr); DCHECK_ALIGNED(page_begin, kPageSize); uint8_t* page_end = page_begin + kPageSize; uint32_t obj_size; @@ -2652,6 +2654,28 @@ class MarkCompact::LinearAllocPageUpdater { last_page_touched_ = true; } + // This version is only used for cases where the entire page is filled with + // GC-roots. For example, class-table and intern-table. + void SingleObjectArena(uint8_t* page_begin, size_t page_size) + REQUIRES_SHARED(Locks::mutator_lock_) { + static_assert(sizeof(uint32_t) == sizeof(GcRoot<mirror::Object>)); + DCHECK_ALIGNED(page_begin, kAlignment); + // Least significant bits are used by class-table. + static constexpr uint32_t kMask = kObjectAlignment - 1; + size_t num_roots = page_size / sizeof(GcRoot<mirror::Object>); + uint32_t* root_ptr = reinterpret_cast<uint32_t*>(page_begin); + for (size_t i = 0; i < num_roots; root_ptr++, i++) { + uint32_t word = *root_ptr; + if (word != 0) { + uint32_t lsbs = word & kMask; + word &= ~kMask; + VisitRootIfNonNull(reinterpret_cast<mirror::CompressedReference<mirror::Object>*>(&word)); + *root_ptr = word | lsbs; + last_page_touched_ = true; + } + } + } + bool WasLastPageTouched() const { return last_page_touched_; } void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const @@ -2670,7 +2694,8 @@ class MarkCompact::LinearAllocPageUpdater { if (reinterpret_cast<uint8_t*>(old_ref) >= collector_->black_allocations_begin_) { new_ref = collector_->PostCompactBlackObjAddr(old_ref); } else if (collector_->live_words_bitmap_->Test(old_ref)) { - DCHECK(collector_->moving_space_bitmap_->Test(old_ref)) << old_ref; + DCHECK(collector_->moving_space_bitmap_->Test(old_ref)) + << "ref:" << old_ref << " root:" << root; new_ref = collector_->PostCompactOldObjAddr(old_ref); } if (old_ref != new_ref) { @@ -2735,9 +2760,34 @@ class MarkCompact::LinearAllocPageUpdater { MarkCompact* const collector_; // Whether the last page was touched or not. - bool last_page_touched_; + bool last_page_touched_ = false; }; +void MarkCompact::UpdateClassTableClasses(Runtime* runtime, bool immune_class_table_only) { + // If the process is debuggable then redefinition is allowed, which may mean + // pre-zygote-fork class-tables may have pointer to class in moving-space. + // So visit classes from class-sets that are not in linear-alloc arena-pool. + if (UNLIKELY(runtime->IsJavaDebuggableAtInit())) { + ClassLinker* linker = runtime->GetClassLinker(); + ClassLoaderRootsUpdater updater(this); + GcVisitedArenaPool* pool = static_cast<GcVisitedArenaPool*>(runtime->GetLinearAllocArenaPool()); + auto cond = [this, pool, immune_class_table_only](ClassTable::ClassSet& set) -> bool { + if (!set.empty()) { + return immune_class_table_only ? 
+ immune_spaces_.ContainsObject(reinterpret_cast<mirror::Object*>(&*set.begin())) : + !pool->Contains(reinterpret_cast<void*>(&*set.begin())); + } + return false; + }; + linker->VisitClassTables([cond, &updater](ClassTable* table) + REQUIRES_SHARED(Locks::mutator_lock_) { + table->VisitClassesIfConditionMet(cond, updater); + }); + ReaderMutexLock rmu(thread_running_gc_, *Locks::classlinker_classes_lock_); + linker->GetBootClassTable()->VisitClassesIfConditionMet(cond, updater); + } +} + void MarkCompact::CompactionPause() { TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); Runtime* runtime = Runtime::Current(); @@ -2791,65 +2841,6 @@ void MarkCompact::CompactionPause() { heap_->GetReferenceProcessor()->UpdateRoots(this); } { - TimingLogger::ScopedTiming t2("(Paused)UpdateClassLoaderRoots", GetTimings()); - ReaderMutexLock rmu(thread_running_gc_, *Locks::classlinker_classes_lock_); - { - ClassLoaderRootsUpdater updater(this); - runtime->GetClassLinker()->VisitClassLoaders(&updater); - } - } - - bool has_zygote_space = heap_->HasZygoteSpace(); - // TODO: Find out why it's not sufficient to visit native roots of immune - // spaces, and why all the pre-zygote fork arenas have to be linearly updated. - // Is it possible that some native root starts getting pointed to by some object - // in moving space after fork? Or are we missing a write-barrier somewhere - // when a native root is updated? - GcVisitedArenaPool* arena_pool = - static_cast<GcVisitedArenaPool*>(runtime->GetLinearAllocArenaPool()); - if (uffd_ == kFallbackMode || (!has_zygote_space && runtime->IsZygote())) { - // Besides fallback-mode, visit linear-alloc space in the pause for zygote - // processes prior to first fork (that's when zygote space gets created). - if (kIsDebugBuild && IsValidFd(uffd_)) { - // All arenas allocated so far are expected to be pre-zygote fork. - arena_pool->ForEachAllocatedArena( - [](const TrackedArena& arena) - REQUIRES_SHARED(Locks::mutator_lock_) { CHECK(arena.IsPreZygoteForkArena()); }); - } - LinearAllocPageUpdater updater(this); - arena_pool->VisitRoots(updater); - } else { - // Clear the flag as we care about this only if arenas are freed during - // concurrent compaction. - arena_pool->ClearArenasFreed(); - arena_pool->ForEachAllocatedArena( - [this](const TrackedArena& arena) REQUIRES_SHARED(Locks::mutator_lock_) { - // The pre-zygote fork arenas are not visited concurrently in the - // zygote children processes. The native roots of the dirty objects - // are visited during immune space visit below. - if (!arena.IsPreZygoteForkArena()) { - uint8_t* last_byte = arena.GetLastUsedByte(); - CHECK(linear_alloc_arenas_.insert({&arena, last_byte}).second); - } else { - LinearAllocPageUpdater updater(this); - arena.VisitRoots(updater); - } - }); - } - - SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/ true); - - { - TimingLogger::ScopedTiming t2("(Paused)UpdateConcurrentRoots", GetTimings()); - runtime->VisitConcurrentRoots(this, kVisitRootFlagAllRoots); - } - { - // TODO: don't visit the transaction roots if it's not active. - TimingLogger::ScopedTiming t2("(Paused)UpdateNonThreadRoots", GetTimings()); - runtime->VisitNonThreadRoots(this); - } - - { // TODO: Immune space updation has to happen either before or after // remapping pre-compact pages to from-space. 
And depending on when it's // done, we have to invoke VisitRefsForCompaction() with or without @@ -2880,11 +2871,98 @@ void MarkCompact::CompactionPause() { } } - if (use_uffd_sigbus_) { - // Release order wrt to mutator threads' SIGBUS handler load. - sigbus_in_progress_count_.store(0, std::memory_order_release); + { + TimingLogger::ScopedTiming t2("(Paused)UpdateRoots", GetTimings()); + runtime->VisitConcurrentRoots(this, kVisitRootFlagAllRoots); + runtime->VisitNonThreadRoots(this); + { + ClassLinker* linker = runtime->GetClassLinker(); + ClassLoaderRootsUpdater updater(this); + ReaderMutexLock rmu(thread_running_gc_, *Locks::classlinker_classes_lock_); + linker->VisitClassLoaders(&updater); + linker->GetBootClassTable()->VisitRoots(updater, /*skip_classes=*/true); + } + SweepSystemWeaks(thread_running_gc_, runtime, /*paused=*/true); + + bool has_zygote_space = heap_->HasZygoteSpace(); + GcVisitedArenaPool* arena_pool = + static_cast<GcVisitedArenaPool*>(runtime->GetLinearAllocArenaPool()); + // Update immune/pre-zygote class-tables in case class redefinition took + // place. pre-zygote class-tables that are not in immune spaces are updated + // below if we are in fallback-mode or if there is no zygote space. So in + // that case only visit class-tables that are there in immune-spaces. + UpdateClassTableClasses(runtime, uffd_ == kFallbackMode || !has_zygote_space); + + // Acquire arena-pool's lock, which should be released after the pool is + // userfaultfd registered. This is to ensure that no new arenas are + // allocated and used in between. Since they will not be captured in + // linear_alloc_arenas_ below, we will miss updating their pages. The same + // reason also applies to new allocations within the existing arena which + // may change last_byte. + // Since we are in a STW pause, this shouldn't happen anyways, but holding + // the lock confirms it. + // TODO (b/305779657): Replace with ExclusiveTryLock() and assert that it + // doesn't fail once it is available for ReaderWriterMutex. + WriterMutexLock pool_wmu(thread_running_gc_, arena_pool->GetLock()); + + // TODO: Find out why it's not sufficient to visit native roots of immune + // spaces, and why all the pre-zygote fork arenas have to be linearly updated. + // Is it possible that some native root starts getting pointed to by some object + // in moving space after fork? Or are we missing a write-barrier somewhere + // when a native root is updated? + auto arena_visitor = [this](uint8_t* page_begin, uint8_t* first_obj, size_t page_size) + REQUIRES_SHARED(Locks::mutator_lock_) { + LinearAllocPageUpdater updater(this); + if (first_obj != nullptr) { + updater.MultiObjectArena(page_begin, first_obj); + } else { + updater.SingleObjectArena(page_begin, page_size); + } + }; + if (uffd_ == kFallbackMode || (!has_zygote_space && runtime->IsZygote())) { + // Besides fallback-mode, visit linear-alloc space in the pause for zygote + // processes prior to first fork (that's when zygote space gets created). + if (kIsDebugBuild && IsValidFd(uffd_)) { + // All arenas allocated so far are expected to be pre-zygote fork. + arena_pool->ForEachAllocatedArena( + [](const TrackedArena& arena) + REQUIRES_SHARED(Locks::mutator_lock_) { CHECK(arena.IsPreZygoteForkArena()); }); + } + arena_pool->VisitRoots(arena_visitor); + } else { + // Inform the arena-pool that compaction is going on. So the TrackedArena + // objects corresponding to the arenas that are freed shouldn't be deleted + // immediately. We will do that in FinishPhase(). 
This is to avoid ABA + // problem. + arena_pool->DeferArenaFreeing(); + arena_pool->ForEachAllocatedArena( + [this, arena_visitor, has_zygote_space](const TrackedArena& arena) + REQUIRES_SHARED(Locks::mutator_lock_) { + // The pre-zygote fork arenas are not visited concurrently in the + // zygote children processes. The native roots of the dirty objects + // are visited during immune space visit below. + if (!arena.IsPreZygoteForkArena()) { + uint8_t* last_byte = arena.GetLastUsedByte(); + auto ret = linear_alloc_arenas_.insert({&arena, last_byte}); + CHECK(ret.second); + } else if (!arena.IsSingleObjectArena() || !has_zygote_space) { + // Pre-zygote class-table and intern-table don't need to be updated. + // TODO: Explore the possibility of using /proc/self/pagemap to + // fetch which pages in these arenas are private-dirty and then only + // visit those pages. To optimize it further, we can keep all + // pre-zygote arenas in a single memory range so that just one read + // from pagemap is sufficient. + arena.VisitRoots(arena_visitor); + } + }); + } + if (use_uffd_sigbus_) { + // Release order wrt to mutator threads' SIGBUS handler load. + sigbus_in_progress_count_.store(0, std::memory_order_release); + } + KernelPreparation(); } - KernelPreparation(); + UpdateNonMovingSpace(); // fallback mode if (uffd_ == kFallbackMode) { @@ -2950,7 +3028,7 @@ void MarkCompact::KernelPrepareRangeForUffd(uint8_t* to_addr, } void MarkCompact::KernelPreparation() { - TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + TimingLogger::ScopedTiming t("(Paused)KernelPreparation", GetTimings()); uint8_t* moving_space_begin = bump_pointer_space_->Begin(); size_t moving_space_size = bump_pointer_space_->Capacity(); int mode = kCopyMode; @@ -3377,14 +3455,22 @@ void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool i arena_iter = arena_iter != linear_alloc_arenas_.begin() ? std::prev(arena_iter) : linear_alloc_arenas_.end(); } - if (arena_iter == linear_alloc_arenas_.end() || arena_iter->second <= fault_page) { + // Unlike ProcessLinearAlloc(), we don't need to hold arena-pool's lock here + // because a thread trying to access the page and as a result causing this + // userfault confirms that nobody can delete the corresponding arena and + // release its pages. + // NOTE: We may have some memory range be recycled several times during a + // compaction cycle, thereby potentially causing userfault on the same page + // several times. That's not a problem as all of them (except for possibly the + // first one) would require us mapping a zero-page, which we do without updating + // the 'state_arr'. + if (arena_iter == linear_alloc_arenas_.end() || + arena_iter->first->IsWaitingForDeletion() || + arena_iter->second <= fault_page) { // Fault page isn't in any of the arenas that existed before we started // compaction. So map zeropage and return. ZeropageIoctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/false); } else { - // fault_page should always belong to some arena. 
- DCHECK(arena_iter != linear_alloc_arenas_.end()) - << "fault_page:" << static_cast<void*>(fault_page) << "is_minor_fault:" << is_minor_fault; // Find the linear-alloc space containing fault-page LinearAllocSpaceData* space_data = nullptr; for (auto& data : linear_alloc_spaces_data_) { @@ -3409,10 +3495,15 @@ void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool i if (state_arr[page_idx].compare_exchange_strong( state, PageState::kProcessingAndMapping, std::memory_order_acquire)) { if (kMode == kCopyMode || is_minor_fault) { - uint8_t* first_obj = arena_iter->first->GetFirstObject(fault_page); - DCHECK_NE(first_obj, nullptr); LinearAllocPageUpdater updater(this); - updater(fault_page + diff, first_obj + diff); + uint8_t* first_obj = arena_iter->first->GetFirstObject(fault_page); + // null first_obj indicates that it's a page from arena for + // intern-table/class-table. So first object isn't required. + if (first_obj != nullptr) { + updater.MultiObjectArena(fault_page + diff, first_obj + diff); + } else { + updater.SingleObjectArena(fault_page + diff, kPageSize); + } if (kMode == kCopyMode) { MapUpdatedLinearAllocPage(fault_page, fault_page + diff, @@ -3481,6 +3572,7 @@ void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool i void MarkCompact::ProcessLinearAlloc() { GcVisitedArenaPool* arena_pool = static_cast<GcVisitedArenaPool*>(Runtime::Current()->GetLinearAllocArenaPool()); + DCHECK_EQ(thread_running_gc_, Thread::Current()); for (auto& pair : linear_alloc_arenas_) { const TrackedArena* arena = pair.first; size_t arena_size; @@ -3488,12 +3580,15 @@ void MarkCompact::ProcessLinearAlloc() { ptrdiff_t diff; bool others_processing; { - // Acquire arena-pool's lock so that the arena being worked cannot be - // deallocated at the same time. - std::lock_guard<std::mutex> lock(arena_pool->GetLock()); + // Acquire arena-pool's lock (in shared-mode) so that the arena being updated + // does not get deleted at the same time. If this critical section is too + // long and impacts mutator response time, then we get rid of this lock by + // holding onto memory ranges of all deleted (since compaction pause) + // arenas until completion finishes. + ReaderMutexLock rmu(thread_running_gc_, arena_pool->GetLock()); // If any arenas were freed since compaction pause then skip them from // visiting. - if (arena_pool->AreArenasFreed() && !arena_pool->FindAllocatedArena(arena)) { + if (arena->IsWaitingForDeletion()) { continue; } uint8_t* last_byte = pair.second; @@ -3509,11 +3604,12 @@ void MarkCompact::ProcessLinearAlloc() { break; } } - DCHECK_NE(space_data, nullptr); + CHECK_NE(space_data, nullptr); diff = space_data->shadow_.Begin() - space_data->begin_; auto visitor = [space_data, last_byte, diff, this, &others_processing]( uint8_t* page_begin, - uint8_t* first_obj) REQUIRES_SHARED(Locks::mutator_lock_) { + uint8_t* first_obj, + size_t page_size) REQUIRES_SHARED(Locks::mutator_lock_) { // No need to process pages past last_byte as they already have updated // gc-roots, if any. if (page_begin >= last_byte) { @@ -3532,7 +3628,14 @@ void MarkCompact::ProcessLinearAlloc() { // reason, we used 'release' order for changing the state to 'processed'. if (state_arr[page_idx].compare_exchange_strong( expected_state, desired_state, std::memory_order_acquire)) { - updater(page_begin + diff, first_obj + diff); + // null first_obj indicates that it's a page from arena for + // intern-table/class-table. So first object isn't required. 
+ if (first_obj != nullptr) { + updater.MultiObjectArena(page_begin + diff, first_obj + diff); + } else { + DCHECK_EQ(page_size, kPageSize); + updater.SingleObjectArena(page_begin + diff, page_size); + } expected_state = PageState::kProcessing; if (!minor_fault_initialized_) { MapUpdatedLinearAllocPage( @@ -4333,6 +4436,9 @@ void MarkCompact::FinishPhase() { DCHECK_EQ(fstat(moving_to_space_fd_, &buf), 0) << "fstat failed: " << strerror(errno); DCHECK_EQ(buf.st_blocks, 0u); } + GcVisitedArenaPool* arena_pool = + static_cast<GcVisitedArenaPool*>(Runtime::Current()->GetLinearAllocArenaPool()); + arena_pool->DeleteUnusedArenas(); } } // namespace collector diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h index 97709b8cd4..c75cfb56bb 100644 --- a/runtime/gc/collector/mark_compact.h +++ b/runtime/gc/collector/mark_compact.h @@ -543,6 +543,11 @@ class MarkCompact final : public GarbageCollector { // Initialize all the info-map related fields of this GC. Returns total size // of all the structures in info-map. size_t InitializeInfoMap(uint8_t* p, size_t moving_space_sz); + // Update class-table classes in compaction pause if we are running in debuggable + // mode. Only visit class-table in image spaces if 'immune_class_table_only' + // is true. + void UpdateClassTableClasses(Runtime* runtime, bool immune_class_table_only) + REQUIRES_SHARED(Locks::mutator_lock_); // For checkpoints Barrier gc_barrier_; diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index 796f8fbf09..329777c7d7 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -2451,9 +2451,11 @@ void Heap::PreZygoteFork() { return; } Runtime* runtime = Runtime::Current(); + // Setup linear-alloc pool for post-zygote fork allocations before freezing + // snapshots of intern-table and class-table. + runtime->SetupLinearAllocForPostZygoteFork(self); runtime->GetInternTable()->AddNewTable(); runtime->GetClassLinker()->MoveClassTableToPreZygote(); - runtime->SetupLinearAllocForPostZygoteFork(self); VLOG(heap) << "Starting PreZygoteFork"; // The end of the non-moving space may be protected, unprotect it so that we can copy the zygote // there. diff --git a/runtime/intern_table.h b/runtime/intern_table.h index d115f51f7e..0fcada1af4 100644 --- a/runtime/intern_table.h +++ b/runtime/intern_table.h @@ -17,8 +17,8 @@ #ifndef ART_RUNTIME_INTERN_TABLE_H_ #define ART_RUNTIME_INTERN_TABLE_H_ -#include "base/allocator.h" #include "base/dchecked_vector.h" +#include "base/gc_visited_arena_pool.h" #include "base/hash_set.h" #include "base/mutex.h" #include "gc/weak_root_state.h" @@ -109,11 +109,12 @@ class InternTable { } }; - using UnorderedSet = HashSet<GcRoot<mirror::String>, - GcRootEmptyFn, - StringHash, - StringEquals, - TrackingAllocator<GcRoot<mirror::String>, kAllocatorTagInternTable>>; + using UnorderedSet = + HashSet<GcRoot<mirror::String>, + GcRootEmptyFn, + StringHash, + StringEquals, + GcRootArenaAllocator<GcRoot<mirror::String>, kAllocatorTagInternTable>>; InternTable(); diff --git a/runtime/runtime.cc b/runtime/runtime.cc index 9c3b9de70c..97db5fb0fd 100644 --- a/runtime/runtime.cc +++ b/runtime/runtime.cc @@ -859,7 +859,11 @@ void Runtime::CallExitHook(jint status) { } void Runtime::SweepSystemWeaks(IsMarkedVisitor* visitor) { - GetInternTable()->SweepInternTableWeaks(visitor); + // Userfaultfd compaction updates weak intern-table page-by-page via + // LinearAlloc. 
+ if (!GetHeap()->IsPerformingUffdCompaction()) { + GetInternTable()->SweepInternTableWeaks(visitor); + } GetMonitorList()->SweepMonitorList(visitor); GetJavaVM()->SweepJniWeakGlobals(visitor); GetHeap()->SweepAllocationRecords(visitor); @@ -2590,8 +2594,14 @@ void Runtime::VisitConstantRoots(RootVisitor* visitor) { } void Runtime::VisitConcurrentRoots(RootVisitor* visitor, VisitRootFlags flags) { - intern_table_->VisitRoots(visitor, flags); - class_linker_->VisitRoots(visitor, flags); + // Userfaultfd compaction updates intern-tables and class-tables page-by-page + // via LinearAlloc. So don't visit them here. + if (GetHeap()->IsPerformingUffdCompaction()) { + class_linker_->VisitRoots(visitor, flags, /*visit_class_roots=*/false); + } else { + intern_table_->VisitRoots(visitor, flags); + class_linker_->VisitRoots(visitor, flags, /*visit_class_roots=*/true); + } jni_id_manager_->VisitRoots(visitor); heap_->VisitAllocationRecords(visitor); if (jit_ != nullptr) { |