 -rw-r--r--  runtime/gc/collector/mark_compact.cc | 757
 -rw-r--r--  runtime/gc/collector/mark_compact.h  |  81
 2 files changed, 557 insertions(+), 281 deletions(-)
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc index 08404619b3..e14f64db24 100644 --- a/runtime/gc/collector/mark_compact.cc +++ b/runtime/gc/collector/mark_compact.cc @@ -308,7 +308,8 @@ static constexpr size_t kMaxNumUffdWorkers = 2; // phase. static constexpr size_t kMutatorCompactionBufferCount = 2048; // Minimum from-space chunk to be madvised (during concurrent compaction) in one go. -static constexpr ssize_t kMinFromSpaceMadviseSize = 1 * MB; +// Choose a reasonable size to avoid making too many batched ioctl and madvise calls. +static constexpr ssize_t kMinFromSpaceMadviseSize = 8 * MB; // Concurrent compaction termination logic is different (and slightly more efficient) if the // kernel has the fault-retry feature (allowing repeated faults on the same page), which was // introduced in 5.7 (https://android-review.git.corp.google.com/c/kernel/common/+/1540088). @@ -395,6 +396,15 @@ static bool IsSigbusFeatureAvailable() { return (gUffdFeatures & kUffdFeaturesForSigbus) == kUffdFeaturesForSigbus; } +size_t MarkCompact::ComputeInfoMapSize() { + size_t moving_space_size = bump_pointer_space_->Capacity(); + size_t chunk_info_vec_size = moving_space_size / kOffsetChunkSize; + size_t nr_moving_pages = DivideByPageSize(moving_space_size); + size_t nr_non_moving_pages = DivideByPageSize(heap_->GetNonMovingSpace()->Capacity()); + return chunk_info_vec_size * sizeof(uint32_t) + nr_non_moving_pages * sizeof(ObjReference) + + nr_moving_pages * (sizeof(ObjReference) + sizeof(uint32_t) + sizeof(Atomic<uint32_t>)); +} + size_t MarkCompact::InitializeInfoMap(uint8_t* p, size_t moving_space_sz) { size_t nr_moving_pages = DivideByPageSize(moving_space_sz); @@ -402,14 +412,18 @@ size_t MarkCompact::InitializeInfoMap(uint8_t* p, size_t moving_space_sz) { vector_length_ = moving_space_sz / kOffsetChunkSize; size_t total = vector_length_ * sizeof(uint32_t); - first_objs_non_moving_space_ = reinterpret_cast<ObjReference*>(p + total); - total += DivideByPageSize(heap_->GetNonMovingSpace()->Capacity()) * sizeof(ObjReference); - first_objs_moving_space_ = reinterpret_cast<ObjReference*>(p + total); total += nr_moving_pages * sizeof(ObjReference); pre_compact_offset_moving_space_ = reinterpret_cast<uint32_t*>(p + total); total += nr_moving_pages * sizeof(uint32_t); + + moving_pages_status_ = reinterpret_cast<Atomic<uint32_t>*>(p + total); + total += nr_moving_pages * sizeof(Atomic<uint32_t>); + + first_objs_non_moving_space_ = reinterpret_cast<ObjReference*>(p + total); + total += DivideByPageSize(heap_->GetNonMovingSpace()->Capacity()) * sizeof(ObjReference); + DCHECK_EQ(total, ComputeInfoMapSize()); return total; } @@ -453,26 +467,21 @@ MarkCompact::MarkCompact(Heap* heap) reinterpret_cast<uintptr_t>(bump_pointer_space_->Begin()), reinterpret_cast<uintptr_t>(bump_pointer_space_->Limit()))); - // Create one MemMap for all the data structures - size_t moving_space_size = bump_pointer_space_->Capacity(); - size_t chunk_info_vec_size = moving_space_size / kOffsetChunkSize; - size_t nr_moving_pages = DivideByPageSize(moving_space_size); - size_t nr_non_moving_pages = DivideByPageSize(heap->GetNonMovingSpace()->Capacity()); - std::string err_msg; - info_map_ = MemMap::MapAnonymous("Concurrent mark-compact chunk-info vector", - chunk_info_vec_size * sizeof(uint32_t) - + nr_non_moving_pages * sizeof(ObjReference) - + nr_moving_pages * sizeof(ObjReference) - + nr_moving_pages * sizeof(uint32_t), - PROT_READ | PROT_WRITE, - /*low_4gb=*/ false, - &err_msg); - if 
(UNLIKELY(!info_map_.IsValid())) { - LOG(FATAL) << "Failed to allocate concurrent mark-compact chunk-info vector: " << err_msg; - } else { - size_t total = InitializeInfoMap(info_map_.Begin(), moving_space_size); - DCHECK_EQ(total, info_map_.Size()); + size_t moving_space_size = bump_pointer_space_->Capacity(); + { + // Create one MemMap for all the data structures + info_map_ = MemMap::MapAnonymous("Concurrent mark-compact chunk-info vector", + ComputeInfoMapSize(), + PROT_READ | PROT_WRITE, + /*low_4gb=*/false, + &err_msg); + if (UNLIKELY(!info_map_.IsValid())) { + LOG(FATAL) << "Failed to allocate concurrent mark-compact chunk-info vector: " << err_msg; + } else { + size_t total = InitializeInfoMap(info_map_.Begin(), moving_space_size); + DCHECK_EQ(total, info_map_.Size()); + } } size_t moving_space_alignment = Heap::BestPageTableAlignment(moving_space_size); @@ -835,11 +844,12 @@ void MarkCompact::InitMovingSpaceFirstObjects(const size_t vec_len) { size_t chunk_idx; // Find the first live word in the space for (chunk_idx = 0; chunk_info_vec_[chunk_idx] == 0; chunk_idx++) { - if (chunk_idx > vec_len) { + if (chunk_idx >= vec_len) { // We don't have any live data on the moving-space. return; } } + DCHECK_LT(chunk_idx, vec_len); // Use live-words bitmap to find the first word offset_in_chunk_word = live_words_bitmap_->FindNthLiveWordOffset(chunk_idx, /*n*/ 0); offset = chunk_idx * kBitsPerVectorWord + offset_in_chunk_word; @@ -860,7 +870,7 @@ void MarkCompact::InitMovingSpaceFirstObjects(const size_t vec_len) { uint32_t page_live_bytes = 0; while (true) { for (; page_live_bytes <= gPageSize; chunk_idx++) { - if (chunk_idx > vec_len) { + if (chunk_idx >= vec_len) { moving_first_objs_count_ = to_space_page_idx; return; } @@ -2037,42 +2047,45 @@ void MarkCompact::MapProcessedPages(uint8_t* to_space_start, } } -void MarkCompact::ZeropageIoctl(void* addr, bool tolerate_eexist, bool tolerate_enoent) { +void MarkCompact::ZeropageIoctl(void* addr, + size_t length, + bool tolerate_eexist, + bool tolerate_enoent) { struct uffdio_zeropage uffd_zeropage; DCHECK(IsAlignedParam(addr, gPageSize)); uffd_zeropage.range.start = reinterpret_cast<uintptr_t>(addr); - uffd_zeropage.range.len = gPageSize; + uffd_zeropage.range.len = length; uffd_zeropage.mode = 0; int ret = ioctl(uffd_, UFFDIO_ZEROPAGE, &uffd_zeropage); if (LIKELY(ret == 0)) { - DCHECK_EQ(uffd_zeropage.zeropage, static_cast<ssize_t>(gPageSize)); + DCHECK_EQ(uffd_zeropage.zeropage, static_cast<ssize_t>(length)); } else { CHECK((tolerate_enoent && errno == ENOENT) || (tolerate_eexist && errno == EEXIST)) << "ioctl_userfaultfd: zeropage failed: " << strerror(errno) << ". addr:" << addr; } } -void MarkCompact::CopyIoctl(void* dst, void* buffer) { +void MarkCompact::CopyIoctl(void* dst, void* buffer, size_t length) { struct uffdio_copy uffd_copy; uffd_copy.src = reinterpret_cast<uintptr_t>(buffer); uffd_copy.dst = reinterpret_cast<uintptr_t>(dst); - uffd_copy.len = gPageSize; + uffd_copy.len = length; uffd_copy.mode = 0; CHECK_EQ(ioctl(uffd_, UFFDIO_COPY, &uffd_copy), 0) << "ioctl_userfaultfd: copy failed: " << strerror(errno) << ". 
src:" << buffer << " dst:" << dst; - DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(gPageSize)); + DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(length)); } template <int kMode, typename CompactionFn> -void MarkCompact::DoPageCompactionWithStateChange(size_t page_idx, - size_t status_arr_len, +bool MarkCompact::DoPageCompactionWithStateChange(size_t page_idx, uint8_t* to_space_page, uint8_t* page, + bool map_immediately, CompactionFn func) { - PageState expected_state = PageState::kUnprocessed; - PageState desired_state = - kMode == kCopyMode ? PageState::kProcessingAndMapping : PageState::kProcessing; + uint32_t expected_state = static_cast<uint8_t>(PageState::kUnprocessed); + uint32_t desired_state = static_cast<uint8_t>(map_immediately ? PageState::kProcessingAndMapping : + PageState::kProcessing); // In the concurrent case (kMode != kFallbackMode) we need to ensure that the update // to moving_spaces_status_[page_idx] is released before the contents of the page are // made accessible to other threads. @@ -2083,32 +2096,57 @@ void MarkCompact::DoPageCompactionWithStateChange(size_t page_idx, expected_state, desired_state, std::memory_order_acquire)) { func(); if (kMode == kCopyMode) { - CopyIoctl(to_space_page, page); - if (use_uffd_sigbus_) { + if (map_immediately) { + CopyIoctl(to_space_page, page, gPageSize); + // Store is sufficient as no other thread could modify the status at this + // point. Relaxed order is sufficient as the ioctl will act as a fence. + moving_pages_status_[page_idx].store(static_cast<uint8_t>(PageState::kProcessedAndMapped), + std::memory_order_relaxed); + } else { + // Add the src page's index in the status word. + DCHECK(from_space_map_.HasAddress(page)); + DCHECK_LE(static_cast<size_t>(page - from_space_begin_), + std::numeric_limits<uint32_t>::max()); + uint32_t store_val = page - from_space_begin_; + DCHECK_EQ(store_val & kPageStateMask, 0u); + store_val |= static_cast<uint8_t>(PageState::kProcessed); // Store is sufficient as no other thread would modify the status at this point. - moving_pages_status_[page_idx].store(PageState::kProcessedAndMapped, - std::memory_order_release); + moving_pages_status_[page_idx].store(store_val, std::memory_order_release); } } else if (kMode == kMinorFaultMode) { - expected_state = PageState::kProcessing; - desired_state = PageState::kProcessed; + expected_state = static_cast<uint8_t>(PageState::kProcessing); + desired_state = static_cast<uint8_t>(PageState::kProcessed); // the CAS needs to be with release order to ensure that stores to the // page makes it to memory *before* other threads observe that it's // ready to be mapped. if (!moving_pages_status_[page_idx].compare_exchange_strong( expected_state, desired_state, std::memory_order_release)) { // Some mutator has requested to map the page after processing it. - DCHECK_EQ(expected_state, PageState::kProcessingAndMapping); - MapProcessedPages</*kFirstPageMapping=*/true>( - to_space_page, moving_pages_status_, page_idx, status_arr_len); + DCHECK_EQ(expected_state, static_cast<uint8_t>(PageState::kProcessingAndMapping)); } + UNREACHABLE(); } + return true; } else { - DCHECK_GT(expected_state, PageState::kProcessed); + // Only GC thread could have set the state to Processed. 
+ DCHECK_NE(expected_state, static_cast<uint8_t>(PageState::kProcessed)); + return false; } } -void MarkCompact::FreeFromSpacePages(size_t cur_page_idx, int mode) { +static void BackOff(uint32_t i) { + static constexpr uint32_t kYieldMax = 5; + // TODO: Consider adding x86 PAUSE and/or ARM YIELD here. + if (i <= kYieldMax) { + sched_yield(); + } else { + // nanosleep is not in the async-signal-safe list, but bionic implements it + // with a pure system call, so it should be fine. + NanoSleep(10000ull * (i - kYieldMax)); + } +} + +bool MarkCompact::FreeFromSpacePages(size_t cur_page_idx, int mode, size_t end_idx_for_mapping) { // Thanks to sliding compaction, bump-pointer allocations, and reverse // compaction (see CompactMovingSpace) the logic here is pretty simple: find // the to-space page up to which compaction has finished, all the from-space @@ -2118,7 +2156,8 @@ void MarkCompact::FreeFromSpacePages(size_t cur_page_idx, int mode) { // Find the to-space page up to which the corresponding from-space pages can be // freed. for (; idx > cur_page_idx; idx--) { - PageState state = moving_pages_status_[idx - 1].load(std::memory_order_acquire); + PageState state = static_cast<PageState>( + static_cast<uint8_t>(moving_pages_status_[idx - 1].load(std::memory_order_acquire))); if (state == PageState::kMutatorProcessing) { // Some mutator is working on the page. break; @@ -2130,7 +2169,7 @@ void MarkCompact::FreeFromSpacePages(size_t cur_page_idx, int mode) { DCHECK_LE(idx, last_checked_reclaim_page_idx_); if (idx == last_checked_reclaim_page_idx_) { // Nothing to do. - return; + return false; } uint8_t* reclaim_begin; @@ -2217,15 +2256,43 @@ void MarkCompact::FreeFromSpacePages(size_t cur_page_idx, int mode) { // lower than the reclaim range. break; } - + bool ret = false; ssize_t size = last_reclaimed_page_ - reclaim_begin; - if (size >= kMinFromSpaceMadviseSize) { + if (size > kMinFromSpaceMadviseSize) { + // Map all the pages in the range. + if (cur_page_idx < end_idx_for_mapping) { + size_t len = MapMovingSpacePages(cur_page_idx, end_idx_for_mapping); + // The pages that were not mapped by gc-thread have to be completed + // before we madvise them. So wait for their status to change to 'mapped'. + // The wait is expected to be short as the read state indicates that + // another thread is actively working on mapping the page. + for (size_t i = cur_page_idx + DivideByPageSize(len); i < end_idx_for_mapping; i++) { + PageState state = static_cast<PageState>( + static_cast<uint8_t>(moving_pages_status_[i].load(std::memory_order_relaxed))); + uint32_t backoff_count = 0; + while (state != PageState::kProcessedAndMapped) { + BackOff(backoff_count++); + state = static_cast<PageState>( + static_cast<uint8_t>(moving_pages_status_[i].load(std::memory_order_relaxed))); + } + } + ret = true; + } + // Retain a few pages for subsequent compactions. + const ssize_t gBufferPages = 4 * gPageSize; + DCHECK_LT(gBufferPages, kMinFromSpaceMadviseSize); + size -= gBufferPages; + uint8_t* addr = last_reclaimed_page_ - size; int behavior = minor_fault_initialized_ ? 
MADV_REMOVE : MADV_DONTNEED; - CHECK_EQ(madvise(reclaim_begin + from_space_slide_diff_, size, behavior), 0) + CHECK_EQ(madvise(addr + from_space_slide_diff_, size, behavior), 0) << "madvise of from-space failed: " << strerror(errno); - last_reclaimed_page_ = reclaim_begin; + last_reclaimed_page_ = addr; + cur_reclaimable_page_ = addr; } + CHECK_LE(reclaim_begin, last_reclaimable_page_); + last_reclaimable_page_ = reclaim_begin; last_checked_reclaim_page_idx_ = idx; + return ret; } void MarkCompact::UpdateClassAfterObjMap() { @@ -2278,6 +2345,8 @@ void MarkCompact::CompactMovingSpace(uint8_t* page) { UpdateClassAfterObjMap(); // These variables are maintained by FreeFromSpacePages(). last_reclaimed_page_ = pre_compact_page; + last_reclaimable_page_ = last_reclaimed_page_; + cur_reclaimable_page_ = last_reclaimed_page_; last_checked_reclaim_page_idx_ = idx; class_after_obj_iter_ = class_after_obj_ordered_map_.rbegin(); // Allocated-black pages @@ -2296,9 +2365,9 @@ void MarkCompact::CompactMovingSpace(uint8_t* page) { uint32_t first_chunk_size = black_alloc_pages_first_chunk_size_[idx]; if (first_obj != nullptr) { DoPageCompactionWithStateChange<kMode>(idx, - page_status_arr_len, to_space_end, page, + /*map_immediately=*/true, [&]() REQUIRES_SHARED(Locks::mutator_lock_) { SlideBlackPage(first_obj, next_page_first_obj, @@ -2310,13 +2379,15 @@ void MarkCompact::CompactMovingSpace(uint8_t* page) { // We are sliding here, so no point attempting to madvise for every // page. Wait for enough pages to be done. if (idx % DivideByPageSize(kMinFromSpaceMadviseSize) == 0) { - FreeFromSpacePages(idx, kMode); + FreeFromSpacePages(idx, kMode, /*end_idx_for_mapping=*/0); } } next_page_first_obj = first_obj; } DCHECK_EQ(pre_compact_page, black_allocations_begin_); - + // Reserved page to be used if we can't find any reclaimable page for processing. + uint8_t* reserve_page = page; + size_t end_idx_for_mapping = idx; while (idx > 0) { idx--; to_space_end -= gPageSize; @@ -2325,17 +2396,83 @@ void MarkCompact::CompactMovingSpace(uint8_t* page) { page = shadow_space_end; } else if (kMode == kFallbackMode) { page = to_space_end; + } else { + DCHECK_EQ(kMode, kCopyMode); + if (cur_reclaimable_page_ > last_reclaimable_page_) { + cur_reclaimable_page_ -= gPageSize; + page = cur_reclaimable_page_ + from_space_slide_diff_; + } else { + page = reserve_page; + } } mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr(); - DoPageCompactionWithStateChange<kMode>( - idx, page_status_arr_len, to_space_end, page, [&]() REQUIRES_SHARED(Locks::mutator_lock_) { + bool success = DoPageCompactionWithStateChange<kMode>( + idx, + to_space_end, + page, + /*map_immediately=*/page == reserve_page, + [&]() REQUIRES_SHARED(Locks::mutator_lock_) { CompactPage(first_obj, pre_compact_offset_moving_space_[idx], page, kMode == kCopyMode); }); - FreeFromSpacePages(idx, kMode); + if (kMode == kCopyMode && (!success || page == reserve_page) && end_idx_for_mapping - idx > 1) { + // map the pages in the following pages as they can't be mapped with + // the subsequent pages as their src-side pages won't be contiguous. + MapMovingSpacePages(idx + 1, end_idx_for_mapping); + } + if (FreeFromSpacePages(idx, kMode, end_idx_for_mapping)) { + end_idx_for_mapping = idx; + } + } + // map one last time to finish anything left. 
+ if (end_idx_for_mapping > 0) { + MapMovingSpacePages(idx, end_idx_for_mapping); } DCHECK_EQ(to_space_end, bump_pointer_space_->Begin()); } +size_t MarkCompact::MapMovingSpacePages(size_t arr_idx, size_t arr_len) { + // Claim all the contiguous pages, which are ready to be mapped, and then do + // so in a single ioctl. This helps avoid the overhead of invoking the syscall + // several times and also maps the already-processed pages, avoiding + // unnecessary faults on them. + DCHECK_LT(arr_idx, arr_len); + uint32_t cur_state = moving_pages_status_[arr_idx].load(std::memory_order_relaxed); + if ((cur_state & kPageStateMask) != static_cast<uint8_t>(PageState::kProcessed)) { + return 0; + } + uint32_t from_space_offset = cur_state & ~kPageStateMask; + uint8_t* to_space_start = moving_space_begin_ + arr_idx * gPageSize; + uint8_t* from_space_start = from_space_begin_ + from_space_offset; + DCHECK_ALIGNED_PARAM(to_space_start, gPageSize); + DCHECK_ALIGNED_PARAM(from_space_start, gPageSize); + size_t length = 0; + for (size_t i = arr_idx; i < arr_len; length += gPageSize, from_space_offset += gPageSize, i++) { + uint8_t desired_state = static_cast<uint8_t>(PageState::kProcessedAndMapping); + cur_state = moving_pages_status_[i].load(std::memory_order_relaxed); + // We need to guarantee that we don't end up successfully marking a later + // page 'mapping' and then fail to mark an earlier page. To guarantee that, + // we use acq_rel order. + if ((cur_state & kPageStateMask) != static_cast<uint8_t>(PageState::kProcessed) || + !moving_pages_status_[i].compare_exchange_strong( + cur_state, desired_state, std::memory_order_acq_rel)) { + break; + } + DCHECK_EQ(from_space_offset, cur_state & ~kPageStateMask); + } + if (length > 0) { + CopyIoctl(to_space_start, from_space_start, length); + for (size_t i = arr_idx; length > 0; length -= gPageSize, i++) { + // It's sufficient to use relaxed memory-order as these stores are + // happening after ioctl, which acts as a fence. + // Store is sufficient as there are no other threads updating status of + // these pages. + moving_pages_status_[i].store(static_cast<uint8_t>(PageState::kProcessedAndMapped), + std::memory_order_relaxed); + } + } + return length; +} + void MarkCompact::UpdateNonMovingPage(mirror::Object* first, uint8_t* page) { DCHECK_LT(reinterpret_cast<uint8_t*>(first), page + gPageSize); // For every object found in the page, visit the previous object. This ensures @@ -2853,17 +2990,6 @@ void MarkCompact::CompactionPause() { // TODO: We can reduce the time spent on this in a pause by performing one // round of this concurrently prior to the pause. UpdateMovingSpaceBlackAllocations(); - // TODO: If we want to avoid this allocation in a pause then we will have to - // allocate an array for the entire moving-space size, which can be made - // part of info_map_. - moving_pages_status_ = new Atomic<PageState>[moving_first_objs_count_ + black_page_count_]; - if (kIsDebugBuild) { - size_t len = moving_first_objs_count_ + black_page_count_; - for (size_t i = 0; i < len; i++) { - CHECK_EQ(moving_pages_status_[i].load(std::memory_order_relaxed), - PageState::kUnprocessed); - } - } // Iterate over the allocation_stack_, for every object in the non-moving // space: // 1. Mark the object in live bitmap // 2. Erase the object from allocation stack // 3. In the corresponding page, if the first-object vector needs updating // then do so.
UpdateNonMovingSpaceBlackAllocations(); - // This store is visible to mutator (or uffd worker threads) as the mutator // lock's unlock guarantees that. compacting_ = true; @@ -3182,7 +3307,7 @@ void MarkCompact::ConcurrentCompaction(uint8_t* buf) { // zeropage so that the gc-thread can proceed. Otherwise, each thread does // it and the gc-thread will repeat this fault until thread_pool_counter == 0. if (!gKernelHasFaultRetry || ret == 1) { - ZeropageIoctl(fault_addr, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false); + ZeropageIoctl(fault_addr, gPageSize, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false); } else { struct uffdio_range uffd_range; uffd_range.start = msg.arg.pagefault.address; @@ -3290,22 +3415,13 @@ bool MarkCompact::SigbusHandler(siginfo_t* info) { } } -static void BackOff(uint32_t i) { - static constexpr uint32_t kYieldMax = 5; - // TODO: Consider adding x86 PAUSE and/or ARM YIELD here. - if (i <= kYieldMax) { - sched_yield(); - } else { - // nanosleep is not in the async-signal-safe list, but bionic implements it - // with a pure system call, so it should be fine. - NanoSleep(10000ull * (i - kYieldMax)); - } -} - template <int kMode> void MarkCompact::ConcurrentlyProcessMovingPage(uint8_t* fault_page, uint8_t* buf, size_t nr_moving_space_used_pages) { + // TODO: add a class for Scoped dtor to set that a page has already mapped. + // This helps in avoiding a zero-page ioctl in gc-thread before unregistering + // unused space. class ScopedInProgressCount { public: explicit ScopedInProgressCount(MarkCompact* collector) : collector_(collector) { @@ -3320,6 +3436,7 @@ void MarkCompact::ConcurrentlyProcessMovingPage(uint8_t* fault_page, MarkCompact* collector_; }; + Thread* self = Thread::Current(); uint8_t* unused_space_begin = bump_pointer_space_->Begin() + nr_moving_space_used_pages * gPageSize; DCHECK(IsAlignedParam(unused_space_begin, gPageSize)); @@ -3328,159 +3445,203 @@ void MarkCompact::ConcurrentlyProcessMovingPage(uint8_t* fault_page, // There is a race which allows more than one thread to install a // zero-page. But we can tolerate that. So absorb the EEXIST returned by // the ioctl and move on. - ZeropageIoctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/true); + ZeropageIoctl(fault_page, gPageSize, /*tolerate_eexist=*/true, /*tolerate_enoent=*/true); return; } size_t page_idx = DivideByPageSize(fault_page - bump_pointer_space_->Begin()); DCHECK_LT(page_idx, moving_first_objs_count_ + black_page_count_); mirror::Object* first_obj = first_objs_moving_space_[page_idx].AsMirrorPtr(); if (first_obj == nullptr) { - // We should never have a case where two workers are trying to install a - // zeropage in this range as we synchronize using moving_pages_status_[page_idx]. - PageState expected_state = PageState::kUnprocessed; - if (moving_pages_status_[page_idx].compare_exchange_strong( - expected_state, PageState::kProcessedAndMapping, std::memory_order_relaxed)) { + // Install zero-page in the entire remaining tlab to avoid multiple ioctl invocations. + uint8_t* end = AlignDown(self->GetTlabEnd(), gPageSize); + if (fault_page < self->GetTlabPos() || fault_page >= end) { + end = fault_page + gPageSize; + } + size_t end_idx = page_idx + DivideByPageSize(end - fault_page); + size_t length = 0; + for (size_t idx = page_idx; idx < end_idx; idx++, length += gPageSize) { + // We should never have a case where two workers are trying to install a + // zeropage in this range as we synchronize using moving_pages_status_[page_idx]. 
+ uint32_t expected_state = static_cast<uint8_t>(PageState::kUnprocessed); + if (!moving_pages_status_[idx].compare_exchange_strong( + expected_state, + static_cast<uint8_t>(PageState::kProcessedAndMapping), + std::memory_order_acq_rel)) { + DCHECK_EQ(expected_state, static_cast<uint8_t>(PageState::kProcessedAndMapping)); + break; + } + } + if (length > 0) { // Note: ioctl acts as an acquire fence. - ZeropageIoctl(fault_page, /*tolerate_eexist=*/false, /*tolerate_enoent=*/true); - } else { - DCHECK_EQ(expected_state, PageState::kProcessedAndMapping); + ZeropageIoctl(fault_page, length, /*tolerate_eexist=*/false, /*tolerate_enoent=*/true); + for (size_t len = 0, idx = page_idx; len < length; idx++, len += gPageSize) { + moving_pages_status_[idx].store(static_cast<uint8_t>(PageState::kProcessedAndMapped), + std::memory_order_relaxed); + } } return; } - PageState state = moving_pages_status_[page_idx].load( + uint32_t raw_state = moving_pages_status_[page_idx].load( use_uffd_sigbus_ ? std::memory_order_acquire : std::memory_order_relaxed); uint32_t backoff_count = 0; + PageState state; while (true) { - switch (state) { - case PageState::kUnprocessed: { - // The increment to the in-progress counter must be done before updating - // the page's state. Otherwise, we will end up leaving a window wherein - // the GC-thread could observe that no worker is working on compaction - // and could end up unregistering the moving space from userfaultfd. - ScopedInProgressCount spc(this); - // Acquire order to ensure we don't start writing to shadow map, which is - // shared, before the CAS is successful. Release order to ensure that the - // increment to moving_compactions_in_progress above is not re-ordered - // after the CAS. - if (moving_pages_status_[page_idx].compare_exchange_strong( - state, PageState::kMutatorProcessing, std::memory_order_acq_rel)) { - if (kMode == kMinorFaultMode) { - DCHECK_EQ(buf, nullptr); - buf = shadow_to_space_map_.Begin() + page_idx * gPageSize; - } else if (UNLIKELY(buf == nullptr)) { - DCHECK_EQ(kMode, kCopyMode); - uint16_t idx = compaction_buffer_counter_.fetch_add(1, std::memory_order_relaxed); - // The buffer-map is one page bigger as the first buffer is used by GC-thread. - CHECK_LE(idx, kMutatorCompactionBufferCount); - buf = compaction_buffers_map_.Begin() + idx * gPageSize; - DCHECK(compaction_buffers_map_.HasAddress(buf)); - Thread::Current()->SetThreadLocalGcBuffer(buf); - } + state = static_cast<PageState>(static_cast<uint8_t>(raw_state)); + if (state == PageState::kProcessing || state == PageState::kMutatorProcessing || + state == PageState::kProcessingAndMapping || state == PageState::kProcessedAndMapping) { + if (!use_uffd_sigbus_) { + break; + } + // Wait for the page to be mapped (by gc-thread or some mutator) before returning. + // The wait is not expected to be long as the read state indicates that the other + // thread is actively working on the page. + BackOff(backoff_count++); + raw_state = moving_pages_status_[page_idx].load(std::memory_order_acquire); + } else if (state == PageState::kProcessedAndMapped) { + // Nothing to do. + break; + } else { + // The increment to the in-progress counter must be done before updating + // the page's state. Otherwise, we will end up leaving a window wherein + // the GC-thread could observe that no worker is working on compaction + // and could end up unregistering the moving space from userfaultfd. 
+ ScopedInProgressCount spc(this); + // Acquire order to ensure we don't start writing to shadow map, which is + // shared, before the CAS is successful. Release order to ensure that the + // increment to moving_compaction_in_progress above is not re-ordered + // after the CAS. + if (state == PageState::kUnprocessed && + moving_pages_status_[page_idx].compare_exchange_strong( + raw_state, + static_cast<uint8_t>(PageState::kMutatorProcessing), + std::memory_order_acq_rel)) { + if (kMode == kMinorFaultMode) { + DCHECK_EQ(buf, nullptr); + buf = shadow_to_space_map_.Begin() + page_idx * gPageSize; + } else if (UNLIKELY(buf == nullptr)) { + DCHECK_EQ(kMode, kCopyMode); + uint16_t idx = compaction_buffer_counter_.fetch_add(1, std::memory_order_relaxed); + // The buffer-map is one page bigger as the first buffer is used by GC-thread. + CHECK_LE(idx, kMutatorCompactionBufferCount); + buf = compaction_buffers_map_.Begin() + idx * gPageSize; + DCHECK(compaction_buffers_map_.HasAddress(buf)); + self->SetThreadLocalGcBuffer(buf); + } - if (fault_page < post_compact_end_) { - // The page has to be compacted. - CompactPage( - first_obj, pre_compact_offset_moving_space_[page_idx], buf, kMode == kCopyMode); - } else { - DCHECK_NE(first_obj, nullptr); - DCHECK_GT(pre_compact_offset_moving_space_[page_idx], 0u); - uint8_t* pre_compact_page = black_allocations_begin_ + (fault_page - post_compact_end_); - uint32_t first_chunk_size = black_alloc_pages_first_chunk_size_[page_idx]; - mirror::Object* next_page_first_obj = nullptr; - if (page_idx + 1 < moving_first_objs_count_ + black_page_count_) { - next_page_first_obj = first_objs_moving_space_[page_idx + 1].AsMirrorPtr(); - } - DCHECK(IsAlignedParam(pre_compact_page, gPageSize)); - SlideBlackPage(first_obj, - next_page_first_obj, - first_chunk_size, - pre_compact_page, - buf, - kMode == kCopyMode); + if (fault_page < post_compact_end_) { + // The page has to be compacted. + CompactPage( + first_obj, pre_compact_offset_moving_space_[page_idx], buf, kMode == kCopyMode); + } else { + DCHECK_NE(first_obj, nullptr); + DCHECK_GT(pre_compact_offset_moving_space_[page_idx], 0u); + uint8_t* pre_compact_page = black_allocations_begin_ + (fault_page - post_compact_end_); + uint32_t first_chunk_size = black_alloc_pages_first_chunk_size_[page_idx]; + mirror::Object* next_page_first_obj = nullptr; + if (page_idx + 1 < moving_first_objs_count_ + black_page_count_) { + next_page_first_obj = first_objs_moving_space_[page_idx + 1].AsMirrorPtr(); } - // Nobody else would simultaneously modify this page's state so an - // atomic store is sufficient. Use 'release' order to guarantee that - // loads/stores to the page are finished before this store. - moving_pages_status_[page_idx].store(PageState::kProcessedAndMapping, + DCHECK(IsAlignedParam(pre_compact_page, gPageSize)); + SlideBlackPage(first_obj, + next_page_first_obj, + first_chunk_size, + pre_compact_page, + buf, + kMode == kCopyMode); + } + // Nobody else would simultaneously modify this page's state so an + // atomic store is sufficient. Use 'release' order to guarantee that + // loads/stores to the page are finished before this store. Since the + // mutator used its own buffer for the processing, there is no reason to + // put its index in the status of the page. Also, the mutator is going + // to immediately map the page, so that info is not needed. 
+ moving_pages_status_[page_idx].store(static_cast<uint8_t>(PageState::kProcessedAndMapping), + std::memory_order_release); + if (kMode == kCopyMode) { + CopyIoctl(fault_page, buf, gPageSize); + // Store is sufficient as no other thread modifies the status at this stage. + moving_pages_status_[page_idx].store(static_cast<uint8_t>(PageState::kProcessedAndMapped), std::memory_order_release); - if (kMode == kCopyMode) { - CopyIoctl(fault_page, buf); - if (use_uffd_sigbus_) { - // Store is sufficient as no other thread modifies the status at this stage. - moving_pages_status_[page_idx].store(PageState::kProcessedAndMapped, - std::memory_order_release); - } - return; - } else { - break; - } + break; + } else { + // We don't support minor-fault feature anymore. + UNREACHABLE(); } } - continue; - case PageState::kProcessing: - DCHECK_EQ(kMode, kMinorFaultMode); - if (moving_pages_status_[page_idx].compare_exchange_strong( - state, PageState::kProcessingAndMapping, std::memory_order_relaxed) && - !use_uffd_sigbus_) { - // Somebody else took or will take care of finishing the compaction and - // then mapping the page. - return; - } - continue; - case PageState::kProcessed: - // The page is processed but not mapped. We should map it. - break; - case PageState::kProcessingAndMapping: - case PageState::kMutatorProcessing: - case PageState::kProcessedAndMapping: - if (use_uffd_sigbus_) { - // Wait for the page to be mapped before returning. - BackOff(backoff_count++); - state = moving_pages_status_[page_idx].load(std::memory_order_acquire); - continue; + state = static_cast<PageState>(static_cast<uint8_t>(raw_state)); + if (state == PageState::kProcessed) { + size_t arr_len = moving_first_objs_count_ + black_page_count_; + // The page is processed but not mapped. We should map it. The release + // order used in MapMovingSpacePages will ensure that the increment to + // moving_compaction_in_progress is done first. + if (MapMovingSpacePages(page_idx, arr_len) >= gPageSize) { + break; } - return; - case PageState::kProcessedAndMapped: - // Somebody else took care of the page. - return; + raw_state = moving_pages_status_[page_idx].load(std::memory_order_acquire); + } } - break; - } - - DCHECK_EQ(kMode, kMinorFaultMode); - if (state == PageState::kUnprocessed) { - MapProcessedPages</*kFirstPageMapping=*/true>( - fault_page, moving_pages_status_, page_idx, nr_moving_space_used_pages); - } else { - DCHECK_EQ(state, PageState::kProcessed); - MapProcessedPages</*kFirstPageMapping=*/false>( - fault_page, moving_pages_status_, page_idx, nr_moving_space_used_pages); } } -void MarkCompact::MapUpdatedLinearAllocPage(uint8_t* page, - uint8_t* shadow_page, - Atomic<PageState>& state, - bool page_touched) { +bool MarkCompact::MapUpdatedLinearAllocPages(uint8_t* start_page, + uint8_t* start_shadow_page, + Atomic<PageState>* state, + size_t length, + bool free_pages, + bool single_ioctl) { DCHECK(!minor_fault_initialized_); - if (page_touched) { - CopyIoctl(page, shadow_page); - } else { - // If the page wasn't touched, then it means it is empty and - // is most likely not present on the shadow-side. Furthermore, - // since the shadow is also userfaultfd registered doing copy - // ioctl fail as the copy-from-user in the kernel will cause - // userfault. Instead, just map a zeropage, which is not only - // correct but also efficient as it avoids unnecessary memcpy - // in the kernel. 
- ZeropageIoctl(page, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false); - } - if (use_uffd_sigbus_) { - // Store is sufficient as no other thread can modify the - // status of this page at this point. - state.store(PageState::kProcessedAndMapped, std::memory_order_release); + DCHECK_ALIGNED_PARAM(length, gPageSize); + uint8_t* end_page = start_page + length; + while (start_page < end_page) { + size_t map_len = 0; + // Claim a contiguous range of pages that we can map. + for (Atomic<PageState>* cur_state = state; map_len < length; + map_len += gPageSize, cur_state++) { + PageState expected_state = PageState::kProcessed; + if (!cur_state->compare_exchange_strong( + expected_state, PageState::kProcessedAndMapping, std::memory_order_acq_rel)) { + break; + } + } + if (map_len == 0) { + if (single_ioctl) { + // Didn't map anything. + return false; + } + // Skip all the pages that this thread can't map. + while (state->load(std::memory_order_relaxed) != PageState::kProcessed && length > 0) { + state++; + length -= gPageSize; + start_shadow_page += gPageSize; + start_page += gPageSize; + } + } else { + CopyIoctl(start_page, start_shadow_page, map_len); + if (use_uffd_sigbus_) { + // Declare that the pages are ready to be accessed. + // Store is sufficient as no other thread can modify the status + // of this page at this point. Ioctl above will act as release fence. + for (size_t l = 0; l < map_len; l += gPageSize, state++) { + DCHECK_EQ(state->load(std::memory_order_relaxed), PageState::kProcessedAndMapping); + state->store(PageState::kProcessedAndMapped, std::memory_order_relaxed); + } + } else { + state += DivideByPageSize(map_len); + } + if (free_pages) { + ZeroAndReleaseMemory(start_shadow_page, map_len); + } + if (single_ioctl) { + break; + } + start_page += map_len; + start_shadow_page += map_len; + length -= map_len; + // state is already updated above. + } } + return true; } template <int kMode> @@ -3507,7 +3668,7 @@ void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool i arena_iter->second <= fault_page) { // Fault page isn't in any of the arenas that existed before we started // compaction. So map zeropage and return. - ZeropageIoctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/false); + ZeropageIoctl(fault_page, gPageSize, /*tolerate_eexist=*/true, /*tolerate_enoent=*/false); } else { // Find the linear-alloc space containing fault-page LinearAllocSpaceData* space_data = nullptr; @@ -3531,7 +3692,7 @@ void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool i // Acquire order to ensure we don't start writing to shadow map, which is // shared, before the CAS is successful. 
if (state_arr[page_idx].compare_exchange_strong( - state, PageState::kProcessingAndMapping, std::memory_order_acquire)) { + state, PageState::kProcessing, std::memory_order_acquire)) { if (kMode == kCopyMode || is_minor_fault) { LinearAllocPageUpdater updater(this); uint8_t* first_obj = arena_iter->first->GetFirstObject(fault_page); @@ -3543,11 +3704,24 @@ void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool i updater.SingleObjectArena(fault_page + diff, gPageSize); } if (kMode == kCopyMode) { - MapUpdatedLinearAllocPage(fault_page, - fault_page + diff, - state_arr[page_idx], - updater.WasLastPageTouched()); - return; + if (updater.WasLastPageTouched()) { + state_arr[page_idx].store(PageState::kProcessed, std::memory_order_release); + state = PageState::kProcessed; + continue; + } else { + // If the page wasn't touched, then it means it is empty and + // is most likely not present on the shadow-side. Furthermore, + // since the shadow is also userfaultfd registered doing copy + // ioctl fails as the copy-from-user in the kernel will cause + // userfault. Instead, just map a zeropage, which is not only + // correct but also efficient as it avoids unnecessary memcpy + // in the kernel. + ZeropageIoctl( + fault_page, gPageSize, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false); + state_arr[page_idx].store(PageState::kProcessedAndMapped, + std::memory_order_release); + return; + } } } else { // Don't touch the page in this case (there is no reason to do so @@ -3564,21 +3738,21 @@ void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool i } } continue; - case PageState::kProcessing: - DCHECK_EQ(kMode, kMinorFaultMode); - if (state_arr[page_idx].compare_exchange_strong( - state, PageState::kProcessingAndMapping, std::memory_order_relaxed) && - !use_uffd_sigbus_) { - // Somebody else took or will take care of finishing the updates and - // then mapping the page. + case PageState::kProcessed: + // Map as many pages as possible in a single ioctl, without spending + // time freeing pages. + if (MapUpdatedLinearAllocPages(fault_page, + fault_page + diff, + state_arr + page_idx, + space_data->end_ - fault_page, + /*free_pages=*/false, + /*single_ioctl=*/true)) { return; } - continue; - case PageState::kProcessed: - // The page is processed but not mapped. We should map it. - break; - case PageState::kMutatorProcessing: - UNREACHABLE(); + // fault_page was not mapped by this thread (some other thread claimed + // it). Wait for it to be mapped before returning. + FALLTHROUGH_INTENDED; + case PageState::kProcessing: case PageState::kProcessingAndMapping: case PageState::kProcessedAndMapping: if (use_uffd_sigbus_) { @@ -3588,6 +3762,8 @@ void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool i continue; } return; + case PageState::kMutatorProcessing: + UNREACHABLE(); case PageState::kProcessedAndMapped: // Somebody else took care of the page. return; @@ -3611,12 +3787,53 @@ void MarkCompact::ProcessLinearAlloc() { GcVisitedArenaPool* arena_pool = static_cast<GcVisitedArenaPool*>(Runtime::Current()->GetLinearAllocArenaPool()); DCHECK_EQ(thread_running_gc_, Thread::Current()); + uint8_t* unmapped_range_start = nullptr; + uint8_t* unmapped_range_end = nullptr; + // Pointer to the linear-alloc space containing the current arena in the loop + // below. Also helps in ensuring that two arenas, which are contiguous in + // address space but are from different linear-alloc spaces, are not coalesced + // into one range for mapping purpose. 
+ LinearAllocSpaceData* space_data = nullptr; + Atomic<PageState>* state_arr = nullptr; + ptrdiff_t diff = 0; + + auto map_pages = + [this, &diff, &space_data, &unmapped_range_start, &unmapped_range_end, &state_arr]() { + DCHECK_NE(diff, 0); + DCHECK_NE(space_data, nullptr); + DCHECK_GE(unmapped_range_start, space_data->begin_); + DCHECK_LT(unmapped_range_start, space_data->end_); + DCHECK_GT(unmapped_range_end, space_data->begin_); + DCHECK_LE(unmapped_range_end, space_data->end_); + DCHECK_LT(unmapped_range_start, unmapped_range_end); + DCHECK_ALIGNED_PARAM(unmapped_range_end - unmapped_range_start, gPageSize); + size_t page_idx = DivideByPageSize(unmapped_range_start - space_data->begin_); + MapUpdatedLinearAllocPages(unmapped_range_start, + unmapped_range_start + diff, + state_arr + page_idx, + unmapped_range_end - unmapped_range_start, + /*free_pages=*/true, + /*single_ioctl=*/false); + }; for (auto& pair : linear_alloc_arenas_) { const TrackedArena* arena = pair.first; - size_t arena_size; - uint8_t* arena_begin; - ptrdiff_t diff; - bool others_processing; + size_t arena_size = arena->Size(); + uint8_t* arena_begin = arena->Begin(); + // linear_alloc_arenas_ is sorted on arena-begin. So we will get all arenas + // in that order. + DCHECK_LE(unmapped_range_end, arena_begin); + if (unmapped_range_end != nullptr && unmapped_range_end < arena_begin) { + map_pages(); + unmapped_range_end = nullptr; + } + if (unmapped_range_end == nullptr) { + unmapped_range_start = unmapped_range_end = arena_begin; + } + DCHECK_NE(unmapped_range_start, nullptr); + // It's ok to include all arenas in the unmapped range. Since the + // corresponding state bytes will be kUnprocessed, we will skip calling + // ioctl and madvise on arenas which are waiting to be deleted. + unmapped_range_end += arena_size; { // Acquire arena-pool's lock (in shared-mode) so that the arena being updated // does not get deleted at the same time. If this critical section is too @@ -3631,20 +3848,29 @@ void MarkCompact::ProcessLinearAlloc() { } uint8_t* last_byte = pair.second; DCHECK_ALIGNED_PARAM(last_byte, gPageSize); - others_processing = false; - arena_begin = arena->Begin(); - arena_size = arena->Size(); - // Find the linear-alloc space containing the arena - LinearAllocSpaceData* space_data = nullptr; - for (auto& data : linear_alloc_spaces_data_) { - if (data.begin_ <= arena_begin && arena_begin < data.end_) { - space_data = &data; - break; + if (space_data == nullptr || space_data->begin_ > arena_begin || + space_data->end_ <= arena_begin) { + // Handle the case where previous and current arenas are contiguous but + // belong to different spaces. 
+ if (space_data != nullptr && unmapped_range_start >= space_data->begin_ && + unmapped_range_start < space_data->end_) { + map_pages(); + unmapped_range_start = arena_begin; + unmapped_range_end = arena_begin + arena_size; + } + space_data = nullptr; + // Find the linear-alloc space containing the arena + for (auto& data : linear_alloc_spaces_data_) { + if (data.begin_ <= arena_begin && arena_begin < data.end_) { + space_data = &data; + break; + } } + diff = space_data->shadow_.Begin() - space_data->begin_; + state_arr = reinterpret_cast<Atomic<PageState>*>(space_data->page_status_map_.Begin()); } CHECK_NE(space_data, nullptr); - diff = space_data->shadow_.Begin() - space_data->begin_; - auto visitor = [space_data, last_byte, diff, this, &others_processing]( + auto visitor = [space_data, last_byte, diff, this, state_arr]( uint8_t* page_begin, uint8_t* first_obj, size_t page_size) REQUIRES_SHARED(Locks::mutator_lock_) { @@ -3656,16 +3882,12 @@ void MarkCompact::ProcessLinearAlloc() { LinearAllocPageUpdater updater(this); size_t page_idx = DivideByPageSize(page_begin - space_data->begin_); DCHECK_LT(page_idx, space_data->page_status_map_.Size()); - Atomic<PageState>* state_arr = - reinterpret_cast<Atomic<PageState>*>(space_data->page_status_map_.Begin()); PageState expected_state = PageState::kUnprocessed; - PageState desired_state = - minor_fault_initialized_ ? PageState::kProcessing : PageState::kProcessingAndMapping; // Acquire order to ensure that we don't start accessing the shadow page, // which is shared with other threads, prior to CAS. Also, for same // reason, we used 'release' order for changing the state to 'processed'. if (state_arr[page_idx].compare_exchange_strong( - expected_state, desired_state, std::memory_order_acquire)) { + expected_state, PageState::kProcessing, std::memory_order_acquire)) { // null first_obj indicates that it's a page from arena for // intern-table/class-table. So first object isn't required. if (first_obj != nullptr) { @@ -3676,8 +3898,18 @@ void MarkCompact::ProcessLinearAlloc() { } expected_state = PageState::kProcessing; if (!minor_fault_initialized_) { - MapUpdatedLinearAllocPage( - page_begin, page_begin + diff, state_arr[page_idx], updater.WasLastPageTouched()); + // Store is sufficient as no other thread could be modifying it. Use + // release order to ensure that the writes to shadow page are + // committed to memory before. + if (updater.WasLastPageTouched()) { + state_arr[page_idx].store(PageState::kProcessed, std::memory_order_release); + } else { + // See comment in ConcurrentlyProcessLinearAllocPage() with same situation. + ZeropageIoctl( + page_begin, gPageSize, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false); + // Ioctl will act as release fence. + state_arr[page_idx].store(PageState::kProcessedAndMapped, std::memory_order_relaxed); + } } else if (!state_arr[page_idx].compare_exchange_strong( expected_state, PageState::kProcessed, std::memory_order_release)) { DCHECK_EQ(expected_state, PageState::kProcessingAndMapping); @@ -3688,19 +3920,15 @@ void MarkCompact::ProcessLinearAlloc() { MapProcessedPages</*kFirstPageMapping=*/true>( page_begin, state_arr, page_idx, space_data->page_status_map_.Size()); } - } else { - others_processing = true; } }; arena->VisitRoots(visitor); } - // If we are not in minor-fault mode and if no other thread was found to be - // processing any pages in this arena, then we can madvise the shadow size. - // Otherwise, we will double the memory use for linear-alloc. 
- if (!minor_fault_initialized_ && !others_processing) { - ZeroAndReleaseMemory(arena_begin + diff, arena_size); - } + } + if (unmapped_range_end > unmapped_range_start) { + // Map remaining pages. + map_pages(); } } @@ -3718,6 +3946,9 @@ void MarkCompact::RegisterUffd(void* addr, size_t size, int mode) { << ". start:" << static_cast<void*>(addr) << " len:" << PrettySize(size); } +// TODO: sometime we may want to tolerate certain error conditions (like ENOMEM +// when we unregister the unused portion of the moving-space). Implement support +// for that. void MarkCompact::UnregisterUffd(uint8_t* start, size_t len) { DCHECK(IsValidFd(uffd_)); struct uffdio_range range; @@ -3767,7 +3998,8 @@ void MarkCompact::CompactionPhase() { // range. uint8_t* unused_first_page = bump_pointer_space_->Begin() + used_size; // It's ok if somebody else already mapped the page. - ZeropageIoctl(unused_first_page, /*tolerate_eexist*/ true, /*tolerate_enoent*/ false); + ZeropageIoctl( + unused_first_page, gPageSize, /*tolerate_eexist*/ true, /*tolerate_enoent*/ false); UnregisterUffd(unused_first_page, moving_space_size - used_size); } CompactMovingSpace<kCopyMode>(compaction_buffers_map_.Begin()); @@ -4462,7 +4694,6 @@ void MarkCompact::FinishPhase() { } } class_after_obj_ordered_map_.clear(); - delete[] moving_pages_status_; linear_alloc_arenas_.clear(); { ReaderMutexLock mu(thread_running_gc_, *Locks::mutator_lock_); diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h index 076249bf11..47b2e81e65 100644 --- a/runtime/gc/collector/mark_compact.h +++ b/runtime/gc/collector/mark_compact.h @@ -156,9 +156,11 @@ class MarkCompact final : public GarbageCollector { // In copy-mode of userfaultfd, we don't need to reach a 'processed' state as // it's given that processing thread also copies the page, thereby mapping it. - // The order is important as we may treat them as integers. + // The order is important as we may treat them as integers. Also + // 'kUnprocessed' should be set to 0 as we rely on madvise(dontneed) to return + // us zero'ed pages, which implicitly makes page-status initialized to 'kUnprocessed'. enum class PageState : uint8_t { - kUnprocessed = 0, // Not processed yet + kUnprocessed = 0, // Not processed yet. kProcessing = 1, // Being processed by GC thread and will not be mapped kProcessed = 2, // Processed but not mapped kProcessingAndMapping = 3, // Being processed by GC or mutator and will be mapped @@ -176,6 +178,7 @@ class MarkCompact final : public GarbageCollector { private: using ObjReference = mirror::CompressedReference<mirror::Object>; + static constexpr uint32_t kPageStateMask = (1 << BitSizeOf<uint8_t>()) - 1; // Number of bits (live-words) covered by a single chunk-info (below) // entry/word. // TODO: Since popcount is performed usomg SIMD instructions, we should @@ -346,12 +349,12 @@ class MarkCompact final : public GarbageCollector { void CompactMovingSpace(uint8_t* page) REQUIRES_SHARED(Locks::mutator_lock_); // Compact the given page as per func and change its state. Also map/copy the - // page, if required. + // page, if required. Returns true if the page was compacted, else false. 
template <int kMode, typename CompactionFn> - ALWAYS_INLINE void DoPageCompactionWithStateChange(size_t page_idx, - size_t status_arr_len, + ALWAYS_INLINE bool DoPageCompactionWithStateChange(size_t page_idx, uint8_t* to_space_page, uint8_t* page, + bool map_immediately, CompactionFn func) REQUIRES_SHARED(Locks::mutator_lock_); @@ -515,7 +518,17 @@ class MarkCompact final : public GarbageCollector { // feature. bool CanCompactMovingSpaceWithMinorFault(); - void FreeFromSpacePages(size_t cur_page_idx, int mode) REQUIRES_SHARED(Locks::mutator_lock_); + // Does the following: + // 1. Checks the status of to-space pages in [cur_page_idx, + // last_checked_reclaim_page_idx_) range to see whether the corresponding + // from-space pages can be reused. + // 2. Taking into consideration classes which are allocated after their + // objects (in address order), computes the page (in from-space) from which + // actual reclamation can be done. + // 3. Map the pages in [cur_page_idx, end_idx_for_mapping) range. + // 4. Madvise the pages in [page from (2), last_reclaimed_page_) + bool FreeFromSpacePages(size_t cur_page_idx, int mode, size_t end_idx_for_mapping) + REQUIRES_SHARED(Locks::mutator_lock_); // Maps processed pages (from moving space and linear-alloc) for uffd's // minor-fault feature. We try to 'claim' all processed (and unmapped) pages @@ -528,7 +541,14 @@ class MarkCompact final : public GarbageCollector { size_t arr_idx, size_t arr_len) REQUIRES_SHARED(Locks::mutator_lock_); + // Maps moving space pages in [arr_idx, arr_len) range. It fetches the page + // address containing the compacted content from moving_pages_status_ array. + // Returns number of bytes (should be multiple of page-size) that are mapped + // by the thread. + size_t MapMovingSpacePages(size_t arr_idx, size_t arr_len) REQUIRES_SHARED(Locks::mutator_lock_); + bool IsValidFd(int fd) const { return fd >= 0; } + // Add/update <class, obj> pair if class > obj and obj is the lowest address // object of class. ALWAYS_INLINE void UpdateClassAfterObjectMap(mirror::Object* obj) @@ -543,20 +563,27 @@ class MarkCompact final : public GarbageCollector { void MarkZygoteLargeObjects() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_); - void ZeropageIoctl(void* addr, bool tolerate_eexist, bool tolerate_enoent); - void CopyIoctl(void* dst, void* buffer); - // Called after updating a linear-alloc page to either map a zero-page if the - // page wasn't touched during updation, or map the page via copy-ioctl. And - // then updates the page's state to indicate the page is mapped. - void MapUpdatedLinearAllocPage(uint8_t* page, - uint8_t* shadow_page, - Atomic<PageState>& state, - bool page_touched); + void ZeropageIoctl(void* addr, size_t length, bool tolerate_eexist, bool tolerate_enoent); + void CopyIoctl(void* dst, void* buffer, size_t length); + + // Called after updating linear-alloc page(s) to map the page. It first + // updates the state of the pages to kProcessedAndMapping and after ioctl to + // kProcessedAndMapped. Returns true if at least one ioctl invocation was + // done. If 'free_pages' is true then also frees shadow pages. If 'single_ioctl' + // is true, then stops after first ioctl. + bool MapUpdatedLinearAllocPages(uint8_t* start_page, + uint8_t* start_shadow_page, + Atomic<PageState>* state, + size_t length, + bool free_pages, + bool single_ioctl); // Called for clamping of 'info_map_' and other GC data structures, which are // small and/or in >4GB address space. 
There is no real benefit of clamping // them synchronously during app forking. It clamps only if clamp_info_map_status_ // is set to kClampInfoPending, which is done by ClampGrowthLimit(). void MaybeClampGcStructures() REQUIRES(Locks::heap_bitmap_lock_); + + size_t ComputeInfoMapSize(); // Initialize all the info-map related fields of this GC. Returns total size // of all the structures in info-map. size_t InitializeInfoMap(uint8_t* p, size_t moving_space_sz); @@ -680,9 +707,24 @@ class MarkCompact final : public GarbageCollector { // bitmap but has all its super classes lower address order than itself. mirror::Class* walk_super_class_cache_; // Used by FreeFromSpacePages() for maintaining markers in the moving space for - // how far the pages have been reclaimed/checked. + // how far the pages have been reclaimed (madvised) and checked. + // + // Pages from this index to the end of to-space have been checked (via page_status) + // and their corresponding from-space pages are reclaimable. size_t last_checked_reclaim_page_idx_; + // All from-space pages in [last_reclaimed_page_, from_space->End()) are + // reclaimed (madvised). Pages in [from-space page corresponding to + // last_checked_reclaim_page_idx_, last_reclaimed_page_) are not reclaimed as + // they may contain classes required for class hierarchy traversal for + // visiting references during compaction. uint8_t* last_reclaimed_page_; + // All the pages in [last_reclaimable_page_, last_reclaimed_page_) in + // from-space are available to store compacted contents for batching until the + // next time madvise is called. + uint8_t* last_reclaimable_page_; + // [cur_reclaimable_page_, last_reclaimed_page_) have been used to store + // compacted contents for batching. + uint8_t* cur_reclaimable_page_; space::ContinuousSpace* non_moving_space_; space::BumpPointerSpace* const bump_pointer_space_; @@ -690,8 +732,11 @@ class MarkCompact final : public GarbageCollector { accounting::ContinuousSpaceBitmap* const moving_space_bitmap_; accounting::ContinuousSpaceBitmap* non_moving_space_bitmap_; Thread* thread_running_gc_; - // Array of moving-space's pages' compaction status. - Atomic<PageState>* moving_pages_status_; + // Array of moving-space's pages' compaction status, which is stored in the + // least-significant byte. kProcessed entries also contain the from-space + // offset of the page which contains the compacted contents of the ith + // to-space page. + Atomic<uint32_t>* moving_pages_status_; size_t vector_length_; size_t live_stack_freeze_size_; |
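The status-word layout that replaces Atomic<PageState> is easier to read in isolation: the low byte of each Atomic<uint32_t> entry holds the PageState, and for kProcessed entries the upper bits hold the page-aligned from-space offset of the buffer that already contains the compacted contents. A minimal sketch of that encoding; the helper names (EncodeProcessed, DecodeState, DecodeFromSpaceOffset) are illustrative and not part of the patch, and the enumerators past kProcessingAndMapping are elided as in the hunk above:

    #include <cstdint>

    enum class PageState : uint8_t {
      kUnprocessed = 0,           // Not processed yet.
      kProcessing = 1,            // Being processed by GC thread and will not be mapped.
      kProcessed = 2,             // Processed but not mapped.
      kProcessingAndMapping = 3,  // Being processed by GC or mutator and will be mapped.
      // ... kMutatorProcessing, kProcessedAndMapping, kProcessedAndMapped follow
      // in this order.
    };

    static constexpr uint32_t kPageStateMask = 0xff;  // == (1 << BitSizeOf<uint8_t>()) - 1

    // Page alignment guarantees the low byte of the offset is zero, so state and
    // offset can share one 32-bit word.
    inline uint32_t EncodeProcessed(uint32_t from_space_offset) {
      return from_space_offset | static_cast<uint32_t>(PageState::kProcessed);
    }
    inline PageState DecodeState(uint32_t status_word) {
      return static_cast<PageState>(static_cast<uint8_t>(status_word & kPageStateMask));
    }
    inline uint32_t DecodeFromSpaceOffset(uint32_t status_word) {
      return status_word & ~kPageStateMask;
    }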
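MapMovingSpacePages() and MapUpdatedLinearAllocPages() share one claiming protocol: walk the status array, CAS each kProcessed entry to a mapping state, and stop at the first entry another thread already owns; the claimed run is then mapped with a single ioctl and flipped to kProcessedAndMapped with relaxed stores, the ioctl itself acting as the fence. A simplified, stand-alone sketch of just the claiming loop, where the raw uint32_t state values are stand-ins for the patch's enumerators:

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    // Returns how many consecutive entries, starting at begin_idx, this thread
    // moved from 'processed' to 'mapping'. The caller maps exactly that many
    // pages with one batched UFFDIO_COPY and then marks them 'mapped'.
    size_t ClaimContiguousProcessedPages(std::atomic<uint32_t>* status,
                                         size_t begin_idx,
                                         size_t end_idx,
                                         uint32_t state_mask,
                                         uint32_t processed,
                                         uint32_t mapping) {
      size_t claimed = 0;
      for (size_t i = begin_idx; i < end_idx; i++, claimed++) {
        uint32_t cur = status[i].load(std::memory_order_relaxed);
        // acq_rel, as in the patch, so a later entry is never claimed as
        // 'mapping' after an earlier claim has already failed.
        if ((cur & state_mask) != processed ||
            !status[i].compare_exchange_strong(cur, mapping, std::memory_order_acq_rel)) {
          break;
        }
      }
      return claimed;
    }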
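The length parameter added to CopyIoctl()/ZeropageIoctl() exists because UFFDIO_COPY and UFFDIO_ZEROPAGE accept an arbitrary page-aligned length, so one system call can map an entire claimed run. A hedged sketch of such a batched copy, with the patch's CHECK failure reduced to a boolean return:

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <cstddef>
    #include <cstdint>

    // 'uffd' is a userfaultfd descriptor with the destination range registered;
    // 'length' is a multiple of the page size covering the whole claimed run.
    bool BatchedUffdCopy(int uffd, void* dst, void* src, size_t length) {
      struct uffdio_copy copy = {};
      copy.src = reinterpret_cast<uintptr_t>(src);
      copy.dst = reinterpret_cast<uintptr_t>(dst);
      copy.len = length;
      copy.mode = 0;
      if (ioctl(uffd, UFFDIO_COPY, &copy) != 0) {
        // copy.copy reports how many bytes were mapped before the failure, which
        // matters if partial success (e.g. EEXIST midway) has to be handled.
        return false;
      }
      return copy.copy == static_cast<int64_t>(length);
    }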
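Finally, a quick worked check of the new madvise batching in FreeFromSpacePages(), assuming 4 KiB pages (gPageSize is determined at runtime), with the constants taken from the patch:

    // kMinFromSpaceMadviseSize = 8 MB  -> a from-space range is released only once
    //                                     more than 2048 pages are reclaimable.
    // gBufferPages = 4 * gPageSize     -> 16 KiB at the low end of that range is
    //                                     kept resident; cur_reclaimable_page_
    //                                     hands these pages out as destination
    //                                     buffers for subsequently compacted pages,
    //                                     whose contents are later mapped in batch
    //                                     by MapMovingSpacePages().
    // madvised length                  -> (range size) - 16 KiB, still using
    //                                     MADV_REMOVE or MADV_DONTNEED as before.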