Make native GC-root updating concurrent with userfaultfd

Additionally, use userfaultfd's minor-fault feature for the moving
space.

Bug: 160737021
Test: ART_USE_READ_BARRIER=false art/test/testrunner/testrunner.py and module install
Change-Id: I98b0c69fba4aec1263b1f38cc9f31494fd5c8cf5
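
For reviewers unfamiliar with the feature, below is a minimal, illustrative
sketch (not part of this change) of the userfaultfd minor-fault flow this CL
builds on, assuming recent kernel headers; the helper name is hypothetical.
A shmem-backed range is registered for minor faults, the GC writes a page's
final contents through a shadow mapping of the same shmem, and the fault is
then resolved with UFFDIO_CONTINUE (no copy), unlike UFFDIO_COPY in copy mode.

  #include <fcntl.h>
  #include <linux/userfaultfd.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <cstddef>
  #include <cstdint>

  int SetupMinorFaultUffd(void* shmem_range, size_t len) {
    int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
    if (uffd < 0) return -1;
    struct uffdio_api api = {.api = UFFD_API,
                             .features = UFFD_FEATURE_MISSING_SHMEM | UFFD_FEATURE_MINOR_SHMEM};
    if (ioctl(uffd, UFFDIO_API, &api) != 0) return -1;
    struct uffdio_register reg;
    reg.range.start = reinterpret_cast<uintptr_t>(shmem_range);
    reg.range.len = len;
    reg.mode = UFFDIO_REGISTER_MODE_MINOR;  // fault even though the page is in the page cache
    if (ioctl(uffd, UFFDIO_REGISTER, &reg) != 0) return -1;
    return uffd;
  }

  // Resolving one minor fault once the page contents are ready:
  //   struct uffdio_continue cont;
  //   cont.range.start = page_addr;  cont.range.len = kPageSize;  cont.mode = 0;
  //   ioctl(uffd, UFFDIO_CONTINUE, &cont);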
diff --git a/libartbase/base/arena_allocator.cc b/libartbase/base/arena_allocator.cc
index 250a3d9..e5f2542 100644
--- a/libartbase/base/arena_allocator.cc
+++ b/libartbase/base/arena_allocator.cc
@@ -185,9 +185,6 @@
   MEMORY_TOOL_MAKE_NOACCESS(ptr, size);
 }
 
-Arena::Arena() : bytes_allocated_(0), memory_(nullptr), size_(0), next_(nullptr) {
-}
-
 size_t ArenaAllocator::BytesAllocated() const {
   return ArenaAllocatorStats::BytesAllocated();
 }
diff --git a/libartbase/base/arena_allocator.h b/libartbase/base/arena_allocator.h
index e340994..49c1461 100644
--- a/libartbase/base/arena_allocator.h
+++ b/libartbase/base/arena_allocator.h
@@ -178,7 +178,8 @@
 
 class Arena {
  public:
-  Arena();
+  Arena() : bytes_allocated_(0), memory_(nullptr), size_(0), next_(nullptr) {}
+
   virtual ~Arena() { }
   // Reset is for pre-use and uses memset for performance.
   void Reset();
@@ -188,9 +189,7 @@
     return memory_;
   }
 
-  uint8_t* End() {
-    return memory_ + size_;
-  }
+  uint8_t* End() const { return memory_ + size_; }
 
   size_t Size() const {
     return size_;
@@ -205,9 +204,7 @@
   }
 
   // Return true if ptr is contained in the arena.
-  bool Contains(const void* ptr) const {
-    return memory_ <= ptr && ptr < memory_ + bytes_allocated_;
-  }
+  bool Contains(const void* ptr) const { return memory_ <= ptr && ptr < memory_ + size_; }
 
   Arena* Next() const { return next_; }
 
diff --git a/libartbase/base/mem_map.cc b/libartbase/base/mem_map.cc
index aa07f1c..688325d 100644
--- a/libartbase/base/mem_map.cc
+++ b/libartbase/base/mem_map.cc
@@ -777,11 +777,11 @@
   return MemMap(tail_name, actual, tail_size, actual, tail_base_size, tail_prot, false);
 }
 
-MemMap MemMap::TakeReservedMemory(size_t byte_count) {
+MemMap MemMap::TakeReservedMemory(size_t byte_count, bool reuse) {
   uint8_t* begin = Begin();
   ReleaseReservedMemory(byte_count);  // Performs necessary DCHECK()s on this reservation.
   size_t base_size = RoundUp(byte_count, kPageSize);
-  return MemMap(name_, begin, byte_count, begin, base_size, prot_, /* reuse= */ false);
+  return MemMap(name_, begin, byte_count, begin, base_size, prot_, reuse);
 }
 
 void MemMap::ReleaseReservedMemory(size_t byte_count) {
diff --git a/libartbase/base/mem_map.h b/libartbase/base/mem_map.h
index 4c41388..28d1058 100644
--- a/libartbase/base/mem_map.h
+++ b/libartbase/base/mem_map.h
@@ -290,8 +290,9 @@
   // exceed the size of this reservation.
   //
   // Returns a mapping owning `byte_count` bytes rounded up to entire pages
-  // with size set to the passed `byte_count`.
-  MemMap TakeReservedMemory(size_t byte_count);
+  // with size set to the passed `byte_count`. If 'reuse' is true then the caller
+  // is responsible for unmapping the taken pages.
+  MemMap TakeReservedMemory(size_t byte_count, bool reuse = false);
 
   static bool CheckNoGaps(MemMap& begin_map, MemMap& end_map)
       REQUIRES(!MemMap::mem_maps_lock_);
@@ -321,6 +322,9 @@
   // in the parent process.
   void ResetInForkedProcess();
 
+  // 'redzone_size_ == 0' indicates that the memory tool is not being used on this mapping.
+  size_t GetRedzoneSize() const { return redzone_size_; }
+
  private:
   MemMap(const std::string& name,
          uint8_t* begin,
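An illustrative usage sketch of the new 'reuse' parameter, mirroring how this
patch later uses it for the moving-space shadow map: reserve a range via MemMap
(possibly in low-4GB), then take ownership of the raw pages so the range can be
grown with mremap without the MemMap destructor unmapping it. The function name
and header path are assumptions for the example.

  #include <sys/mman.h>
  #include <cstddef>
  #include <string>
  #include "base/mem_map.h"

  art::MemMap ReserveGrowableRange(size_t byte_count) {
    std::string err_msg;
    art::MemMap reservation = art::MemMap::MapAnonymous(
        "example-shadow", byte_count, PROT_NONE, /*low_4gb=*/true, &err_msg);
    if (!reservation.IsValid()) {
      return art::MemMap::Invalid();
    }
    // With reuse=true the returned MemMap does not unmap the range on
    // destruction; the caller resizes/releases the underlying pages itself.
    return reservation.TakeReservedMemory(reservation.Size(), /*reuse=*/true);
  }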
diff --git a/runtime/Android.bp b/runtime/Android.bp
index dbe11ab..fc9226e 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -581,6 +581,7 @@
         "gc/allocator/rosalloc.h",
         "gc/collector_type.h",
         "gc/collector/gc_type.h",
+        "gc/collector/mark_compact.h",
         "gc/space/region_space.h",
         "gc/space/space.h",
         "gc/weak_root_state.h",
diff --git a/runtime/art_field-inl.h b/runtime/art_field-inl.h
index d57110f..f6a99ac 100644
--- a/runtime/art_field-inl.h
+++ b/runtime/art_field-inl.h
@@ -70,6 +70,8 @@
   ArtField* first_field = &array->At(0);
   DCHECK_LE(static_cast<void*>(end_boundary), static_cast<void*>(first_field + array->size()));
   static constexpr size_t kFieldSize = sizeof(ArtField);
+  // Confirm the assumption that the ArtField size is a power of two, which the
+  // RoundUp below relies on.
   static_assert(IsPowerOfTwo(kFieldSize));
   uint8_t* declaring_class =
       reinterpret_cast<uint8_t*>(first_field) + DeclaringClassOffset().Int32Value();
diff --git a/runtime/barrier.cc b/runtime/barrier.cc
index d144591..a6cc9ba 100644
--- a/runtime/barrier.cc
+++ b/runtime/barrier.cc
@@ -40,6 +40,11 @@
   SetCountLocked(self, count_ - 1);
 }
 
+void Barrier::IncrementNoWait(Thread* self) {
+  MutexLock mu(self, *GetLock());
+  SetCountLocked(self, count_ + 1);
+}
+
 void Barrier::Wait(Thread* self) {
   Increment(self, -1);
 }
diff --git a/runtime/barrier.h b/runtime/barrier.h
index 432df76..4c94a14 100644
--- a/runtime/barrier.h
+++ b/runtime/barrier.h
@@ -51,6 +51,9 @@
 
   // Pass through the barrier, decrement the count but do not block.
   void Pass(Thread* self) REQUIRES(!GetLock());
+  // Increment the barrier but do not block. The caller must eventually
+  // balance this by decrementing/passing the barrier.
+  void IncrementNoWait(Thread* self) REQUIRES(!GetLock());
 
   // Decrement the count, then wait until the count is zero.
   void Wait(Thread* self) REQUIRES(!GetLock());
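A hypothetical usage sketch of the new API: a coordinator registers pending
work without blocking, and the worker balances it with Pass() when it finishes;
the variable names are illustrative.

  // Coordinator thread:
  Barrier barrier(/*count=*/0);
  Thread* self = Thread::Current();
  barrier.IncrementNoWait(self);   // count becomes 1; does not block
  // ... hand work to a worker thread, then later block until it is done:
  barrier.Increment(self, 0);      // Increment(delta) waits for the count to reach zero

  // Worker thread, when finished:
  //   barrier.Pass(worker_self);  // decrements the count, waking the coordinator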
diff --git a/runtime/base/gc_visited_arena_pool.cc b/runtime/base/gc_visited_arena_pool.cc
index dd29c7f..938dcfa 100644
--- a/runtime/base/gc_visited_arena_pool.cc
+++ b/runtime/base/gc_visited_arena_pool.cc
@@ -16,23 +16,16 @@
 
 #include "base/gc_visited_arena_pool.h"
 
-#include "base/arena_allocator-inl.h"
-#include "base/utils.h"
-
 #include <sys/mman.h>
 #include <sys/types.h>
 #include <unistd.h>
 
-namespace art {
+#include "base/arena_allocator-inl.h"
+#include "base/memfd.h"
+#include "base/utils.h"
+#include "gc/collector/mark_compact-inl.h"
 
-#if defined(__LP64__)
-// Use a size in multiples of 1GB as that can utilize the optimized mremap
-// page-table move.
-static constexpr size_t kLinearAllocPoolSize = 1 * GB;
-static constexpr size_t kLow4GBLinearAllocPoolSize = 32 * MB;
-#else
-static constexpr size_t kLinearAllocPoolSize = 32 * MB;
-#endif
+namespace art {
 
 TrackedArena::TrackedArena(uint8_t* start, size_t size) : Arena(), first_obj_array_(nullptr) {
   static_assert(ArenaAllocator::kArenaAlignment <= kPageSize,
@@ -48,7 +41,15 @@
 
 void TrackedArena::Release() {
   if (bytes_allocated_ > 0) {
-    ZeroAndReleasePages(Begin(), Size());
+    // Userfaultfd GC uses memfd mappings for linear-alloc, so MADV_DONTNEED
+    // will not free the pages from the page cache. Therefore use MADV_REMOVE,
+    // which is meant for this purpose.
+    if (!gUseUserfaultfd || (madvise(Begin(), Size(), MADV_REMOVE) == -1 && errno == EINVAL)) {
+      // MADV_REMOVE fails if invoked on an anonymous mapping, which could happen
+      // if the arena is released before userfaultfd-GC starts using memfd. So
+      // use MADV_DONTNEED.
+      ZeroAndReleasePages(Begin(), Size());
+    }
     std::fill_n(first_obj_array_.get(), Size() / kPageSize, nullptr);
     bytes_allocated_ = 0;
   }
@@ -76,18 +77,36 @@
     size = std::max(min_size, kLow4GBLinearAllocPoolSize);
   }
 #endif
+  Runtime* runtime = Runtime::Current();
+  gc::collector::MarkCompact* mark_compact = runtime->GetHeap()->MarkCompactCollector();
   std::string err_msg;
-  maps_.emplace_back(MemMap::MapAnonymous(name_,
-                                          size,
-                                          PROT_READ | PROT_WRITE,
-                                          low_4gb_,
-                                          &err_msg));
+  bool mapped_shared;
+  // We use MAP_SHARED in non-zygote processes to leverage userfaultfd's minor-fault feature.
+  if (gUseUserfaultfd && !runtime->IsZygote() && mark_compact->IsUffdMinorFaultSupported()) {
+    maps_.emplace_back(MemMap::MapFile(size,
+                                       PROT_READ | PROT_WRITE,
+                                       MAP_ANONYMOUS | MAP_SHARED,
+                                       -1,
+                                       /*start=*/0,
+                                       low_4gb_,
+                                       name_,
+                                       &err_msg));
+    mapped_shared = true;
+  } else {
+    maps_.emplace_back(
+        MemMap::MapAnonymous(name_, size, PROT_READ | PROT_WRITE, low_4gb_, &err_msg));
+    mapped_shared = false;
+  }
+
   MemMap& map = maps_.back();
   if (!map.IsValid()) {
-    LOG(FATAL) << "Failed to allocate " << name_
-               << ": " << err_msg;
+    LOG(FATAL) << "Failed to allocate " << name_ << ": " << err_msg;
     UNREACHABLE();
   }
+  if (gUseUserfaultfd) {
+    // Create a shadow map for the map being added, for use by the userfaultfd GC.
+    mark_compact->AddLinearAllocSpaceData(map.Begin(), map.Size(), mapped_shared);
+  }
   Chunk* chunk = new Chunk(map.Begin(), map.Size());
   best_fit_allocs_.insert(chunk);
   free_chunks_.insert(chunk);
@@ -251,4 +270,3 @@
 }
 
 }  // namespace art
-
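Hedged sketch of the release pattern used in TrackedArena::Release() above, as
a standalone helper (name hypothetical): MADV_REMOVE punches a hole in
shmem/memfd-backed memory and frees the page-cache pages, while private
anonymous mappings reject it with EINVAL and need the zero-and-MADV_DONTNEED
fallback instead.

  #include <string.h>
  #include <sys/mman.h>
  #include <cstddef>
  #include <cstdint>

  void ReleaseArenaPages(uint8_t* begin, size_t size) {
    if (madvise(begin, size, MADV_REMOVE) == 0) {
      return;  // shmem/memfd backing: pages dropped from the page cache
    }
    // Anonymous-private mapping (EINVAL above): zero the range so stale data
    // is not observed, then let the kernel reclaim the pages.
    memset(begin, 0, size);
    madvise(begin, size, MADV_DONTNEED);
  }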
diff --git a/runtime/base/gc_visited_arena_pool.h b/runtime/base/gc_visited_arena_pool.h
index 7dc79af..7a5f334 100644
--- a/runtime/base/gc_visited_arena_pool.h
+++ b/runtime/base/gc_visited_arena_pool.h
@@ -32,6 +32,8 @@
 // An Arena which tracks its allocations.
 class TrackedArena final : public Arena {
  public:
+  // Used for searching in maps. Only the arena's starting address is relevant.
+  explicit TrackedArena(uint8_t* addr) { memory_ = addr; }
   TrackedArena(uint8_t* start, size_t size);
 
   template <typename PageVisitor>
@@ -45,6 +47,28 @@
     }
   }
 
+  // Return the address of the first page whose first_obj is nullptr.
+  uint8_t* GetLastUsedByte() const REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_ALIGNED(Begin(), kPageSize);
+    DCHECK_ALIGNED(End(), kPageSize);
+    // Jump past bytes-allocated for arenas which are not currently being used
+    // by the arena-allocator. This helps reduce the loop iterations below.
+    uint8_t* last_byte = AlignUp(Begin() + GetBytesAllocated(), kPageSize);
+    DCHECK_LE(last_byte, End());
+    for (size_t i = (last_byte - Begin()) / kPageSize;
+         last_byte < End() && first_obj_array_[i] != nullptr;
+         last_byte += kPageSize, i++) {
+      // No body.
+    }
+    return last_byte;
+  }
+
+  uint8_t* GetFirstObject(uint8_t* addr) const REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_LE(Begin(), addr);
+    DCHECK_GT(End(), addr);
+    return first_obj_array_[(addr - Begin()) / kPageSize];
+  }
+
   // Set 'obj_begin' in first_obj_array_ in every element for which it's the
   // first object.
   void SetFirstObject(uint8_t* obj_begin, uint8_t* obj_end);
@@ -62,6 +86,15 @@
 // range to avoid multiple calls to mremapped/mprotected syscalls.
 class GcVisitedArenaPool final : public ArenaPool {
  public:
+#if defined(__LP64__)
+  // Use a size in multiples of 1GB as that can utilize the optimized mremap
+  // page-table move.
+  static constexpr size_t kLinearAllocPoolSize = 1 * GB;
+  static constexpr size_t kLow4GBLinearAllocPoolSize = 32 * MB;
+#else
+  static constexpr size_t kLinearAllocPoolSize = 32 * MB;
+#endif
+
   explicit GcVisitedArenaPool(bool low_4gb = false, const char* name = "LinearAlloc");
   virtual ~GcVisitedArenaPool();
   Arena* AllocArena(size_t size) override;
@@ -79,6 +112,14 @@
     }
   }
 
+  template <typename Callback>
+  void ForEachAllocatedArena(Callback cb) REQUIRES_SHARED(Locks::mutator_lock_) {
+    std::lock_guard<std::mutex> lock(lock_);
+    for (auto& arena : allocated_arenas_) {
+      cb(arena);
+    }
+  }
+
  private:
   void FreeRangeLocked(uint8_t* range_begin, size_t range_size) REQUIRES(lock_);
   // Add a map to the pool of at least min_size
@@ -102,9 +143,8 @@
    public:
     // Since two chunks could have the same size, use addr when that happens.
     bool operator()(const Chunk* a, const Chunk* b) const {
-      return std::less<size_t>{}(a->size_, b->size_)
-             || (std::equal_to<size_t>{}(a->size_, b->size_)
-                 && std::less<uint8_t*>{}(a->addr_, b->addr_));
+      return a->size_ < b->size_ ||
+             (a->size_ == b->size_ && std::less<uint8_t*>{}(a->addr_, b->addr_));
     }
   };
 
@@ -123,9 +163,7 @@
   std::set<Chunk*, LessByChunkAddr> free_chunks_ GUARDED_BY(lock_);
   // Set of allocated arenas. It's required to be able to find the arena
   // corresponding to a given address.
-  // TODO: We can manage without this set if we decide to have a large
-  // 'first-object' array for the entire space, instead of per arena. Analyse
-  // which approach is better.
+  // TODO: consider using HashSet, which is more memory efficient.
   std::set<TrackedArena, LessByArenaAddr> allocated_arenas_ GUARDED_BY(lock_);
   // Number of bytes allocated so far.
   size_t bytes_allocated_ GUARDED_BY(lock_);
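A minimal sketch (hypothetical helper, assumed semantics) of how the per-page
first-object index added above is meant to be consumed: for an address in the
arena, page i = (addr - Begin()) / kPageSize and first_obj_array_[i] is the
first object starting on (or spanning into) that page, so a visitor can update
a single page without walking from the arena's start.

  void VisitSinglePage(TrackedArena& arena, uint8_t* page_begin)
      REQUIRES_SHARED(Locks::mutator_lock_) {
    DCHECK_ALIGNED(page_begin, kPageSize);
    uint8_t* first_obj = arena.GetFirstObject(page_begin);
    if (first_obj != nullptr) {
      // Walk objects starting at 'first_obj', clamping each visit to
      // [page_begin, page_begin + kPageSize), as LinearAllocPageUpdater does.
    }
  }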
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 7a68863..dc2ccb4 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -3515,7 +3515,7 @@
   }
 
   // Method shouldn't have already been linked.
-  DCHECK(method->GetEntryPointFromQuickCompiledCode() == nullptr);
+  DCHECK_EQ(method->GetEntryPointFromQuickCompiledCode(), nullptr);
   DCHECK(!method->GetDeclaringClass()->IsVisiblyInitialized());  // Actually ClassStatus::Idx.
 
   if (!method->IsInvokable()) {
diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc
index 71e5a13..4dfba3c 100644
--- a/runtime/gc/collector/mark_compact.cc
+++ b/runtime/gc/collector/mark_compact.cc
@@ -14,10 +14,11 @@
  * limitations under the License.
  */
 
-#include "mark_compact-inl.h"
+#include <fcntl.h>
 
 #include "android-base/file.h"
 #include "android-base/properties.h"
+#include "base/memfd.h"
 #include "base/quasi_atomic.h"
 #include "base/systrace.h"
 #include "base/utils.h"
@@ -28,16 +29,21 @@
 #include "gc/task_processor.h"
 #include "gc/verification-inl.h"
 #include "jit/jit_code_cache.h"
+#include "mark_compact-inl.h"
 #include "mirror/object-refvisitor-inl.h"
 #include "read_barrier_config.h"
 #include "scoped_thread_state_change-inl.h"
 #include "sigchain.h"
 #include "thread_list.h"
-
+// Glibc v2.19 doesn't include these in fcntl.h, so host builds will fail without them.
+#if !defined(FALLOC_FL_PUNCH_HOLE) || !defined(FALLOC_FL_KEEP_SIZE)
+#include <linux/falloc.h>
+#endif
 #include <linux/userfaultfd.h>
 #include <poll.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/resource.h>
 #include <unistd.h>
 
 #include <fstream>
@@ -47,6 +53,9 @@
 #ifndef MREMAP_DONTUNMAP
 #define MREMAP_DONTUNMAP 4
 #endif
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE 0x100000
+#endif
 #ifndef __NR_userfaultfd
 #if defined(__x86_64__)
 #define __NR_userfaultfd 323
@@ -70,8 +79,6 @@
 
 namespace art {
 
-// We require MREMAP_DONTUNMAP functionality of the mremap syscall, which was
-// introduced in 5.13 kernel version.
 static bool HaveMremapDontunmap() {
   void* old = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
   CHECK_NE(old, MAP_FAILED);
@@ -84,14 +91,9 @@
     return false;
   }
 }
+// We require the MREMAP_DONTUNMAP functionality of the mremap syscall, which
+// was introduced in kernel version 5.13 but has been backported to GKI kernels.
 static bool gHaveMremapDontunmap = IsKernelVersionAtLeast(5, 13) || HaveMremapDontunmap();
-
-// Concurrent compaction termination logic depends on the kernel having
-// the fault-retry feature (allowing repeated faults on the same page), which was
-// introduced in 5.7. On Android this feature is backported on all the kernels where
-// userfaultfd is enabled.
-static const bool gKernelHasFaultRetry = kIsTargetAndroid || IsKernelVersionAtLeast(5, 7);
-
 // The other cases are defined as constexpr in runtime/read_barrier_config.h
 #if !defined(ART_FORCE_USE_READ_BARRIER) && defined(ART_USE_READ_BARRIER)
 // Returns collector type asked to be used on the cmdline.
@@ -114,19 +116,20 @@
 }
 
 static bool KernelSupportsUffd() {
-  int fd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
-  // On non-android devices we may not have the kernel patches that restrict
-  // userfaultfd to user mode. But that is not a security concern as we are
-  // on host. Therefore, attempt one more time without UFFD_USER_MODE_ONLY.
-  if (!kIsTargetAndroid && fd == -1 && errno == EINVAL) {
-    fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+  if (gHaveMremapDontunmap) {
+    int fd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
+    // On non-android devices we may not have the kernel patches that restrict
+    // userfaultfd to user mode. But that is not a security concern as we are
+    // on host. Therefore, attempt one more time without UFFD_USER_MODE_ONLY.
+    if (!kIsTargetAndroid && fd == -1 && errno == EINVAL) {
+      fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+    }
+    if (fd >= 0) {
+      close(fd);
+      return true;
+    }
   }
-  if (fd >= 0) {
-    close(fd);
-    return true;
-  } else {
-    return false;
-  }
+  return false;
 }
 
 static bool ShouldUseUserfaultfd() {
@@ -157,9 +160,17 @@
 // significantly.
 static constexpr bool kCheckLocks = kDebugLocking;
 static constexpr bool kVerifyRootsMarked = kIsDebugBuild;
+// Two threads should suffice on devices.
+static constexpr size_t kMaxNumUffdWorkers = 2;
+// Concurrent compaction termination logic works if the kernel has the fault-retry feature
+// (allowing repeated faults on the same page), which was introduced in 5.7.
+// Otherwise, the kernel retries a pagefault only once, so having 2 or fewer
+// workers also suffices, as the termination logic requires (n-1) pagefault
+// retries.
+static const bool gKernelHasFaultRetry = kMaxNumUffdWorkers <= 2 || IsKernelVersionAtLeast(5, 7);
 
 bool MarkCompact::CreateUserfaultfd(bool post_fork) {
-  if (post_fork || uffd_ == -1) {
+  if (post_fork || uffd_ == kFdUnused) {
     // Don't use O_NONBLOCK as we rely on read waiting on uffd_ if there isn't
     // any read event available. We don't use poll.
     if (gKernelHasFaultRetry) {
@@ -175,11 +186,18 @@
         LOG(WARNING) << "Userfaultfd isn't supported (reason: " << strerror(errno)
                      << ") and therefore falling back to stop-the-world compaction.";
       } else {
-        DCHECK_GE(uffd_, 0);
+        DCHECK(IsValidFd(uffd_));
         // Get/update the features that we want in userfaultfd
-        struct uffdio_api api = {.api = UFFD_API, .features = 0};
+        struct uffdio_api api = {.api = UFFD_API,
+                                 .features = UFFD_FEATURE_MISSING_SHMEM | UFFD_FEATURE_MINOR_SHMEM};
         CHECK_EQ(ioctl(uffd_, UFFDIO_API, &api), 0)
               << "ioctl_userfaultfd: API: " << strerror(errno);
+        // Missing-fault support on shmem should always be available.
+        DCHECK_NE(api.features & UFFD_FEATURE_MISSING_SHMEM, 0u);
+        uffd_minor_fault_supported_ =
+            gHaveMremapDontunmap && (api.features & UFFD_FEATURE_MINOR_SHMEM) != 0;
+        // TODO: Assert that minor-fault support isn't available only on 32-bit
+        // kernel.
       }
     } else {
       // Without fault-retry feature in the kernel we can't terminate concurrent
@@ -188,7 +206,7 @@
     }
   }
   uffd_initialized_ = !post_fork || uffd_ == kFallbackMode;
-  return uffd_ >= 0;
+  return IsValidFd(uffd_);
 }
 
 template <size_t kAlignment>
@@ -199,14 +217,19 @@
 }
 
 MarkCompact::MarkCompact(Heap* heap)
-        : GarbageCollector(heap, "concurrent mark compact"),
-          gc_barrier_(0),
-          mark_stack_lock_("mark compact mark stack lock", kMarkSweepMarkStackLock),
-          bump_pointer_space_(heap->GetBumpPointerSpace()),
-          uffd_(-1),
-          thread_pool_counter_(0),
-          compacting_(false),
-          uffd_initialized_(false) {
+    : GarbageCollector(heap, "concurrent mark compact"),
+      gc_barrier_(0),
+      mark_stack_lock_("mark compact mark stack lock", kMarkSweepMarkStackLock),
+      bump_pointer_space_(heap->GetBumpPointerSpace()),
+      moving_to_space_fd_(kFdUnused),
+      moving_from_space_fd_(kFdUnused),
+      uffd_(kFdUnused),
+      thread_pool_counter_(0),
+      compaction_in_progress_count_(0),
+      compacting_(false),
+      uffd_initialized_(false),
+      uffd_minor_fault_supported_(false),
+      minor_fault_initialized_(false) {
   // TODO: Depending on how the bump-pointer space move is implemented. If we
   // switch between two virtual memories each time, then we will have to
   // initialize live_words_bitmap_ accordingly.
@@ -229,7 +252,7 @@
                                    /*low_4gb=*/ false,
                                    &err_msg);
   if (UNLIKELY(!info_map_.IsValid())) {
-    LOG(ERROR) << "Failed to allocate concurrent mark-compact chunk-info vector: " << err_msg;
+    LOG(FATAL) << "Failed to allocate concurrent mark-compact chunk-info vector: " << err_msg;
   } else {
     uint8_t* p = info_map_.Begin();
     chunk_info_vec_ = reinterpret_cast<uint32_t*>(p);
@@ -245,36 +268,79 @@
     pre_compact_offset_moving_space_ = reinterpret_cast<uint32_t*>(p);
   }
 
+  // NOTE: PROT_NONE is used here as these mappings are for address space reservation
+  // only and will be used only after appropriately remapping them.
   from_space_map_ = MemMap::MapAnonymous("Concurrent mark-compact from-space",
                                          bump_pointer_space_->Capacity(),
                                          PROT_NONE,
                                          /*low_4gb=*/ kObjPtrPoisoning,
                                          &err_msg);
   if (UNLIKELY(!from_space_map_.IsValid())) {
-    LOG(ERROR) << "Failed to allocate concurrent mark-compact from-space" << err_msg;
+    LOG(FATAL) << "Failed to allocate concurrent mark-compact from-space" << err_msg;
   } else {
     from_space_begin_ = from_space_map_.Begin();
   }
 
-  // poisoning requires 32-bit pointers and therefore compaction buffers on
-  // the stack can't be used. We also use the first page-sized buffer for the
-  // purpose of terminating concurrent compaction.
-  const size_t num_pages = 1 + std::max(heap_->GetParallelGCThreadCount(),
-                                        heap_->GetConcGCThreadCount());
+  // In some cases (32-bit or kObjPtrPoisoning) it's too much to ask for 3
+  // heap-sized mappings in low-4GB. So tolerate failure here by attempting to
+  // mmap again right before the compaction pause, and if even that fails, run
+  // the GC cycle in copy-mode rather than minor-fault mode.
+  //
+  // This map doesn't have to be aligned to 2MB as we don't mremap on it.
+  shadow_to_space_map_ = MemMap::MapAnonymous("Concurrent mark-compact moving-space shadow",
+                                              bump_pointer_space_->Capacity(),
+                                              PROT_NONE,
+                                              /*low_4gb=*/kObjPtrPoisoning,
+                                              &err_msg);
+  if (!shadow_to_space_map_.IsValid()) {
+    LOG(WARNING) << "Failed to allocate concurrent mark-compact moving-space shadow: " << err_msg;
+  }
+  const size_t num_pages = 1 + std::min(heap_->GetParallelGCThreadCount(), kMaxNumUffdWorkers);
   compaction_buffers_map_ = MemMap::MapAnonymous("Concurrent mark-compact compaction buffers",
-                                                 kPageSize * (kObjPtrPoisoning ? num_pages : 1),
+                                                 kPageSize * num_pages,
                                                  PROT_READ | PROT_WRITE,
-                                                 /*low_4gb=*/ kObjPtrPoisoning,
+                                                 /*low_4gb=*/kObjPtrPoisoning,
                                                  &err_msg);
   if (UNLIKELY(!compaction_buffers_map_.IsValid())) {
-    LOG(ERROR) << "Failed to allocate concurrent mark-compact compaction buffers" << err_msg;
+    LOG(FATAL) << "Failed to allocate concurrent mark-compact compaction buffers" << err_msg;
   }
+  // We also use the first page-sized buffer to terminate concurrent compaction.
   conc_compaction_termination_page_ = compaction_buffers_map_.Begin();
-  if (kObjPtrPoisoning) {
-    // Touch the page deliberately to avoid userfaults on it. We madvise it in
-    // CompactionPhase() before using it to terminate concurrent compaction.
-    CHECK_EQ(*conc_compaction_termination_page_, 0);
+  // Touch the page deliberately to avoid userfaults on it. We madvise it in
+  // CompactionPhase() before using it to terminate concurrent compaction.
+  CHECK_EQ(*conc_compaction_termination_page_, 0);
+  // In most cases we don't expect more than one LinearAlloc space.
+  linear_alloc_spaces_data_.reserve(1);
+}
+
+void MarkCompact::AddLinearAllocSpaceData(uint8_t* begin, size_t len, bool already_shared) {
+  DCHECK_ALIGNED(begin, kPageSize);
+  DCHECK_ALIGNED(len, kPageSize);
+  std::string err_msg;
+  MemMap shadow(MemMap::MapAnonymous("linear-alloc shadow map",
+                                     len,
+                                     PROT_NONE,
+                                     /*low_4gb=*/false,
+                                     &err_msg));
+  if (!shadow.IsValid()) {
+    LOG(FATAL) << "Failed to allocate linear-alloc shadow map: " << err_msg;
+    UNREACHABLE();
   }
+
+  MemMap page_status_map(MemMap::MapAnonymous("linear-alloc page-status map",
+                                              len / kPageSize,
+                                              PROT_READ | PROT_WRITE,
+                                              /*low_4gb=*/false,
+                                              &err_msg));
+  if (!page_status_map.IsValid()) {
+    LOG(FATAL) << "Failed to allocate linear-alloc page-status shadow map: " << err_msg;
+    UNREACHABLE();
+  }
+  linear_alloc_spaces_data_.emplace_back(std::forward<MemMap>(shadow),
+                                         std::forward<MemMap>(page_status_map),
+                                         begin,
+                                         begin + len,
+                                         already_shared);
 }
 
 void MarkCompact::BindAndResetBitmaps() {
@@ -342,6 +408,9 @@
   from_space_slide_diff_ = from_space_begin_ - bump_pointer_space_->Begin();
   black_allocations_begin_ = bump_pointer_space_->Limit();
   compacting_ = false;
+  // TODO: Would it suffice to read it once in the constructor, which is called
+  // in the zygote process?
+  pointer_size_ = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
 }
 
 void MarkCompact::RunPhases() {
@@ -381,7 +450,7 @@
     heap_->ThreadFlipEnd(self);
   }
 
-  if (uffd_ >= 0) {
+  if (IsValidFd(uffd_)) {
     ReaderMutexLock mu(self, *Locks::mutator_lock_);
     CompactionPhase();
   }
@@ -544,26 +613,28 @@
   non_moving_first_objs_count_ = page_idx;
 }
 
+bool MarkCompact::CanCompactMovingSpaceWithMinorFault() {
+  size_t min_size = (moving_first_objs_count_ + black_page_count_) * kPageSize;
+  return minor_fault_initialized_ && shadow_to_space_map_.IsValid() &&
+         shadow_to_space_map_.Size() >= min_size;
+}
+
 class MarkCompact::ConcurrentCompactionGcTask : public SelfDeletingTask {
  public:
   explicit ConcurrentCompactionGcTask(MarkCompact* collector, size_t idx)
       : collector_(collector), index_(idx) {}
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wframe-larger-than="
   void Run(Thread* self ATTRIBUTE_UNUSED) override REQUIRES_SHARED(Locks::mutator_lock_) {
-    // The passed page/buf to ConcurrentCompaction is used by the thread as a
-    // kPageSize buffer for compacting and updating objects into and then
-    // passing the buf to uffd ioctls.
-    if (kObjPtrPoisoning) {
-      uint8_t* page = collector_->compaction_buffers_map_.Begin() + index_ * kPageSize;
-      collector_->ConcurrentCompaction(page);
+    if (collector_->CanCompactMovingSpaceWithMinorFault()) {
+      collector_->ConcurrentCompaction<MarkCompact::kMinorFaultMode>(/*buf=*/nullptr);
     } else {
-      uint8_t buf[kPageSize];
-      collector_->ConcurrentCompaction(buf);
+      // The page/buf passed to ConcurrentCompaction is used by the thread as a
+      // kPageSize buffer into which objects are compacted and updated; it is
+      // then passed to the uffd ioctls.
+      uint8_t* buf = collector_->compaction_buffers_map_.Begin() + index_ * kPageSize;
+      collector_->ConcurrentCompaction<MarkCompact::kCopyMode>(buf);
     }
   }
-#pragma clang diagnostic pop
 
  private:
   MarkCompact* const collector_;
@@ -635,6 +706,7 @@
   // The chunk-info vector entries for the post marking-pause allocations will be
   // also updated in the pre-compaction pause.
 
+  bool is_zygote = Runtime::Current()->IsZygote();
   if (!uffd_initialized_ && CreateUserfaultfd(/*post_fork*/false)) {
     // Register the buffer that we use for terminating concurrent compaction
     struct uffdio_register uffd_register;
@@ -643,6 +715,18 @@
     uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING;
     CHECK_EQ(ioctl(uffd_, UFFDIO_REGISTER, &uffd_register), 0)
           << "ioctl_userfaultfd: register compaction termination page: " << strerror(errno);
+
+    // uffd_minor_fault_supported_ would be set appropriately in
+    // CreateUserfaultfd() above.
+    if (!uffd_minor_fault_supported_ && shadow_to_space_map_.IsValid()) {
+      // A valid shadow-map for moving space is only possible if we
+      // were able to map it in the constructor. That also means that its size
+      // matches the moving-space.
+      CHECK_EQ(shadow_to_space_map_.Size(), bump_pointer_space_->Capacity());
+      // Release the shadow map for moving-space if we don't support minor-fault
+      // as it's not required.
+      shadow_to_space_map_.Reset();
+    }
   }
   // For zygote we create the thread pool each time before starting compaction,
   // and get rid of it when finished. This is expected to happen rarely as
@@ -650,15 +734,191 @@
   if (uffd_ != kFallbackMode) {
     ThreadPool* pool = heap_->GetThreadPool();
     if (UNLIKELY(pool == nullptr)) {
-      heap_->CreateThreadPool();
+      // On devices with 2 cores, GetParallelGCThreadCount() will return 1,
+      // which is the desired number of workers on such devices.
+      heap_->CreateThreadPool(std::min(heap_->GetParallelGCThreadCount(), kMaxNumUffdWorkers));
       pool = heap_->GetThreadPool();
     }
-    const size_t num_threads = pool->GetThreadCount();
+    size_t num_threads = pool->GetThreadCount();
     thread_pool_counter_ = num_threads;
     for (size_t i = 0; i < num_threads; i++) {
       pool->AddTask(thread_running_gc_, new ConcurrentCompactionGcTask(this, i + 1));
     }
     CHECK_EQ(pool->GetTaskCount(thread_running_gc_), num_threads);
+
+    /*
+     * Possible scenarios for mappings:
+     * A) All zygote GCs (or if minor-fault feature isn't available): uses
+     * uffd's copy mode
+     *  1) For moving-space ('to' space is same as the moving-space):
+     *    a) Private-anonymous mappings for 'to' and 'from' space are created in
+     *    the constructor.
+     *    b) In the compaction pause, we mremap(dontunmap) from 'to' space to
+     *    'from' space. This results in moving all pages to 'from' space and
+     *    emptying the 'to' space, thereby preparing it for userfaultfd
+     *    registration.
+     *
+     *  2) For linear-alloc space:
+     *    a) Private-anonymous mappings for the linear-alloc and its 'shadow'
+     *    are created by the arena-pool.
+     *    b) In the compaction pause, we mremap(dontunmap) with a similar effect as
+     *    (A.1.b) above.
+     *
+     * B) First GC after zygote: uses uffd's copy-mode
+     *  1) For moving-space:
+     *    a) If the mmap for shadow-map has been successful in the constructor,
+     *    then we remap it (mmap with MAP_FIXED) to get a shared-anonymous
+     *    mapping.
+     *    b) Else, we create two memfds and ftruncate them to the moving-space
+     *    size.
+     *    c) Same as (A.1.b)
+     *    d) If (B.1.a), then mremap(dontunmap) from shadow-map to
+     *    'to' space. This will make both of them map to the same pages
+     *    e) If (B.1.b), then mmap with the first memfd in shared mode on the
+     *    'to' space.
+     *    f) At the end of compaction, we will have moved the moving-space
+     *    objects to a MAP_SHARED mapping, readying it for minor-fault from next
+     *    GC cycle.
+     *
+     *  2) For linear-alloc space:
+     *    a) Same as (A.2.b)
+     *    b) mmap a shared-anonymous mapping onto the linear-alloc space.
+     *    c) Same as (B.1.f)
+     *
+     * C) All subsequent GCs: preferably minor-fault mode, but they may also
+     * require using copy-mode.
+     *  1) For moving-space:
+     *    a) If the shadow-map is created and no memfd was used, then that means
+     *    we are using shared-anonymous. Therefore, mmap a shared-anonymous
+     *    mapping on the shadow-space.
+     *    b) If the shadow-map is not mapped yet, then mmap one with a size
+     *    big enough to hold the compacted moving space. This may fail, in which
+     *    case we will use uffd's copy-mode.
+     *    c) If (b) is successful, then mmap the free memfd onto shadow-map.
+     *    d) Same as (A.1.b)
+     *    e) In compaction pause, if the shadow-map was not created, then use
+     *    copy-mode.
+     *    f) Else, if the created map is smaller than the required-size, then
+     *    use mremap (without dontunmap) to expand the size. If failed, then use
+     *    copy-mode.
+     *    g) Otherwise, same as (B.1.d) and use minor-fault mode.
+     *
+     *  2) For linear-alloc space:
+     *    a) Same as (A.2.b)
+     *    b) Use minor-fault mode
+     */
+    auto mmap_shadow_map = [this](int flags, int fd) {
+      void* ret = mmap(shadow_to_space_map_.Begin(),
+                       shadow_to_space_map_.Size(),
+                       PROT_READ | PROT_WRITE,
+                       flags,
+                       fd,
+                       /*offset=*/0);
+      DCHECK_NE(ret, MAP_FAILED) << "mmap for moving-space shadow failed:" << strerror(errno);
+    };
+    // Setup all the virtual memory ranges required for concurrent compaction.
+    if (minor_fault_initialized_) {
+      DCHECK(!is_zygote);
+      if (UNLIKELY(!shadow_to_space_map_.IsValid())) {
+        // This case happens only once on the first GC in minor-fault mode, if
+        // we were unable to reserve shadow-map for moving-space in the
+        // beginning.
+        DCHECK_GE(moving_to_space_fd_, 0);
+        // Take an extra 4MB to reduce the likelihood of having to resize this
+        // map in the pause due to black allocations.
+        size_t reqd_size = std::min(moving_first_objs_count_ * kPageSize + 4 * MB,
+                                    bump_pointer_space_->Capacity());
+        // We cannot support memory-tool with shadow-map (as it requires
+        // appending a redzone) in this case because the mapping may have to be expanded
+        // using mremap (in KernelPreparation()), which would ignore the redzone.
+        // MemMap::MapFile() appends a redzone, but MemMap::MapAnonymous() doesn't.
+        std::string err_msg;
+        shadow_to_space_map_ = MemMap::MapAnonymous("moving-space-shadow",
+                                                    reqd_size,
+                                                    PROT_NONE,
+                                                    /*low_4gb=*/kObjPtrPoisoning,
+                                                    &err_msg);
+
+        if (shadow_to_space_map_.IsValid()) {
+          CHECK(!kMemoryToolAddsRedzones || shadow_to_space_map_.GetRedzoneSize() == 0u);
+          // We want to use MemMap to get a low-4GB mapping, if required, but we
+          // also want to own the memory ourselves as we may grow it (in
+          // KernelPreparation()). If ownership is not taken and we try to
+          // resize the MemMap, it unmaps the virtual range.
+          MemMap temp = shadow_to_space_map_.TakeReservedMemory(shadow_to_space_map_.Size(),
+                                                                /*reuse*/ true);
+          std::swap(temp, shadow_to_space_map_);
+          DCHECK(!temp.IsValid());
+        } else {
+          LOG(WARNING) << "Failed to create moving space's shadow map of " << PrettySize(reqd_size)
+                       << " size. " << err_msg;
+        }
+      }
+
+      if (LIKELY(shadow_to_space_map_.IsValid())) {
+        int fd = moving_to_space_fd_;
+        int mmap_flags = MAP_SHARED | MAP_FIXED;
+        if (fd == kFdUnused) {
+          // Unused moving-to-space fd means we are using anonymous shared
+          // mapping.
+          DCHECK_EQ(shadow_to_space_map_.Size(), bump_pointer_space_->Capacity());
+          mmap_flags |= MAP_ANONYMOUS;
+          fd = -1;
+        }
+        // If the map is smaller than required, then we'll do mremap in the
+        // compaction pause to increase the size.
+        mmap_shadow_map(mmap_flags, fd);
+      }
+
+      for (auto& data : linear_alloc_spaces_data_) {
+        DCHECK_EQ(mprotect(data.shadow_.Begin(), data.shadow_.Size(), PROT_READ | PROT_WRITE), 0)
+            << "mprotect failed: " << strerror(errno);
+      }
+    } else if (!is_zygote && uffd_minor_fault_supported_) {
+      // First GC after zygote-fork. We will still use uffd's copy mode but will
+      // use it to move objects to MAP_SHARED (to prepare for subsequent GCs, which
+      // will use uffd's minor-fault feature).
+      if (shadow_to_space_map_.IsValid() &&
+          shadow_to_space_map_.Size() == bump_pointer_space_->Capacity()) {
+        mmap_shadow_map(MAP_SHARED | MAP_FIXED | MAP_ANONYMOUS, /*fd=*/-1);
+      } else {
+        size_t size = bump_pointer_space_->Capacity();
+        DCHECK_EQ(moving_to_space_fd_, kFdUnused);
+        DCHECK_EQ(moving_from_space_fd_, kFdUnused);
+        const char* name = bump_pointer_space_->GetName();
+        moving_to_space_fd_ = memfd_create(name, MFD_CLOEXEC);
+        CHECK_NE(moving_to_space_fd_, -1)
+            << "memfd_create: failed for " << name << ": " << strerror(errno);
+        moving_from_space_fd_ = memfd_create(name, MFD_CLOEXEC);
+        CHECK_NE(moving_from_space_fd_, -1)
+            << "memfd_create: failed for " << name << ": " << strerror(errno);
+
+        // memfds are treated as files from the resource-limits point of view,
+        // and the moving space could be several hundred MBs. So increase the
+        // limit if it's lower than the moving-space size.
+        bool rlimit_changed = false;
+        rlimit rlim_read;
+        CHECK_EQ(getrlimit(RLIMIT_FSIZE, &rlim_read), 0) << "getrlimit failed: " << strerror(errno);
+        if (rlim_read.rlim_cur < size) {
+          rlimit_changed = true;
+          rlimit rlim = rlim_read;
+          rlim.rlim_cur = size;
+          CHECK_EQ(setrlimit(RLIMIT_FSIZE, &rlim), 0) << "setrlimit failed: " << strerror(errno);
+        }
+
+        // moving-space will map this fd so that we compact objects into it.
+        int ret = ftruncate(moving_to_space_fd_, size);
+        CHECK_EQ(ret, 0) << "ftruncate failed for moving-space:" << strerror(errno);
+        ret = ftruncate(moving_from_space_fd_, size);
+        CHECK_EQ(ret, 0) << "ftruncate failed for moving-space:" << strerror(errno);
+
+        if (rlimit_changed) {
+          // Reset the rlimit to the original limit.
+          CHECK_EQ(setrlimit(RLIMIT_FSIZE, &rlim_read), 0)
+              << "setrlimit failed: " << strerror(errno);
+        }
+      }
+    }
   }
 }
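To make the long scenario comment above concrete, here is a hedged, simplified
sketch of scenario (B.1) for the moving space, collapsing steps that the patch
splits between PrepareForCompaction() and the compaction pause; it assumes the
headers define MREMAP_DONTUNMAP, a libc providing memfd_create, and ignores
error handling and the rlimit adjustment.

  #include <sys/mman.h>
  #include <unistd.h>
  #include <cstddef>
  #include <cstdint>

  void PrepareMovingSpaceForMinorFault(uint8_t* to_space, uint8_t* from_space, size_t capacity) {
    // (B.1.b) Back the future to-space with a memfd so later GCs can use minor faults.
    int to_fd = memfd_create("moving-space", MFD_CLOEXEC);
    ftruncate(to_fd, capacity);
    // (B.1.c, i.e. A.1.b) mremap(DONTUNMAP) moves the current pages to 'from_space'
    // and leaves 'to_space' empty but still mapped, ready for uffd registration.
    mremap(to_space, capacity, capacity,
           MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_DONTUNMAP, from_space);
    // (B.1.e) Map the memfd MAP_SHARED onto the now-empty to-space; compaction
    // then populates it, readying it for UFFDIO_CONTINUE in the next cycle.
    mmap(to_space, capacity, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, to_fd, /*offset=*/0);
  }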
 
@@ -941,7 +1201,10 @@
   }
 }
 
-void MarkCompact::CompactPage(mirror::Object* obj, uint32_t offset, uint8_t* addr) {
+void MarkCompact::CompactPage(mirror::Object* obj,
+                              uint32_t offset,
+                              uint8_t* addr,
+                              bool needs_memset_zero) {
   DCHECK(moving_space_bitmap_->Test(obj)
          && live_words_bitmap_->Test(obj));
   DCHECK(live_words_bitmap_->Test(offset)) << "obj=" << obj
@@ -1084,7 +1347,7 @@
   }
   // The last page that we compact may have some bytes left untouched in the
   // end, we should zero them as the kernel copies at page granularity.
-  if (UNLIKELY(bytes_done < kPageSize)) {
+  if (needs_memset_zero && UNLIKELY(bytes_done < kPageSize)) {
     std::memset(addr + bytes_done, 0x0, kPageSize - bytes_done);
   }
 }
@@ -1097,7 +1360,8 @@
 void MarkCompact::SlideBlackPage(mirror::Object* first_obj,
                                  const size_t page_idx,
                                  uint8_t* const pre_compact_page,
-                                 uint8_t* dest) {
+                                 uint8_t* dest,
+                                 bool needs_memset_zero) {
   DCHECK(IsAligned<kPageSize>(pre_compact_page));
   size_t bytes_copied;
   const uint32_t first_chunk_size = black_alloc_pages_first_chunk_size_[page_idx];
@@ -1119,7 +1383,9 @@
   if (pre_compact_addr > pre_compact_page) {
     bytes_copied = pre_compact_addr - pre_compact_page;
     DCHECK_LT(bytes_copied, kPageSize);
-    std::memset(dest, 0x0, bytes_copied);
+    if (needs_memset_zero) {
+      std::memset(dest, 0x0, bytes_copied);
+    }
     dest += bytes_copied;
   } else {
     bytes_copied = 0;
@@ -1230,8 +1496,10 @@
                                                                 });
     size_t remaining_bytes = kPageSize - bytes_copied;
     if (found_obj == nullptr) {
-      // No more black objects in this page. Zero the remaining bytes and return.
-      std::memset(dest, 0x0, remaining_bytes);
+      if (needs_memset_zero) {
+        // No more black objects in this page. Zero the remaining bytes and return.
+        std::memset(dest, 0x0, remaining_bytes);
+      }
       return;
     }
     // Copy everything in this page, which includes any zeroed regions
@@ -1271,7 +1539,149 @@
   }
 }
 
-template <bool kFallback>
+template <bool kFirstPageMapping>
+void MarkCompact::MapProcessedPages(uint8_t* to_space_start,
+                                    Atomic<PageState>* state_arr,
+                                    size_t arr_idx,
+                                    size_t arr_len) {
+  DCHECK(minor_fault_initialized_);
+  DCHECK_LT(arr_idx, arr_len);
+  DCHECK_ALIGNED(to_space_start, kPageSize);
+  // Claim all the contiguous pages which are ready to be mapped, and then map
+  // them in a single ioctl. This avoids the overhead of invoking the syscall
+  // several times and also maps the already-processed pages, avoiding
+  // unnecessary faults on them.
+  size_t length = kFirstPageMapping ? kPageSize : 0;
+  if (kFirstPageMapping) {
+    arr_idx++;
+  }
+  // We need to guarantee that we don't end up successfully marking a later
+  // page as 'mapping' and then fail to mark an earlier page. To guarantee that,
+  // we use acq_rel order.
+  for (; arr_idx < arr_len; arr_idx++, length += kPageSize) {
+    PageState expected_state = PageState::kProcessed;
+    if (!state_arr[arr_idx].compare_exchange_strong(
+            expected_state, PageState::kProcessedAndMapping, std::memory_order_acq_rel)) {
+      break;
+    }
+  }
+  if (length > 0) {
+    // Note: We need the first page to be attempted (to be mapped) by the ioctl
+    // as this function is called due to some mutator thread waiting on the
+    // 'to_space_start' page. Therefore, the ioctl must always be called
+    // with 'to_space_start' as the 'start' address because it can bail out in
+    // the middle (not attempting to map the subsequent pages) if it finds a
+    // page in between that is either already mapped or missing on the shadow-map.
+    struct uffdio_continue uffd_continue;
+    uffd_continue.range.start = reinterpret_cast<uintptr_t>(to_space_start);
+    uffd_continue.range.len = length;
+    uffd_continue.mode = 0;
+    int ret = ioctl(uffd_, UFFDIO_CONTINUE, &uffd_continue);
+    if (UNLIKELY(ret == -1 && errno == EAGAIN)) {
+      // This can happen only in linear-alloc.
+      DCHECK(linear_alloc_spaces_data_.end() !=
+             std::find_if(linear_alloc_spaces_data_.begin(),
+                          linear_alloc_spaces_data_.end(),
+                          [to_space_start](const LinearAllocSpaceData& data) {
+                            return data.begin_ <= to_space_start && to_space_start < data.end_;
+                          }));
+
+      // This could happen if userfaultfd couldn't find any pages mapped in the
+      // shadow map. For instance, if there are certain (contiguous) pages on
+      // linear-alloc which are allocated and have first-object set-up but have
+      // not been accessed yet.
+      // Bail out by setting the remaining pages' state back to kProcessed and
+      // then waking up any waiting threads.
+      DCHECK_GE(uffd_continue.mapped, 0);
+      DCHECK_ALIGNED(uffd_continue.mapped, kPageSize);
+      DCHECK_LT(uffd_continue.mapped, static_cast<ssize_t>(length));
+      if (kFirstPageMapping) {
+        // In this case the first page must be mapped.
+        DCHECK_GE(uffd_continue.mapped, static_cast<ssize_t>(kPageSize));
+      }
+      // Nobody else would modify these pages' state simultaneously, so a plain
+      // atomic store is sufficient. Use 'release' order to ensure that all
+      // states are modified sequentially.
+      for (size_t remaining_len = length - uffd_continue.mapped; remaining_len > 0;
+           remaining_len -= kPageSize) {
+        arr_idx--;
+        DCHECK_EQ(state_arr[arr_idx].load(std::memory_order_relaxed),
+                  PageState::kProcessedAndMapping);
+        state_arr[arr_idx].store(PageState::kProcessed, std::memory_order_release);
+      }
+      uffd_continue.range.start =
+          reinterpret_cast<uintptr_t>(to_space_start) + uffd_continue.mapped;
+      uffd_continue.range.len = length - uffd_continue.mapped;
+      ret = ioctl(uffd_, UFFDIO_WAKE, &uffd_continue.range);
+      CHECK_EQ(ret, 0) << "ioctl_userfaultfd: wake failed: " << strerror(errno);
+    } else {
+      // We may receive ENOENT if gc-thread unregisters the
+      // range behind our back, which is fine because that
+      // happens only when it knows compaction is done.
+      CHECK(ret == 0 || !kFirstPageMapping || errno == ENOENT)
+          << "ioctl_userfaultfd: continue failed: " << strerror(errno);
+      if (ret == 0) {
+        DCHECK_EQ(uffd_continue.mapped, static_cast<ssize_t>(length));
+      }
+    }
+  }
+}
+
+template <int kMode, typename CompactionFn>
+void MarkCompact::DoPageCompactionWithStateChange(size_t page_idx,
+                                                  size_t status_arr_len,
+                                                  uint8_t* to_space_page,
+                                                  uint8_t* page,
+                                                  CompactionFn func) {
+  auto copy_ioctl = [this] (void* dst, void* buffer) {
+                      struct uffdio_copy uffd_copy;
+                      uffd_copy.src = reinterpret_cast<uintptr_t>(buffer);
+                      uffd_copy.dst = reinterpret_cast<uintptr_t>(dst);
+                      uffd_copy.len = kPageSize;
+                      uffd_copy.mode = 0;
+                      CHECK_EQ(ioctl(uffd_, UFFDIO_COPY, &uffd_copy), 0)
+                          << "ioctl_userfaultfd: copy failed: " << strerror(errno)
+                          << ". src:" << buffer << " dst:" << dst;
+                      DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(kPageSize));
+                    };
+  PageState expected_state = PageState::kUnprocessed;
+  PageState desired_state =
+      kMode == kCopyMode ? PageState::kProcessingAndMapping : PageState::kProcessing;
+  // In the concurrent case (kMode != kFallbackMode) we need to ensure that the update
+  // to moving_pages_status_[page_idx] is released before the contents of the page are
+  // made accessible to other threads.
+  //
+  // In the minor-fault case, we need acquire ordering here to ensure that when the
+  // CAS fails, another thread has completed processing the page, which is guaranteed
+  // by the release below.
+  // Relaxed memory order is used in copy mode as the subsequent ioctl syscall acts as a fence.
+  std::memory_order order =
+      kMode == kCopyMode ? std::memory_order_relaxed : std::memory_order_acquire;
+  if (kMode == kFallbackMode || moving_pages_status_[page_idx].compare_exchange_strong(
+                                    expected_state, desired_state, order)) {
+    func();
+    if (kMode == kCopyMode) {
+      copy_ioctl(to_space_page, page);
+    } else if (kMode == kMinorFaultMode) {
+      expected_state = PageState::kProcessing;
+      desired_state = PageState::kProcessed;
+      // The CAS needs release order to ensure that stores to the page make it
+      // to memory *before* other threads observe that it's ready to be mapped.
+      if (!moving_pages_status_[page_idx].compare_exchange_strong(
+              expected_state, desired_state, std::memory_order_release)) {
+        // Some mutator has requested to map the page after processing it.
+        DCHECK_EQ(expected_state, PageState::kProcessingAndMapping);
+        MapProcessedPages</*kFirstPageMapping=*/true>(
+            to_space_page, moving_pages_status_, page_idx, status_arr_len);
+      }
+    }
+  } else {
+    DCHECK_GT(expected_state, PageState::kProcessed);
+  }
+}
+
+template <int kMode>
 void MarkCompact::CompactMovingSpace(uint8_t* page) {
   // For every page we have a starting object, which may have started in some
   // preceding page, and an offset within that object from where we must start
@@ -1281,61 +1691,60 @@
   // consulting mark-bitmap to find where does the next live object start, we
   // use the object-size returned by VisitRefsForCompaction.
   //
-  // TODO: Should we do this in reverse? If the probability of accessing an object
-  // is inversely proportional to the object's age, then it may make sense.
+  // We do the compaction in the reverse direction so that the pages containing
+  // TLABs and the latest allocations are processed first.
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
-  uint8_t* to_space = bump_pointer_space_->Begin();
-  auto copy_ioctl = [this] (void* dst, void* buffer) {
-                      struct uffdio_copy uffd_copy;
-                      uffd_copy.src = reinterpret_cast<uintptr_t>(buffer);
-                      uffd_copy.dst = reinterpret_cast<uintptr_t>(dst);
-                      uffd_copy.len = kPageSize;
-                      uffd_copy.mode = 0;
-                      CHECK_EQ(ioctl(uffd_, UFFDIO_COPY, &uffd_copy), 0)
-                            << "ioctl: copy " << strerror(errno);
-                      DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(kPageSize));
-                    };
-  size_t idx = 0;
-  while (idx < moving_first_objs_count_) {
-    // Relaxed memory-order is used as the subsequent ioctl syscall will act as a fence.
-    // In the concurrent case (!kFallback) we need to ensure that the update to
-    // moving_spaces_status_[idx] is released before the contents of the page.
-    if (kFallback
-        || moving_pages_status_[idx].exchange(PageState::kCompacting, std::memory_order_relaxed)
-           == PageState::kUncompacted) {
-      CompactPage(first_objs_moving_space_[idx].AsMirrorPtr(),
-                  pre_compact_offset_moving_space_[idx],
-                  kFallback ? to_space : page);
-      if (!kFallback) {
-        copy_ioctl(to_space, page);
-      }
-    }
-    to_space += kPageSize;
-    idx++;
+  size_t page_status_arr_len = moving_first_objs_count_ + black_page_count_;
+  size_t idx = page_status_arr_len;
+  uint8_t* to_space_end = bump_pointer_space_->Begin() + page_status_arr_len * kPageSize;
+  uint8_t* shadow_space_end = nullptr;
+  if (kMode == kMinorFaultMode) {
+    shadow_space_end = shadow_to_space_map_.Begin() + page_status_arr_len * kPageSize;
   }
   // Allocated-black pages
-  size_t count = moving_first_objs_count_ + black_page_count_;
-  uint8_t* pre_compact_page = black_allocations_begin_;
+  uint8_t* pre_compact_page = black_allocations_begin_ + (black_page_count_ * kPageSize);
+
   DCHECK(IsAligned<kPageSize>(pre_compact_page));
-  while (idx < count) {
-    mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr();
-    if (first_obj != nullptr
-        && (kFallback
-            || moving_pages_status_[idx].exchange(PageState::kCompacting, std::memory_order_relaxed)
-               == PageState::kUncompacted)) {
-      DCHECK_GT(black_alloc_pages_first_chunk_size_[idx], 0u);
-      SlideBlackPage(first_obj,
-                     idx,
-                     pre_compact_page,
-                     kFallback ? to_space : page);
-      if (!kFallback) {
-        copy_ioctl(to_space, page);
-      }
+  while (idx > moving_first_objs_count_) {
+    idx--;
+    pre_compact_page -= kPageSize;
+    to_space_end -= kPageSize;
+    if (kMode == kMinorFaultMode) {
+      shadow_space_end -= kPageSize;
+      page = shadow_space_end;
+    } else if (kMode == kFallbackMode) {
+      page = to_space_end;
     }
-    pre_compact_page += kPageSize;
-    to_space += kPageSize;
-    idx++;
+    mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr();
+    if (first_obj != nullptr) {
+      DoPageCompactionWithStateChange<kMode>(
+          idx,
+          page_status_arr_len,
+          to_space_end,
+          page,
+          [&]() REQUIRES_SHARED(Locks::mutator_lock_) {
+            SlideBlackPage(first_obj, idx, pre_compact_page, page, kMode == kCopyMode);
+          });
+    }
   }
+  DCHECK_EQ(pre_compact_page, black_allocations_begin_);
+
+  while (idx > 0) {
+    idx--;
+    to_space_end -= kPageSize;
+    if (kMode == kMinorFaultMode) {
+      shadow_space_end -= kPageSize;
+      page = shadow_space_end;
+    } else if (kMode == kFallbackMode) {
+      page = to_space_end;
+    }
+    mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr();
+    DoPageCompactionWithStateChange<kMode>(
+        idx, page_status_arr_len, to_space_end, page, [&]() REQUIRES_SHARED(Locks::mutator_lock_) {
+          CompactPage(first_obj, pre_compact_offset_moving_space_[idx], page, kMode == kCopyMode);
+        });
+  }
+  DCHECK_EQ(to_space_end, bump_pointer_space_->Begin());
 }
 
 void MarkCompact::UpdateNonMovingPage(mirror::Object* first, uint8_t* page) {
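The contiguous-page batching in MapProcessedPages() above can be summarized by
this standalone sketch; the reduced PageState set and the function name are
illustrative, not the collector's exact definitions.

  #include <linux/userfaultfd.h>
  #include <sys/ioctl.h>
  #include <atomic>
  #include <cstddef>
  #include <cstdint>

  enum class PageState : uint8_t { kUnprocessed, kProcessing, kProcessed, kProcessedAndMapping };

  // Claim a run of already-processed pages starting at 'idx' and map them with
  // a single UFFDIO_CONTINUE, instead of issuing one ioctl per faulted page.
  size_t ClaimAndMapRun(int uffd, uint8_t* start, std::atomic<PageState>* states,
                        size_t idx, size_t len, size_t page_size) {
    size_t length = 0;
    for (; idx < len; idx++, length += page_size) {
      PageState expected = PageState::kProcessed;
      // acq_rel so we never mark a later page as mapping while failing on an earlier one.
      if (!states[idx].compare_exchange_strong(expected, PageState::kProcessedAndMapping,
                                               std::memory_order_acq_rel)) {
        break;
      }
    }
    if (length != 0) {
      struct uffdio_continue cont;
      cont.range.start = reinterpret_cast<uintptr_t>(start);
      cont.range.len = length;
      cont.mode = 0;
      ioctl(uffd, UFFDIO_CONTINUE, &cont);  // maps shmem pages already in the page cache
    }
    return length;
  }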
@@ -1572,11 +1981,9 @@
   MarkCompact* const collector_;
 };
 
-class MarkCompact::NativeRootsUpdateVisitor : public ClassLoaderVisitor {
+class MarkCompact::ClassLoaderRootsUpdater : public ClassLoaderVisitor {
  public:
-  explicit NativeRootsUpdateVisitor(MarkCompact* collector)
-      : collector_(collector),
-        pointer_size_(Runtime::Current()->GetClassLinker()->GetImagePointerSize()) {}
+  explicit ClassLoaderRootsUpdater(MarkCompact* collector) : collector_(collector) {}
 
   void Visit(ObjPtr<mirror::ClassLoader> class_loader) override
       REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_) {
@@ -1586,8 +1993,28 @@
     }
   }
 
-  void operator()(uint8_t* page_begin, uint8_t* first_obj)
-      ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) {
+  void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const
+      REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (!root->IsNull()) {
+      VisitRoot(root);
+    }
+  }
+
+  void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const
+      REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_) {
+    collector_->VisitRoots(&root, 1, RootInfo(RootType::kRootVMInternal));
+  }
+
+ private:
+  MarkCompact* collector_;
+};
+
+class MarkCompact::LinearAllocPageUpdater {
+ public:
+  explicit LinearAllocPageUpdater(MarkCompact* collector) : collector_(collector) {}
+
+  void operator()(uint8_t* page_begin, uint8_t* first_obj) const ALWAYS_INLINE
+      REQUIRES_SHARED(Locks::mutator_lock_) {
     DCHECK_ALIGNED(page_begin, kPageSize);
     uint8_t* page_end = page_begin + kPageSize;
     uint32_t obj_size;
@@ -1595,9 +2022,9 @@
       TrackingHeader* header = reinterpret_cast<TrackingHeader*>(byte);
       obj_size = header->GetSize();
       LinearAllocKind kind = header->GetKind();
-      if (obj_size == 0) {
+      if (UNLIKELY(obj_size == 0)) {
         // No more objects in this page to visit.
-        DCHECK_EQ(static_cast<uint32_t>(kind), 0u);
+        DCHECK_EQ(kind, LinearAllocKind::kNoGCRoots);
         break;
       }
       uint8_t* obj = byte + sizeof(TrackingHeader);
@@ -1605,10 +2032,11 @@
       if (header->Is16Aligned()) {
         obj = AlignUp(obj, 16);
       }
-      if (UNLIKELY(obj >= page_end)) {
-        break;
+      uint8_t* begin_boundary = std::max(obj, page_begin);
+      uint8_t* end_boundary = std::min(obj_end, page_end);
+      if (begin_boundary < end_boundary) {
+        VisitObject(kind, obj, begin_boundary, end_boundary);
       }
-      VisitObject(kind, obj, std::max(obj, page_begin), std::min(obj_end, page_end));
       if (ArenaAllocator::IsRunningOnMemoryTool()) {
         obj_size += ArenaAllocator::kMemoryToolRedZoneBytes;
       }
@@ -1628,12 +2056,14 @@
     mirror::Object* old_ref = root->AsMirrorPtr();
     DCHECK_NE(old_ref, nullptr);
     if (collector_->live_words_bitmap_->HasAddress(old_ref)) {
+      mirror::Object* new_ref = old_ref;
       if (reinterpret_cast<uint8_t*>(old_ref) >= collector_->black_allocations_begin_) {
-        mirror::Object* new_ref = collector_->PostCompactBlackObjAddr(old_ref);
-        root->Assign(new_ref);
+        new_ref = collector_->PostCompactBlackObjAddr(old_ref);
       } else if (collector_->live_words_bitmap_->Test(old_ref)) {
         DCHECK(collector_->moving_space_bitmap_->Test(old_ref)) << old_ref;
-        mirror::Object* new_ref = collector_->PostCompactOldObjAddr(old_ref);
+        new_ref = collector_->PostCompactOldObjAddr(old_ref);
+      }
+      if (old_ref != new_ref) {
         root->Assign(new_ref);
       }
     }
@@ -1643,9 +2073,10 @@
   void VisitObject(LinearAllocKind kind,
                    void* obj,
                    uint8_t* start_boundary,
-                   uint8_t* end_boundary)
-      REQUIRES_SHARED(Locks::mutator_lock_) {
+                   uint8_t* end_boundary) const REQUIRES_SHARED(Locks::mutator_lock_) {
     switch (kind) {
+      case LinearAllocKind::kNoGCRoots:
+        break;
       case LinearAllocKind::kGCRootArray:
         {
           GcRoot<mirror::Object>* root = reinterpret_cast<GcRoot<mirror::Object>*>(start_boundary);
@@ -1661,17 +2092,13 @@
           // Old methods are clobbered in debug builds. Check size to confirm if the array
           // has any GC roots to visit. See ClassLinker::LinkMethodsHelper::ClobberOldMethods()
           if (array->size() > 0) {
-            if (pointer_size_ == PointerSize::k64) {
-              ArtMethod::VisitArrayRoots<PointerSize::k64>(*this,
-                                                           start_boundary,
-                                                           end_boundary,
-                                                           array);
+            if (collector_->pointer_size_ == PointerSize::k64) {
+              ArtMethod::VisitArrayRoots<PointerSize::k64>(
+                  *this, start_boundary, end_boundary, array);
             } else {
-              DCHECK_EQ(pointer_size_, PointerSize::k32);
-              ArtMethod::VisitArrayRoots<PointerSize::k32>(*this,
-                                                           start_boundary,
-                                                           end_boundary,
-                                                           array);
+              DCHECK_EQ(collector_->pointer_size_, PointerSize::k32);
+              ArtMethod::VisitArrayRoots<PointerSize::k32>(
+                  *this, start_boundary, end_boundary, array);
             }
           }
         }
@@ -1692,15 +2119,11 @@
           mirror::DexCachePair<mirror::Object>* last =
               reinterpret_cast<mirror::DexCachePair<mirror::Object>*>(end_boundary);
           mirror::DexCache::VisitDexCachePairRoots(*this, first, last);
-        }
-        break;
-      case LinearAllocKind::kNoGCRoots:
-        break;
+      }
     }
   }
 
   MarkCompact* const collector_;
-  const PointerSize pointer_size_;
 };
 
 void MarkCompact::PreCompactionPhase() {
@@ -1744,7 +2167,8 @@
     if (kIsDebugBuild) {
       size_t len = moving_first_objs_count_ + black_page_count_;
       for (size_t i = 0; i < len; i++) {
-        CHECK_EQ(moving_pages_status_[i].load(std::memory_order_relaxed), PageState::kUncompacted);
+        CHECK_EQ(moving_pages_status_[i].load(std::memory_order_relaxed),
+                 PageState::kUnprocessed);
       }
     }
     // Iterate over the allocation_stack_, for every object in the non-moving
@@ -1774,19 +2198,28 @@
     }
   }
   {
-    TimingLogger::ScopedTiming t2("(Paused)UpdateNativeRoots", GetTimings());
-    NativeRootsUpdateVisitor visitor(this);
+    TimingLogger::ScopedTiming t2("(Paused)UpdateClassLoaderRoots", GetTimings());
+    ReaderMutexLock rmu(thread_running_gc_, *Locks::classlinker_classes_lock_);
     {
-      ReaderMutexLock rmu(thread_running_gc_, *Locks::classlinker_classes_lock_);
-      runtime->GetClassLinker()->VisitClassLoaders(&visitor);
+      ClassLoaderRootsUpdater updater(this);
+      runtime->GetClassLinker()->VisitClassLoaders(&updater);
     }
-    GcVisitedArenaPool *arena_pool =
-        static_cast<GcVisitedArenaPool*>(runtime->GetLinearAllocArenaPool());
-    arena_pool->VisitRoots(visitor);
   }
 
-  SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/true);
-  KernelPreparation();
+  GcVisitedArenaPool* arena_pool =
+      static_cast<GcVisitedArenaPool*>(runtime->GetLinearAllocArenaPool());
+  if (uffd_ == kFallbackMode) {
+    LinearAllocPageUpdater updater(this);
+    arena_pool->VisitRoots(updater);
+  } else {
+    arena_pool->ForEachAllocatedArena(
+        [this](const TrackedArena& arena) REQUIRES_SHARED(Locks::mutator_lock_) {
+          uint8_t* last_byte = arena.GetLastUsedByte();
+          CHECK(linear_alloc_arenas_.insert({&arena, last_byte}).second);
+        });
+  }
+
+  SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/ true);
 
   {
     TimingLogger::ScopedTiming t2("(Paused)UpdateConcurrentRoots", GetTimings());
@@ -1825,75 +2258,177 @@
     }
   }
 
+  KernelPreparation();
   UpdateNonMovingSpace();
   // fallback mode
   if (uffd_ == kFallbackMode) {
-    CompactMovingSpace</*kFallback*/true>();
+    CompactMovingSpace<kFallbackMode>(nullptr);
 
     int32_t freed_bytes = black_objs_slide_diff_;
     bump_pointer_space_->RecordFree(freed_objects_, freed_bytes);
     RecordFree(ObjectBytePair(freed_objects_, freed_bytes));
   } else {
+    DCHECK_EQ(compaction_in_progress_count_.load(std::memory_order_relaxed), 0u);
     // We must start worker threads before resuming mutators to avoid deadlocks.
     heap_->GetThreadPool()->StartWorkers(thread_running_gc_);
   }
   stack_end_ = nullptr;
 }
 
-void MarkCompact::KernelPreparation() {
-  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+void MarkCompact::KernelPrepareRange(uint8_t* to_addr,
+                                     uint8_t* from_addr,
+                                     size_t map_size,
+                                     size_t uffd_size,
+                                     int fd,
+                                     int uffd_mode,
+                                     uint8_t* shadow_addr) {
   // TODO: Create mapping's at 2MB aligned addresses to benefit from optimized
   // mremap.
-  size_t size = bump_pointer_space_->Capacity();
-  uint8_t* begin = bump_pointer_space_->Begin();
-  int flags = MREMAP_MAYMOVE | MREMAP_FIXED;
+  int mremap_flags = MREMAP_MAYMOVE | MREMAP_FIXED;
   if (gHaveMremapDontunmap) {
-    flags |= MREMAP_DONTUNMAP;
+    mremap_flags |= MREMAP_DONTUNMAP;
   }
 
-  void* ret = mremap(begin, size, size, flags, from_space_begin_);
-  CHECK_EQ(ret, static_cast<void*>(from_space_begin_))
-        << "mremap to move pages from moving space to from-space failed: " << strerror(errno)
-        << ". moving-space-addr=" << reinterpret_cast<void*>(begin)
-        << " size=" << size;
+  void* ret = mremap(to_addr, map_size, map_size, mremap_flags, from_addr);
+  CHECK_EQ(ret, static_cast<void*>(from_addr))
+      << "mremap to move pages failed: " << strerror(errno)
+      << ". space-addr=" << reinterpret_cast<void*>(to_addr) << " size=" << PrettySize(map_size);
 
-  // Without MREMAP_DONTUNMAP the source mapping is unmapped by mremap. So mmap
-  // the moving space again.
-  if (!gHaveMremapDontunmap) {
-    ret = mmap(begin, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0);
-    CHECK_EQ(ret, static_cast<void*>(begin)) << "mmap for moving space failed: " << strerror(errno);
+  if (shadow_addr != nullptr) {
+    DCHECK_EQ(fd, kFdUnused);
+    DCHECK(gHaveMremapDontunmap);
+    ret = mremap(shadow_addr, map_size, map_size, mremap_flags, to_addr);
+    CHECK_EQ(ret, static_cast<void*>(to_addr))
+        << "mremap from shadow to to-space map failed: " << strerror(errno);
+  } else if (!gHaveMremapDontunmap || fd > kFdUnused) {
+    // Without MREMAP_DONTUNMAP the source mapping is unmapped by mremap. So mmap
+    // the range again. This is also needed when the range has to be backed by a
+    // shared mapping (memfd or shared-anonymous) rather than private-anonymous.
+    int mmap_flags = MAP_FIXED;
+    if (fd == kFdUnused) {
+      // Use MAP_FIXED_NOREPLACE so that if someone else reserves the 'to_addr'
+      // mapping in the meantime, which can happen when MREMAP_DONTUNMAP isn't
+      // available, we fail instead of unmapping someone else's mapping and then
+      // causing crashes elsewhere.
+      mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
+      // On some platforms MAP_ANONYMOUS expects fd to be -1.
+      fd = -1;
+    } else if (IsValidFd(fd)) {
+      mmap_flags |= MAP_SHARED;
+    } else {
+      DCHECK_EQ(fd, kFdSharedAnon);
+      mmap_flags |= MAP_SHARED | MAP_ANONYMOUS;
+    }
+    ret = mmap(to_addr, map_size, PROT_READ | PROT_WRITE, mmap_flags, fd, 0);
+    CHECK_EQ(ret, static_cast<void*>(to_addr))
+        << "mmap for moving space failed: " << strerror(errno);
   }
-
-  DCHECK_EQ(mprotect(from_space_begin_, size, PROT_READ), 0)
-         << "mprotect failed: " << strerror(errno);
-
-  if (uffd_ >= 0) {
+  if (IsValidFd(uffd_)) {
     // Userfaultfd registration
     struct uffdio_register uffd_register;
-    uffd_register.range.start = reinterpret_cast<uintptr_t>(begin);
-    uffd_register.range.len = size;
+    uffd_register.range.start = reinterpret_cast<uintptr_t>(to_addr);
+    uffd_register.range.len = uffd_size;
     uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+    if (uffd_mode == kMinorFaultMode) {
+      uffd_register.mode |= UFFDIO_REGISTER_MODE_MINOR;
+    }
     CHECK_EQ(ioctl(uffd_, UFFDIO_REGISTER, &uffd_register), 0)
-          << "ioctl_userfaultfd: register moving-space: " << strerror(errno);
+        << "ioctl_userfaultfd: register failed: " << strerror(errno)
+        << ". start:" << static_cast<void*>(to_addr) << " len:" << PrettySize(uffd_size);
   }
 }
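
A minimal, self-contained sketch of the mremap/userfaultfd idea used above, kept outside the patch: MREMAP_DONTUNMAP moves the pages aside while leaving the source range mapped but empty, and UFFDIO_REGISTER then turns every first touch of that range into a fault delivered to user space. Names, includes and error handling below are illustrative, not ART's.

#include <cstddef>
#include <cstdint>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP
#define MREMAP_DONTUNMAP 4  // Not exposed by older libc headers; needs Linux 5.7+.
#endif

// Moves the contents of [addr, addr + size) to 'stash' while keeping 'addr'
// mapped (but empty), then asks the given userfaultfd to report missing-page
// faults on 'addr'. Returns false on any failure.
static bool StashAndRegister(int uffd, uint8_t* addr, uint8_t* stash, size_t size) {
  void* moved =
      mremap(addr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED | MREMAP_DONTUNMAP, stash);
  if (moved != static_cast<void*>(stash)) {
    return false;
  }
  struct uffdio_register reg = {};
  reg.range.start = reinterpret_cast<uintptr_t>(addr);
  reg.range.len = size;
  reg.mode = UFFDIO_REGISTER_MODE_MISSING;
  return ioctl(uffd, UFFDIO_REGISTER, &reg) == 0;
}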
 
-void MarkCompact::ConcurrentCompaction(uint8_t* page) {
-  struct uffd_msg msg;
-  uint8_t* unused_space_begin = bump_pointer_space_->Begin()
-                                + (moving_first_objs_count_ + black_page_count_) * kPageSize;
-  DCHECK(IsAligned<kPageSize>(unused_space_begin));
-  auto zeropage_ioctl = [this] (void* addr, bool tolerate_eexist) {
-                          struct uffdio_zeropage uffd_zeropage;
-                          DCHECK(IsAligned<kPageSize>(addr));
-                          uffd_zeropage.range.start = reinterpret_cast<uintptr_t>(addr);
-                          uffd_zeropage.range.len = kPageSize;
-                          uffd_zeropage.mode = 0;
-                          int ret = ioctl(uffd_, UFFDIO_ZEROPAGE, &uffd_zeropage);
-                          CHECK(ret == 0 || (tolerate_eexist && ret == -1 && errno == EEXIST))
-                              << "ioctl: zeropage: " << strerror(errno);
-                          DCHECK_EQ(uffd_zeropage.zeropage, static_cast<ssize_t>(kPageSize));
-                        };
+void MarkCompact::KernelPreparation() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  uint8_t* moving_space_begin = bump_pointer_space_->Begin();
+  size_t moving_space_size = bump_pointer_space_->Capacity();
+  int mode = kCopyMode;
+  size_t moving_space_register_sz;
+  if (minor_fault_initialized_) {
+    moving_space_register_sz = (moving_first_objs_count_ + black_page_count_) * kPageSize;
+    if (shadow_to_space_map_.IsValid()) {
+      size_t shadow_size = shadow_to_space_map_.Size();
+      void* addr = shadow_to_space_map_.Begin();
+      if (shadow_size < moving_space_register_sz) {
+        addr = mremap(addr,
+                      shadow_size,
+                      moving_space_register_sz,
+                      // Don't allow moving with obj-ptr poisoning as the
+                      // mapping needs to be in <4GB address space.
+                      kObjPtrPoisoning ? 0 : MREMAP_MAYMOVE,
+                      /*new_address=*/nullptr);
+        if (addr != MAP_FAILED) {
+          // Succeeded in expanding the mapping. Update the MemMap entry for shadow map.
+          MemMap temp = MemMap::MapPlaceholder(
+              "moving-space-shadow", static_cast<uint8_t*>(addr), moving_space_register_sz);
+          std::swap(shadow_to_space_map_, temp);
+        }
+      }
+      if (addr != MAP_FAILED) {
+        mode = kMinorFaultMode;
+      } else {
+        // We are not going to use shadow map. So protect it to catch any
+        // potential bugs.
+        DCHECK_EQ(mprotect(shadow_to_space_map_.Begin(), shadow_to_space_map_.Size(), PROT_NONE), 0)
+            << "mprotect failed: " << strerror(errno);
+      }
+    }
+  } else {
+    moving_space_register_sz = moving_space_size;
+  }
+
+  bool map_shared =
+      minor_fault_initialized_ || (!Runtime::Current()->IsZygote() && uffd_minor_fault_supported_);
+  uint8_t* shadow_addr = nullptr;
+  if (moving_to_space_fd_ == kFdUnused && map_shared) {
+    DCHECK(gHaveMremapDontunmap);
+    DCHECK(shadow_to_space_map_.IsValid());
+    DCHECK_EQ(shadow_to_space_map_.Size(), moving_space_size);
+    shadow_addr = shadow_to_space_map_.Begin();
+  }
+
+  KernelPrepareRange(moving_space_begin,
+                     from_space_begin_,
+                     moving_space_size,
+                     moving_space_register_sz,
+                     moving_to_space_fd_,
+                     mode,
+                     shadow_addr);
+  DCHECK_EQ(mprotect(from_space_begin_, moving_space_size, PROT_READ), 0)
+      << "mprotect failed: " << strerror(errno);
+
+  if (IsValidFd(uffd_)) {
+    for (auto& data : linear_alloc_spaces_data_) {
+      KernelPrepareRange(data.begin_,
+                         data.shadow_.Begin(),
+                         data.shadow_.Size(),
+                         data.shadow_.Size(),
+                         map_shared && !data.already_shared_ ? kFdSharedAnon : kFdUnused,
+                         minor_fault_initialized_ ? kMinorFaultMode : kCopyMode);
+      if (map_shared) {
+        data.already_shared_ = true;
+      }
+    }
+  }
+}
+
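The shadow-map expansion above is plain mremap resizing; stripped of ART's MemMap bookkeeping it is roughly the following (illustrative name; GrowMapping is not an ART function). Passing 0 for the flags forbids relocation, which matches the obj-ptr-poisoning constraint that the mapping stay below 4GB.

#include <cstddef>
#include <sys/mman.h>

// Try to grow an existing mapping; returns the (possibly moved) address or
// nullptr, in which case the caller keeps using the old, smaller mapping.
static void* GrowMapping(void* begin, size_t old_size, size_t new_size, bool allow_move) {
  void* addr = mremap(begin, old_size, new_size, allow_move ? MREMAP_MAYMOVE : 0);
  return addr == MAP_FAILED ? nullptr : addr;
}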
+template <int kMode>
+void MarkCompact::ConcurrentCompaction(uint8_t* buf) {
+  DCHECK_NE(kMode, kFallbackMode);
+  DCHECK(kMode != kCopyMode || buf != nullptr);
+  auto zeropage_ioctl = [this](void* addr, bool tolerate_eexist, bool tolerate_enoent) {
+    struct uffdio_zeropage uffd_zeropage;
+    DCHECK(IsAligned<kPageSize>(addr));
+    uffd_zeropage.range.start = reinterpret_cast<uintptr_t>(addr);
+    uffd_zeropage.range.len = kPageSize;
+    uffd_zeropage.mode = 0;
+    int ret = ioctl(uffd_, UFFDIO_ZEROPAGE, &uffd_zeropage);
+    if (LIKELY(ret == 0)) {
+      DCHECK_EQ(uffd_zeropage.zeropage, static_cast<ssize_t>(kPageSize));
+    } else {
+      CHECK((tolerate_enoent && errno == ENOENT) || (tolerate_eexist && errno == EEXIST))
+          << "ioctl_userfaultfd: zeropage failed: " << strerror(errno) << ". addr:" << addr;
+    }
+  };
 
   auto copy_ioctl = [this] (void* fault_page, void* src) {
                           struct uffdio_copy uffd_copy;
@@ -1901,12 +2436,14 @@
                           uffd_copy.dst = reinterpret_cast<uintptr_t>(fault_page);
                           uffd_copy.len = kPageSize;
                           uffd_copy.mode = 0;
-                          CHECK_EQ(ioctl(uffd_, UFFDIO_COPY, &uffd_copy), 0)
-                                << "ioctl: copy: " << strerror(errno);
+                          int ret = ioctl(uffd_, UFFDIO_COPY, &uffd_copy);
+                          CHECK_EQ(ret, 0) << "ioctl_userfaultfd: copy failed: " << strerror(errno)
+                                           << ". src:" << src << " fault_page:" << fault_page;
                           DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(kPageSize));
                     };
-
+  size_t nr_moving_space_used_pages = moving_first_objs_count_ + black_page_count_;
   while (true) {
+    struct uffd_msg msg;
     ssize_t nread = read(uffd_, &msg, sizeof(msg));
     CHECK_GT(nread, 0);
     CHECK_EQ(msg.event, UFFD_EVENT_PAGEFAULT);
@@ -1923,70 +2460,340 @@
       // Only the last thread should map the zeropage so that the gc-thread can
       // proceed.
       if (ret == 1) {
-        zeropage_ioctl(fault_addr, /*tolerate_eexist*/ false);
+        zeropage_ioctl(fault_addr, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false);
       } else {
         struct uffdio_range uffd_range;
         uffd_range.start = msg.arg.pagefault.address;
         uffd_range.len = kPageSize;
         CHECK_EQ(ioctl(uffd_, UFFDIO_WAKE, &uffd_range), 0)
-              << "ioctl: wake: " << strerror(errno);
+            << "ioctl_userfaultfd: wake failed for concurrent-compaction termination page: "
+            << strerror(errno);
       }
       break;
     }
-    DCHECK(bump_pointer_space_->HasAddress(reinterpret_cast<mirror::Object*>(fault_addr)));
     uint8_t* fault_page = AlignDown(fault_addr, kPageSize);
-    if (fault_addr >= unused_space_begin) {
-      // There is a race which allows more than one thread to install a
-      // zero-page. But we can tolerate that. So absorb the EEXIST returned by
-      // the ioctl and move on.
-      zeropage_ioctl(fault_page, /*tolerate_eexist*/ true);
-      continue;
-    }
-    size_t page_idx = (fault_page - bump_pointer_space_->Begin()) / kPageSize;
-    PageState state = moving_pages_status_[page_idx].load(std::memory_order_relaxed);
-    if (state == PageState::kUncompacted) {
-      // Relaxed memory-order is fine as the subsequent ioctl syscall guarantees
-      // status to be flushed before this thread attempts to copy/zeropage the
-      // fault_page.
-      state = moving_pages_status_[page_idx].exchange(PageState::kCompacting,
-                                                      std::memory_order_relaxed);
-    }
-    if (state == PageState::kCompacting) {
-      // Somebody else took (or taking) care of the page, so nothing to do.
-      continue;
-    }
-
-    if (fault_page < post_compact_end_) {
-      // The page has to be compacted.
-      CompactPage(first_objs_moving_space_[page_idx].AsMirrorPtr(),
-                  pre_compact_offset_moving_space_[page_idx],
-                  page);
-      copy_ioctl(fault_page, page);
+    if (bump_pointer_space_->HasAddress(reinterpret_cast<mirror::Object*>(fault_addr))) {
+      ConcurrentlyProcessMovingPage<kMode>(
+          zeropage_ioctl, copy_ioctl, fault_page, buf, nr_moving_space_used_pages);
+    } else if (minor_fault_initialized_) {
+      ConcurrentlyProcessLinearAllocPage<kMinorFaultMode>(
+          zeropage_ioctl,
+          copy_ioctl,
+          fault_page,
+          (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) != 0);
     } else {
-      // The page either has to be slid, or if it's an empty page then a
-      // zeropage needs to be mapped.
-      mirror::Object* first_obj = first_objs_moving_space_[page_idx].AsMirrorPtr();
-      if (first_obj != nullptr) {
-        DCHECK_GT(pre_compact_offset_moving_space_[page_idx], 0u);
-        uint8_t* pre_compact_page = black_allocations_begin_ + (fault_page - post_compact_end_);
-        DCHECK(IsAligned<kPageSize>(pre_compact_page));
-        SlideBlackPage(first_obj,
-                       page_idx,
-                       pre_compact_page,
-                       page);
-        copy_ioctl(fault_page, page);
-      } else {
-        // We should never have a case where two workers are trying to install a
-        // zeropage in this range as we synchronize using
-        // moving_pages_status_[page_idx].
-        zeropage_ioctl(fault_page, /*tolerate_eexist*/ false);
-      }
+      ConcurrentlyProcessLinearAllocPage<kCopyMode>(
+          zeropage_ioctl,
+          copy_ioctl,
+          fault_page,
+          (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) != 0);
     }
   }
 }
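
For readers new to the userfaultfd protocol driven by ConcurrentCompaction(), the worker loop reduces to the shape below: read one uffd_msg, decide how to resolve the faulting page, and answer with UFFDIO_COPY or UFFDIO_ZEROPAGE (either ioctl also wakes the faulting thread). This is an illustrative sketch with simplified error handling; prepare_page stands in for the compaction work and is not an ART API.

#include <cstddef>
#include <cstdint>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <unistd.h>

static void HandleFaultsSketch(int uffd, size_t page_size,
                               uint8_t* (*prepare_page)(uintptr_t fault_page)) {
  struct uffd_msg msg;
  while (read(uffd, &msg, sizeof(msg)) == static_cast<ssize_t>(sizeof(msg))) {
    if (msg.event != UFFD_EVENT_PAGEFAULT) {
      continue;
    }
    uintptr_t fault_page = static_cast<uintptr_t>(msg.arg.pagefault.address) & ~(page_size - 1);
    uint8_t* src = prepare_page(fault_page);  // e.g. compact objects into a private buffer.
    if (src != nullptr) {
      struct uffdio_copy copy = {};
      copy.dst = fault_page;
      copy.src = reinterpret_cast<uintptr_t>(src);
      copy.len = page_size;
      ioctl(uffd, UFFDIO_COPY, &copy);
    } else {
      struct uffdio_zeropage zero = {};
      zero.range.start = fault_page;
      zero.range.len = page_size;
      ioctl(uffd, UFFDIO_ZEROPAGE, &zero);
    }
  }
}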
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wframe-larger-than="
+template <int kMode, typename ZeropageType, typename CopyType>
+void MarkCompact::ConcurrentlyProcessMovingPage(ZeropageType& zeropage_ioctl,
+                                                CopyType& copy_ioctl,
+                                                uint8_t* fault_page,
+                                                uint8_t* buf,
+                                                size_t nr_moving_space_used_pages) {
+  class ScopedInProgressCount {
+   public:
+    explicit ScopedInProgressCount(MarkCompact* collector) : collector_(collector) {
+      collector_->compaction_in_progress_count_.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    ~ScopedInProgressCount() {
+      collector_->compaction_in_progress_count_.fetch_add(-1, std::memory_order_relaxed);
+    }
+
+   private:
+    MarkCompact* collector_;
+  };
+
+  uint8_t* unused_space_begin =
+      bump_pointer_space_->Begin() + nr_moving_space_used_pages * kPageSize;
+  DCHECK(IsAligned<kPageSize>(unused_space_begin));
+  DCHECK(kMode == kCopyMode || fault_page < unused_space_begin);
+  if (kMode == kCopyMode && fault_page >= unused_space_begin) {
+    // There is a race which allows more than one thread to install a
+    // zero-page. But we can tolerate that. So absorb the EEXIST returned by
+    // the ioctl and move on.
+    zeropage_ioctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/true);
+    return;
+  }
+  size_t page_idx = (fault_page - bump_pointer_space_->Begin()) / kPageSize;
+  mirror::Object* first_obj = first_objs_moving_space_[page_idx].AsMirrorPtr();
+  if (first_obj == nullptr) {
+    // We should never have a case where two workers are trying to install a
+    // zeropage in this range as we synchronize using moving_pages_status_[page_idx].
+    PageState expected_state = PageState::kUnprocessed;
+    if (moving_pages_status_[page_idx].compare_exchange_strong(
+            expected_state, PageState::kProcessingAndMapping, std::memory_order_relaxed)) {
+      zeropage_ioctl(fault_page, /*tolerate_eexist=*/false, /*tolerate_enoent=*/true);
+    } else {
+      DCHECK_EQ(expected_state, PageState::kProcessingAndMapping);
+    }
+    return;
+  }
+
+  PageState state = moving_pages_status_[page_idx].load(std::memory_order_relaxed);
+  while (true) {
+    switch (state) {
+      case PageState::kUnprocessed: {
+        // The increment to the in-progress counter must be done before updating
+        // the page's state. Otherwise, we will end up leaving a window wherein
+        // the GC-thread could observe that no worker is working on compaction
+        // and could end up unregistering the moving space from userfaultfd.
+        ScopedInProgressCount in_progress(this);
+        // Acquire order to ensure we don't start writing to shadow map, which is
+        // shared, before the CAS is successful. Release order to ensure that the
+        // increment to compaction_in_progress_count_ above is not re-ordered
+        // after the CAS.
+        if (moving_pages_status_[page_idx].compare_exchange_strong(
+                state, PageState::kProcessingAndMapping, std::memory_order_acquire)) {
+          if (kMode == kMinorFaultMode) {
+            DCHECK_EQ(buf, nullptr);
+            buf = shadow_to_space_map_.Begin() + page_idx * kPageSize;
+          }
+
+          if (fault_page < post_compact_end_) {
+            // The page has to be compacted.
+            CompactPage(
+                first_obj, pre_compact_offset_moving_space_[page_idx], buf, kMode == kCopyMode);
+          } else {
+            DCHECK_NE(first_obj, nullptr);
+            DCHECK_GT(pre_compact_offset_moving_space_[page_idx], 0u);
+            uint8_t* pre_compact_page = black_allocations_begin_ + (fault_page - post_compact_end_);
+            DCHECK(IsAligned<kPageSize>(pre_compact_page));
+            SlideBlackPage(first_obj, page_idx, pre_compact_page, buf, kMode == kCopyMode);
+          }
+          if (kMode == kCopyMode) {
+            copy_ioctl(fault_page, buf);
+            return;
+          } else {
+            break;
+          }
+        }
+      }
+        continue;
+      case PageState::kProcessing:
+        DCHECK_EQ(kMode, kMinorFaultMode);
+        if (moving_pages_status_[page_idx].compare_exchange_strong(
+                state, PageState::kProcessingAndMapping, std::memory_order_relaxed)) {
+          // Somebody else took or will take care of finishing the compaction and
+          // then mapping the page.
+          return;
+        }
+        continue;
+      case PageState::kProcessed:
+        // The page is processed but not mapped. We should map it.
+        break;
+      default:
+        // Somebody else took care of the page.
+        return;
+    }
+    break;
+  }
+
+  DCHECK_EQ(kMode, kMinorFaultMode);
+  if (state == PageState::kUnprocessed) {
+    MapProcessedPages</*kFirstPageMapping=*/true>(
+        fault_page, moving_pages_status_, page_idx, nr_moving_space_used_pages);
+  } else {
+    DCHECK_EQ(state, PageState::kProcessed);
+    MapProcessedPages</*kFirstPageMapping=*/false>(
+        fault_page, moving_pages_status_, page_idx, nr_moving_space_used_pages);
+  }
+}
+
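The state machine above is easier to see in miniature. The sketch below (illustrative names, not ART's PageState handling verbatim) shows the core idea: one thread wins the CAS and both processes and maps the page, while a late-comer either finishes the mapping of an already-processed page or backs off because someone else will map it.

#include <atomic>
#include <cstdint>

enum class State : uint8_t { kUnprocessed, kProcessing, kProcessed, kProcessingAndMapping };

// Returns true if this caller ends up responsible for mapping the page.
template <typename ProcessFn, typename MapFn>
bool ClaimAndProcess(std::atomic<State>& state, ProcessFn&& process, MapFn&& map) {
  State expected = State::kUnprocessed;
  // Acquire so we don't touch the shared destination buffer before owning the page.
  if (state.compare_exchange_strong(
          expected, State::kProcessingAndMapping, std::memory_order_acquire)) {
    process();
    map();
    return true;
  }
  if (expected == State::kProcessed) {
    // Contents were produced by another thread but not mapped yet: finish the job.
    map();
    return true;
  }
  // kProcessing or kProcessingAndMapping: another thread is (or will be) mapping it.
  return false;
}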
+template <int kMode, typename ZeropageType, typename CopyType>
+void MarkCompact::ConcurrentlyProcessLinearAllocPage(ZeropageType& zeropage_ioctl,
+                                                     CopyType& copy_ioctl,
+                                                     uint8_t* fault_page,
+                                                     bool is_minor_fault) {
+  DCHECK(!is_minor_fault || kMode == kMinorFaultMode);
+  auto arena_iter = linear_alloc_arenas_.end();
+  {
+    TrackedArena temp_arena(fault_page);
+    arena_iter = linear_alloc_arenas_.upper_bound(&temp_arena);
+    arena_iter = arena_iter != linear_alloc_arenas_.begin() ? std::prev(arena_iter)
+                                                            : linear_alloc_arenas_.end();
+  }
+  if (arena_iter == linear_alloc_arenas_.end() || arena_iter->second <= fault_page) {
+    // Fault page isn't in any of the arenas that existed before we started
+    // compaction. So map zeropage and return.
+    zeropage_ioctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/false);
+  } else {
+    // fault_page should always belong to some arena.
+    DCHECK(arena_iter != linear_alloc_arenas_.end())
+        << "fault_page:" << static_cast<void*>(fault_page) << "is_minor_fault:" << is_minor_fault;
+    // Find the linear-alloc space containing fault-page
+    LinearAllocSpaceData* space_data = nullptr;
+    for (auto& data : linear_alloc_spaces_data_) {
+      if (data.begin_ <= fault_page && fault_page < data.end_) {
+        space_data = &data;
+        break;
+      }
+    }
+    DCHECK_NE(space_data, nullptr);
+    ptrdiff_t diff = space_data->shadow_.Begin() - space_data->begin_;
+    size_t page_idx = (fault_page - space_data->begin_) / kPageSize;
+    Atomic<PageState>* state_arr =
+        reinterpret_cast<Atomic<PageState>*>(space_data->page_status_map_.Begin());
+    PageState state = state_arr[page_idx].load(std::memory_order_relaxed);
+    while (true) {
+      switch (state) {
+        case PageState::kUnprocessed:
+          if (state_arr[page_idx].compare_exchange_strong(
+                  state, PageState::kProcessingAndMapping, std::memory_order_acquire)) {
+            if (kMode == kCopyMode || is_minor_fault) {
+              uint8_t* first_obj = arena_iter->first->GetFirstObject(fault_page);
+              DCHECK_NE(first_obj, nullptr);
+              LinearAllocPageUpdater updater(this);
+              updater(fault_page + diff, first_obj + diff);
+              if (kMode == kCopyMode) {
+                copy_ioctl(fault_page, fault_page + diff);
+                return;
+              }
+            } else {
+              // Don't touch the page in this case (there is no reason to do so
+              // anyways) as it would mean reading from first_obj, which could be on
+              // another missing page and hence may cause this thread to block, leading
+              // to deadlocks.
+              // Force read the page if it is missing so that a zeropage gets mapped on
+              // the shadow map and then CONTINUE ioctl will map it on linear-alloc.
+              ForceRead(fault_page + diff);
+            }
+            MapProcessedPages</*kFirstPageMapping=*/true>(
+                fault_page, state_arr, page_idx, space_data->page_status_map_.Size());
+            return;
+          }
+          continue;
+        case PageState::kProcessing:
+          DCHECK_EQ(kMode, kMinorFaultMode);
+          if (state_arr[page_idx].compare_exchange_strong(
+                  state, PageState::kProcessingAndMapping, std::memory_order_relaxed)) {
+            // Somebody else took or will take care of finishing the updates and
+            // then mapping the page.
+            return;
+          }
+          continue;
+        case PageState::kProcessed:
+          // The page is processed but not mapped. We should map it.
+          break;
+        default:
+          // Somebody else took care of the page.
+          return;
+      }
+      break;
+    }
+
+    DCHECK_EQ(kMode, kMinorFaultMode);
+    DCHECK_EQ(state, PageState::kProcessed);
+    if (!is_minor_fault) {
+      // Force read the page if it is missing so that a zeropage gets mapped on
+      // the shadow map and then CONTINUE ioctl will map it on linear-alloc.
+      ForceRead(fault_page + diff);
+    }
+    MapProcessedPages</*kFirstPageMapping=*/false>(
+        fault_page, state_arr, page_idx, space_data->page_status_map_.Size());
+  }
+}
+
+void MarkCompact::ProcessLinearAlloc() {
+  for (auto& pair : linear_alloc_arenas_) {
+    const TrackedArena* arena = pair.first;
+    uint8_t* last_byte = pair.second;
+    DCHECK_ALIGNED(last_byte, kPageSize);
+    bool others_processing = false;
+    // Find the linear-alloc space containing the arena
+    LinearAllocSpaceData* space_data = nullptr;
+    for (auto& data : linear_alloc_spaces_data_) {
+      if (data.begin_ <= arena->Begin() && arena->Begin() < data.end_) {
+        space_data = &data;
+        break;
+      }
+    }
+    DCHECK_NE(space_data, nullptr);
+    ptrdiff_t diff = space_data->shadow_.Begin() - space_data->begin_;
+    auto visitor = [space_data, last_byte, diff, this, &others_processing](
+                       uint8_t* page_begin,
+                       uint8_t* first_obj) REQUIRES_SHARED(Locks::mutator_lock_) {
+      // No need to process pages past last_byte as they already have updated
+      // gc-roots, if any.
+      if (page_begin >= last_byte) {
+        return;
+      }
+      LinearAllocPageUpdater updater(this);
+      size_t page_idx = (page_begin - space_data->begin_) / kPageSize;
+      DCHECK_LT(page_idx, space_data->page_status_map_.Size());
+      Atomic<PageState>* state_arr =
+          reinterpret_cast<Atomic<PageState>*>(space_data->page_status_map_.Begin());
+      PageState expected_state = PageState::kUnprocessed;
+      PageState desired_state =
+          minor_fault_initialized_ ? PageState::kProcessing : PageState::kProcessingAndMapping;
+      // Acquire order to ensure that we don't start accessing the shadow page,
+      // which is shared with other threads, prior to the CAS. Also, for the same
+      // reason, we use 'release' order when changing the state to 'processed'.
+      if (state_arr[page_idx].compare_exchange_strong(
+              expected_state, desired_state, std::memory_order_acquire)) {
+        updater(page_begin + diff, first_obj + diff);
+        expected_state = PageState::kProcessing;
+        if (!minor_fault_initialized_) {
+          struct uffdio_copy uffd_copy;
+          uffd_copy.src = reinterpret_cast<uintptr_t>(page_begin + diff);
+          uffd_copy.dst = reinterpret_cast<uintptr_t>(page_begin);
+          uffd_copy.len = kPageSize;
+          uffd_copy.mode = 0;
+          CHECK_EQ(ioctl(uffd_, UFFDIO_COPY, &uffd_copy), 0)
+              << "ioctl_userfaultfd: linear-alloc copy failed:" << strerror(errno)
+              << ". dst:" << static_cast<void*>(page_begin);
+          DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(kPageSize));
+        } else if (!state_arr[page_idx].compare_exchange_strong(
+                       expected_state, PageState::kProcessed, std::memory_order_release)) {
+          DCHECK_EQ(expected_state, PageState::kProcessingAndMapping);
+          // Force read in case the page was missing and updater didn't touch it
+          // as there was nothing to do. This will ensure that a zeropage is
+          // faulted on the shadow map.
+          ForceRead(page_begin + diff);
+          MapProcessedPages</*kFirstPageMapping=*/true>(
+              page_begin, state_arr, page_idx, space_data->page_status_map_.Size());
+        }
+      } else {
+        others_processing = true;
+      }
+    };
+
+    arena->VisitRoots(visitor);
+    // If we are not in minor-fault mode and if no other thread was found to be
+    // processing any pages in this arena, then we can madvise the shadow pages.
+    // Otherwise, we would double the memory use for linear-alloc.
+    if (!minor_fault_initialized_ && !others_processing) {
+      ZeroAndReleasePages(arena->Begin() + diff, arena->Size());
+    }
+  }
+}
+
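Every linear-alloc page follows the same 'update in the shadow, then install' pattern as above. Reduced to a sketch (illustrative names; update_roots stands in for LinearAllocPageUpdater):

#include <cstddef>
#include <cstdint>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

// Update GC roots in the shadow copy of 'page' and then atomically install the
// result into the userfaultfd-registered range with UFFDIO_COPY, which also
// wakes any mutator blocked on that page.
template <typename UpdateRootsFn>
void UpdateViaShadow(int uffd, uint8_t* page, ptrdiff_t shadow_diff, size_t page_size,
                     UpdateRootsFn&& update_roots) {
  uint8_t* shadow_page = page + shadow_diff;  // Same offset inside the shadow mapping.
  update_roots(shadow_page);                  // Safe: the shadow is not uffd-registered.
  struct uffdio_copy copy = {};
  copy.src = reinterpret_cast<uintptr_t>(shadow_page);
  copy.dst = reinterpret_cast<uintptr_t>(page);
  copy.len = page_size;
  ioctl(uffd, UFFDIO_COPY, &copy);
}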
+void MarkCompact::UnregisterUffd(uint8_t* start, size_t len) {
+  struct uffdio_range range;
+  range.start = reinterpret_cast<uintptr_t>(start);
+  range.len = len;
+  CHECK_EQ(ioctl(uffd_, UFFDIO_UNREGISTER, &range), 0)
+      << "ioctl_userfaultfd: unregister failed: " << strerror(errno)
+      << ". addr:" << static_cast<void*>(start) << " len:" << PrettySize(len);
+  // Due to an oversight in the kernel implementation of 'unregister', the
+  // waiting threads are woken up only for copy uffds. Therefore, for now, we
+  // have to explicitly wake up the threads in minor-fault case.
+  // TODO: The fix in the kernel is being worked on. Once the kernel version
+  // containing the fix is known, make it conditional on that as well.
+  if (minor_fault_initialized_) {
+    CHECK_EQ(ioctl(uffd_, UFFDIO_WAKE, &range), 0)
+        << "ioctl_userfaultfd: wake failed: " << strerror(errno)
+        << ". addr:" << static_cast<void*>(start) << " len:" << PrettySize(len);
+  }
+}
+
 void MarkCompact::CompactionPhase() {
   TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
   {
@@ -1995,45 +2802,66 @@
     RecordFree(ObjectBytePair(freed_objects_, freed_bytes));
   }
 
-  if (kObjPtrPoisoning) {
-    CompactMovingSpace</*kFallback*/false>(compaction_buffers_map_.Begin());
-    // madvise the page so that we can get userfaults on it. We don't need to
-    // do this when not using poisoning as in that case the address location is
-    // untouched during compaction.
-    ZeroAndReleasePages(conc_compaction_termination_page_, kPageSize);
+  if (CanCompactMovingSpaceWithMinorFault()) {
+    CompactMovingSpace<kMinorFaultMode>(/*page=*/nullptr);
   } else {
-    uint8_t buf[kPageSize];
-    CompactMovingSpace</*kFallback*/false>(buf);
+    CompactMovingSpace<kCopyMode>(compaction_buffers_map_.Begin());
   }
 
-  // The following triggers 'special' userfaults. When received by the
+  // madvise the page so that we can get userfaults on it.
+  ZeroAndReleasePages(conc_compaction_termination_page_, kPageSize);
+
+  // TODO: add more sophisticated logic here wherein we sleep after yielding
+  // a couple of times.
+  while (compaction_in_progress_count_.load(std::memory_order_relaxed) > 0) {
+    sched_yield();
+  }
+
+  size_t moving_space_size = bump_pointer_space_->Capacity();
+  UnregisterUffd(bump_pointer_space_->Begin(),
+                 minor_fault_initialized_ ?
+                     (moving_first_objs_count_ + black_page_count_) * kPageSize :
+                     moving_space_size);
+
+  // Release all of the memory taken by moving-space's from-map
+  if (minor_fault_initialized_) {
+    // Give write permission for the madvise(REMOVE) to succeed.
+    DCHECK_EQ(mprotect(from_space_begin_, moving_space_size, PROT_WRITE), 0)
+        << "mprotect failed: " << strerror(errno);
+    int ret = madvise(from_space_begin_, moving_space_size, MADV_REMOVE);
+    CHECK_EQ(ret, 0) << "madvise(MADV_REMOVE) failed for from-space map:" << strerror(errno);
+  } else {
+    from_space_map_.MadviseDontNeedAndZero();
+  }
+
+  ProcessLinearAlloc();
+
+  // The following load triggers 'special' userfaults. When received by the
   // thread-pool workers, they will exit out of the compaction task. This fault
   // happens because we madvise info_map_ above and it is at least kPageSize in length.
   DCHECK(IsAligned<kPageSize>(conc_compaction_termination_page_));
   CHECK_EQ(*reinterpret_cast<volatile uint8_t*>(conc_compaction_termination_page_), 0);
   DCHECK_EQ(thread_pool_counter_, 0);
 
-  struct uffdio_range unregister_range;
-  unregister_range.start = reinterpret_cast<uintptr_t>(bump_pointer_space_->Begin());
-  unregister_range.len = bump_pointer_space_->Capacity();
-  CHECK_EQ(ioctl(uffd_, UFFDIO_UNREGISTER, &unregister_range), 0)
-        << "ioctl_userfaultfd: unregister moving-space: " << strerror(errno);
-
-  // When poisoning ObjPtr, we are forced to use buffers for page compaction in
-  // lower 4GB. Now that the usage is done, madvise them. But skip the first
-  // page, which is used by the gc-thread for the next iteration. Otherwise, we
-  // get into a deadlock due to userfault on it in the next iteration. This page
-  // is not consuming any physical memory because we already madvised it above
-  // and then we triggered a read userfault, which maps a special zero-page.
-  if (kObjPtrPoisoning) {
-    ZeroAndReleasePages(compaction_buffers_map_.Begin() + kPageSize,
-                        compaction_buffers_map_.Size() - kPageSize);
-  } else {
-    ZeroAndReleasePages(conc_compaction_termination_page_, kPageSize);
+  // Unregister linear-alloc spaces
+  for (auto& data : linear_alloc_spaces_data_) {
+    DCHECK_EQ(data.end_ - data.begin_, static_cast<ssize_t>(data.shadow_.Size()));
+    UnregisterUffd(data.begin_, data.shadow_.Size());
+    // madvise linear-allocs's page-status array
+    data.page_status_map_.MadviseDontNeedAndZero();
+    // Madvise the entire linear-alloc space's shadow. In copy-mode this gets rid
+    // of the pages which are still mapped. In minor-fault mode it unmaps all
+    // pages, which helps reduce the mremap time (done in the STW pause) in the
+    // next GC cycle.
+    data.shadow_.MadviseDontNeedAndZero();
+    if (minor_fault_initialized_) {
+      DCHECK_EQ(mprotect(data.shadow_.Begin(), data.shadow_.Size(), PROT_NONE), 0)
+          << "mprotect failed: " << strerror(errno);
+    }
   }
+
   heap_->GetThreadPool()->StopWorkers(thread_running_gc_);
 }
-#pragma clang diagnostic pop
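
The 'termination page' handshake used in CompactionPhase() and ConcurrentCompaction() deserves a note: the page is registered with userfaultfd, so dropping its backing and then reading it from the GC thread raises userfaults that the workers recognize as the exit signal (the last worker installs a zero page, which lets the read complete). A hedged sketch of the GC-thread side, with illustrative names:

#include <cstddef>
#include <cstdint>
#include <sys/mman.h>

// Assumes 'termination_page' lies in a userfaultfd-registered range.
static void SignalWorkersToExit(uint8_t* termination_page, size_t page_size) {
  // Drop the page so the next access is a missing-page fault again.
  madvise(termination_page, page_size, MADV_DONTNEED);
  // This read blocks until the last worker maps a zero page at the address;
  // each fault it raises along the way tells one worker to exit.
  uint8_t value = *reinterpret_cast<volatile uint8_t*>(termination_page);
  (void)value;
}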
 
 template <size_t kBufferSize>
 class MarkCompact::ThreadRootsVisitor : public RootVisitor {
@@ -2630,23 +3458,46 @@
 }
 
 void MarkCompact::FinishPhase() {
+  bool is_zygote = Runtime::Current()->IsZygote();
+  minor_fault_initialized_ = !is_zygote && uffd_minor_fault_supported_;
+  // When poisoning ObjPtr, we are forced to use buffers for page compaction in
+  // lower 4GB. Now that the usage is done, madvise them. But skip the first
+  // page, which is used by the gc-thread for the next iteration. Otherwise, we
+  // get into a deadlock due to userfault on it in the next iteration. This page
+  // is not consuming any physical memory because we already madvised it above
+  // and then we triggered a read userfault, which maps a special zero-page.
+  if (!minor_fault_initialized_ || !shadow_to_space_map_.IsValid() ||
+      shadow_to_space_map_.Size() < (moving_first_objs_count_ + black_page_count_) * kPageSize) {
+    ZeroAndReleasePages(compaction_buffers_map_.Begin() + kPageSize,
+                        compaction_buffers_map_.Size() - kPageSize);
+  } else if (shadow_to_space_map_.Size() == bump_pointer_space_->Capacity()) {
+    // Now that we are going to use minor-faults from next GC cycle, we can
+    // unmap the buffers used by worker threads.
+    compaction_buffers_map_.SetSize(kPageSize);
+  }
+
   info_map_.MadviseDontNeedAndZero();
   live_words_bitmap_->ClearBitmap();
-  from_space_map_.MadviseDontNeedAndZero();
-  if (UNLIKELY(Runtime::Current()->IsZygote() && uffd_ >= 0)) {
+
+  if (UNLIKELY(is_zygote && IsValidFd(uffd_))) {
     heap_->DeleteThreadPool();
+    // This unregisters all ranges as a side-effect.
     close(uffd_);
-    uffd_ = -1;
+    uffd_ = kFdUnused;
     uffd_initialized_ = false;
   }
   CHECK(mark_stack_->IsEmpty());  // Ensure that the mark stack is empty.
   mark_stack_->Reset();
   updated_roots_.clear();
   delete[] moving_pages_status_;
-  DCHECK_EQ(thread_running_gc_, Thread::Current());
-  ReaderMutexLock mu(thread_running_gc_, *Locks::mutator_lock_);
-  WriterMutexLock mu2(thread_running_gc_, *Locks::heap_bitmap_lock_);
-  heap_->ClearMarkedObjects();
+  linear_alloc_arenas_.clear();
+  {
+    DCHECK_EQ(thread_running_gc_, Thread::Current());
+    ReaderMutexLock mu(thread_running_gc_, *Locks::mutator_lock_);
+    WriterMutexLock mu2(thread_running_gc_, *Locks::heap_bitmap_lock_);
+    heap_->ClearMarkedObjects();
+  }
+  std::swap(moving_to_space_fd_, moving_from_space_fd_);
 }
 
 }  // namespace collector
diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h
index cb7440c..9931059 100644
--- a/runtime/gc/collector/mark_compact.h
+++ b/runtime/gc/collector/mark_compact.h
@@ -17,11 +17,13 @@
 #ifndef ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_
 #define ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_
 
+#include <map>
 #include <memory>
 #include <unordered_set>
 
-#include "base/atomic.h"
 #include "barrier.h"
+#include "base/atomic.h"
+#include "base/gc_visited_arena_pool.h"
 #include "base/macros.h"
 #include "base/mutex.h"
 #include "garbage_collector.h"
@@ -36,7 +38,7 @@
 
 namespace mirror {
 class DexCache;
-}
+}  // namespace mirror
 
 namespace gc {
 
@@ -47,11 +49,16 @@
 }  // namespace space
 
 namespace collector {
-class MarkCompact : public GarbageCollector {
+class MarkCompact final : public GarbageCollector {
  public:
   static constexpr size_t kAlignment = kObjectAlignment;
-  // Fake file descriptor for fall back mode
-  static constexpr int kFallbackMode = -2;
+  static constexpr int kCopyMode = -1;
+  static constexpr int kMinorFaultMode = -2;
+  // Fake file descriptor for fallback mode (when uffd isn't available)
+  static constexpr int kFallbackMode = -3;
+
+  static constexpr int kFdSharedAnon = -1;
+  static constexpr int kFdUnused = -2;
 
   explicit MarkCompact(Heap* heap);
 
@@ -130,6 +137,23 @@
   // created or was already done.
   bool CreateUserfaultfd(bool post_fork);
 
+  bool IsUffdMinorFaultSupported() const { return uffd_minor_fault_supported_; }
+
+  // Add linear-alloc space data when a new space is added to
+  // GcVisitedArenaPool, which mostly happens only once.
+  void AddLinearAllocSpaceData(uint8_t* begin, size_t len, bool already_shared);
+
+  // In userfaultfd's copy-mode we never need to reach the 'processed' state, as
+  // the thread processing a page also copies it, thereby mapping it.
+  // The order of the values is important as we may treat them as integers.
+  enum class PageState : uint8_t {
+    kUnprocessed = 0,           // Not processed yet
+    kProcessing = 1,            // Being processed by GC thread and will not be mapped
+    kProcessed = 2,             // Processed but not mapped
+    kProcessingAndMapping = 3,  // Being processed by GC or mutator and will be mapped
+    kProcessedAndMapping = 4    // Processed and will be mapped
+  };
+
  private:
   using ObjReference = mirror::ObjectReference</*kPoisonReferences*/ false, mirror::Object>;
   // Number of bits (live-words) covered by a single chunk-info (below)
@@ -276,12 +300,23 @@
   // Then update the references within the copied objects. The boundary objects are
   // partially updated such that only the references that lie in the page are updated.
   // This is necessary to avoid cascading userfaults.
-  void CompactPage(mirror::Object* obj, uint32_t offset, uint8_t* addr)
+  void CompactPage(mirror::Object* obj, uint32_t offset, uint8_t* addr, bool needs_memset_zero)
       REQUIRES_SHARED(Locks::mutator_lock_);
   // Compact the bump-pointer space. Pass page that should be used as buffer for
   // userfaultfd.
-  template <bool kFallback>
-  void CompactMovingSpace(uint8_t* page = nullptr) REQUIRES_SHARED(Locks::mutator_lock_);
+  template <int kMode>
+  void CompactMovingSpace(uint8_t* page) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Compact the given page as per func and change its state. Also map/copy the
+  // page, if required.
+  template <int kMode, typename CompactionFn>
+  ALWAYS_INLINE void DoPageCompactionWithStateChange(size_t page_idx,
+                                                     size_t status_arr_len,
+                                                     uint8_t* to_space_page,
+                                                     uint8_t* page,
+                                                     CompactionFn func)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Update all the objects in the given non-moving space page. 'first' object
   // could have started in some preceding page.
   void UpdateNonMovingPage(mirror::Object* first, uint8_t* page)
@@ -315,8 +350,8 @@
   void SlideBlackPage(mirror::Object* first_obj,
                       const size_t page_idx,
                       uint8_t* const pre_compact_page,
-                      uint8_t* dest)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+                      uint8_t* dest,
+                      bool needs_memset_zero) REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Perform reference-processing and the likes before sweeping the non-movable
   // spaces.
@@ -403,25 +438,61 @@
   void SweepLargeObjects(bool swap_bitmaps) REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(Locks::heap_bitmap_lock_);
 
-  // Store all the dex-cache objects visited during marking phase.
-  // This is required during compaction phase to ensure that we don't miss any
-  // of them from visiting (to update references). Somehow, iterating over
-  // class-tables to fetch these misses some of them, leading to memory
-  // corruption.
-  // TODO: once we implement concurrent compaction of classes and dex-caches,
-  // which will visit all of them, we should remove this.
-  void RememberDexCaches(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_);
   // Perform all kernel operations required for concurrent compaction. Includes
   // mremap to move pre-compact pages to from-space, followed by userfaultfd
-  // registration on the moving space.
+  // registration on the moving space and linear-alloc.
   void KernelPreparation();
-  // Called by thread-pool workers to read uffd_ and process fault events.
-  void ConcurrentCompaction(uint8_t* page) REQUIRES_SHARED(Locks::mutator_lock_);
+  // Called by KernelPreparation() for every memory range being prepared.
+  void KernelPrepareRange(uint8_t* to_addr,
+                          uint8_t* from_addr,
+                          size_t map_size,
+                          size_t uffd_size,
+                          int fd,
+                          int uffd_mode,
+                          uint8_t* shadow_addr = nullptr);
+  // Unregister given range from userfaultfd.
+  void UnregisterUffd(uint8_t* start, size_t len);
 
-  enum PageState : uint8_t {
-    kUncompacted = 0,  // The page has not been compacted yet
-    kCompacting       // Some thread (GC or mutator) is compacting the page
-  };
+  // Called by thread-pool workers to read uffd_ and process fault events.
+  template <int kMode>
+  void ConcurrentCompaction(uint8_t* buf) REQUIRES_SHARED(Locks::mutator_lock_);
+  // Called by thread-pool workers to compact and copy/map the fault page in
+  // moving space.
+  template <int kMode, typename ZeropageType, typename CopyType>
+  void ConcurrentlyProcessMovingPage(ZeropageType& zeropage_ioctl,
+                                     CopyType& copy_ioctl,
+                                     uint8_t* fault_page,
+                                     uint8_t* buf,
+                                     size_t nr_moving_space_used_pages)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+  // Called by thread-pool workers to process and copy/map the fault page in
+  // linear-alloc.
+  template <int kMode, typename ZeropageType, typename CopyType>
+  void ConcurrentlyProcessLinearAllocPage(ZeropageType& zeropage_ioctl,
+                                          CopyType& copy_ioctl,
+                                          uint8_t* fault_page,
+                                          bool is_minor_fault)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Process concurrently all the pages in linear-alloc. Called by gc-thread.
+  void ProcessLinearAlloc() REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Returns true if the moving space can be compacted using uffd's minor-fault
+  // feature.
+  bool CanCompactMovingSpaceWithMinorFault();
+
+  // Maps processed pages (from moving space and linear-alloc) for uffd's
+  // minor-fault feature. We try to 'claim' all processed (and unmapped) pages
+  // contiguous to 'to_space_start'.
+  // kFirstPageMapping indicates if the first page is already claimed or not. It
+  // also indicates that the ioctl must succeed in mapping the first page.
+  template <bool kFirstPageMapping>
+  void MapProcessedPages(uint8_t* to_space_start,
+                         Atomic<PageState>* state_arr,
+                         size_t arr_idx,
+                         size_t arr_len) REQUIRES_SHARED(Locks::mutator_lock_);
+
+  bool IsValidFd(int fd) const { return fd >= 0; }
 
   // Buffers, one per worker thread + gc-thread, to be used when
   // kObjPtrPoisoning == true as in that case we can't have the buffer on the
@@ -450,13 +521,46 @@
   // TODO: Must be replaced with an efficient mechanism eventually. Or ensure
   // that double updation doesn't happen in the first place.
   std::unordered_set<void*> updated_roots_;
-  // Set of dex-caches visited during marking. See comment above
-  // RememberDexCaches() for the explanation.
-  std::unordered_set<uint32_t> dex_caches_;
   MemMap from_space_map_;
+  MemMap shadow_to_space_map_;
   // Any array of live-bytes in logical chunks of kOffsetChunkSize size
   // in the 'to-be-compacted' space.
   MemMap info_map_;
+
+  class LessByArenaAddr {
+   public:
+    bool operator()(const TrackedArena* a, const TrackedArena* b) const {
+      return std::less<uint8_t*>{}(a->Begin(), b->Begin());
+    }
+  };
+
+  // Map of arenas allocated in the LinearAlloc arena-pool to their last used
+  // (page-aligned) byte, captured during the compaction pause for concurrent updates.
+  std::map<const TrackedArena*, uint8_t*, LessByArenaAddr> linear_alloc_arenas_;
+  // Per-space data (shadow map and page-status array) for the arena-pool spaces.
+  // It's extremely rare to have more than one, but this is to be ready for the worst case.
+  class LinearAllocSpaceData {
+   public:
+    LinearAllocSpaceData(MemMap&& shadow,
+                         MemMap&& page_status_map,
+                         uint8_t* begin,
+                         uint8_t* end,
+                         bool already_shared)
+        : shadow_(std::move(shadow)),
+          page_status_map_(std::move(page_status_map)),
+          begin_(begin),
+          end_(end),
+          already_shared_(already_shared) {}
+
+    MemMap shadow_;
+    MemMap page_status_map_;
+    uint8_t* begin_;
+    uint8_t* end_;
+    // Indicates if the linear-alloc is already MAP_SHARED.
+    bool already_shared_;
+  };
+  std::vector<LinearAllocSpaceData> linear_alloc_spaces_data_;
+
   // The main space bitmap
   accounting::ContinuousSpaceBitmap* moving_space_bitmap_;
   accounting::ContinuousSpaceBitmap* non_moving_space_bitmap_;
@@ -520,16 +624,23 @@
   void* stack_end_;
 
   uint8_t* conc_compaction_termination_page_;
+  PointerSize pointer_size_;
   // Number of objects freed during this GC in moving space. It is decremented
   // every time an object is discovered. And total-object count is added to it
   // in MarkingPause(). It reaches the correct count only once the marking phase
   // is completed.
   int32_t freed_objects_;
+  // memfds for the moving space, used for userfaultfd's minor-fault feature.
+  // Initialized to kFdUnused to indicate that mmap should be MAP_PRIVATE in
+  // KernelPrepareRange().
+  int moving_to_space_fd_;
+  int moving_from_space_fd_;
   // Userfault file descriptor, accessed only by the GC itself.
   // kFallbackMode value indicates that we are in the fallback mode.
   int uffd_;
   // Used to exit from compaction loop at the end of concurrent compaction
   uint8_t thread_pool_counter_;
+  std::atomic<uint8_t> compaction_in_progress_count_;
   // True while compacting.
   bool compacting_;
   // Flag indicating whether one-time uffd initialization has been done. It will
@@ -538,6 +649,13 @@
   // Heap::PostForkChildAction() as it's invoked in app startup path. With
   // this, we register the compaction-termination page on the first GC.
   bool uffd_initialized_;
+  // Flag indicating if userfaultfd supports minor-faults. Set appropriately in
+  // CreateUserfaultfd(), where we get this information from the kernel.
+  bool uffd_minor_fault_supported_;
+  // For non-zygote processes this flag indicates if the spaces are ready to
+  // start using userfaultfd's minor-fault feature. This initialization involves
+  // starting to use shmem (memfd_create) for the userfaultfd protected spaces.
+  bool minor_fault_initialized_;
 
   class VerifyRootMarkedVisitor;
   class ScanObjectVisitor;
@@ -546,13 +664,17 @@
   class CardModifiedVisitor;
   class RefFieldsVisitor;
   template <bool kCheckBegin, bool kCheckEnd> class RefsUpdateVisitor;
-  class NativeRootsUpdateVisitor;
+  class ArenaPoolPageUpdater;
+  class ClassLoaderRootsUpdater;
+  class LinearAllocPageUpdater;
   class ImmuneSpaceUpdateObjVisitor;
   class ConcurrentCompactionGcTask;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(MarkCompact);
 };
 
+std::ostream& operator<<(std::ostream& os, MarkCompact::PageState value);
+
 }  // namespace collector
 }  // namespace gc
 }  // namespace art
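
For context on the minor-fault path that the new fields and modes above enable: with a MAP_SHARED mapping (e.g. shmem created via memfd_create) registered using UFFDIO_REGISTER_MODE_MINOR, a fault on a page that already has backing in the page cache is reported to user space, which resolves it with UFFDIO_CONTINUE instead of copying bytes. A reduced sketch (requires a kernel with shmem minor-fault support, roughly 5.13+; not ART's code):

#include <cstddef>
#include <cstdint>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>

// After the contents of the faulting page have been written through a second
// mapping of the same shmem file, install the existing page-cache page at the
// faulting address without a copy, waking any waiters.
static bool ResolveMinorFault(int uffd, uintptr_t fault_page, size_t page_size) {
  struct uffdio_continue cont = {};
  cont.range.start = fault_page;
  cont.range.len = page_size;
  return ioctl(uffd, UFFDIO_CONTINUE, &cont) == 0;
}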
diff --git a/runtime/linear_alloc-inl.h b/runtime/linear_alloc-inl.h
index a6b3df3..928bffb 100644
--- a/runtime/linear_alloc-inl.h
+++ b/runtime/linear_alloc-inl.h
@@ -26,6 +26,9 @@
 
 inline void LinearAlloc::SetFirstObject(void* begin, size_t bytes) const {
   DCHECK(track_allocations_);
+  if (ArenaAllocator::IsRunningOnMemoryTool()) {
+    bytes += ArenaAllocator::kMemoryToolRedZoneBytes;
+  }
   uint8_t* end = static_cast<uint8_t*>(begin) + bytes;
   Arena* arena = allocator_.GetHeadArena();
   DCHECK_NE(arena, nullptr);
diff --git a/runtime/linear_alloc.h b/runtime/linear_alloc.h
index 7353721..12c772b 100644
--- a/runtime/linear_alloc.h
+++ b/runtime/linear_alloc.h
@@ -26,7 +26,7 @@
 class ArenaPool;
 
 enum class LinearAllocKind : uint32_t {
-  kNoGCRoots,
+  kNoGCRoots = 0,  // No GC-root kind should always be 0.
   kGCRootArray,
   kArtMethodArray,
   kArtFieldArray,