Diffstat (limited to 'runtime/gc')
59 files changed, 6879 insertions, 978 deletions
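The first hunk below adds a bulk-reservation API, AtomicStack::BumpBack(), which lets a caller reserve several slots in one step and fill them in place instead of calling PushBack() per element. A minimal usage sketch, not part of the patch; the ObjectStack alias and StackReference::Assign() are assumed from the existing stack code:

    // Sketch only: batch-push a set of references onto an AtomicStack<mirror::Object>.
    // 'mark_stack' is assumed to be an ObjectStack (accounting::AtomicStack<mirror::Object>).
    StackReference<mirror::Object>* start;
    StackReference<mirror::Object>* end;
    if (mark_stack->BumpBack(refs.size(), &start, &end)) {
      // BumpBack() reserved [start, end); write the new elements there.
      for (mirror::Object* ref : refs) {
        (start++)->Assign(ref);
      }
      DCHECK_EQ(start, end);
    } else {
      // Reservation would overflow the stack; fall back to expanding the stack and
      // pushing one element at a time, as existing callers do when IsFull() is hit.
    }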
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h index 5e6bd88d73..a90a31963b 100644 --- a/runtime/gc/accounting/atomic_stack.h +++ b/runtime/gc/accounting/atomic_stack.h @@ -130,6 +130,35 @@ class AtomicStack { } } + // Bump the back index by the given number of slots. Returns false if this + // operation will overflow the stack. New elements should be written + // to [*start_address, *end_address). + bool BumpBack(size_t num_slots, + StackReference<T>** start_address, + StackReference<T>** end_address) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (kIsDebugBuild) { + debug_is_sorted_ = false; + } + const int32_t index = back_index_.load(std::memory_order_relaxed); + const int32_t new_index = index + num_slots; + if (UNLIKELY(static_cast<size_t>(new_index) >= growth_limit_)) { + // Stack overflow. + return false; + } + back_index_.store(new_index, std::memory_order_relaxed); + *start_address = begin_ + index; + *end_address = begin_ + new_index; + if (kIsDebugBuild) { + // Check the memory is zero. + for (int32_t i = index; i < new_index; i++) { + DCHECK_EQ(begin_[i].AsMirrorPtr(), static_cast<T*>(nullptr)) + << "i=" << i << " index=" << index << " new_index=" << new_index; + } + } + return true; + } + void PushBack(T* value) REQUIRES_SHARED(Locks::mutator_lock_) { if (kIsDebugBuild) { debug_is_sorted_ = false; @@ -144,8 +173,16 @@ class AtomicStack { DCHECK_GT(back_index_.load(std::memory_order_relaxed), front_index_.load(std::memory_order_relaxed)); // Decrement the back index non atomically. - back_index_.store(back_index_.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); - return begin_[back_index_.load(std::memory_order_relaxed)].AsMirrorPtr(); + const int32_t index = back_index_.load(std::memory_order_relaxed) - 1; + back_index_.store(index, std::memory_order_relaxed); + T* ret = begin_[index].AsMirrorPtr(); + // In debug builds we expect the stack elements to be null, which may not + // always be the case if the stack is being reused without resetting it + // in-between. + if (kIsDebugBuild) { + begin_[index].Clear(); + } + return ret; } // Take an item from the front of the stack. diff --git a/runtime/gc/accounting/bitmap.cc b/runtime/gc/accounting/bitmap.cc index 37646b3728..bd10958496 100644 --- a/runtime/gc/accounting/bitmap.cc +++ b/runtime/gc/accounting/bitmap.cc @@ -21,6 +21,7 @@ #include "base/bit_utils.h" #include "base/mem_map.h" #include "card_table.h" +#include "gc/collector/mark_compact.h" #include "jit/jit_memory_region.h" namespace art { @@ -98,6 +99,7 @@ MemoryRangeBitmap<kAlignment>* MemoryRangeBitmap<kAlignment>::CreateFromMemMap( template class MemoryRangeBitmap<CardTable::kCardSize>; template class MemoryRangeBitmap<jit::kJitCodeAccountingBytes>; +template class MemoryRangeBitmap<collector::MarkCompact::kAlignment>; } // namespace accounting } // namespace gc diff --git a/runtime/gc/accounting/bitmap.h b/runtime/gc/accounting/bitmap.h index 68f2d049d0..06398d6b10 100644 --- a/runtime/gc/accounting/bitmap.h +++ b/runtime/gc/accounting/bitmap.h @@ -81,7 +81,7 @@ class Bitmap { void CopyFrom(Bitmap* source_bitmap); // Starting address of our internal storage. 
- uintptr_t* Begin() { + uintptr_t* Begin() const { return bitmap_begin_; } @@ -98,7 +98,7 @@ class Bitmap { std::string Dump() const; protected: - static constexpr size_t kBitsPerBitmapWord = sizeof(uintptr_t) * kBitsPerByte; + static constexpr size_t kBitsPerBitmapWord = kBitsPerIntPtrT; Bitmap(MemMap&& mem_map, size_t bitmap_size); ~Bitmap(); @@ -109,7 +109,9 @@ class Bitmap { template<bool kSetBit> ALWAYS_INLINE bool ModifyBit(uintptr_t bit_index); - // Backing storage for bitmap. + // Backing storage for bitmap. This is interpreted as an array of + // kBitsPerBitmapWord-sized integers, with bits assigned in each word little + // endian first. MemMap mem_map_; // This bitmap itself, word sized for efficiency in scanning. @@ -122,7 +124,7 @@ class Bitmap { DISALLOW_IMPLICIT_CONSTRUCTORS(Bitmap); }; -// One bit per kAlignment in range (start, end] +// One bit per kAlignment in range [start, end) template<size_t kAlignment> class MemoryRangeBitmap : public Bitmap { public: @@ -138,7 +140,7 @@ class MemoryRangeBitmap : public Bitmap { // End of the memory range that the bitmap covers. ALWAYS_INLINE uintptr_t CoverEnd() const { - return cover_end_; + return cover_begin_ + kAlignment * BitmapSize(); } // Return the address associated with a bit index. @@ -150,39 +152,47 @@ class MemoryRangeBitmap : public Bitmap { // Return the bit index associated with an address . ALWAYS_INLINE uintptr_t BitIndexFromAddr(uintptr_t addr) const { - DCHECK(HasAddress(addr)) << CoverBegin() << " <= " << addr << " < " << CoverEnd(); - return (addr - CoverBegin()) / kAlignment; + uintptr_t result = (addr - CoverBegin()) / kAlignment; + DCHECK(result < BitmapSize()) << CoverBegin() << " <= " << addr << " < " << CoverEnd(); + return result; } ALWAYS_INLINE bool HasAddress(const uintptr_t addr) const { - return cover_begin_ <= addr && addr < cover_end_; + // Don't use BitIndexFromAddr() here as the addr passed to this function + // could be outside the range. If addr < cover_begin_, then the result + // underflows to some very large value past the end of the bitmap. + // Therefore, all operations are unsigned here. + bool ret = (addr - CoverBegin()) / kAlignment < BitmapSize(); + if (ret) { + DCHECK(CoverBegin() <= addr && addr < CoverEnd()) + << CoverBegin() << " <= " << addr << " < " << CoverEnd(); + } + return ret; } ALWAYS_INLINE bool Set(uintptr_t addr) { return SetBit(BitIndexFromAddr(addr)); } - ALWAYS_INLINE bool Clear(size_t addr) { + ALWAYS_INLINE bool Clear(uintptr_t addr) { return ClearBit(BitIndexFromAddr(addr)); } - ALWAYS_INLINE bool Test(size_t addr) const { + ALWAYS_INLINE bool Test(uintptr_t addr) const { return TestBit(BitIndexFromAddr(addr)); } // Returns true if the object was previously set. 
- ALWAYS_INLINE bool AtomicTestAndSet(size_t addr) { + ALWAYS_INLINE bool AtomicTestAndSet(uintptr_t addr) { return AtomicTestAndSetBit(BitIndexFromAddr(addr)); } private: MemoryRangeBitmap(MemMap&& mem_map, uintptr_t begin, size_t num_bits) : Bitmap(std::move(mem_map), num_bits), - cover_begin_(begin), - cover_end_(begin + kAlignment * num_bits) {} + cover_begin_(begin) {} uintptr_t const cover_begin_; - uintptr_t const cover_end_; DISALLOW_IMPLICIT_CONSTRUCTORS(MemoryRangeBitmap); }; diff --git a/runtime/gc/accounting/card_table.cc b/runtime/gc/accounting/card_table.cc index fdf1615f5e..b8b328c795 100644 --- a/runtime/gc/accounting/card_table.cc +++ b/runtime/gc/accounting/card_table.cc @@ -31,11 +31,6 @@ namespace art { namespace gc { namespace accounting { -constexpr size_t CardTable::kCardShift; -constexpr size_t CardTable::kCardSize; -constexpr uint8_t CardTable::kCardClean; -constexpr uint8_t CardTable::kCardDirty; - /* * Maintain a card table from the write barrier. All writes of * non-null values to heap addresses should go through an entry in diff --git a/runtime/gc/accounting/card_table_test.cc b/runtime/gc/accounting/card_table_test.cc index 12baaa4b4e..b34a883f52 100644 --- a/runtime/gc/accounting/card_table_test.cc +++ b/runtime/gc/accounting/card_table_test.cc @@ -19,8 +19,8 @@ #include <string> #include "base/atomic.h" +#include "base/common_art_test.h" #include "base/utils.h" -#include "common_runtime_test.h" #include "handle_scope-inl.h" #include "mirror/class-inl.h" #include "mirror/string-inl.h" // Strings are easiest to allocate @@ -36,7 +36,7 @@ class Object; namespace gc { namespace accounting { -class CardTableTest : public CommonRuntimeTest { +class CardTableTest : public CommonArtTest { public: std::unique_ptr<CardTable> card_table_; diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc index b4026fc3f3..4a84799431 100644 --- a/runtime/gc/accounting/mod_union_table.cc +++ b/runtime/gc/accounting/mod_union_table.cc @@ -388,6 +388,11 @@ void ModUnionTableReferenceCache::Dump(std::ostream& os) { void ModUnionTableReferenceCache::VisitObjects(ObjectCallback callback, void* arg) { CardTable* const card_table = heap_->GetCardTable(); ContinuousSpaceBitmap* live_bitmap = space_->GetLiveBitmap(); + // Use an unordered_set for constant time search of card in the second loop. + // We don't want to change cleared_cards_ to unordered so that traversals are + // sequential in address order. + // TODO: Optimize this. + std::unordered_set<const uint8_t*> card_lookup_map; for (uint8_t* card : cleared_cards_) { uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card)); uintptr_t end = start + CardTable::kCardSize; @@ -396,10 +401,13 @@ void ModUnionTableReferenceCache::VisitObjects(ObjectCallback callback, void* ar [callback, arg](mirror::Object* obj) { callback(obj, arg); }); + card_lookup_map.insert(card); } - // This may visit the same card twice, TODO avoid this. 
for (const auto& pair : references_) { const uint8_t* card = pair.first; + if (card_lookup_map.find(card) != card_lookup_map.end()) { + continue; + } uintptr_t start = reinterpret_cast<uintptr_t>(card_table->AddrFromCard(card)); uintptr_t end = start + CardTable::kCardSize; live_bitmap->VisitMarkedRange(start, diff --git a/runtime/gc/accounting/mod_union_table_test.cc b/runtime/gc/accounting/mod_union_table_test.cc index e42682a112..3f38f5069e 100644 --- a/runtime/gc/accounting/mod_union_table_test.cc +++ b/runtime/gc/accounting/mod_union_table_test.cc @@ -46,6 +46,7 @@ class ModUnionTableFactory { class ModUnionTableTest : public CommonRuntimeTest { public: ModUnionTableTest() : java_lang_object_array_(nullptr) { + use_boot_image_ = true; // Make the Runtime creation cheaper. } mirror::ObjectArray<mirror::Object>* AllocObjectArray( Thread* self, space::ContinuousMemMapAllocSpace* space, size_t component_count) diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h index d460e00075..e7825e6953 100644 --- a/runtime/gc/accounting/space_bitmap-inl.h +++ b/runtime/gc/accounting/space_bitmap-inl.h @@ -64,7 +64,44 @@ inline bool SpaceBitmap<kAlignment>::Test(const mirror::Object* obj) const { } template<size_t kAlignment> -template<typename Visitor> +inline mirror::Object* SpaceBitmap<kAlignment>::FindPrecedingObject(uintptr_t visit_begin, + uintptr_t visit_end) const { + // Covers [visit_end, visit_begin]. + visit_end = std::max(heap_begin_, visit_end); + DCHECK_LE(visit_end, visit_begin); + DCHECK_LT(visit_begin, HeapLimit()); + + const uintptr_t offset_start = visit_begin - heap_begin_; + const uintptr_t offset_end = visit_end - heap_begin_; + uintptr_t index_start = OffsetToIndex(offset_start); + const uintptr_t index_end = OffsetToIndex(offset_end); + + // Start with the right edge + uintptr_t word = bitmap_begin_[index_start].load(std::memory_order_relaxed); + // visit_begin could be the first word of the object we are looking for. 
+ const uintptr_t right_edge_mask = OffsetToMask(offset_start); + word &= right_edge_mask | (right_edge_mask - 1); + while (index_start > index_end) { + if (word != 0) { + const uintptr_t ptr_base = IndexToOffset(index_start) + heap_begin_; + size_t pos_leading_set_bit = kBitsPerIntPtrT - CLZ(word) - 1; + return reinterpret_cast<mirror::Object*>(ptr_base + pos_leading_set_bit * kAlignment); + } + word = bitmap_begin_[--index_start].load(std::memory_order_relaxed); + } + + word &= ~(OffsetToMask(offset_end) - 1); + if (word != 0) { + const uintptr_t ptr_base = IndexToOffset(index_end) + heap_begin_; + size_t pos_leading_set_bit = kBitsPerIntPtrT - CLZ(word) - 1; + return reinterpret_cast<mirror::Object*>(ptr_base + pos_leading_set_bit * kAlignment); + } else { + return nullptr; + } +} + +template<size_t kAlignment> +template<bool kVisitOnce, typename Visitor> inline void SpaceBitmap<kAlignment>::VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end, Visitor&& visitor) const { @@ -114,6 +151,9 @@ inline void SpaceBitmap<kAlignment>::VisitMarkedRange(uintptr_t visit_begin, const size_t shift = CTZ(left_edge); mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment); visitor(obj); + if (kVisitOnce) { + return; + } left_edge ^= (static_cast<uintptr_t>(1)) << shift; } while (left_edge != 0); } @@ -128,6 +168,9 @@ inline void SpaceBitmap<kAlignment>::VisitMarkedRange(uintptr_t visit_begin, const size_t shift = CTZ(w); mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment); visitor(obj); + if (kVisitOnce) { + return; + } w ^= (static_cast<uintptr_t>(1)) << shift; } while (w != 0); } @@ -155,6 +198,9 @@ inline void SpaceBitmap<kAlignment>::VisitMarkedRange(uintptr_t visit_begin, const size_t shift = CTZ(right_edge); mirror::Object* obj = reinterpret_cast<mirror::Object*>(ptr_base + shift * kAlignment); visitor(obj); + if (kVisitOnce) { + return; + } right_edge ^= (static_cast<uintptr_t>(1)) << shift; } while (right_edge != 0); } diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc index 3c5688d5bd..a0458d2ae1 100644 --- a/runtime/gc/accounting/space_bitmap.cc +++ b/runtime/gc/accounting/space_bitmap.cc @@ -16,6 +16,9 @@ #include "space_bitmap-inl.h" +#include <iomanip> +#include <sstream> + #include "android-base/stringprintf.h" #include "art_field-inl.h" @@ -113,6 +116,37 @@ std::string SpaceBitmap<kAlignment>::Dump() const { reinterpret_cast<void*>(HeapLimit())); } +template <size_t kAlignment> +std::string SpaceBitmap<kAlignment>::DumpMemAround(mirror::Object* obj) const { + uintptr_t addr = reinterpret_cast<uintptr_t>(obj); + DCHECK_GE(addr, heap_begin_); + DCHECK(HasAddress(obj)) << obj; + const uintptr_t offset = addr - heap_begin_; + const size_t index = OffsetToIndex(offset); + const uintptr_t mask = OffsetToMask(offset); + size_t num_entries = bitmap_size_ / sizeof(uintptr_t); + DCHECK_LT(index, num_entries) << " bitmap_size_ = " << bitmap_size_; + Atomic<uintptr_t>* atomic_entry = &bitmap_begin_[index]; + uintptr_t prev = 0; + uintptr_t next = 0; + if (index > 0) { + prev = (atomic_entry - 1)->load(std::memory_order_relaxed); + } + uintptr_t curr = atomic_entry->load(std::memory_order_relaxed); + if (index < num_entries - 1) { + next = (atomic_entry + 1)->load(std::memory_order_relaxed); + } + std::ostringstream oss; + oss << " offset: " << offset + << " index: " << index + << " mask: " << std::hex << std::setfill('0') << std::setw(16) << mask + << " words {" << std::hex << 
std::setfill('0') << std::setw(16) << prev + << ", " << std::hex << std::setfill('0') << std::setw(16) << curr + << ", " << std::hex <<std::setfill('0') << std::setw(16) << next + << "}"; + return oss.str(); +} + template<size_t kAlignment> void SpaceBitmap<kAlignment>::Clear() { if (bitmap_begin_ != nullptr) { diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h index 0d8ffa0d67..e3189331c4 100644 --- a/runtime/gc/accounting/space_bitmap.h +++ b/runtime/gc/accounting/space_bitmap.h @@ -40,8 +40,8 @@ namespace accounting { template<size_t kAlignment> class SpaceBitmap { public: - typedef void ScanCallback(mirror::Object* obj, void* finger, void* arg); - typedef void SweepCallback(size_t ptr_count, mirror::Object** ptrs, void* arg); + using ScanCallback = void(mirror::Object* obj, void* finger, void* arg); + using SweepCallback = void(size_t ptr_count, mirror::Object** ptrs, void* arg); // Initialize a space bitmap so that it points to a bitmap large enough to cover a heap at // heap_begin of heap_capacity bytes, where objects are guaranteed to be kAlignment-aligned. @@ -131,10 +131,15 @@ class SpaceBitmap { } } - // Visit the live objects in the range [visit_begin, visit_end). + // Find first object while scanning bitmap backwards from visit_begin -> visit_end. + // Covers [visit_end, visit_begin] range. + mirror::Object* FindPrecedingObject(uintptr_t visit_begin, uintptr_t visit_end = 0) const; + + // Visit the live objects in the range [visit_begin, visit_end). If kVisitOnce + // is true, then only the first live object will be visited. // TODO: Use lock annotations when clang is fixed. // REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_); - template <typename Visitor> + template <bool kVisitOnce = false, typename Visitor> void VisitMarkedRange(uintptr_t visit_begin, uintptr_t visit_end, Visitor&& visitor) const NO_THREAD_SAFETY_ANALYSIS; @@ -159,7 +164,7 @@ class SpaceBitmap { void CopyFrom(SpaceBitmap* source_bitmap); // Starting address of our internal storage. - Atomic<uintptr_t>* Begin() { + Atomic<uintptr_t>* Begin() const { return bitmap_begin_; } @@ -202,6 +207,9 @@ class SpaceBitmap { std::string Dump() const; + // Dump three bitmap words around obj. + std::string DumpMemAround(mirror::Object* obj) const; + // Helper function for computing bitmap size based on a 64 bit capacity. 
static size_t ComputeBitmapSize(uint64_t capacity); static size_t ComputeHeapSize(uint64_t bitmap_bytes); diff --git a/runtime/gc/accounting/space_bitmap_test.cc b/runtime/gc/accounting/space_bitmap_test.cc index 3a69865267..8fcf102406 100644 --- a/runtime/gc/accounting/space_bitmap_test.cc +++ b/runtime/gc/accounting/space_bitmap_test.cc @@ -19,8 +19,8 @@ #include <stdint.h> #include <memory> +#include "base/common_art_test.h" #include "base/mutex.h" -#include "common_runtime_test.h" #include "runtime_globals.h" #include "space_bitmap-inl.h" @@ -28,7 +28,7 @@ namespace art { namespace gc { namespace accounting { -class SpaceBitmapTest : public CommonRuntimeTest {}; +class SpaceBitmapTest : public CommonArtTest {}; TEST_F(SpaceBitmapTest, Init) { uint8_t* heap_begin = reinterpret_cast<uint8_t*>(0x10000000); diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc index 7bcf375b16..f0d379fde6 100644 --- a/runtime/gc/allocation_record.cc +++ b/runtime/gc/allocation_record.cc @@ -59,6 +59,11 @@ AllocRecordObjectMap::~AllocRecordObjectMap() { } void AllocRecordObjectMap::VisitRoots(RootVisitor* visitor) { + // When we are compacting in userfaultfd GC, the class GC-roots are already + // updated in SweepAllocationRecords()->SweepClassObject(). + if (Runtime::Current()->GetHeap()->IsPerformingUffdCompaction()) { + return; + } CHECK_LE(recent_record_max_, alloc_record_max_); BufferedRootVisitor<kDefaultBufferedRootCount> buffered_visitor(visitor, RootInfo(kRootDebugger)); size_t count = recent_record_max_; @@ -92,7 +97,10 @@ static inline void SweepClassObject(AllocRecord* record, IsMarkedVisitor* visito mirror::Object* new_object = visitor->IsMarked(old_object); DCHECK(new_object != nullptr); if (UNLIKELY(old_object != new_object)) { - klass = GcRoot<mirror::Class>(new_object->AsClass()); + // We can't use AsClass() as it uses IsClass in a DCHECK, which expects + // the class' contents to be there. This is not the case in userfaultfd + // GC. + klass = GcRoot<mirror::Class>(ObjPtr<mirror::Class>::DownCast(new_object)); } } } @@ -131,13 +139,13 @@ void AllocRecordObjectMap::SweepAllocationRecords(IsMarkedVisitor* visitor) { } void AllocRecordObjectMap::AllowNewAllocationRecords() { - CHECK(!kUseReadBarrier); + CHECK(!gUseReadBarrier); allow_new_record_ = true; new_record_condition_.Broadcast(Thread::Current()); } void AllocRecordObjectMap::DisallowNewAllocationRecords() { - CHECK(!kUseReadBarrier); + CHECK(!gUseReadBarrier); allow_new_record_ = false; } @@ -230,8 +238,8 @@ void AllocRecordObjectMap::RecordAllocation(Thread* self, // Since nobody seemed to really notice or care it might not be worth the trouble. // Wait for GC's sweeping to complete and allow new records. - while (UNLIKELY((!kUseReadBarrier && !allow_new_record_) || - (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) { + while (UNLIKELY((!gUseReadBarrier && !allow_new_record_) || + (gUseReadBarrier && !self->GetWeakRefAccessEnabled()))) { // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the // presence of threads blocking for weak ref access. self->CheckEmptyCheckpointFromWeakRefAccess(Locks::alloc_tracker_lock_); diff --git a/runtime/gc/allocator/dlmalloc.cc b/runtime/gc/allocator/art-dlmalloc.cc index 79d4fbfb5a..de0c85a407 100644 --- a/runtime/gc/allocator/dlmalloc.cc +++ b/runtime/gc/allocator/art-dlmalloc.cc @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "dlmalloc.h" +#include "art-dlmalloc.h" #include <android-base/logging.h> @@ -39,8 +39,8 @@ static void art_heap_usage_error(const char* function, void* p); #pragma GCC diagnostic ignored "-Wstrict-aliasing" #pragma GCC diagnostic ignored "-Wnull-pointer-arithmetic" #pragma GCC diagnostic ignored "-Wexpansion-to-defined" -#include "../../../external/dlmalloc/malloc.c" -// Note: malloc.c uses a DEBUG define to drive debug code. This interferes with the DEBUG severity +#include "dlmalloc.c" // NOLINT +// Note: dlmalloc.c uses a DEBUG define to drive debug code. This interferes with the DEBUG severity // of libbase, so undefine it now. #undef DEBUG #pragma GCC diagnostic pop diff --git a/runtime/gc/allocator/dlmalloc.h b/runtime/gc/allocator/art-dlmalloc.h index b12691ad0e..296de72c70 100644 --- a/runtime/gc/allocator/dlmalloc.h +++ b/runtime/gc/allocator/art-dlmalloc.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef ART_RUNTIME_GC_ALLOCATOR_DLMALLOC_H_ -#define ART_RUNTIME_GC_ALLOCATOR_DLMALLOC_H_ +#ifndef ART_RUNTIME_GC_ALLOCATOR_ART_DLMALLOC_H_ +#define ART_RUNTIME_GC_ALLOCATOR_ART_DLMALLOC_H_ #include <cstdint> @@ -33,7 +33,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wredundant-decls" #pragma GCC diagnostic ignored "-Wnull-pointer-arithmetic" -#include "../../external/dlmalloc/malloc.h" +#include "dlmalloc.h" #pragma GCC diagnostic pop // Callback for dlmalloc_inspect_all or mspace_inspect_all that will madvise(2) unused @@ -58,4 +58,4 @@ void* ArtDlMallocMoreCore(void* mspace, intptr_t increment); } // namespace gc } // namespace art -#endif // ART_RUNTIME_GC_ALLOCATOR_DLMALLOC_H_ +#endif // ART_RUNTIME_GC_ALLOCATOR_ART_DLMALLOC_H_ diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc index 0de62fef47..1f123aaff5 100644 --- a/runtime/gc/collector/concurrent_copying.cc +++ b/runtime/gc/collector/concurrent_copying.cc @@ -160,17 +160,31 @@ ConcurrentCopying::ConcurrentCopying(Heap* heap, if (young_gen_) { gc_time_histogram_ = metrics->YoungGcCollectionTime(); metrics_gc_count_ = metrics->YoungGcCount(); + metrics_gc_count_delta_ = metrics->YoungGcCountDelta(); gc_throughput_histogram_ = metrics->YoungGcThroughput(); gc_tracing_throughput_hist_ = metrics->YoungGcTracingThroughput(); gc_throughput_avg_ = metrics->YoungGcThroughputAvg(); gc_tracing_throughput_avg_ = metrics->YoungGcTracingThroughputAvg(); + gc_scanned_bytes_ = metrics->YoungGcScannedBytes(); + gc_scanned_bytes_delta_ = metrics->YoungGcScannedBytesDelta(); + gc_freed_bytes_ = metrics->YoungGcFreedBytes(); + gc_freed_bytes_delta_ = metrics->YoungGcFreedBytesDelta(); + gc_duration_ = metrics->YoungGcDuration(); + gc_duration_delta_ = metrics->YoungGcDurationDelta(); } else { gc_time_histogram_ = metrics->FullGcCollectionTime(); metrics_gc_count_ = metrics->FullGcCount(); + metrics_gc_count_delta_ = metrics->FullGcCountDelta(); gc_throughput_histogram_ = metrics->FullGcThroughput(); gc_tracing_throughput_hist_ = metrics->FullGcTracingThroughput(); gc_throughput_avg_ = metrics->FullGcThroughputAvg(); gc_tracing_throughput_avg_ = metrics->FullGcTracingThroughputAvg(); + gc_scanned_bytes_ = metrics->FullGcScannedBytes(); + gc_scanned_bytes_delta_ = metrics->FullGcScannedBytesDelta(); + gc_freed_bytes_ = metrics->FullGcFreedBytes(); + gc_freed_bytes_delta_ = metrics->FullGcFreedBytesDelta(); + gc_duration_ = metrics->FullGcDuration(); + gc_duration_delta_ = metrics->FullGcDurationDelta(); } } @@ -575,10 +589,11 @@ class 
ConcurrentCopying::FlipCallback : public Closure { if (kIsDebugBuild && !cc->use_generational_cc_) { cc->region_space_->AssertAllRegionLiveBytesZeroOrCleared(); } - if (UNLIKELY(Runtime::Current()->IsActiveTransaction())) { - CHECK(Runtime::Current()->IsAotCompiler()); + Runtime* runtime = Runtime::Current(); + if (UNLIKELY(runtime->IsActiveTransaction())) { + CHECK(runtime->IsAotCompiler()); TimingLogger::ScopedTiming split3("(Paused)VisitTransactionRoots", cc->GetTimings()); - Runtime::Current()->VisitTransactionRoots(cc); + runtime->VisitTransactionRoots(cc); } if (kUseBakerReadBarrier && kGrayDirtyImmuneObjects) { cc->GrayAllNewlyDirtyImmuneObjects(); @@ -587,15 +602,10 @@ class ConcurrentCopying::FlipCallback : public Closure { cc->VerifyGrayImmuneObjects(); } } - // May be null during runtime creation, in this case leave java_lang_Object null. - // This is safe since single threaded behavior should mean FillWithFakeObject does not - // happen when java_lang_Object_ is null. - if (WellKnownClasses::java_lang_Object != nullptr) { - cc->java_lang_Object_ = down_cast<mirror::Class*>(cc->Mark(thread, - WellKnownClasses::ToClass(WellKnownClasses::java_lang_Object).Ptr())); - } else { - cc->java_lang_Object_ = nullptr; - } + ObjPtr<mirror::Class> java_lang_Object = + GetClassRoot<mirror::Object, kWithoutReadBarrier>(runtime->GetClassLinker()); + DCHECK(java_lang_Object != nullptr); + cc->java_lang_Object_ = down_cast<mirror::Class*>(cc->Mark(thread, java_lang_Object.Ptr())); } private: @@ -1692,8 +1702,6 @@ void ConcurrentCopying::CopyingPhase() { if (kVerboseMode) { LOG(INFO) << "SweepSystemWeaks done"; } - // Free data for class loaders that we unloaded. - Runtime::Current()->GetClassLinker()->CleanupClassLoaders(); // Marking is done. Disable marking. DisableMarking(); CheckEmptyMarkStack(); @@ -1739,6 +1747,10 @@ class ConcurrentCopying::DisableMarkingCheckpoint : public Closure { thread->IsSuspended() || thread->GetState() == ThreadState::kWaitingPerformingGc) << thread->GetState() << " thread " << thread << " self " << self; + // We sweep interpreter caches here so that it can be done after all + // reachable objects are marked and the mutators can sweep their caches + // without synchronization. + thread->SweepInterpreterCache(concurrent_copying_); // Disable the thread-local is_gc_marking flag. // Note a thread that has just started right before this checkpoint may have already this flag // set to false, which is ok. @@ -1887,7 +1899,10 @@ void ConcurrentCopying::PushOntoMarkStack(Thread* const self, mirror::Object* to << " cc->is_marking=" << is_marking_; CHECK(self == thread_running_gc_) << "Only GC-running thread should access the mark stack " - << "in the GC exclusive mark stack mode"; + << "in the GC exclusive mark stack mode. " + << "ref=" << to_ref + << " self->gc_marking=" << self->GetIsGcMarking() + << " cc->is_marking=" << is_marking_; // Access the GC mark stack without a lock. if (UNLIKELY(gc_mark_stack_->IsFull())) { ExpandGcMarkStack(); @@ -2716,6 +2731,11 @@ void ConcurrentCopying::ReclaimPhase() { } Thread* self = Thread::Current(); + // Free data for class loaders that we unloaded. This includes removing + // dead methods from JIT's internal maps. This must be done before + // reclaiming the memory of the dead methods' declaring classes. + Runtime::Current()->GetClassLinker()->CleanupClassLoaders(); + { // Double-check that the mark stack is empty. // Note: need to set this after VerifyNoFromSpaceRef(). 
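The space_bitmap changes earlier in this diff add two lookup helpers that the new mark-compact collector relies on: FindPrecedingObject(), which scans the mark bitmap backwards, and a kVisitOnce template flag for VisitMarkedRange() that stops after the first marked object. A hedged usage sketch; the helper names and the accounting::ContinuousSpaceBitmap alias are assumptions, not part of this patch:

    // Closest marked object starting at or before 'addr'; nullptr if no bit is set
    // in the covered range [space_begin, addr].
    mirror::Object* PrecedingObject(accounting::ContinuousSpaceBitmap* bitmap,
                                    uintptr_t addr,
                                    uintptr_t space_begin) {
      return bitmap->FindPrecedingObject(addr, space_begin);
    }

    // First marked object in [page_begin, page_end); kVisitOnce makes the walk
    // return after the first visited object.
    mirror::Object* FirstMarkedObject(accounting::ContinuousSpaceBitmap* bitmap,
                                      uintptr_t page_begin,
                                      uintptr_t page_end) {
      mirror::Object* first = nullptr;
      bitmap->VisitMarkedRange</*kVisitOnce=*/true>(
          page_begin, page_end, [&first](mirror::Object* obj) { first = obj; });
      return first;
    }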
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h index c274fed23b..888c38aa95 100644 --- a/runtime/gc/collector/concurrent_copying.h +++ b/runtime/gc/collector/concurrent_copying.h @@ -161,6 +161,10 @@ class ConcurrentCopying : public GarbageCollector { REQUIRES_SHARED(Locks::mutator_lock_); void AssertNoThreadMarkStackMapping(Thread* thread) REQUIRES(!mark_stack_lock_); + // Dump information about reference `ref` and return it as a string. + // Use `ref_name` to name the reference in messages. Each message is prefixed with `indent`. + std::string DumpReferenceInfo(mirror::Object* ref, const char* ref_name, const char* indent = "") + REQUIRES_SHARED(Locks::mutator_lock_); private: void PushOntoMarkStack(Thread* const self, mirror::Object* obj) @@ -282,10 +286,6 @@ class ConcurrentCopying : public GarbageCollector { void ComputeUnevacFromSpaceLiveRatio(); void LogFromSpaceRefHolder(mirror::Object* obj, MemberOffset offset) REQUIRES_SHARED(Locks::mutator_lock_); - // Dump information about reference `ref` and return it as a string. - // Use `ref_name` to name the reference in messages. Each message is prefixed with `indent`. - std::string DumpReferenceInfo(mirror::Object* ref, const char* ref_name, const char* indent = "") - REQUIRES_SHARED(Locks::mutator_lock_); // Dump information about heap reference `ref`, referenced from object `obj` at offset `offset`, // and return it as a string. std::string DumpHeapReference(mirror::Object* obj, MemberOffset offset, mirror::Object* ref) diff --git a/runtime/gc/collector/garbage_collector.cc b/runtime/gc/collector/garbage_collector.cc index 80b39824ec..03a432dbf4 100644 --- a/runtime/gc/collector/garbage_collector.cc +++ b/runtime/gc/collector/garbage_collector.cc @@ -72,10 +72,17 @@ GarbageCollector::GarbageCollector(Heap* heap, const std::string& name) freed_bytes_histogram_((name_ + " freed-bytes").c_str(), kMemBucketSize, kMemBucketCount), gc_time_histogram_(nullptr), metrics_gc_count_(nullptr), + metrics_gc_count_delta_(nullptr), gc_throughput_histogram_(nullptr), gc_tracing_throughput_hist_(nullptr), gc_throughput_avg_(nullptr), gc_tracing_throughput_avg_(nullptr), + gc_scanned_bytes_(nullptr), + gc_scanned_bytes_delta_(nullptr), + gc_freed_bytes_(nullptr), + gc_freed_bytes_delta_(nullptr), + gc_duration_(nullptr), + gc_duration_delta_(nullptr), cumulative_timings_(name), pause_histogram_lock_("pause histogram lock", kDefaultMutexLevel, true), is_transaction_active_(false), @@ -189,19 +196,26 @@ void GarbageCollector::Run(GcCause gc_cause, bool clear_soft_references) { RegisterPause(duration_ns); } total_time_ns_ += duration_ns; - uint64_t total_pause_time = 0; + uint64_t total_pause_time_ns = 0; for (uint64_t pause_time : current_iteration->GetPauseTimes()) { MutexLock mu(self, pause_histogram_lock_); pause_histogram_.AdjustAndAddValue(pause_time); - total_pause_time += pause_time; + total_pause_time_ns += pause_time; } metrics::ArtMetrics* metrics = runtime->GetMetrics(); // Report STW pause time in microseconds. - metrics->WorldStopTimeDuringGCAvg()->Add(total_pause_time / 1'000); + const uint64_t total_pause_time_us = total_pause_time_ns / 1'000; + metrics->WorldStopTimeDuringGCAvg()->Add(total_pause_time_us); + metrics->GcWorldStopTime()->Add(total_pause_time_us); + metrics->GcWorldStopTimeDelta()->Add(total_pause_time_us); + metrics->GcWorldStopCount()->AddOne(); + metrics->GcWorldStopCountDelta()->AddOne(); // Report total collection time of all GCs put together. 
metrics->TotalGcCollectionTime()->Add(NsToMs(duration_ns)); + metrics->TotalGcCollectionTimeDelta()->Add(NsToMs(duration_ns)); if (are_metrics_initialized_) { metrics_gc_count_->Add(1); + metrics_gc_count_delta_->Add(1); // Report GC time in milliseconds. gc_time_histogram_->Add(NsToMs(duration_ns)); // Throughput in bytes/s. Add 1us to prevent possible division by 0. @@ -216,6 +230,13 @@ void GarbageCollector::Run(GcCause gc_cause, bool clear_soft_references) { throughput = current_iteration->GetEstimatedThroughput() / MB; gc_throughput_histogram_->Add(throughput); gc_throughput_avg_->Add(throughput); + + gc_scanned_bytes_->Add(current_iteration->GetScannedBytes()); + gc_scanned_bytes_delta_->Add(current_iteration->GetScannedBytes()); + gc_freed_bytes_->Add(current_iteration->GetFreedBytes()); + gc_freed_bytes_delta_->Add(current_iteration->GetFreedBytes()); + gc_duration_->Add(NsToMs(current_iteration->GetDurationNs())); + gc_duration_delta_->Add(NsToMs(current_iteration->GetDurationNs())); } is_transaction_active_ = false; } diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h index d439914621..948a868bd2 100644 --- a/runtime/gc/collector/garbage_collector.h +++ b/runtime/gc/collector/garbage_collector.h @@ -162,10 +162,17 @@ class GarbageCollector : public RootVisitor, public IsMarkedVisitor, public Mark Histogram<size_t> freed_bytes_histogram_; metrics::MetricsBase<int64_t>* gc_time_histogram_; metrics::MetricsBase<uint64_t>* metrics_gc_count_; + metrics::MetricsBase<uint64_t>* metrics_gc_count_delta_; metrics::MetricsBase<int64_t>* gc_throughput_histogram_; metrics::MetricsBase<int64_t>* gc_tracing_throughput_hist_; metrics::MetricsBase<uint64_t>* gc_throughput_avg_; metrics::MetricsBase<uint64_t>* gc_tracing_throughput_avg_; + metrics::MetricsBase<uint64_t>* gc_scanned_bytes_; + metrics::MetricsBase<uint64_t>* gc_scanned_bytes_delta_; + metrics::MetricsBase<uint64_t>* gc_freed_bytes_; + metrics::MetricsBase<uint64_t>* gc_freed_bytes_delta_; + metrics::MetricsBase<uint64_t>* gc_duration_; + metrics::MetricsBase<uint64_t>* gc_duration_delta_; uint64_t total_thread_cpu_time_ns_; uint64_t total_time_ns_; uint64_t total_freed_objects_; diff --git a/runtime/gc/collector/immune_spaces_test.cc b/runtime/gc/collector/immune_spaces_test.cc index a0ea60d4c5..caa8106228 100644 --- a/runtime/gc/collector/immune_spaces_test.cc +++ b/runtime/gc/collector/immune_spaces_test.cc @@ -16,7 +16,8 @@ #include <sys/mman.h> -#include "common_runtime_test.h" +#include "base/common_art_test.h" +#include "base/utils.h" #include "gc/collector/immune_spaces.h" #include "gc/space/image_space.h" #include "gc/space/space-inl.h" @@ -46,7 +47,7 @@ class FakeImageSpace : public space::ImageSpace { MemMap&& oat_map) : ImageSpace("FakeImageSpace", /*image_location=*/"", - /*profile_file=*/{}, + /*profile_files=*/{}, std::move(map), std::move(live_bitmap), map.End()), @@ -59,7 +60,7 @@ class FakeImageSpace : public space::ImageSpace { MemMap oat_map_; }; -class ImmuneSpacesTest : public CommonRuntimeTest { +class ImmuneSpacesTest : public CommonArtTest { static constexpr size_t kMaxBitmaps = 10; public: diff --git a/runtime/gc/collector/mark_compact-inl.h b/runtime/gc/collector/mark_compact-inl.h new file mode 100644 index 0000000000..c9b792e8f6 --- /dev/null +++ b/runtime/gc/collector/mark_compact-inl.h @@ -0,0 +1,394 @@ +/* + * Copyright 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_INL_H_ +#define ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_INL_H_ + +#include "gc/space/bump_pointer_space.h" +#include "mark_compact.h" +#include "mirror/object-inl.h" + +namespace art { +namespace gc { +namespace collector { + +inline void MarkCompact::UpdateClassAfterObjectMap(mirror::Object* obj) { + mirror::Class* klass = obj->GetClass<kVerifyNone, kWithoutReadBarrier>(); + // Track a class if it needs walking super-classes for visiting references or + // if it's higher in address order than its objects and is in moving space. + if (UNLIKELY( + (std::less<mirror::Object*>{}(obj, klass) && bump_pointer_space_->HasAddress(klass)) || + (klass->GetReferenceInstanceOffsets<kVerifyNone>() == mirror::Class::kClassWalkSuper && + walk_super_class_cache_ != klass))) { + // Since this function gets invoked in the compaction pause as well, it is + // preferable to store such super class separately rather than updating key + // as the latter would require traversing the hierarchy for every object of 'klass'. + auto ret1 = class_after_obj_hash_map_.try_emplace(ObjReference::FromMirrorPtr(klass), + ObjReference::FromMirrorPtr(obj)); + if (ret1.second) { + if (klass->GetReferenceInstanceOffsets<kVerifyNone>() == mirror::Class::kClassWalkSuper) { + // In this case we require traversing through the super class hierarchy + // and find the super class at the highest address order. + mirror::Class* highest_klass = bump_pointer_space_->HasAddress(klass) ? klass : nullptr; + for (ObjPtr<mirror::Class> k = klass->GetSuperClass<kVerifyNone, kWithoutReadBarrier>(); + k != nullptr; + k = k->GetSuperClass<kVerifyNone, kWithoutReadBarrier>()) { + // TODO: Can we break once we encounter a super class outside the moving space? 
+ if (bump_pointer_space_->HasAddress(k.Ptr())) { + highest_klass = std::max(highest_klass, k.Ptr(), std::less<mirror::Class*>()); + } + } + if (highest_klass != nullptr && highest_klass != klass) { + auto ret2 = super_class_after_class_hash_map_.try_emplace( + ObjReference::FromMirrorPtr(klass), ObjReference::FromMirrorPtr(highest_klass)); + DCHECK(ret2.second); + } else { + walk_super_class_cache_ = klass; + } + } + } else if (std::less<mirror::Object*>{}(obj, ret1.first->second.AsMirrorPtr())) { + ret1.first->second = ObjReference::FromMirrorPtr(obj); + } + } +} + +template <size_t kAlignment> +inline uintptr_t MarkCompact::LiveWordsBitmap<kAlignment>::SetLiveWords(uintptr_t begin, + size_t size) { + const uintptr_t begin_bit_idx = MemRangeBitmap::BitIndexFromAddr(begin); + DCHECK(!Bitmap::TestBit(begin_bit_idx)); + // Range to set bit: [begin, end] + uintptr_t end = begin + size - kAlignment; + const uintptr_t end_bit_idx = MemRangeBitmap::BitIndexFromAddr(end); + uintptr_t* begin_bm_address = Bitmap::Begin() + Bitmap::BitIndexToWordIndex(begin_bit_idx); + uintptr_t* end_bm_address = Bitmap::Begin() + Bitmap::BitIndexToWordIndex(end_bit_idx); + ptrdiff_t diff = end_bm_address - begin_bm_address; + uintptr_t mask = Bitmap::BitIndexToMask(begin_bit_idx); + // Bits that needs to be set in the first word, if it's not also the last word + mask = ~(mask - 1); + if (diff > 0) { + *begin_bm_address |= mask; + mask = ~0; + // Even though memset can handle the (diff == 1) case but we should avoid the + // overhead of a function call for this, highly likely (as most of the objects + // are small), case. + if (diff > 1) { + // Set all intermediate bits to 1. + std::memset(static_cast<void*>(begin_bm_address + 1), 0xff, (diff - 1) * sizeof(uintptr_t)); + } + } + uintptr_t end_mask = Bitmap::BitIndexToMask(end_bit_idx); + *end_bm_address |= mask & (end_mask | (end_mask - 1)); + return begin_bit_idx; +} + +template <size_t kAlignment> template <typename Visitor> +inline void MarkCompact::LiveWordsBitmap<kAlignment>::VisitLiveStrides(uintptr_t begin_bit_idx, + uint8_t* end, + const size_t bytes, + Visitor&& visitor) const { + // Range to visit [begin_bit_idx, end_bit_idx] + DCHECK(IsAligned<kAlignment>(end)); + end -= kAlignment; + const uintptr_t end_bit_idx = MemRangeBitmap::BitIndexFromAddr(reinterpret_cast<uintptr_t>(end)); + DCHECK_LE(begin_bit_idx, end_bit_idx); + uintptr_t begin_word_idx = Bitmap::BitIndexToWordIndex(begin_bit_idx); + const uintptr_t end_word_idx = Bitmap::BitIndexToWordIndex(end_bit_idx); + DCHECK(Bitmap::TestBit(begin_bit_idx)); + size_t stride_size = 0; + size_t idx_in_word = 0; + size_t num_heap_words = bytes / kAlignment; + uintptr_t live_stride_start_idx; + uintptr_t word = Bitmap::Begin()[begin_word_idx]; + + // Setup the first word. + word &= ~(Bitmap::BitIndexToMask(begin_bit_idx) - 1); + begin_bit_idx = RoundDown(begin_bit_idx, Bitmap::kBitsPerBitmapWord); + + do { + if (UNLIKELY(begin_word_idx == end_word_idx)) { + uintptr_t mask = Bitmap::BitIndexToMask(end_bit_idx); + word &= mask | (mask - 1); + } + if (~word == 0) { + // All bits in the word are marked. 
+ if (stride_size == 0) { + live_stride_start_idx = begin_bit_idx; + } + stride_size += Bitmap::kBitsPerBitmapWord; + if (num_heap_words <= stride_size) { + break; + } + } else { + while (word != 0) { + // discard 0s + size_t shift = CTZ(word); + idx_in_word += shift; + word >>= shift; + if (stride_size > 0) { + if (shift > 0) { + if (num_heap_words <= stride_size) { + break; + } + visitor(live_stride_start_idx, stride_size, /*is_last*/ false); + num_heap_words -= stride_size; + live_stride_start_idx = begin_bit_idx + idx_in_word; + stride_size = 0; + } + } else { + live_stride_start_idx = begin_bit_idx + idx_in_word; + } + // consume 1s + shift = CTZ(~word); + DCHECK_NE(shift, 0u); + word >>= shift; + idx_in_word += shift; + stride_size += shift; + } + // If the whole word == 0 or the higher bits are 0s, then we exit out of + // the above loop without completely consuming the word, so call visitor, + // if needed. + if (idx_in_word < Bitmap::kBitsPerBitmapWord && stride_size > 0) { + if (num_heap_words <= stride_size) { + break; + } + visitor(live_stride_start_idx, stride_size, /*is_last*/ false); + num_heap_words -= stride_size; + stride_size = 0; + } + idx_in_word = 0; + } + begin_bit_idx += Bitmap::kBitsPerBitmapWord; + begin_word_idx++; + if (UNLIKELY(begin_word_idx > end_word_idx)) { + num_heap_words = std::min(stride_size, num_heap_words); + break; + } + word = Bitmap::Begin()[begin_word_idx]; + } while (true); + + if (stride_size > 0) { + visitor(live_stride_start_idx, num_heap_words, /*is_last*/ true); + } +} + +template <size_t kAlignment> +inline +uint32_t MarkCompact::LiveWordsBitmap<kAlignment>::FindNthLiveWordOffset(size_t chunk_idx, + uint32_t n) const { + DCHECK_LT(n, kBitsPerVectorWord); + const size_t index = chunk_idx * kBitmapWordsPerVectorWord; + for (uint32_t i = 0; i < kBitmapWordsPerVectorWord; i++) { + uintptr_t word = Bitmap::Begin()[index + i]; + if (~word == 0) { + if (n < Bitmap::kBitsPerBitmapWord) { + return i * Bitmap::kBitsPerBitmapWord + n; + } + n -= Bitmap::kBitsPerBitmapWord; + } else { + uint32_t j = 0; + while (word != 0) { + // count contiguous 0s + uint32_t shift = CTZ(word); + word >>= shift; + j += shift; + // count contiguous 1s + shift = CTZ(~word); + DCHECK_NE(shift, 0u); + if (shift > n) { + return i * Bitmap::kBitsPerBitmapWord + j + n; + } + n -= shift; + word >>= shift; + j += shift; + } + } + } + UNREACHABLE(); +} + +inline void MarkCompact::UpdateRef(mirror::Object* obj, MemberOffset offset) { + mirror::Object* old_ref = obj->GetFieldObject< + mirror::Object, kVerifyNone, kWithoutReadBarrier, /*kIsVolatile*/false>(offset); + if (kIsDebugBuild) { + if (live_words_bitmap_->HasAddress(old_ref) + && reinterpret_cast<uint8_t*>(old_ref) < black_allocations_begin_ + && !moving_space_bitmap_->Test(old_ref)) { + mirror::Object* from_ref = GetFromSpaceAddr(old_ref); + std::ostringstream oss; + heap_->DumpSpaces(oss); + MemMap::DumpMaps(oss, /* terse= */ true); + LOG(FATAL) << "Not marked in the bitmap ref=" << old_ref + << " from_ref=" << from_ref + << " offset=" << offset + << " obj=" << obj + << " obj-validity=" << IsValidObject(obj) + << " from-space=" << static_cast<void*>(from_space_begin_) + << " bitmap= " << moving_space_bitmap_->DumpMemAround(old_ref) + << " from_ref " + << heap_->GetVerification()->DumpRAMAroundAddress( + reinterpret_cast<uintptr_t>(from_ref), 128) + << " obj " + << heap_->GetVerification()->DumpRAMAroundAddress( + reinterpret_cast<uintptr_t>(obj), 128) + << " old_ref " << heap_->GetVerification()->DumpRAMAroundAddress( + 
reinterpret_cast<uintptr_t>(old_ref), 128) + << " maps\n" << oss.str(); + } + } + mirror::Object* new_ref = PostCompactAddress(old_ref); + if (new_ref != old_ref) { + obj->SetFieldObjectWithoutWriteBarrier< + /*kTransactionActive*/false, /*kCheckTransaction*/false, kVerifyNone, /*kIsVolatile*/false>( + offset, + new_ref); + } +} + +inline bool MarkCompact::VerifyRootSingleUpdate(void* root, + mirror::Object* old_ref, + const RootInfo& info) { + // ASAN promotes stack-frames to heap in order to detect + // stack-use-after-return issues. So skip using this double-root update + // detection on ASAN as well. + if (kIsDebugBuild && !kMemoryToolIsAvailable) { + void* stack_low_addr = stack_low_addr_; + void* stack_high_addr = stack_high_addr_; + if (!live_words_bitmap_->HasAddress(old_ref)) { + return false; + } + Thread* self = Thread::Current(); + if (UNLIKELY(stack_low_addr == nullptr)) { + stack_low_addr = self->GetStackEnd(); + stack_high_addr = reinterpret_cast<char*>(stack_low_addr) + self->GetStackSize(); + } + if (root < stack_low_addr || root > stack_high_addr) { + MutexLock mu(self, lock_); + auto ret = updated_roots_->insert(root); + DCHECK(ret.second) << "root=" << root << " old_ref=" << old_ref + << " stack_low_addr=" << stack_low_addr + << " stack_high_addr=" << stack_high_addr; + } + DCHECK(reinterpret_cast<uint8_t*>(old_ref) >= black_allocations_begin_ || + live_words_bitmap_->Test(old_ref)) + << "ref=" << old_ref << " <" << mirror::Object::PrettyTypeOf(old_ref) << "> RootInfo [" + << info << "]"; + } + return true; +} + +inline void MarkCompact::UpdateRoot(mirror::CompressedReference<mirror::Object>* root, + const RootInfo& info) { + DCHECK(!root->IsNull()); + mirror::Object* old_ref = root->AsMirrorPtr(); + if (VerifyRootSingleUpdate(root, old_ref, info)) { + mirror::Object* new_ref = PostCompactAddress(old_ref); + if (old_ref != new_ref) { + root->Assign(new_ref); + } + } +} + +inline void MarkCompact::UpdateRoot(mirror::Object** root, const RootInfo& info) { + mirror::Object* old_ref = *root; + if (VerifyRootSingleUpdate(root, old_ref, info)) { + mirror::Object* new_ref = PostCompactAddress(old_ref); + if (old_ref != new_ref) { + *root = new_ref; + } + } +} + +template <size_t kAlignment> +inline size_t MarkCompact::LiveWordsBitmap<kAlignment>::CountLiveWordsUpto(size_t bit_idx) const { + const size_t word_offset = Bitmap::BitIndexToWordIndex(bit_idx); + uintptr_t word; + size_t ret = 0; + // This is needed only if we decide to make chunks 128-bit but still + // choose to use 64-bit word for bitmap. Ideally we should use 128-bit + // SIMD instructions to compute popcount. 
+ if (kBitmapWordsPerVectorWord > 1) { + for (size_t i = RoundDown(word_offset, kBitmapWordsPerVectorWord); i < word_offset; i++) { + word = Bitmap::Begin()[i]; + ret += POPCOUNT(word); + } + } + word = Bitmap::Begin()[word_offset]; + const uintptr_t mask = Bitmap::BitIndexToMask(bit_idx); + DCHECK_NE(word & mask, 0u) + << " word_offset:" << word_offset + << " bit_idx:" << bit_idx + << " bit_idx_in_word:" << (bit_idx % Bitmap::kBitsPerBitmapWord) + << std::hex << " word: 0x" << word + << " mask: 0x" << mask << std::dec; + ret += POPCOUNT(word & (mask - 1)); + return ret; +} + +inline mirror::Object* MarkCompact::PostCompactBlackObjAddr(mirror::Object* old_ref) const { + return reinterpret_cast<mirror::Object*>(reinterpret_cast<uint8_t*>(old_ref) + - black_objs_slide_diff_); +} + +inline mirror::Object* MarkCompact::PostCompactOldObjAddr(mirror::Object* old_ref) const { + const uintptr_t begin = live_words_bitmap_->Begin(); + const uintptr_t addr_offset = reinterpret_cast<uintptr_t>(old_ref) - begin; + const size_t vec_idx = addr_offset / kOffsetChunkSize; + const size_t live_bytes_in_bitmap_word = + live_words_bitmap_->CountLiveWordsUpto(addr_offset / kAlignment) * kAlignment; + return reinterpret_cast<mirror::Object*>(begin + + chunk_info_vec_[vec_idx] + + live_bytes_in_bitmap_word); +} + +inline mirror::Object* MarkCompact::PostCompactAddressUnchecked(mirror::Object* old_ref) const { + if (reinterpret_cast<uint8_t*>(old_ref) >= black_allocations_begin_) { + return PostCompactBlackObjAddr(old_ref); + } + if (kIsDebugBuild) { + mirror::Object* from_ref = GetFromSpaceAddr(old_ref); + DCHECK(live_words_bitmap_->Test(old_ref)) + << "ref=" << old_ref; + if (!moving_space_bitmap_->Test(old_ref)) { + std::ostringstream oss; + Runtime::Current()->GetHeap()->DumpSpaces(oss); + MemMap::DumpMaps(oss, /* terse= */ true); + LOG(FATAL) << "ref=" << old_ref + << " from_ref=" << from_ref + << " from-space=" << static_cast<void*>(from_space_begin_) + << " bitmap= " << moving_space_bitmap_->DumpMemAround(old_ref) + << heap_->GetVerification()->DumpRAMAroundAddress( + reinterpret_cast<uintptr_t>(from_ref), 128) + << " maps\n" << oss.str(); + } + } + return PostCompactOldObjAddr(old_ref); +} + +inline mirror::Object* MarkCompact::PostCompactAddress(mirror::Object* old_ref) const { + // TODO: To further speedup the check, maybe we should consider caching heap + // start/end in this object. + if (LIKELY(live_words_bitmap_->HasAddress(old_ref))) { + return PostCompactAddressUnchecked(old_ref); + } + return old_ref; +} + +} // namespace collector +} // namespace gc +} // namespace art + +#endif // ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_INL_H_ diff --git a/runtime/gc/collector/mark_compact.cc b/runtime/gc/collector/mark_compact.cc new file mode 100644 index 0000000000..bb34068fb1 --- /dev/null +++ b/runtime/gc/collector/mark_compact.cc @@ -0,0 +1,4300 @@ +/* + * Copyright 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <fcntl.h> +// Glibc v2.19 doesn't include these in fcntl.h so host builds will fail without. +#if !defined(FALLOC_FL_PUNCH_HOLE) || !defined(FALLOC_FL_KEEP_SIZE) +#include <linux/falloc.h> +#endif +#include <linux/userfaultfd.h> +#include <poll.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <fstream> +#include <numeric> +#include <string> +#include <string_view> +#include <vector> + +#include "android-base/file.h" +#include "android-base/parsebool.h" +#include "android-base/parseint.h" +#include "android-base/properties.h" +#include "android-base/strings.h" +#include "base/file_utils.h" +#include "base/memfd.h" +#include "base/quasi_atomic.h" +#include "base/systrace.h" +#include "base/utils.h" +#include "gc/accounting/mod_union_table-inl.h" +#include "gc/collector_type.h" +#include "gc/reference_processor.h" +#include "gc/space/bump_pointer_space.h" +#include "gc/task_processor.h" +#include "gc/verification-inl.h" +#include "jit/jit_code_cache.h" +#include "mark_compact-inl.h" +#include "mirror/object-refvisitor-inl.h" +#include "read_barrier_config.h" +#include "scoped_thread_state_change-inl.h" +#include "sigchain.h" +#include "thread_list.h" + +#ifdef ART_TARGET_ANDROID +#include "com_android_art.h" +#endif + +#ifndef __BIONIC__ +#ifndef MREMAP_DONTUNMAP +#define MREMAP_DONTUNMAP 4 +#endif +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif +#ifndef __NR_userfaultfd +#if defined(__x86_64__) +#define __NR_userfaultfd 323 +#elif defined(__i386__) +#define __NR_userfaultfd 374 +#elif defined(__aarch64__) +#define __NR_userfaultfd 282 +#elif defined(__arm__) +#define __NR_userfaultfd 388 +#else +#error "__NR_userfaultfd undefined" +#endif +#endif // __NR_userfaultfd +#endif // __BIONIC__ + +namespace { + +using ::android::base::GetBoolProperty; +using ::android::base::ParseBool; +using ::android::base::ParseBoolResult; + +} // namespace + +namespace art { + +static bool HaveMremapDontunmap() { + void* old = mmap(nullptr, kPageSize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + CHECK_NE(old, MAP_FAILED); + void* addr = mremap(old, kPageSize, kPageSize, MREMAP_MAYMOVE | MREMAP_DONTUNMAP, nullptr); + CHECK_EQ(munmap(old, kPageSize), 0); + if (addr != MAP_FAILED) { + CHECK_EQ(munmap(addr, kPageSize), 0); + return true; + } else { + return false; + } +} +// We require MREMAP_DONTUNMAP functionality of the mremap syscall, which was +// introduced in 5.13 kernel version. But it was backported to GKI kernels. +static bool gHaveMremapDontunmap = IsKernelVersionAtLeast(5, 13) || HaveMremapDontunmap(); +// Bitmap of features supported by userfaultfd. This is obtained via uffd API ioctl. +static uint64_t gUffdFeatures = 0; +// Both, missing and minor faults on shmem are needed only for minor-fault mode. +static constexpr uint64_t kUffdFeaturesForMinorFault = + UFFD_FEATURE_MISSING_SHMEM | UFFD_FEATURE_MINOR_SHMEM; +static constexpr uint64_t kUffdFeaturesForSigbus = UFFD_FEATURE_SIGBUS; +// We consider SIGBUS feature necessary to enable this GC as it's superior than +// threading-based implementation for janks. However, since we have the latter +// already implemented, for testing purposes, we allow choosing either of the +// two at boot time in the constructor below. +// Note that having minor-fault feature implies having SIGBUS feature as the +// latter was introduced earlier than the former. In other words, having +// minor-fault feature implies having SIGBUS. 
We still want minor-fault to be +// available for making jit-code-cache updation concurrent, which uses shmem. +static constexpr uint64_t kUffdFeaturesRequired = + kUffdFeaturesForMinorFault | kUffdFeaturesForSigbus; + +bool KernelSupportsUffd() { +#ifdef __linux__ + if (gHaveMremapDontunmap) { + int fd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY); + // On non-android devices we may not have the kernel patches that restrict + // userfaultfd to user mode. But that is not a security concern as we are + // on host. Therefore, attempt one more time without UFFD_USER_MODE_ONLY. + if (!kIsTargetAndroid && fd == -1 && errno == EINVAL) { + fd = syscall(__NR_userfaultfd, O_CLOEXEC); + } + if (fd >= 0) { + // We are only fetching the available features, which is returned by the + // ioctl. + struct uffdio_api api = {.api = UFFD_API, .features = 0, .ioctls = 0}; + CHECK_EQ(ioctl(fd, UFFDIO_API, &api), 0) << "ioctl_userfaultfd : API:" << strerror(errno); + gUffdFeatures = api.features; + close(fd); + // Allow this GC to be used only if minor-fault and sigbus feature is available. + return (api.features & kUffdFeaturesRequired) == kUffdFeaturesRequired; + } + } +#endif + return false; +} + +// The other cases are defined as constexpr in runtime/read_barrier_config.h +#if !defined(ART_FORCE_USE_READ_BARRIER) && defined(ART_USE_READ_BARRIER) +// Returns collector type asked to be used on the cmdline. +static gc::CollectorType FetchCmdlineGcType() { + std::string argv; + gc::CollectorType gc_type = gc::CollectorType::kCollectorTypeNone; + if (android::base::ReadFileToString("/proc/self/cmdline", &argv)) { + if (argv.find("-Xgc:CMC") != std::string::npos) { + gc_type = gc::CollectorType::kCollectorTypeCMC; + } else if (argv.find("-Xgc:CC") != std::string::npos) { + gc_type = gc::CollectorType::kCollectorTypeCC; + } + } + return gc_type; +} + +#ifdef ART_TARGET_ANDROID +static int GetOverrideCacheInfoFd() { + std::string args_str; + if (!android::base::ReadFileToString("/proc/self/cmdline", &args_str)) { + LOG(WARNING) << "Failed to load /proc/self/cmdline"; + return -1; + } + std::vector<std::string_view> args; + Split(std::string_view(args_str), /*separator=*/'\0', &args); + for (std::string_view arg : args) { + if (android::base::ConsumePrefix(&arg, "--cache-info-fd=")) { // This is a dex2oat flag. + int fd; + if (!android::base::ParseInt(std::string(arg), &fd)) { + LOG(ERROR) << "Failed to parse --cache-info-fd (value: '" << arg << "')"; + return -1; + } + return fd; + } + } + return -1; +} + +static bool GetCachedBoolProperty(const std::string& key, bool default_value) { + // For simplicity, we don't handle multiple calls because otherwise we would have to reset the fd. + static bool called = false; + CHECK(!called) << "GetCachedBoolProperty can be called only once"; + called = true; + + std::string cache_info_contents; + int fd = GetOverrideCacheInfoFd(); + if (fd >= 0) { + if (!android::base::ReadFdToString(fd, &cache_info_contents)) { + PLOG(ERROR) << "Failed to read cache-info from fd " << fd; + return default_value; + } + } else { + std::string path = GetApexDataDalvikCacheDirectory(InstructionSet::kNone) + "/cache-info.xml"; + if (!android::base::ReadFileToString(path, &cache_info_contents)) { + // If the file is not found, then we are in chroot or in a standalone runtime process (e.g., + // IncidentHelper), or odsign/odrefresh failed to generate and sign the cache info. There's + // nothing we can do. 
+ if (errno != ENOENT) { + PLOG(ERROR) << "Failed to read cache-info from the default path"; + } + return default_value; + } + } + + std::optional<com::android::art::CacheInfo> cache_info = + com::android::art::parse(cache_info_contents.c_str()); + if (!cache_info.has_value()) { + // This should never happen. + LOG(ERROR) << "Failed to parse cache-info"; + return default_value; + } + const com::android::art::KeyValuePairList* list = cache_info->getFirstSystemProperties(); + if (list == nullptr) { + // This should never happen. + LOG(ERROR) << "Missing system properties from cache-info"; + return default_value; + } + const std::vector<com::android::art::KeyValuePair>& properties = list->getItem(); + for (const com::android::art::KeyValuePair& pair : properties) { + if (pair.getK() == key) { + ParseBoolResult result = ParseBool(pair.getV()); + switch (result) { + case ParseBoolResult::kTrue: + return true; + case ParseBoolResult::kFalse: + return false; + case ParseBoolResult::kError: + return default_value; + } + } + } + return default_value; +} + +static bool SysPropSaysUffdGc() { + // The phenotype flag can change at time time after boot, but it shouldn't take effect until a + // reboot. Therefore, we read the phenotype flag from the cache info, which is generated on boot. + return GetCachedBoolProperty("persist.device_config.runtime_native_boot.enable_uffd_gc", + GetBoolProperty("ro.dalvik.vm.enable_uffd_gc", false)); +} +#else +// Never called. +static bool SysPropSaysUffdGc() { return false; } +#endif + +static bool ShouldUseUserfaultfd() { + static_assert(kUseBakerReadBarrier || kUseTableLookupReadBarrier); +#ifdef __linux__ + // Use CMC/CC if that is being explicitly asked for on cmdline. Otherwise, + // always use CC on host. On target, use CMC only if system properties says so + // and the kernel supports it. + gc::CollectorType gc_type = FetchCmdlineGcType(); + return gc_type == gc::CollectorType::kCollectorTypeCMC || + (gc_type == gc::CollectorType::kCollectorTypeNone && + kIsTargetAndroid && + SysPropSaysUffdGc() && + KernelSupportsUffd()); +#else + return false; +#endif +} + +const bool gUseUserfaultfd = ShouldUseUserfaultfd(); +const bool gUseReadBarrier = !gUseUserfaultfd; +#endif + +namespace gc { +namespace collector { + +// Turn off kCheckLocks when profiling the GC as it slows down the GC +// significantly. +static constexpr bool kCheckLocks = kDebugLocking; +static constexpr bool kVerifyRootsMarked = kIsDebugBuild; +// Two threads should suffice on devices. +static constexpr size_t kMaxNumUffdWorkers = 2; +// Number of compaction buffers reserved for mutator threads in SIGBUS feature +// case. It's extremely unlikely that we will ever have more than these number +// of mutator threads trying to access the moving-space during one compaction +// phase. Using a lower number in debug builds to hopefully catch the issue +// before it becomes a problem on user builds. +static constexpr size_t kMutatorCompactionBufferCount = kIsDebugBuild ? 256 : 512; +// Minimum from-space chunk to be madvised (during concurrent compaction) in one go. +static constexpr ssize_t kMinFromSpaceMadviseSize = 1 * MB; +// Concurrent compaction termination logic is different (and slightly more efficient) if the +// kernel has the fault-retry feature (allowing repeated faults on the same page), which was +// introduced in 5.7 (https://android-review.git.corp.google.com/c/kernel/common/+/1540088). 
+// This allows a single page fault to be handled, in turn, by each worker thread, only waking +// up the GC thread at the end. +static const bool gKernelHasFaultRetry = IsKernelVersionAtLeast(5, 7); + +std::pair<bool, bool> MarkCompact::GetUffdAndMinorFault() { + bool uffd_available; + // In most cases the gUffdFeatures will already be initialized at boot time + // when libart is loaded. On very old kernels we may get '0' from the kernel, + // in which case we would be doing the syscalls each time this function is + // called. But that's a very unlikely case. There are no correctness issues as + // the response from the kernel never changes after boot. + if (UNLIKELY(gUffdFeatures == 0)) { + uffd_available = KernelSupportsUffd(); + } else { + // We can have any uffd features only if uffd exists. + uffd_available = true; + } + bool minor_fault_available = + (gUffdFeatures & kUffdFeaturesForMinorFault) == kUffdFeaturesForMinorFault; + return std::pair<bool, bool>(uffd_available, minor_fault_available); +} + +bool MarkCompact::CreateUserfaultfd(bool post_fork) { + if (post_fork || uffd_ == kFdUnused) { + // Check if we have MREMAP_DONTUNMAP here for cases where + // 'ART_USE_READ_BARRIER=false' is used. Additionally, this check ensures + // that userfaultfd isn't used on old kernels, which cause random ioctl + // failures. + if (gHaveMremapDontunmap) { + // Don't use O_NONBLOCK as we rely on read waiting on uffd_ if there isn't + // any read event available. We don't use poll. + uffd_ = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY); + // On non-android devices we may not have the kernel patches that restrict + // userfaultfd to user mode. But that is not a security concern as we are + // on host. Therefore, attempt one more time without UFFD_USER_MODE_ONLY. + if (!kIsTargetAndroid && UNLIKELY(uffd_ == -1 && errno == EINVAL)) { + uffd_ = syscall(__NR_userfaultfd, O_CLOEXEC); + } + if (UNLIKELY(uffd_ == -1)) { + uffd_ = kFallbackMode; + LOG(WARNING) << "Userfaultfd isn't supported (reason: " << strerror(errno) + << ") and therefore falling back to stop-the-world compaction."; + } else { + DCHECK(IsValidFd(uffd_)); + // Initialize uffd with the features which are required and available. + struct uffdio_api api = {.api = UFFD_API, .features = gUffdFeatures, .ioctls = 0}; + api.features &= use_uffd_sigbus_ ?
kUffdFeaturesRequired : kUffdFeaturesForMinorFault; + CHECK_EQ(ioctl(uffd_, UFFDIO_API, &api), 0) + << "ioctl_userfaultfd: API: " << strerror(errno); + } + } else { + uffd_ = kFallbackMode; + } + } + uffd_initialized_ = !post_fork || uffd_ == kFallbackMode; + return IsValidFd(uffd_); +} + +template <size_t kAlignment> +MarkCompact::LiveWordsBitmap<kAlignment>* MarkCompact::LiveWordsBitmap<kAlignment>::Create( + uintptr_t begin, uintptr_t end) { + return static_cast<LiveWordsBitmap<kAlignment>*>( + MemRangeBitmap::Create("Concurrent Mark Compact live words bitmap", begin, end)); +} + +static bool IsSigbusFeatureAvailable() { + MarkCompact::GetUffdAndMinorFault(); + return gUffdFeatures & UFFD_FEATURE_SIGBUS; +} + +MarkCompact::MarkCompact(Heap* heap) + : GarbageCollector(heap, "concurrent mark compact"), + gc_barrier_(0), + lock_("mark compact lock", kGenericBottomLock), + bump_pointer_space_(heap->GetBumpPointerSpace()), + moving_space_bitmap_(bump_pointer_space_->GetMarkBitmap()), + moving_to_space_fd_(kFdUnused), + moving_from_space_fd_(kFdUnused), + uffd_(kFdUnused), + sigbus_in_progress_count_(kSigbusCounterCompactionDoneMask), + compaction_in_progress_count_(0), + thread_pool_counter_(0), + compacting_(false), + uffd_initialized_(false), + uffd_minor_fault_supported_(false), + use_uffd_sigbus_(IsSigbusFeatureAvailable()), + minor_fault_initialized_(false), + map_linear_alloc_shared_(false) { + if (kIsDebugBuild) { + updated_roots_.reset(new std::unordered_set<void*>()); + } + // TODO: When using minor-fault feature, the first GC after zygote-fork + // requires mapping the linear-alloc again with MAP_SHARED. This leaves a + // gap for suspended threads to access linear-alloc when it's empty (after + // mremap) and not yet userfaultfd registered. This cannot be fixed by merely + // doing uffd registration first. For now, just assert that we are not using + // minor-fault. Eventually, a cleanup of linear-alloc update logic to only + // use private anonymous would be ideal. + CHECK(!uffd_minor_fault_supported_); + + // TODO: Depending on how the bump-pointer space move is implemented. If we + // switch between two virtual memories each time, then we will have to + // initialize live_words_bitmap_ accordingly. 
+ live_words_bitmap_.reset(LiveWordsBitmap<kAlignment>::Create( + reinterpret_cast<uintptr_t>(bump_pointer_space_->Begin()), + reinterpret_cast<uintptr_t>(bump_pointer_space_->Limit()))); + + // Create one MemMap for all the data structures + size_t moving_space_size = bump_pointer_space_->Capacity(); + size_t chunk_info_vec_size = moving_space_size / kOffsetChunkSize; + size_t nr_moving_pages = moving_space_size / kPageSize; + size_t nr_non_moving_pages = heap->GetNonMovingSpace()->Capacity() / kPageSize; + + std::string err_msg; + info_map_ = MemMap::MapAnonymous("Concurrent mark-compact chunk-info vector", + chunk_info_vec_size * sizeof(uint32_t) + + nr_non_moving_pages * sizeof(ObjReference) + + nr_moving_pages * sizeof(ObjReference) + + nr_moving_pages * sizeof(uint32_t), + PROT_READ | PROT_WRITE, + /*low_4gb=*/ false, + &err_msg); + if (UNLIKELY(!info_map_.IsValid())) { + LOG(FATAL) << "Failed to allocate concurrent mark-compact chunk-info vector: " << err_msg; + } else { + uint8_t* p = info_map_.Begin(); + chunk_info_vec_ = reinterpret_cast<uint32_t*>(p); + vector_length_ = chunk_info_vec_size; + + p += chunk_info_vec_size * sizeof(uint32_t); + first_objs_non_moving_space_ = reinterpret_cast<ObjReference*>(p); + + p += nr_non_moving_pages * sizeof(ObjReference); + first_objs_moving_space_ = reinterpret_cast<ObjReference*>(p); + + p += nr_moving_pages * sizeof(ObjReference); + pre_compact_offset_moving_space_ = reinterpret_cast<uint32_t*>(p); + } + + size_t moving_space_alignment = BestPageTableAlignment(moving_space_size); + // The moving space is created at a fixed address, which is expected to be + // PMD-size aligned. + if (!IsAlignedParam(bump_pointer_space_->Begin(), moving_space_alignment)) { + LOG(WARNING) << "Bump pointer space is not aligned to " << PrettySize(moving_space_alignment) + << ". This can lead to longer stop-the-world pauses for compaction"; + } + // NOTE: PROT_NONE is used here as these mappings are for address space reservation + // only and will be used only after appropriately remapping them. + from_space_map_ = MemMap::MapAnonymousAligned("Concurrent mark-compact from-space", + moving_space_size, + PROT_NONE, + /*low_4gb=*/kObjPtrPoisoning, + moving_space_alignment, + &err_msg); + if (UNLIKELY(!from_space_map_.IsValid())) { + LOG(FATAL) << "Failed to allocate concurrent mark-compact from-space" << err_msg; + } else { + from_space_begin_ = from_space_map_.Begin(); + } + + // In some cases (32-bit or kObjPtrPoisoning) it's too much to ask for 3 + // heap-sized mappings in low-4GB. So tolerate failure here by attempting to + // mmap again right before the compaction pause. And if even that fails, then + // running the GC cycle in copy-mode rather than minor-fault. + // + // This map doesn't have to be aligned to 2MB as we don't mremap on it. + if (!kObjPtrPoisoning && uffd_minor_fault_supported_) { + // We need this map only if minor-fault feature is supported. But in that case + // don't create the mapping if obj-ptr poisoning is enabled as then the mapping + // has to be created in low_4gb. Doing this here rather than later causes the + // Dex2oatImageTest.TestExtension gtest to fail in 64-bit platforms. + shadow_to_space_map_ = MemMap::MapAnonymous("Concurrent mark-compact moving-space shadow", + moving_space_size, + PROT_NONE, + /*low_4gb=*/false, + &err_msg); + if (!shadow_to_space_map_.IsValid()) { + LOG(WARNING) << "Failed to allocate concurrent mark-compact moving-space shadow: " << err_msg; + } + } + const size_t num_pages = + 1 + (use_uffd_sigbus_ ? 
kMutatorCompactionBufferCount : + std::min(heap_->GetParallelGCThreadCount(), kMaxNumUffdWorkers)); + compaction_buffers_map_ = MemMap::MapAnonymous("Concurrent mark-compact compaction buffers", + kPageSize * num_pages, + PROT_READ | PROT_WRITE, + /*low_4gb=*/kObjPtrPoisoning, + &err_msg); + if (UNLIKELY(!compaction_buffers_map_.IsValid())) { + LOG(FATAL) << "Failed to allocate concurrent mark-compact compaction buffers" << err_msg; + } + // We also use the first page-sized buffer for the purpose of terminating concurrent compaction. + conc_compaction_termination_page_ = compaction_buffers_map_.Begin(); + // Touch the page deliberately to avoid userfaults on it. We madvise it in + // CompactionPhase() before using it to terminate concurrent compaction. + ForceRead(conc_compaction_termination_page_); + + // In most of the cases, we don't expect more than one LinearAlloc space. + linear_alloc_spaces_data_.reserve(1); + + // Initialize GC metrics. + metrics::ArtMetrics* metrics = GetMetrics(); + // The mark-compact collector supports only full-heap collections at the moment. + gc_time_histogram_ = metrics->FullGcCollectionTime(); + metrics_gc_count_ = metrics->FullGcCount(); + metrics_gc_count_delta_ = metrics->FullGcCountDelta(); + gc_throughput_histogram_ = metrics->FullGcThroughput(); + gc_tracing_throughput_hist_ = metrics->FullGcTracingThroughput(); + gc_throughput_avg_ = metrics->FullGcThroughputAvg(); + gc_tracing_throughput_avg_ = metrics->FullGcTracingThroughputAvg(); + gc_scanned_bytes_ = metrics->FullGcScannedBytes(); + gc_scanned_bytes_delta_ = metrics->FullGcScannedBytesDelta(); + gc_freed_bytes_ = metrics->FullGcFreedBytes(); + gc_freed_bytes_delta_ = metrics->FullGcFreedBytesDelta(); + gc_duration_ = metrics->FullGcDuration(); + gc_duration_delta_ = metrics->FullGcDurationDelta(); + are_metrics_initialized_ = true; +} + +void MarkCompact::AddLinearAllocSpaceData(uint8_t* begin, size_t len) { + DCHECK_ALIGNED(begin, kPageSize); + DCHECK_ALIGNED(len, kPageSize); + DCHECK_GE(len, kPMDSize); + size_t alignment = BestPageTableAlignment(len); + bool is_shared = false; + // We use MAP_SHARED on non-zygote processes for leveraging userfaultfd's minor-fault feature. + if (map_linear_alloc_shared_) { + void* ret = mmap(begin, + len, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_SHARED | MAP_FIXED, + /*fd=*/-1, + /*offset=*/0); + CHECK_EQ(ret, begin) << "mmap failed: " << strerror(errno); + is_shared = true; + } + std::string err_msg; + MemMap shadow(MemMap::MapAnonymousAligned("linear-alloc shadow map", + len, + PROT_NONE, + /*low_4gb=*/false, + alignment, + &err_msg)); + if (!shadow.IsValid()) { + LOG(FATAL) << "Failed to allocate linear-alloc shadow map: " << err_msg; + UNREACHABLE(); + } + + MemMap page_status_map(MemMap::MapAnonymous("linear-alloc page-status map", + len / kPageSize, + PROT_READ | PROT_WRITE, + /*low_4gb=*/false, + &err_msg)); + if (!page_status_map.IsValid()) { + LOG(FATAL) << "Failed to allocate linear-alloc page-status shadow map: " << err_msg; + UNREACHABLE(); + } + linear_alloc_spaces_data_.emplace_back(std::forward<MemMap>(shadow), + std::forward<MemMap>(page_status_map), + begin, + begin + len, + is_shared); +} + +void MarkCompact::BindAndResetBitmaps() { + // TODO: We need to hold heap_bitmap_lock_ only for populating immune_spaces. + // The card-table and mod-union-table processing can be done without it. So + // change the logic below. Note that the bitmap clearing would require the + // lock. 
+ TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + accounting::CardTable* const card_table = heap_->GetCardTable(); + // Mark all of the spaces we never collect as immune. + for (const auto& space : GetHeap()->GetContinuousSpaces()) { + if (space->GetGcRetentionPolicy() == space::kGcRetentionPolicyNeverCollect || + space->GetGcRetentionPolicy() == space::kGcRetentionPolicyFullCollect) { + CHECK(space->IsZygoteSpace() || space->IsImageSpace()); + immune_spaces_.AddSpace(space); + accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space); + if (table != nullptr) { + table->ProcessCards(); + } else { + // Keep cards aged if we don't have a mod-union table since we may need + // to scan them in future GCs. This case is for app images. + // TODO: We could probably scan the objects right here to avoid doing + // another scan through the card-table. + card_table->ModifyCardsAtomic( + space->Begin(), + space->End(), + [](uint8_t card) { + return (card == gc::accounting::CardTable::kCardClean) + ? card + : gc::accounting::CardTable::kCardAged; + }, + /* card modified visitor */ VoidFunctor()); + } + } else { + CHECK(!space->IsZygoteSpace()); + CHECK(!space->IsImageSpace()); + // The card-table corresponding to bump-pointer and non-moving space can + // be cleared, because we are going to traverse all the reachable objects + // in these spaces. This card-table will eventually be used to track + // mutations while concurrent marking is going on. + card_table->ClearCardRange(space->Begin(), space->Limit()); + if (space != bump_pointer_space_) { + CHECK_EQ(space, heap_->GetNonMovingSpace()); + non_moving_space_ = space; + non_moving_space_bitmap_ = space->GetMarkBitmap(); + } + } + } +} + +void MarkCompact::MarkZygoteLargeObjects() { + Thread* self = thread_running_gc_; + DCHECK_EQ(self, Thread::Current()); + space::LargeObjectSpace* const los = heap_->GetLargeObjectsSpace(); + if (los != nullptr) { + // Pick the current live bitmap (mark bitmap if swapped). + accounting::LargeObjectBitmap* const live_bitmap = los->GetLiveBitmap(); + accounting::LargeObjectBitmap* const mark_bitmap = los->GetMarkBitmap(); + // Walk through all of the objects and explicitly mark the zygote ones so they don't get swept. + std::pair<uint8_t*, uint8_t*> range = los->GetBeginEndAtomic(); + live_bitmap->VisitMarkedRange(reinterpret_cast<uintptr_t>(range.first), + reinterpret_cast<uintptr_t>(range.second), + [mark_bitmap, los, self](mirror::Object* obj) + REQUIRES(Locks::heap_bitmap_lock_) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (los->IsZygoteLargeObject(self, obj)) { + mark_bitmap->Set(obj); + } + }); + } +} + +void MarkCompact::InitializePhase() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + mark_stack_ = heap_->GetMarkStack(); + CHECK(mark_stack_->IsEmpty()); + immune_spaces_.Reset(); + moving_first_objs_count_ = 0; + non_moving_first_objs_count_ = 0; + black_page_count_ = 0; + bytes_scanned_ = 0; + freed_objects_ = 0; + // The first buffer is used by gc-thread. + compaction_buffer_counter_.store(1, std::memory_order_relaxed); + from_space_slide_diff_ = from_space_begin_ - bump_pointer_space_->Begin(); + black_allocations_begin_ = bump_pointer_space_->Limit(); + walk_super_class_cache_ = nullptr; + // TODO: Would it suffice to read it once in the constructor, which is called + // in zygote process? 
+ pointer_size_ = Runtime::Current()->GetClassLinker()->GetImagePointerSize(); +} + +class MarkCompact::ThreadFlipVisitor : public Closure { + public: + explicit ThreadFlipVisitor(MarkCompact* collector) : collector_(collector) {} + + void Run(Thread* thread) override REQUIRES_SHARED(Locks::mutator_lock_) { + // Note: self is not necessarily equal to thread since thread may be suspended. + Thread* self = Thread::Current(); + CHECK(thread == self || thread->GetState() != ThreadState::kRunnable) + << thread->GetState() << " thread " << thread << " self " << self; + thread->VisitRoots(collector_, kVisitRootFlagAllRoots); + // Interpreter cache is thread-local so it needs to be swept either in a + // flip, or a stop-the-world pause. + CHECK(collector_->compacting_); + thread->SweepInterpreterCache(collector_); + thread->AdjustTlab(collector_->black_objs_slide_diff_); + collector_->GetBarrier().Pass(self); + } + + private: + MarkCompact* const collector_; +}; + +class MarkCompact::FlipCallback : public Closure { + public: + explicit FlipCallback(MarkCompact* collector) : collector_(collector) {} + + void Run(Thread* thread ATTRIBUTE_UNUSED) override REQUIRES(Locks::mutator_lock_) { + collector_->CompactionPause(); + } + + private: + MarkCompact* const collector_; +}; + +void MarkCompact::RunPhases() { + Thread* self = Thread::Current(); + thread_running_gc_ = self; + Runtime* runtime = Runtime::Current(); + InitializePhase(); + GetHeap()->PreGcVerification(this); + { + ReaderMutexLock mu(self, *Locks::mutator_lock_); + MarkingPhase(); + } + { + // Marking pause + ScopedPause pause(this); + MarkingPause(); + if (kIsDebugBuild) { + bump_pointer_space_->AssertAllThreadLocalBuffersAreRevoked(); + } + } + // To increase likelihood of black allocations. For testing purposes only. + if (kIsDebugBuild && heap_->GetTaskProcessor()->GetRunningThread() == thread_running_gc_) { + usleep(500'000); + } + { + ReaderMutexLock mu(self, *Locks::mutator_lock_); + ReclaimPhase(); + PrepareForCompaction(); + } + if (uffd_ != kFallbackMode && !use_uffd_sigbus_) { + heap_->GetThreadPool()->WaitForWorkersToBeCreated(); + } + + { + // Compaction pause + gc_barrier_.Init(self, 0); + ThreadFlipVisitor visitor(this); + FlipCallback callback(this); + size_t barrier_count = runtime->GetThreadList()->FlipThreadRoots( + &visitor, &callback, this, GetHeap()->GetGcPauseListener()); + { + ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun); + gc_barrier_.Increment(self, barrier_count); + } + } + + if (IsValidFd(uffd_)) { + ReaderMutexLock mu(self, *Locks::mutator_lock_); + CompactionPhase(); + } + + FinishPhase(); + thread_running_gc_ = nullptr; + GetHeap()->PostGcVerification(this); +} + +void MarkCompact::InitMovingSpaceFirstObjects(const size_t vec_len) { + // Find the first live word first. + size_t to_space_page_idx = 0; + uint32_t offset_in_chunk_word; + uint32_t offset; + mirror::Object* obj; + const uintptr_t heap_begin = moving_space_bitmap_->HeapBegin(); + + size_t chunk_idx; + // Find the first live word in the space + for (chunk_idx = 0; chunk_info_vec_[chunk_idx] == 0; chunk_idx++) { + if (chunk_idx > vec_len) { + // We don't have any live data on the moving-space. 
+ return; + } + } + // Use live-words bitmap to find the first word + offset_in_chunk_word = live_words_bitmap_->FindNthLiveWordOffset(chunk_idx, /*n*/ 0); + offset = chunk_idx * kBitsPerVectorWord + offset_in_chunk_word; + DCHECK(live_words_bitmap_->Test(offset)) << "offset=" << offset + << " chunk_idx=" << chunk_idx + << " N=0" + << " offset_in_word=" << offset_in_chunk_word + << " word=" << std::hex + << live_words_bitmap_->GetWord(chunk_idx); + // The first object doesn't require using FindPrecedingObject(). + obj = reinterpret_cast<mirror::Object*>(heap_begin + offset * kAlignment); + // TODO: add a check to validate the object. + + pre_compact_offset_moving_space_[to_space_page_idx] = offset; + first_objs_moving_space_[to_space_page_idx].Assign(obj); + to_space_page_idx++; + + uint32_t page_live_bytes = 0; + while (true) { + for (; page_live_bytes <= kPageSize; chunk_idx++) { + if (chunk_idx > vec_len) { + moving_first_objs_count_ = to_space_page_idx; + return; + } + page_live_bytes += chunk_info_vec_[chunk_idx]; + } + chunk_idx--; + page_live_bytes -= kPageSize; + DCHECK_LE(page_live_bytes, kOffsetChunkSize); + DCHECK_LE(page_live_bytes, chunk_info_vec_[chunk_idx]) + << " chunk_idx=" << chunk_idx + << " to_space_page_idx=" << to_space_page_idx + << " vec_len=" << vec_len; + DCHECK(IsAligned<kAlignment>(chunk_info_vec_[chunk_idx] - page_live_bytes)); + offset_in_chunk_word = + live_words_bitmap_->FindNthLiveWordOffset( + chunk_idx, (chunk_info_vec_[chunk_idx] - page_live_bytes) / kAlignment); + offset = chunk_idx * kBitsPerVectorWord + offset_in_chunk_word; + DCHECK(live_words_bitmap_->Test(offset)) + << "offset=" << offset + << " chunk_idx=" << chunk_idx + << " N=" << ((chunk_info_vec_[chunk_idx] - page_live_bytes) / kAlignment) + << " offset_in_word=" << offset_in_chunk_word + << " word=" << std::hex << live_words_bitmap_->GetWord(chunk_idx); + // TODO: Can we optimize this for large objects? If we are continuing a + // large object that spans multiple pages, then we may be able to do without + // calling FindPrecedingObject(). + // + // Find the object which encapsulates offset in it, which could be + // starting at offset itself. + obj = moving_space_bitmap_->FindPrecedingObject(heap_begin + offset * kAlignment); + // TODO: add a check to validate the object. + pre_compact_offset_moving_space_[to_space_page_idx] = offset; + first_objs_moving_space_[to_space_page_idx].Assign(obj); + to_space_page_idx++; + chunk_idx++; + } +} + +void MarkCompact::InitNonMovingSpaceFirstObjects() { + accounting::ContinuousSpaceBitmap* bitmap = non_moving_space_->GetLiveBitmap(); + uintptr_t begin = reinterpret_cast<uintptr_t>(non_moving_space_->Begin()); + const uintptr_t end = reinterpret_cast<uintptr_t>(non_moving_space_->End()); + mirror::Object* prev_obj; + size_t page_idx; + { + // Find first live object + mirror::Object* obj = nullptr; + bitmap->VisitMarkedRange</*kVisitOnce*/ true>(begin, + end, + [&obj] (mirror::Object* o) { + obj = o; + }); + if (obj == nullptr) { + // There are no live objects in the non-moving space + return; + } + page_idx = (reinterpret_cast<uintptr_t>(obj) - begin) / kPageSize; + first_objs_non_moving_space_[page_idx++].Assign(obj); + prev_obj = obj; + } + // TODO: check obj is valid + uintptr_t prev_obj_end = reinterpret_cast<uintptr_t>(prev_obj) + + RoundUp(prev_obj->SizeOf<kDefaultVerifyFlags>(), kAlignment); + // For every page find the object starting from which we need to call + // VisitReferences. 
It could either be an object that started on some + // preceding page, or some object starting within this page. + begin = RoundDown(reinterpret_cast<uintptr_t>(prev_obj) + kPageSize, kPageSize); + while (begin < end) { + // Utilize, if any, large object that started in some preceding page, but + // overlaps with this page as well. + if (prev_obj != nullptr && prev_obj_end > begin) { + DCHECK_LT(prev_obj, reinterpret_cast<mirror::Object*>(begin)); + first_objs_non_moving_space_[page_idx].Assign(prev_obj); + mirror::Class* klass = prev_obj->GetClass<kVerifyNone, kWithoutReadBarrier>(); + if (bump_pointer_space_->HasAddress(klass)) { + LOG(WARNING) << "found inter-page object " << prev_obj + << " in non-moving space with klass " << klass + << " in moving space"; + } + } else { + prev_obj_end = 0; + // It's sufficient to only search for previous object in the preceding page. + // If no live object started in that page and some object had started in + // the page preceding to that page, which was big enough to overlap with + // the current page, then we wouldn't be in the else part. + prev_obj = bitmap->FindPrecedingObject(begin, begin - kPageSize); + if (prev_obj != nullptr) { + prev_obj_end = reinterpret_cast<uintptr_t>(prev_obj) + + RoundUp(prev_obj->SizeOf<kDefaultVerifyFlags>(), kAlignment); + } + if (prev_obj_end > begin) { + mirror::Class* klass = prev_obj->GetClass<kVerifyNone, kWithoutReadBarrier>(); + if (bump_pointer_space_->HasAddress(klass)) { + LOG(WARNING) << "found inter-page object " << prev_obj + << " in non-moving space with klass " << klass + << " in moving space"; + } + first_objs_non_moving_space_[page_idx].Assign(prev_obj); + } else { + // Find the first live object in this page + bitmap->VisitMarkedRange</*kVisitOnce*/ true>( + begin, + begin + kPageSize, + [this, page_idx] (mirror::Object* obj) { + first_objs_non_moving_space_[page_idx].Assign(obj); + }); + } + // An empty entry indicates that the page has no live objects and hence + // can be skipped. + } + begin += kPageSize; + page_idx++; + } + non_moving_first_objs_count_ = page_idx; +} + +bool MarkCompact::CanCompactMovingSpaceWithMinorFault() { + size_t min_size = (moving_first_objs_count_ + black_page_count_) * kPageSize; + return minor_fault_initialized_ && shadow_to_space_map_.IsValid() && + shadow_to_space_map_.Size() >= min_size; +} + +class MarkCompact::ConcurrentCompactionGcTask : public SelfDeletingTask { + public: + explicit ConcurrentCompactionGcTask(MarkCompact* collector, size_t idx) + : collector_(collector), index_(idx) {} + + void Run(Thread* self ATTRIBUTE_UNUSED) override REQUIRES_SHARED(Locks::mutator_lock_) { + if (collector_->CanCompactMovingSpaceWithMinorFault()) { + collector_->ConcurrentCompaction<MarkCompact::kMinorFaultMode>(/*buf=*/nullptr); + } else { + // The passed page/buf to ConcurrentCompaction is used by the thread as a + // kPageSize buffer for compacting and updating objects into and then + // passing the buf to uffd ioctls. 
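(Aside: the worker loop these tasks run boils down to the standard userfaultfd copy-mode pattern: block in read() on the uffd, compose the faulting page in a thread-private buffer, and install it with UFFDIO_COPY. Below is a minimal, self-contained sketch of that general pattern; the names and the trivial zero-fill are illustrative and are not the collector's actual ConcurrentCompaction() code.)

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <cstdint>
    #include <cstring>

    // Illustrative worker loop: resolves missing-page faults on 'uffd' by
    // copying a freshly composed page from a thread-private 'buffer' into the
    // faulting address range.
    void UffdWorkerLoop(int uffd, void* buffer, size_t page_size) {
      struct uffd_msg msg;
      while (read(uffd, &msg, sizeof(msg)) == static_cast<ssize_t>(sizeof(msg))) {
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
          continue;
        }
        uintptr_t fault_page = msg.arg.pagefault.address & ~(page_size - 1);
        // The real collector would compact the objects belonging to
        // 'fault_page' into 'buffer' here; the sketch just zero-fills it.
        std::memset(buffer, 0, page_size);
        struct uffdio_copy copy = {};
        copy.src = reinterpret_cast<uintptr_t>(buffer);
        copy.dst = fault_page;
        copy.len = page_size;
        copy.mode = 0;
        // Atomically maps the page into the registered range and wakes the
        // faulting thread.
        ioctl(uffd, UFFDIO_COPY, &copy);
      }
    }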
+ uint8_t* buf = collector_->compaction_buffers_map_.Begin() + index_ * kPageSize; + collector_->ConcurrentCompaction<MarkCompact::kCopyMode>(buf); + } + } + + private: + MarkCompact* const collector_; + size_t index_; +}; + +void MarkCompact::PrepareForCompaction() { + uint8_t* space_begin = bump_pointer_space_->Begin(); + size_t vector_len = (black_allocations_begin_ - space_begin) / kOffsetChunkSize; + DCHECK_LE(vector_len, vector_length_); + for (size_t i = 0; i < vector_len; i++) { + DCHECK_LE(chunk_info_vec_[i], kOffsetChunkSize); + DCHECK_EQ(chunk_info_vec_[i], live_words_bitmap_->LiveBytesInBitmapWord(i)); + } + InitMovingSpaceFirstObjects(vector_len); + InitNonMovingSpaceFirstObjects(); + + // TODO: We can do a lot of neat tricks with this offset vector to tune the + // compaction as we wish. Originally, the compaction algorithm slides all + // live objects towards the beginning of the heap. This is nice because it + // keeps the spatial locality of objects intact. + // However, sometimes it's desired to compact objects in certain portions + // of the heap. For instance, it is expected that, over time, + // objects towards the beginning of the heap are long lived and are always + // densely packed. In this case, it makes sense to only update references in + // there and not try to compact it. + // Furthermore, we might have some large objects and may not want to move such + // objects. + // We can adjust, without too much effort, the values in the chunk_info_vec_ such + // that the objects in the dense beginning area aren't moved. OTOH, large + // objects, which could be anywhere in the heap, could also be kept from + // moving by using a similar trick. The only issue is that by doing this we will + // leave an unused hole in the middle of the heap which can't be used for + // allocations until we do a *full* compaction. + // + // At this point every element in the chunk_info_vec_ contains the live-bytes + // of the corresponding chunk. For old-to-new address computation we need + // every element to reflect total live-bytes till the corresponding chunk. + + // Live-bytes count is required to compute post_compact_end_ below. + uint32_t total; + // Update the vector one past the heap usage as it is required for black + // allocated objects' post-compact address computation. + if (vector_len < vector_length_) { + vector_len++; + total = 0; + } else { + // Fetch the value stored in the last element before it gets overwritten by + // std::exclusive_scan(). + total = chunk_info_vec_[vector_len - 1]; + } + std::exclusive_scan(chunk_info_vec_, chunk_info_vec_ + vector_len, chunk_info_vec_, 0); + total += chunk_info_vec_[vector_len - 1]; + + for (size_t i = vector_len; i < vector_length_; i++) { + DCHECK_EQ(chunk_info_vec_[i], 0u); + } + post_compact_end_ = AlignUp(space_begin + total, kPageSize); + CHECK_EQ(post_compact_end_, space_begin + moving_first_objs_count_ * kPageSize); + black_objs_slide_diff_ = black_allocations_begin_ - post_compact_end_; + // How do we handle compaction of heap portion used for allocations after the + // marking-pause? + // All allocations after the marking-pause are considered black (reachable) + // for this GC cycle. However, they need not be allocated contiguously as + // different mutators use TLABs. So we will compact the heap till the point + // where allocations took place before the marking-pause. And everything after + // that will be slid with TLAB holes, and then TLAB info in TLS will be + // appropriately updated in the pre-compaction pause. 
+ // The chunk-info vector entries for the post marking-pause allocations will be + // also updated in the pre-compaction pause. + + bool is_zygote = Runtime::Current()->IsZygote(); + if (!uffd_initialized_ && CreateUserfaultfd(/*post_fork*/false)) { + if (!use_uffd_sigbus_) { + // Register the buffer that we use for terminating concurrent compaction + struct uffdio_register uffd_register; + uffd_register.range.start = reinterpret_cast<uintptr_t>(conc_compaction_termination_page_); + uffd_register.range.len = kPageSize; + uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING; + CHECK_EQ(ioctl(uffd_, UFFDIO_REGISTER, &uffd_register), 0) + << "ioctl_userfaultfd: register compaction termination page: " << strerror(errno); + } + if (!uffd_minor_fault_supported_ && shadow_to_space_map_.IsValid()) { + // A valid shadow-map for moving space is only possible if we + // were able to map it in the constructor. That also means that its size + // matches the moving-space. + CHECK_EQ(shadow_to_space_map_.Size(), bump_pointer_space_->Capacity()); + // Release the shadow map for moving-space if we don't support minor-fault + // as it's not required. + shadow_to_space_map_.Reset(); + } + } + // For zygote we create the thread pool each time before starting compaction, + // and get rid of it when finished. This is expected to happen rarely as + // zygote spends most of the time in native fork loop. + if (uffd_ != kFallbackMode) { + if (!use_uffd_sigbus_) { + ThreadPool* pool = heap_->GetThreadPool(); + if (UNLIKELY(pool == nullptr)) { + // On devices with 2 cores, GetParallelGCThreadCount() will return 1, + // which is desired number of workers on such devices. + heap_->CreateThreadPool(std::min(heap_->GetParallelGCThreadCount(), kMaxNumUffdWorkers)); + pool = heap_->GetThreadPool(); + } + size_t num_threads = pool->GetThreadCount(); + thread_pool_counter_ = num_threads; + for (size_t i = 0; i < num_threads; i++) { + pool->AddTask(thread_running_gc_, new ConcurrentCompactionGcTask(this, i + 1)); + } + CHECK_EQ(pool->GetTaskCount(thread_running_gc_), num_threads); + } + /* + * Possible scenarios for mappings: + * A) All zygote GCs (or if minor-fault feature isn't available): uses + * uffd's copy mode + * 1) For moving-space ('to' space is same as the moving-space): + * a) Private-anonymous mappings for 'to' and 'from' space are created in + * the constructor. + * b) In the compaction pause, we mremap(dontunmap) from 'to' space to + * 'from' space. This results in moving all pages to 'from' space and + * emptying the 'to' space, thereby preparing it for userfaultfd + * registration. + * + * 2) For linear-alloc space: + * a) Private-anonymous mappings for the linear-alloc and its 'shadow' + * are created by the arena-pool. + * b) In the compaction pause, we mremap(dontumap) with similar effect as + * (A.1.b) above. + * + * B) First GC after zygote: uses uffd's copy-mode + * 1) For moving-space: + * a) If the mmap for shadow-map has been successful in the constructor, + * then we remap it (mmap with MAP_FIXED) to get a shared-anonymous + * mapping. + * b) Else, we create two memfd and ftruncate them to the moving-space + * size. + * c) Same as (A.1.b) + * d) If (B.1.a), then mremap(dontunmap) from shadow-map to + * 'to' space. This will make both of them map to the same pages + * e) If (B.1.b), then mmap with the first memfd in shared mode on the + * 'to' space. 
+ * f) At the end of compaction, we will have moved the moving-space + * objects to a MAP_SHARED mapping, readying it for minor-fault from next + * GC cycle. + * + * 2) For linear-alloc space: + * a) Same as (A.2.b) + * b) mmap a shared-anonymous mapping onto the linear-alloc space. + * c) Same as (B.1.f) + * + * C) All subsequent GCs: preferable minor-fault mode. But may also require + * using copy-mode. + * 1) For moving-space: + * a) If the shadow-map is created and no memfd was used, then that means + * we are using shared-anonymous. Therefore, mmap a shared-anonymous on + * the shadow-space. + * b) If the shadow-map is not mapped yet, then mmap one with a size + * big enough to hold the compacted moving space. This may fail, in which + * case we will use uffd's copy-mode. + * c) If (b) is successful, then mmap the free memfd onto shadow-map. + * d) Same as (A.1.b) + * e) In compaction pause, if the shadow-map was not created, then use + * copy-mode. + * f) Else, if the created map is smaller than the required-size, then + * use mremap (without dontunmap) to expand the size. If failed, then use + * copy-mode. + * g) Otherwise, same as (B.1.d) and use minor-fault mode. + * + * 2) For linear-alloc space: + * a) Same as (A.2.b) + * b) Use minor-fault mode + */ + auto mmap_shadow_map = [this](int flags, int fd) { + void* ret = mmap(shadow_to_space_map_.Begin(), + shadow_to_space_map_.Size(), + PROT_READ | PROT_WRITE, + flags, + fd, + /*offset=*/0); + DCHECK_NE(ret, MAP_FAILED) << "mmap for moving-space shadow failed:" << strerror(errno); + }; + // Setup all the virtual memory ranges required for concurrent compaction. + if (minor_fault_initialized_) { + DCHECK(!is_zygote); + if (UNLIKELY(!shadow_to_space_map_.IsValid())) { + // This case happens only once on the first GC in minor-fault mode, if + // we were unable to reserve shadow-map for moving-space in the + // beginning. + DCHECK_GE(moving_to_space_fd_, 0); + // Take extra 4MB to reduce the likelihood of requiring resizing this + // map in the pause due to black allocations. + size_t reqd_size = std::min(moving_first_objs_count_ * kPageSize + 4 * MB, + bump_pointer_space_->Capacity()); + // We cannot support memory-tool with shadow-map (as it requires + // appending a redzone) in this case because the mapping may have to be expanded + // using mremap (in KernelPreparation()), which would ignore the redzone. + // MemMap::MapFile() appends a redzone, but MemMap::MapAnonymous() doesn't. + std::string err_msg; + shadow_to_space_map_ = MemMap::MapAnonymous("moving-space-shadow", + reqd_size, + PROT_NONE, + /*low_4gb=*/kObjPtrPoisoning, + &err_msg); + + if (shadow_to_space_map_.IsValid()) { + CHECK(!kMemoryToolAddsRedzones || shadow_to_space_map_.GetRedzoneSize() == 0u); + // We want to use MemMap to get low-4GB mapping, if required, but then also + // want to have its ownership as we may grow it (in + // KernelPreparation()). If the ownership is not taken and we try to + // resize MemMap, then it unmaps the virtual range. + MemMap temp = shadow_to_space_map_.TakeReservedMemory(shadow_to_space_map_.Size(), + /*reuse*/ true); + std::swap(temp, shadow_to_space_map_); + DCHECK(!temp.IsValid()); + } else { + LOG(WARNING) << "Failed to create moving space's shadow map of " << PrettySize(reqd_size) + << " size. 
" << err_msg; + } + } + + if (LIKELY(shadow_to_space_map_.IsValid())) { + int fd = moving_to_space_fd_; + int mmap_flags = MAP_SHARED | MAP_FIXED; + if (fd == kFdUnused) { + // Unused moving-to-space fd means we are using anonymous shared + // mapping. + DCHECK_EQ(shadow_to_space_map_.Size(), bump_pointer_space_->Capacity()); + mmap_flags |= MAP_ANONYMOUS; + fd = -1; + } + // If the map is smaller than required, then we'll do mremap in the + // compaction pause to increase the size. + mmap_shadow_map(mmap_flags, fd); + } + + for (auto& data : linear_alloc_spaces_data_) { + DCHECK_EQ(mprotect(data.shadow_.Begin(), data.shadow_.Size(), PROT_READ | PROT_WRITE), 0) + << "mprotect failed: " << strerror(errno); + } + } else if (!is_zygote && uffd_minor_fault_supported_) { + // First GC after zygote-fork. We will still use uffd's copy mode but will + // use it to move objects to MAP_SHARED (to prepare for subsequent GCs, which + // will use uffd's minor-fault feature). + if (shadow_to_space_map_.IsValid() && + shadow_to_space_map_.Size() == bump_pointer_space_->Capacity()) { + mmap_shadow_map(MAP_SHARED | MAP_FIXED | MAP_ANONYMOUS, /*fd=*/-1); + } else { + size_t size = bump_pointer_space_->Capacity(); + DCHECK_EQ(moving_to_space_fd_, kFdUnused); + DCHECK_EQ(moving_from_space_fd_, kFdUnused); + const char* name = bump_pointer_space_->GetName(); + moving_to_space_fd_ = memfd_create(name, MFD_CLOEXEC); + CHECK_NE(moving_to_space_fd_, -1) + << "memfd_create: failed for " << name << ": " << strerror(errno); + moving_from_space_fd_ = memfd_create(name, MFD_CLOEXEC); + CHECK_NE(moving_from_space_fd_, -1) + << "memfd_create: failed for " << name << ": " << strerror(errno); + + // memfds are considered as files from resource limits point of view. + // And the moving space could be several hundred MBs. So increase the + // limit, if it's lower than moving-space size. + bool rlimit_changed = false; + rlimit rlim_read; + CHECK_EQ(getrlimit(RLIMIT_FSIZE, &rlim_read), 0) << "getrlimit failed: " << strerror(errno); + if (rlim_read.rlim_cur < size) { + rlimit_changed = true; + rlimit rlim = rlim_read; + rlim.rlim_cur = size; + CHECK_EQ(setrlimit(RLIMIT_FSIZE, &rlim), 0) << "setrlimit failed: " << strerror(errno); + } + + // moving-space will map this fd so that we compact objects into it. + int ret = ftruncate(moving_to_space_fd_, size); + CHECK_EQ(ret, 0) << "ftruncate failed for moving-space:" << strerror(errno); + ret = ftruncate(moving_from_space_fd_, size); + CHECK_EQ(ret, 0) << "ftruncate failed for moving-space:" << strerror(errno); + + if (rlimit_changed) { + // reset the rlimit to the original limits. 
+ CHECK_EQ(setrlimit(RLIMIT_FSIZE, &rlim_read), 0) + << "setrlimit failed: " << strerror(errno); + } + } + } + } +} + +class MarkCompact::VerifyRootMarkedVisitor : public SingleRootVisitor { + public: + explicit VerifyRootMarkedVisitor(MarkCompact* collector) : collector_(collector) { } + + void VisitRoot(mirror::Object* root, const RootInfo& info) override + REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_) { + CHECK(collector_->IsMarked(root) != nullptr) << info.ToString(); + } + + private: + MarkCompact* const collector_; +}; + +void MarkCompact::ReMarkRoots(Runtime* runtime) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + DCHECK_EQ(thread_running_gc_, Thread::Current()); + Locks::mutator_lock_->AssertExclusiveHeld(thread_running_gc_); + MarkNonThreadRoots(runtime); + MarkConcurrentRoots(static_cast<VisitRootFlags>(kVisitRootFlagNewRoots + | kVisitRootFlagStopLoggingNewRoots + | kVisitRootFlagClearRootLog), + runtime); + + if (kVerifyRootsMarked) { + TimingLogger::ScopedTiming t2("(Paused)VerifyRoots", GetTimings()); + VerifyRootMarkedVisitor visitor(this); + runtime->VisitRoots(&visitor); + } +} + +void MarkCompact::MarkingPause() { + TimingLogger::ScopedTiming t("(Paused)MarkingPause", GetTimings()); + Runtime* runtime = Runtime::Current(); + Locks::mutator_lock_->AssertExclusiveHeld(thread_running_gc_); + { + // Handle the dirty objects as we are a concurrent GC + WriterMutexLock mu(thread_running_gc_, *Locks::heap_bitmap_lock_); + { + MutexLock mu2(thread_running_gc_, *Locks::runtime_shutdown_lock_); + MutexLock mu3(thread_running_gc_, *Locks::thread_list_lock_); + std::list<Thread*> thread_list = runtime->GetThreadList()->GetList(); + for (Thread* thread : thread_list) { + thread->VisitRoots(this, static_cast<VisitRootFlags>(0)); + DCHECK_EQ(thread->GetThreadLocalGcBuffer(), nullptr); + // Need to revoke all the thread-local allocation stacks since we will + // swap the allocation stacks (below) and don't want anybody to allocate + // into the live stack. + thread->RevokeThreadLocalAllocationStack(); + bump_pointer_space_->RevokeThreadLocalBuffers(thread); + } + } + // Fetch only the accumulated objects-allocated count as it is guaranteed to + // be up-to-date after the TLAB revocation above. + freed_objects_ += bump_pointer_space_->GetAccumulatedObjectsAllocated(); + // Capture 'end' of moving-space at this point. Every allocation beyond this + // point will be considered as black. + // Align-up to page boundary so that black allocations happen from next page + // onwards. Also, it ensures that 'end' is aligned for card-table's + // ClearCardRange(). + black_allocations_begin_ = bump_pointer_space_->AlignEnd(thread_running_gc_, kPageSize); + DCHECK(IsAligned<kAlignment>(black_allocations_begin_)); + black_allocations_begin_ = AlignUp(black_allocations_begin_, kPageSize); + + // Re-mark root set. Doesn't include thread-roots as they are already marked + // above. + ReMarkRoots(runtime); + // Scan dirty objects. + RecursiveMarkDirtyObjects(/*paused*/ true, accounting::CardTable::kCardDirty); + { + TimingLogger::ScopedTiming t2("SwapStacks", GetTimings()); + heap_->SwapStacks(); + live_stack_freeze_size_ = heap_->GetLiveStack()->Size(); + } + } + // TODO: For PreSweepingGcVerification(), find correct strategy to visit/walk + // objects in bump-pointer space when we have a mark-bitmap to indicate live + // objects. At the same time we also need to be able to visit black allocations, + // even though they are not marked in the bitmap. 
Without both of these we fail + // pre-sweeping verification. As well as we leave windows open wherein a + // VisitObjects/Walk on the space would either miss some objects or visit + // unreachable ones. These windows are when we are switching from shared + // mutator-lock to exclusive and vice-versa starting from here till compaction pause. + // heap_->PreSweepingGcVerification(this); + + // Disallow new system weaks to prevent a race which occurs when someone adds + // a new system weak before we sweep them. Since this new system weak may not + // be marked, the GC may incorrectly sweep it. This also fixes a race where + // interning may attempt to return a strong reference to a string that is + // about to be swept. + runtime->DisallowNewSystemWeaks(); + // Enable the reference processing slow path, needs to be done with mutators + // paused since there is no lock in the GetReferent fast path. + heap_->GetReferenceProcessor()->EnableSlowPath(); +} + +void MarkCompact::SweepSystemWeaks(Thread* self, Runtime* runtime, const bool paused) { + TimingLogger::ScopedTiming t(paused ? "(Paused)SweepSystemWeaks" : "SweepSystemWeaks", + GetTimings()); + ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_); + runtime->SweepSystemWeaks(this); +} + +void MarkCompact::ProcessReferences(Thread* self) { + WriterMutexLock mu(self, *Locks::heap_bitmap_lock_); + GetHeap()->GetReferenceProcessor()->ProcessReferences(self, GetTimings()); +} + +void MarkCompact::Sweep(bool swap_bitmaps) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + // Ensure that nobody inserted objects in the live stack after we swapped the + // stacks. + CHECK_GE(live_stack_freeze_size_, GetHeap()->GetLiveStack()->Size()); + { + TimingLogger::ScopedTiming t2("MarkAllocStackAsLive", GetTimings()); + // Mark everything allocated since the last GC as live so that we can sweep + // concurrently, knowing that new allocations won't be marked as live. + accounting::ObjectStack* live_stack = heap_->GetLiveStack(); + heap_->MarkAllocStackAsLive(live_stack); + live_stack->Reset(); + DCHECK(mark_stack_->IsEmpty()); + } + for (const auto& space : GetHeap()->GetContinuousSpaces()) { + if (space->IsContinuousMemMapAllocSpace() && space != bump_pointer_space_) { + space::ContinuousMemMapAllocSpace* alloc_space = space->AsContinuousMemMapAllocSpace(); + TimingLogger::ScopedTiming split( + alloc_space->IsZygoteSpace() ? "SweepZygoteSpace" : "SweepMallocSpace", + GetTimings()); + RecordFree(alloc_space->Sweep(swap_bitmaps)); + } + } + SweepLargeObjects(swap_bitmaps); +} + +void MarkCompact::SweepLargeObjects(bool swap_bitmaps) { + space::LargeObjectSpace* los = heap_->GetLargeObjectsSpace(); + if (los != nullptr) { + TimingLogger::ScopedTiming split(__FUNCTION__, GetTimings()); + RecordFreeLOS(los->Sweep(swap_bitmaps)); + } +} + +void MarkCompact::ReclaimPhase() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + DCHECK(thread_running_gc_ == Thread::Current()); + Runtime* const runtime = Runtime::Current(); + // Process the references concurrently. + ProcessReferences(thread_running_gc_); + // TODO: Try to merge this system-weak sweeping with the one while updating + // references during the compaction pause. + SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/ false); + runtime->AllowNewSystemWeaks(); + // Clean up class loaders after system weaks are swept since that is how we know if class + // unloading occurred. 
+ runtime->GetClassLinker()->CleanupClassLoaders(); + { + WriterMutexLock mu(thread_running_gc_, *Locks::heap_bitmap_lock_); + // Reclaim unmarked objects. + Sweep(false); + // Swap the live and mark bitmaps for each space that we modified. This is an + // optimization that enables us to not clear live bits inside of the sweep. Only swaps unbound + // bitmaps. + SwapBitmaps(); + // Unbind the live and mark bitmaps. + GetHeap()->UnBindBitmaps(); + } +} + +// We want to avoid checking for every reference if it's within the page or +// not. This can be done if we know where in the page the holder object lies. +// If it doesn't overlap either boundary, then we can skip the checks. +template <bool kCheckBegin, bool kCheckEnd> +class MarkCompact::RefsUpdateVisitor { + public: + explicit RefsUpdateVisitor(MarkCompact* collector, + mirror::Object* obj, + uint8_t* begin, + uint8_t* end) + : collector_(collector), obj_(obj), begin_(begin), end_(end) { + DCHECK(!kCheckBegin || begin != nullptr); + DCHECK(!kCheckEnd || end != nullptr); + } + + void operator()(mirror::Object* old ATTRIBUTE_UNUSED, MemberOffset offset, bool /* is_static */) + const ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES_SHARED(Locks::heap_bitmap_lock_) { + bool update = true; + if (kCheckBegin || kCheckEnd) { + uint8_t* ref = reinterpret_cast<uint8_t*>(obj_) + offset.Int32Value(); + update = (!kCheckBegin || ref >= begin_) && (!kCheckEnd || ref < end_); + } + if (update) { + collector_->UpdateRef(obj_, offset); + } + } + + // For object arrays we don't need to check boundaries here as it's done in + // VisitReferences(). + // TODO: Optimize reference updating using SIMD instructions. Object arrays + // are perfect as all references are tightly packed. + void operator()(mirror::Object* old ATTRIBUTE_UNUSED, + MemberOffset offset, + bool /*is_static*/, + bool /*is_obj_array*/) + const ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES_SHARED(Locks::heap_bitmap_lock_) { + collector_->UpdateRef(obj_, offset); + } + + void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const + ALWAYS_INLINE + REQUIRES_SHARED(Locks::mutator_lock_) { + if (!root->IsNull()) { + VisitRoot(root); + } + } + + void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const + ALWAYS_INLINE + REQUIRES_SHARED(Locks::mutator_lock_) { + collector_->UpdateRoot(root); + } + + private: + MarkCompact* const collector_; + mirror::Object* const obj_; + uint8_t* const begin_; + uint8_t* const end_; +}; + +bool MarkCompact::IsValidObject(mirror::Object* obj) const { + mirror::Class* klass = obj->GetClass<kVerifyNone, kWithoutReadBarrier>(); + if (!heap_->GetVerification()->IsValidHeapObjectAddress(klass)) { + return false; + } + return heap_->GetVerification()->IsValidClassUnchecked<kWithFromSpaceBarrier>( + obj->GetClass<kVerifyNone, kWithFromSpaceBarrier>()); +} + +template <typename Callback> +void MarkCompact::VerifyObject(mirror::Object* ref, Callback& callback) const { + if (kIsDebugBuild) { + mirror::Class* klass = ref->GetClass<kVerifyNone, kWithFromSpaceBarrier>(); + mirror::Class* pre_compact_klass = ref->GetClass<kVerifyNone, kWithoutReadBarrier>(); + mirror::Class* klass_klass = klass->GetClass<kVerifyNone, kWithFromSpaceBarrier>(); + mirror::Class* klass_klass_klass = klass_klass->GetClass<kVerifyNone, kWithFromSpaceBarrier>(); + if (bump_pointer_space_->HasAddress(pre_compact_klass) && + reinterpret_cast<uint8_t*>(pre_compact_klass) < black_allocations_begin_) { +
CHECK(moving_space_bitmap_->Test(pre_compact_klass)) + << "ref=" << ref + << " post_compact_end=" << static_cast<void*>(post_compact_end_) + << " pre_compact_klass=" << pre_compact_klass + << " black_allocations_begin=" << static_cast<void*>(black_allocations_begin_); + CHECK(live_words_bitmap_->Test(pre_compact_klass)); + } + if (!IsValidObject(ref)) { + std::ostringstream oss; + oss << "Invalid object: " + << "ref=" << ref + << " klass=" << klass + << " klass_klass=" << klass_klass + << " klass_klass_klass=" << klass_klass_klass + << " pre_compact_klass=" << pre_compact_klass + << " from_space_begin=" << static_cast<void*>(from_space_begin_) + << " pre_compact_begin=" << static_cast<void*>(bump_pointer_space_->Begin()) + << " post_compact_end=" << static_cast<void*>(post_compact_end_) + << " black_allocations_begin=" << static_cast<void*>(black_allocations_begin_); + + // Call callback before dumping larger data like RAM and space dumps. + callback(oss); + + oss << " \nobject=" + << heap_->GetVerification()->DumpRAMAroundAddress(reinterpret_cast<uintptr_t>(ref), 128) + << " \nklass(from)=" + << heap_->GetVerification()->DumpRAMAroundAddress(reinterpret_cast<uintptr_t>(klass), 128) + << "spaces:\n"; + heap_->DumpSpaces(oss); + LOG(FATAL) << oss.str(); + } + } +} + +void MarkCompact::CompactPage(mirror::Object* obj, + uint32_t offset, + uint8_t* addr, + bool needs_memset_zero) { + DCHECK(moving_space_bitmap_->Test(obj) + && live_words_bitmap_->Test(obj)); + DCHECK(live_words_bitmap_->Test(offset)) << "obj=" << obj + << " offset=" << offset + << " addr=" << static_cast<void*>(addr) + << " black_allocs_begin=" + << static_cast<void*>(black_allocations_begin_) + << " post_compact_addr=" + << static_cast<void*>(post_compact_end_); + uint8_t* const start_addr = addr; + // How many distinct live-strides do we have. + size_t stride_count = 0; + uint8_t* last_stride = addr; + uint32_t last_stride_begin = 0; + auto verify_obj_callback = [&] (std::ostream& os) { + os << " stride_count=" << stride_count + << " last_stride=" << static_cast<void*>(last_stride) + << " offset=" << offset + << " start_addr=" << static_cast<void*>(start_addr); + }; + obj = GetFromSpaceAddr(obj); + live_words_bitmap_->VisitLiveStrides(offset, + black_allocations_begin_, + kPageSize, + [&addr, + &last_stride, + &stride_count, + &last_stride_begin, + verify_obj_callback, + this] (uint32_t stride_begin, + size_t stride_size, + bool /*is_last*/) + REQUIRES_SHARED(Locks::mutator_lock_) { + const size_t stride_in_bytes = stride_size * kAlignment; + DCHECK_LE(stride_in_bytes, kPageSize); + last_stride_begin = stride_begin; + DCHECK(IsAligned<kAlignment>(addr)); + memcpy(addr, + from_space_begin_ + stride_begin * kAlignment, + stride_in_bytes); + if (kIsDebugBuild) { + uint8_t* space_begin = bump_pointer_space_->Begin(); + // We can interpret the first word of the stride as an + // obj only from second stride onwards, as the first + // stride's first-object may have started on previous + // page. The only exception is the first page of the + // moving space. 
+ if (stride_count > 0 + || stride_begin * kAlignment < kPageSize) { + mirror::Object* o = + reinterpret_cast<mirror::Object*>(space_begin + + stride_begin + * kAlignment); + CHECK(live_words_bitmap_->Test(o)) << "ref=" << o; + CHECK(moving_space_bitmap_->Test(o)) + << "ref=" << o + << " bitmap: " + << moving_space_bitmap_->DumpMemAround(o); + VerifyObject(reinterpret_cast<mirror::Object*>(addr), + verify_obj_callback); + } + } + last_stride = addr; + addr += stride_in_bytes; + stride_count++; + }); + DCHECK_LT(last_stride, start_addr + kPageSize); + DCHECK_GT(stride_count, 0u); + size_t obj_size = 0; + uint32_t offset_within_obj = offset * kAlignment + - (reinterpret_cast<uint8_t*>(obj) - from_space_begin_); + // First object + if (offset_within_obj > 0) { + mirror::Object* to_ref = reinterpret_cast<mirror::Object*>(start_addr - offset_within_obj); + if (stride_count > 1) { + RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/false> visitor(this, + to_ref, + start_addr, + nullptr); + obj_size = obj->VisitRefsForCompaction</*kFetchObjSize*/true, /*kVisitNativeRoots*/false>( + visitor, MemberOffset(offset_within_obj), MemberOffset(-1)); + } else { + RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true> visitor(this, + to_ref, + start_addr, + start_addr + kPageSize); + obj_size = obj->VisitRefsForCompaction</*kFetchObjSize*/true, /*kVisitNativeRoots*/false>( + visitor, MemberOffset(offset_within_obj), MemberOffset(offset_within_obj + + kPageSize)); + } + obj_size = RoundUp(obj_size, kAlignment); + DCHECK_GT(obj_size, offset_within_obj) + << "obj:" << obj + << " class:" + << obj->GetClass<kDefaultVerifyFlags, kWithFromSpaceBarrier>() + << " to_addr:" << to_ref + << " black-allocation-begin:" << reinterpret_cast<void*>(black_allocations_begin_) + << " post-compact-end:" << reinterpret_cast<void*>(post_compact_end_) + << " offset:" << offset * kAlignment + << " class-after-obj-iter:" + << (class_after_obj_iter_ != class_after_obj_ordered_map_.rend() ? + class_after_obj_iter_->first.AsMirrorPtr() : nullptr) + << " last-reclaimed-page:" << reinterpret_cast<void*>(last_reclaimed_page_) + << " last-checked-reclaim-page-idx:" << last_checked_reclaim_page_idx_ + << " offset-of-last-idx:" + << pre_compact_offset_moving_space_[last_checked_reclaim_page_idx_] * kAlignment + << " first-obj-of-last-idx:" + << first_objs_moving_space_[last_checked_reclaim_page_idx_].AsMirrorPtr(); + + obj_size -= offset_within_obj; + // If there is only one stride, then adjust last_stride_begin to the + // end of the first object. + if (stride_count == 1) { + last_stride_begin += obj_size / kAlignment; + } + } + + // Except for the last page being compacted, the pages will have addr == + // start_addr + kPageSize. + uint8_t* const end_addr = addr; + addr = start_addr; + size_t bytes_done = obj_size; + // All strides except the last one can be updated without any boundary + // checks. 
+ DCHECK_LE(addr, last_stride); + size_t bytes_to_visit = last_stride - addr; + DCHECK_LE(bytes_to_visit, kPageSize); + while (bytes_to_visit > bytes_done) { + mirror::Object* ref = reinterpret_cast<mirror::Object*>(addr + bytes_done); + VerifyObject(ref, verify_obj_callback); + RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false> + visitor(this, ref, nullptr, nullptr); + obj_size = ref->VisitRefsForCompaction(visitor, MemberOffset(0), MemberOffset(-1)); + obj_size = RoundUp(obj_size, kAlignment); + bytes_done += obj_size; + } + // Last stride may have multiple objects in it and we don't know where the + // last object which crosses the page boundary starts, therefore check + // page-end in all of these objects. Also, we need to call + // VisitRefsForCompaction() with from-space object as we fetch object size, + // which in case of klass requires 'class_size_'. + uint8_t* from_addr = from_space_begin_ + last_stride_begin * kAlignment; + bytes_to_visit = end_addr - addr; + DCHECK_LE(bytes_to_visit, kPageSize); + while (bytes_to_visit > bytes_done) { + mirror::Object* ref = reinterpret_cast<mirror::Object*>(addr + bytes_done); + obj = reinterpret_cast<mirror::Object*>(from_addr); + VerifyObject(ref, verify_obj_callback); + RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true> + visitor(this, ref, nullptr, start_addr + kPageSize); + obj_size = obj->VisitRefsForCompaction(visitor, + MemberOffset(0), + MemberOffset(end_addr - (addr + bytes_done))); + obj_size = RoundUp(obj_size, kAlignment); + DCHECK_GT(obj_size, 0u) + << "from_addr:" << obj + << " from-space-class:" + << obj->GetClass<kDefaultVerifyFlags, kWithFromSpaceBarrier>() + << " to_addr:" << ref + << " black-allocation-begin:" << reinterpret_cast<void*>(black_allocations_begin_) + << " post-compact-end:" << reinterpret_cast<void*>(post_compact_end_) + << " offset:" << offset * kAlignment + << " bytes_done:" << bytes_done + << " class-after-obj-iter:" + << (class_after_obj_iter_ != class_after_obj_ordered_map_.rend() ? + class_after_obj_iter_->first.AsMirrorPtr() : nullptr) + << " last-reclaimed-page:" << reinterpret_cast<void*>(last_reclaimed_page_) + << " last-checked-reclaim-page-idx:" << last_checked_reclaim_page_idx_ + << " offset-of-last-idx:" + << pre_compact_offset_moving_space_[last_checked_reclaim_page_idx_] * kAlignment + << " first-obj-of-last-idx:" + << first_objs_moving_space_[last_checked_reclaim_page_idx_].AsMirrorPtr(); + + from_addr += obj_size; + bytes_done += obj_size; + } + // The last page that we compact may have some bytes left untouched in the + // end, we should zero them as the kernel copies at page granularity. + if (needs_memset_zero && UNLIKELY(bytes_done < kPageSize)) { + std::memset(addr + bytes_done, 0x0, kPageSize - bytes_done); + } +} + +// We store the starting point (pre_compact_page - first_obj) and first-chunk's +// size. If more TLAB(s) started in this page, then those chunks are identified +// using mark bitmap. All this info is prepared in UpdateMovingSpaceBlackAllocations(). +// If we find a set bit in the bitmap, then we copy the remaining page and then +// use the bitmap to visit each object for updating references. 
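(Aside: both CompactPage() above and SlideBlackPage() below ultimately walk maximal runs of consecutive set bits, in the live-words bitmap and the mark bitmap respectively. A minimal, self-contained illustration of that run enumeration is given here; the names are made up and none of the offset or page-boundary handling of LiveWordsBitmap::VisitLiveStrides() is reproduced.)

    #include <cstddef>
    #include <cstdint>

    // Illustrative only: enumerate maximal runs ("strides") of consecutive set
    // bits in a bit vector whose bits are assigned little-endian first within
    // each word, calling visit(start_bit, length) once per run.
    template <typename Visitor>
    void VisitSetBitRuns(const uint64_t* words, size_t num_words, Visitor&& visit) {
      size_t run_start = 0;
      size_t run_len = 0;
      for (size_t w = 0; w < num_words; ++w) {
        for (size_t b = 0; b < 64; ++b) {
          size_t bit = w * 64 + b;
          if ((words[w] >> b) & 1u) {
            if (run_len == 0) {
              run_start = bit;
            }
            ++run_len;
          } else if (run_len != 0) {
            visit(run_start, run_len);
            run_len = 0;
          }
        }
      }
      if (run_len != 0) {
        visit(run_start, run_len);
      }
    }
    // Usage sketch: each run corresponds to 'run_len * kAlignment' contiguous
    // live bytes that can be memcpy'd into the destination page in one go.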
+void MarkCompact::SlideBlackPage(mirror::Object* first_obj,
+                                 const size_t page_idx,
+                                 uint8_t* const pre_compact_page,
+                                 uint8_t* dest,
+                                 bool needs_memset_zero) {
+  DCHECK(IsAligned<kPageSize>(pre_compact_page));
+  size_t bytes_copied;
+  const uint32_t first_chunk_size = black_alloc_pages_first_chunk_size_[page_idx];
+  mirror::Object* next_page_first_obj = first_objs_moving_space_[page_idx + 1].AsMirrorPtr();
+  uint8_t* src_addr = reinterpret_cast<uint8_t*>(GetFromSpaceAddr(first_obj));
+  uint8_t* pre_compact_addr = reinterpret_cast<uint8_t*>(first_obj);
+  uint8_t* const pre_compact_page_end = pre_compact_page + kPageSize;
+  uint8_t* const dest_page_end = dest + kPageSize;
+
+  auto verify_obj_callback = [&] (std::ostream& os) {
+                               os << " first_obj=" << first_obj
+                                  << " next_page_first_obj=" << next_page_first_obj
+                                  << " first_chunk_size=" << first_chunk_size
+                                  << " dest=" << static_cast<void*>(dest)
+                                  << " pre_compact_page="
+                                  << static_cast<void* const>(pre_compact_page);
+                             };
+  // We have an empty portion at the beginning of the page. Zero it.
+  if (pre_compact_addr > pre_compact_page) {
+    bytes_copied = pre_compact_addr - pre_compact_page;
+    DCHECK_LT(bytes_copied, kPageSize);
+    if (needs_memset_zero) {
+      std::memset(dest, 0x0, bytes_copied);
+    }
+    dest += bytes_copied;
+  } else {
+    bytes_copied = 0;
+    size_t offset = pre_compact_page - pre_compact_addr;
+    pre_compact_addr = pre_compact_page;
+    src_addr += offset;
+    DCHECK(IsAligned<kPageSize>(src_addr));
+  }
+  // Copy the first chunk of live words.
+  std::memcpy(dest, src_addr, first_chunk_size);
+  // Update references in the first chunk. Use object size to find the next object.
+  {
+    size_t bytes_to_visit = first_chunk_size;
+    size_t obj_size;
+    // The first object started in some previous page. So we need to check the
+    // beginning.
+    DCHECK_LE(reinterpret_cast<uint8_t*>(first_obj), pre_compact_addr);
+    size_t offset = pre_compact_addr - reinterpret_cast<uint8_t*>(first_obj);
+    if (bytes_copied == 0 && offset > 0) {
+      mirror::Object* to_obj = reinterpret_cast<mirror::Object*>(dest - offset);
+      mirror::Object* from_obj = reinterpret_cast<mirror::Object*>(src_addr - offset);
+      // If the next page's first-obj is in this page or nullptr, then we don't
+      // need to check the end boundary.
+      if (next_page_first_obj == nullptr
+          || (first_obj != next_page_first_obj
+              && reinterpret_cast<uint8_t*>(next_page_first_obj) <= pre_compact_page_end)) {
+        RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/false> visitor(this,
+                                                                           to_obj,
+                                                                           dest,
+                                                                           nullptr);
+        obj_size = from_obj->VisitRefsForCompaction<
+                /*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(visitor,
+                                                                   MemberOffset(offset),
+                                                                   MemberOffset(-1));
+      } else {
+        RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true> visitor(this,
+                                                                          to_obj,
+                                                                          dest,
+                                                                          dest_page_end);
+        obj_size = from_obj->VisitRefsForCompaction<
+                /*kFetchObjSize*/true, /*kVisitNativeRoots*/false>(visitor,
+                                                                   MemberOffset(offset),
+                                                                   MemberOffset(offset
+                                                                                + kPageSize));
+        if (first_obj == next_page_first_obj) {
+          // The first object is the only object on this page. So there's nothing else left to do.
+ return; + } + } + obj_size = RoundUp(obj_size, kAlignment); + obj_size -= offset; + dest += obj_size; + bytes_to_visit -= obj_size; + } + bytes_copied += first_chunk_size; + // If the last object in this page is next_page_first_obj, then we need to check end boundary + bool check_last_obj = false; + if (next_page_first_obj != nullptr + && reinterpret_cast<uint8_t*>(next_page_first_obj) < pre_compact_page_end + && bytes_copied == kPageSize) { + size_t diff = pre_compact_page_end - reinterpret_cast<uint8_t*>(next_page_first_obj); + DCHECK_LE(diff, kPageSize); + DCHECK_LE(diff, bytes_to_visit); + bytes_to_visit -= diff; + check_last_obj = true; + } + while (bytes_to_visit > 0) { + mirror::Object* dest_obj = reinterpret_cast<mirror::Object*>(dest); + VerifyObject(dest_obj, verify_obj_callback); + RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false> visitor(this, + dest_obj, + nullptr, + nullptr); + obj_size = dest_obj->VisitRefsForCompaction(visitor, MemberOffset(0), MemberOffset(-1)); + obj_size = RoundUp(obj_size, kAlignment); + bytes_to_visit -= obj_size; + dest += obj_size; + } + DCHECK_EQ(bytes_to_visit, 0u); + if (check_last_obj) { + mirror::Object* dest_obj = reinterpret_cast<mirror::Object*>(dest); + VerifyObject(dest_obj, verify_obj_callback); + RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true> visitor(this, + dest_obj, + nullptr, + dest_page_end); + mirror::Object* obj = GetFromSpaceAddr(next_page_first_obj); + obj->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor, + MemberOffset(0), + MemberOffset(dest_page_end - dest)); + return; + } + } + + // Probably a TLAB finished on this page and/or a new TLAB started as well. + if (bytes_copied < kPageSize) { + src_addr += first_chunk_size; + pre_compact_addr += first_chunk_size; + // Use mark-bitmap to identify where objects are. First call + // VisitMarkedRange for only the first marked bit. If found, zero all bytes + // until that object and then call memcpy on the rest of the page. + // Then call VisitMarkedRange for all marked bits *after* the one found in + // this invocation. This time to visit references. + uintptr_t start_visit = reinterpret_cast<uintptr_t>(pre_compact_addr); + uintptr_t page_end = reinterpret_cast<uintptr_t>(pre_compact_page_end); + mirror::Object* found_obj = nullptr; + moving_space_bitmap_->VisitMarkedRange</*kVisitOnce*/true>(start_visit, + page_end, + [&found_obj](mirror::Object* obj) { + found_obj = obj; + }); + size_t remaining_bytes = kPageSize - bytes_copied; + if (found_obj == nullptr) { + if (needs_memset_zero) { + // No more black objects in this page. Zero the remaining bytes and return. + std::memset(dest, 0x0, remaining_bytes); + } + return; + } + // Copy everything in this page, which includes any zeroed regions + // in-between. 
+    std::memcpy(dest, src_addr, remaining_bytes);
+    DCHECK_LT(reinterpret_cast<uintptr_t>(found_obj), page_end);
+    moving_space_bitmap_->VisitMarkedRange(
+            reinterpret_cast<uintptr_t>(found_obj) + mirror::kObjectHeaderSize,
+            page_end,
+            [&found_obj, pre_compact_addr, dest, this, verify_obj_callback] (mirror::Object* obj)
+            REQUIRES_SHARED(Locks::mutator_lock_) {
+              ptrdiff_t diff = reinterpret_cast<uint8_t*>(found_obj) - pre_compact_addr;
+              mirror::Object* ref = reinterpret_cast<mirror::Object*>(dest + diff);
+              VerifyObject(ref, verify_obj_callback);
+              RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false>
+                      visitor(this, ref, nullptr, nullptr);
+              ref->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor,
+                                                                  MemberOffset(0),
+                                                                  MemberOffset(-1));
+              // Remember for next round.
+              found_obj = obj;
+            });
+    // found_obj may have been updated in VisitMarkedRange. Visit the last found
+    // object.
+    DCHECK_GT(reinterpret_cast<uint8_t*>(found_obj), pre_compact_addr);
+    DCHECK_LT(reinterpret_cast<uintptr_t>(found_obj), page_end);
+    ptrdiff_t diff = reinterpret_cast<uint8_t*>(found_obj) - pre_compact_addr;
+    mirror::Object* ref = reinterpret_cast<mirror::Object*>(dest + diff);
+    VerifyObject(ref, verify_obj_callback);
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true> visitor(this,
+                                                                       ref,
+                                                                       nullptr,
+                                                                       dest_page_end);
+    ref->VisitRefsForCompaction</*kFetchObjSize*/false>(
+            visitor, MemberOffset(0), MemberOffset(page_end -
+                                                   reinterpret_cast<uintptr_t>(found_obj)));
+  }
+}
+
+template <bool kFirstPageMapping>
+void MarkCompact::MapProcessedPages(uint8_t* to_space_start,
+                                    Atomic<PageState>* state_arr,
+                                    size_t arr_idx,
+                                    size_t arr_len) {
+  DCHECK(minor_fault_initialized_);
+  DCHECK_LT(arr_idx, arr_len);
+  DCHECK_ALIGNED(to_space_start, kPageSize);
+  // Claim all the contiguous pages, which are ready to be mapped, and then do
+  // so in a single ioctl. This helps avoid the overhead of invoking the
+  // syscall several times and also maps the already-processed pages, avoiding
+  // unnecessary faults on them.
+  size_t length = kFirstPageMapping ? kPageSize : 0;
+  if (kFirstPageMapping) {
+    arr_idx++;
+  }
+  // We need to guarantee that we don't end up successfully marking a later
+  // page 'mapping' and then fail to mark an earlier page. To guarantee that,
+  // we use acq_rel order.
+  for (; arr_idx < arr_len; arr_idx++, length += kPageSize) {
+    PageState expected_state = PageState::kProcessed;
+    if (!state_arr[arr_idx].compare_exchange_strong(
+            expected_state, PageState::kProcessedAndMapping, std::memory_order_acq_rel)) {
+      break;
+    }
+  }
+  if (length > 0) {
+    // Note: We need the first page to be attempted (to be mapped) by the ioctl
+    // as this function is called due to some mutator thread waiting on the
+    // 'to_space_start' page. Therefore, the ioctl must always be called
+    // with 'to_space_start' as the 'start' address because it can bail out in
+    // the middle (not attempting to map the subsequent pages) if it finds any
+    // page either already mapped in between, or missing on the shadow-map.
+    struct uffdio_continue uffd_continue;
+    uffd_continue.range.start = reinterpret_cast<uintptr_t>(to_space_start);
+    uffd_continue.range.len = length;
+    uffd_continue.mode = 0;
+    int ret = ioctl(uffd_, UFFDIO_CONTINUE, &uffd_continue);
+    if (UNLIKELY(ret == -1 && errno == EAGAIN)) {
+      // This can happen only in linear-alloc.
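+      // Note: even on such a bail-out the ioctl may have mapped a prefix of
+      // the requested range; 'uffd_continue.mapped' holds the number of bytes
+      // (if any) that were mapped starting at 'to_space_start', which the
+      // checks and the roll-back below rely on.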
+ DCHECK(linear_alloc_spaces_data_.end() != + std::find_if(linear_alloc_spaces_data_.begin(), + linear_alloc_spaces_data_.end(), + [to_space_start](const LinearAllocSpaceData& data) { + return data.begin_ <= to_space_start && to_space_start < data.end_; + })); + + // This could happen if userfaultfd couldn't find any pages mapped in the + // shadow map. For instance, if there are certain (contiguous) pages on + // linear-alloc which are allocated and have first-object set-up but have + // not been accessed yet. + // Bail out by setting the remaining pages' state back to kProcessed and + // then waking up any waiting threads. + DCHECK_GE(uffd_continue.mapped, 0); + DCHECK_ALIGNED(uffd_continue.mapped, kPageSize); + DCHECK_LT(uffd_continue.mapped, static_cast<ssize_t>(length)); + if (kFirstPageMapping) { + // In this case the first page must be mapped. + DCHECK_GE(uffd_continue.mapped, static_cast<ssize_t>(kPageSize)); + } + // Nobody would modify these pages' state simultaneously so only atomic + // store is sufficient. Use 'release' order to ensure that all states are + // modified sequentially. + for (size_t remaining_len = length - uffd_continue.mapped; remaining_len > 0; + remaining_len -= kPageSize) { + arr_idx--; + DCHECK_EQ(state_arr[arr_idx].load(std::memory_order_relaxed), + PageState::kProcessedAndMapping); + state_arr[arr_idx].store(PageState::kProcessed, std::memory_order_release); + } + uffd_continue.range.start = + reinterpret_cast<uintptr_t>(to_space_start) + uffd_continue.mapped; + uffd_continue.range.len = length - uffd_continue.mapped; + ret = ioctl(uffd_, UFFDIO_WAKE, &uffd_continue.range); + CHECK_EQ(ret, 0) << "ioctl_userfaultfd: wake failed: " << strerror(errno); + } else { + // We may receive ENOENT if gc-thread unregisters the + // range behind our back, which is fine because that + // happens only when it knows compaction is done. + CHECK(ret == 0 || !kFirstPageMapping || errno == ENOENT) + << "ioctl_userfaultfd: continue failed: " << strerror(errno); + if (ret == 0) { + DCHECK_EQ(uffd_continue.mapped, static_cast<ssize_t>(length)); + } + } + if (use_uffd_sigbus_) { + // Nobody else would modify these pages' state simultaneously so atomic + // store is sufficient. + for (; uffd_continue.mapped > 0; uffd_continue.mapped -= kPageSize) { + arr_idx--; + DCHECK_EQ(state_arr[arr_idx].load(std::memory_order_relaxed), + PageState::kProcessedAndMapping); + state_arr[arr_idx].store(PageState::kProcessedAndMapped, std::memory_order_release); + } + } + } +} + +void MarkCompact::ZeropageIoctl(void* addr, bool tolerate_eexist, bool tolerate_enoent) { + struct uffdio_zeropage uffd_zeropage; + DCHECK(IsAligned<kPageSize>(addr)); + uffd_zeropage.range.start = reinterpret_cast<uintptr_t>(addr); + uffd_zeropage.range.len = kPageSize; + uffd_zeropage.mode = 0; + int ret = ioctl(uffd_, UFFDIO_ZEROPAGE, &uffd_zeropage); + if (LIKELY(ret == 0)) { + DCHECK_EQ(uffd_zeropage.zeropage, static_cast<ssize_t>(kPageSize)); + } else { + CHECK((tolerate_enoent && errno == ENOENT) || (tolerate_eexist && errno == EEXIST)) + << "ioctl_userfaultfd: zeropage failed: " << strerror(errno) << ". addr:" << addr; + } +} + +void MarkCompact::CopyIoctl(void* dst, void* buffer) { + struct uffdio_copy uffd_copy; + uffd_copy.src = reinterpret_cast<uintptr_t>(buffer); + uffd_copy.dst = reinterpret_cast<uintptr_t>(dst); + uffd_copy.len = kPageSize; + uffd_copy.mode = 0; + CHECK_EQ(ioctl(uffd_, UFFDIO_COPY, &uffd_copy), 0) + << "ioctl_userfaultfd: copy failed: " << strerror(errno) << ". 
src:" << buffer + << " dst:" << dst; + DCHECK_EQ(uffd_copy.copy, static_cast<ssize_t>(kPageSize)); +} + +template <int kMode, typename CompactionFn> +void MarkCompact::DoPageCompactionWithStateChange(size_t page_idx, + size_t status_arr_len, + uint8_t* to_space_page, + uint8_t* page, + CompactionFn func) { + PageState expected_state = PageState::kUnprocessed; + PageState desired_state = + kMode == kCopyMode ? PageState::kProcessingAndMapping : PageState::kProcessing; + // In the concurrent case (kMode != kFallbackMode) we need to ensure that the update + // to moving_spaces_status_[page_idx] is released before the contents of the page are + // made accessible to other threads. + // + // We need acquire ordering here to ensure that when the CAS fails, another thread + // has completed processing the page, which is guaranteed by the release below. + if (kMode == kFallbackMode || moving_pages_status_[page_idx].compare_exchange_strong( + expected_state, desired_state, std::memory_order_acquire)) { + func(); + if (kMode == kCopyMode) { + CopyIoctl(to_space_page, page); + if (use_uffd_sigbus_) { + // Store is sufficient as no other thread would modify the status at this point. + moving_pages_status_[page_idx].store(PageState::kProcessedAndMapped, + std::memory_order_release); + } + } else if (kMode == kMinorFaultMode) { + expected_state = PageState::kProcessing; + desired_state = PageState::kProcessed; + // the CAS needs to be with release order to ensure that stores to the + // page makes it to memory *before* other threads observe that it's + // ready to be mapped. + if (!moving_pages_status_[page_idx].compare_exchange_strong( + expected_state, desired_state, std::memory_order_release)) { + // Some mutator has requested to map the page after processing it. + DCHECK_EQ(expected_state, PageState::kProcessingAndMapping); + MapProcessedPages</*kFirstPageMapping=*/true>( + to_space_page, moving_pages_status_, page_idx, status_arr_len); + } + } + } else { + DCHECK_GT(expected_state, PageState::kProcessed); + } +} + +void MarkCompact::FreeFromSpacePages(size_t cur_page_idx, int mode) { + // Thanks to sliding compaction, bump-pointer allocations, and reverse + // compaction (see CompactMovingSpace) the logic here is pretty simple: find + // the to-space page up to which compaction has finished, all the from-space + // pages corresponding to this onwards can be freed. There are some corner + // cases to be taken care of, which are described below. + size_t idx = last_checked_reclaim_page_idx_; + // Find the to-space page up to which the corresponding from-space pages can be + // freed. + for (; idx > cur_page_idx; idx--) { + PageState state = moving_pages_status_[idx - 1].load(std::memory_order_acquire); + if (state == PageState::kMutatorProcessing) { + // Some mutator is working on the page. + break; + } + DCHECK(state >= PageState::kProcessed || + (state == PageState::kUnprocessed && + (mode == kFallbackMode || idx > moving_first_objs_count_))); + } + DCHECK_LE(idx, last_checked_reclaim_page_idx_); + if (idx == last_checked_reclaim_page_idx_) { + // Nothing to do. + return; + } + + uint8_t* reclaim_begin; + uint8_t* idx_addr; + // Calculate the first from-space page to be freed using 'idx'. If the + // first-object of the idx'th to-space page started before the corresponding + // from-space page, which is almost always the case in the compaction portion + // of the moving-space, then it indicates that the subsequent pages that are + // yet to be compacted will need the from-space pages. 
Therefore, find the page + // (from the already compacted pages) whose first-object is different from + // ours. All the from-space pages starting from that one are safe to be + // removed. Please note that this iteration is not expected to be long in + // normal cases as objects are smaller than page size. + if (idx >= moving_first_objs_count_) { + // black-allocated portion of the moving-space + idx_addr = black_allocations_begin_ + (idx - moving_first_objs_count_) * kPageSize; + reclaim_begin = idx_addr; + mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr(); + if (first_obj != nullptr && reinterpret_cast<uint8_t*>(first_obj) < reclaim_begin) { + size_t idx_len = moving_first_objs_count_ + black_page_count_; + for (size_t i = idx + 1; i < idx_len; i++) { + mirror::Object* obj = first_objs_moving_space_[i].AsMirrorPtr(); + // A null first-object indicates that the corresponding to-space page is + // not used yet. So we can compute its from-space page and use that. + if (obj != first_obj) { + reclaim_begin = obj != nullptr + ? AlignUp(reinterpret_cast<uint8_t*>(obj), kPageSize) + : (black_allocations_begin_ + (i - moving_first_objs_count_) * kPageSize); + break; + } + } + } + } else { + DCHECK_GE(pre_compact_offset_moving_space_[idx], 0u); + idx_addr = bump_pointer_space_->Begin() + pre_compact_offset_moving_space_[idx] * kAlignment; + reclaim_begin = idx_addr; + DCHECK_LE(reclaim_begin, black_allocations_begin_); + mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr(); + if (reinterpret_cast<uint8_t*>(first_obj) < reclaim_begin) { + DCHECK_LT(idx, moving_first_objs_count_); + mirror::Object* obj = first_obj; + for (size_t i = idx + 1; i < moving_first_objs_count_; i++) { + obj = first_objs_moving_space_[i].AsMirrorPtr(); + if (first_obj != obj) { + DCHECK_LT(first_obj, obj); + DCHECK_LT(reclaim_begin, reinterpret_cast<uint8_t*>(obj)); + reclaim_begin = reinterpret_cast<uint8_t*>(obj); + break; + } + } + if (obj == first_obj) { + reclaim_begin = black_allocations_begin_; + } + } + reclaim_begin = AlignUp(reclaim_begin, kPageSize); + } + + DCHECK_NE(reclaim_begin, nullptr); + DCHECK_ALIGNED(reclaim_begin, kPageSize); + DCHECK_ALIGNED(last_reclaimed_page_, kPageSize); + // Check if the 'class_after_obj_map_' map allows pages to be freed. + for (; class_after_obj_iter_ != class_after_obj_ordered_map_.rend(); class_after_obj_iter_++) { + mirror::Object* klass = class_after_obj_iter_->first.AsMirrorPtr(); + mirror::Class* from_klass = static_cast<mirror::Class*>(GetFromSpaceAddr(klass)); + // Check with class' end to ensure that, if required, the entire class survives. + uint8_t* klass_end = reinterpret_cast<uint8_t*>(klass) + from_klass->SizeOf<kVerifyNone>(); + DCHECK_LE(klass_end, last_reclaimed_page_); + if (reinterpret_cast<uint8_t*>(klass_end) >= reclaim_begin) { + // Found a class which is in the reclaim range. + uint8_t* obj_addr = reinterpret_cast<uint8_t*>(class_after_obj_iter_->second.AsMirrorPtr()); + // NOTE: Don't assert that obj is of 'klass' type as klass could instead + // be its super-class. + if (obj_addr < idx_addr) { + // Its lowest-address object is not compacted yet. Reclaim starting from + // the end of this class. + reclaim_begin = AlignUp(klass_end, kPageSize); + } else { + // Continue consuming pairs wherein the lowest address object has already + // been compacted. + continue; + } + } + // All the remaining class (and thereby corresponding object) addresses are + // lower than the reclaim range. 
+    break;
+  }
+
+  ssize_t size = last_reclaimed_page_ - reclaim_begin;
+  if (size >= kMinFromSpaceMadviseSize) {
+    int behavior = minor_fault_initialized_ ? MADV_REMOVE : MADV_DONTNEED;
+    CHECK_EQ(madvise(reclaim_begin + from_space_slide_diff_, size, behavior), 0)
+        << "madvise of from-space failed: " << strerror(errno);
+    last_reclaimed_page_ = reclaim_begin;
+  }
+  last_checked_reclaim_page_idx_ = idx;
+}
+
+void MarkCompact::UpdateClassAfterObjMap() {
+  CHECK(class_after_obj_ordered_map_.empty());
+  for (const auto& pair : class_after_obj_hash_map_) {
+    auto super_class_iter = super_class_after_class_hash_map_.find(pair.first);
+    ObjReference key = super_class_iter != super_class_after_class_hash_map_.end()
+                       ? super_class_iter->second
+                       : pair.first;
+    if (std::less<mirror::Object*>{}(pair.second.AsMirrorPtr(), key.AsMirrorPtr()) &&
+        bump_pointer_space_->HasAddress(key.AsMirrorPtr())) {
+      auto [ret_iter, success] = class_after_obj_ordered_map_.try_emplace(key, pair.second);
+      // It could fail only if the class 'key' has objects of its own, which are
+      // lower in address order, as well as objects of some of its derived
+      // classes. In this case, choose the lowest-address object.
+      if (!success &&
+          std::less<mirror::Object*>{}(pair.second.AsMirrorPtr(), ret_iter->second.AsMirrorPtr())) {
+        ret_iter->second = pair.second;
+      }
+    }
+  }
+  class_after_obj_hash_map_.clear();
+  super_class_after_class_hash_map_.clear();
+}
+
+template <int kMode>
+void MarkCompact::CompactMovingSpace(uint8_t* page) {
+  // For every page we have a starting object, which may have started in some
+  // preceding page, and an offset within that object from where we must start
+  // copying.
+  // Consult the live-words bitmap to copy all contiguously live words at a
+  // time. These words may constitute multiple objects. To avoid the need to
+  // consult the mark-bitmap to find where the next live object starts, we use
+  // the object size returned by VisitRefsForCompaction.
+  //
+  // We do the compaction in the reverse direction so that the pages containing
+  // TLABs and the latest allocations are processed first.
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  size_t page_status_arr_len = moving_first_objs_count_ + black_page_count_;
+  size_t idx = page_status_arr_len;
+  uint8_t* to_space_end = bump_pointer_space_->Begin() + page_status_arr_len * kPageSize;
+  uint8_t* shadow_space_end = nullptr;
+  if (kMode == kMinorFaultMode) {
+    shadow_space_end = shadow_to_space_map_.Begin() + page_status_arr_len * kPageSize;
+  }
+  uint8_t* pre_compact_page = black_allocations_begin_ + (black_page_count_ * kPageSize);
+
+  DCHECK(IsAligned<kPageSize>(pre_compact_page));
+
+  UpdateClassAfterObjMap();
+  // These variables are maintained by FreeFromSpacePages().
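+  // last_reclaimed_page_ is the to-space address down to which the
+  // corresponding from-space pages have already been madvised away, and
+  // last_checked_reclaim_page_idx_ is the page index up to which reclamation
+  // has been evaluated; both only move downwards as compaction proceeds in
+  // reverse.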
+ last_reclaimed_page_ = pre_compact_page; + last_checked_reclaim_page_idx_ = idx; + class_after_obj_iter_ = class_after_obj_ordered_map_.rbegin(); + // Allocated-black pages + while (idx > moving_first_objs_count_) { + idx--; + pre_compact_page -= kPageSize; + to_space_end -= kPageSize; + if (kMode == kMinorFaultMode) { + shadow_space_end -= kPageSize; + page = shadow_space_end; + } else if (kMode == kFallbackMode) { + page = to_space_end; + } + mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr(); + if (first_obj != nullptr) { + DoPageCompactionWithStateChange<kMode>( + idx, + page_status_arr_len, + to_space_end, + page, + [&]() REQUIRES_SHARED(Locks::mutator_lock_) { + SlideBlackPage(first_obj, idx, pre_compact_page, page, kMode == kCopyMode); + }); + // We are sliding here, so no point attempting to madvise for every + // page. Wait for enough pages to be done. + if (idx % (kMinFromSpaceMadviseSize / kPageSize) == 0) { + FreeFromSpacePages(idx, kMode); + } + } + } + DCHECK_EQ(pre_compact_page, black_allocations_begin_); + + while (idx > 0) { + idx--; + to_space_end -= kPageSize; + if (kMode == kMinorFaultMode) { + shadow_space_end -= kPageSize; + page = shadow_space_end; + } else if (kMode == kFallbackMode) { + page = to_space_end; + } + mirror::Object* first_obj = first_objs_moving_space_[idx].AsMirrorPtr(); + DoPageCompactionWithStateChange<kMode>( + idx, page_status_arr_len, to_space_end, page, [&]() REQUIRES_SHARED(Locks::mutator_lock_) { + CompactPage(first_obj, pre_compact_offset_moving_space_[idx], page, kMode == kCopyMode); + }); + FreeFromSpacePages(idx, kMode); + } + DCHECK_EQ(to_space_end, bump_pointer_space_->Begin()); +} + +void MarkCompact::UpdateNonMovingPage(mirror::Object* first, uint8_t* page) { + DCHECK_LT(reinterpret_cast<uint8_t*>(first), page + kPageSize); + // For every object found in the page, visit the previous object. This ensures + // that we can visit without checking page-end boundary. + // Call VisitRefsForCompaction with from-space read-barrier as the klass object and + // super-class loads require it. + // TODO: Set kVisitNativeRoots to false once we implement concurrent + // compaction + mirror::Object* curr_obj = first; + non_moving_space_bitmap_->VisitMarkedRange( + reinterpret_cast<uintptr_t>(first) + mirror::kObjectHeaderSize, + reinterpret_cast<uintptr_t>(page + kPageSize), + [&](mirror::Object* next_obj) { + // TODO: Once non-moving space update becomes concurrent, we'll + // require fetching the from-space address of 'curr_obj' and then call + // visitor on that. + if (reinterpret_cast<uint8_t*>(curr_obj) < page) { + RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/false> + visitor(this, curr_obj, page, page + kPageSize); + MemberOffset begin_offset(page - reinterpret_cast<uint8_t*>(curr_obj)); + // Native roots shouldn't be visited as they are done when this + // object's beginning was visited in the preceding page. 
+          curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false, /*kVisitNativeRoots*/false>(
+                  visitor, begin_offset, MemberOffset(-1));
+        } else {
+          RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false>
+                  visitor(this, curr_obj, page, page + kPageSize);
+          curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor,
+                                                                   MemberOffset(0),
+                                                                   MemberOffset(-1));
+        }
+        curr_obj = next_obj;
+      });
+
+  MemberOffset end_offset(page + kPageSize - reinterpret_cast<uint8_t*>(curr_obj));
+  if (reinterpret_cast<uint8_t*>(curr_obj) < page) {
+    RefsUpdateVisitor</*kCheckBegin*/true, /*kCheckEnd*/true>
+        visitor(this, curr_obj, page, page + kPageSize);
+    curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false, /*kVisitNativeRoots*/false>(
+        visitor, MemberOffset(page - reinterpret_cast<uint8_t*>(curr_obj)), end_offset);
+  } else {
+    RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/true>
+        visitor(this, curr_obj, page, page + kPageSize);
+    curr_obj->VisitRefsForCompaction</*kFetchObjSize*/false>(visitor, MemberOffset(0), end_offset);
+  }
+}
+
+void MarkCompact::UpdateNonMovingSpace() {
+  TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings());
+  // Iterating in reverse ensures that the class pointer in objects which span
+  // across more than one page gets updated last. This is necessary for
+  // VisitRefsForCompaction() to work correctly.
+  // TODO: If and when we make the non-moving space update concurrent, implement
+  // a mechanism to remember class pointers for such objects off-heap and pass
+  // them to VisitRefsForCompaction().
+  uint8_t* page = non_moving_space_->Begin() + non_moving_first_objs_count_ * kPageSize;
+  for (ssize_t i = non_moving_first_objs_count_ - 1; i >= 0; i--) {
+    mirror::Object* obj = first_objs_non_moving_space_[i].AsMirrorPtr();
+    page -= kPageSize;
+    // null means there are no objects on the page whose references need updating.
+    if (obj != nullptr) {
+      UpdateNonMovingPage(obj, page);
+    }
+  }
+}
+
+void MarkCompact::UpdateMovingSpaceBlackAllocations() {
+  // For sliding black pages, we need the first-object, which overlaps with the
+  // first byte of the page. Additionally, we compute the size of the first
+  // chunk of black objects. This will suffice for most black pages. Unlike
+  // compaction pages, here we don't need to pre-compute the offset within the
+  // first-obj from where sliding has to start. That can be calculated using
+  // the pre-compact address of the page. Therefore, to save space, we store
+  // the first chunk's size in the black_alloc_pages_first_chunk_size_ array.
+  // For the pages which may have holes after the first chunk, which could happen
+  // if a new TLAB starts in the middle of the page, we mark the objects in
+  // the mark-bitmap. So, if the first-chunk size is smaller than kPageSize,
+  // then we use the mark-bitmap for the remainder of the page.
+  uint8_t* const begin = bump_pointer_space_->Begin();
+  uint8_t* black_allocs = black_allocations_begin_;
+  DCHECK_LE(begin, black_allocs);
+  size_t consumed_blocks_count = 0;
+  size_t first_block_size;
+  // Get the list of all blocks allocated in the bump-pointer space.
+ std::vector<size_t>* block_sizes = bump_pointer_space_->GetBlockSizes(thread_running_gc_, + &first_block_size); + DCHECK_LE(first_block_size, (size_t)(black_allocs - begin)); + if (block_sizes != nullptr) { + size_t black_page_idx = moving_first_objs_count_; + uint8_t* block_end = begin + first_block_size; + uint32_t remaining_chunk_size = 0; + uint32_t first_chunk_size = 0; + mirror::Object* first_obj = nullptr; + for (size_t block_size : *block_sizes) { + block_end += block_size; + // Skip the blocks that are prior to the black allocations. These will be + // merged with the main-block later. + if (black_allocs >= block_end) { + consumed_blocks_count++; + continue; + } + mirror::Object* obj = reinterpret_cast<mirror::Object*>(black_allocs); + bool set_mark_bit = remaining_chunk_size > 0; + // We don't know how many objects are allocated in the current block. When we hit + // a null assume it's the end. This works as every block is expected to + // have objects allocated linearly using bump-pointer. + // BumpPointerSpace::Walk() also works similarly. + while (black_allocs < block_end + && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) { + // Try to keep instructions which access class instance together to + // avoid reloading the pointer from object. + size_t obj_size = obj->SizeOf(); + bytes_scanned_ += obj_size; + obj_size = RoundUp(obj_size, kAlignment); + UpdateClassAfterObjectMap(obj); + if (first_obj == nullptr) { + first_obj = obj; + } + // We only need the mark-bitmap in the pages wherein a new TLAB starts in + // the middle of the page. + if (set_mark_bit) { + moving_space_bitmap_->Set(obj); + } + // Handle objects which cross page boundary, including objects larger + // than page size. + if (remaining_chunk_size + obj_size >= kPageSize) { + set_mark_bit = false; + first_chunk_size += kPageSize - remaining_chunk_size; + remaining_chunk_size += obj_size; + // We should not store first-object and remaining_chunk_size if there were + // unused bytes before this TLAB, in which case we must have already + // stored the values (below). + if (black_alloc_pages_first_chunk_size_[black_page_idx] == 0) { + black_alloc_pages_first_chunk_size_[black_page_idx] = first_chunk_size; + first_objs_moving_space_[black_page_idx].Assign(first_obj); + } + black_page_idx++; + remaining_chunk_size -= kPageSize; + // Consume an object larger than page size. + while (remaining_chunk_size >= kPageSize) { + black_alloc_pages_first_chunk_size_[black_page_idx] = kPageSize; + first_objs_moving_space_[black_page_idx].Assign(obj); + black_page_idx++; + remaining_chunk_size -= kPageSize; + } + first_obj = remaining_chunk_size > 0 ? obj : nullptr; + first_chunk_size = remaining_chunk_size; + } else { + DCHECK_LE(first_chunk_size, remaining_chunk_size); + first_chunk_size += obj_size; + remaining_chunk_size += obj_size; + } + black_allocs += obj_size; + obj = reinterpret_cast<mirror::Object*>(black_allocs); + } + DCHECK_LE(black_allocs, block_end); + DCHECK_LT(remaining_chunk_size, kPageSize); + // consume the unallocated portion of the block + if (black_allocs < block_end) { + // first-chunk of the current page ends here. Store it. 
+ if (first_chunk_size > 0 && black_alloc_pages_first_chunk_size_[black_page_idx] == 0) { + black_alloc_pages_first_chunk_size_[black_page_idx] = first_chunk_size; + first_objs_moving_space_[black_page_idx].Assign(first_obj); + } + first_chunk_size = 0; + first_obj = nullptr; + size_t page_remaining = kPageSize - remaining_chunk_size; + size_t block_remaining = block_end - black_allocs; + if (page_remaining <= block_remaining) { + block_remaining -= page_remaining; + // current page and the subsequent empty pages in the block + black_page_idx += 1 + block_remaining / kPageSize; + remaining_chunk_size = block_remaining % kPageSize; + } else { + remaining_chunk_size += block_remaining; + } + black_allocs = block_end; + } + } + if (black_page_idx < bump_pointer_space_->Size() / kPageSize) { + // Store the leftover first-chunk, if any, and update page index. + if (black_alloc_pages_first_chunk_size_[black_page_idx] > 0) { + black_page_idx++; + } else if (first_chunk_size > 0) { + black_alloc_pages_first_chunk_size_[black_page_idx] = first_chunk_size; + first_objs_moving_space_[black_page_idx].Assign(first_obj); + black_page_idx++; + } + } + black_page_count_ = black_page_idx - moving_first_objs_count_; + delete block_sizes; + } + // Update bump-pointer space by consuming all the pre-black blocks into the + // main one. + bump_pointer_space_->SetBlockSizes(thread_running_gc_, + post_compact_end_ - begin, + consumed_blocks_count); +} + +void MarkCompact::UpdateNonMovingSpaceBlackAllocations() { + accounting::ObjectStack* stack = heap_->GetAllocationStack(); + const StackReference<mirror::Object>* limit = stack->End(); + uint8_t* const space_begin = non_moving_space_->Begin(); + for (StackReference<mirror::Object>* it = stack->Begin(); it != limit; ++it) { + mirror::Object* obj = it->AsMirrorPtr(); + if (obj != nullptr && non_moving_space_bitmap_->HasAddress(obj)) { + non_moving_space_bitmap_->Set(obj); + // Clear so that we don't try to set the bit again in the next GC-cycle. 
+ it->Clear(); + size_t idx = (reinterpret_cast<uint8_t*>(obj) - space_begin) / kPageSize; + uint8_t* page_begin = AlignDown(reinterpret_cast<uint8_t*>(obj), kPageSize); + mirror::Object* first_obj = first_objs_non_moving_space_[idx].AsMirrorPtr(); + if (first_obj == nullptr + || (obj < first_obj && reinterpret_cast<uint8_t*>(first_obj) > page_begin)) { + first_objs_non_moving_space_[idx].Assign(obj); + } + mirror::Object* next_page_first_obj = first_objs_non_moving_space_[++idx].AsMirrorPtr(); + uint8_t* next_page_begin = page_begin + kPageSize; + if (next_page_first_obj == nullptr + || reinterpret_cast<uint8_t*>(next_page_first_obj) > next_page_begin) { + size_t obj_size = RoundUp(obj->SizeOf<kDefaultVerifyFlags>(), kAlignment); + uint8_t* obj_end = reinterpret_cast<uint8_t*>(obj) + obj_size; + while (next_page_begin < obj_end) { + first_objs_non_moving_space_[idx++].Assign(obj); + next_page_begin += kPageSize; + } + } + // update first_objs count in case we went past non_moving_first_objs_count_ + non_moving_first_objs_count_ = std::max(non_moving_first_objs_count_, idx); + } + } +} + +class MarkCompact::ImmuneSpaceUpdateObjVisitor { + public: + ImmuneSpaceUpdateObjVisitor(MarkCompact* collector, bool visit_native_roots) + : collector_(collector), visit_native_roots_(visit_native_roots) {} + + ALWAYS_INLINE void operator()(mirror::Object* obj) const REQUIRES(Locks::mutator_lock_) { + RefsUpdateVisitor</*kCheckBegin*/false, /*kCheckEnd*/false> visitor(collector_, + obj, + /*begin_*/nullptr, + /*end_*/nullptr); + if (visit_native_roots_) { + obj->VisitRefsForCompaction</*kFetchObjSize*/ false, /*kVisitNativeRoots*/ true>( + visitor, MemberOffset(0), MemberOffset(-1)); + } else { + obj->VisitRefsForCompaction</*kFetchObjSize*/ false>( + visitor, MemberOffset(0), MemberOffset(-1)); + } + } + + static void Callback(mirror::Object* obj, void* arg) REQUIRES(Locks::mutator_lock_) { + reinterpret_cast<ImmuneSpaceUpdateObjVisitor*>(arg)->operator()(obj); + } + + private: + MarkCompact* const collector_; + const bool visit_native_roots_; +}; + +class MarkCompact::ClassLoaderRootsUpdater : public ClassLoaderVisitor { + public: + explicit ClassLoaderRootsUpdater(MarkCompact* collector) : collector_(collector) {} + + void Visit(ObjPtr<mirror::ClassLoader> class_loader) override + REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_) { + ClassTable* const class_table = class_loader->GetClassTable(); + if (class_table != nullptr) { + class_table->VisitRoots(*this); + } + } + + void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const + REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_) { + if (!root->IsNull()) { + VisitRoot(root); + } + } + + void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const + REQUIRES(Locks::heap_bitmap_lock_) REQUIRES_SHARED(Locks::mutator_lock_) { + collector_->VisitRoots(&root, 1, RootInfo(RootType::kRootVMInternal)); + } + + private: + MarkCompact* collector_; +}; + +class MarkCompact::LinearAllocPageUpdater { + public: + explicit LinearAllocPageUpdater(MarkCompact* collector) : collector_(collector) {} + + void operator()(uint8_t* page_begin, uint8_t* first_obj) ALWAYS_INLINE + REQUIRES_SHARED(Locks::mutator_lock_) { + DCHECK_ALIGNED(page_begin, kPageSize); + uint8_t* page_end = page_begin + kPageSize; + uint32_t obj_size; + for (uint8_t* byte = first_obj; byte < page_end;) { + TrackingHeader* header = reinterpret_cast<TrackingHeader*>(byte); + obj_size = header->GetSize(); + if 
(UNLIKELY(obj_size == 0)) { + // No more objects in this page to visit. + last_page_touched_ = byte >= page_begin; + return; + } + uint8_t* obj = byte + sizeof(TrackingHeader); + uint8_t* obj_end = byte + obj_size; + if (header->Is16Aligned()) { + obj = AlignUp(obj, 16); + } + uint8_t* begin_boundary = std::max(obj, page_begin); + uint8_t* end_boundary = std::min(obj_end, page_end); + if (begin_boundary < end_boundary) { + VisitObject(header->GetKind(), obj, begin_boundary, end_boundary); + } + if (ArenaAllocator::IsRunningOnMemoryTool()) { + obj_size += ArenaAllocator::kMemoryToolRedZoneBytes; + } + byte += RoundUp(obj_size, LinearAlloc::kAlignment); + } + last_page_touched_ = true; + } + + bool WasLastPageTouched() const { return last_page_touched_; } + + void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const + ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) { + if (!root->IsNull()) { + VisitRoot(root); + } + } + + void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const + ALWAYS_INLINE REQUIRES_SHARED(Locks::mutator_lock_) { + mirror::Object* old_ref = root->AsMirrorPtr(); + DCHECK_NE(old_ref, nullptr); + if (collector_->live_words_bitmap_->HasAddress(old_ref)) { + mirror::Object* new_ref = old_ref; + if (reinterpret_cast<uint8_t*>(old_ref) >= collector_->black_allocations_begin_) { + new_ref = collector_->PostCompactBlackObjAddr(old_ref); + } else if (collector_->live_words_bitmap_->Test(old_ref)) { + DCHECK(collector_->moving_space_bitmap_->Test(old_ref)) << old_ref; + new_ref = collector_->PostCompactOldObjAddr(old_ref); + } + if (old_ref != new_ref) { + root->Assign(new_ref); + } + } + } + + private: + void VisitObject(LinearAllocKind kind, + void* obj, + uint8_t* start_boundary, + uint8_t* end_boundary) const REQUIRES_SHARED(Locks::mutator_lock_) { + switch (kind) { + case LinearAllocKind::kNoGCRoots: + break; + case LinearAllocKind::kGCRootArray: + { + GcRoot<mirror::Object>* root = reinterpret_cast<GcRoot<mirror::Object>*>(start_boundary); + GcRoot<mirror::Object>* last = reinterpret_cast<GcRoot<mirror::Object>*>(end_boundary); + for (; root < last; root++) { + VisitRootIfNonNull(root->AddressWithoutBarrier()); + } + } + break; + case LinearAllocKind::kArtMethodArray: + { + LengthPrefixedArray<ArtMethod>* array = static_cast<LengthPrefixedArray<ArtMethod>*>(obj); + // Old methods are clobbered in debug builds. Check size to confirm if the array + // has any GC roots to visit. 
See ClassLinker::LinkMethodsHelper::ClobberOldMethods() + if (array->size() > 0) { + if (collector_->pointer_size_ == PointerSize::k64) { + ArtMethod::VisitArrayRoots<PointerSize::k64>( + *this, start_boundary, end_boundary, array); + } else { + DCHECK_EQ(collector_->pointer_size_, PointerSize::k32); + ArtMethod::VisitArrayRoots<PointerSize::k32>( + *this, start_boundary, end_boundary, array); + } + } + } + break; + case LinearAllocKind::kArtMethod: + ArtMethod::VisitRoots(*this, start_boundary, end_boundary, static_cast<ArtMethod*>(obj)); + break; + case LinearAllocKind::kArtFieldArray: + ArtField::VisitArrayRoots(*this, + start_boundary, + end_boundary, + static_cast<LengthPrefixedArray<ArtField>*>(obj)); + break; + case LinearAllocKind::kDexCacheArray: + { + mirror::DexCachePair<mirror::Object>* first = + reinterpret_cast<mirror::DexCachePair<mirror::Object>*>(start_boundary); + mirror::DexCachePair<mirror::Object>* last = + reinterpret_cast<mirror::DexCachePair<mirror::Object>*>(end_boundary); + mirror::DexCache::VisitDexCachePairRoots(*this, first, last); + } + } + } + + MarkCompact* const collector_; + // Whether the last page was touched or not. + bool last_page_touched_; +}; + +void MarkCompact::CompactionPause() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + Runtime* runtime = Runtime::Current(); + non_moving_space_bitmap_ = non_moving_space_->GetLiveBitmap(); + if (kIsDebugBuild) { + DCHECK_EQ(thread_running_gc_, Thread::Current()); + stack_low_addr_ = thread_running_gc_->GetStackEnd(); + stack_high_addr_ = + reinterpret_cast<char*>(stack_low_addr_) + thread_running_gc_->GetStackSize(); + } + { + TimingLogger::ScopedTiming t2("(Paused)UpdateCompactionDataStructures", GetTimings()); + ReaderMutexLock rmu(thread_running_gc_, *Locks::heap_bitmap_lock_); + // Refresh data-structures to catch-up on allocations that may have + // happened since marking-phase pause. + // There could be several TLABs that got allocated since marking pause. We + // don't want to compact them and instead update the TLAB info in TLS and + // let mutators continue to use the TLABs. + // We need to set all the bits in live-words bitmap corresponding to allocated + // objects. Also, we need to find the objects that are overlapping with + // page-begin boundaries. Unlike objects allocated before + // black_allocations_begin_, which can be identified via mark-bitmap, we can get + // this info only via walking the space past black_allocations_begin_, which + // involves fetching object size. + // TODO: We can reduce the time spent on this in a pause by performing one + // round of this concurrently prior to the pause. + UpdateMovingSpaceBlackAllocations(); + // TODO: If we want to avoid this allocation in a pause then we will have to + // allocate an array for the entire moving-space size, which can be made + // part of info_map_. + moving_pages_status_ = new Atomic<PageState>[moving_first_objs_count_ + black_page_count_]; + if (kIsDebugBuild) { + size_t len = moving_first_objs_count_ + black_page_count_; + for (size_t i = 0; i < len; i++) { + CHECK_EQ(moving_pages_status_[i].load(std::memory_order_relaxed), + PageState::kUnprocessed); + } + } + // Iterate over the allocation_stack_, for every object in the non-moving + // space: + // 1. Mark the object in live bitmap + // 2. Erase the object from allocation stack + // 3. In the corresponding page, if the first-object vector needs updating + // then do so. 
+    UpdateNonMovingSpaceBlackAllocations();
+
+    // This store is visible to mutators (or uffd worker threads) as the mutator
+    // lock's unlock guarantees that.
+    compacting_ = true;
+    // Start updating roots and system weaks now.
+    heap_->GetReferenceProcessor()->UpdateRoots(this);
+  }
+  {
+    TimingLogger::ScopedTiming t2("(Paused)UpdateClassLoaderRoots", GetTimings());
+    ReaderMutexLock rmu(thread_running_gc_, *Locks::classlinker_classes_lock_);
+    {
+      ClassLoaderRootsUpdater updater(this);
+      runtime->GetClassLinker()->VisitClassLoaders(&updater);
+    }
+  }
+
+  bool has_zygote_space = heap_->HasZygoteSpace();
+  // TODO: Find out why it's not sufficient to visit native roots of immune
+  // spaces, and why all the pre-zygote fork arenas have to be linearly updated.
+  // Is it possible that some native root starts getting pointed to by some object
+  // in the moving space after fork? Or are we missing a write-barrier somewhere
+  // when a native root is updated?
+  GcVisitedArenaPool* arena_pool =
+      static_cast<GcVisitedArenaPool*>(runtime->GetLinearAllocArenaPool());
+  if (uffd_ == kFallbackMode || (!has_zygote_space && runtime->IsZygote())) {
+    // Besides fallback-mode, visit linear-alloc space in the pause for zygote
+    // processes prior to the first fork (that's when zygote space gets created).
+    if (kIsDebugBuild && IsValidFd(uffd_)) {
+      // All arenas allocated so far are expected to be pre-zygote fork.
+      arena_pool->ForEachAllocatedArena(
+          [](const TrackedArena& arena)
+              REQUIRES_SHARED(Locks::mutator_lock_) { CHECK(arena.IsPreZygoteForkArena()); });
+    }
+    LinearAllocPageUpdater updater(this);
+    arena_pool->VisitRoots(updater);
+  } else {
+    // Clear the flag as we care about this only if arenas are freed during
+    // concurrent compaction.
+    arena_pool->ClearArenasFreed();
+    arena_pool->ForEachAllocatedArena(
+        [this](const TrackedArena& arena) REQUIRES_SHARED(Locks::mutator_lock_) {
+          // The pre-zygote fork arenas are not visited concurrently in the
+          // zygote children processes. The native roots of the dirty objects
+          // are visited during the immune space visit below.
+          if (!arena.IsPreZygoteForkArena()) {
+            uint8_t* last_byte = arena.GetLastUsedByte();
+            CHECK(linear_alloc_arenas_.insert({&arena, last_byte}).second);
+          } else {
+            LinearAllocPageUpdater updater(this);
+            arena.VisitRoots(updater);
+          }
+        });
+  }
+
+  SweepSystemWeaks(thread_running_gc_, runtime, /*paused*/ true);
+
+  {
+    TimingLogger::ScopedTiming t2("(Paused)UpdateConcurrentRoots", GetTimings());
+    runtime->VisitConcurrentRoots(this, kVisitRootFlagAllRoots);
+  }
+  {
+    // TODO: don't visit the transaction roots if the transaction is not active.
+    TimingLogger::ScopedTiming t2("(Paused)UpdateNonThreadRoots", GetTimings());
+    runtime->VisitNonThreadRoots(this);
+  }
+
+  {
+    // TODO: The immune space update has to happen either before or after
+    // remapping pre-compact pages to from-space. And depending on when it's
+    // done, we have to invoke VisitRefsForCompaction() with or without a
+    // read-barrier.
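+    // Immune spaces (image and zygote spaces) are not compacted; only the
+    // references held by their objects need updating. The mod-union table
+    // (or, when absent, dirty/aged cards) is used below so that only objects
+    // that may contain references to other spaces have to be visited.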
+    TimingLogger::ScopedTiming t2("(Paused)UpdateImmuneSpaces", GetTimings());
+    accounting::CardTable* const card_table = heap_->GetCardTable();
+    for (auto& space : immune_spaces_.GetSpaces()) {
+      DCHECK(space->IsImageSpace() || space->IsZygoteSpace());
+      accounting::ContinuousSpaceBitmap* live_bitmap = space->GetLiveBitmap();
+      accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space);
+      // Having a zygote-space indicates that the first zygote fork has taken
+      // place and that the classes/dex-caches in immune-spaces may have allocations
+      // (ArtMethod/ArtField arrays, dex-cache array, etc.) in private-anonymous
+      // mappings which are not visited via userfaultfd. Visit them here.
+      ImmuneSpaceUpdateObjVisitor visitor(this, /*visit_native_roots=*/false);
+      if (table != nullptr) {
+        table->ProcessCards();
+        table->VisitObjects(ImmuneSpaceUpdateObjVisitor::Callback, &visitor);
+      } else {
+        WriterMutexLock wmu(thread_running_gc_, *Locks::heap_bitmap_lock_);
+        card_table->Scan<false>(
+            live_bitmap,
+            space->Begin(),
+            space->Limit(),
+            visitor,
+            accounting::CardTable::kCardDirty - 1);
+      }
+    }
+  }
+
+  if (use_uffd_sigbus_) {
+    // Release order wrt mutator threads' SIGBUS handler load.
+    sigbus_in_progress_count_.store(0, std::memory_order_release);
+  }
+  KernelPreparation();
+  UpdateNonMovingSpace();
+  // Fallback mode.
+  if (uffd_ == kFallbackMode) {
+    CompactMovingSpace<kFallbackMode>(nullptr);
+
+    int32_t freed_bytes = black_objs_slide_diff_;
+    bump_pointer_space_->RecordFree(freed_objects_, freed_bytes);
+    RecordFree(ObjectBytePair(freed_objects_, freed_bytes));
+  } else {
+    DCHECK_EQ(compaction_in_progress_count_.load(std::memory_order_relaxed), 0u);
+    DCHECK_EQ(compaction_buffer_counter_.load(std::memory_order_relaxed), 1);
+    if (!use_uffd_sigbus_) {
+      // We must start worker threads before resuming mutators to avoid deadlocks.
+      heap_->GetThreadPool()->StartWorkers(thread_running_gc_);
+    }
+  }
+  stack_low_addr_ = nullptr;
+}
+
+void MarkCompact::KernelPrepareRangeForUffd(uint8_t* to_addr,
+                                            uint8_t* from_addr,
+                                            size_t map_size,
+                                            int fd,
+                                            uint8_t* shadow_addr) {
+  int mremap_flags = MREMAP_MAYMOVE | MREMAP_FIXED;
+  if (gHaveMremapDontunmap) {
+    mremap_flags |= MREMAP_DONTUNMAP;
+  }
+
+  void* ret = mremap(to_addr, map_size, map_size, mremap_flags, from_addr);
+  CHECK_EQ(ret, static_cast<void*>(from_addr))
+      << "mremap to move pages failed: " << strerror(errno)
+      << ". space-addr=" << reinterpret_cast<void*>(to_addr) << " size=" << PrettySize(map_size);
+
+  if (shadow_addr != nullptr) {
+    DCHECK_EQ(fd, kFdUnused);
+    DCHECK(gHaveMremapDontunmap);
+    ret = mremap(shadow_addr, map_size, map_size, mremap_flags, to_addr);
+    CHECK_EQ(ret, static_cast<void*>(to_addr))
+        << "mremap from shadow to to-space map failed: " << strerror(errno);
+  } else if (!gHaveMremapDontunmap || fd > kFdUnused) {
+    // Without MREMAP_DONTUNMAP the source mapping is unmapped by mremap. So mmap
+    // the moving space again.
+    int mmap_flags = MAP_FIXED;
+    if (fd == kFdUnused) {
+      // Use MAP_FIXED_NOREPLACE so that, if someone else reserves the 'to_addr'
+      // mapping in the meantime (which can happen when MREMAP_DONTUNMAP isn't
+      // available), we don't unmap someone else's mapping and cause crashes
+      // elsewhere.
+      mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
+      // On some platforms MAP_ANONYMOUS expects fd to be -1.
+ fd = -1; + } else if (IsValidFd(fd)) { + mmap_flags |= MAP_SHARED; + } else { + DCHECK_EQ(fd, kFdSharedAnon); + mmap_flags |= MAP_SHARED | MAP_ANONYMOUS; + } + ret = mmap(to_addr, map_size, PROT_READ | PROT_WRITE, mmap_flags, fd, 0); + CHECK_EQ(ret, static_cast<void*>(to_addr)) + << "mmap for moving space failed: " << strerror(errno); + } +} + +void MarkCompact::KernelPreparation() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + uint8_t* moving_space_begin = bump_pointer_space_->Begin(); + size_t moving_space_size = bump_pointer_space_->Capacity(); + int mode = kCopyMode; + size_t moving_space_register_sz; + if (minor_fault_initialized_) { + moving_space_register_sz = (moving_first_objs_count_ + black_page_count_) * kPageSize; + if (shadow_to_space_map_.IsValid()) { + size_t shadow_size = shadow_to_space_map_.Size(); + void* addr = shadow_to_space_map_.Begin(); + if (shadow_size < moving_space_register_sz) { + addr = mremap(addr, + shadow_size, + moving_space_register_sz, + // Don't allow moving with obj-ptr poisoning as the + // mapping needs to be in <4GB address space. + kObjPtrPoisoning ? 0 : MREMAP_MAYMOVE, + /*new_address=*/nullptr); + if (addr != MAP_FAILED) { + // Succeeded in expanding the mapping. Update the MemMap entry for shadow map. + MemMap temp = MemMap::MapPlaceholder( + "moving-space-shadow", static_cast<uint8_t*>(addr), moving_space_register_sz); + std::swap(shadow_to_space_map_, temp); + } + } + if (addr != MAP_FAILED) { + mode = kMinorFaultMode; + } else { + // We are not going to use shadow map. So protect it to catch any + // potential bugs. + DCHECK_EQ(mprotect(shadow_to_space_map_.Begin(), shadow_to_space_map_.Size(), PROT_NONE), 0) + << "mprotect failed: " << strerror(errno); + } + } + } else { + moving_space_register_sz = moving_space_size; + } + + bool map_shared = + minor_fault_initialized_ || (!Runtime::Current()->IsZygote() && uffd_minor_fault_supported_); + uint8_t* shadow_addr = nullptr; + if (moving_to_space_fd_ == kFdUnused && map_shared) { + DCHECK(gHaveMremapDontunmap); + DCHECK(shadow_to_space_map_.IsValid()); + DCHECK_EQ(shadow_to_space_map_.Size(), moving_space_size); + shadow_addr = shadow_to_space_map_.Begin(); + } + + KernelPrepareRangeForUffd(moving_space_begin, + from_space_begin_, + moving_space_size, + moving_to_space_fd_, + shadow_addr); + + if (IsValidFd(uffd_)) { + // Register the moving space with userfaultfd. + RegisterUffd(moving_space_begin, moving_space_register_sz, mode); + // Prepare linear-alloc for concurrent compaction. + for (auto& data : linear_alloc_spaces_data_) { + bool mmap_again = map_shared && !data.already_shared_; + DCHECK_EQ(static_cast<ssize_t>(data.shadow_.Size()), data.end_ - data.begin_); + // There could be threads running in suspended mode when the compaction + // pause is being executed. In order to make the userfaultfd setup atomic, + // the registration has to be done *before* moving the pages to shadow map. + if (!mmap_again) { + // See the comment in the constructor as to why it's conditionally done. + RegisterUffd(data.begin_, + data.shadow_.Size(), + minor_fault_initialized_ ? kMinorFaultMode : kCopyMode); + } + KernelPrepareRangeForUffd(data.begin_, + data.shadow_.Begin(), + data.shadow_.Size(), + mmap_again ? kFdSharedAnon : kFdUnused); + if (mmap_again) { + data.already_shared_ = true; + RegisterUffd(data.begin_, + data.shadow_.Size(), + minor_fault_initialized_ ? 
kMinorFaultMode : kCopyMode); + } + } + } + if (map_shared) { + // Start mapping linear-alloc MAP_SHARED only after the compaction pause of + // the first GC in non-zygote processes. This is the GC which sets up + // mappings for using minor-fault in future. Up to this point we run + // userfaultfd in copy-mode, which requires the mappings (of linear-alloc) + // to be MAP_PRIVATE. + map_linear_alloc_shared_ = true; + } +} + +template <int kMode> +void MarkCompact::ConcurrentCompaction(uint8_t* buf) { + DCHECK_NE(kMode, kFallbackMode); + DCHECK(kMode != kCopyMode || buf != nullptr); + size_t nr_moving_space_used_pages = moving_first_objs_count_ + black_page_count_; + while (true) { + struct uffd_msg msg; + ssize_t nread = read(uffd_, &msg, sizeof(msg)); + CHECK_GT(nread, 0); + CHECK_EQ(msg.event, UFFD_EVENT_PAGEFAULT); + DCHECK_EQ(nread, static_cast<ssize_t>(sizeof(msg))); + uint8_t* fault_addr = reinterpret_cast<uint8_t*>(msg.arg.pagefault.address); + if (fault_addr == conc_compaction_termination_page_) { + // The counter doesn't need to be updated atomically as only one thread + // would wake up against the gc-thread's load to this fault_addr. In fact, + // the other threads would wake up serially because every exiting thread + // will wake up gc-thread, which would retry load but again would find the + // page missing. Also, the value will be flushed to caches due to the ioctl + // syscall below. + uint8_t ret = thread_pool_counter_--; + // If 'gKernelHasFaultRetry == true' then only the last thread should map the + // zeropage so that the gc-thread can proceed. Otherwise, each thread does + // it and the gc-thread will repeat this fault until thread_pool_counter == 0. + if (!gKernelHasFaultRetry || ret == 1) { + ZeropageIoctl(fault_addr, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false); + } else { + struct uffdio_range uffd_range; + uffd_range.start = msg.arg.pagefault.address; + uffd_range.len = kPageSize; + CHECK_EQ(ioctl(uffd_, UFFDIO_WAKE, &uffd_range), 0) + << "ioctl_userfaultfd: wake failed for concurrent-compaction termination page: " + << strerror(errno); + } + break; + } + uint8_t* fault_page = AlignDown(fault_addr, kPageSize); + if (bump_pointer_space_->HasAddress(reinterpret_cast<mirror::Object*>(fault_addr))) { + ConcurrentlyProcessMovingPage<kMode>(fault_page, buf, nr_moving_space_used_pages); + } else if (minor_fault_initialized_) { + ConcurrentlyProcessLinearAllocPage<kMinorFaultMode>( + fault_page, (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) != 0); + } else { + ConcurrentlyProcessLinearAllocPage<kCopyMode>( + fault_page, (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) != 0); + } + } +} + +bool MarkCompact::SigbusHandler(siginfo_t* info) { + class ScopedInProgressCount { + public: + explicit ScopedInProgressCount(MarkCompact* collector) : collector_(collector) { + // Increment the count only if compaction is not done yet. 
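+      // The counter encodes two things: the kSigbusCounterCompactionDoneMask
+      // bit, which indicates that compaction has finished, and the remaining
+      // bits, which count the SIGBUS handlers currently in flight. The CAS
+      // below both re-checks the done bit and registers this handler; the
+      // destructor undoes the increment.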
+ SigbusCounterType prev = + collector_->sigbus_in_progress_count_.load(std::memory_order_relaxed); + while ((prev & kSigbusCounterCompactionDoneMask) == 0) { + if (collector_->sigbus_in_progress_count_.compare_exchange_strong( + prev, prev + 1, std::memory_order_acquire)) { + DCHECK_LT(prev, kSigbusCounterCompactionDoneMask - 1); + compaction_done_ = false; + return; + } + } + compaction_done_ = true; + } + + bool IsCompactionDone() const { + return compaction_done_; + } + + ~ScopedInProgressCount() { + if (!IsCompactionDone()) { + collector_->sigbus_in_progress_count_.fetch_sub(1, std::memory_order_release); + } + } + + private: + MarkCompact* const collector_; + bool compaction_done_; + }; + + DCHECK(use_uffd_sigbus_); + if (info->si_code != BUS_ADRERR) { + // Userfaultfd raises SIGBUS with BUS_ADRERR. All other causes can't be + // handled here. + return false; + } + + ScopedInProgressCount spc(this); + uint8_t* fault_page = AlignDown(reinterpret_cast<uint8_t*>(info->si_addr), kPageSize); + if (!spc.IsCompactionDone()) { + if (bump_pointer_space_->HasAddress(reinterpret_cast<mirror::Object*>(fault_page))) { + Thread* self = Thread::Current(); + Locks::mutator_lock_->AssertSharedHeld(self); + size_t nr_moving_space_used_pages = moving_first_objs_count_ + black_page_count_; + if (minor_fault_initialized_) { + ConcurrentlyProcessMovingPage<kMinorFaultMode>( + fault_page, nullptr, nr_moving_space_used_pages); + } else { + ConcurrentlyProcessMovingPage<kCopyMode>( + fault_page, self->GetThreadLocalGcBuffer(), nr_moving_space_used_pages); + } + return true; + } else { + // Find the linear-alloc space containing fault-addr + for (auto& data : linear_alloc_spaces_data_) { + if (data.begin_ <= fault_page && data.end_ > fault_page) { + if (minor_fault_initialized_) { + ConcurrentlyProcessLinearAllocPage<kMinorFaultMode>(fault_page, false); + } else { + ConcurrentlyProcessLinearAllocPage<kCopyMode>(fault_page, false); + } + return true; + } + } + // Fault address doesn't belong to either moving-space or linear-alloc. + return false; + } + } else { + // We may spuriously get SIGBUS fault, which was initiated before the + // compaction was finished, but ends up here. In that case, if the fault + // address is valid then consider it handled. + return bump_pointer_space_->HasAddress(reinterpret_cast<mirror::Object*>(fault_page)) || + linear_alloc_spaces_data_.end() != + std::find_if(linear_alloc_spaces_data_.begin(), + linear_alloc_spaces_data_.end(), + [fault_page](const LinearAllocSpaceData& data) { + return data.begin_ <= fault_page && data.end_ > fault_page; + }); + } +} + +static void BackOff(uint32_t i) { + static constexpr uint32_t kYieldMax = 5; + // TODO: Consider adding x86 PAUSE and/or ARM YIELD here. + if (i <= kYieldMax) { + sched_yield(); + } else { + // nanosleep is not in the async-signal-safe list, but bionic implements it + // with a pure system call, so it should be fine. 
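+    // Linear back-off: sleep 10us for every iteration past kYieldMax.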
+ NanoSleep(10000ull * (i - kYieldMax)); + } +} + +template <int kMode> +void MarkCompact::ConcurrentlyProcessMovingPage(uint8_t* fault_page, + uint8_t* buf, + size_t nr_moving_space_used_pages) { + class ScopedInProgressCount { + public: + explicit ScopedInProgressCount(MarkCompact* collector) : collector_(collector) { + collector_->compaction_in_progress_count_.fetch_add(1, std::memory_order_relaxed); + } + + ~ScopedInProgressCount() { + collector_->compaction_in_progress_count_.fetch_sub(1, std::memory_order_relaxed); + } + + private: + MarkCompact* collector_; + }; + + uint8_t* unused_space_begin = + bump_pointer_space_->Begin() + nr_moving_space_used_pages * kPageSize; + DCHECK(IsAligned<kPageSize>(unused_space_begin)); + DCHECK(kMode == kCopyMode || fault_page < unused_space_begin); + if (kMode == kCopyMode && fault_page >= unused_space_begin) { + // There is a race which allows more than one thread to install a + // zero-page. But we can tolerate that. So absorb the EEXIST returned by + // the ioctl and move on. + ZeropageIoctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/true); + return; + } + size_t page_idx = (fault_page - bump_pointer_space_->Begin()) / kPageSize; + mirror::Object* first_obj = first_objs_moving_space_[page_idx].AsMirrorPtr(); + if (first_obj == nullptr) { + // We should never have a case where two workers are trying to install a + // zeropage in this range as we synchronize using moving_pages_status_[page_idx]. + PageState expected_state = PageState::kUnprocessed; + if (moving_pages_status_[page_idx].compare_exchange_strong( + expected_state, PageState::kProcessedAndMapping, std::memory_order_relaxed)) { + // Note: ioctl acts as an acquire fence. + ZeropageIoctl(fault_page, /*tolerate_eexist=*/false, /*tolerate_enoent=*/true); + } else { + DCHECK_EQ(expected_state, PageState::kProcessedAndMapping); + } + return; + } + + PageState state = moving_pages_status_[page_idx].load( + use_uffd_sigbus_ ? std::memory_order_acquire : std::memory_order_relaxed); + uint32_t backoff_count = 0; + while (true) { + switch (state) { + case PageState::kUnprocessed: { + // The increment to the in-progress counter must be done before updating + // the page's state. Otherwise, we will end up leaving a window wherein + // the GC-thread could observe that no worker is working on compaction + // and could end up unregistering the moving space from userfaultfd. + ScopedInProgressCount spc(this); + // Acquire order to ensure we don't start writing to shadow map, which is + // shared, before the CAS is successful. Release order to ensure that the + // increment to moving_compactions_in_progress above is not re-ordered + // after the CAS. + if (moving_pages_status_[page_idx].compare_exchange_strong( + state, PageState::kMutatorProcessing, std::memory_order_acq_rel)) { + if (kMode == kMinorFaultMode) { + DCHECK_EQ(buf, nullptr); + buf = shadow_to_space_map_.Begin() + page_idx * kPageSize; + } else if (UNLIKELY(buf == nullptr)) { + DCHECK_EQ(kMode, kCopyMode); + uint16_t idx = compaction_buffer_counter_.fetch_add(1, std::memory_order_relaxed); + // The buffer-map is one page bigger as the first buffer is used by GC-thread. + CHECK_LE(idx, kMutatorCompactionBufferCount); + buf = compaction_buffers_map_.Begin() + idx * kPageSize; + DCHECK(compaction_buffers_map_.HasAddress(buf)); + Thread::Current()->SetThreadLocalGcBuffer(buf); + } + + if (fault_page < post_compact_end_) { + // The page has to be compacted. 
+ CompactPage( + first_obj, pre_compact_offset_moving_space_[page_idx], buf, kMode == kCopyMode); + } else { + DCHECK_NE(first_obj, nullptr); + DCHECK_GT(pre_compact_offset_moving_space_[page_idx], 0u); + uint8_t* pre_compact_page = black_allocations_begin_ + (fault_page - post_compact_end_); + DCHECK(IsAligned<kPageSize>(pre_compact_page)); + SlideBlackPage(first_obj, page_idx, pre_compact_page, buf, kMode == kCopyMode); + } + // Nobody else would simultaneously modify this page's state so an + // atomic store is sufficient. Use 'release' order to guarantee that + // loads/stores to the page are finished before this store. + moving_pages_status_[page_idx].store(PageState::kProcessedAndMapping, + std::memory_order_release); + if (kMode == kCopyMode) { + CopyIoctl(fault_page, buf); + if (use_uffd_sigbus_) { + // Store is sufficient as no other thread modifies the status at this stage. + moving_pages_status_[page_idx].store(PageState::kProcessedAndMapped, + std::memory_order_release); + } + return; + } else { + break; + } + } + } + continue; + case PageState::kProcessing: + DCHECK_EQ(kMode, kMinorFaultMode); + if (moving_pages_status_[page_idx].compare_exchange_strong( + state, PageState::kProcessingAndMapping, std::memory_order_relaxed) && + !use_uffd_sigbus_) { + // Somebody else took or will take care of finishing the compaction and + // then mapping the page. + return; + } + continue; + case PageState::kProcessed: + // The page is processed but not mapped. We should map it. + break; + case PageState::kProcessingAndMapping: + case PageState::kMutatorProcessing: + case PageState::kProcessedAndMapping: + if (use_uffd_sigbus_) { + // Wait for the page to be mapped before returning. + BackOff(backoff_count++); + state = moving_pages_status_[page_idx].load(std::memory_order_acquire); + continue; + } + return; + case PageState::kProcessedAndMapped: + // Somebody else took care of the page. + return; + } + break; + } + + DCHECK_EQ(kMode, kMinorFaultMode); + if (state == PageState::kUnprocessed) { + MapProcessedPages</*kFirstPageMapping=*/true>( + fault_page, moving_pages_status_, page_idx, nr_moving_space_used_pages); + } else { + DCHECK_EQ(state, PageState::kProcessed); + MapProcessedPages</*kFirstPageMapping=*/false>( + fault_page, moving_pages_status_, page_idx, nr_moving_space_used_pages); + } +} + +void MarkCompact::MapUpdatedLinearAllocPage(uint8_t* page, + uint8_t* shadow_page, + Atomic<PageState>& state, + bool page_touched) { + DCHECK(!minor_fault_initialized_); + if (page_touched) { + CopyIoctl(page, shadow_page); + } else { + // If the page wasn't touched, then it means it is empty and + // is most likely not present on the shadow-side. Furthermore, + // since the shadow is also userfaultfd registered doing copy + // ioctl fail as the copy-from-user in the kernel will cause + // userfault. Instead, just map a zeropage, which is not only + // correct but also efficient as it avoids unnecessary memcpy + // in the kernel. + ZeropageIoctl(page, /*tolerate_eexist=*/false, /*tolerate_enoent=*/false); + } + if (use_uffd_sigbus_) { + // Store is sufficient as no other thread can modify the + // status of this page at this point. 
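The helper above chooses between two userfaultfd ioctls: UFFDIO_COPY when the shadow page was actually written, and UFFDIO_ZEROPAGE when it stayed untouched. A minimal sketch of that pair of calls (ResolveFault is a hypothetical name; error handling is reduced to a bool):

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <cstdint>

// Map 'len' bytes at 'dst' in a uffd-registered range: copy from 'src' if the
// shadow page was written, otherwise install zero pages.
bool ResolveFault(int uffd, void* dst, void* src, uint64_t len, bool touched) {
  if (touched) {
    struct uffdio_copy copy = {};
    copy.dst = reinterpret_cast<uintptr_t>(dst);
    copy.src = reinterpret_cast<uintptr_t>(src);
    copy.len = len;
    return ioctl(uffd, UFFDIO_COPY, &copy) == 0;
  }
  struct uffdio_zeropage zero = {};
  zero.range.start = reinterpret_cast<uintptr_t>(dst);
  zero.range.len = len;
  return ioctl(uffd, UFFDIO_ZEROPAGE, &zero) == 0;
}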
+ state.store(PageState::kProcessedAndMapped, std::memory_order_release); + } +} + +template <int kMode> +void MarkCompact::ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool is_minor_fault) { + DCHECK(!is_minor_fault || kMode == kMinorFaultMode); + auto arena_iter = linear_alloc_arenas_.end(); + { + TrackedArena temp_arena(fault_page); + arena_iter = linear_alloc_arenas_.upper_bound(&temp_arena); + arena_iter = arena_iter != linear_alloc_arenas_.begin() ? std::prev(arena_iter) + : linear_alloc_arenas_.end(); + } + if (arena_iter == linear_alloc_arenas_.end() || arena_iter->second <= fault_page) { + // Fault page isn't in any of the arenas that existed before we started + // compaction. So map zeropage and return. + ZeropageIoctl(fault_page, /*tolerate_eexist=*/true, /*tolerate_enoent=*/false); + } else { + // fault_page should always belong to some arena. + DCHECK(arena_iter != linear_alloc_arenas_.end()) + << "fault_page:" << static_cast<void*>(fault_page) << "is_minor_fault:" << is_minor_fault; + // Find the linear-alloc space containing fault-page + LinearAllocSpaceData* space_data = nullptr; + for (auto& data : linear_alloc_spaces_data_) { + if (data.begin_ <= fault_page && fault_page < data.end_) { + space_data = &data; + break; + } + } + DCHECK_NE(space_data, nullptr); + ptrdiff_t diff = space_data->shadow_.Begin() - space_data->begin_; + size_t page_idx = (fault_page - space_data->begin_) / kPageSize; + Atomic<PageState>* state_arr = + reinterpret_cast<Atomic<PageState>*>(space_data->page_status_map_.Begin()); + PageState state = state_arr[page_idx].load(use_uffd_sigbus_ ? std::memory_order_acquire : + std::memory_order_relaxed); + uint32_t backoff_count = 0; + while (true) { + switch (state) { + case PageState::kUnprocessed: { + // Acquire order to ensure we don't start writing to shadow map, which is + // shared, before the CAS is successful. + if (state_arr[page_idx].compare_exchange_strong( + state, PageState::kProcessingAndMapping, std::memory_order_acquire)) { + if (kMode == kCopyMode || is_minor_fault) { + uint8_t* first_obj = arena_iter->first->GetFirstObject(fault_page); + DCHECK_NE(first_obj, nullptr); + LinearAllocPageUpdater updater(this); + updater(fault_page + diff, first_obj + diff); + if (kMode == kCopyMode) { + MapUpdatedLinearAllocPage(fault_page, + fault_page + diff, + state_arr[page_idx], + updater.WasLastPageTouched()); + return; + } + } else { + // Don't touch the page in this case (there is no reason to do so + // anyways) as it would mean reading from first_obj, which could be on + // another missing page and hence may cause this thread to block, leading + // to deadlocks. + // Force read the page if it is missing so that a zeropage gets mapped on + // the shadow map and then CONTINUE ioctl will map it on linear-alloc. + ForceRead(fault_page + diff); + } + MapProcessedPages</*kFirstPageMapping=*/true>( + fault_page, state_arr, page_idx, space_data->page_status_map_.Size()); + return; + } + } + continue; + case PageState::kProcessing: + DCHECK_EQ(kMode, kMinorFaultMode); + if (state_arr[page_idx].compare_exchange_strong( + state, PageState::kProcessingAndMapping, std::memory_order_relaxed) && + !use_uffd_sigbus_) { + // Somebody else took or will take care of finishing the updates and + // then mapping the page. + return; + } + continue; + case PageState::kProcessed: + // The page is processed but not mapped. We should map it. 
+ break; + case PageState::kMutatorProcessing: + UNREACHABLE(); + case PageState::kProcessingAndMapping: + case PageState::kProcessedAndMapping: + if (use_uffd_sigbus_) { + // Wait for the page to be mapped before returning. + BackOff(backoff_count++); + state = state_arr[page_idx].load(std::memory_order_acquire); + continue; + } + return; + case PageState::kProcessedAndMapped: + // Somebody else took care of the page. + return; + } + break; + } + + DCHECK_EQ(kMode, kMinorFaultMode); + DCHECK_EQ(state, PageState::kProcessed); + if (!is_minor_fault) { + // Force read the page if it is missing so that a zeropage gets mapped on + // the shadow map and then CONTINUE ioctl will map it on linear-alloc. + ForceRead(fault_page + diff); + } + MapProcessedPages</*kFirstPageMapping=*/false>( + fault_page, state_arr, page_idx, space_data->page_status_map_.Size()); + } +} + +void MarkCompact::ProcessLinearAlloc() { + GcVisitedArenaPool* arena_pool = + static_cast<GcVisitedArenaPool*>(Runtime::Current()->GetLinearAllocArenaPool()); + for (auto& pair : linear_alloc_arenas_) { + const TrackedArena* arena = pair.first; + size_t arena_size; + uint8_t* arena_begin; + ptrdiff_t diff; + bool others_processing; + { + // Acquire arena-pool's lock so that the arena being worked cannot be + // deallocated at the same time. + std::lock_guard<std::mutex> lock(arena_pool->GetLock()); + // If any arenas were freed since compaction pause then skip them from + // visiting. + if (arena_pool->AreArenasFreed() && !arena_pool->FindAllocatedArena(arena)) { + continue; + } + uint8_t* last_byte = pair.second; + DCHECK_ALIGNED(last_byte, kPageSize); + others_processing = false; + arena_begin = arena->Begin(); + arena_size = arena->Size(); + // Find the linear-alloc space containing the arena + LinearAllocSpaceData* space_data = nullptr; + for (auto& data : linear_alloc_spaces_data_) { + if (data.begin_ <= arena_begin && arena_begin < data.end_) { + space_data = &data; + break; + } + } + DCHECK_NE(space_data, nullptr); + diff = space_data->shadow_.Begin() - space_data->begin_; + auto visitor = [space_data, last_byte, diff, this, &others_processing]( + uint8_t* page_begin, + uint8_t* first_obj) REQUIRES_SHARED(Locks::mutator_lock_) { + // No need to process pages past last_byte as they already have updated + // gc-roots, if any. + if (page_begin >= last_byte) { + return; + } + LinearAllocPageUpdater updater(this); + size_t page_idx = (page_begin - space_data->begin_) / kPageSize; + DCHECK_LT(page_idx, space_data->page_status_map_.Size()); + Atomic<PageState>* state_arr = + reinterpret_cast<Atomic<PageState>*>(space_data->page_status_map_.Begin()); + PageState expected_state = PageState::kUnprocessed; + PageState desired_state = + minor_fault_initialized_ ? PageState::kProcessing : PageState::kProcessingAndMapping; + // Acquire order to ensure that we don't start accessing the shadow page, + // which is shared with other threads, prior to CAS. Also, for same + // reason, we used 'release' order for changing the state to 'processed'. 
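The compare-exchange that follows is the claim step of the per-page state machine: whichever thread wins the CAS from kUnprocessed owns the page, and everyone else either waits for it to become mapped or moves on. A reduced sketch of that claim idiom, with a hypothetical process_and_map callback standing in for the real compaction/update work:

#include <atomic>
#include <cstddef>
#include <cstdint>

enum class PageState : uint8_t { kUnprocessed, kProcessingAndMapping, kProcessedAndMapped };

// Claim-then-process: only the CAS winner touches the shared shadow data.
void ClaimPage(std::atomic<PageState>* status, size_t page_idx,
               void (*process_and_map)(size_t)) {
  PageState expected = PageState::kUnprocessed;
  // Acquire on success: do not access the shared page before owning it.
  if (status[page_idx].compare_exchange_strong(
          expected, PageState::kProcessingAndMapping, std::memory_order_acquire)) {
    process_and_map(page_idx);
    // Release: all writes to the page happen-before anyone observing 'mapped'.
    status[page_idx].store(PageState::kProcessedAndMapped, std::memory_order_release);
  }
  // Losers see the current state in 'expected' and can wait or return.
}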
+ if (state_arr[page_idx].compare_exchange_strong( + expected_state, desired_state, std::memory_order_acquire)) { + updater(page_begin + diff, first_obj + diff); + expected_state = PageState::kProcessing; + if (!minor_fault_initialized_) { + MapUpdatedLinearAllocPage( + page_begin, page_begin + diff, state_arr[page_idx], updater.WasLastPageTouched()); + } else if (!state_arr[page_idx].compare_exchange_strong( + expected_state, PageState::kProcessed, std::memory_order_release)) { + DCHECK_EQ(expected_state, PageState::kProcessingAndMapping); + // Force read in case the page was missing and updater didn't touch it + // as there was nothing to do. This will ensure that a zeropage is + // faulted on the shadow map. + ForceRead(page_begin + diff); + MapProcessedPages</*kFirstPageMapping=*/true>( + page_begin, state_arr, page_idx, space_data->page_status_map_.Size()); + } + } else { + others_processing = true; + } + }; + + arena->VisitRoots(visitor); + } + // If we are not in minor-fault mode and if no other thread was found to be + // processing any pages in this arena, then we can madvise the shadow size. + // Otherwise, we will double the memory use for linear-alloc. + if (!minor_fault_initialized_ && !others_processing) { + ZeroAndReleasePages(arena_begin + diff, arena_size); + } + } +} + +void MarkCompact::RegisterUffd(void* addr, size_t size, int mode) { + DCHECK(IsValidFd(uffd_)); + struct uffdio_register uffd_register; + uffd_register.range.start = reinterpret_cast<uintptr_t>(addr); + uffd_register.range.len = size; + uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (mode == kMinorFaultMode) { + uffd_register.mode |= UFFDIO_REGISTER_MODE_MINOR; + } + CHECK_EQ(ioctl(uffd_, UFFDIO_REGISTER, &uffd_register), 0) + << "ioctl_userfaultfd: register failed: " << strerror(errno) + << ". start:" << static_cast<void*>(addr) << " len:" << PrettySize(size); +} + +void MarkCompact::UnregisterUffd(uint8_t* start, size_t len) { + DCHECK(IsValidFd(uffd_)); + struct uffdio_range range; + range.start = reinterpret_cast<uintptr_t>(start); + range.len = len; + CHECK_EQ(ioctl(uffd_, UFFDIO_UNREGISTER, &range), 0) + << "ioctl_userfaultfd: unregister failed: " << strerror(errno) + << ". addr:" << static_cast<void*>(start) << " len:" << PrettySize(len); + // Due to an oversight in the kernel implementation of 'unregister', the + // waiting threads are woken up only for copy uffds. Therefore, for now, we + // have to explicitly wake up the threads in minor-fault case. + // TODO: The fix in the kernel is being worked on. Once the kernel version + // containing the fix is known, make it conditional on that as well. + if (minor_fault_initialized_) { + CHECK_EQ(ioctl(uffd_, UFFDIO_WAKE, &range), 0) + << "ioctl_userfaultfd: wake failed: " << strerror(errno) + << ". addr:" << static_cast<void*>(start) << " len:" << PrettySize(len); + } +} + +void MarkCompact::CompactionPhase() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + { + int32_t freed_bytes = black_objs_slide_diff_; + bump_pointer_space_->RecordFree(freed_objects_, freed_bytes); + RecordFree(ObjectBytePair(freed_objects_, freed_bytes)); + } + + if (CanCompactMovingSpaceWithMinorFault()) { + CompactMovingSpace<kMinorFaultMode>(/*page=*/nullptr); + } else { + CompactMovingSpace<kCopyMode>(compaction_buffers_map_.Begin()); + } + + // Make sure no mutator is reading from the from-space before unregistering + // userfaultfd from moving-space and then zapping from-space. 
The mutator + and GC may race to set a page state to processing or further along. The two + attempts are ordered. If the collector wins, then the mutator will see that + and not access the from-space page. If the mutator wins, then the + compaction_in_progress_count_ increment by the mutator happens-before the test + here, and we will not see a zero value until the mutator has completed. + for (uint32_t i = 0; compaction_in_progress_count_.load(std::memory_order_acquire) > 0; i++) { + BackOff(i); + } + + size_t moving_space_size = bump_pointer_space_->Capacity(); + UnregisterUffd(bump_pointer_space_->Begin(), + minor_fault_initialized_ ? + (moving_first_objs_count_ + black_page_count_) * kPageSize : + moving_space_size); + + // Release all of the memory taken by moving-space's from-map + if (minor_fault_initialized_) { + if (IsValidFd(moving_from_space_fd_)) { + // A strange behavior is observed wherein between GC cycles the from-space's + // first page is accessed. But the memfd that is mapped on from-space is + // used on to-space in the next GC cycle, causing issues with userfaultfd as the + // page isn't missing. A possible reason for this could be prefetches. The + // mprotect ensures that such accesses don't succeed. + int ret = mprotect(from_space_begin_, moving_space_size, PROT_NONE); + CHECK_EQ(ret, 0) << "mprotect(PROT_NONE) for from-space failed: " << strerror(errno); + // madvise(MADV_REMOVE) needs PROT_WRITE. Use fallocate() instead, which + // does the same thing. + ret = fallocate(moving_from_space_fd_, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + /*offset=*/0, + moving_space_size); + CHECK_EQ(ret, 0) << "fallocate for from-space failed: " << strerror(errno); + } else { + // We don't have a valid fd, so use madvise(MADV_REMOVE) instead. mprotect + // is not required in this case as we create a fresh + // MAP_SHARED+MAP_ANONYMOUS mapping in each GC cycle. + int ret = madvise(from_space_begin_, moving_space_size, MADV_REMOVE); + CHECK_EQ(ret, 0) << "madvise(MADV_REMOVE) failed for from-space map:" << strerror(errno); + } + } else { + from_space_map_.MadviseDontNeedAndZero(); + } + // mprotect(PROT_NONE) all maps except to-space in debug-mode to catch any unexpected accesses. + if (shadow_to_space_map_.IsValid()) { + DCHECK_EQ(mprotect(shadow_to_space_map_.Begin(), shadow_to_space_map_.Size(), PROT_NONE), 0) + << "mprotect(PROT_NONE) for shadow-map failed:" << strerror(errno); + } + if (!IsValidFd(moving_from_space_fd_)) { + // The other case is already mprotected above. + DCHECK_EQ(mprotect(from_space_begin_, moving_space_size, PROT_NONE), 0) + << "mprotect(PROT_NONE) for from-space failed: " << strerror(errno); + } + + ProcessLinearAlloc(); + + if (use_uffd_sigbus_) { + // Set the compaction-done bit so that no new mutator threads start the compaction + // process in the SIGBUS handler. + SigbusCounterType count = sigbus_in_progress_count_.fetch_or(kSigbusCounterCompactionDoneMask, + std::memory_order_acq_rel); + // Wait for SIGBUS handlers already in play. + for (uint32_t i = 0; count > 0; i++) { + BackOff(i); + count = sigbus_in_progress_count_.load(std::memory_order_acquire); + count &= ~kSigbusCounterCompactionDoneMask; + } + } else { + DCHECK(IsAligned<kPageSize>(conc_compaction_termination_page_)); + // We will only iterate once if gKernelHasFaultRetry is true. + do { + // madvise the page so that we can get userfaults on it. + ZeroAndReleasePages(conc_compaction_termination_page_, kPageSize); + // The following load triggers 'special' userfaults. 
When received by the + // thread-pool workers, they will exit out of the compaction task. This fault + // happens because we madvised the page. + ForceRead(conc_compaction_termination_page_); + } while (thread_pool_counter_ > 0); + } + // Unregister linear-alloc spaces + for (auto& data : linear_alloc_spaces_data_) { + DCHECK_EQ(data.end_ - data.begin_, static_cast<ssize_t>(data.shadow_.Size())); + UnregisterUffd(data.begin_, data.shadow_.Size()); + // madvise linear-allocs's page-status array + data.page_status_map_.MadviseDontNeedAndZero(); + // Madvise the entire linear-alloc space's shadow. In copy-mode it gets rid + // of the pages which are still mapped. In minor-fault mode this unmaps all + // pages, which is good in reducing the mremap (done in STW pause) time in + // next GC cycle. + data.shadow_.MadviseDontNeedAndZero(); + if (minor_fault_initialized_) { + DCHECK_EQ(mprotect(data.shadow_.Begin(), data.shadow_.Size(), PROT_NONE), 0) + << "mprotect failed: " << strerror(errno); + } + } + + if (!use_uffd_sigbus_) { + heap_->GetThreadPool()->StopWorkers(thread_running_gc_); + } +} + +template <size_t kBufferSize> +class MarkCompact::ThreadRootsVisitor : public RootVisitor { + public: + explicit ThreadRootsVisitor(MarkCompact* mark_compact, Thread* const self) + : mark_compact_(mark_compact), self_(self) {} + + ~ThreadRootsVisitor() { + Flush(); + } + + void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info ATTRIBUTE_UNUSED) + override REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_) { + for (size_t i = 0; i < count; i++) { + mirror::Object* obj = *roots[i]; + if (mark_compact_->MarkObjectNonNullNoPush</*kParallel*/true>(obj)) { + Push(obj); + } + } + } + + void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, + size_t count, + const RootInfo& info ATTRIBUTE_UNUSED) + override REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_) { + for (size_t i = 0; i < count; i++) { + mirror::Object* obj = roots[i]->AsMirrorPtr(); + if (mark_compact_->MarkObjectNonNullNoPush</*kParallel*/true>(obj)) { + Push(obj); + } + } + } + + private: + void Flush() REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_) { + StackReference<mirror::Object>* start; + StackReference<mirror::Object>* end; + { + MutexLock mu(self_, mark_compact_->lock_); + // Loop here because even after expanding once it may not be sufficient to + // accommodate all references. It's almost impossible, but there is no harm + // in implementing it this way. + while (!mark_compact_->mark_stack_->BumpBack(idx_, &start, &end)) { + mark_compact_->ExpandMarkStack(); + } + } + while (idx_ > 0) { + *start++ = roots_[--idx_]; + } + DCHECK_EQ(start, end); + } + + void Push(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_) { + if (UNLIKELY(idx_ >= kBufferSize)) { + Flush(); + } + roots_[idx_++].Assign(obj); + } + + StackReference<mirror::Object> roots_[kBufferSize]; + size_t idx_ = 0; + MarkCompact* const mark_compact_; + Thread* const self_; +}; + +class MarkCompact::CheckpointMarkThreadRoots : public Closure { + public: + explicit CheckpointMarkThreadRoots(MarkCompact* mark_compact) : mark_compact_(mark_compact) {} + + void Run(Thread* thread) override NO_THREAD_SAFETY_ANALYSIS { + ScopedTrace trace("Marking thread roots"); + // Note: self is not necessarily equal to thread since thread may be + // suspended. 
+ Thread* const self = Thread::Current(); + CHECK(thread == self + || thread->IsSuspended() + || thread->GetState() == ThreadState::kWaitingPerformingGc) + << thread->GetState() << " thread " << thread << " self " << self; + { + ThreadRootsVisitor</*kBufferSize*/ 20> visitor(mark_compact_, self); + thread->VisitRoots(&visitor, kVisitRootFlagAllRoots); + } + // Clear page-buffer to prepare for compaction phase. + thread->SetThreadLocalGcBuffer(nullptr); + + // If thread is a running mutator, then act on behalf of the garbage + // collector. See the code in ThreadList::RunCheckpoint. + mark_compact_->GetBarrier().Pass(self); + } + + private: + MarkCompact* const mark_compact_; +}; + +void MarkCompact::MarkRootsCheckpoint(Thread* self, Runtime* runtime) { + // We revoke TLABs later during the paused round of marking. + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + CheckpointMarkThreadRoots check_point(this); + ThreadList* thread_list = runtime->GetThreadList(); + gc_barrier_.Init(self, 0); + // Request that the checkpoint be run on all threads, returning a count of the threads that must + // run through the barrier, including self. + size_t barrier_count = thread_list->RunCheckpoint(&check_point); + // Release locks then wait for all mutator threads to pass the barrier. + // If there are no threads to wait for, which implies that all the checkpoint functions have finished, + // then there is no need to release the locks. + if (barrier_count == 0) { + return; + } + Locks::heap_bitmap_lock_->ExclusiveUnlock(self); + Locks::mutator_lock_->SharedUnlock(self); + { + ScopedThreadStateChange tsc(self, ThreadState::kWaitingForCheckPointsToRun); + gc_barrier_.Increment(self, barrier_count); + } + Locks::mutator_lock_->SharedLock(self); + Locks::heap_bitmap_lock_->ExclusiveLock(self); +} + +void MarkCompact::MarkNonThreadRoots(Runtime* runtime) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + runtime->VisitNonThreadRoots(this); +} + +void MarkCompact::MarkConcurrentRoots(VisitRootFlags flags, Runtime* runtime) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + runtime->VisitConcurrentRoots(this, flags); +} + +void MarkCompact::RevokeAllThreadLocalBuffers() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + bump_pointer_space_->RevokeAllThreadLocalBuffers(); +} + +class MarkCompact::ScanObjectVisitor { + public: + explicit ScanObjectVisitor(MarkCompact* const mark_compact) ALWAYS_INLINE + : mark_compact_(mark_compact) {} + + void operator()(ObjPtr<mirror::Object> obj) const + ALWAYS_INLINE + REQUIRES(Locks::heap_bitmap_lock_) + REQUIRES_SHARED(Locks::mutator_lock_) { + mark_compact_->ScanObject</*kUpdateLiveWords*/ false>(obj.Ptr()); + } + + private: + MarkCompact* const mark_compact_; +}; + +void MarkCompact::UpdateAndMarkModUnion() { + accounting::CardTable* const card_table = heap_->GetCardTable(); + for (const auto& space : immune_spaces_.GetSpaces()) { + const char* name = space->IsZygoteSpace() + ? "UpdateAndMarkZygoteModUnionTable" + : "UpdateAndMarkImageModUnionTable"; + DCHECK(space->IsZygoteSpace() || space->IsImageSpace()) << *space; + TimingLogger::ScopedTiming t(name, GetTimings()); + accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space); + if (table != nullptr) { + // UpdateAndMarkReferences() doesn't visit Reference-type objects. But + // that's fine because these objects are immutable enough (referent can + // only be cleared) and hence the only referents they can have are intra-space. 
+ table->UpdateAndMarkReferences(this); + } else { + // No mod-union table, scan all dirty/aged cards in the corresponding + // card-table. This can only occur for app images. + card_table->Scan</*kClearCard*/ false>(space->GetMarkBitmap(), + space->Begin(), + space->End(), + ScanObjectVisitor(this), + gc::accounting::CardTable::kCardAged); + } + } +} + +void MarkCompact::MarkReachableObjects() { + UpdateAndMarkModUnion(); + // Recursively mark all the non-image bits set in the mark bitmap. + ProcessMarkStack(); +} + +class MarkCompact::CardModifiedVisitor { + public: + explicit CardModifiedVisitor(MarkCompact* const mark_compact, + accounting::ContinuousSpaceBitmap* const bitmap, + accounting::CardTable* const card_table) + : visitor_(mark_compact), bitmap_(bitmap), card_table_(card_table) {} + + void operator()(uint8_t* card, + uint8_t expected_value, + uint8_t new_value ATTRIBUTE_UNUSED) const { + if (expected_value == accounting::CardTable::kCardDirty) { + uintptr_t start = reinterpret_cast<uintptr_t>(card_table_->AddrFromCard(card)); + bitmap_->VisitMarkedRange(start, start + accounting::CardTable::kCardSize, visitor_); + } + } + + private: + ScanObjectVisitor visitor_; + accounting::ContinuousSpaceBitmap* bitmap_; + accounting::CardTable* const card_table_; +}; + +void MarkCompact::ScanDirtyObjects(bool paused, uint8_t minimum_age) { + accounting::CardTable* card_table = heap_->GetCardTable(); + for (const auto& space : heap_->GetContinuousSpaces()) { + const char* name = nullptr; + switch (space->GetGcRetentionPolicy()) { + case space::kGcRetentionPolicyNeverCollect: + name = paused ? "(Paused)ScanGrayImmuneSpaceObjects" : "ScanGrayImmuneSpaceObjects"; + break; + case space::kGcRetentionPolicyFullCollect: + name = paused ? "(Paused)ScanGrayZygoteSpaceObjects" : "ScanGrayZygoteSpaceObjects"; + break; + case space::kGcRetentionPolicyAlwaysCollect: + name = paused ? "(Paused)ScanGrayAllocSpaceObjects" : "ScanGrayAllocSpaceObjects"; + break; + default: + LOG(FATAL) << "Unreachable"; + UNREACHABLE(); + } + TimingLogger::ScopedTiming t(name, GetTimings()); + ScanObjectVisitor visitor(this); + const bool is_immune_space = space->IsZygoteSpace() || space->IsImageSpace(); + if (paused) { + DCHECK_EQ(minimum_age, gc::accounting::CardTable::kCardDirty); + // We can clear the card-table for any non-immune space. + if (is_immune_space) { + card_table->Scan</*kClearCard*/false>(space->GetMarkBitmap(), + space->Begin(), + space->End(), + visitor, + minimum_age); + } else { + card_table->Scan</*kClearCard*/true>(space->GetMarkBitmap(), + space->Begin(), + space->End(), + visitor, + minimum_age); + } + } else { + DCHECK_EQ(minimum_age, gc::accounting::CardTable::kCardAged); + accounting::ModUnionTable* table = heap_->FindModUnionTableFromSpace(space); + if (table) { + table->ProcessCards(); + card_table->Scan</*kClearCard*/false>(space->GetMarkBitmap(), + space->Begin(), + space->End(), + visitor, + minimum_age); + } else { + CardModifiedVisitor card_modified_visitor(this, space->GetMarkBitmap(), card_table); + // For the alloc spaces we should age the dirty cards and clear the rest. + // For image and zygote-space without mod-union-table, age the dirty + // cards but keep the already aged cards unchanged. + // In either case, visit the objects on the cards that were changed from + // dirty to aged. + if (is_immune_space) { + card_table->ModifyCardsAtomic(space->Begin(), + space->End(), + [](uint8_t card) { + return (card == gc::accounting::CardTable::kCardClean) + ? 
card + : gc::accounting::CardTable::kCardAged; + }, + card_modified_visitor); + } else { + card_table->ModifyCardsAtomic(space->Begin(), + space->End(), + AgeCardVisitor(), + card_modified_visitor); + } + } + } + } +} + +void MarkCompact::RecursiveMarkDirtyObjects(bool paused, uint8_t minimum_age) { + ScanDirtyObjects(paused, minimum_age); + ProcessMarkStack(); +} + +void MarkCompact::MarkRoots(VisitRootFlags flags) { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + Runtime* runtime = Runtime::Current(); + // Make sure that the checkpoint which collects the stack roots is the first + // one capturing GC-roots. As this one is supposed to find the address, + // everything allocated after that (during this marking phase) will be + // considered 'marked'. + MarkRootsCheckpoint(thread_running_gc_, runtime); + MarkNonThreadRoots(runtime); + MarkConcurrentRoots(flags, runtime); +} + +void MarkCompact::PreCleanCards() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + CHECK(!Locks::mutator_lock_->IsExclusiveHeld(thread_running_gc_)); + MarkRoots(static_cast<VisitRootFlags>(kVisitRootFlagClearRootLog | kVisitRootFlagNewRoots)); + RecursiveMarkDirtyObjects(/*paused*/ false, accounting::CardTable::kCardDirty - 1); +} + +// In a concurrent marking algorithm, if we are not using a write/read barrier, as +// in this case, then we need a stop-the-world (STW) round in the end to mark +// objects which were written into concurrently while concurrent marking was +// performed. +// In order to minimize the pause time, we could take one of the two approaches: +// 1. Keep repeating concurrent marking of dirty cards until the time spent goes +// below a threshold. +// 2. Do two rounds concurrently and then attempt a paused one. If we figure +// that it's taking too long, then resume mutators and retry. +// +// Given the non-trivial fixed overhead of running a round (card table and root +// scan), it might be better to go with approach 2. +void MarkCompact::MarkingPhase() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + DCHECK_EQ(thread_running_gc_, Thread::Current()); + WriterMutexLock mu(thread_running_gc_, *Locks::heap_bitmap_lock_); + BindAndResetBitmaps(); + MarkZygoteLargeObjects(); + MarkRoots( + static_cast<VisitRootFlags>(kVisitRootFlagAllRoots | kVisitRootFlagStartLoggingNewRoots)); + MarkReachableObjects(); + // Pre-clean dirtied cards to reduce pauses. + PreCleanCards(); + + // Set up reference processing and forward soft references once before enabling the + // slow path (in MarkingPause). + ReferenceProcessor* rp = GetHeap()->GetReferenceProcessor(); + bool clear_soft_references = GetCurrentIteration()->GetClearSoftReferences(); + rp->Setup(thread_running_gc_, this, /*concurrent=*/ true, clear_soft_references); + if (!clear_soft_references) { + // Forward as many SoftReferences as possible before inhibiting reference access. 
+ rp->ForwardSoftReferences(GetTimings()); + } +} + +class MarkCompact::RefFieldsVisitor { + public: + ALWAYS_INLINE explicit RefFieldsVisitor(MarkCompact* const mark_compact) + : mark_compact_(mark_compact) {} + + ALWAYS_INLINE void operator()(mirror::Object* obj, + MemberOffset offset, + bool is_static ATTRIBUTE_UNUSED) const + REQUIRES(Locks::heap_bitmap_lock_) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (kCheckLocks) { + Locks::mutator_lock_->AssertSharedHeld(Thread::Current()); + Locks::heap_bitmap_lock_->AssertExclusiveHeld(Thread::Current()); + } + mark_compact_->MarkObject(obj->GetFieldObject<mirror::Object>(offset), obj, offset); + } + + void operator()(ObjPtr<mirror::Class> klass, ObjPtr<mirror::Reference> ref) const + REQUIRES(Locks::heap_bitmap_lock_) + REQUIRES_SHARED(Locks::mutator_lock_) { + mark_compact_->DelayReferenceReferent(klass, ref); + } + + void VisitRootIfNonNull(mirror::CompressedReference<mirror::Object>* root) const + REQUIRES(Locks::heap_bitmap_lock_) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (!root->IsNull()) { + VisitRoot(root); + } + } + + void VisitRoot(mirror::CompressedReference<mirror::Object>* root) const + REQUIRES(Locks::heap_bitmap_lock_) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (kCheckLocks) { + Locks::mutator_lock_->AssertSharedHeld(Thread::Current()); + Locks::heap_bitmap_lock_->AssertExclusiveHeld(Thread::Current()); + } + mark_compact_->MarkObject(root->AsMirrorPtr()); + } + + private: + MarkCompact* const mark_compact_; +}; + +template <size_t kAlignment> +size_t MarkCompact::LiveWordsBitmap<kAlignment>::LiveBytesInBitmapWord(size_t chunk_idx) const { + const size_t index = chunk_idx * kBitmapWordsPerVectorWord; + size_t words = 0; + for (uint32_t i = 0; i < kBitmapWordsPerVectorWord; i++) { + words += POPCOUNT(Bitmap::Begin()[index + i]); + } + return words * kAlignment; +} + +void MarkCompact::UpdateLivenessInfo(mirror::Object* obj, size_t obj_size) { + DCHECK(obj != nullptr); + DCHECK_EQ(obj_size, obj->SizeOf<kDefaultVerifyFlags>()); + uintptr_t obj_begin = reinterpret_cast<uintptr_t>(obj); + UpdateClassAfterObjectMap(obj); + size_t size = RoundUp(obj_size, kAlignment); + uintptr_t bit_index = live_words_bitmap_->SetLiveWords(obj_begin, size); + size_t chunk_idx = (obj_begin - live_words_bitmap_->Begin()) / kOffsetChunkSize; + // Compute the bit-index within the chunk-info vector word. + bit_index %= kBitsPerVectorWord; + size_t first_chunk_portion = std::min(size, (kBitsPerVectorWord - bit_index) * kAlignment); + + chunk_info_vec_[chunk_idx++] += first_chunk_portion; + DCHECK_LE(first_chunk_portion, size); + for (size -= first_chunk_portion; size > kOffsetChunkSize; size -= kOffsetChunkSize) { + DCHECK_EQ(chunk_info_vec_[chunk_idx], 0u); + chunk_info_vec_[chunk_idx++] = kOffsetChunkSize; + } + chunk_info_vec_[chunk_idx] += size; + freed_objects_--; +} + +template <bool kUpdateLiveWords> +void MarkCompact::ScanObject(mirror::Object* obj) { + // The size of `obj` is used both here (to update `bytes_scanned_`) and in + // `UpdateLivenessInfo`. As fetching this value can be expensive, do it once + // here and pass that information to `UpdateLivenessInfo`. 
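Returning to UpdateLivenessInfo above: it credits an object's alignment-rounded size to one or more fixed-size chunk-info slots, i.e. whatever fits in the first chunk, then whole chunks, then the tail. A reduced arithmetic sketch of that split (AddLiveBytes and its parameters are illustrative; the caller must size the vector to cover the range):

#include <algorithm>
#include <cstddef>
#include <vector>

// Distribute 'size' live bytes, starting 'offset_in_chunk' bytes into chunk
// 'chunk_idx', over per-chunk counters of 'chunk_size' bytes each.
void AddLiveBytes(std::vector<size_t>& chunk_info, size_t chunk_idx,
                  size_t offset_in_chunk, size_t size, size_t chunk_size) {
  size_t first = std::min(size, chunk_size - offset_in_chunk);
  chunk_info[chunk_idx++] += first;
  size -= first;
  if (size == 0) {
    return;
  }
  for (; size > chunk_size; size -= chunk_size) {
    chunk_info[chunk_idx++] = chunk_size;  // Fully-live middle chunks.
  }
  chunk_info[chunk_idx] += size;  // The tail portion.
}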
+ size_t obj_size = obj->SizeOf<kDefaultVerifyFlags>(); + bytes_scanned_ += obj_size; + + RefFieldsVisitor visitor(this); + DCHECK(IsMarked(obj)) << "Scanning marked object " << obj << "\n" << heap_->DumpSpaces(); + if (kUpdateLiveWords && moving_space_bitmap_->HasAddress(obj)) { + UpdateLivenessInfo(obj, obj_size); + } + obj->VisitReferences(visitor, visitor); +} + +// Scan anything that's on the mark stack. +void MarkCompact::ProcessMarkStack() { + TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); + // TODO: try prefetch like in CMS + while (!mark_stack_->IsEmpty()) { + mirror::Object* obj = mark_stack_->PopBack(); + DCHECK(obj != nullptr); + ScanObject</*kUpdateLiveWords*/ true>(obj); + } +} + +void MarkCompact::ExpandMarkStack() { + const size_t new_size = mark_stack_->Capacity() * 2; + std::vector<StackReference<mirror::Object>> temp(mark_stack_->Begin(), + mark_stack_->End()); + mark_stack_->Resize(new_size); + for (auto& ref : temp) { + mark_stack_->PushBack(ref.AsMirrorPtr()); + } + DCHECK(!mark_stack_->IsFull()); +} + +inline void MarkCompact::PushOnMarkStack(mirror::Object* obj) { + if (UNLIKELY(mark_stack_->IsFull())) { + ExpandMarkStack(); + } + mark_stack_->PushBack(obj); +} + +inline void MarkCompact::MarkObjectNonNull(mirror::Object* obj, + mirror::Object* holder, + MemberOffset offset) { + DCHECK(obj != nullptr); + if (MarkObjectNonNullNoPush</*kParallel*/false>(obj, holder, offset)) { + PushOnMarkStack(obj); + } +} + +template <bool kParallel> +inline bool MarkCompact::MarkObjectNonNullNoPush(mirror::Object* obj, + mirror::Object* holder, + MemberOffset offset) { + // We expect most of the referenes to be in bump-pointer space, so try that + // first to keep the cost of this function minimal. + if (LIKELY(moving_space_bitmap_->HasAddress(obj))) { + return kParallel ? !moving_space_bitmap_->AtomicTestAndSet(obj) + : !moving_space_bitmap_->Set(obj); + } else if (non_moving_space_bitmap_->HasAddress(obj)) { + return kParallel ? !non_moving_space_bitmap_->AtomicTestAndSet(obj) + : !non_moving_space_bitmap_->Set(obj); + } else if (immune_spaces_.ContainsObject(obj)) { + DCHECK(IsMarked(obj) != nullptr); + return false; + } else { + // Must be a large-object space, otherwise it's a case of heap corruption. + if (!IsAligned<kPageSize>(obj)) { + // Objects in large-object space are page aligned. So if we have an object + // which doesn't belong to any space and is not page-aligned as well, then + // it's memory corruption. + // TODO: implement protect/unprotect in bump-pointer space. + heap_->GetVerification()->LogHeapCorruption(holder, offset, obj, /*fatal*/ true); + } + DCHECK_NE(heap_->GetLargeObjectsSpace(), nullptr) + << "ref=" << obj + << " doesn't belong to any of the spaces and large object space doesn't exist"; + accounting::LargeObjectBitmap* los_bitmap = heap_->GetLargeObjectsSpace()->GetMarkBitmap(); + DCHECK(los_bitmap->HasAddress(obj)); + if (kParallel) { + los_bitmap->AtomicTestAndSet(obj); + } else { + los_bitmap->Set(obj); + } + // We only have primitive arrays in large object space. So there is no + // reason to push into mark-stack. 
+ DCHECK(obj->IsString() || (obj->IsArrayInstance() && !obj->IsObjectArray())); + return false; + } +} + +inline void MarkCompact::MarkObject(mirror::Object* obj, + mirror::Object* holder, + MemberOffset offset) { + if (obj != nullptr) { + MarkObjectNonNull(obj, holder, offset); + } +} + +mirror::Object* MarkCompact::MarkObject(mirror::Object* obj) { + MarkObject(obj, nullptr, MemberOffset(0)); + return obj; +} + +void MarkCompact::MarkHeapReference(mirror::HeapReference<mirror::Object>* obj, + bool do_atomic_update ATTRIBUTE_UNUSED) { + MarkObject(obj->AsMirrorPtr(), nullptr, MemberOffset(0)); +} + +void MarkCompact::VisitRoots(mirror::Object*** roots, + size_t count, + const RootInfo& info) { + if (compacting_) { + for (size_t i = 0; i < count; ++i) { + UpdateRoot(roots[i], info); + } + } else { + for (size_t i = 0; i < count; ++i) { + MarkObjectNonNull(*roots[i]); + } + } +} + +void MarkCompact::VisitRoots(mirror::CompressedReference<mirror::Object>** roots, + size_t count, + const RootInfo& info) { + // TODO: do we need to check if the root is null or not? + if (compacting_) { + for (size_t i = 0; i < count; ++i) { + UpdateRoot(roots[i], info); + } + } else { + for (size_t i = 0; i < count; ++i) { + MarkObjectNonNull(roots[i]->AsMirrorPtr()); + } + } +} + +mirror::Object* MarkCompact::IsMarked(mirror::Object* obj) { + if (moving_space_bitmap_->HasAddress(obj)) { + const bool is_black = reinterpret_cast<uint8_t*>(obj) >= black_allocations_begin_; + if (compacting_) { + if (is_black) { + return PostCompactBlackObjAddr(obj); + } else if (live_words_bitmap_->Test(obj)) { + return PostCompactOldObjAddr(obj); + } else { + return nullptr; + } + } + return (is_black || moving_space_bitmap_->Test(obj)) ? obj : nullptr; + } else if (non_moving_space_bitmap_->HasAddress(obj)) { + return non_moving_space_bitmap_->Test(obj) ? obj : nullptr; + } else if (immune_spaces_.ContainsObject(obj)) { + return obj; + } else { + DCHECK(heap_->GetLargeObjectsSpace()) + << "ref=" << obj + << " doesn't belong to any of the spaces and large object space doesn't exist"; + accounting::LargeObjectBitmap* los_bitmap = heap_->GetLargeObjectsSpace()->GetMarkBitmap(); + if (los_bitmap->HasAddress(obj)) { + DCHECK(IsAligned<kPageSize>(obj)); + return los_bitmap->Test(obj) ? obj : nullptr; + } else { + // The given obj is not in any of the known spaces, so return null. This could + // happen for instance in interpreter caches wherein a concurrent updation + // to the cache could result in obj being a non-reference. This is + // tolerable because SweepInterpreterCaches only updates if the given + // object has moved, which can't be the case for the non-reference. + return nullptr; + } + } +} + +bool MarkCompact::IsNullOrMarkedHeapReference(mirror::HeapReference<mirror::Object>* obj, + bool do_atomic_update ATTRIBUTE_UNUSED) { + mirror::Object* ref = obj->AsMirrorPtr(); + if (ref == nullptr) { + return true; + } + return IsMarked(ref); +} + +// Process the 'referent' field in a java.lang.ref.Reference. If the referent +// has not yet been marked, put it on the appropriate list in the heap for later +// processing. 
+void MarkCompact::DelayReferenceReferent(ObjPtr<mirror::Class> klass, + ObjPtr<mirror::Reference> ref) { + heap_->GetReferenceProcessor()->DelayReferenceReferent(klass, ref, this); +} + +void MarkCompact::FinishPhase() { + GetCurrentIteration()->SetScannedBytes(bytes_scanned_); + bool is_zygote = Runtime::Current()->IsZygote(); + compacting_ = false; + minor_fault_initialized_ = !is_zygote && uffd_minor_fault_supported_; + // Madvise compaction buffers. When using threaded implementation, skip the first page, + // which is used by the gc-thread for the next iteration. Otherwise, we get into a + // deadlock due to userfault on it in the next iteration. This page is not consuming any + // physical memory because we already madvised it above and then we triggered a read + // userfault, which maps a special zero-page. + if (use_uffd_sigbus_ || !minor_fault_initialized_ || !shadow_to_space_map_.IsValid() || + shadow_to_space_map_.Size() < (moving_first_objs_count_ + black_page_count_) * kPageSize) { + size_t adjustment = use_uffd_sigbus_ ? 0 : kPageSize; + ZeroAndReleasePages(compaction_buffers_map_.Begin() + adjustment, + compaction_buffers_map_.Size() - adjustment); + } else if (shadow_to_space_map_.Size() == bump_pointer_space_->Capacity()) { + // Now that we are going to use minor-faults from next GC cycle, we can + // unmap the buffers used by worker threads. + compaction_buffers_map_.SetSize(kPageSize); + } + info_map_.MadviseDontNeedAndZero(); + live_words_bitmap_->ClearBitmap(); + // TODO: We can clear this bitmap right before compaction pause. But in that + // case we need to ensure that we don't assert on this bitmap afterwards. + // Also, we would still need to clear it here again as we may have to use the + // bitmap for black-allocations (see UpdateMovingSpaceBlackAllocations()). + moving_space_bitmap_->Clear(); + + if (UNLIKELY(is_zygote && IsValidFd(uffd_))) { + heap_->DeleteThreadPool(); + // This unregisters all ranges as a side-effect. + close(uffd_); + uffd_ = kFdUnused; + uffd_initialized_ = false; + } + CHECK(mark_stack_->IsEmpty()); // Ensure that the mark stack is empty. + mark_stack_->Reset(); + DCHECK_EQ(thread_running_gc_, Thread::Current()); + if (kIsDebugBuild) { + MutexLock mu(thread_running_gc_, lock_); + if (updated_roots_.get() != nullptr) { + updated_roots_->clear(); + } + } + class_after_obj_ordered_map_.clear(); + delete[] moving_pages_status_; + linear_alloc_arenas_.clear(); + { + ReaderMutexLock mu(thread_running_gc_, *Locks::mutator_lock_); + WriterMutexLock mu2(thread_running_gc_, *Locks::heap_bitmap_lock_); + heap_->ClearMarkedObjects(); + } + std::swap(moving_to_space_fd_, moving_from_space_fd_); + if (IsValidFd(moving_to_space_fd_)) { + // Confirm that the memfd to be used on to-space in next GC cycle is empty. + struct stat buf; + DCHECK_EQ(fstat(moving_to_space_fd_, &buf), 0) << "fstat failed: " << strerror(errno); + DCHECK_EQ(buf.st_blocks, 0u); + } +} + +} // namespace collector +} // namespace gc +} // namespace art diff --git a/runtime/gc/collector/mark_compact.h b/runtime/gc/collector/mark_compact.h new file mode 100644 index 0000000000..d73f40d436 --- /dev/null +++ b/runtime/gc/collector/mark_compact.h @@ -0,0 +1,789 @@ +/* + * Copyright 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_ +#define ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_ + +#include <signal.h> + +#include <map> +#include <memory> +#include <unordered_map> +#include <unordered_set> + +#include "barrier.h" +#include "base/atomic.h" +#include "base/gc_visited_arena_pool.h" +#include "base/macros.h" +#include "base/mutex.h" +#include "garbage_collector.h" +#include "gc/accounting/atomic_stack.h" +#include "gc/accounting/bitmap-inl.h" +#include "gc/accounting/heap_bitmap.h" +#include "gc_root.h" +#include "immune_spaces.h" +#include "offsets.h" + +namespace art { + +bool KernelSupportsUffd(); + +namespace mirror { +class DexCache; +} // namespace mirror + +namespace gc { + +class Heap; + +namespace space { +class BumpPointerSpace; +} // namespace space + +namespace collector { +class MarkCompact final : public GarbageCollector { + public: + using SigbusCounterType = uint32_t; + + static constexpr size_t kAlignment = kObjectAlignment; + static constexpr int kCopyMode = -1; + static constexpr int kMinorFaultMode = -2; + // Fake file descriptor for fall back mode (when uffd isn't available) + static constexpr int kFallbackMode = -3; + static constexpr int kFdSharedAnon = -1; + static constexpr int kFdUnused = -2; + + // Bitmask for the compaction-done bit in the sigbus_in_progress_count_. + static constexpr SigbusCounterType kSigbusCounterCompactionDoneMask = + 1u << (BitSizeOf<SigbusCounterType>() - 1); + + explicit MarkCompact(Heap* heap); + + ~MarkCompact() {} + + void RunPhases() override REQUIRES(!Locks::mutator_lock_, !lock_); + + // Updated before (or in) pre-compaction pause and is accessed only in the + // pause or during concurrent compaction. The flag is reset in next GC cycle's + // InitializePhase(). Therefore, it's safe to update without any memory ordering. + bool IsCompacting() const { return compacting_; } + + bool IsUsingSigbusFeature() const { return use_uffd_sigbus_; } + + // Called by SIGBUS handler. NO_THREAD_SAFETY_ANALYSIS for mutator-lock, which + // is asserted in the function. 
+ bool SigbusHandler(siginfo_t* info) REQUIRES(!lock_) NO_THREAD_SAFETY_ANALYSIS; + + GcType GetGcType() const override { + return kGcTypeFull; + } + + CollectorType GetCollectorType() const override { + return kCollectorTypeCMC; + } + + Barrier& GetBarrier() { + return gc_barrier_; + } + + mirror::Object* MarkObject(mirror::Object* obj) override + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + void MarkHeapReference(mirror::HeapReference<mirror::Object>* obj, + bool do_atomic_update) override + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + void VisitRoots(mirror::Object*** roots, + size_t count, + const RootInfo& info) override + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + void VisitRoots(mirror::CompressedReference<mirror::Object>** roots, + size_t count, + const RootInfo& info) override + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + bool IsNullOrMarkedHeapReference(mirror::HeapReference<mirror::Object>* obj, + bool do_atomic_update) override + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + void RevokeAllThreadLocalBuffers() override; + + void DelayReferenceReferent(ObjPtr<mirror::Class> klass, + ObjPtr<mirror::Reference> reference) override + REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_); + + mirror::Object* IsMarked(mirror::Object* obj) override + REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_); + + mirror::Object* GetFromSpaceAddrFromBarrier(mirror::Object* old_ref) { + CHECK(compacting_); + if (live_words_bitmap_->HasAddress(old_ref)) { + return GetFromSpaceAddr(old_ref); + } + return old_ref; + } + // Called from Heap::PostForkChildAction() for non-zygote processes and from + // PrepareForCompaction() for zygote processes. Returns true if uffd was + // created or was already done. + bool CreateUserfaultfd(bool post_fork); + + // Returns a pair indicating if userfaultfd itself is available (first) and if + // so then whether its minor-fault feature is available or not (second). + static std::pair<bool, bool> GetUffdAndMinorFault(); + + // Add linear-alloc space data when a new space is added to + // GcVisitedArenaPool, which mostly happens only once. + void AddLinearAllocSpaceData(uint8_t* begin, size_t len); + + // In copy-mode of userfaultfd, we don't need to reach a 'processed' state as + // it's given that processing thread also copies the page, thereby mapping it. + // The order is important as we may treat them as integers. + enum class PageState : uint8_t { + kUnprocessed = 0, // Not processed yet + kProcessing = 1, // Being processed by GC thread and will not be mapped + kProcessed = 2, // Processed but not mapped + kProcessingAndMapping = 3, // Being processed by GC or mutator and will be mapped + kMutatorProcessing = 4, // Being processed by mutator thread + kProcessedAndMapping = 5, // Processed and will be mapped + kProcessedAndMapped = 6 // Processed and mapped. For SIGBUS. + }; + + private: + using ObjReference = mirror::CompressedReference<mirror::Object>; + // Number of bits (live-words) covered by a single chunk-info (below) + // entry/word. + // TODO: Since popcount is performed usomg SIMD instructions, we should + // consider using 128-bit in order to halve the chunk-info size. 
+ static constexpr uint32_t kBitsPerVectorWord = kBitsPerIntPtrT; + static constexpr uint32_t kOffsetChunkSize = kBitsPerVectorWord * kAlignment; + static_assert(kOffsetChunkSize < kPageSize); + // Bitmap with bits corresponding to every live word set. An object + // which is 4 words in size will have the corresponding 4 bits set. This is + // required for efficient computation of new-address (post-compaction) from + // the given old-address (pre-compaction). + template <size_t kAlignment> + class LiveWordsBitmap : private accounting::MemoryRangeBitmap<kAlignment> { + using Bitmap = accounting::Bitmap; + using MemRangeBitmap = accounting::MemoryRangeBitmap<kAlignment>; + + public: + static_assert(IsPowerOfTwo(kBitsPerVectorWord)); + static_assert(IsPowerOfTwo(Bitmap::kBitsPerBitmapWord)); + static_assert(kBitsPerVectorWord >= Bitmap::kBitsPerBitmapWord); + static constexpr uint32_t kBitmapWordsPerVectorWord = + kBitsPerVectorWord / Bitmap::kBitsPerBitmapWord; + static_assert(IsPowerOfTwo(kBitmapWordsPerVectorWord)); + static LiveWordsBitmap* Create(uintptr_t begin, uintptr_t end); + + // Return offset (within the indexed chunk-info) of the nth live word. + uint32_t FindNthLiveWordOffset(size_t chunk_idx, uint32_t n) const; + // Sets all bits in the bitmap corresponding to the given range. Also + // returns the bit-index of the first word. + ALWAYS_INLINE uintptr_t SetLiveWords(uintptr_t begin, size_t size); + // Count the number of live words up to the given bit-index. This is to be used + // to compute the post-compact address of an old reference. + ALWAYS_INLINE size_t CountLiveWordsUpto(size_t bit_idx) const; + // Call 'visitor' for every stride of contiguous marked bits in the live-words + // bitmap, starting from begin_bit_idx. Only visit 'bytes' live bytes or + // until 'end', whichever comes first. + // The visitor is called with the index of the first marked bit in the stride, + // the stride size, and whether it's the last stride in the given range or not. + template <typename Visitor> + ALWAYS_INLINE void VisitLiveStrides(uintptr_t begin_bit_idx, + uint8_t* end, + const size_t bytes, + Visitor&& visitor) const + REQUIRES_SHARED(Locks::mutator_lock_); + // Count the number of live bytes in the given vector entry. + size_t LiveBytesInBitmapWord(size_t chunk_idx) const; + void ClearBitmap() { Bitmap::Clear(); } + ALWAYS_INLINE uintptr_t Begin() const { return MemRangeBitmap::CoverBegin(); } + ALWAYS_INLINE bool HasAddress(mirror::Object* obj) const { + return MemRangeBitmap::HasAddress(reinterpret_cast<uintptr_t>(obj)); + } + ALWAYS_INLINE bool Test(uintptr_t bit_index) const { + return Bitmap::TestBit(bit_index); + } + ALWAYS_INLINE bool Test(mirror::Object* obj) const { + return MemRangeBitmap::Test(reinterpret_cast<uintptr_t>(obj)); + } + ALWAYS_INLINE uintptr_t GetWord(size_t index) const { + static_assert(kBitmapWordsPerVectorWord == 1); + return Bitmap::Begin()[index * kBitmapWordsPerVectorWord]; + } + }; + + // For a given object address in pre-compact space, return the corresponding + // address in the from-space, where heap pages are relocated in the compaction + // pause. + mirror::Object* GetFromSpaceAddr(mirror::Object* obj) const { + DCHECK(live_words_bitmap_->HasAddress(obj)) << " obj=" << obj; + return reinterpret_cast<mirror::Object*>(reinterpret_cast<uintptr_t>(obj) + + from_space_slide_diff_); + } + + // Verifies that the given object reference refers to a valid object. + // Otherwise fatally dumps logs, including those from the callback. 
+ template <typename Callback> + void VerifyObject(mirror::Object* ref, Callback& callback) const + REQUIRES_SHARED(Locks::mutator_lock_); + // Check if the obj is within heap and has a klass which is likely to be valid + // mirror::Class. + bool IsValidObject(mirror::Object* obj) const REQUIRES_SHARED(Locks::mutator_lock_); + void InitializePhase(); + void FinishPhase() REQUIRES(!Locks::mutator_lock_, !Locks::heap_bitmap_lock_, !lock_); + void MarkingPhase() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::heap_bitmap_lock_); + void CompactionPhase() REQUIRES_SHARED(Locks::mutator_lock_); + + void SweepSystemWeaks(Thread* self, Runtime* runtime, const bool paused) + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(!Locks::heap_bitmap_lock_); + // Update the reference at given offset in the given object with post-compact + // address. + ALWAYS_INLINE void UpdateRef(mirror::Object* obj, MemberOffset offset) + REQUIRES_SHARED(Locks::mutator_lock_); + + // Verify that the gc-root is updated only once. Returns false if the update + // shouldn't be done. + ALWAYS_INLINE bool VerifyRootSingleUpdate(void* root, + mirror::Object* old_ref, + const RootInfo& info) + REQUIRES_SHARED(Locks::mutator_lock_); + // Update the given root with post-compact address. + ALWAYS_INLINE void UpdateRoot(mirror::CompressedReference<mirror::Object>* root, + const RootInfo& info = RootInfo(RootType::kRootUnknown)) + REQUIRES_SHARED(Locks::mutator_lock_); + ALWAYS_INLINE void UpdateRoot(mirror::Object** root, + const RootInfo& info = RootInfo(RootType::kRootUnknown)) + REQUIRES_SHARED(Locks::mutator_lock_); + // Given the pre-compact address, the function returns the post-compact + // address of the given object. + ALWAYS_INLINE mirror::Object* PostCompactAddress(mirror::Object* old_ref) const + REQUIRES_SHARED(Locks::mutator_lock_); + // Compute post-compact address of an object in moving space. This function + // assumes that old_ref is in moving space. + ALWAYS_INLINE mirror::Object* PostCompactAddressUnchecked(mirror::Object* old_ref) const + REQUIRES_SHARED(Locks::mutator_lock_); + // Compute the new address for an object which was allocated prior to starting + // this GC cycle. + ALWAYS_INLINE mirror::Object* PostCompactOldObjAddr(mirror::Object* old_ref) const + REQUIRES_SHARED(Locks::mutator_lock_); + // Compute the new address for an object which was black allocated during this + // GC cycle. + ALWAYS_INLINE mirror::Object* PostCompactBlackObjAddr(mirror::Object* old_ref) const + REQUIRES_SHARED(Locks::mutator_lock_); + // Identify immune spaces and reset card-table, mod-union-table, and mark + // bitmaps. + void BindAndResetBitmaps() REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + // Perform one last round of marking, identifying roots from dirty cards + // during a stop-the-world (STW) pause. + void MarkingPause() REQUIRES(Locks::mutator_lock_, !Locks::heap_bitmap_lock_); + // Perform stop-the-world pause prior to concurrent compaction. + // Updates GC-roots and protects heap so that during the concurrent + // compaction phase we can receive faults and compact the corresponding pages + // on the fly. + void CompactionPause() REQUIRES(Locks::mutator_lock_); + // Compute offsets (in chunk_info_vec_) and other data structures required + // during concurrent compaction. 
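Conceptually, the chunk_info_vec_ offsets that PrepareForCompaction (declared next) computes amount to an exclusive prefix sum: each chunk's post-compact start equals the total live bytes of all chunks before it. A generic sketch of just that step, assuming the real method does more around it:

#include <cstdint>
#include <vector>

// Turn per-chunk live-byte counts into per-chunk post-compact start offsets.
// Returns the total number of live bytes.
uint64_t ExclusivePrefixSum(std::vector<uint32_t>& chunk_info) {
  uint64_t running = 0;
  for (uint32_t& entry : chunk_info) {
    uint32_t live = entry;
    entry = static_cast<uint32_t>(running);
    running += live;
  }
  return running;
}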
+ void PrepareForCompaction() REQUIRES_SHARED(Locks::mutator_lock_);
+
+ // Copy kPageSize live bytes starting from 'offset' (within the moving space),
+ // which must be within 'obj', into the kPageSize sized memory pointed to by 'addr'.
+ // Then update the references within the copied objects. The boundary objects are
+ // partially updated such that only the references that lie in the page are updated.
+ // This is necessary to avoid cascading userfaults.
+ void CompactPage(mirror::Object* obj, uint32_t offset, uint8_t* addr, bool needs_memset_zero)
+ REQUIRES_SHARED(Locks::mutator_lock_);
+ // Compact the bump-pointer space. Pass the page that should be used as buffer for
+ // userfaultfd.
+ template <int kMode>
+ void CompactMovingSpace(uint8_t* page) REQUIRES_SHARED(Locks::mutator_lock_);
+
+ // Compact the given page as per func and change its state. Also map/copy the
+ // page, if required.
+ template <int kMode, typename CompactionFn>
+ ALWAYS_INLINE void DoPageCompactionWithStateChange(size_t page_idx,
+ size_t status_arr_len,
+ uint8_t* to_space_page,
+ uint8_t* page,
+ CompactionFn func)
+ REQUIRES_SHARED(Locks::mutator_lock_);
+
+ // Update all the objects in the given non-moving space page. The 'first' object
+ // could have started in some preceding page.
+ void UpdateNonMovingPage(mirror::Object* first, uint8_t* page)
+ REQUIRES_SHARED(Locks::mutator_lock_);
+ // Update all the references in the non-moving space.
+ void UpdateNonMovingSpace() REQUIRES_SHARED(Locks::mutator_lock_);
+
+ // For all the pages in non-moving space, find the first object that overlaps
+ // with the pages' start address, and store in first_objs_non_moving_space_ array.
+ void InitNonMovingSpaceFirstObjects() REQUIRES_SHARED(Locks::mutator_lock_);
+ // In addition to the first-objects for every post-compact moving space page,
+ // also find offsets within those objects from where the contents should be
+ // copied to the page. The offsets are relative to the moving-space's
+ // beginning. Store the computed first-object and offset in first_objs_moving_space_
+ // and pre_compact_offset_moving_space_ respectively.
+ void InitMovingSpaceFirstObjects(const size_t vec_len) REQUIRES_SHARED(Locks::mutator_lock_);
+
+ // Gather the info related to black allocations from bump-pointer space to
+ // enable concurrent sliding of these pages.
+ void UpdateMovingSpaceBlackAllocations() REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+ // Update first-object info from allocation-stack for non-moving space black
+ // allocations.
+ void UpdateNonMovingSpaceBlackAllocations() REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+
+ // Slides a black page in the moving space (retaining the empty holes, which
+ // are usually part of some in-use TLAB). 'first_obj' is the object that overlaps with
+ // the first byte of the page being slid. pre_compact_page is the pre-compact
+ // address of the page being slid. 'page_idx' is used to fetch the first
+ // allocated chunk's size and next page's first_obj. 'dest' is the kPageSize
+ // sized memory where the contents would be copied.
+ void SlideBlackPage(mirror::Object* first_obj,
+ const size_t page_idx,
+ uint8_t* const pre_compact_page,
+ uint8_t* dest,
+ bool needs_memset_zero) REQUIRES_SHARED(Locks::mutator_lock_);
+
+ // Perform reference-processing and the like before sweeping the non-moving
+ // spaces.
+ void ReclaimPhase() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!Locks::heap_bitmap_lock_);
+
+ // Mark GC-roots (except from immune spaces and thread-stacks) during an STW pause.
+ void ReMarkRoots(Runtime* runtime) REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_);
+ // Concurrently mark GC-roots, except from immune spaces.
+ void MarkRoots(VisitRootFlags flags) REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ // Collect thread stack roots via a checkpoint.
+ void MarkRootsCheckpoint(Thread* self, Runtime* runtime) REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ // Second round of concurrent marking. Mark all gray objects that got dirtied
+ // since the first round.
+ void PreCleanCards() REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
+
+ void MarkNonThreadRoots(Runtime* runtime) REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ void MarkConcurrentRoots(VisitRootFlags flags, Runtime* runtime)
+ REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
+
+ // Traverse through the reachable objects and mark them.
+ void MarkReachableObjects() REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ // Scan (only) immune spaces looking for references into the garbage collected
+ // spaces.
+ void UpdateAndMarkModUnion() REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ // Scan mod-union and card tables, covering all the spaces, to identify dirty objects.
+ // These are in 'minimum age' cards, which is 'kCardAged' in case of concurrent (second round)
+ // marking and kCardDirty during the STW pause.
+ void ScanDirtyObjects(bool paused, uint8_t minimum_age) REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ // Recursively mark dirty objects. Invoked both concurrently and in an STW
+ // pause in PausePhase().
+ void RecursiveMarkDirtyObjects(bool paused, uint8_t minimum_age)
+ REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ // Go through all the objects in the mark-stack until it's empty.
+ void ProcessMarkStack() override REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ void ExpandMarkStack() REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+
+ // Scan object for references. If kUpdateLiveWords is true then set bits in
+ // the live-words bitmap and add size to chunk-info.
+ template <bool kUpdateLiveWords>
+ void ScanObject(mirror::Object* obj) REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+ // Push objects to the mark-stack right after successfully marking objects.
+ void PushOnMarkStack(mirror::Object* obj)
+ REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+
+ // Update the live-words bitmap as well as add the object size to the
+ // chunk-info vector. Both are required for computation of post-compact addresses.
+ // Also updates the freed_objects_ counter.
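A simplified sketch of the bookkeeping this comment describes, with the bitmap and vector passed in explicitly (the real member also handles black allocations, the freed_objects_ counter, and objects that straddle chunk boundaries):

    // Illustrative only; assumes the object lies entirely within one chunk.
    template <size_t kAlignment>
    void UpdateLivenessInfoSketch(mirror::Object* obj,
                                  size_t obj_size,
                                  LiveWordsBitmap<kAlignment>* live_words_bitmap,
                                  uint32_t* chunk_info_vec) {
      // Set one bit per kAlignment-sized word occupied by 'obj'.
      uintptr_t bit_idx = live_words_bitmap->SetLiveWords(reinterpret_cast<uintptr_t>(obj),
                                                          obj_size);
      // Credit the object's bytes to its chunk so the later exclusive scan can
      // convert per-chunk live bytes into offsets.
      chunk_info_vec[bit_idx / kBitsPerVectorWord] += obj_size;
    }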
+ void UpdateLivenessInfo(mirror::Object* obj, size_t obj_size) + REQUIRES_SHARED(Locks::mutator_lock_); + + void ProcessReferences(Thread* self) + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(!Locks::heap_bitmap_lock_); + + void MarkObjectNonNull(mirror::Object* obj, + mirror::Object* holder = nullptr, + MemberOffset offset = MemberOffset(0)) + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + void MarkObject(mirror::Object* obj, mirror::Object* holder, MemberOffset offset) + REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + template <bool kParallel> + bool MarkObjectNonNullNoPush(mirror::Object* obj, + mirror::Object* holder = nullptr, + MemberOffset offset = MemberOffset(0)) + REQUIRES(Locks::heap_bitmap_lock_) + REQUIRES_SHARED(Locks::mutator_lock_); + + void Sweep(bool swap_bitmaps) REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + void SweepLargeObjects(bool swap_bitmaps) REQUIRES_SHARED(Locks::mutator_lock_) + REQUIRES(Locks::heap_bitmap_lock_); + + // Perform all kernel operations required for concurrent compaction. Includes + // mremap to move pre-compact pages to from-space, followed by userfaultfd + // registration on the moving space and linear-alloc. + void KernelPreparation(); + // Called by KernelPreparation() for every memory range being prepared for + // userfaultfd registration. + void KernelPrepareRangeForUffd(uint8_t* to_addr, + uint8_t* from_addr, + size_t map_size, + int fd, + uint8_t* shadow_addr = nullptr); + + void RegisterUffd(void* addr, size_t size, int mode); + void UnregisterUffd(uint8_t* start, size_t len); + + // Called by thread-pool workers to read uffd_ and process fault events. + template <int kMode> + void ConcurrentCompaction(uint8_t* buf) REQUIRES_SHARED(Locks::mutator_lock_); + // Called by thread-pool workers to compact and copy/map the fault page in + // moving space. + template <int kMode> + void ConcurrentlyProcessMovingPage(uint8_t* fault_page, + uint8_t* buf, + size_t nr_moving_space_used_pages) + REQUIRES_SHARED(Locks::mutator_lock_); + // Called by thread-pool workers to process and copy/map the fault page in + // linear-alloc. + template <int kMode> + void ConcurrentlyProcessLinearAllocPage(uint8_t* fault_page, bool is_minor_fault) + REQUIRES_SHARED(Locks::mutator_lock_); + + // Process concurrently all the pages in linear-alloc. Called by gc-thread. + void ProcessLinearAlloc() REQUIRES_SHARED(Locks::mutator_lock_); + + // Returns true if the moving space can be compacted using uffd's minor-fault + // feature. + bool CanCompactMovingSpaceWithMinorFault(); + + void FreeFromSpacePages(size_t cur_page_idx, int mode) REQUIRES_SHARED(Locks::mutator_lock_); + + // Maps processed pages (from moving space and linear-alloc) for uffd's + // minor-fault feature. We try to 'claim' all processed (and unmapped) pages + // contiguous to 'to_space_start'. + // kFirstPageMapping indicates if the first page is already claimed or not. It + // also indicates that the ioctl must succeed in mapping the first page. + template <bool kFirstPageMapping> + void MapProcessedPages(uint8_t* to_space_start, + Atomic<PageState>* state_arr, + size_t arr_idx, + size_t arr_len) REQUIRES_SHARED(Locks::mutator_lock_); + + bool IsValidFd(int fd) const { return fd >= 0; } + // Add/update <class, obj> pair if class > obj and obj is the lowest address + // object of class. 
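A sketch of the map update described above, using a plain std::map keyed by class pointer for clarity (the actual maps below use ObjReference keys):

    // Illustrative only: remember, per class, the lowest-address object whose
    // class lies at a higher address than the object itself.
    void UpdateClassAfterObjectMapSketch(mirror::Object* obj,
                                         std::map<mirror::Class*, mirror::Object*>* map) {
      mirror::Class* klass = obj->GetClass();
      if (reinterpret_cast<uintptr_t>(klass) <= reinterpret_cast<uintptr_t>(obj)) {
        return;  // Class is compacted before (or with) the object; nothing to track.
      }
      auto it = map->find(klass);
      if (it == map->end() || std::less<mirror::Object*>{}(obj, it->second)) {
        (*map)[klass] = obj;
      }
    }

During compaction this lets the collector delay freeing the from-space pages that hold such a class until its lowest-address object has been compacted.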
+ ALWAYS_INLINE void UpdateClassAfterObjectMap(mirror::Object* obj)
+ REQUIRES_SHARED(Locks::mutator_lock_);
+
+ // Updates the 'class_after_obj_map_' map by replacing each key (class) with its
+ // highest-address super-class (obtained from 'super_class_after_class_map_'),
+ // if there is any. This is to ensure we don't free from-space pages before
+ // the lowest-address obj is compacted.
+ void UpdateClassAfterObjMap();
+
+ void MarkZygoteLargeObjects() REQUIRES_SHARED(Locks::mutator_lock_)
+ REQUIRES(Locks::heap_bitmap_lock_);
+
+ void ZeropageIoctl(void* addr, bool tolerate_eexist, bool tolerate_enoent);
+ void CopyIoctl(void* dst, void* buffer);
+ // Called after updating a linear-alloc page to either map a zero-page, if the
+ // page wasn't touched while being updated, or map the page via copy-ioctl. It
+ // then updates the page's state to indicate that the page is mapped.
+ void MapUpdatedLinearAllocPage(uint8_t* page,
+ uint8_t* shadow_page,
+ Atomic<PageState>& state,
+ bool page_touched);
+
+ // For checkpoints
+ Barrier gc_barrier_;
+ // Every object inside the immune spaces is assumed to be marked.
+ ImmuneSpaces immune_spaces_;
+ // Required only when mark-stack is accessed in shared mode, which happens
+ // when collecting thread-stack roots using checkpoint. Otherwise, we use it
+ // to synchronize on updated_roots_ in debug-builds.
+ Mutex lock_;
+ accounting::ObjectStack* mark_stack_;
+ // Special bitmap wherein all the bits corresponding to an object are set.
+ // TODO: make LiveWordsBitmap encapsulated in this class rather than a
+ // pointer. We tend to access its members in performance-sensitive
+ // code-path. Also, use a single MemMap for all the GC's data structures,
+ // which we will clear in the end. This would help in limiting the number of
+ // VMAs that get created in the kernel.
+ std::unique_ptr<LiveWordsBitmap<kAlignment>> live_words_bitmap_;
+ // Track GC-roots updated so far in a GC-cycle. This is to confirm that no
+ // GC-root is updated twice.
+ // TODO: Must be replaced with an efficient mechanism eventually. Or ensure
+ // that double updates don't happen in the first place.
+ std::unique_ptr<std::unordered_set<void*>> updated_roots_ GUARDED_BY(lock_);
+ MemMap from_space_map_;
+ MemMap shadow_to_space_map_;
+ // An array of live-bytes in logical chunks of kOffsetChunkSize size
+ // in the 'to-be-compacted' space.
+ MemMap info_map_;
+ // Set of page-sized buffers used for compaction. The first page is used by
+ // the GC thread. Subsequent pages are used by mutator threads in case of
+ // the SIGBUS feature, and by uffd-worker threads otherwise. In the latter case
+ // the first page is also used for termination of concurrent compaction by
+ // making worker threads terminate the userfaultfd read loop.
+ MemMap compaction_buffers_map_;
+
+ class LessByArenaAddr {
+ public:
+ bool operator()(const TrackedArena* a, const TrackedArena* b) const {
+ return std::less<uint8_t*>{}(a->Begin(), b->Begin());
+ }
+ };
+
+ // Map of arenas allocated in LinearAlloc arena-pool and last non-zero page,
+ // captured during compaction pause for concurrent updates.
+ std::map<const TrackedArena*, uint8_t*, LessByArenaAddr> linear_alloc_arenas_;
+ // Set of PageStatus arrays, one per arena-pool space. It's extremely rare to
+ // have more than one, but this is to be ready for the worst case.
+ class LinearAllocSpaceData { + public: + LinearAllocSpaceData(MemMap&& shadow, + MemMap&& page_status_map, + uint8_t* begin, + uint8_t* end, + bool already_shared) + : shadow_(std::move(shadow)), + page_status_map_(std::move(page_status_map)), + begin_(begin), + end_(end), + already_shared_(already_shared) {} + + MemMap shadow_; + MemMap page_status_map_; + uint8_t* begin_; + uint8_t* end_; + // Indicates if the linear-alloc is already MAP_SHARED. + bool already_shared_; + }; + + std::vector<LinearAllocSpaceData> linear_alloc_spaces_data_; + + class ObjReferenceHash { + public: + uint32_t operator()(const ObjReference& ref) const { + return ref.AsVRegValue() >> kObjectAlignmentShift; + } + }; + + class ObjReferenceEqualFn { + public: + bool operator()(const ObjReference& a, const ObjReference& b) const { + return a.AsMirrorPtr() == b.AsMirrorPtr(); + } + }; + + class LessByObjReference { + public: + bool operator()(const ObjReference& a, const ObjReference& b) const { + return std::less<mirror::Object*>{}(a.AsMirrorPtr(), b.AsMirrorPtr()); + } + }; + + // Data structures used to track objects whose layout information is stored in later + // allocated classes (at higher addresses). We must be careful not to free the + // corresponding from-space pages prematurely. + using ObjObjOrderedMap = std::map<ObjReference, ObjReference, LessByObjReference>; + using ObjObjUnorderedMap = + std::unordered_map<ObjReference, ObjReference, ObjReferenceHash, ObjReferenceEqualFn>; + // Unordered map of <K, S> such that the class K (in moving space) has kClassWalkSuper + // in reference bitmap and S is its highest address super class. + ObjObjUnorderedMap super_class_after_class_hash_map_; + // Unordered map of <K, V> such that the class K (in moving space) is after its objects + // or would require iterating super-class hierarchy when visiting references. And V is + // its lowest address object (in moving space). + ObjObjUnorderedMap class_after_obj_hash_map_; + // Ordered map constructed before starting compaction using the above two maps. Key is a + // class (or super-class) which is higher in address order than some of its object(s) and + // value is the corresponding object with lowest address. + ObjObjOrderedMap class_after_obj_ordered_map_; + // Since the compaction is done in reverse, we use a reverse iterator. It is maintained + // either at the pair whose class is lower than the first page to be freed, or at the + // pair whose object is not yet compacted. + ObjObjOrderedMap::const_reverse_iterator class_after_obj_iter_; + // Cached reference to the last class which has kClassWalkSuper in reference + // bitmap but has all its super classes lower address order than itself. + mirror::Class* walk_super_class_cache_; + // Used by FreeFromSpacePages() for maintaining markers in the moving space for + // how far the pages have been reclaimed/checked. + size_t last_checked_reclaim_page_idx_; + uint8_t* last_reclaimed_page_; + + space::ContinuousSpace* non_moving_space_; + space::BumpPointerSpace* const bump_pointer_space_; + // The main space bitmap + accounting::ContinuousSpaceBitmap* const moving_space_bitmap_; + accounting::ContinuousSpaceBitmap* non_moving_space_bitmap_; + Thread* thread_running_gc_; + // Array of moving-space's pages' compaction status. 
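The per-page status array declared next is what lets the GC thread and the concurrent fault handlers agree on who compacts a given to-space page. The PageState enumerators used here are made up for the example; only the compare-and-exchange claiming pattern is the point:

    #include <atomic>

    enum class PageStateSketch : uint8_t { kUntouched, kProcessing, kProcessed };

    // Illustrative only: the first thread to flip the slot from kUntouched to
    // kProcessing compacts the page; everyone else either waits for kProcessed
    // or moves on to another page.
    bool TryClaimPage(std::atomic<PageStateSketch>* status_arr, size_t page_idx) {
      PageStateSketch expected = PageStateSketch::kUntouched;
      return status_arr[page_idx].compare_exchange_strong(expected,
                                                          PageStateSketch::kProcessing,
                                                          std::memory_order_acq_rel);
    }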
+ Atomic<PageState>* moving_pages_status_;
+ size_t vector_length_;
+ size_t live_stack_freeze_size_;
+
+ uint64_t bytes_scanned_;
+
+ // For every page in the to-space (post-compact heap) we need to know the
+ // first object from which we must compact and/or update references. This is
+ // for both non-moving and moving space. Additionally, for the moving-space,
+ // we also need the offset within the object from where we need to start
+ // copying.
+ // chunk_info_vec_ holds live bytes for chunks during the marking phase. After
+ // marking we perform an exclusive scan to compute the offset for every chunk.
+ uint32_t* chunk_info_vec_;
+ // For pages before black allocations, pre_compact_offset_moving_space_[i]
+ // holds the offset within the space from where the objects need to be copied in
+ // the ith post-compact page.
+ // Otherwise, black_alloc_pages_first_chunk_size_[i] holds the size of the first
+ // non-empty chunk in the ith black-allocations page.
+ union {
+ uint32_t* pre_compact_offset_moving_space_;
+ uint32_t* black_alloc_pages_first_chunk_size_;
+ };
+ // first_objs_moving_space_[i] is the pre-compact address of the object which
+ // would overlap with the starting boundary of the ith post-compact page.
+ ObjReference* first_objs_moving_space_;
+ // First object for every page. It could be greater than the page's start
+ // address, or null if the page is empty.
+ ObjReference* first_objs_non_moving_space_;
+ size_t non_moving_first_objs_count_;
+ // Length of first_objs_moving_space_ and pre_compact_offset_moving_space_
+ // arrays. Also the number of pages which are to be compacted.
+ size_t moving_first_objs_count_;
+ // Number of pages containing black-allocated objects, indicating number of
+ // pages to be slid.
+ size_t black_page_count_;
+
+ uint8_t* from_space_begin_;
+ // Moving-space's end pointer at the marking pause. All allocations beyond
+ // this will be considered black in the current GC cycle. Aligned up to page
+ // size.
+ uint8_t* black_allocations_begin_;
+ // End of compacted space. Used for computing the post-compact addr of black-
+ // allocated objects. Aligned up to page size.
+ uint8_t* post_compact_end_;
+ // Cache (black_allocations_begin_ - post_compact_end_) for post-compact
+ // address computations.
+ ptrdiff_t black_objs_slide_diff_;
+ // Cache (from_space_begin_ - bump_pointer_space_->Begin()) so that we can
+ // compute the from-space address of a given pre-compact addr efficiently.
+ ptrdiff_t from_space_slide_diff_;
+
+ // TODO: Remove once an efficient mechanism to deal with double root updates
+ // is incorporated.
+ void* stack_high_addr_;
+ void* stack_low_addr_;
+
+ uint8_t* conc_compaction_termination_page_;
+
+ PointerSize pointer_size_;
+ // Number of objects freed during this GC in moving space. It is decremented
+ // every time an object is discovered, and the total object count is added to it
+ // in MarkingPause(). It reaches the correct count only once the marking phase
+ // is completed.
+ int32_t freed_objects_;
+ // memfds for moving space for using userfaultfd's minor-fault feature.
+ // Initialized to kFdUnused to indicate that mmap should be MAP_PRIVATE in
+ // KernelPrepareRange().
+ int moving_to_space_fd_;
+ int moving_from_space_fd_;
+ // Userfault file descriptor, accessed only by the GC itself.
+ // kFallbackMode value indicates that we are in the fallback mode.
+ int uffd_;
+ // Number of mutator-threads currently executing SIGBUS handler. When the
+ // GC-thread is done with compaction, it sets the most significant bit to
+ // indicate that. Mutator threads check for the flag when incrementing in the
+ // handler.
+ std::atomic<SigbusCounterType> sigbus_in_progress_count_;
+ // Number of mutator-threads/uffd-workers working on a moving-space page. It
+ // must be 0 before gc-thread can unregister the space after it's done
+ // sequentially compacting all pages of the space.
+ std::atomic<uint16_t> compaction_in_progress_count_;
+ // When using the SIGBUS feature, this counter is used by mutators to claim a page
+ // out of compaction buffers to be used for the entire compaction cycle.
+ std::atomic<uint16_t> compaction_buffer_counter_;
+ // Used to exit from the compaction loop at the end of concurrent compaction.
+ uint8_t thread_pool_counter_;
+ // True while compacting.
+ bool compacting_;
+ // Flag indicating whether one-time uffd initialization has been done. It will
+ // be false on the first GC for non-zygote processes, and always for zygote.
+ // Its purpose is to keep the userfaultfd overhead to a minimum in
+ // Heap::PostForkChildAction(), as it's invoked in the app startup path. With
+ // this, we register the compaction-termination page on the first GC.
+ bool uffd_initialized_;
+ // Flag indicating if userfaultfd supports minor-faults. Set appropriately in
+ // CreateUserfaultfd(), where we get this information from the kernel.
+ const bool uffd_minor_fault_supported_;
+ // Flag indicating if we should use sigbus signals instead of threads to
+ // handle userfaults.
+ const bool use_uffd_sigbus_;
+ // For non-zygote processes this flag indicates if the spaces are ready to
+ // start using userfaultfd's minor-fault feature. This initialization involves
+ // starting to use shmem (memfd_create) for the userfaultfd protected spaces.
+ bool minor_fault_initialized_;
+ // Set to true when linear-alloc can start mapping with MAP_SHARED. Set on
+ // non-zygote processes during the first GC, which sets up everything for using
+ // minor-fault from the next GC.
+ bool map_linear_alloc_shared_;
+
+ class FlipCallback;
+ class ThreadFlipVisitor;
+ class VerifyRootMarkedVisitor;
+ class ScanObjectVisitor;
+ class CheckpointMarkThreadRoots;
+ template<size_t kBufferSize> class ThreadRootsVisitor;
+ class CardModifiedVisitor;
+ class RefFieldsVisitor;
+ template <bool kCheckBegin, bool kCheckEnd> class RefsUpdateVisitor;
+ class ArenaPoolPageUpdater;
+ class ClassLoaderRootsUpdater;
+ class LinearAllocPageUpdater;
+ class ImmuneSpaceUpdateObjVisitor;
+ class ConcurrentCompactionGcTask;
+
+ DISALLOW_IMPLICIT_CONSTRUCTORS(MarkCompact);
+};
+
+std::ostream& operator<<(std::ostream& os, MarkCompact::PageState value);
+
+} // namespace collector
+} // namespace gc
+} // namespace art
+
+#endif // ART_RUNTIME_GC_COLLECTOR_MARK_COMPACT_H_
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index bd5ce37b2c..4fefe6557c 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -340,6 +340,8 @@ void MarkSweep::ReclaimPhase() {
Thread* const self = Thread::Current();
// Process the references concurrently.
ProcessReferences(self);
+ // There is no need to sweep interpreter caches, as this GC doesn't move
+ // objects and hence it would be a no-op.
SweepSystemWeaks(self); Runtime* const runtime = Runtime::Current(); runtime->AllowNewSystemWeaks(); @@ -1127,7 +1129,8 @@ void MarkSweep::VerifySystemWeaks() { TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); // Verify system weaks, uses a special object visitor which returns the input object. VerifySystemWeakVisitor visitor(this); - Runtime::Current()->SweepSystemWeaks(&visitor); + Runtime* runtime = Runtime::Current(); + runtime->SweepSystemWeaks(&visitor); } class MarkSweep::CheckpointMarkThreadRoots : public Closure, public RootVisitor { @@ -1455,6 +1458,8 @@ inline mirror::Object* MarkSweep::IsMarked(mirror::Object* object) { if (current_space_bitmap_->HasAddress(object)) { return current_space_bitmap_->Test(object) ? object : nullptr; } + // This function returns nullptr for objects allocated after marking phase as + // they are not marked in the bitmap. return mark_bitmap_->Test(object) ? object : nullptr; } diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h index 6af7c54600..12fd7f9995 100644 --- a/runtime/gc/collector/mark_sweep.h +++ b/runtime/gc/collector/mark_sweep.h @@ -181,7 +181,7 @@ class MarkSweep : public GarbageCollector { REQUIRES_SHARED(Locks::heap_bitmap_lock_, Locks::mutator_lock_); void VerifySystemWeaks() - REQUIRES_SHARED(Locks::mutator_lock_, Locks::heap_bitmap_lock_); + REQUIRES(Locks::mutator_lock_) REQUIRES_SHARED(Locks::heap_bitmap_lock_); // Verify that an object is live, either in a live bitmap or in the allocation stack. void VerifyIsLive(const mirror::Object* obj) diff --git a/runtime/gc/collector/partial_mark_sweep.cc b/runtime/gc/collector/partial_mark_sweep.cc index f6ca867e69..e283a9583a 100644 --- a/runtime/gc/collector/partial_mark_sweep.cc +++ b/runtime/gc/collector/partial_mark_sweep.cc @@ -18,7 +18,6 @@ #include "gc/heap.h" #include "gc/space/space.h" -#include "partial_mark_sweep.h" #include "thread-current-inl.h" namespace art { diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc index 53b060483f..acd4807a4f 100644 --- a/runtime/gc/collector/semi_space.cc +++ b/runtime/gc/collector/semi_space.cc @@ -500,7 +500,9 @@ void SemiSpace::MarkRoots() { void SemiSpace::SweepSystemWeaks() { TimingLogger::ScopedTiming t(__FUNCTION__, GetTimings()); - Runtime::Current()->SweepSystemWeaks(this); + Runtime* runtime = Runtime::Current(); + runtime->SweepSystemWeaks(this); + runtime->GetThreadList()->SweepInterpreterCaches(this); } bool SemiSpace::ShouldSweepSpace(space::ContinuousSpace* space) const { diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h index 245ea10558..6d3ac0846e 100644 --- a/runtime/gc/collector/semi_space.h +++ b/runtime/gc/collector/semi_space.h @@ -143,7 +143,7 @@ class SemiSpace : public GarbageCollector { void SweepLargeObjects(bool swap_bitmaps) REQUIRES(Locks::heap_bitmap_lock_); void SweepSystemWeaks() - REQUIRES_SHARED(Locks::heap_bitmap_lock_, Locks::mutator_lock_); + REQUIRES_SHARED(Locks::heap_bitmap_lock_) REQUIRES(Locks::mutator_lock_); void VisitRoots(mirror::Object*** roots, size_t count, const RootInfo& info) override REQUIRES(Locks::mutator_lock_, Locks::heap_bitmap_lock_); diff --git a/runtime/gc/collector_type.h b/runtime/gc/collector_type.h index 9c9996458c..c20e3a7347 100644 --- a/runtime/gc/collector_type.h +++ b/runtime/gc/collector_type.h @@ -30,6 +30,8 @@ enum CollectorType { kCollectorTypeMS, // Concurrent mark-sweep. kCollectorTypeCMS, + // Concurrent mark-compact. 
+ kCollectorTypeCMC, // Semi-space / mark-sweep hybrid, enables compaction. kCollectorTypeSS, // Heap trimming collector, doesn't do any actual collecting. @@ -63,12 +65,13 @@ enum CollectorType { std::ostream& operator<<(std::ostream& os, CollectorType collector_type); static constexpr CollectorType kCollectorTypeDefault = -#if ART_DEFAULT_GC_TYPE_IS_CMS - kCollectorTypeCMS +#if ART_DEFAULT_GC_TYPE_IS_CMC + kCollectorTypeCMC #elif ART_DEFAULT_GC_TYPE_IS_SS kCollectorTypeSS -#else +#elif ART_DEFAULT_GC_TYPE_IS_CMS kCollectorTypeCMS +#else #error "ART default GC type must be set" #endif ; // NOLINT [whitespace/semicolon] [5] diff --git a/runtime/gc/gc_cause.cc b/runtime/gc/gc_cause.cc index b197a99a20..02fe2f975c 100644 --- a/runtime/gc/gc_cause.cc +++ b/runtime/gc/gc_cause.cc @@ -46,7 +46,7 @@ const char* PrettyCause(GcCause cause) { case kGcCauseHprof: return "Hprof"; case kGcCauseGetObjectsAllocated: return "ObjectsAllocated"; case kGcCauseProfileSaver: return "ProfileSaver"; - case kGcCauseRunEmptyCheckpoint: return "RunEmptyCheckpoint"; + case kGcCauseDeletingDexCacheArrays: return "DeletingDexCacheArrays"; } LOG(FATAL) << "Unreachable"; UNREACHABLE(); diff --git a/runtime/gc/gc_cause.h b/runtime/gc/gc_cause.h index 4dae585e4c..5c039b31ee 100644 --- a/runtime/gc/gc_cause.h +++ b/runtime/gc/gc_cause.h @@ -62,8 +62,8 @@ enum GcCause { kGcCauseGetObjectsAllocated, // GC cause for the profile saver. kGcCauseProfileSaver, - // GC cause for running an empty checkpoint. - kGcCauseRunEmptyCheckpoint, + // GC cause for deleting dex cache arrays at startup. + kGcCauseDeletingDexCacheArrays, }; const char* PrettyCause(GcCause cause); diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h index 9e1524e657..922b58870d 100644 --- a/runtime/gc/heap-inl.h +++ b/runtime/gc/heap-inl.h @@ -209,13 +209,12 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self, } // IsGcConcurrent() isn't known at compile time so we can optimize by not checking it for the // BumpPointer or TLAB allocators. This is nice since it allows the entire if statement to be - // optimized out. And for the other allocators, AllocatorMayHaveConcurrentGC is a constant - // since the allocator_type should be constant propagated. - if (AllocatorMayHaveConcurrentGC(allocator) && IsGcConcurrent() - && UNLIKELY(ShouldConcurrentGCForJava(new_num_bytes_allocated))) { + // optimized out. + if (IsGcConcurrent() && UNLIKELY(ShouldConcurrentGCForJava(new_num_bytes_allocated))) { need_gc = true; } GetMetrics()->TotalBytesAllocated()->Add(bytes_tl_bulk_allocated); + GetMetrics()->TotalBytesAllocatedDelta()->Add(bytes_tl_bulk_allocated); } } if (kIsDebugBuild && Runtime::Current()->IsStarted()) { @@ -442,7 +441,7 @@ inline bool Heap::ShouldAllocLargeObject(ObjPtr<mirror::Class> c, size_t byte_co return byte_count >= large_object_threshold_ && (c->IsPrimitiveArray() || c->IsStringClass()); } -inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, +inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type ATTRIBUTE_UNUSED, size_t alloc_size, bool grow) { size_t old_target = target_footprint_.load(std::memory_order_relaxed); @@ -457,7 +456,7 @@ inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, return true; } // We are between target_footprint_ and growth_limit_ . 
- if (AllocatorMayHaveConcurrentGC(allocator_type) && IsGcConcurrent()) { + if (IsGcConcurrent()) { return false; } else { if (grow) { diff --git a/runtime/gc/heap-visit-objects-inl.h b/runtime/gc/heap-visit-objects-inl.h index e20d981fa3..a235c44033 100644 --- a/runtime/gc/heap-visit-objects-inl.h +++ b/runtime/gc/heap-visit-objects-inl.h @@ -118,7 +118,7 @@ inline void Heap::VisitObjectsInternal(Visitor&& visitor) { // For speed reasons, only perform it when Rosalloc could possibly be used. // (Disabled for read barriers because it never uses Rosalloc). // (See the DCHECK in RosAllocSpace constructor). - if (!kUseReadBarrier) { + if (!gUseReadBarrier) { // Rosalloc has a race in allocation. Objects can be written into the allocation // stack before their header writes are visible to this thread. // See b/28790624 for more details. diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index 8407ba4376..f27bddb361 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -21,10 +21,6 @@ #if defined(__BIONIC__) || defined(__GLIBC__) #include <malloc.h> // For mallinfo() #endif -#if defined(__BIONIC__) && defined(ART_TARGET) -#include <linux/userfaultfd.h> -#include <sys/ioctl.h> -#endif #include <memory> #include <random> #include <unistd.h> @@ -61,6 +57,7 @@ #include "gc/accounting/remembered_set.h" #include "gc/accounting/space_bitmap-inl.h" #include "gc/collector/concurrent_copying.h" +#include "gc/collector/mark_compact.h" #include "gc/collector/mark_sweep.h" #include "gc/collector/partial_mark_sweep.h" #include "gc/collector/semi_space.h" @@ -106,6 +103,7 @@ #include "runtime.h" #include "javaheapprof/javaheapsampler.h" #include "scoped_thread_state_change-inl.h" +#include "thread-inl.h" #include "thread_list.h" #include "verify_object-inl.h" #include "well_known_classes.h" @@ -339,6 +337,7 @@ Heap::Heap(size_t initial_size, // this one. process_state_update_lock_("process state update lock", kPostMonitorLock), min_foreground_target_footprint_(0), + min_foreground_concurrent_start_bytes_(0), concurrent_start_bytes_(std::numeric_limits<size_t>::max()), total_bytes_freed_ever_(0), total_objects_freed_ever_(0), @@ -410,7 +409,6 @@ Heap::Heap(size_t initial_size, backtrace_lock_(nullptr), seen_backtrace_count_(0u), unique_backtrace_count_(0u), - uffd_(-1), gc_disabled_for_shutdown_(false), dump_region_info_before_gc_(dump_region_info_before_gc), dump_region_info_after_gc_(dump_region_info_after_gc), @@ -421,7 +419,19 @@ Heap::Heap(size_t initial_size, if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) { LOG(INFO) << "Heap() entering"; } - if (kUseReadBarrier) { + + LOG(INFO) << "Using " << foreground_collector_type_ << " GC."; + if (!gUseUserfaultfd) { + // This ensures that userfaultfd syscall is done before any seccomp filter is installed. + // TODO(b/266731037): Remove this when we no longer need to collect metric on userfaultfd + // support. + auto [uffd_supported, minor_fault_supported] = collector::MarkCompact::GetUffdAndMinorFault(); + // The check is just to ensure that compiler doesn't eliminate the function call above. + // Userfaultfd support is certain to be there if its minor-fault feature is supported. 
+ CHECK_IMPLIES(minor_fault_supported, uffd_supported); + } + + if (gUseReadBarrier) { CHECK_EQ(foreground_collector_type_, kCollectorTypeCC); CHECK_EQ(background_collector_type_, kCollectorTypeCCBackground); } else if (background_collector_type_ != gc::kCollectorTypeHomogeneousSpaceCompact) { @@ -448,7 +458,8 @@ Heap::Heap(size_t initial_size, mark_bitmap_.reset(new accounting::HeapBitmap(this)); // We don't have hspace compaction enabled with CC. - if (foreground_collector_type_ == kCollectorTypeCC) { + if (foreground_collector_type_ == kCollectorTypeCC + || foreground_collector_type_ == kCollectorTypeCMC) { use_homogeneous_space_compaction_for_oom_ = false; } bool support_homogeneous_space_compaction = @@ -486,6 +497,7 @@ Heap::Heap(size_t initial_size, runtime->ShouldRelocate(), /*executable=*/ !runtime->IsAotCompiler(), heap_reservation_size, + runtime->AllowInMemoryCompilation(), &boot_image_spaces, &heap_reservation)) { DCHECK_EQ(heap_reservation_size, heap_reservation.IsValid() ? heap_reservation.Size() : 0u); @@ -629,10 +641,14 @@ Heap::Heap(size_t initial_size, std::move(main_mem_map_1)); CHECK(bump_pointer_space_ != nullptr) << "Failed to create bump pointer space"; AddSpace(bump_pointer_space_); - temp_space_ = space::BumpPointerSpace::CreateFromMemMap("Bump pointer space 2", - std::move(main_mem_map_2)); - CHECK(temp_space_ != nullptr) << "Failed to create bump pointer space"; - AddSpace(temp_space_); + // For Concurrent Mark-compact GC we don't need the temp space to be in + // lower 4GB. So its temp space will be created by the GC itself. + if (foreground_collector_type_ != kCollectorTypeCMC) { + temp_space_ = space::BumpPointerSpace::CreateFromMemMap("Bump pointer space 2", + std::move(main_mem_map_2)); + CHECK(temp_space_ != nullptr) << "Failed to create bump pointer space"; + AddSpace(temp_space_); + } CHECK(separate_non_moving_space); } else { CreateMainMallocSpace(std::move(main_mem_map_1), initial_size, growth_limit_, capacity_); @@ -758,6 +774,10 @@ Heap::Heap(size_t initial_size, semi_space_collector_ = new collector::SemiSpace(this); garbage_collectors_.push_back(semi_space_collector_); } + if (MayUseCollector(kCollectorTypeCMC)) { + mark_compact_ = new collector::MarkCompact(this); + garbage_collectors_.push_back(mark_compact_); + } if (MayUseCollector(kCollectorTypeCC)) { concurrent_copying_collector_ = new collector::ConcurrentCopying(this, /*young_gen=*/false, @@ -963,7 +983,6 @@ void Heap::DecrementDisableMovingGC(Thread* self) { void Heap::IncrementDisableThreadFlip(Thread* self) { // Supposed to be called by mutators. If thread_flip_running_ is true, block. Otherwise, go ahead. - CHECK(kUseReadBarrier); bool is_nested = self->GetDisableThreadFlipCount() > 0; self->IncrementDisableThreadFlipCount(); if (is_nested) { @@ -994,10 +1013,23 @@ void Heap::IncrementDisableThreadFlip(Thread* self) { } } +void Heap::EnsureObjectUserfaulted(ObjPtr<mirror::Object> obj) { + if (gUseUserfaultfd) { + // Use volatile to ensure that compiler loads from memory to trigger userfaults, if required. + const uint8_t* start = reinterpret_cast<uint8_t*>(obj.Ptr()); + const uint8_t* end = AlignUp(start + obj->SizeOf(), kPageSize); + // The first page is already touched by SizeOf(). + start += kPageSize; + while (start < end) { + ForceRead(start); + start += kPageSize; + } + } +} + void Heap::DecrementDisableThreadFlip(Thread* self) { // Supposed to be called by mutators. Decrement disable_thread_flip_count_ and potentially wake up // the GC waiting before doing a thread flip. 
- CHECK(kUseReadBarrier); self->DecrementDisableThreadFlipCount(); bool is_outermost = self->GetDisableThreadFlipCount() == 0; if (!is_outermost) { @@ -1017,7 +1049,6 @@ void Heap::DecrementDisableThreadFlip(Thread* self) { void Heap::ThreadFlipBegin(Thread* self) { // Supposed to be called by GC. Set thread_flip_running_ to be true. If disable_thread_flip_count_ // > 0, block. Otherwise, go ahead. - CHECK(kUseReadBarrier); ScopedThreadStateChange tsc(self, ThreadState::kWaitingForGcThreadFlip); MutexLock mu(self, *thread_flip_lock_); thread_flip_cond_->CheckSafeToWait(self); @@ -1043,7 +1074,6 @@ void Heap::ThreadFlipBegin(Thread* self) { void Heap::ThreadFlipEnd(Thread* self) { // Supposed to be called by GC. Set thread_flip_running_ to false and potentially wake up mutators // waiting before doing a JNI critical. - CHECK(kUseReadBarrier); MutexLock mu(self, *thread_flip_lock_); CHECK(thread_flip_running_); thread_flip_running_ = false; @@ -1059,7 +1089,9 @@ void Heap::GrowHeapOnJankPerceptibleSwitch() { min_foreground_target_footprint_, std::memory_order_relaxed); } - min_foreground_target_footprint_ = 0; + if (IsGcConcurrent() && concurrent_start_bytes_ < min_foreground_concurrent_start_bytes_) { + concurrent_start_bytes_ = min_foreground_concurrent_start_bytes_; + } } void Heap::UpdateProcessState(ProcessState old_process_state, ProcessState new_process_state) { @@ -1070,26 +1102,32 @@ void Heap::UpdateProcessState(ProcessState old_process_state, ProcessState new_p RequestCollectorTransition(foreground_collector_type_, 0); GrowHeapOnJankPerceptibleSwitch(); } else { - // Don't delay for debug builds since we may want to stress test the GC. // If background_collector_type_ is kCollectorTypeHomogeneousSpaceCompact then we have // special handling which does a homogenous space compaction once but then doesn't transition // the collector. Similarly, we invoke a full compaction for kCollectorTypeCC but don't // transition the collector. - RequestCollectorTransition(background_collector_type_, - kStressCollectorTransition - ? 0 - : kCollectorTransitionWait); + RequestCollectorTransition(background_collector_type_, 0); } } } -void Heap::CreateThreadPool() { - const size_t num_threads = std::max(parallel_gc_threads_, conc_gc_threads_); +void Heap::CreateThreadPool(size_t num_threads) { + if (num_threads == 0) { + num_threads = std::max(parallel_gc_threads_, conc_gc_threads_); + } if (num_threads != 0) { thread_pool_.reset(new ThreadPool("Heap thread pool", num_threads)); } } +void Heap::WaitForWorkersToBeCreated() { + DCHECK(!Runtime::Current()->IsShuttingDown(Thread::Current())) + << "Cannot create new threads during runtime shutdown"; + if (thread_pool_ != nullptr) { + thread_pool_->WaitForWorkersToBeCreated(); + } +} + void Heap::MarkAllocStackAsLive(accounting::ObjectStack* stack) { space::ContinuousSpace* space1 = main_space_ != nullptr ? main_space_ : non_moving_space_; space::ContinuousSpace* space2 = non_moving_space_; @@ -1451,6 +1489,8 @@ void Heap::ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType Runtime::Current()->GetPreAllocatedOutOfMemoryErrorWhenHandlingStackOverflow()); return; } + // Allow plugins to intercept out of memory errors. 
+ Runtime::Current()->OutOfMemoryErrorHook();
std::ostringstream oss;
size_t total_bytes_free = GetFreeMemory();
@@ -1497,6 +1537,23 @@ void Heap::ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType
void Heap::DoPendingCollectorTransition() {
CollectorType desired_collector_type = desired_collector_type_;
+
+ if (collector_type_ == kCollectorTypeCC || collector_type_ == kCollectorTypeCMC) {
+ // Do TransitionGC when the app goes to background only if the app's allocations
+ // (since the last GC) exceed the threshold below; otherwise skip it.
+ // num_bytes_allocated_since_gc is computed with UnsignedDifference(), so it is
+ // positive or zero even if num_bytes_alive_after_gc_ initially comes from the Zygote.
+ size_t num_bytes_allocated_since_gc =
+ UnsignedDifference(GetBytesAllocated(), num_bytes_alive_after_gc_);
+ if (num_bytes_allocated_since_gc <
+ (UnsignedDifference(target_footprint_.load(std::memory_order_relaxed),
+ num_bytes_alive_after_gc_)/4)
+ && !kStressCollectorTransition
+ && !IsLowMemoryMode()) {
+ return;
+ }
+ }
+
// Launch homogeneous space compaction if it is desired.
if (desired_collector_type == kCollectorTypeHomogeneousSpaceCompact) {
if (!CareAboutPauseTimes()) {
@@ -1504,15 +1561,15 @@
} else {
VLOG(gc) << "Homogeneous compaction ignored due to jank perceptible process state";
}
- } else if (desired_collector_type == kCollectorTypeCCBackground) {
- DCHECK(kUseReadBarrier);
+ } else if (desired_collector_type == kCollectorTypeCCBackground ||
+ desired_collector_type == kCollectorTypeCMC) {
if (!CareAboutPauseTimes()) {
- // Invoke CC full compaction.
+ // Invoke full compaction.
CollectGarbageInternal(collector::kGcTypeFull,
kGcCauseCollectorTransition,
- /*clear_soft_references=*/false, GC_NUM_ANY);
+ /*clear_soft_references=*/false, GetCurrentGcNum() + 1);
} else {
- VLOG(gc) << "CC background compaction ignored due to jank perceptible process state";
+ VLOG(gc) << "background compaction ignored due to jank perceptible process state";
}
} else {
CHECK_EQ(desired_collector_type, collector_type_) << "Unsupported collector transition";
@@ -1761,7 +1818,7 @@ void Heap::VerifyObjectBody(ObjPtr<mirror::Object> obj) {
void Heap::VerifyHeap() {
ReaderMutexLock mu(Thread::Current(), *Locks::heap_bitmap_lock_);
- auto visitor = [&](mirror::Object* obj) {
+ auto visitor = [&](mirror::Object* obj) NO_THREAD_SAFETY_ANALYSIS {
VerifyObjectBody(obj);
};
// Technically we need the mutator lock here to call Visit. However, VerifyObjectBody is already
@@ -2199,6 +2256,15 @@ void Heap::ChangeCollector(CollectorType collector_type) {
}
break;
}
+ case kCollectorTypeCMC: {
+ gc_plan_.push_back(collector::kGcTypeFull);
+ if (use_tlab_) {
+ ChangeAllocator(kAllocatorTypeTLAB);
+ } else {
+ ChangeAllocator(kAllocatorTypeBumpPointer);
+ }
+ break;
+ }
case kCollectorTypeSS: {
gc_plan_.push_back(collector::kGcTypeFull);
if (use_tlab_) {
@@ -2368,18 +2434,16 @@ void Heap::PreZygoteFork() {
}
// We need to close userfaultfd fd for app/webview zygotes to avoid getattr
// (stat) on the fd during fork.
- if (uffd_ >= 0) {
- close(uffd_);
- uffd_ = -1;
- }
Thread* self = Thread::Current();
MutexLock mu(self, zygote_creation_lock_);
// Try to see if we have any Zygote spaces.
if (HasZygoteSpace()) { return; } - Runtime::Current()->GetInternTable()->AddNewTable(); - Runtime::Current()->GetClassLinker()->MoveClassTableToPreZygote(); + Runtime* runtime = Runtime::Current(); + runtime->GetInternTable()->AddNewTable(); + runtime->GetClassLinker()->MoveClassTableToPreZygote(); + runtime->SetupLinearAllocForPostZygoteFork(self); VLOG(heap) << "Starting PreZygoteFork"; // The end of the non-moving space may be protected, unprotect it so that we can copy the zygote // there. @@ -2488,7 +2552,7 @@ void Heap::PreZygoteFork() { new accounting::ModUnionTableCardCache("zygote space mod-union table", this, zygote_space_); CHECK(mod_union_table != nullptr) << "Failed to create zygote space mod-union table"; - if (collector_type_ != kCollectorTypeCC) { + if (collector_type_ != kCollectorTypeCC && collector_type_ != kCollectorTypeCMC) { // Set all the cards in the mod-union table since we don't know which objects contain references // to large objects. mod_union_table->SetCards(); @@ -2500,10 +2564,10 @@ void Heap::PreZygoteFork() { mod_union_table->ProcessCards(); mod_union_table->ClearTable(); - // For CC we never collect zygote large objects. This means we do not need to set the cards for - // the zygote mod-union table and we can also clear all of the existing image mod-union tables. - // The existing mod-union tables are only for image spaces and may only reference zygote and - // image objects. + // For CC and CMC we never collect zygote large objects. This means we do not need to set the + // cards for the zygote mod-union table and we can also clear all of the existing image + // mod-union tables. The existing mod-union tables are only for image spaces and may only + // reference zygote and image objects. for (auto& pair : mod_union_tables_) { CHECK(pair.first->IsImageSpace()); CHECK(!pair.first->AsImageSpace()->GetImageHeader().IsAppImage()); @@ -2710,6 +2774,9 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, semi_space_collector_->SetSwapSemiSpaces(true); collector = semi_space_collector_; break; + case kCollectorTypeCMC: + collector = mark_compact_; + break; case kCollectorTypeCC: collector::ConcurrentCopying* active_cc_collector; if (use_generational_cc_) { @@ -2728,7 +2795,9 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type, default: LOG(FATAL) << "Invalid collector type " << static_cast<size_t>(collector_type_); } - if (collector != active_concurrent_copying_collector_.load(std::memory_order_relaxed)) { + // temp_space_ will be null for kCollectorTypeCMC. + if (temp_space_ != nullptr + && collector != active_concurrent_copying_collector_.load(std::memory_order_relaxed)) { temp_space_->GetMemMap()->Protect(PROT_READ | PROT_WRITE); if (kIsDebugBuild) { // Try to read each page of the memory map in case mprotect didn't work properly b/19894268. @@ -3561,6 +3630,15 @@ collector::GcType Heap::WaitForGcToCompleteLocked(GcCause cause, Thread* self) { void Heap::DumpForSigQuit(std::ostream& os) { os << "Heap: " << GetPercentFree() << "% free, " << PrettySize(GetBytesAllocated()) << "/" << PrettySize(GetTotalMemory()) << "; " << GetObjectsAllocated() << " objects\n"; + { + os << "Image spaces:\n"; + ScopedObjectAccess soa(Thread::Current()); + for (const auto& space : continuous_spaces_) { + if (space->IsImageSpace()) { + os << space->GetName() << "\n"; + } + } + } DumpGcPerformanceInfo(os); } @@ -3680,7 +3758,9 @@ void Heap::GrowForUtilization(collector::GarbageCollector* collector_ran, // process-state switch. 
min_foreground_target_footprint_ =
(multiplier <= 1.0 && grow_bytes > 0)
- ? bytes_allocated + static_cast<size_t>(grow_bytes * foreground_heap_growth_multiplier_)
+ ? std::min(
+ bytes_allocated + static_cast<size_t>(grow_bytes * foreground_heap_growth_multiplier_),
+ GetMaxMemory())
: 0;
if (IsGcConcurrent()) {
@@ -3712,6 +3792,12 @@
// allocation rate is very high, remaining_bytes could tell us that we should start a GC
// right away.
concurrent_start_bytes_ = std::max(target_footprint - remaining_bytes, bytes_allocated);
+ // Store the concurrent_start_bytes_ value computed with the foreground heap growth
+ // multiplier, so it can be applied when the process state switches to foreground.
+ min_foreground_concurrent_start_bytes_ =
+ min_foreground_target_footprint_ != 0
+ ? std::max(min_foreground_target_footprint_ - remaining_bytes, bytes_allocated)
+ : 0;
}
}
}
@@ -3762,12 +3848,11 @@ void Heap::ClearGrowthLimit() {
void Heap::AddFinalizerReference(Thread* self, ObjPtr<mirror::Object>* object) {
ScopedObjectAccess soa(self);
- ScopedLocalRef<jobject> arg(self->GetJniEnv(), soa.AddLocalReference<jobject>(*object));
- jvalue args[1];
- args[0].l = arg.get();
- InvokeWithJValues(soa, nullptr, WellKnownClasses::java_lang_ref_FinalizerReference_add, args);
- // Restore object in case it gets moved.
- *object = soa.Decode<mirror::Object>(arg.get());
+ StackHandleScope<1u> hs(self);
+ // Use handle wrapper to update the `*object` if the object gets moved.
+ HandleWrapperObjPtr<mirror::Object> h_object = hs.NewHandleWrapper(object);
+ WellKnownClasses::java_lang_ref_FinalizerReference_add->InvokeStatic<'V', 'L'>(
+ self, h_object.Get());
}
void Heap::RequestConcurrentGCAndSaveObject(Thread* self,
@@ -3829,70 +3914,6 @@ bool Heap::RequestConcurrentGC(Thread* self,
return true; // Vacuously.
}
-#if defined(__BIONIC__) && defined(ART_TARGET)
-void Heap::MaybePerformUffdIoctls(GcCause cause, uint32_t requested_gc_num) const {
- if (uffd_ >= 0
- && cause == kGcCauseBackground
- && (requested_gc_num < 5 || requested_gc_num % 5 == 0)) {
- // Attempt to use all userfaultfd ioctls that we intend to use.
- // Register ioctl - { - struct uffdio_register uffd_register; - uffd_register.range.start = 0; - uffd_register.range.len = 0; - uffd_register.mode = UFFDIO_REGISTER_MODE_MISSING; - int ret = ioctl(uffd_, UFFDIO_REGISTER, &uffd_register); - CHECK_EQ(ret, -1); - CHECK_EQ(errno, EINVAL); - } - // Copy ioctl - { - struct uffdio_copy uffd_copy = {.src = 0, .dst = 0, .len = 0, .mode = 0}; - int ret = ioctl(uffd_, UFFDIO_COPY, &uffd_copy); - CHECK_EQ(ret, -1); - CHECK_EQ(errno, EINVAL); - } - // Zeropage ioctl - { - struct uffdio_zeropage uffd_zeropage; - uffd_zeropage.range.start = 0; - uffd_zeropage.range.len = 0; - uffd_zeropage.mode = 0; - int ret = ioctl(uffd_, UFFDIO_ZEROPAGE, &uffd_zeropage); - CHECK_EQ(ret, -1); - CHECK_EQ(errno, EINVAL); - } - // Continue ioctl - { - struct uffdio_continue uffd_continue; - uffd_continue.range.start = 0; - uffd_continue.range.len = 0; - uffd_continue.mode = 0; - int ret = ioctl(uffd_, UFFDIO_CONTINUE, &uffd_continue); - CHECK_EQ(ret, -1); - CHECK_EQ(errno, EINVAL); - } - // Wake ioctl - { - struct uffdio_range uffd_range = {.start = 0, .len = 0}; - int ret = ioctl(uffd_, UFFDIO_WAKE, &uffd_range); - CHECK_EQ(ret, -1); - CHECK_EQ(errno, EINVAL); - } - // Unregister ioctl - { - struct uffdio_range uffd_range = {.start = 0, .len = 0}; - int ret = ioctl(uffd_, UFFDIO_UNREGISTER, &uffd_range); - CHECK_EQ(ret, -1); - CHECK_EQ(errno, EINVAL); - } - } -} -#else -void Heap::MaybePerformUffdIoctls(GcCause cause ATTRIBUTE_UNUSED, - uint32_t requested_gc_num ATTRIBUTE_UNUSED) const {} -#endif - void Heap::ConcurrentGC(Thread* self, GcCause cause, bool force_full, uint32_t requested_gc_num) { if (!Runtime::Current()->IsShuttingDown(self)) { // Wait for any GCs currently running to finish. If this incremented GC number, we're done. @@ -3905,7 +3926,7 @@ void Heap::ConcurrentGC(Thread* self, GcCause cause, bool force_full, uint32_t r } // If we can't run the GC type we wanted to run, find the next appropriate one and try // that instead. E.g. can't do partial, so do full instead. - // We must ensure that we run something that ends up inrementing gcs_completed_. + // We must ensure that we run something that ends up incrementing gcs_completed_. // In the kGcTypePartial case, the initial CollectGarbageInternal call may not have that // effect, but the subsequent KGcTypeFull call will. if (CollectGarbageInternal(next_gc_type, cause, false, requested_gc_num) @@ -3919,12 +3940,9 @@ void Heap::ConcurrentGC(Thread* self, GcCause cause, bool force_full, uint32_t r if (gc_type > next_gc_type && CollectGarbageInternal(gc_type, cause, false, requested_gc_num) != collector::kGcTypeNone) { - MaybePerformUffdIoctls(cause, requested_gc_num); break; } } - } else { - MaybePerformUffdIoctls(cause, requested_gc_num); } } } @@ -3956,16 +3974,6 @@ void Heap::RequestCollectorTransition(CollectorType desired_collector_type, uint // For CC, we invoke a full compaction when going to the background, but the collector type // doesn't change. DCHECK_EQ(desired_collector_type_, kCollectorTypeCCBackground); - // App's allocations (since last GC) more than the threshold then do TransitionGC - // when the app was in background. If not then don't do TransitionGC. 
- size_t num_bytes_allocated_since_gc = GetBytesAllocated() - num_bytes_alive_after_gc_; - if (num_bytes_allocated_since_gc < - (UnsignedDifference(target_footprint_.load(std::memory_order_relaxed), - num_bytes_alive_after_gc_)/4) - && !kStressCollectorTransition - && !IsLowMemoryMode()) { - return; - } } DCHECK_NE(collector_type_, kCollectorTypeCCBackground); CollectorTransitionTask* added_task = nullptr; @@ -4076,12 +4084,6 @@ void Heap::RevokeAllThreadLocalBuffers() { } } -void Heap::RunFinalization(JNIEnv* env, uint64_t timeout) { - env->CallStaticVoidMethod(WellKnownClasses::dalvik_system_VMRuntime, - WellKnownClasses::dalvik_system_VMRuntime_runFinalization, - static_cast<jlong>(timeout)); -} - // For GC triggering purposes, we count old (pre-last-GC) and new native allocations as // different fractions of Java allocations. // For now, we essentially do not count old native allocations at all, so that we can preserve the @@ -4167,7 +4169,7 @@ inline void Heap::CheckGCForNative(Thread* self) { // About kNotifyNativeInterval allocations have occurred. Check whether we should garbage collect. void Heap::NotifyNativeAllocations(JNIEnv* env) { native_objects_notified_.fetch_add(kNotifyNativeInterval, std::memory_order_relaxed); - CheckGCForNative(ThreadForEnv(env)); + CheckGCForNative(Thread::ForEnv(env)); } // Register a native allocation with an explicit size. @@ -4181,7 +4183,7 @@ void Heap::RegisterNativeAllocation(JNIEnv* env, size_t bytes) { native_objects_notified_.fetch_add(1, std::memory_order_relaxed); if (objects_notified % kNotifyNativeInterval == kNotifyNativeInterval - 1 || bytes > kCheckImmediatelyThreshold) { - CheckGCForNative(ThreadForEnv(env)); + CheckGCForNative(Thread::ForEnv(env)); } // Heap profiler treats this as a Java allocation with a null object. JHPCheckNonTlabSampleAllocation(Thread::Current(), nullptr, bytes); @@ -4280,7 +4282,7 @@ void Heap::SweepAllocationRecords(IsMarkedVisitor* visitor) const { } void Heap::AllowNewAllocationRecords() const { - CHECK(!kUseReadBarrier); + CHECK(!gUseReadBarrier); MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_); AllocRecordObjectMap* allocation_records = GetAllocationRecords(); if (allocation_records != nullptr) { @@ -4289,7 +4291,7 @@ void Heap::AllowNewAllocationRecords() const { } void Heap::DisallowNewAllocationRecords() const { - CHECK(!kUseReadBarrier); + CHECK(!gUseReadBarrier); MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_); AllocRecordObjectMap* allocation_records = GetAllocationRecords(); if (allocation_records != nullptr) { @@ -4412,12 +4414,15 @@ void Heap::CheckGcStressMode(Thread* self, ObjPtr<mirror::Object>* obj) { } void Heap::DisableGCForShutdown() { - Thread* const self = Thread::Current(); - CHECK(Runtime::Current()->IsShuttingDown(self)); - MutexLock mu(self, *gc_complete_lock_); + MutexLock mu(Thread::Current(), *gc_complete_lock_); gc_disabled_for_shutdown_ = true; } +bool Heap::IsGCDisabledForShutdown() const { + MutexLock mu(Thread::Current(), *gc_complete_lock_); + return gc_disabled_for_shutdown_; +} + bool Heap::ObjectIsInBootImageSpace(ObjPtr<mirror::Object> obj) const { DCHECK_EQ(IsBootImageAddress(obj.Ptr()), any_of(boot_image_spaces_.begin(), @@ -4494,8 +4499,13 @@ mirror::Object* Heap::AllocWithNewTLAB(Thread* self, DCHECK_LE(alloc_size, self->TlabSize()); } else if (allocator_type == kAllocatorTypeTLAB) { DCHECK(bump_pointer_space_ != nullptr); + // Try to allocate a page-aligned TLAB (not necessary though). 
+ // TODO: for large allocations, which are rare, maybe we should allocate + // that object and return. There is no need to revoke the current TLAB, + // particularly if it's mostly unutilized. + size_t def_pr_tlab_size = RoundDown(alloc_size + kDefaultTLABSize, kPageSize) - alloc_size; size_t next_tlab_size = JHPCalculateNextTlabSize(self, - kDefaultTLABSize, + def_pr_tlab_size, alloc_size, &take_sample, &bytes_until_sample); @@ -4658,42 +4668,33 @@ void Heap::PostForkChildAction(Thread* self) { uint64_t last_adj_time = NanoTime(); next_gc_type_ = NonStickyGcType(); // Always start with a full gc. -#if defined(__BIONIC__) && defined(ART_TARGET) - uffd_ = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); - if (uffd_ >= 0) { - struct uffdio_api api = {.api = UFFD_API, .features = 0}; - int ret = ioctl(uffd_, UFFDIO_API, &api); - CHECK_EQ(ret, 0) << "ioctl_userfaultfd: API: " << strerror(errno); - } else { - // The syscall should fail only if it doesn't exist in the kernel or if it's - // denied by SELinux. - CHECK(errno == ENOSYS || errno == EACCES) << "userfaultfd: " << strerror(errno); + LOG(INFO) << "Using " << foreground_collector_type_ << " GC."; + if (gUseUserfaultfd) { + DCHECK_NE(mark_compact_, nullptr); + mark_compact_->CreateUserfaultfd(/*post_fork*/true); } -#endif // Temporarily increase target_footprint_ and concurrent_start_bytes_ to // max values to avoid GC during app launch. - if (!IsLowMemoryMode()) { - // Set target_footprint_ to the largest allowed value. - SetIdealFootprint(growth_limit_); - SetDefaultConcurrentStartBytes(); - - // Shrink heap after kPostForkMaxHeapDurationMS, to force a memory hog process to GC. - // This remains high enough that many processes will continue without a GC. - if (initial_heap_size_ < growth_limit_) { - size_t first_shrink_size = std::max(growth_limit_ / 4, initial_heap_size_); - last_adj_time += MsToNs(kPostForkMaxHeapDurationMS); + // Set target_footprint_ to the largest allowed value. + SetIdealFootprint(growth_limit_); + SetDefaultConcurrentStartBytes(); + + // Shrink heap after kPostForkMaxHeapDurationMS, to force a memory hog process to GC. + // This remains high enough that many processes will continue without a GC. + if (initial_heap_size_ < growth_limit_) { + size_t first_shrink_size = std::max(growth_limit_ / 4, initial_heap_size_); + last_adj_time += MsToNs(kPostForkMaxHeapDurationMS); + GetTaskProcessor()->AddTask( + self, new ReduceTargetFootprintTask(last_adj_time, first_shrink_size, starting_gc_num)); + // Shrink to a small value after a substantial time period. This will typically force a + // GC if none has occurred yet. Has no effect if there was a GC before this anyway, which + // is commonly the case, e.g. because of a process transition. + if (initial_heap_size_ < first_shrink_size) { + last_adj_time += MsToNs(4 * kPostForkMaxHeapDurationMS); GetTaskProcessor()->AddTask( - self, new ReduceTargetFootprintTask(last_adj_time, first_shrink_size, starting_gc_num)); - // Shrink to a small value after a substantial time period. This will typically force a - // GC if none has occurred yet. Has no effect if there was a GC before this anyway, which - // is commonly the case, e.g. because of a process transition. 
- if (initial_heap_size_ < first_shrink_size) { - last_adj_time += MsToNs(4 * kPostForkMaxHeapDurationMS); - GetTaskProcessor()->AddTask( - self, - new ReduceTargetFootprintTask(last_adj_time, initial_heap_size_, starting_gc_num)); - } + self, + new ReduceTargetFootprintTask(last_adj_time, initial_heap_size_, starting_gc_num)); } } // Schedule a GC after a substantial period of time. This will become a no-op if another GC is diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index 232c96b914..31a1b2b6a2 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -34,6 +34,7 @@ #include "base/time_utils.h" #include "gc/collector/gc_type.h" #include "gc/collector/iteration.h" +#include "gc/collector/mark_compact.h" #include "gc/collector_type.h" #include "gc/gc_cause.h" #include "gc/space/large_object_space.h" @@ -150,7 +151,7 @@ class Heap { static constexpr size_t kMinLargeObjectThreshold = 3 * kPageSize; static constexpr size_t kDefaultLargeObjectThreshold = kMinLargeObjectThreshold; // Whether or not parallel GC is enabled. If not, then we never create the thread pool. - static constexpr bool kDefaultEnableParallelGC = false; + static constexpr bool kDefaultEnableParallelGC = true; static uint8_t* const kPreferredAllocSpaceBegin; // Whether or not we use the free list large object space. Only use it if USE_ART_LOW_4G_ALLOCATOR @@ -181,10 +182,8 @@ class Heap { // How often we allow heap trimming to happen (nanoseconds). static constexpr uint64_t kHeapTrimWait = MsToNs(5000); - // How long we wait after a transition request to perform a collector transition (nanoseconds). - static constexpr uint64_t kCollectorTransitionWait = MsToNs(5000); - // Whether the transition-wait applies or not. Zero wait will stress the - // transition code and collector, but increases jank probability. + // Whether the transition-GC heap threshold condition applies or not for non-low memory devices. + // Stressing GC will bypass the heap threshold condition. DECLARE_RUNTIME_DEBUG_FLAG(kStressCollectorTransition); // Create a heap with the requested sizes. The possible empty @@ -385,6 +384,9 @@ class Heap { void ThreadFlipBegin(Thread* self) REQUIRES(!*thread_flip_lock_); void ThreadFlipEnd(Thread* self) REQUIRES(!*thread_flip_lock_); + // Ensures that the obj doesn't cause userfaultfd in JNI critical calls. + void EnsureObjectUserfaulted(ObjPtr<mirror::Object> obj) REQUIRES_SHARED(Locks::mutator_lock_); + // Clear all of the mark bits, doesn't clear bitmaps which have the same live bits as mark bits. // Mutator lock is required for GetContinuousSpaces. void ClearMarkedObjects() @@ -578,6 +580,9 @@ class Heap { return region_space_; } + space::BumpPointerSpace* GetBumpPointerSpace() const { + return bump_pointer_space_; + } // Implements java.lang.Runtime.maxMemory, returning the maximum amount of memory a program can // consume. For a regular VM this would relate to the -Xmx option and would return -1 if no Xmx // were specified. Android apps start with a growth limit (small heap size) which is @@ -661,6 +666,10 @@ class Heap { return live_stack_.get(); } + accounting::ObjectStack* GetAllocationStack() REQUIRES_SHARED(Locks::heap_bitmap_lock_) { + return allocation_stack_.get(); + } + void PreZygoteFork() NO_THREAD_SAFETY_ANALYSIS; // Mark and empty stack. @@ -760,8 +769,10 @@ class Heap { REQUIRES(!*gc_complete_lock_); void ResetGcPerformanceInfo() REQUIRES(!*gc_complete_lock_); - // Thread pool. - void CreateThreadPool(); + // Thread pool. 
Create either the given number of threads, or as per the + // values of conc_gc_threads_ and parallel_gc_threads_. + void CreateThreadPool(size_t num_threads = 0); + void WaitForWorkersToBeCreated(); void DeleteThreadPool(); ThreadPool* GetThreadPool() { return thread_pool_.get(); @@ -812,10 +823,22 @@ class Heap { return active_collector; } - CollectorType CurrentCollectorType() { + collector::MarkCompact* MarkCompactCollector() { + DCHECK(!gUseUserfaultfd || mark_compact_ != nullptr); + return mark_compact_; + } + + bool IsPerformingUffdCompaction() { return gUseUserfaultfd && mark_compact_->IsCompacting(); } + + CollectorType CurrentCollectorType() const { + DCHECK(!gUseUserfaultfd || collector_type_ == kCollectorTypeCMC); return collector_type_; } + bool IsMovingGc() const { return IsMovingGc(CurrentCollectorType()); } + + CollectorType GetForegroundCollectorType() const { return foreground_collector_type_; } + bool IsGcConcurrentAndMoving() const { if (IsGcConcurrent() && IsMovingGc(collector_type_)) { // Assume no transition when a concurrent moving collector is used. @@ -939,6 +962,7 @@ class Heap { REQUIRES(!Locks::alloc_tracker_lock_); void DisableGCForShutdown() REQUIRES(!*gc_complete_lock_); + bool IsGCDisabledForShutdown() const REQUIRES(!*gc_complete_lock_); // Create a new alloc space and compact default alloc space to it. HomogeneousSpaceCompactResult PerformHomogeneousSpaceCompact() @@ -1001,9 +1025,6 @@ class Heap { return main_space_backup_ != nullptr; } - // Attempt to use all the userfaultfd related ioctls. - void MaybePerformUffdIoctls(GcCause cause, uint32_t requested_gc_num) const; - // Size_t saturating arithmetic static ALWAYS_INLINE size_t UnsignedDifference(size_t x, size_t y) { return x > y ? x - y : 0; @@ -1019,19 +1040,11 @@ class Heap { allocator_type != kAllocatorTypeTLAB && allocator_type != kAllocatorTypeRegion; } - static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) { - if (kUseReadBarrier) { - // Read barrier may have the TLAB allocator but is always concurrent. TODO: clean this up. - return true; - } - return - allocator_type != kAllocatorTypeTLAB && - allocator_type != kAllocatorTypeBumpPointer; - } static bool IsMovingGc(CollectorType collector_type) { return collector_type == kCollectorTypeCC || collector_type == kCollectorTypeSS || + collector_type == kCollectorTypeCMC || collector_type == kCollectorTypeCCBackground || collector_type == kCollectorTypeHomogeneousSpaceCompact; } @@ -1117,9 +1130,6 @@ class Heap { size_t alloc_size, bool grow); - // Run the finalizers. If timeout is non zero, then we use the VMRuntime version. - void RunFinalization(JNIEnv* env, uint64_t timeout); - // Blocks the caller until the garbage collector becomes idle and returns the type of GC we // waited for. collector::GcType WaitForGcToCompleteLocked(GcCause cause, Thread* self) @@ -1223,6 +1233,7 @@ class Heap { // sweep GC, false for other GC types. bool IsGcConcurrent() const ALWAYS_INLINE { return collector_type_ == kCollectorTypeCC || + collector_type_ == kCollectorTypeCMC || collector_type_ == kCollectorTypeCMS || collector_type_ == kCollectorTypeCCBackground; } @@ -1326,7 +1337,7 @@ class Heap { // The current collector type. CollectorType collector_type_; // Which collector we use when the app is in the foreground. - CollectorType foreground_collector_type_; + const CollectorType foreground_collector_type_; // Which collector we will use when the app is notified of a transition to background. 
CollectorType background_collector_type_; // Desired collector type, heap trimming daemon transitions the heap if it is != collector_type_. @@ -1437,8 +1448,9 @@ class Heap { // Computed with foreground-multiplier in GrowForUtilization() when run in // jank non-perceptible state. On update to process state from background to - // foreground we set target_footprint_ to this value. + // foreground we set target_footprint_ and concurrent_start_bytes_ to the corresponding value. size_t min_foreground_target_footprint_ GUARDED_BY(process_state_update_lock_); + size_t min_foreground_concurrent_start_bytes_ GUARDED_BY(process_state_update_lock_); // When num_bytes_allocated_ exceeds this amount then a concurrent GC should be requested so that // it completes ahead of an allocation failing. @@ -1588,6 +1600,7 @@ class Heap { std::vector<collector::GarbageCollector*> garbage_collectors_; collector::SemiSpace* semi_space_collector_; + collector::MarkCompact* mark_compact_; Atomic<collector::ConcurrentCopying*> active_concurrent_copying_collector_; collector::ConcurrentCopying* young_concurrent_copying_collector_; collector::ConcurrentCopying* concurrent_copying_collector_; @@ -1680,9 +1693,6 @@ class Heap { // Stack trace hashes that we already saw, std::unordered_set<uint64_t> seen_backtraces_ GUARDED_BY(backtrace_lock_); - // Userfaultfd file descriptor. - // TODO (lokeshgidra): remove this when the userfaultfd-based GC is in use. - int uffd_; // We disable GC when we are shutting down the runtime in case there are daemon threads still // allocating. bool gc_disabled_for_shutdown_ GUARDED_BY(gc_complete_lock_); @@ -1712,6 +1722,7 @@ class Heap { friend class CollectorTransitionTask; friend class collector::GarbageCollector; friend class collector::ConcurrentCopying; + friend class collector::MarkCompact; friend class collector::MarkSweep; friend class collector::SemiSpace; friend class GCCriticalSection; diff --git a/runtime/gc/heap_test.cc b/runtime/gc/heap_test.cc index 5e8c1e368a..b569241bdc 100644 --- a/runtime/gc/heap_test.cc +++ b/runtime/gc/heap_test.cc @@ -14,6 +14,9 @@ * limitations under the License. */ +#include <algorithm> + +#include "base/metrics/metrics.h" #include "class_linker-inl.h" #include "common_runtime_test.h" #include "gc/accounting/card_table-inl.h" @@ -30,6 +33,10 @@ namespace gc { class HeapTest : public CommonRuntimeTest { public: + HeapTest() { + use_boot_image_ = true; // Make the Runtime creation cheaper. + } + void SetUp() override { MemMap::Init(); std::string error_msg; @@ -99,7 +106,160 @@ TEST_F(HeapTest, DumpGCPerformanceOnShutdown) { Runtime::Current()->SetDumpGCPerformanceOnShutdown(true); } +bool AnyIsFalse(bool x, bool y) { return !x || !y; } + +TEST_F(HeapTest, GCMetrics) { + // Allocate a few string objects (to be collected), then trigger garbage + // collection, and check that GC metrics are updated (where applicable). + { + constexpr const size_t kNumObj = 128; + ScopedObjectAccess soa(Thread::Current()); + StackHandleScope<kNumObj> hs(soa.Self()); + for (size_t i = 0u; i < kNumObj; ++i) { + Handle<mirror::String> string [[maybe_unused]] ( + hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "test"))); + } + } + Heap* heap = Runtime::Current()->GetHeap(); + heap->CollectGarbage(/* clear_soft_references= */ false); + + // ART Metrics. + metrics::ArtMetrics* metrics = Runtime::Current()->GetMetrics(); + // ART full-heap GC metrics. 
+ metrics::MetricsBase<int64_t>* full_gc_collection_time = metrics->FullGcCollectionTime();
+ metrics::MetricsBase<uint64_t>* full_gc_count = metrics->FullGcCount();
+ metrics::MetricsBase<uint64_t>* full_gc_count_delta = metrics->FullGcCountDelta();
+ metrics::MetricsBase<int64_t>* full_gc_throughput = metrics->FullGcThroughput();
+ metrics::MetricsBase<int64_t>* full_gc_tracing_throughput = metrics->FullGcTracingThroughput();
+ metrics::MetricsBase<uint64_t>* full_gc_throughput_avg = metrics->FullGcThroughputAvg();
+ metrics::MetricsBase<uint64_t>* full_gc_tracing_throughput_avg =
+ metrics->FullGcTracingThroughputAvg();
+ metrics::MetricsBase<uint64_t>* full_gc_scanned_bytes = metrics->FullGcScannedBytes();
+ metrics::MetricsBase<uint64_t>* full_gc_scanned_bytes_delta = metrics->FullGcScannedBytesDelta();
+ metrics::MetricsBase<uint64_t>* full_gc_freed_bytes = metrics->FullGcFreedBytes();
+ metrics::MetricsBase<uint64_t>* full_gc_freed_bytes_delta = metrics->FullGcFreedBytesDelta();
+ metrics::MetricsBase<uint64_t>* full_gc_duration = metrics->FullGcDuration();
+ metrics::MetricsBase<uint64_t>* full_gc_duration_delta = metrics->FullGcDurationDelta();
+ // ART young-generation GC metrics.
+ metrics::MetricsBase<int64_t>* young_gc_collection_time = metrics->YoungGcCollectionTime();
+ metrics::MetricsBase<uint64_t>* young_gc_count = metrics->YoungGcCount();
+ metrics::MetricsBase<uint64_t>* young_gc_count_delta = metrics->YoungGcCountDelta();
+ metrics::MetricsBase<int64_t>* young_gc_throughput = metrics->YoungGcThroughput();
+ metrics::MetricsBase<int64_t>* young_gc_tracing_throughput = metrics->YoungGcTracingThroughput();
+ metrics::MetricsBase<uint64_t>* young_gc_throughput_avg = metrics->YoungGcThroughputAvg();
+ metrics::MetricsBase<uint64_t>* young_gc_tracing_throughput_avg =
+ metrics->YoungGcTracingThroughputAvg();
+ metrics::MetricsBase<uint64_t>* young_gc_scanned_bytes = metrics->YoungGcScannedBytes();
+ metrics::MetricsBase<uint64_t>* young_gc_scanned_bytes_delta =
+ metrics->YoungGcScannedBytesDelta();
+ metrics::MetricsBase<uint64_t>* young_gc_freed_bytes = metrics->YoungGcFreedBytes();
+ metrics::MetricsBase<uint64_t>* young_gc_freed_bytes_delta = metrics->YoungGcFreedBytesDelta();
+ metrics::MetricsBase<uint64_t>* young_gc_duration = metrics->YoungGcDuration();
+ metrics::MetricsBase<uint64_t>* young_gc_duration_delta = metrics->YoungGcDurationDelta();
+
+ CollectorType fg_collector_type = heap->GetForegroundCollectorType();
+ if (fg_collector_type == kCollectorTypeCC || fg_collector_type == kCollectorTypeCMC) {
+ // Only the Concurrent Copying and Concurrent Mark-Compact collectors enable
+ // GC metrics at the moment.
+ if (heap->GetUseGenerationalCC()) {
+ // Check that full-heap and/or young-generation GC metrics are non-null
+ // after triggering the collection.
+ EXPECT_PRED2(
+ AnyIsFalse, full_gc_collection_time->IsNull(), young_gc_collection_time->IsNull());
+ EXPECT_PRED2(AnyIsFalse, full_gc_count->IsNull(), young_gc_count->IsNull());
+ EXPECT_PRED2(AnyIsFalse, full_gc_count_delta->IsNull(), young_gc_count_delta->IsNull());
+ EXPECT_PRED2(AnyIsFalse, full_gc_throughput->IsNull(), young_gc_throughput->IsNull());
+ EXPECT_PRED2(
+ AnyIsFalse, full_gc_tracing_throughput->IsNull(), young_gc_tracing_throughput->IsNull());
+ EXPECT_PRED2(AnyIsFalse, full_gc_throughput_avg->IsNull(), young_gc_throughput_avg->IsNull());
+ EXPECT_PRED2(AnyIsFalse,
+ full_gc_tracing_throughput_avg->IsNull(),
+ young_gc_tracing_throughput_avg->IsNull());
+ EXPECT_PRED2(AnyIsFalse, full_gc_scanned_bytes->IsNull(), young_gc_scanned_bytes->IsNull());
+ EXPECT_PRED2(AnyIsFalse,
+ full_gc_scanned_bytes_delta->IsNull(),
+ young_gc_scanned_bytes_delta->IsNull());
+ EXPECT_PRED2(AnyIsFalse, full_gc_freed_bytes->IsNull(), young_gc_freed_bytes->IsNull());
+ EXPECT_PRED2(
+ AnyIsFalse, full_gc_freed_bytes_delta->IsNull(), young_gc_freed_bytes_delta->IsNull());
+ // We have observed that sometimes the GC duration (both for full-heap and
+ // young-generation collections) is null (b/271112044). Temporarily
+ // suspend the following checks while we investigate.
+ //
+ // TODO(b/271112044): Investigate and adjust these expectations and/or the
+ // corresponding metric logic.
+#if 0
+ EXPECT_PRED2(AnyIsFalse, full_gc_duration->IsNull(), young_gc_duration->IsNull());
+ EXPECT_PRED2(AnyIsFalse, full_gc_duration_delta->IsNull(), young_gc_duration_delta->IsNull());
+#endif
+ } else {
+ // Check that only full-heap GC metrics are non-null after triggering the collection.
+ EXPECT_FALSE(full_gc_collection_time->IsNull());
+ EXPECT_FALSE(full_gc_count->IsNull());
+ EXPECT_FALSE(full_gc_count_delta->IsNull());
+ EXPECT_FALSE(full_gc_throughput->IsNull());
+ EXPECT_FALSE(full_gc_tracing_throughput->IsNull());
+ EXPECT_FALSE(full_gc_throughput_avg->IsNull());
+ EXPECT_FALSE(full_gc_tracing_throughput_avg->IsNull());
+ EXPECT_FALSE(full_gc_scanned_bytes->IsNull());
+ EXPECT_FALSE(full_gc_scanned_bytes_delta->IsNull());
+ EXPECT_FALSE(full_gc_freed_bytes->IsNull());
+ EXPECT_FALSE(full_gc_freed_bytes_delta->IsNull());
+ EXPECT_FALSE(full_gc_duration->IsNull());
+ EXPECT_FALSE(full_gc_duration_delta->IsNull());
+
+ EXPECT_TRUE(young_gc_collection_time->IsNull());
+ EXPECT_TRUE(young_gc_count->IsNull());
+ EXPECT_TRUE(young_gc_count_delta->IsNull());
+ EXPECT_TRUE(young_gc_throughput->IsNull());
+ EXPECT_TRUE(young_gc_tracing_throughput->IsNull());
+ EXPECT_TRUE(young_gc_throughput_avg->IsNull());
+ EXPECT_TRUE(young_gc_tracing_throughput_avg->IsNull());
+ EXPECT_TRUE(young_gc_scanned_bytes->IsNull());
+ EXPECT_TRUE(young_gc_scanned_bytes_delta->IsNull());
+ EXPECT_TRUE(young_gc_freed_bytes->IsNull());
+ EXPECT_TRUE(young_gc_freed_bytes_delta->IsNull());
+ EXPECT_TRUE(young_gc_duration->IsNull());
+ EXPECT_TRUE(young_gc_duration_delta->IsNull());
+ }
+ } else {
+ // Check that all metrics are null after triggering the collection.
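On the EXPECT_PRED2 checks in the generational branch above: EXPECT_PRED2(AnyIsFalse, a, b) passes when at least one of the two IsNull() results is false, i.e. at least one of the full-heap/young-generation counters was populated by the collection, without requiring both. A tiny stand-alone illustration with plain asserts (the values are hypothetical, not taken from a test run):

#include <cassert>

bool AnyIsFalse(bool x, bool y) { return !x || !y; }

int main() {
  // Hypothetical outcome: only the young-generation metrics were reported.
  bool full_is_null = true;
  bool young_is_null = false;
  assert(AnyIsFalse(full_is_null, young_is_null));  // At least one metric was populated.

  // If neither metric had been populated, the expectation would fail.
  assert(!AnyIsFalse(/*x=*/true, /*y=*/true));
  return 0;
}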
+ EXPECT_TRUE(full_gc_collection_time->IsNull()); + EXPECT_TRUE(full_gc_count->IsNull()); + EXPECT_TRUE(full_gc_count_delta->IsNull()); + EXPECT_TRUE(full_gc_throughput->IsNull()); + EXPECT_TRUE(full_gc_tracing_throughput->IsNull()); + EXPECT_TRUE(full_gc_throughput_avg->IsNull()); + EXPECT_TRUE(full_gc_tracing_throughput_avg->IsNull()); + EXPECT_TRUE(full_gc_scanned_bytes->IsNull()); + EXPECT_TRUE(full_gc_scanned_bytes_delta->IsNull()); + EXPECT_TRUE(full_gc_freed_bytes->IsNull()); + EXPECT_TRUE(full_gc_freed_bytes_delta->IsNull()); + EXPECT_TRUE(full_gc_duration->IsNull()); + EXPECT_TRUE(full_gc_duration_delta->IsNull()); + + EXPECT_TRUE(young_gc_collection_time->IsNull()); + EXPECT_TRUE(young_gc_count->IsNull()); + EXPECT_TRUE(young_gc_count_delta->IsNull()); + EXPECT_TRUE(young_gc_throughput->IsNull()); + EXPECT_TRUE(young_gc_tracing_throughput->IsNull()); + EXPECT_TRUE(young_gc_throughput_avg->IsNull()); + EXPECT_TRUE(young_gc_tracing_throughput_avg->IsNull()); + EXPECT_TRUE(young_gc_scanned_bytes->IsNull()); + EXPECT_TRUE(young_gc_scanned_bytes_delta->IsNull()); + EXPECT_TRUE(young_gc_freed_bytes->IsNull()); + EXPECT_TRUE(young_gc_freed_bytes_delta->IsNull()); + EXPECT_TRUE(young_gc_duration->IsNull()); + EXPECT_TRUE(young_gc_duration_delta->IsNull()); + } +} + class ZygoteHeapTest : public CommonRuntimeTest { + public: + ZygoteHeapTest() { + use_boot_image_ = true; // Make the Runtime creation cheaper. + } + void SetUpRuntimeOptions(RuntimeOptions* options) override { CommonRuntimeTest::SetUpRuntimeOptions(options); options->push_back(std::make_pair("-Xzygote", nullptr)); diff --git a/runtime/gc/heap_verification_test.cc b/runtime/gc/heap_verification_test.cc index ca6a30b11d..a7583fe7f1 100644 --- a/runtime/gc/heap_verification_test.cc +++ b/runtime/gc/heap_verification_test.cc @@ -26,14 +26,16 @@ #include "mirror/string.h" #include "runtime.h" #include "scoped_thread_state_change-inl.h" -#include "verification.h" +#include "verification-inl.h" namespace art { namespace gc { class VerificationTest : public CommonRuntimeTest { protected: - VerificationTest() {} + VerificationTest() { + use_boot_image_ = true; // Make the Runtime creation cheaper. 
+ } template <class T> ObjPtr<mirror::ObjectArray<T>> AllocObjectArray(Thread* self, size_t length) @@ -76,11 +78,11 @@ TEST_F(VerificationTest, IsValidClassOrNotInHeap) { Handle<mirror::String> string( hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "test"))); const Verification* const v = Runtime::Current()->GetHeap()->GetVerification(); - EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(1))); - EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(4))); + EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(1))); + EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(4))); EXPECT_FALSE(v->IsValidClass(nullptr)); EXPECT_TRUE(v->IsValidClass(string->GetClass())); - EXPECT_FALSE(v->IsValidClass(string.Get())); + EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(string.Get()))); } TEST_F(VerificationTest, IsValidClassInHeap) { @@ -95,9 +97,9 @@ TEST_F(VerificationTest, IsValidClassInHeap) { Handle<mirror::String> string( hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), "test"))); const Verification* const v = Runtime::Current()->GetHeap()->GetVerification(); - const uintptr_t uint_klass = reinterpret_cast<uintptr_t>(string->GetClass()); - EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(uint_klass - kObjectAlignment))); - EXPECT_FALSE(v->IsValidClass(reinterpret_cast<const void*>(&uint_klass))); + uintptr_t uint_klass = reinterpret_cast<uintptr_t>(string->GetClass()); + EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(uint_klass - kObjectAlignment))); + EXPECT_FALSE(v->IsValidClass(reinterpret_cast<mirror::Class*>(&uint_klass))); } TEST_F(VerificationTest, DumpInvalidObjectInfo) { diff --git a/runtime/gc/reference_processor.cc b/runtime/gc/reference_processor.cc index 5e41ee4ef8..f24c94279c 100644 --- a/runtime/gc/reference_processor.cc +++ b/runtime/gc/reference_processor.cc @@ -90,7 +90,7 @@ void ReferenceProcessor::BroadcastForSlowPath(Thread* self) { ObjPtr<mirror::Object> ReferenceProcessor::GetReferent(Thread* self, ObjPtr<mirror::Reference> reference) { auto slow_path_required = [this, self]() REQUIRES_SHARED(Locks::mutator_lock_) { - return kUseReadBarrier ? !self->GetWeakRefAccessEnabled() : SlowPathEnabled(); + return gUseReadBarrier ? !self->GetWeakRefAccessEnabled() : SlowPathEnabled(); }; if (!slow_path_required()) { return reference->GetReferent(); @@ -118,10 +118,10 @@ ObjPtr<mirror::Object> ReferenceProcessor::GetReferent(Thread* self, // Keeping reference_processor_lock_ blocks the broadcast when we try to reenable the fast path. while (slow_path_required()) { DCHECK(collector_ != nullptr); - constexpr bool kOtherReadBarrier = kUseReadBarrier && !kUseBakerReadBarrier; + const bool other_read_barrier = !kUseBakerReadBarrier && gUseReadBarrier; if (UNLIKELY(reference->IsFinalizerReferenceInstance() || rp_state_ == RpState::kStarting /* too early to determine mark state */ - || (kOtherReadBarrier && reference->IsPhantomReferenceInstance()))) { + || (other_read_barrier && reference->IsPhantomReferenceInstance()))) { // Odd cases in which it doesn't hurt to just wait, or the wait is likely to be very brief. 
// Check and run the empty checkpoint before blocking so the empty checkpoint will work in the @@ -210,7 +210,7 @@ void ReferenceProcessor::ProcessReferences(Thread* self, TimingLogger* timings) } { MutexLock mu(self, *Locks::reference_processor_lock_); - if (!kUseReadBarrier) { + if (!gUseReadBarrier) { CHECK_EQ(SlowPathEnabled(), concurrent_) << "Slow path must be enabled iff concurrent"; } else { // Weak ref access is enabled at Zygote compaction by SemiSpace (concurrent_ == false). @@ -305,7 +305,7 @@ void ReferenceProcessor::ProcessReferences(Thread* self, TimingLogger* timings) // could result in a stale is_marked_callback_ being called before the reference processing // starts since there is a small window of time where slow_path_enabled_ is enabled but the // callback isn't yet set. - if (!kUseReadBarrier && concurrent_) { + if (!gUseReadBarrier && concurrent_) { // Done processing, disable the slow path and broadcast to the waiters. DisableSlowPath(self); } @@ -363,9 +363,8 @@ class ClearedReferenceTask : public HeapTask { } void Run(Thread* thread) override { ScopedObjectAccess soa(thread); - jvalue args[1]; - args[0].l = cleared_references_; - InvokeWithJValues(soa, nullptr, WellKnownClasses::java_lang_ref_ReferenceQueue_add, args); + WellKnownClasses::java_lang_ref_ReferenceQueue_add->InvokeStatic<'V', 'L'>( + thread, soa.Decode<mirror::Object>(cleared_references_)); soa.Env()->DeleteGlobalRef(cleared_references_); } @@ -418,8 +417,8 @@ void ReferenceProcessor::ClearReferent(ObjPtr<mirror::Reference> ref) { void ReferenceProcessor::WaitUntilDoneProcessingReferences(Thread* self) { // Wait until we are done processing reference. - while ((!kUseReadBarrier && SlowPathEnabled()) || - (kUseReadBarrier && !self->GetWeakRefAccessEnabled())) { + while ((!gUseReadBarrier && SlowPathEnabled()) || + (gUseReadBarrier && !self->GetWeakRefAccessEnabled())) { // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the // presence of threads blocking for weak ref access. self->CheckEmptyCheckpointFromWeakRefAccess(Locks::reference_processor_lock_); diff --git a/runtime/gc/reference_queue_test.cc b/runtime/gc/reference_queue_test.cc index c680fb5781..c8e71b02ac 100644 --- a/runtime/gc/reference_queue_test.cc +++ b/runtime/gc/reference_queue_test.cc @@ -26,7 +26,12 @@ namespace art { namespace gc { -class ReferenceQueueTest : public CommonRuntimeTest {}; +class ReferenceQueueTest : public CommonRuntimeTest { + protected: + ReferenceQueueTest() { + use_boot_image_ = true; // Make the Runtime creation cheaper. 
+ } +}; TEST_F(ReferenceQueueTest, EnqueueDequeue) { Thread* self = Thread::Current(); diff --git a/runtime/gc/scoped_gc_critical_section.cc b/runtime/gc/scoped_gc_critical_section.cc index eaede43e79..7a0a6e8736 100644 --- a/runtime/gc/scoped_gc_critical_section.cc +++ b/runtime/gc/scoped_gc_critical_section.cc @@ -58,17 +58,5 @@ ScopedGCCriticalSection::~ScopedGCCriticalSection() { critical_section_.Exit(old_no_suspend_reason_); } -ScopedInterruptibleGCCriticalSection::ScopedInterruptibleGCCriticalSection( - Thread* self, - GcCause cause, - CollectorType type) : self_(self) { - DCHECK(self != nullptr); - Runtime::Current()->GetHeap()->StartGC(self_, cause, type); -} - -ScopedInterruptibleGCCriticalSection::~ScopedInterruptibleGCCriticalSection() { - Runtime::Current()->GetHeap()->FinishGC(self_, collector::kGcTypeNone); -} - } // namespace gc } // namespace art diff --git a/runtime/gc/scoped_gc_critical_section.h b/runtime/gc/scoped_gc_critical_section.h index b3a897c76b..8ad01580c2 100644 --- a/runtime/gc/scoped_gc_critical_section.h +++ b/runtime/gc/scoped_gc_critical_section.h @@ -59,19 +59,6 @@ class ScopedGCCriticalSection { const char* old_no_suspend_reason_; }; -// The use of ScopedGCCriticalSection should be preferred whenever possible. -// This class allows thread suspension but should never be used with allocations because of the -// deadlock risk. TODO: Add a new thread role for "no allocations" that still allows suspension. -class ScopedInterruptibleGCCriticalSection { - public: - ScopedInterruptibleGCCriticalSection(Thread* self, GcCause cause, CollectorType type); - ~ScopedInterruptibleGCCriticalSection(); - - private: - Thread* const self_; -}; - - } // namespace gc } // namespace art diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h index 20f7a93eb1..2774b9e71c 100644 --- a/runtime/gc/space/bump_pointer_space-inl.h +++ b/runtime/gc/space/bump_pointer_space-inl.h @@ -20,6 +20,7 @@ #include "bump_pointer_space.h" #include "base/bit_utils.h" +#include "mirror/object-inl.h" namespace art { namespace gc { @@ -89,6 +90,11 @@ inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) { return ret; } +inline mirror::Object* BumpPointerSpace::GetNextObject(mirror::Object* obj) { + const uintptr_t position = reinterpret_cast<uintptr_t>(obj) + obj->SizeOf(); + return reinterpret_cast<mirror::Object*>(RoundUp(position, kAlignment)); +} + } // namespace space } // namespace gc } // namespace art diff --git a/runtime/gc/space/bump_pointer_space-walk-inl.h b/runtime/gc/space/bump_pointer_space-walk-inl.h index 5d05ea2d65..a978f62c61 100644 --- a/runtime/gc/space/bump_pointer_space-walk-inl.h +++ b/runtime/gc/space/bump_pointer_space-walk-inl.h @@ -17,12 +17,14 @@ #ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_WALK_INL_H_ #define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_WALK_INL_H_ -#include "bump_pointer_space.h" +#include "bump_pointer_space-inl.h" #include "base/bit_utils.h" #include "mirror/object-inl.h" #include "thread-current-inl.h" +#include <memory> + namespace art { namespace gc { namespace space { @@ -32,6 +34,7 @@ inline void BumpPointerSpace::Walk(Visitor&& visitor) { uint8_t* pos = Begin(); uint8_t* end = End(); uint8_t* main_end = pos; + std::unique_ptr<std::vector<size_t>> block_sizes_copy; // Internal indirection w/ NO_THREAD_SAFETY_ANALYSIS. 
Optimally, we'd like to have an annotation // like // REQUIRES_AS(visitor.operator(mirror::Object*)) @@ -49,15 +52,17 @@ inline void BumpPointerSpace::Walk(Visitor&& visitor) { MutexLock mu(Thread::Current(), block_lock_); // If we have 0 blocks then we need to update the main header since we have bump pointer style // allocation into an unbounded region (actually bounded by Capacity()). - if (num_blocks_ == 0) { + if (block_sizes_.empty()) { UpdateMainBlock(); } main_end = Begin() + main_block_size_; - if (num_blocks_ == 0) { + if (block_sizes_.empty()) { // We don't have any other blocks, this means someone else may be allocating into the main // block. In this case, we don't want to try and visit the other blocks after the main block // since these could actually be part of the main block. end = main_end; + } else { + block_sizes_copy.reset(new std::vector<size_t>(block_sizes_.begin(), block_sizes_.end())); } } // Walk all of the objects in the main block first. @@ -66,31 +71,33 @@ inline void BumpPointerSpace::Walk(Visitor&& visitor) { // No read barrier because obj may not be a valid object. if (obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() == nullptr) { // There is a race condition where a thread has just allocated an object but not set the - // class. We can't know the size of this object, so we don't visit it and exit the function - // since there is guaranteed to be not other blocks. - return; + // class. We can't know the size of this object, so we don't visit it and break the loop + pos = main_end; + break; } else { no_thread_safety_analysis_visit(obj); pos = reinterpret_cast<uint8_t*>(GetNextObject(obj)); } } // Walk the other blocks (currently only TLABs). - while (pos < end) { - BlockHeader* header = reinterpret_cast<BlockHeader*>(pos); - size_t block_size = header->size_; - pos += sizeof(BlockHeader); // Skip the header so that we know where the objects - mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos); - const mirror::Object* end_obj = reinterpret_cast<const mirror::Object*>(pos + block_size); - CHECK_LE(reinterpret_cast<const uint8_t*>(end_obj), End()); - // We don't know how many objects are allocated in the current block. When we hit a null class - // assume its the end. TODO: Have a thread update the header when it flushes the block? - // No read barrier because obj may not be a valid object. - while (obj < end_obj && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) { - no_thread_safety_analysis_visit(obj); - obj = GetNextObject(obj); + if (block_sizes_copy != nullptr) { + for (size_t block_size : *block_sizes_copy) { + mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos); + const mirror::Object* end_obj = reinterpret_cast<const mirror::Object*>(pos + block_size); + CHECK_LE(reinterpret_cast<const uint8_t*>(end_obj), End()); + // We don't know how many objects are allocated in the current block. When we hit a null class + // assume it's the end. TODO: Have a thread update the header when it flushes the block? + // No read barrier because obj may not be a valid object. 
+ while (obj < end_obj && obj->GetClass<kDefaultVerifyFlags, kWithoutReadBarrier>() != nullptr) { + no_thread_safety_analysis_visit(obj); + obj = GetNextObject(obj); + } + pos += block_size; } - pos += block_size; + } else { + CHECK_EQ(end, main_end); } + CHECK_EQ(pos, end); } } // namespace space diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc index 3a0155a278..7753f73ca4 100644 --- a/runtime/gc/space/bump_pointer_space.cc +++ b/runtime/gc/space/bump_pointer_space.cc @@ -54,8 +54,9 @@ BumpPointerSpace::BumpPointerSpace(const std::string& name, uint8_t* begin, uint growth_end_(limit), objects_allocated_(0), bytes_allocated_(0), block_lock_("Block lock"), - main_block_size_(0), - num_blocks_(0) { + main_block_size_(0) { + // This constructor gets called only from Heap::PreZygoteFork(), which + // doesn't require a mark_bitmap. } BumpPointerSpace::BumpPointerSpace(const std::string& name, MemMap&& mem_map) @@ -68,8 +69,11 @@ BumpPointerSpace::BumpPointerSpace(const std::string& name, MemMap&& mem_map) growth_end_(mem_map_.End()), objects_allocated_(0), bytes_allocated_(0), block_lock_("Block lock", kBumpPointerSpaceBlockLock), - main_block_size_(0), - num_blocks_(0) { + main_block_size_(0) { + mark_bitmap_ = + accounting::ContinuousSpaceBitmap::Create("bump-pointer space live bitmap", + Begin(), + Capacity()); } void BumpPointerSpace::Clear() { @@ -86,7 +90,7 @@ void BumpPointerSpace::Clear() { growth_end_ = Limit(); { MutexLock mu(Thread::Current(), block_lock_); - num_blocks_ = 0; + block_sizes_.clear(); main_block_size_ = 0; } } @@ -97,11 +101,6 @@ void BumpPointerSpace::Dump(std::ostream& os) const { << reinterpret_cast<void*>(Limit()); } -mirror::Object* BumpPointerSpace::GetNextObject(mirror::Object* obj) { - const uintptr_t position = reinterpret_cast<uintptr_t>(obj) + obj->SizeOf(); - return reinterpret_cast<mirror::Object*>(RoundUp(position, kAlignment)); -} - size_t BumpPointerSpace::RevokeThreadLocalBuffers(Thread* thread) { MutexLock mu(Thread::Current(), block_lock_); RevokeThreadLocalBuffersLocked(thread); @@ -141,23 +140,19 @@ void BumpPointerSpace::AssertAllThreadLocalBuffersAreRevoked() { } void BumpPointerSpace::UpdateMainBlock() { - DCHECK_EQ(num_blocks_, 0U); + DCHECK(block_sizes_.empty()); main_block_size_ = Size(); } // Returns the start of the storage. uint8_t* BumpPointerSpace::AllocBlock(size_t bytes) { bytes = RoundUp(bytes, kAlignment); - if (!num_blocks_) { + if (block_sizes_.empty()) { UpdateMainBlock(); } - uint8_t* storage = reinterpret_cast<uint8_t*>( - AllocNonvirtualWithoutAccounting(bytes + sizeof(BlockHeader))); + uint8_t* storage = reinterpret_cast<uint8_t*>(AllocNonvirtualWithoutAccounting(bytes)); if (LIKELY(storage != nullptr)) { - BlockHeader* header = reinterpret_cast<BlockHeader*>(storage); - header->size_ = bytes; // Write out the block header. - storage += sizeof(BlockHeader); - ++num_blocks_; + block_sizes_.push_back(bytes); } return storage; } @@ -177,7 +172,7 @@ uint64_t BumpPointerSpace::GetBytesAllocated() { MutexLock mu3(Thread::Current(), block_lock_); // If we don't have any blocks, we don't have any thread local buffers. This check is required // since there can exist multiple bump pointer spaces which exist at the same time. 
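The rewritten Walk() above iterates the TLAB blocks from a snapshot of block_sizes_ instead of reading a BlockHeader stored in the space itself; a null class still marks the unallocated tail of a block. A simplified, self-contained sketch of that iteration pattern (FakeObject, its layout and the visitor are stand-ins; locking and read barriers are omitted):

#include <cstddef>
#include <cstdint>
#include <vector>

// Stand-in for an object whose size can be read once its class is set.
struct FakeObject {
  void* klass;   // nullptr marks the unused tail of a block.
  size_t size;   // Byte size of this object, already aligned.
};

template <typename Visitor>
void WalkBlocks(uint8_t* pos, const std::vector<size_t>& block_sizes, Visitor&& visit) {
  for (size_t block_size : block_sizes) {
    uint8_t* block_end = pos + block_size;
    uint8_t* obj = pos;
    // Objects were bump-allocated into the block; a null class means the rest
    // of the block is unallocated, so stop early.
    while (obj < block_end) {
      FakeObject* o = reinterpret_cast<FakeObject*>(obj);
      if (o->klass == nullptr) {
        break;
      }
      visit(o);
      obj += o->size;
    }
    pos = block_end;  // The next block starts right after this one; no header to skip.
  }
}

Keeping the sizes off-heap also means a sliding compactor can later merge or shrink blocks by editing the size list alone, without rewriting headers inside the space.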
- if (num_blocks_ > 0) { + if (!block_sizes_.empty()) { for (Thread* thread : thread_list) { total += thread->GetThreadLocalBytesAllocated(); } @@ -195,7 +190,7 @@ uint64_t BumpPointerSpace::GetObjectsAllocated() { MutexLock mu3(Thread::Current(), block_lock_); // If we don't have any blocks, we don't have any thread local buffers. This check is required // since there can exist multiple bump pointer spaces which exist at the same time. - if (num_blocks_ > 0) { + if (!block_sizes_.empty()) { for (Thread* thread : thread_list) { total += thread->GetThreadLocalObjectsAllocated(); } @@ -240,6 +235,52 @@ size_t BumpPointerSpace::AllocationSizeNonvirtual(mirror::Object* obj, size_t* u return num_bytes; } +uint8_t* BumpPointerSpace::AlignEnd(Thread* self, size_t alignment) { + Locks::mutator_lock_->AssertExclusiveHeld(self); + DCHECK(IsAligned<kAlignment>(alignment)); + uint8_t* end = end_.load(std::memory_order_relaxed); + uint8_t* aligned_end = AlignUp(end, alignment); + ptrdiff_t diff = aligned_end - end; + if (diff > 0) { + end_.store(aligned_end, std::memory_order_relaxed); + // If we have blocks after the main one. Then just add the diff to the last + // block. + MutexLock mu(self, block_lock_); + if (!block_sizes_.empty()) { + block_sizes_.back() += diff; + } + } + return end; +} + +std::vector<size_t>* BumpPointerSpace::GetBlockSizes(Thread* self, size_t* main_block_size) { + std::vector<size_t>* block_sizes = nullptr; + MutexLock mu(self, block_lock_); + if (!block_sizes_.empty()) { + block_sizes = new std::vector<size_t>(block_sizes_.begin(), block_sizes_.end()); + } else { + UpdateMainBlock(); + } + *main_block_size = main_block_size_; + return block_sizes; +} + +void BumpPointerSpace::SetBlockSizes(Thread* self, + const size_t main_block_size, + const size_t first_valid_idx) { + MutexLock mu(self, block_lock_); + main_block_size_ = main_block_size; + if (!block_sizes_.empty()) { + block_sizes_.erase(block_sizes_.begin(), block_sizes_.begin() + first_valid_idx); + } + size_t size = main_block_size; + for (size_t block_size : block_sizes_) { + size += block_size; + } + DCHECK(IsAligned<kAlignment>(size)); + end_.store(Begin() + size, std::memory_order_relaxed); +} + } // namespace space } // namespace gc } // namespace art diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h index 08ed503b5f..bba171109d 100644 --- a/runtime/gc/space/bump_pointer_space.h +++ b/runtime/gc/space/bump_pointer_space.h @@ -17,9 +17,10 @@ #ifndef ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_ #define ART_RUNTIME_GC_SPACE_BUMP_POINTER_SPACE_H_ +#include "base/mutex.h" #include "space.h" -#include "base/mutex.h" +#include <deque> namespace art { @@ -30,6 +31,7 @@ class Object; namespace gc { namespace collector { +class MarkCompact; class MarkSweep; } // namespace collector @@ -39,7 +41,7 @@ namespace space { // implementation as its intended to be evacuated. class BumpPointerSpace final : public ContinuousMemMapAllocSpace { public: - typedef void(*WalkCallback)(void *start, void *end, size_t num_bytes, void* callback_arg); + using WalkCallback = void (*)(void *, void *, int, void *); SpaceType GetType() const override { return kSpaceTypeBumpPointerSpace; @@ -100,10 +102,6 @@ class BumpPointerSpace final : public ContinuousMemMapAllocSpace { return nullptr; } - accounting::ContinuousSpaceBitmap* GetMarkBitmap() override { - return nullptr; - } - // Reset the space to empty. 
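To make the SetBlockSizes() bookkeeping above concrete: the compactor passes a new main-block size that absorbs the first first_valid_idx TLAB blocks, the absorbed entries are erased, and end_ is recomputed as Begin() plus the main block plus whatever blocks remain. A small stand-alone model of that arithmetic with made-up sizes:

#include <cassert>
#include <cstddef>
#include <deque>

// Stand-alone model of the block bookkeeping; all values are illustrative.
struct BlockModel {
  size_t main_block_size = 0;
  std::deque<size_t> block_sizes;  // TLAB block sizes after the main block.

  // Mirrors SetBlockSizes(): the new main block absorbs the first
  // `first_valid_idx` blocks; the end offset is main block + remaining blocks.
  size_t SetBlockSizes(size_t new_main_block_size, size_t first_valid_idx) {
    main_block_size = new_main_block_size;
    block_sizes.erase(block_sizes.begin(), block_sizes.begin() + first_valid_idx);
    size_t end_offset = main_block_size;
    for (size_t s : block_sizes) {
      end_offset += s;
    }
    return end_offset;  // The real code stores Begin() + end_offset into end_.
  }
};

int main() {
  BlockModel space;
  space.main_block_size = 64 * 1024;
  space.block_sizes = {8 * 1024, 4 * 1024, 16 * 1024};
  // Compaction folded the first two TLAB blocks into the main block, which in
  // this example grew by exactly their combined size.
  size_t end_offset = space.SetBlockSizes(/*new_main_block_size=*/76 * 1024,
                                          /*first_valid_idx=*/2);
  assert(space.block_sizes.size() == 1);
  assert(end_offset == (76 + 16) * 1024);
  return 0;
}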
void Clear() override REQUIRES(!block_lock_); @@ -120,6 +118,11 @@ class BumpPointerSpace final : public ContinuousMemMapAllocSpace { REQUIRES(!*Locks::runtime_shutdown_lock_, !*Locks::thread_list_lock_, !block_lock_); uint64_t GetObjectsAllocated() override REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!*Locks::runtime_shutdown_lock_, !*Locks::thread_list_lock_, !block_lock_); + // Return the pre-determined allocated object count. This could be beneficial + // when we know that all the TLABs are revoked. + int32_t GetAccumulatedObjectsAllocated() REQUIRES_SHARED(Locks::mutator_lock_) { + return objects_allocated_.load(std::memory_order_relaxed); + } bool IsEmpty() const { return Begin() == End(); } @@ -128,18 +131,9 @@ class BumpPointerSpace final : public ContinuousMemMapAllocSpace { return true; } - bool Contains(const mirror::Object* obj) const override { - const uint8_t* byte_obj = reinterpret_cast<const uint8_t*>(obj); - return byte_obj >= Begin() && byte_obj < End(); - } - // TODO: Change this? Mainly used for compacting to a particular region of memory. BumpPointerSpace(const std::string& name, uint8_t* begin, uint8_t* limit); - // Return the object which comes after obj, while ensuring alignment. - static mirror::Object* GetNextObject(mirror::Object* obj) - REQUIRES_SHARED(Locks::mutator_lock_); - // Allocate a new TLAB, returns false if the allocation failed. bool AllocNewTlab(Thread* self, size_t bytes) REQUIRES(!block_lock_); @@ -165,7 +159,7 @@ class BumpPointerSpace final : public ContinuousMemMapAllocSpace { REQUIRES_SHARED(Locks::mutator_lock_); // Object alignment within the space. - static constexpr size_t kAlignment = 8; + static constexpr size_t kAlignment = kObjectAlignment; protected: BumpPointerSpace(const std::string& name, MemMap&& mem_map); @@ -183,23 +177,40 @@ class BumpPointerSpace final : public ContinuousMemMapAllocSpace { AtomicInteger objects_allocated_; // Accumulated from revoked thread local regions. AtomicInteger bytes_allocated_; // Accumulated from revoked thread local regions. Mutex block_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER; - // The objects at the start of the space are stored in the main block. The main block doesn't - // have a header, this lets us walk empty spaces which are mprotected. + // The objects at the start of the space are stored in the main block. size_t main_block_size_ GUARDED_BY(block_lock_); - // The number of blocks in the space, if it is 0 then the space has one long continuous block - // which doesn't have an updated header. - size_t num_blocks_ GUARDED_BY(block_lock_); + // List of block sizes (in bytes) after the main-block. Needed for Walk(). + // If empty then the space has only one long continuous block. Each TLAB + // allocation has one entry in this deque. + // Keeping block-sizes off-heap simplifies sliding compaction algorithms. + // The compaction algorithm should ideally compact all objects into the main + // block, thereby enabling erasing corresponding entries from here. + std::deque<size_t> block_sizes_ GUARDED_BY(block_lock_); private: - struct BlockHeader { - size_t size_; // Size of the block in bytes, does not include the header. - size_t unused_; // Ensures alignment of kAlignment. - }; + // Return the object which comes after obj, while ensuring alignment. + static mirror::Object* GetNextObject(mirror::Object* obj) + REQUIRES_SHARED(Locks::mutator_lock_); + + // Return a vector of block sizes on the space. Required by MarkCompact GC for + // walking black objects allocated after marking phase. 
+ std::vector<size_t>* GetBlockSizes(Thread* self, size_t* main_block_size) REQUIRES(!block_lock_); + + // Once the MarkCompact decides the post-compact layout of the space in the + // pre-compaction pause, it calls this function to update the block sizes. It is + // done by passing the new main-block size, which consumes a bunch of blocks + // into itself, and the index of first unconsumed block. This works as all the + // block sizes are ordered. Also updates 'end_' to reflect the change. + void SetBlockSizes(Thread* self, const size_t main_block_size, const size_t first_valid_idx) + REQUIRES(!block_lock_, Locks::mutator_lock_); - static_assert(sizeof(BlockHeader) % kAlignment == 0, - "continuous block must be kAlignment aligned"); + // Align end to the given alignment. This is done in MarkCompact GC when + // mutators are suspended so that upcoming TLAB allocations start with a new + // page. Returns the pre-alignment end. + uint8_t* AlignEnd(Thread* self, size_t alignment) REQUIRES(Locks::mutator_lock_); friend class collector::MarkSweep; + friend class collector::MarkCompact; DISALLOW_COPY_AND_ASSIGN(BumpPointerSpace); }; diff --git a/runtime/gc/space/dlmalloc_space-inl.h b/runtime/gc/space/dlmalloc_space-inl.h index 4fc4adac91..6041fd02af 100644 --- a/runtime/gc/space/dlmalloc_space-inl.h +++ b/runtime/gc/space/dlmalloc_space-inl.h @@ -18,7 +18,7 @@ #define ART_RUNTIME_GC_SPACE_DLMALLOC_SPACE_INL_H_ #include "dlmalloc_space.h" -#include "gc/allocator/dlmalloc.h" +#include "gc/allocator/art-dlmalloc.h" #include "thread.h" namespace art { diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc index 25cac7efde..1edcdbdf91 100644 --- a/runtime/gc/space/dlmalloc_space.cc +++ b/runtime/gc/space/dlmalloc_space.cc @@ -350,11 +350,18 @@ void DlMallocSpace::CheckMoreCoreForPrecondition() { } #endif +struct MspaceCbArgs { + size_t max_contiguous; + size_t used; +}; + static void MSpaceChunkCallback(void* start, void* end, size_t used_bytes, void* arg) { size_t chunk_size = reinterpret_cast<uint8_t*>(end) - reinterpret_cast<uint8_t*>(start); + MspaceCbArgs* mspace_cb_args = reinterpret_cast<MspaceCbArgs*>(arg); + mspace_cb_args->used += used_bytes; if (used_bytes < chunk_size) { size_t chunk_free_bytes = chunk_size - used_bytes; - size_t& max_contiguous_allocation = *reinterpret_cast<size_t*>(arg); + size_t& max_contiguous_allocation = mspace_cb_args->max_contiguous; max_contiguous_allocation = std::max(max_contiguous_allocation, chunk_free_bytes); } } @@ -362,16 +369,17 @@ static void MSpaceChunkCallback(void* start, void* end, size_t used_bytes, void* bool DlMallocSpace::LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) { Thread* const self = Thread::Current(); - size_t max_contiguous_allocation = 0; + MspaceCbArgs mspace_cb_args = {0, 0}; // To allow the Walk/InspectAll() to exclusively-lock the mutator // lock, temporarily release the shared access to the mutator // lock here by transitioning to the suspended state. 
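The MSpaceChunkCallback() above is invoked once per dlmalloc chunk with the chunk bounds and its used byte count; the change accumulates total used bytes alongside the largest free run so the fragmentation log line can report both. A simplified, self-contained version of the same accumulation over a fabricated chunk layout (the real chunks come from dlmalloc's inspection walk):

#include <algorithm>
#include <cstddef>
#include <cstdio>

struct MspaceCbArgs {
  size_t max_contiguous = 0;
  size_t used = 0;
};

// Same shape as the inspection callback: [start, end) is one chunk and
// used_bytes is how much of it is allocated.
void ChunkCallback(char* start, char* end, size_t used_bytes, void* arg) {
  size_t chunk_size = static_cast<size_t>(end - start);
  MspaceCbArgs* out = static_cast<MspaceCbArgs*>(arg);
  out->used += used_bytes;
  if (used_bytes < chunk_size) {
    out->max_contiguous = std::max(out->max_contiguous, chunk_size - used_bytes);
  }
}

int main() {
  // Fabricated chunks: {chunk size, used bytes}.
  char heap[4096];
  size_t layout[][2] = {{1024, 1024}, {2048, 512}, {1024, 896}};
  MspaceCbArgs args;
  char* pos = heap;
  for (auto& c : layout) {
    ChunkCallback(pos, pos + c[0], c[1], &args);
    pos += c[0];
  }
  // Largest free run is 2048 - 512 = 1536 bytes; 2432 bytes are in use overall.
  std::printf("max_contiguous=%zu used=%zu\n", args.max_contiguous, args.used);
  return 0;
}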
Locks::mutator_lock_->AssertSharedHeld(self); ScopedThreadSuspension sts(self, ThreadState::kSuspended); - Walk(MSpaceChunkCallback, &max_contiguous_allocation); - if (failed_alloc_bytes > max_contiguous_allocation) { - os << "; failed due to fragmentation (largest possible contiguous allocation " - << max_contiguous_allocation << " bytes)"; + Walk(MSpaceChunkCallback, &mspace_cb_args); + if (failed_alloc_bytes > mspace_cb_args.max_contiguous) { + os << "; failed due to malloc_space fragmentation (largest possible contiguous allocation " + << mspace_cb_args.max_contiguous << " bytes, space in use " << mspace_cb_args.used + << " bytes, capacity = " << Capacity() << ")"; return true; } return false; diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc index 6afd63e4a5..13966d8d97 100644 --- a/runtime/gc/space/image_space.cc +++ b/runtime/gc/space/image_space.cc @@ -23,7 +23,9 @@ #include <memory> #include <random> #include <string> +#include <vector> +#include "android-base/logging.h" #include "android-base/stringprintf.h" #include "android-base/strings.h" #include "android-base/unique_fd.h" @@ -49,6 +51,7 @@ #include "dex/art_dex_file_loader.h" #include "dex/dex_file_loader.h" #include "exec_utils.h" +#include "fmt/format.h" #include "gc/accounting/space_bitmap-inl.h" #include "gc/task_processor.h" #include "image-inl.h" @@ -69,14 +72,20 @@ namespace art { namespace gc { namespace space { -using android::base::Join; -using android::base::StringAppendF; -using android::base::StringPrintf; +namespace { + +using ::android::base::Join; +using ::android::base::StringAppendF; +using ::android::base::StringPrintf; + +using ::fmt::literals::operator""_format; // NOLINT // We do not allow the boot image and extensions to take more than 1GiB. They are // supposed to be much smaller and allocating more that this would likely fail anyway. static constexpr size_t kMaxTotalImageReservationSize = 1 * GB; +} // namespace + Atomic<uint32_t> ImageSpace::bitmap_index_(0); ImageSpace::ImageSpace(const std::string& image_filename, @@ -198,7 +207,6 @@ void ImageSpace::VerifyImageAllocations() { // Helper class for relocating from one range of memory to another. class RelocationRange { public: - RelocationRange() = default; RelocationRange(const RelocationRange&) = default; RelocationRange(uintptr_t source, uintptr_t dest, uintptr_t length) : source_(source), @@ -372,6 +380,64 @@ class ImageSpace::PatchObjectVisitor final { const {} void VisitRoot(mirror::CompressedReference<mirror::Object>* root ATTRIBUTE_UNUSED) const {} + template <typename T> void VisitNativeDexCacheArray(mirror::NativeArray<T>* array) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (array == nullptr) { + return; + } + DCHECK_ALIGNED(array, static_cast<size_t>(kPointerSize)); + uint32_t size = (kPointerSize == PointerSize::k32) + ? 
reinterpret_cast<uint32_t*>(array)[-1] + : dchecked_integral_cast<uint32_t>(reinterpret_cast<uint64_t*>(array)[-1]); + for (uint32_t i = 0; i < size; ++i) { + PatchNativePointer(array->GetPtrEntryPtrSize(i, kPointerSize)); + } + } + + template <typename T> void VisitGcRootDexCacheArray(mirror::GcRootArray<T>* array) + REQUIRES_SHARED(Locks::mutator_lock_) { + if (array == nullptr) { + return; + } + DCHECK_ALIGNED(array, sizeof(GcRoot<T>)); + static_assert(sizeof(GcRoot<T>) == sizeof(uint32_t)); + uint32_t size = reinterpret_cast<uint32_t*>(array)[-1]; + for (uint32_t i = 0; i < size; ++i) { + PatchGcRoot(array->GetGcRootAddress(i)); + } + } + + void VisitDexCacheArrays(ObjPtr<mirror::DexCache> dex_cache) + REQUIRES_SHARED(Locks::mutator_lock_) { + mirror::NativeArray<ArtMethod>* old_resolved_methods = dex_cache->GetResolvedMethodsArray(); + if (old_resolved_methods != nullptr) { + mirror::NativeArray<ArtMethod>* resolved_methods = native_visitor_(old_resolved_methods); + dex_cache->SetResolvedMethodsArray(resolved_methods); + VisitNativeDexCacheArray(resolved_methods); + } + + mirror::NativeArray<ArtField>* old_resolved_fields = dex_cache->GetResolvedFieldsArray(); + if (old_resolved_fields != nullptr) { + mirror::NativeArray<ArtField>* resolved_fields = native_visitor_(old_resolved_fields); + dex_cache->SetResolvedFieldsArray(resolved_fields); + VisitNativeDexCacheArray(resolved_fields); + } + + mirror::GcRootArray<mirror::String>* old_strings = dex_cache->GetStringsArray(); + if (old_strings != nullptr) { + mirror::GcRootArray<mirror::String>* strings = native_visitor_(old_strings); + dex_cache->SetStringsArray(strings); + VisitGcRootDexCacheArray(strings); + } + + mirror::GcRootArray<mirror::Class>* old_types = dex_cache->GetResolvedTypesArray(); + if (old_types != nullptr) { + mirror::GcRootArray<mirror::Class>* types = native_visitor_(old_types); + dex_cache->SetResolvedTypesArray(types); + VisitGcRootDexCacheArray(types); + } + } + template <bool kMayBeNull = true, typename T> ALWAYS_INLINE void PatchGcRoot(/*inout*/GcRoot<T>* root) const REQUIRES_SHARED(Locks::mutator_lock_) { @@ -513,7 +579,8 @@ class ImageSpace::Loader { // Check the oat file checksum. const uint32_t oat_checksum = oat_file->GetOatHeader().GetChecksum(); const uint32_t image_oat_checksum = image_header.GetOatChecksum(); - if (oat_checksum != image_oat_checksum) { + // Note image_oat_checksum is 0 for images generated by the runtime. + if (image_oat_checksum != 0u && oat_checksum != image_oat_checksum) { *error_msg = StringPrintf("Oat checksum 0x%x does not match the image one 0x%x in image %s", oat_checksum, image_oat_checksum, @@ -1299,6 +1366,16 @@ class ImageSpace::Loader { image_header->RelocateImageReferences(app_image_objects.Delta()); image_header->RelocateBootImageReferences(boot_image.Delta()); CHECK_EQ(image_header->GetImageBegin(), target_base); + + // Fix up dex cache arrays. + ObjPtr<mirror::ObjectArray<mirror::DexCache>> dex_caches = + image_header->GetImageRoot<kWithoutReadBarrier>(ImageHeader::kDexCaches) + ->AsObjectArray<mirror::DexCache, kVerifyNone>(); + for (int32_t i = 0, count = dex_caches->GetLength(); i < count; ++i) { + ObjPtr<mirror::DexCache> dex_cache = + dex_caches->GetWithoutChecks<kVerifyNone, kWithoutReadBarrier>(i); + patch_object_visitor.VisitDexCacheArrays(dex_cache); + } } { // Only touches objects in the app image, no need for mutator lock. 
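The dex-cache visitors above depend on the native and GC-root arrays storing their element count in the slot immediately before element 0, which is why the length is read through index [-1] before each entry is patched. A stripped-down sketch of that layout and the patch loop; Relocate(), AllocNativeArray() and the fixed delta are hypothetical stand-ins for the loader's native_visitor_/PatchNativePointer machinery:

#include <cstdint>
#include <cstdlib>

// Hypothetical relocation: shift a non-null native pointer by a fixed delta.
static intptr_t gDelta = 0;
template <typename T>
T* Relocate(T* ptr) {
  return ptr == nullptr
             ? nullptr
             : reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) + gDelta);
}

// Length-prefixed pointer array: the count lives in the slot just before
// element 0, mirroring the [-1] access in VisitNativeDexCacheArray().
template <typename T>
T** AllocNativeArray(uint32_t length) {
  void* storage = std::calloc(1, sizeof(uint64_t) + length * sizeof(T*));
  T** array = reinterpret_cast<T**>(static_cast<uint8_t*>(storage) + sizeof(uint64_t));
  reinterpret_cast<uint32_t*>(array)[-1] = length;
  return array;
}

template <typename T>
void PatchNativeArray(T** array) {
  if (array == nullptr) {
    return;
  }
  uint32_t size = reinterpret_cast<uint32_t*>(array)[-1];
  for (uint32_t i = 0; i < size; ++i) {
    array[i] = Relocate(array[i]);
  }
}

int main() {
  gDelta = 0x1000;  // Pretend the mapped image moved by 4 KiB.
  int** methods = AllocNativeArray<int>(2);
  methods[0] = reinterpret_cast<int*>(0x7000);
  methods[1] = nullptr;
  PatchNativeArray(methods);  // methods[0] becomes 0x8000; null entries stay null.
  return 0;
}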
@@ -1366,9 +1443,9 @@ class ImageSpace::Loader { } }; -static void AppendImageChecksum(uint32_t component_count, - uint32_t checksum, - /*inout*/std::string* checksums) { +void ImageSpace::AppendImageChecksum(uint32_t component_count, + uint32_t checksum, + /*inout*/ std::string* checksums) { static_assert(ImageSpace::kImageChecksumPrefix == 'i', "Format prefix check."); StringAppendF(checksums, "i;%u/%08x", component_count, checksum); } @@ -1378,7 +1455,7 @@ static bool CheckAndRemoveImageChecksum(uint32_t component_count, /*inout*/std::string_view* oat_checksums, /*out*/std::string* error_msg) { std::string image_checksum; - AppendImageChecksum(component_count, checksum, &image_checksum); + ImageSpace::AppendImageChecksum(component_count, checksum, &image_checksum); if (!StartsWith(*oat_checksums, image_checksum)) { *error_msg = StringPrintf("Image checksum mismatch, expected %s to start with %s", std::string(*oat_checksums).c_str(), @@ -1389,182 +1466,6 @@ static bool CheckAndRemoveImageChecksum(uint32_t component_count, return true; } -// Helper class to find the primary boot image and boot image extensions -// and determine the boot image layout. -class ImageSpace::BootImageLayout { - public: - // Description of a "chunk" of the boot image, i.e. either primary boot image - // or a boot image extension, used in conjunction with the boot class path to - // load boot image components. - struct ImageChunk { - std::string base_location; - std::string base_filename; - std::vector<std::string> profile_files; - size_t start_index; - uint32_t component_count; - uint32_t image_space_count; - uint32_t reservation_size; - uint32_t checksum; - uint32_t boot_image_component_count; - uint32_t boot_image_checksum; - uint32_t boot_image_size; - - // The following file descriptors hold the memfd files for extensions compiled - // in memory and described by the above fields. We want to use them to mmap() - // the contents and then close them while treating the ImageChunk description - // as immutable (const), so make these fields explicitly mutable. 
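For a concrete picture of the checksum strings handled here: the "i;%u/%08x" format written by AppendImageChecksum() encodes the component count and the 8-hex-digit checksum of one image chunk, so 3 components with checksum 0x1234abcd become "i;3/1234abcd". A minimal sketch using only the printf-style formatting shown above:

#include <cstdint>
#include <cstdio>
#include <string>

// Mirrors the StringAppendF(checksums, "i;%u/%08x", ...) call above.
void AppendImageChecksum(uint32_t component_count, uint32_t checksum, std::string* checksums) {
  char buf[32];
  std::snprintf(buf, sizeof(buf), "i;%u/%08x", component_count, checksum);
  *checksums += buf;
}

int main() {
  std::string checksums;
  AppendImageChecksum(3u, 0x1234abcdu, &checksums);
  std::printf("%s\n", checksums.c_str());  // Prints "i;3/1234abcd".
  return 0;
}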
- mutable android::base::unique_fd art_fd; - mutable android::base::unique_fd vdex_fd; - mutable android::base::unique_fd oat_fd; - }; - - BootImageLayout(ArrayRef<const std::string> image_locations, - ArrayRef<const std::string> boot_class_path, - ArrayRef<const std::string> boot_class_path_locations, - ArrayRef<const int> boot_class_path_fds, - ArrayRef<const int> boot_class_path_image_fds, - ArrayRef<const int> boot_class_path_vdex_fds, - ArrayRef<const int> boot_class_path_oat_fds) - : image_locations_(image_locations), - boot_class_path_(boot_class_path), - boot_class_path_locations_(boot_class_path_locations), - boot_class_path_fds_(boot_class_path_fds), - boot_class_path_image_fds_(boot_class_path_image_fds), - boot_class_path_vdex_fds_(boot_class_path_vdex_fds), - boot_class_path_oat_fds_(boot_class_path_oat_fds) {} - - std::string GetPrimaryImageLocation(); - - bool LoadFromSystem(InstructionSet image_isa, /*out*/std::string* error_msg) { - return LoadOrValidateFromSystem(image_isa, /*oat_checksums=*/ nullptr, error_msg); - } - - bool ValidateFromSystem(InstructionSet image_isa, - /*inout*/std::string_view* oat_checksums, - /*out*/std::string* error_msg) { - DCHECK(oat_checksums != nullptr); - return LoadOrValidateFromSystem(image_isa, oat_checksums, error_msg); - } - - ArrayRef<const ImageChunk> GetChunks() const { - return ArrayRef<const ImageChunk>(chunks_); - } - - uint32_t GetBaseAddress() const { - return base_address_; - } - - size_t GetNextBcpIndex() const { - return next_bcp_index_; - } - - size_t GetTotalComponentCount() const { - return total_component_count_; - } - - size_t GetTotalReservationSize() const { - return total_reservation_size_; - } - - private: - struct NamedComponentLocation { - std::string base_location; - size_t bcp_index; - std::vector<std::string> profile_filenames; - }; - - std::string ExpandLocationImpl(const std::string& location, - size_t bcp_index, - bool boot_image_extension) { - std::vector<std::string> expanded = ExpandMultiImageLocations( - ArrayRef<const std::string>(boot_class_path_).SubArray(bcp_index, 1u), - location, - boot_image_extension); - DCHECK_EQ(expanded.size(), 1u); - return expanded[0]; - } - - std::string ExpandLocation(const std::string& location, size_t bcp_index) { - if (bcp_index == 0u) { - DCHECK_EQ(location, ExpandLocationImpl(location, bcp_index, /*boot_image_extension=*/ false)); - return location; - } else { - return ExpandLocationImpl(location, bcp_index, /*boot_image_extension=*/ true); - } - } - - std::string GetBcpComponentPath(size_t bcp_index) { - DCHECK_LE(bcp_index, boot_class_path_.size()); - size_t bcp_slash_pos = boot_class_path_[bcp_index].rfind('/'); - DCHECK_NE(bcp_slash_pos, std::string::npos); - return boot_class_path_[bcp_index].substr(0u, bcp_slash_pos + 1u); - } - - bool VerifyImageLocation(ArrayRef<const std::string> components, - /*out*/size_t* named_components_count, - /*out*/std::string* error_msg); - - bool MatchNamedComponents( - ArrayRef<const std::string> named_components, - /*out*/std::vector<NamedComponentLocation>* named_component_locations, - /*out*/std::string* error_msg); - - bool ValidateBootImageChecksum(const char* file_description, - const ImageHeader& header, - /*out*/std::string* error_msg); - - bool ValidateHeader(const ImageHeader& header, - size_t bcp_index, - const char* file_description, - /*out*/std::string* error_msg); - - bool ValidateOatFile(const std::string& base_location, - const std::string& base_filename, - size_t bcp_index, - size_t component_count, - 
/*out*/std::string* error_msg); - - bool ReadHeader(const std::string& base_location, - const std::string& base_filename, - size_t bcp_index, - /*out*/std::string* error_msg); - - // Compiles a consecutive subsequence of bootclasspath dex files, whose contents are included in - // the profiles specified by `profile_filenames`, starting from `bcp_index`. - bool CompileBootclasspathElements(const std::string& base_location, - const std::string& base_filename, - size_t bcp_index, - const std::vector<std::string>& profile_filenames, - ArrayRef<const std::string> dependencies, - /*out*/std::string* error_msg); - - bool CheckAndRemoveLastChunkChecksum(/*inout*/std::string_view* oat_checksums, - /*out*/std::string* error_msg); - - template <typename FilenameFn> - bool LoadOrValidate(FilenameFn&& filename_fn, - /*inout*/std::string_view* oat_checksums, - /*out*/std::string* error_msg); - - bool LoadOrValidateFromSystem(InstructionSet image_isa, - /*inout*/std::string_view* oat_checksums, - /*out*/std::string* error_msg); - - ArrayRef<const std::string> image_locations_; - ArrayRef<const std::string> boot_class_path_; - ArrayRef<const std::string> boot_class_path_locations_; - ArrayRef<const int> boot_class_path_fds_; - ArrayRef<const int> boot_class_path_image_fds_; - ArrayRef<const int> boot_class_path_vdex_fds_; - ArrayRef<const int> boot_class_path_oat_fds_; - - std::vector<ImageChunk> chunks_; - uint32_t base_address_ = 0u; - size_t next_bcp_index_ = 0u; - size_t total_component_count_ = 0u; - size_t total_reservation_size_ = 0u; -}; - std::string ImageSpace::BootImageLayout::GetPrimaryImageLocation() { DCHECK(!image_locations_.empty()); std::string location = image_locations_[0]; @@ -1886,7 +1787,7 @@ bool ImageSpace::BootImageLayout::ValidateOatFile( error_msg->c_str()); return false; } - if (!ImageSpace::ValidateOatFile(*oat_file, error_msg, dex_filenames, dex_fds)) { + if (!ImageSpace::ValidateOatFile(*oat_file, error_msg, dex_filenames, dex_fds, apex_versions_)) { return false; } return true; @@ -2151,48 +2052,12 @@ bool ImageSpace::BootImageLayout::CompileBootclasspathElements( return true; } -bool ImageSpace::BootImageLayout::CheckAndRemoveLastChunkChecksum( - /*inout*/std::string_view* oat_checksums, - /*out*/std::string* error_msg) { - DCHECK(oat_checksums != nullptr); - DCHECK(!chunks_.empty()); - const ImageChunk& chunk = chunks_.back(); - size_t component_count = chunk.component_count; - size_t checksum = chunk.checksum; - if (!CheckAndRemoveImageChecksum(component_count, checksum, oat_checksums, error_msg)) { - DCHECK(!error_msg->empty()); - return false; - } - if (oat_checksums->empty()) { - if (next_bcp_index_ != boot_class_path_.size()) { - *error_msg = StringPrintf("Checksum too short, missing %zu components.", - boot_class_path_.size() - next_bcp_index_); - return false; - } - return true; - } - if (!StartsWith(*oat_checksums, ":")) { - *error_msg = StringPrintf("Missing ':' separator at start of %s", - std::string(*oat_checksums).c_str()); - return false; - } - oat_checksums->remove_prefix(1u); - if (oat_checksums->empty()) { - *error_msg = "Missing checksums after the ':' separator."; - return false; - } - return true; -} - template <typename FilenameFn> -bool ImageSpace::BootImageLayout::LoadOrValidate(FilenameFn&& filename_fn, - /*inout*/std::string_view* oat_checksums, - /*out*/std::string* error_msg) { +bool ImageSpace::BootImageLayout::Load(FilenameFn&& filename_fn, + bool allow_in_memory_compilation, + /*out*/ std::string* error_msg) { DCHECK(GetChunks().empty()); 
DCHECK_EQ(GetBaseAddress(), 0u); - bool validate = (oat_checksums != nullptr); - static_assert(ImageSpace::kImageChecksumPrefix == 'i', "Format prefix check."); - DCHECK_IMPLIES(validate, StartsWith(*oat_checksums, "i")); ArrayRef<const std::string> components = image_locations_; size_t named_components_count = 0u; @@ -2223,24 +2088,21 @@ bool ImageSpace::BootImageLayout::LoadOrValidate(FilenameFn&& filename_fn, LOG(ERROR) << "Named image component already covered by previous image: " << base_location; continue; } - if (validate && bcp_index > bcp_pos) { - *error_msg = StringPrintf("End of contiguous boot class path images, remaining checksum: %s", - std::string(*oat_checksums).c_str()); - return false; - } std::string local_error_msg; - std::string* err_msg = validate ? error_msg : &local_error_msg; std::string base_filename; - if (!filename_fn(base_location, &base_filename, err_msg) || - !ReadHeader(base_location, base_filename, bcp_index, err_msg)) { - if (validate) { - return false; - } + if (!filename_fn(base_location, &base_filename, &local_error_msg) || + !ReadHeader(base_location, base_filename, bcp_index, &local_error_msg)) { LOG(ERROR) << "Error reading named image component header for " << base_location << ", error: " << local_error_msg; // If the primary boot image is invalid, we generate a single full image. This is faster than // generating the primary boot image and the extension separately. if (bcp_index == 0) { + if (!allow_in_memory_compilation) { + // The boot image is unusable and we can't continue by generating a boot image in memory. + // All we can do is to return. + *error_msg = std::move(local_error_msg); + return false; + } // We must at least have profiles for the core libraries. if (profile_filenames.empty()) { *error_msg = "Full boot image cannot be compiled because no profile is provided."; @@ -2264,14 +2126,15 @@ bool ImageSpace::BootImageLayout::LoadOrValidate(FilenameFn&& filename_fn, // No extensions are needed. return true; } - if (profile_filenames.empty() || + bool should_compile_extension = allow_in_memory_compilation && !profile_filenames.empty(); + if (!should_compile_extension || !CompileBootclasspathElements(base_location, base_filename, bcp_index, profile_filenames, components.SubArray(/*pos=*/ 0, /*length=*/ 1), &local_error_msg)) { - if (!profile_filenames.empty()) { + if (should_compile_extension) { LOG(ERROR) << "Error compiling boot image extension for " << boot_class_path_[bcp_index] << ", error: " << local_error_msg; } @@ -2280,14 +2143,6 @@ bool ImageSpace::BootImageLayout::LoadOrValidate(FilenameFn&& filename_fn, continue; } } - if (validate) { - if (!CheckAndRemoveLastChunkChecksum(oat_checksums, error_msg)) { - return false; - } - if (oat_checksums->empty() || !StartsWith(*oat_checksums, "i")) { - return true; // Let the caller deal with the dex file checksums if any. - } - } bcp_pos = GetNextBcpIndex(); } @@ -2320,24 +2175,10 @@ bool ImageSpace::BootImageLayout::LoadOrValidate(FilenameFn&& filename_fn, VLOG(image) << "Found image extension for " << ExpandLocation(base_location, bcp_pos); bcp_pos = GetNextBcpIndex(); found = true; - if (validate) { - if (!CheckAndRemoveLastChunkChecksum(oat_checksums, error_msg)) { - return false; - } - if (oat_checksums->empty() || !StartsWith(*oat_checksums, "i")) { - return true; // Let the caller deal with the dex file checksums if any. 
- } - } break; } } if (!found) { - if (validate) { - *error_msg = StringPrintf("Missing extension for %s, remaining checksum: %s", - bcp_component.c_str(), - std::string(*oat_checksums).c_str()); - return false; - } ++bcp_pos; } } @@ -2346,16 +2187,16 @@ bool ImageSpace::BootImageLayout::LoadOrValidate(FilenameFn&& filename_fn, return true; } -bool ImageSpace::BootImageLayout::LoadOrValidateFromSystem(InstructionSet image_isa, - /*inout*/std::string_view* oat_checksums, - /*out*/std::string* error_msg) { +bool ImageSpace::BootImageLayout::LoadFromSystem(InstructionSet image_isa, + bool allow_in_memory_compilation, + /*out*/ std::string* error_msg) { auto filename_fn = [image_isa](const std::string& location, /*out*/std::string* filename, /*out*/std::string* err_msg ATTRIBUTE_UNUSED) { *filename = GetSystemImageFilename(location.c_str(), image_isa); return true; }; - return LoadOrValidate(filename_fn, oat_checksums, error_msg); + return Load(filename_fn, allow_in_memory_compilation, error_msg); } class ImageSpace::BootImageLoader { @@ -2403,6 +2244,7 @@ class ImageSpace::BootImageLoader { bool HasSystem() const { return has_system_; } bool LoadFromSystem(size_t extra_reservation_size, + bool allow_in_memory_compilation, /*out*/std::vector<std::unique_ptr<ImageSpace>>* boot_image_spaces, /*out*/MemMap* extra_reservation, /*out*/std::string* error_msg) REQUIRES_SHARED(Locks::mutator_lock_); @@ -2697,7 +2539,8 @@ class ImageSpace::BootImageLoader { int32_t class_roots_index = enum_cast<int32_t>(ImageHeader::kClassRoots); DCHECK_LT(class_roots_index, image_roots->GetLength<kVerifyNone>()); class_roots = ObjPtr<mirror::ObjectArray<mirror::Class>>::DownCast(base_relocate_visitor( - image_roots->GetWithoutChecks<kVerifyNone>(class_roots_index).Ptr())); + image_roots->GetWithoutChecks<kVerifyNone, + kWithoutReadBarrier>(class_roots_index).Ptr())); if (kExtension) { // Class roots must have been visited if we relocated the primary boot image. DCHECK(base_diff == 0 || patched_objects->Test(class_roots.Ptr())); @@ -2863,6 +2706,14 @@ class ImageSpace::BootImageLoader { DCHECK_EQ(base_diff64, 0); } + // While `Thread::Current()` is null, the `ScopedDebugDisallowReadBarriers` + // cannot be used but the class `ReadBarrier` shall not allow read barriers anyway. + // For some gtests we actually have an initialized `Thread:Current()`. + std::optional<ScopedDebugDisallowReadBarriers> sddrb(std::nullopt); + if (kCheckDebugDisallowReadBarrierCount && Thread::Current() != nullptr) { + sddrb.emplace(Thread::Current()); + } + ArrayRef<const std::unique_ptr<ImageSpace>> spaces_ref(spaces); PointerSize pointer_size = first_space_header.GetPointerSize(); if (pointer_size == PointerSize::k64) { @@ -3111,8 +2962,24 @@ class ImageSpace::BootImageLoader { return false; } } + + // As an optimization, madvise the oat file into memory if it's being used + // for execution with an active runtime. This can significantly improve + // ZygoteInit class preload performance. 
+ if (executable_) { + Runtime* runtime = Runtime::Current(); + if (runtime != nullptr) { + Runtime::MadviseFileForRange(runtime->GetMadviseWillNeedSizeOdex(), + oat_file->Size(), + oat_file->Begin(), + oat_file->End(), + oat_file->GetLocation()); + } + } + space->oat_file_ = std::move(oat_file); space->oat_file_non_owned_ = space->oat_file_.get(); + return true; } @@ -3345,6 +3212,7 @@ class ImageSpace::BootImageLoader { bool ImageSpace::BootImageLoader::LoadFromSystem( size_t extra_reservation_size, + bool allow_in_memory_compilation, /*out*/std::vector<std::unique_ptr<ImageSpace>>* boot_image_spaces, /*out*/MemMap* extra_reservation, /*out*/std::string* error_msg) { @@ -3357,7 +3225,7 @@ bool ImageSpace::BootImageLoader::LoadFromSystem( boot_class_path_image_fds_, boot_class_path_vdex_fds_, boot_class_path_oat_fds_); - if (!layout.LoadFromSystem(image_isa_, error_msg)) { + if (!layout.LoadFromSystem(image_isa_, allow_in_memory_compilation, error_msg)) { return false; } @@ -3420,6 +3288,7 @@ bool ImageSpace::LoadBootImage( bool relocate, bool executable, size_t extra_reservation_size, + bool allow_in_memory_compilation, /*out*/std::vector<std::unique_ptr<ImageSpace>>* boot_image_spaces, /*out*/MemMap* extra_reservation) { ScopedTrace trace(__FUNCTION__); @@ -3450,8 +3319,11 @@ bool ImageSpace::LoadBootImage( std::vector<std::string> error_msgs; std::string error_msg; - if (loader.LoadFromSystem( - extra_reservation_size, boot_image_spaces, extra_reservation, &error_msg)) { + if (loader.LoadFromSystem(extra_reservation_size, + allow_in_memory_compilation, + boot_image_spaces, + extra_reservation, + &error_msg)) { return true; } error_msgs.push_back(error_msg); @@ -3519,54 +3391,66 @@ void ImageSpace::Dump(std::ostream& os) const { << ",name=\"" << GetName() << "\"]"; } -bool ImageSpace::ValidateApexVersions(const OatFile& oat_file, std::string* error_msg) { +bool ImageSpace::ValidateApexVersions(const OatHeader& oat_header, + const std::string& apex_versions, + const std::string& file_location, + std::string* error_msg) { // For a boot image, the key value store only exists in the first OAT file. Skip other OAT files. - if (oat_file.GetOatHeader().GetKeyValueStoreSize() == 0) { - return true; - } - - // The OAT files in the ART APEX is built on host, so they don't have the right APEX versions. It - // is safe to assume that they are always up-to-date because they are shipped along with the - // runtime and the dex files. - if (kIsTargetAndroid && android::base::StartsWith(oat_file.GetLocation(), GetArtRoot())) { + if (oat_header.GetKeyValueStoreSize() == 0) { return true; } - const char* oat_apex_versions = - oat_file.GetOatHeader().GetStoreValueByKey(OatHeader::kApexVersionsKey); + const char* oat_apex_versions = oat_header.GetStoreValueByKey(OatHeader::kApexVersionsKey); if (oat_apex_versions == nullptr) { *error_msg = StringPrintf("ValidateApexVersions failed to get APEX versions from oat file '%s'", - oat_file.GetLocation().c_str()); + file_location.c_str()); return false; } // For a boot image, it can be generated from a subset of the bootclasspath. // For an app image, some dex files get compiled with a subset of the bootclasspath. // For such cases, the OAT APEX versions will be a prefix of the runtime APEX versions. 
- if (!android::base::StartsWith(Runtime::Current()->GetApexVersions(), oat_apex_versions)) { + if (!android::base::StartsWith(apex_versions, oat_apex_versions)) { *error_msg = StringPrintf( "ValidateApexVersions found APEX versions mismatch between oat file '%s' and the runtime " "(Oat file: '%s', Runtime: '%s')", - oat_file.GetLocation().c_str(), + file_location.c_str(), oat_apex_versions, - Runtime::Current()->GetApexVersions().c_str()); + apex_versions.c_str()); return false; } return true; } bool ImageSpace::ValidateOatFile(const OatFile& oat_file, std::string* error_msg) { - return ValidateOatFile(oat_file, error_msg, ArrayRef<const std::string>(), ArrayRef<const int>()); + DCHECK(Runtime::Current() != nullptr); + return ValidateOatFile(oat_file, + error_msg, + ArrayRef<const std::string>(), + ArrayRef<const int>(), + Runtime::Current()->GetApexVersions()); } bool ImageSpace::ValidateOatFile(const OatFile& oat_file, std::string* error_msg, ArrayRef<const std::string> dex_filenames, - ArrayRef<const int> dex_fds) { - if (!ValidateApexVersions(oat_file, error_msg)) { + ArrayRef<const int> dex_fds, + const std::string& apex_versions) { + if (!ValidateApexVersions(oat_file.GetOatHeader(), + apex_versions, + oat_file.GetLocation(), + error_msg)) { + return false; + } + + // For a boot image, the key value store only exists in the first OAT file. Skip other OAT files. + if (oat_file.GetOatHeader().GetKeyValueStoreSize() != 0 && + oat_file.GetOatHeader().IsConcurrentCopying() != gUseReadBarrier) { + *error_msg = + "ValidateOatFile found read barrier state mismatch (oat file: {}, runtime: {})"_format( + oat_file.GetOatHeader().IsConcurrentCopying(), gUseReadBarrier); return false; } - const ArtDexFileLoader dex_file_loader; size_t dex_file_index = 0; for (const OatDexFile* oat_dex_file : oat_file.GetOatDexFiles()) { // Skip multidex locations - These will be checked when we visit their @@ -3583,7 +3467,7 @@ bool ImageSpace::ValidateOatFile(const OatFile& oat_file, std::vector<uint32_t> checksums; std::vector<std::string> dex_locations_ignored; - if (!dex_file_loader.GetMultiDexChecksums( + if (!ArtDexFileLoader::GetMultiDexChecksums( dex_file_location.c_str(), &checksums, &dex_locations_ignored, error_msg, dex_fd)) { *error_msg = StringPrintf("ValidateOatFile failed to get checksums of dex file '%s' " "referenced by oat file %s: %s", @@ -3695,7 +3579,7 @@ size_t ImageSpace::GetNumberOfComponents(ArrayRef<ImageSpace* const> image_space return n; } -static size_t CheckAndCountBCPComponents(std::string_view oat_boot_class_path, +size_t ImageSpace::CheckAndCountBCPComponents(std::string_view oat_boot_class_path, ArrayRef<const std::string> boot_class_path, /*out*/std::string* error_msg) { // Check that the oat BCP is a prefix of current BCP locations and count components. @@ -3727,110 +3611,6 @@ static size_t CheckAndCountBCPComponents(std::string_view oat_boot_class_path, return component_count; } -bool ImageSpace::VerifyBootClassPathChecksums(std::string_view oat_checksums, - std::string_view oat_boot_class_path, - ArrayRef<const std::string> image_locations, - ArrayRef<const std::string> boot_class_path_locations, - ArrayRef<const std::string> boot_class_path, - ArrayRef<const int> boot_class_path_fds, - InstructionSet image_isa, - /*out*/std::string* error_msg) { - if (oat_checksums.empty() || oat_boot_class_path.empty()) { - *error_msg = oat_checksums.empty() ? "Empty checksums." 
: "Empty boot class path."; - return false; - } - - DCHECK_EQ(boot_class_path_locations.size(), boot_class_path.size()); - size_t bcp_size = - CheckAndCountBCPComponents(oat_boot_class_path, boot_class_path_locations, error_msg); - if (bcp_size == static_cast<size_t>(-1)) { - DCHECK(!error_msg->empty()); - return false; - } - - size_t bcp_pos = 0u; - if (StartsWith(oat_checksums, "i")) { - // Use only the matching part of the BCP for validation. FDs are optional, so only pass the - // sub-array if provided. - ArrayRef<const int> bcp_fds = boot_class_path_fds.empty() - ? ArrayRef<const int>() - : boot_class_path_fds.SubArray(/*pos=*/ 0u, bcp_size); - BootImageLayout layout(image_locations, - boot_class_path.SubArray(/*pos=*/ 0u, bcp_size), - boot_class_path_locations.SubArray(/*pos=*/ 0u, bcp_size), - bcp_fds, - /*boot_class_path_image_fds=*/ ArrayRef<const int>(), - /*boot_class_path_vdex_fds=*/ ArrayRef<const int>(), - /*boot_class_path_oat_fds=*/ ArrayRef<const int>()); - std::string primary_image_location = layout.GetPrimaryImageLocation(); - std::string system_filename; - bool has_system = false; - if (!FindImageFilename(primary_image_location.c_str(), - image_isa, - &system_filename, - &has_system)) { - *error_msg = StringPrintf("Unable to find image file for %s and %s", - android::base::Join(image_locations, kComponentSeparator).c_str(), - GetInstructionSetString(image_isa)); - return false; - } - - DCHECK(has_system); - if (!layout.ValidateFromSystem(image_isa, &oat_checksums, error_msg)) { - return false; - } - bcp_pos = layout.GetNextBcpIndex(); - } - - for ( ; bcp_pos != bcp_size; ++bcp_pos) { - static_assert(ImageSpace::kDexFileChecksumPrefix == 'd', "Format prefix check."); - if (!StartsWith(oat_checksums, "d")) { - *error_msg = StringPrintf("Missing dex checksums, expected %s to start with 'd'", - std::string(oat_checksums).c_str()); - return false; - } - oat_checksums.remove_prefix(1u); - - const std::string& bcp_filename = boot_class_path[bcp_pos]; - std::vector<uint32_t> checksums; - std::vector<std::string> dex_locations; - const ArtDexFileLoader dex_file_loader; - if (!dex_file_loader.GetMultiDexChecksums(bcp_filename.c_str(), - &checksums, - &dex_locations, - error_msg)) { - return false; - } - DCHECK(!checksums.empty()); - for (uint32_t checksum : checksums) { - std::string dex_file_checksum = StringPrintf("/%08x", checksum); - if (!StartsWith(oat_checksums, dex_file_checksum)) { - *error_msg = StringPrintf( - "Dex checksum mismatch for bootclasspath file %s, expected %s to start with %s", - bcp_filename.c_str(), - std::string(oat_checksums).c_str(), - dex_file_checksum.c_str()); - return false; - } - oat_checksums.remove_prefix(dex_file_checksum.size()); - } - if (bcp_pos + 1u != bcp_size) { - if (!StartsWith(oat_checksums, ":")) { - *error_msg = StringPrintf("Missing ':' separator at start of %s", - std::string(oat_checksums).c_str()); - return false; - } - oat_checksums.remove_prefix(1u); - } - } - if (!oat_checksums.empty()) { - *error_msg = StringPrintf("Checksum too long, unexpected tail %s", - std::string(oat_checksums).c_str()); - return false; - } - return true; -} - bool ImageSpace::VerifyBootClassPathChecksums( std::string_view oat_checksums, std::string_view oat_boot_class_path, diff --git a/runtime/gc/space/image_space.h b/runtime/gc/space/image_space.h index 8a93f2bad1..1a85456961 100644 --- a/runtime/gc/space/image_space.h +++ b/runtime/gc/space/image_space.h @@ -17,16 +17,19 @@ #ifndef ART_RUNTIME_GC_SPACE_IMAGE_SPACE_H_ #define 
ART_RUNTIME_GC_SPACE_IMAGE_SPACE_H_ +#include "android-base/unique_fd.h" +#include "base/array_ref.h" #include "gc/accounting/space_bitmap.h" #include "image.h" +#include "runtime.h" #include "space.h" namespace art { -template <typename T> class ArrayRef; class DexFile; enum class InstructionSet; class OatFile; +class OatHeader; namespace gc { namespace space { @@ -142,6 +145,7 @@ class ImageSpace : public MemMapSpace { bool relocate, bool executable, size_t extra_reservation_size, + bool allow_in_memory_compilation, /*out*/std::vector<std::unique_ptr<ImageSpace>>* boot_image_spaces, /*out*/MemMap* extra_reservation) REQUIRES_SHARED(Locks::mutator_lock_); @@ -239,18 +243,6 @@ class ImageSpace : public MemMapSpace { // Returns the total number of components (jar files) associated with the image spaces. static size_t GetNumberOfComponents(ArrayRef<gc::space::ImageSpace* const> image_spaces); - // Returns whether the checksums are valid for the given boot class path, - // image location and ISA (may differ from the ISA of an initialized Runtime). - // The boot image and dex files do not need to be loaded in memory. - static bool VerifyBootClassPathChecksums(std::string_view oat_checksums, - std::string_view oat_boot_class_path, - ArrayRef<const std::string> image_locations, - ArrayRef<const std::string> boot_class_path_locations, - ArrayRef<const std::string> boot_class_path, - ArrayRef<const int> boot_class_path_fds, - InstructionSet image_isa, - /*out*/std::string* error_msg); - // Returns whether the oat checksums and boot class path description are valid // for the given boot image spaces and boot class path. Used for boot image extensions. static bool VerifyBootClassPathChecksums( @@ -267,8 +259,11 @@ class ImageSpace : public MemMapSpace { const std::string& image_location, bool boot_image_extension = false); - // Returns true if the APEX versions in the OAT file match the current APEX versions. - static bool ValidateApexVersions(const OatFile& oat_file, std::string* error_msg); + // Returns true if the APEX versions in the OAT header match the given APEX versions. + static bool ValidateApexVersions(const OatHeader& oat_header, + const std::string& apex_versions, + const std::string& file_location, + std::string* error_msg); // Returns true if the dex checksums in the given oat file match the // checksums of the original dex files on disk. This is intended to be used @@ -279,17 +274,23 @@ class ImageSpace : public MemMapSpace { // oat and odex file. // // This function is exposed for testing purposes. + // + // Calling this function requires an active runtime. static bool ValidateOatFile(const OatFile& oat_file, std::string* error_msg); // Same as above, but allows to use `dex_filenames` and `dex_fds` to find the dex files instead of - // using the dex filenames in the header of the oat file. This overload is useful when the actual - // dex filenames are different from what's in the header (e.g., when we run dex2oat on host), or - // when the runtime can only access files through FDs (e.g., when we run dex2oat on target in a - // restricted SELinux domain). + // using the dex filenames in the header of the oat file, and also takes `apex_versions` from the + // input. This overload is useful when the actual dex filenames are different from what's in the + // header (e.g., when we run dex2oat on host), when the runtime can only access files through FDs + // (e.g., when we run dex2oat on target in a restricted SELinux domain), or when there is no + // active runtime. 
+ // + // Calling this function does not require an active runtime. static bool ValidateOatFile(const OatFile& oat_file, std::string* error_msg, ArrayRef<const std::string> dex_filenames, - ArrayRef<const int> dex_fds); + ArrayRef<const int> dex_fds, + const std::string& apex_versions); // Return the end of the image which includes non-heap objects such as ArtMethods and ArtFields. uint8_t* GetImageEnd() const { @@ -303,6 +304,182 @@ class ImageSpace : public MemMapSpace { void ReleaseMetadata() REQUIRES_SHARED(Locks::mutator_lock_); + static void AppendImageChecksum(uint32_t component_count, + uint32_t checksum, + /*inout*/ std::string* checksums); + + static size_t CheckAndCountBCPComponents(std::string_view oat_boot_class_path, + ArrayRef<const std::string> boot_class_path, + /*out*/ std::string* error_msg); + + // Helper class to find the primary boot image and boot image extensions + // and determine the boot image layout. + class BootImageLayout { + public: + // Description of a "chunk" of the boot image, i.e. either primary boot image + // or a boot image extension, used in conjunction with the boot class path to + // load boot image components. + struct ImageChunk { + std::string base_location; + std::string base_filename; + std::vector<std::string> profile_files; + size_t start_index; + uint32_t component_count; + uint32_t image_space_count; + uint32_t reservation_size; + uint32_t checksum; + uint32_t boot_image_component_count; + uint32_t boot_image_checksum; + uint32_t boot_image_size; + + // The following file descriptors hold the memfd files for extensions compiled + // in memory and described by the above fields. We want to use them to mmap() + // the contents and then close them while treating the ImageChunk description + // as immutable (const), so make these fields explicitly mutable. 
+ mutable android::base::unique_fd art_fd; + mutable android::base::unique_fd vdex_fd; + mutable android::base::unique_fd oat_fd; + }; + + BootImageLayout(ArrayRef<const std::string> image_locations, + ArrayRef<const std::string> boot_class_path, + ArrayRef<const std::string> boot_class_path_locations, + ArrayRef<const int> boot_class_path_fds, + ArrayRef<const int> boot_class_path_image_fds, + ArrayRef<const int> boot_class_path_vdex_fds, + ArrayRef<const int> boot_class_path_oat_fds, + const std::string* apex_versions = nullptr) + : image_locations_(image_locations), + boot_class_path_(boot_class_path), + boot_class_path_locations_(boot_class_path_locations), + boot_class_path_fds_(boot_class_path_fds), + boot_class_path_image_fds_(boot_class_path_image_fds), + boot_class_path_vdex_fds_(boot_class_path_vdex_fds), + boot_class_path_oat_fds_(boot_class_path_oat_fds), + apex_versions_(GetApexVersions(apex_versions)) {} + + std::string GetPrimaryImageLocation(); + + bool LoadFromSystem(InstructionSet image_isa, + bool allow_in_memory_compilation, + /*out*/ std::string* error_msg); + + ArrayRef<const ImageChunk> GetChunks() const { return ArrayRef<const ImageChunk>(chunks_); } + + uint32_t GetBaseAddress() const { return base_address_; } + + size_t GetNextBcpIndex() const { return next_bcp_index_; } + + size_t GetTotalComponentCount() const { return total_component_count_; } + + size_t GetTotalReservationSize() const { return total_reservation_size_; } + + private: + struct NamedComponentLocation { + std::string base_location; + size_t bcp_index; + std::vector<std::string> profile_filenames; + }; + + std::string ExpandLocationImpl(const std::string& location, + size_t bcp_index, + bool boot_image_extension) { + std::vector<std::string> expanded = ExpandMultiImageLocations( + ArrayRef<const std::string>(boot_class_path_).SubArray(bcp_index, 1u), + location, + boot_image_extension); + DCHECK_EQ(expanded.size(), 1u); + return expanded[0]; + } + + std::string ExpandLocation(const std::string& location, size_t bcp_index) { + if (bcp_index == 0u) { + DCHECK_EQ(location, + ExpandLocationImpl(location, bcp_index, /*boot_image_extension=*/false)); + return location; + } else { + return ExpandLocationImpl(location, bcp_index, /*boot_image_extension=*/true); + } + } + + std::string GetBcpComponentPath(size_t bcp_index) { + DCHECK_LE(bcp_index, boot_class_path_.size()); + size_t bcp_slash_pos = boot_class_path_[bcp_index].rfind('/'); + DCHECK_NE(bcp_slash_pos, std::string::npos); + return boot_class_path_[bcp_index].substr(0u, bcp_slash_pos + 1u); + } + + bool VerifyImageLocation(ArrayRef<const std::string> components, + /*out*/ size_t* named_components_count, + /*out*/ std::string* error_msg); + + bool MatchNamedComponents( + ArrayRef<const std::string> named_components, + /*out*/ std::vector<NamedComponentLocation>* named_component_locations, + /*out*/ std::string* error_msg); + + bool ValidateBootImageChecksum(const char* file_description, + const ImageHeader& header, + /*out*/ std::string* error_msg); + + bool ValidateHeader(const ImageHeader& header, + size_t bcp_index, + const char* file_description, + /*out*/ std::string* error_msg); + + bool ValidateOatFile(const std::string& base_location, + const std::string& base_filename, + size_t bcp_index, + size_t component_count, + /*out*/ std::string* error_msg); + + bool ReadHeader(const std::string& base_location, + const std::string& base_filename, + size_t bcp_index, + /*out*/ std::string* error_msg); + + // Compiles a consecutive subsequence of 
bootclasspath dex files, whose contents are included in + // the profiles specified by `profile_filenames`, starting from `bcp_index`. + bool CompileBootclasspathElements(const std::string& base_location, + const std::string& base_filename, + size_t bcp_index, + const std::vector<std::string>& profile_filenames, + ArrayRef<const std::string> dependencies, + /*out*/ std::string* error_msg); + + // Returns true if a least one chuck has been loaded. + template <typename FilenameFn> + bool Load(FilenameFn&& filename_fn, + bool allow_in_memory_compilation, + /*out*/ std::string* error_msg); + + // This function prefers taking APEX versions from the input instead of from the runtime if + // possible. If the input is present, `ValidateFromSystem` can work without an active runtime. + static const std::string& GetApexVersions(const std::string* apex_versions) { + if (apex_versions == nullptr) { + DCHECK(Runtime::Current() != nullptr); + return Runtime::Current()->GetApexVersions(); + } else { + return *apex_versions; + } + } + + ArrayRef<const std::string> image_locations_; + ArrayRef<const std::string> boot_class_path_; + ArrayRef<const std::string> boot_class_path_locations_; + ArrayRef<const int> boot_class_path_fds_; + ArrayRef<const int> boot_class_path_image_fds_; + ArrayRef<const int> boot_class_path_vdex_fds_; + ArrayRef<const int> boot_class_path_oat_fds_; + + std::vector<ImageChunk> chunks_; + uint32_t base_address_ = 0u; + size_t next_bcp_index_ = 0u; + size_t total_component_count_ = 0u; + size_t total_reservation_size_ = 0u; + const std::string& apex_versions_; + }; + protected: // Tries to initialize an ImageSpace from the given image path, returning null on error. // @@ -342,7 +519,6 @@ class ImageSpace : public MemMapSpace { friend class Space; private: - class BootImageLayout; class BootImageLoader; template <typename ReferenceVisitor> class ClassTableVisitor; diff --git a/runtime/gc/space/image_space_test.cc b/runtime/gc/space/image_space_test.cc index 3a6d0e12e2..d6bb86b11b 100644 --- a/runtime/gc/space/image_space_test.cc +++ b/runtime/gc/space/image_space_test.cc @@ -50,7 +50,7 @@ class ImageSpaceTest : public CommonRuntimeTest { }; TEST_F(ImageSpaceTest, StringDeduplication) { - const char* const kBaseNames[] = { "Extension1", "Extension2" }; + const char* const kBaseNames[] = {"Extension1", "Extension2"}; ScratchDir scratch; const std::string& scratch_dir = scratch.GetPath(); @@ -77,7 +77,7 @@ TEST_F(ImageSpaceTest, StringDeduplication) { std::vector<std::string> extension_image_locations; for (const char* base_name : kBaseNames) { std::string jar_name = GetTestDexFileName(base_name); - ArrayRef<const std::string> dex_files(&jar_name, /*size=*/ 1u); + ArrayRef<const std::string> dex_files(&jar_name, /*size=*/1u); ScratchFile profile_file; GenerateBootProfile(dex_files, profile_file.GetFile()); std::vector<std::string> extra_args = { @@ -94,8 +94,8 @@ TEST_F(ImageSpaceTest, StringDeduplication) { ASSERT_TRUE(success) << error_msg; bcp.push_back(jar_name); bcp_locations.push_back(jar_name); - extension_image_locations.push_back( - scratch_dir + prefix + '-' + GetFilenameBase(jar_name) + ".art"); + extension_image_locations.push_back(scratch_dir + prefix + '-' + GetFilenameBase(jar_name) + + ".art"); } // Also compile the second extension as an app with app image. 
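As a usage sketch (not part of the change): the BootImageLayout class moved into image_space.h above can be constructed with explicit APEX versions and driven without an active runtime. The helper below is illustrative only; its name, arguments, and include list are assumptions, while the constructor and LoadFromSystem() signatures follow the declarations above.

#include <string>
#include <vector>
#include "arch/instruction_set.h"
#include "base/array_ref.h"
#include "gc/space/image_space.h"

// Sketch: describe the boot image on /system without Runtime::Current().
// `apex_versions` must outlive `layout`, since the layout stores a reference.
bool DescribeBootImage(const std::vector<std::string>& image_locations,
                       const std::vector<std::string>& boot_class_path,
                       const std::vector<std::string>& boot_class_path_locations,
                       const std::string& apex_versions,
                       art::InstructionSet isa,
                       /*out*/ std::string* error_msg) {
  using art::ArrayRef;
  using art::gc::space::ImageSpace;
  ImageSpace::BootImageLayout layout(ArrayRef<const std::string>(image_locations),
                                     ArrayRef<const std::string>(boot_class_path),
                                     ArrayRef<const std::string>(boot_class_path_locations),
                                     /*boot_class_path_fds=*/ ArrayRef<const int>(),
                                     /*boot_class_path_image_fds=*/ ArrayRef<const int>(),
                                     /*boot_class_path_vdex_fds=*/ ArrayRef<const int>(),
                                     /*boot_class_path_oat_fds=*/ ArrayRef<const int>(),
                                     &apex_versions);
  // Passing explicit APEX versions avoids the Runtime::Current() fallback in
  // GetApexVersions(), and disallowing in-memory compilation means only images
  // already present on /system are reported.
  if (!layout.LoadFromSystem(isa, /*allow_in_memory_compilation=*/ false, error_msg)) {
    return false;
  }
  return !layout.GetChunks().empty();
}

With allow_in_memory_compilation set to false, Load() simply reports what is already installed instead of compiling a full boot image or an extension in memory.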
@@ -104,26 +104,27 @@ TEST_F(ImageSpaceTest, StringDeduplication) { std::string app_odex_name = scratch_dir + app_base_name + ".odex"; std::string app_image_name = scratch_dir + app_base_name + ".art"; { - ArrayRef<const std::string> dex_files(&app_jar_name, /*size=*/ 1u); + ArrayRef<const std::string> dex_files(&app_jar_name, /*size=*/1u); ScratchFile profile_file; GenerateProfile(dex_files, profile_file.GetFile()); std::vector<std::string> argv; std::string error_msg; - bool success = StartDex2OatCommandLine(&argv, &error_msg, /*use_runtime_bcp_and_image=*/ false); + bool success = StartDex2OatCommandLine(&argv, &error_msg, /*use_runtime_bcp_and_image=*/false); ASSERT_TRUE(success) << error_msg; - argv.insert(argv.end(), { - "--profile-file=" + profile_file.GetFilename(), - "--runtime-arg", - "-Xbootclasspath:" + base_bcp_string, - "--runtime-arg", - "-Xbootclasspath-locations:" + base_bcp_locations_string, - "--boot-image=" + base_image_location, - "--dex-file=" + app_jar_name, - "--dex-location=" + app_jar_name, - "--oat-file=" + app_odex_name, - "--app-image-file=" + app_image_name, - "--initialize-app-image-classes=true", - }); + argv.insert(argv.end(), + { + "--profile-file=" + profile_file.GetFilename(), + "--runtime-arg", + "-Xbootclasspath:" + base_bcp_string, + "--runtime-arg", + "-Xbootclasspath-locations:" + base_bcp_locations_string, + "--boot-image=" + base_image_location, + "--dex-file=" + app_jar_name, + "--dex-location=" + app_jar_name, + "--oat-file=" + app_odex_name, + "--app-image-file=" + app_image_name, + "--initialize-app-image-classes=true", + }); success = RunDex2Oat(argv, &error_msg); ASSERT_TRUE(success) << error_msg; } @@ -136,15 +137,16 @@ TEST_F(ImageSpaceTest, StringDeduplication) { extra_reservation = MemMap::Invalid(); return ImageSpace::LoadBootImage(bcp, bcp_locations, - /*boot_class_path_fds=*/ std::vector<int>(), - /*boot_class_path_image_fds=*/ std::vector<int>(), - /*boot_class_path_vdex_fds=*/ std::vector<int>(), - /*boot_class_path_oat_fds=*/ std::vector<int>(), + /*boot_class_path_fds=*/std::vector<int>(), + /*boot_class_path_image_fds=*/std::vector<int>(), + /*boot_class_path_vdex_fds=*/std::vector<int>(), + /*boot_class_path_oat_fds=*/std::vector<int>(), full_image_locations, kRuntimeISA, - /*relocate=*/ false, - /*executable=*/ true, - /*extra_reservation_size=*/ 0u, + /*relocate=*/false, + /*executable=*/true, + /*extra_reservation_size=*/0u, + /*allow_in_memory_compilation=*/false, &boot_image_spaces, &extra_reservation); }; @@ -153,13 +155,13 @@ TEST_F(ImageSpaceTest, StringDeduplication) { size_t test_string_length = std::size(test_string) - 1u; // Equals UTF-16 length. 
uint32_t hash = InternTable::Utf8String::Hash(test_string_length, test_string); InternTable::Utf8String utf8_test_string(test_string_length, test_string); - auto contains_test_string = [utf8_test_string, hash](ImageSpace* space) - REQUIRES_SHARED(Locks::mutator_lock_) { + auto contains_test_string = [utf8_test_string, + hash](ImageSpace* space) REQUIRES_SHARED(Locks::mutator_lock_) { const ImageHeader& image_header = space->GetImageHeader(); if (image_header.GetInternedStringsSection().Size() != 0u) { const uint8_t* data = space->Begin() + image_header.GetInternedStringsSection().Offset(); size_t read_count; - InternTable::UnorderedSet temp_set(data, /*make_copy_of_data=*/ false, &read_count); + InternTable::UnorderedSet temp_set(data, /*make_copy_of_data=*/false, &read_count); return temp_set.FindWithHash(utf8_test_string, hash) != temp_set.end(); } else { return false; @@ -170,8 +172,7 @@ TEST_F(ImageSpaceTest, StringDeduplication) { ScopedObjectAccess soa(Thread::Current()); ASSERT_EQ(2u, extension_image_locations.size()); full_image_locations = { - base_image_location, extension_image_locations[0], extension_image_locations[1] - }; + base_image_location, extension_image_locations[0], extension_image_locations[1]}; bool success = load_boot_image(); ASSERT_TRUE(success); ASSERT_EQ(bcp.size(), boot_image_spaces.size()); @@ -183,8 +184,7 @@ TEST_F(ImageSpaceTest, StringDeduplication) { std::swap(bcp[bcp.size() - 2u], bcp[bcp.size() - 1u]); std::swap(bcp_locations[bcp_locations.size() - 2u], bcp_locations[bcp_locations.size() - 1u]); full_image_locations = { - base_image_location, extension_image_locations[1], extension_image_locations[0] - }; + base_image_location, extension_image_locations[1], extension_image_locations[0]}; success = load_boot_image(); ASSERT_TRUE(success); ASSERT_EQ(bcp.size(), boot_image_spaces.size()); @@ -203,21 +203,21 @@ TEST_F(ImageSpaceTest, StringDeduplication) { // Load the app odex file and app image. std::string error_msg; - std::unique_ptr<OatFile> odex_file(OatFile::Open(/*zip_fd=*/ -1, - app_odex_name.c_str(), - app_odex_name.c_str(), - /*executable=*/ false, - /*low_4gb=*/ false, + std::unique_ptr<OatFile> odex_file(OatFile::Open(/*zip_fd=*/-1, + app_odex_name, + app_odex_name, + /*executable=*/false, + /*low_4gb=*/false, app_jar_name, &error_msg)); ASSERT_TRUE(odex_file != nullptr) << error_msg; std::vector<ImageSpace*> non_owning_boot_image_spaces = MakeNonOwningPointerVector(boot_image_spaces); - std::unique_ptr<ImageSpace> app_image_space = ImageSpace::CreateFromAppImage( - app_image_name.c_str(), - odex_file.get(), - ArrayRef<ImageSpace* const>(non_owning_boot_image_spaces), - &error_msg); + std::unique_ptr<ImageSpace> app_image_space = + ImageSpace::CreateFromAppImage(app_image_name.c_str(), + odex_file.get(), + ArrayRef<ImageSpace* const>(non_owning_boot_image_spaces), + &error_msg); ASSERT_TRUE(app_image_space != nullptr) << error_msg; // The string in the app image should be replaced and removed from interned string section. 
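A similar sketch for the new five-argument ValidateOatFile() overload, which takes the APEX versions from the caller instead of from Runtime::Current(). The helper name and paths are placeholders; the OatFile::Open() call mirrors the one used in the DexoptTest hunk below.

#include <memory>
#include <string>
#include <vector>
#include "base/array_ref.h"
#include "gc/space/image_space.h"
#include "oat_file.h"

// Sketch: validate an odex file against explicit dex files and APEX versions,
// e.g. from host-side tooling where no runtime is initialized.
bool ValidateOdexAgainstDex(const std::string& oat_path,
                            const std::string& dex_path,
                            const std::string& apex_versions,
                            /*out*/ std::string* error_msg) {
  std::unique_ptr<art::OatFile> oat_file(art::OatFile::Open(/*zip_fd=*/ -1,
                                                            oat_path,
                                                            oat_path,
                                                            /*executable=*/ false,
                                                            /*low_4gb=*/ false,
                                                            error_msg));
  if (oat_file == nullptr) {
    return false;
  }
  std::vector<std::string> dex_filenames{dex_path};
  // The five-argument overload does not require an active runtime.
  return art::gc::space::ImageSpace::ValidateOatFile(
      *oat_file,
      error_msg,
      art::ArrayRef<const std::string>(dex_filenames),
      /*dex_fds=*/ art::ArrayRef<const int>(),
      apex_versions);
}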
@@ -242,25 +242,25 @@ TEST_F(DexoptTest, ValidateOatFile) { args.push_back("--oat-file=" + oat_location); ASSERT_TRUE(Dex2Oat(args, &error_msg)) << error_msg; - std::unique_ptr<OatFile> oat(OatFile::Open(/*zip_fd=*/ -1, - oat_location.c_str(), - oat_location.c_str(), - /*executable=*/ false, - /*low_4gb=*/ false, + std::unique_ptr<OatFile> oat(OatFile::Open(/*zip_fd=*/-1, + oat_location, + oat_location, + /*executable=*/false, + /*low_4gb=*/false, &error_msg)); ASSERT_TRUE(oat != nullptr) << error_msg; { // Test opening the oat file also with explicit dex filenames. - std::vector<std::string> dex_filenames{ dex1, multidex1, dex2 }; - std::unique_ptr<OatFile> oat2(OatFile::Open(/*zip_fd=*/ -1, - oat_location.c_str(), - oat_location.c_str(), - /*executable=*/ false, - /*low_4gb=*/ false, + std::vector<std::string> dex_filenames{dex1, multidex1, dex2}; + std::unique_ptr<OatFile> oat2(OatFile::Open(/*zip_fd=*/-1, + oat_location, + oat_location, + /*executable=*/false, + /*low_4gb=*/false, ArrayRef<const std::string>(dex_filenames), - /*dex_fds=*/ ArrayRef<const int>(), - /*reservation=*/ nullptr, + /*dex_fds=*/ArrayRef<const int>(), + /*reservation=*/nullptr, &error_msg)); ASSERT_TRUE(oat2 != nullptr) << error_msg; } @@ -321,56 +321,6 @@ TEST_F(DexoptTest, ValidateOatFile) { EXPECT_FALSE(ImageSpace::ValidateOatFile(*oat, &error_msg)); } -TEST_F(DexoptTest, Checksums) { - Runtime* runtime = Runtime::Current(); - ASSERT_TRUE(runtime != nullptr); - ASSERT_FALSE(runtime->GetHeap()->GetBootImageSpaces().empty()); - - std::vector<std::string> bcp = runtime->GetBootClassPath(); - std::vector<std::string> bcp_locations = runtime->GetBootClassPathLocations(); - std::vector<const DexFile*> dex_files = runtime->GetClassLinker()->GetBootClassPath(); - - std::string error_msg; - auto create_and_verify = [&]() { - std::string checksums = gc::space::ImageSpace::GetBootClassPathChecksums( - ArrayRef<gc::space::ImageSpace* const>(runtime->GetHeap()->GetBootImageSpaces()), - ArrayRef<const DexFile* const>(dex_files)); - return gc::space::ImageSpace::VerifyBootClassPathChecksums( - checksums, - android::base::Join(bcp_locations, ':'), - ArrayRef<const std::string>(runtime->GetImageLocations()), - ArrayRef<const std::string>(bcp_locations), - ArrayRef<const std::string>(bcp), - /*boot_class_path_fds=*/ ArrayRef<const int>(), - kRuntimeISA, - &error_msg); - }; - - ASSERT_TRUE(create_and_verify()) << error_msg; - - std::vector<std::unique_ptr<const DexFile>> opened_dex_files; - for (const std::string& src : { GetDexSrc1(), GetDexSrc2() }) { - std::vector<std::unique_ptr<const DexFile>> new_dex_files; - const ArtDexFileLoader dex_file_loader; - ASSERT_TRUE(dex_file_loader.Open(src.c_str(), - src, - /*verify=*/ true, - /*verify_checksum=*/ false, - &error_msg, - &new_dex_files)) - << error_msg; - - bcp.push_back(src); - bcp_locations.push_back(src); - for (std::unique_ptr<const DexFile>& df : new_dex_files) { - dex_files.push_back(df.get()); - opened_dex_files.push_back(std::move(df)); - } - - ASSERT_TRUE(create_and_verify()) << error_msg; - } -} - template <bool kImage, bool kRelocate> class ImageSpaceLoadingTest : public CommonRuntimeTest { protected: @@ -380,6 +330,7 @@ class ImageSpaceLoadingTest : public CommonRuntimeTest { options->emplace_back(android::base::StringPrintf("-Ximage:%s", image_location.c_str()), nullptr); options->emplace_back(kRelocate ? 
"-Xrelocate" : "-Xnorelocate", nullptr); + options->emplace_back("-Xallowinmemorycompilation", nullptr); // We want to test the relocation behavior of ImageSpace. As such, don't pretend we're a // compiler. diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc index 2d17a18a36..f1df45f19a 100644 --- a/runtime/gc/space/large_object_space.cc +++ b/runtime/gc/space/large_object_space.cc @@ -336,7 +336,7 @@ class AllocationInfo { size_t FreeListSpace::GetSlotIndexForAllocationInfo(const AllocationInfo* info) const { DCHECK_GE(info, allocation_info_); - DCHECK_LT(info, reinterpret_cast<AllocationInfo*>(allocation_info_map_.End())); + DCHECK_LE(info, reinterpret_cast<AllocationInfo*>(allocation_info_map_.End())); return info - allocation_info_; } @@ -457,6 +457,10 @@ size_t FreeListSpace::Free(Thread* self, mirror::Object* obj) { // The previous allocation info must not be free since we are supposed to always coalesce. DCHECK_EQ(info->GetPrevFreeBytes(), 0U) << "Previous allocation was free"; } + // NOTE: next_info could be pointing right after the allocation_info_map_ + // when freeing object in the very end of the space. But that's safe + // as we don't dereference it in that case. We only use it to calculate + // next_addr using offset within the map. uintptr_t next_addr = GetAddressForAllocationInfo(next_info); if (next_addr >= free_end_start) { // Easy case, the next chunk is the end free region. diff --git a/runtime/gc/space/malloc_space.h b/runtime/gc/space/malloc_space.h index 50006568ca..59ab3f3214 100644 --- a/runtime/gc/space/malloc_space.h +++ b/runtime/gc/space/malloc_space.h @@ -38,7 +38,7 @@ class ZygoteSpace; // A common parent of DlMallocSpace and RosAllocSpace. class MallocSpace : public ContinuousMemMapAllocSpace { public: - typedef void(*WalkCallback)(void *start, void *end, size_t num_bytes, void* callback_arg); + using WalkCallback = void (*)(void *start, void *end, size_t num_bytes, void* callback_arg); SpaceType GetType() const override { return kSpaceTypeMallocSpace; diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h index 901568e546..1026f42c27 100644 --- a/runtime/gc/space/region_space-inl.h +++ b/runtime/gc/space/region_space-inl.h @@ -17,8 +17,6 @@ #ifndef ART_RUNTIME_GC_SPACE_REGION_SPACE_INL_H_ #define ART_RUNTIME_GC_SPACE_REGION_SPACE_INL_H_ -#include "region_space.h" - #include "base/mutex-inl.h" #include "mirror/object-inl.h" #include "region_space.h" diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc index 171c5cdebc..60141d656b 100644 --- a/runtime/gc/space/region_space.cc +++ b/runtime/gc/space/region_space.cc @@ -36,7 +36,7 @@ static constexpr uint kEvacuateLivePercentThreshold = 75U; static constexpr bool kProtectClearedRegions = kIsDebugBuild; // Wether we poison memory areas occupied by dead objects in unevacuated regions. -static constexpr bool kPoisonDeadObjectsInUnevacuatedRegions = true; +static constexpr bool kPoisonDeadObjectsInUnevacuatedRegions = kIsDebugBuild; // Special 32-bit value used to poison memory areas occupied by dead // objects in unevacuated regions. Dereferencing this value is expected @@ -741,10 +741,19 @@ bool RegionSpace::LogFragmentationAllocFailure(std::ostream& os, max_contiguous_allocation = std::min(max_contiguous_allocation, regions_free_for_alloc * kRegionSize); if (failed_alloc_bytes > max_contiguous_allocation) { + // Region space does not normally fragment in the conventional sense. 
However we can run out + // of region space prematurely if we have many threads, each with a partially committed TLAB. + // The whole TLAB uses up region address space, but we only count the section that was + // actually given to the thread so far as allocated. For unlikely allocation request sequences + // involving largish objects that don't qualify for large objects space, we may also be unable + // to fully utilize entire TLABs, and thus generate enough actual fragmentation to get + // here. This appears less likely, since we usually reuse sufficiently large TLAB "tails" + // that are no longer needed. os << "; failed due to fragmentation (largest possible contiguous allocation " - << max_contiguous_allocation << " bytes). Number of " - << PrettySize(kRegionSize) - << " sized free regions are: " << regions_free_for_alloc; + << max_contiguous_allocation << " bytes). Number of " << PrettySize(kRegionSize) + << " sized free regions are: " << regions_free_for_alloc + << ". Likely cause: (1) Too much memory in use, and " + << "(2) many threads or many larger objects of the wrong kind"; return true; } // Caller's job to print failed_alloc_bytes. diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h index 1463eb7d2a..27b9e9c367 100644 --- a/runtime/gc/space/region_space.h +++ b/runtime/gc/space/region_space.h @@ -46,7 +46,7 @@ static constexpr bool kCyclicRegionAllocation = kIsDebugBuild; // A space that consists of equal-sized regions. class RegionSpace final : public ContinuousMemMapAllocSpace { public: - typedef void(*WalkCallback)(void *start, void *end, size_t num_bytes, void* callback_arg); + using WalkCallback = void (*)(void *start, void *end, size_t num_bytes, void* callback_arg); enum EvacMode { kEvacModeNewlyAllocated, diff --git a/runtime/gc/system_weak.h b/runtime/gc/system_weak.h index ef85b3942f..77b9548211 100644 --- a/runtime/gc/system_weak.h +++ b/runtime/gc/system_weak.h @@ -48,7 +48,7 @@ class SystemWeakHolder : public AbstractSystemWeakHolder { void Allow() override REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!allow_disallow_lock_) { - CHECK(!kUseReadBarrier); + CHECK(!gUseReadBarrier); MutexLock mu(Thread::Current(), allow_disallow_lock_); allow_new_system_weak_ = true; new_weak_condition_.Broadcast(Thread::Current()); @@ -57,7 +57,7 @@ class SystemWeakHolder : public AbstractSystemWeakHolder { void Disallow() override REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(!allow_disallow_lock_) { - CHECK(!kUseReadBarrier); + CHECK(!gUseReadBarrier); MutexLock mu(Thread::Current(), allow_disallow_lock_); allow_new_system_weak_ = false; } @@ -78,8 +78,8 @@ class SystemWeakHolder : public AbstractSystemWeakHolder { REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(allow_disallow_lock_) { // Wait for GC's sweeping to complete and allow new records - while (UNLIKELY((!kUseReadBarrier && !allow_new_system_weak_) || - (kUseReadBarrier && !self->GetWeakRefAccessEnabled()))) { + while (UNLIKELY((!gUseReadBarrier && !allow_new_system_weak_) || + (gUseReadBarrier && !self->GetWeakRefAccessEnabled()))) { // Check and run the empty checkpoint before blocking so the empty checkpoint will work in the // presence of threads blocking for weak ref access. 
self->CheckEmptyCheckpointFromWeakRefAccess(&allow_disallow_lock_);
diff --git a/runtime/gc/system_weak_test.cc b/runtime/gc/system_weak_test.cc
index ca112972c2..dd936538e5 100644
--- a/runtime/gc/system_weak_test.cc
+++ b/runtime/gc/system_weak_test.cc
@@ -35,6 +35,10 @@ namespace art {
namespace gc {
class SystemWeakTest : public CommonRuntimeTest {
+ protected:
+ SystemWeakTest() {
+ use_boot_image_ = true; // Make the Runtime creation cheaper.
+ }
};
struct CountingSystemWeakHolder : public SystemWeakHolder {
@@ -111,6 +115,7 @@ static bool CollectorDoesAllowOrBroadcast() {
CollectorType type = Runtime::Current()->GetHeap()->CurrentCollectorType();
switch (type) {
case CollectorType::kCollectorTypeCMS:
+ case CollectorType::kCollectorTypeCMC:
case CollectorType::kCollectorTypeCC:
case CollectorType::kCollectorTypeSS:
return true;
@@ -124,6 +129,7 @@ static bool CollectorDoesDisallow() {
CollectorType type = Runtime::Current()->GetHeap()->CurrentCollectorType();
switch (type) {
case CollectorType::kCollectorTypeCMS:
+ case CollectorType::kCollectorTypeCMC:
return true;
default:
@@ -149,7 +155,12 @@ TEST_F(SystemWeakTest, Keep) {
// Expect the holder to have been called.
EXPECT_EQ(CollectorDoesAllowOrBroadcast() ? 1U : 0U, cswh.allow_count_);
EXPECT_EQ(CollectorDoesDisallow() ? 1U : 0U, cswh.disallow_count_);
- EXPECT_EQ(1U, cswh.sweep_count_);
+ // Userfaultfd GC also uses SweepSystemWeaks for concurrent updates.
+ // TODO: Explore whether this can be reverted back to unconditionally comparing with 1
+ // once concurrent updating of native roots is fully implemented in the userfaultfd
+ // GC.
+ size_t expected_sweep_count = gUseUserfaultfd ? 2U : 1U;
+ EXPECT_EQ(expected_sweep_count, cswh.sweep_count_);
// Expect the weak to not be cleared.
EXPECT_FALSE(cswh.Get().IsNull());
@@ -170,7 +181,12 @@ TEST_F(SystemWeakTest, Discard) {
// Expect the holder to have been called.
EXPECT_EQ(CollectorDoesAllowOrBroadcast() ? 1U : 0U, cswh.allow_count_);
EXPECT_EQ(CollectorDoesDisallow() ? 1U : 0U, cswh.disallow_count_);
- EXPECT_EQ(1U, cswh.sweep_count_);
+ // Userfaultfd GC also uses SweepSystemWeaks for concurrent updates.
+ // TODO: Explore whether this can be reverted back to unconditionally comparing with 1
+ // once concurrent updating of native roots is fully implemented in the userfaultfd
+ // GC.
+ size_t expected_sweep_count = gUseUserfaultfd ? 2U : 1U;
+ EXPECT_EQ(expected_sweep_count, cswh.sweep_count_);
// Expect the weak to be cleared.
EXPECT_TRUE(cswh.Get().IsNull());
@@ -194,7 +210,12 @@ TEST_F(SystemWeakTest, Remove) {
// Expect the holder to have been called.
ASSERT_EQ(CollectorDoesAllowOrBroadcast() ? 1U : 0U, cswh.allow_count_);
ASSERT_EQ(CollectorDoesDisallow() ? 1U : 0U, cswh.disallow_count_);
- ASSERT_EQ(1U, cswh.sweep_count_);
+ // Userfaultfd GC also uses SweepSystemWeaks for concurrent updates.
+ // TODO: Explore whether this can be reverted back to unconditionally comparing with 1
+ // once concurrent updating of native roots is fully implemented in the userfaultfd
+ // GC.
+ size_t expected_sweep_count = gUseUserfaultfd ? 2U : 1U;
+ EXPECT_EQ(expected_sweep_count, cswh.sweep_count_);
// Expect the weak to not be cleared.
ASSERT_FALSE(cswh.Get().IsNull());
@@ -209,7 +230,7 @@ TEST_F(SystemWeakTest, Remove) {
// Expectation: no change in the numbers.
EXPECT_EQ(CollectorDoesAllowOrBroadcast() ? 1U : 0U, cswh.allow_count_);
EXPECT_EQ(CollectorDoesDisallow() ?
1U : 0U, cswh.disallow_count_); - EXPECT_EQ(1U, cswh.sweep_count_); + EXPECT_EQ(expected_sweep_count, cswh.sweep_count_); } } // namespace gc diff --git a/runtime/gc/verification-inl.h b/runtime/gc/verification-inl.h new file mode 100644 index 0000000000..1ef96e2954 --- /dev/null +++ b/runtime/gc/verification-inl.h @@ -0,0 +1,63 @@ +/* + * Copyright 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_RUNTIME_GC_VERIFICATION_INL_H_ +#define ART_RUNTIME_GC_VERIFICATION_INL_H_ + +#include "verification.h" + +#include "mirror/class-inl.h" + +namespace art { +namespace gc { + +template <ReadBarrierOption kReadBarrierOption> +bool Verification::IsValidClassUnchecked(mirror::Class* klass) const { + mirror::Class* k1 = klass->GetClass<kVerifyNone, kReadBarrierOption>(); + if (!IsValidHeapObjectAddress(k1)) { + return false; + } + // `k1` should be class class, take the class again to verify. + // Note that this check may not be valid for the no image space + // since the class class might move around from moving GC. + mirror::Class* k2 = k1->GetClass<kVerifyNone, kReadBarrierOption>(); + if (!IsValidHeapObjectAddress(k2)) { + return false; + } + return k1 == k2; +} + +template <ReadBarrierOption kReadBarrierOption> +bool Verification::IsValidClass(mirror::Class* klass) const { + if (!IsValidHeapObjectAddress(klass)) { + return false; + } + return IsValidClassUnchecked<kReadBarrierOption>(klass); +} + +template <ReadBarrierOption kReadBarrierOption> +bool Verification::IsValidObject(mirror::Object* obj) const { + if (!IsValidHeapObjectAddress(obj)) { + return false; + } + mirror::Class* klass = obj->GetClass<kVerifyNone, kReadBarrierOption>(); + return IsValidClass(klass); +} + +} // namespace gc +} // namespace art + +#endif // ART_RUNTIME_GC_VERIFICATION_INL_H_ diff --git a/runtime/gc/verification.cc b/runtime/gc/verification.cc index 9e0b8a2ff1..195986f04d 100644 --- a/runtime/gc/verification.cc +++ b/runtime/gc/verification.cc @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "verification.h" +#include "verification-inl.h" #include <iomanip> #include <sstream> @@ -29,23 +29,16 @@ namespace art { namespace gc { std::string Verification::DumpRAMAroundAddress(uintptr_t addr, uintptr_t bytes) const { - const uintptr_t dump_start = addr - bytes; - const uintptr_t dump_end = addr + bytes; + uintptr_t* dump_start = reinterpret_cast<uintptr_t*>(addr - bytes); + uintptr_t* dump_end = reinterpret_cast<uintptr_t*>(addr + bytes); std::ostringstream oss; - if (dump_start < dump_end && - IsAddressInHeapSpace(reinterpret_cast<const void*>(dump_start)) && - IsAddressInHeapSpace(reinterpret_cast<const void*>(dump_end - 1))) { - oss << " adjacent_ram="; - for (uintptr_t p = dump_start; p < dump_end; ++p) { - if (p == addr) { - // Marker of where the address is. 
- oss << "|"; - } - uint8_t* ptr = reinterpret_cast<uint8_t*>(p); - oss << std::hex << std::setfill('0') << std::setw(2) << static_cast<uintptr_t>(*ptr); + oss << " adjacent_ram="; + for (const uintptr_t* p = dump_start; p < dump_end; ++p) { + if (p == reinterpret_cast<uintptr_t*>(addr)) { + // Marker of where the address is. + oss << "|"; } - } else { - oss << " <invalid address>"; + oss << std::hex << std::setfill('0') << std::setw(sizeof(uintptr_t) * 2) << *p << " "; } return oss.str(); } @@ -93,7 +86,7 @@ void Verification::LogHeapCorruption(ObjPtr<mirror::Object> holder, std::ostringstream oss; oss << "GC tried to mark invalid reference " << ref << std::endl; oss << DumpObjectInfo(ref, "ref") << "\n"; - oss << DumpObjectInfo(holder.Ptr(), "holder"); + oss << DumpObjectInfo(holder.Ptr(), "holder") << "\n"; if (holder != nullptr) { mirror::Class* holder_klass = holder->GetClass<kVerifyNone, kWithoutReadBarrier>(); if (IsValidClass(holder_klass)) { @@ -132,25 +125,6 @@ bool Verification::IsValidHeapObjectAddress(const void* addr, space::Space** out return IsAligned<kObjectAlignment>(addr) && IsAddressInHeapSpace(addr, out_space); } -bool Verification::IsValidClass(const void* addr) const { - if (!IsValidHeapObjectAddress(addr)) { - return false; - } - mirror::Class* klass = reinterpret_cast<mirror::Class*>(const_cast<void*>(addr)); - mirror::Class* k1 = klass->GetClass<kVerifyNone, kWithoutReadBarrier>(); - if (!IsValidHeapObjectAddress(k1)) { - return false; - } - // `k1` should be class class, take the class again to verify. - // Note that this check may not be valid for the no image space since the class class might move - // around from moving GC. - mirror::Class* k2 = k1->GetClass<kVerifyNone, kWithoutReadBarrier>(); - if (!IsValidHeapObjectAddress(k2)) { - return false; - } - return k1 == k2; -} - using ObjectSet = std::set<mirror::Object*>; using WorkQueue = std::deque<std::pair<mirror::Object*, std::string>>; diff --git a/runtime/gc/verification.h b/runtime/gc/verification.h index 6b456fd349..7a5d01a40a 100644 --- a/runtime/gc/verification.h +++ b/runtime/gc/verification.h @@ -19,6 +19,7 @@ #include "obj_ptr.h" #include "offsets.h" +#include "read_barrier_option.h" namespace art { @@ -50,7 +51,16 @@ class Verification { bool fatal) const REQUIRES_SHARED(Locks::mutator_lock_); // Return true if the klass is likely to be a valid mirror::Class. - bool IsValidClass(const void* klass) const REQUIRES_SHARED(Locks::mutator_lock_); + // Returns true if the class is a valid mirror::Class or possibly spuriously. + template <ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier> + bool IsValidClassUnchecked(mirror::Class* klass) const + REQUIRES_SHARED(Locks::mutator_lock_); + // Return true if the klass is likely to be a valid mirror::Class. + template <ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier> + bool IsValidClass(mirror::Class* klass) const REQUIRES_SHARED(Locks::mutator_lock_); + // Return true if the obj is likely to be a valid obj with valid mirror::Class. + template <ReadBarrierOption kReadBarrierOption = kWithoutReadBarrier> + bool IsValidObject(mirror::Object* obj) const REQUIRES_SHARED(Locks::mutator_lock_); // Does not allow null, checks alignment. bool IsValidHeapObjectAddress(const void* addr, space::Space** out_space = nullptr) const |
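Finally, a sketch of how the templated validity checks added in verification-inl.h might be used from a debug path. It assumes a Heap::GetVerification() accessor and the LogHeapCorruption() signature shown above; the function itself and its name are illustrative, and callers would need to hold the mutator lock.

#include "gc/heap.h"
#include "gc/verification-inl.h"
#include "mirror/object.h"
#include "runtime.h"

// Sketch: reject a corrupt reference from a GC debug path. The default
// kWithoutReadBarrier template argument keeps this usable while read barriers
// are disallowed, e.g. during userfaultfd/CMC compaction.
void DebugCheckReference(art::mirror::Object* holder,
                         art::MemberOffset offset,
                         art::mirror::Object* ref) {
  const art::gc::Verification* v =
      art::Runtime::Current()->GetHeap()->GetVerification();
  if (ref != nullptr && !v->IsValidObject<art::kWithoutReadBarrier>(ref)) {
    v->LogHeapCorruption(holder, offset, ref, /*fatal=*/ true);
  }
}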