Diffstat (limited to 'runtime/base')
-rw-r--r--  runtime/base/atomic_pair.h             |   8
-rw-r--r--  runtime/base/gc_visited_arena_pool.cc  | 293
-rw-r--r--  runtime/base/gc_visited_arena_pool.h   | 237
-rw-r--r--  runtime/base/locks.h                   |   2
-rw-r--r--  runtime/base/mem_map_arena_pool.cc     |  25
-rw-r--r--  runtime/base/message_queue_test.cc     |  10
-rw-r--r--  runtime/base/mutex.cc                  |  19
-rw-r--r--  runtime/base/mutex_test.cc             |  10
-rw-r--r--  runtime/base/timing_logger.cc          |   2
-rw-r--r--  runtime/base/timing_logger_test.cc     |   4
10 files changed, 585 insertions, 25 deletions
diff --git a/runtime/base/atomic_pair.h b/runtime/base/atomic_pair.h
index 3e9e820c43..1523b3b049 100644
--- a/runtime/base/atomic_pair.h
+++ b/runtime/base/atomic_pair.h
@@ -40,18 +40,16 @@ struct PACKED(2 * sizeof(IntType)) AtomicPair {
 template <typename IntType>
 ALWAYS_INLINE static inline AtomicPair<IntType> AtomicPairLoadAcquire(
     std::atomic<AtomicPair<IntType>>* target) {
-  static_assert(std::atomic<AtomicPair<IntType>>::is_always_lock_free);
   return target->load(std::memory_order_acquire);
 }
 
 template <typename IntType>
-ALWAYS_INLINE static inline void AtomicPairStoreRelease(
-    std::atomic<AtomicPair<IntType>>* target, AtomicPair<IntType> value) {
-  static_assert(std::atomic<AtomicPair<IntType>>::is_always_lock_free);
+ALWAYS_INLINE static inline void AtomicPairStoreRelease(std::atomic<AtomicPair<IntType>>* target,
+                                                        AtomicPair<IntType> value) {
   target->store(value, std::memory_order_release);
 }
 
-// llvm does not implement 16-byte atomic operations on x86-64.
+// LLVM uses generic lock-based implementation for x86_64, we can do better with CMPXCHG16B.
 #if defined(__x86_64__)
 ALWAYS_INLINE static inline AtomicPair<uint64_t> AtomicPairLoadAcquire(
     std::atomic<AtomicPair<uint64_t>>* target) {
diff --git a/runtime/base/gc_visited_arena_pool.cc b/runtime/base/gc_visited_arena_pool.cc
new file mode 100644
index 0000000000..52b3829401
--- /dev/null
+++ b/runtime/base/gc_visited_arena_pool.cc
@@ -0,0 +1,293 @@
+/*
+ * Copyright 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "base/gc_visited_arena_pool.h"
+
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "base/arena_allocator-inl.h"
+#include "base/memfd.h"
+#include "base/utils.h"
+#include "gc/collector/mark_compact-inl.h"
+
+namespace art {
+
+TrackedArena::TrackedArena(uint8_t* start, size_t size, bool pre_zygote_fork)
+    : Arena(), first_obj_array_(nullptr), pre_zygote_fork_(pre_zygote_fork) {
+  static_assert(ArenaAllocator::kArenaAlignment <= kPageSize,
+                "Arena should not need stronger alignment than kPageSize.");
+  DCHECK_ALIGNED(size, kPageSize);
+  DCHECK_ALIGNED(start, kPageSize);
+  memory_ = start;
+  size_ = size;
+  size_t arr_size = size / kPageSize;
+  first_obj_array_.reset(new uint8_t*[arr_size]);
+  std::fill_n(first_obj_array_.get(), arr_size, nullptr);
+}
+
+void TrackedArena::Release() {
+  if (bytes_allocated_ > 0) {
+    // Userfaultfd GC uses MAP_SHARED mappings for linear-alloc and therefore
+    // MADV_DONTNEED will not free the pages from page cache. Therefore use
+    // MADV_REMOVE instead, which is meant for this purpose.
+    // Arenas allocated pre-zygote fork are private anonymous and hence must be
+    // released using MADV_DONTNEED.
+    if (!gUseUserfaultfd || pre_zygote_fork_ ||
+        (madvise(Begin(), Size(), MADV_REMOVE) == -1 && errno == EINVAL)) {
+      // MADV_REMOVE fails if invoked on anonymous mapping, which could happen
+      // if the arena is released before userfaultfd-GC starts using memfd. So
+      // use MADV_DONTNEED.
+      ZeroAndReleasePages(Begin(), Size());
+    }
+    std::fill_n(first_obj_array_.get(), Size() / kPageSize, nullptr);
+    bytes_allocated_ = 0;
+  }
+}
+
+void TrackedArena::SetFirstObject(uint8_t* obj_begin, uint8_t* obj_end) {
+  DCHECK_LE(static_cast<void*>(Begin()), static_cast<void*>(obj_end));
+  DCHECK_LT(static_cast<void*>(obj_begin), static_cast<void*>(obj_end));
+  size_t idx = static_cast<size_t>(obj_begin - Begin()) / kPageSize;
+  size_t last_byte_idx = static_cast<size_t>(obj_end - 1 - Begin()) / kPageSize;
+  // If the addr is at the beginning of a page, then we set it for that page too.
+  if (IsAligned<kPageSize>(obj_begin)) {
+    first_obj_array_[idx] = obj_begin;
+  }
+  while (idx < last_byte_idx) {
+    first_obj_array_[++idx] = obj_begin;
+  }
+}
+
+uint8_t* GcVisitedArenaPool::AddMap(size_t min_size) {
+  size_t size = std::max(min_size, kLinearAllocPoolSize);
+#if defined(__LP64__)
+  // This is true only when we are running a 64-bit dex2oat to compile a 32-bit image.
+  if (low_4gb_) {
+    size = std::max(min_size, kLow4GBLinearAllocPoolSize);
+  }
+#endif
+  size_t alignment = BestPageTableAlignment(size);
+  DCHECK_GE(size, kPMDSize);
+  std::string err_msg;
+  maps_.emplace_back(MemMap::MapAnonymousAligned(
+      name_, size, PROT_READ | PROT_WRITE, low_4gb_, alignment, &err_msg));
+  MemMap& map = maps_.back();
+  if (!map.IsValid()) {
+    LOG(FATAL) << "Failed to allocate " << name_ << ": " << err_msg;
+    UNREACHABLE();
+  }
+
+  if (gUseUserfaultfd) {
+    // Create a shadow-map for the map being added for userfaultfd GC
+    gc::collector::MarkCompact* mark_compact =
+        Runtime::Current()->GetHeap()->MarkCompactCollector();
+    DCHECK_NE(mark_compact, nullptr);
+    mark_compact->AddLinearAllocSpaceData(map.Begin(), map.Size());
+  }
+  Chunk* chunk = new Chunk(map.Begin(), map.Size());
+  best_fit_allocs_.insert(chunk);
+  free_chunks_.insert(chunk);
+  return map.Begin();
+}
+
+GcVisitedArenaPool::GcVisitedArenaPool(bool low_4gb, bool is_zygote, const char* name)
+    : bytes_allocated_(0), name_(name), low_4gb_(low_4gb), pre_zygote_fork_(is_zygote) {}
+
+GcVisitedArenaPool::~GcVisitedArenaPool() {
+  for (Chunk* chunk : free_chunks_) {
+    delete chunk;
+  }
+  // Must not delete chunks from best_fit_allocs_ as they are shared with
+  // free_chunks_.
+}
+
+size_t GcVisitedArenaPool::GetBytesAllocated() const {
+  std::lock_guard<std::mutex> lock(lock_);
+  return bytes_allocated_;
+}
+
+uint8_t* GcVisitedArenaPool::AddPreZygoteForkMap(size_t size) {
+  DCHECK(pre_zygote_fork_);
+  DCHECK(Runtime::Current()->IsZygote());
+  std::string pre_fork_name = "Pre-zygote-";
+  pre_fork_name += name_;
+  std::string err_msg;
+  maps_.emplace_back(MemMap::MapAnonymous(
+      pre_fork_name.c_str(), size, PROT_READ | PROT_WRITE, low_4gb_, &err_msg));
+  MemMap& map = maps_.back();
+  if (!map.IsValid()) {
+    LOG(FATAL) << "Failed to allocate " << pre_fork_name << ": " << err_msg;
+    UNREACHABLE();
+  }
+  return map.Begin();
+}
+
+Arena* GcVisitedArenaPool::AllocArena(size_t size) {
+  // Return only page aligned sizes so that madvise can be leveraged.
+  size = RoundUp(size, kPageSize);
+  std::lock_guard<std::mutex> lock(lock_);
+
+  if (pre_zygote_fork_) {
+    // The first fork out of zygote hasn't happened yet. Allocate arena in a
+    // private-anonymous mapping to retain clean pages across fork.
+    DCHECK(Runtime::Current()->IsZygote());
+    uint8_t* addr = AddPreZygoteForkMap(size);
+    auto emplace_result = allocated_arenas_.emplace(addr, size, /*pre_zygote_fork=*/true);
+    return const_cast<TrackedArena*>(&(*emplace_result.first));
+  }
+
+  Chunk temp_chunk(nullptr, size);
+  auto best_fit_iter = best_fit_allocs_.lower_bound(&temp_chunk);
+  if (UNLIKELY(best_fit_iter == best_fit_allocs_.end())) {
+    AddMap(size);
+    best_fit_iter = best_fit_allocs_.lower_bound(&temp_chunk);
+    CHECK(best_fit_iter != best_fit_allocs_.end());
+  }
+  auto free_chunks_iter = free_chunks_.find(*best_fit_iter);
+  DCHECK(free_chunks_iter != free_chunks_.end());
+  Chunk* chunk = *best_fit_iter;
+  DCHECK_EQ(chunk, *free_chunks_iter);
+  // if the best-fit chunk < 2x the requested size, then give the whole chunk.
+  if (chunk->size_ < 2 * size) {
+    DCHECK_GE(chunk->size_, size);
+    auto emplace_result = allocated_arenas_.emplace(chunk->addr_,
+                                                    chunk->size_,
+                                                    /*pre_zygote_fork=*/false);
+    DCHECK(emplace_result.second);
+    free_chunks_.erase(free_chunks_iter);
+    best_fit_allocs_.erase(best_fit_iter);
+    delete chunk;
+    return const_cast<TrackedArena*>(&(*emplace_result.first));
+  } else {
+    auto emplace_result = allocated_arenas_.emplace(chunk->addr_,
+                                                    size,
+                                                    /*pre_zygote_fork=*/false);
+    DCHECK(emplace_result.second);
+    // Compute next iterators for faster insert later.
+    auto next_best_fit_iter = best_fit_iter;
+    next_best_fit_iter++;
+    auto next_free_chunks_iter = free_chunks_iter;
+    next_free_chunks_iter++;
+    auto best_fit_nh = best_fit_allocs_.extract(best_fit_iter);
+    auto free_chunks_nh = free_chunks_.extract(free_chunks_iter);
+    best_fit_nh.value()->addr_ += size;
+    best_fit_nh.value()->size_ -= size;
+    DCHECK_EQ(free_chunks_nh.value()->addr_, chunk->addr_);
+    best_fit_allocs_.insert(next_best_fit_iter, std::move(best_fit_nh));
+    free_chunks_.insert(next_free_chunks_iter, std::move(free_chunks_nh));
+    return const_cast<TrackedArena*>(&(*emplace_result.first));
+  }
+}
+
+void GcVisitedArenaPool::FreeRangeLocked(uint8_t* range_begin, size_t range_size) {
+  Chunk temp_chunk(range_begin, range_size);
+  bool merge_with_next = false;
+  bool merge_with_prev = false;
+  auto next_iter = free_chunks_.lower_bound(&temp_chunk);
+  auto iter_for_extract = free_chunks_.end();
+  // Can we merge with the previous chunk?
+  if (next_iter != free_chunks_.begin()) {
+    auto prev_iter = next_iter;
+    prev_iter--;
+    merge_with_prev = (*prev_iter)->addr_ + (*prev_iter)->size_ == range_begin;
+    if (merge_with_prev) {
+      range_begin = (*prev_iter)->addr_;
+      range_size += (*prev_iter)->size_;
+      // Hold on to the iterator for faster extract later
+      iter_for_extract = prev_iter;
+    }
+  }
+  // Can we merge with the next chunk?
+  if (next_iter != free_chunks_.end()) {
+    merge_with_next = range_begin + range_size == (*next_iter)->addr_;
+    if (merge_with_next) {
+      range_size += (*next_iter)->size_;
+      if (merge_with_prev) {
+        auto iter = next_iter;
+        next_iter++;
+        // Keep only one of the two chunks to be expanded.
+        Chunk* chunk = *iter;
+        size_t erase_res = best_fit_allocs_.erase(chunk);
+        DCHECK_EQ(erase_res, 1u);
+        free_chunks_.erase(iter);
+        delete chunk;
+      } else {
+        iter_for_extract = next_iter;
+        next_iter++;
+      }
+    }
+  }
+
+  // Extract-insert avoids 2/4 destroys and 2/2 creations
+  // as compared to erase-insert, so use that when merging.
+  if (merge_with_prev || merge_with_next) {
+    auto free_chunks_nh = free_chunks_.extract(iter_for_extract);
+    auto best_fit_allocs_nh = best_fit_allocs_.extract(*iter_for_extract);
+
+    free_chunks_nh.value()->addr_ = range_begin;
+    DCHECK_EQ(best_fit_allocs_nh.value()->addr_, range_begin);
+    free_chunks_nh.value()->size_ = range_size;
+    DCHECK_EQ(best_fit_allocs_nh.value()->size_, range_size);
+
+    free_chunks_.insert(next_iter, std::move(free_chunks_nh));
+    // Since the chunk's size has expanded, the hint won't be useful
+    // for best-fit set.
+    best_fit_allocs_.insert(std::move(best_fit_allocs_nh));
+  } else {
+    DCHECK(iter_for_extract == free_chunks_.end());
+    Chunk* chunk = new Chunk(range_begin, range_size);
+    free_chunks_.insert(next_iter, chunk);
+    best_fit_allocs_.insert(chunk);
+  }
+}
+
+void GcVisitedArenaPool::FreeArenaChain(Arena* first) {
+  if (kRunningOnMemoryTool) {
+    for (Arena* arena = first; arena != nullptr; arena = arena->Next()) {
+      MEMORY_TOOL_MAKE_UNDEFINED(arena->Begin(), arena->GetBytesAllocated());
+    }
+  }
+
+  // TODO: Handle the case when arena_allocator::kArenaAllocatorPreciseTracking
+  // is true. See MemMapArenaPool::FreeArenaChain() for example.
+  CHECK(!arena_allocator::kArenaAllocatorPreciseTracking);
+
+  // madvise the arenas before acquiring lock for scalability
+  for (Arena* temp = first; temp != nullptr; temp = temp->Next()) {
+    temp->Release();
+  }
+
+  std::lock_guard<std::mutex> lock(lock_);
+  arenas_freed_ = true;
+  while (first != nullptr) {
+    FreeRangeLocked(first->Begin(), first->Size());
+    // In other implementations of ArenaPool this is calculated when asked for,
+    // thanks to the list of free arenas that is kept around. But in this case,
+    // we release the freed arena back to the pool and therefore need to
+    // calculate here.
+    bytes_allocated_ += first->GetBytesAllocated();
+    TrackedArena* temp = down_cast<TrackedArena*>(first);
+    // TODO: Add logic to unmap the maps corresponding to pre-zygote-fork
+    // arenas, which are expected to be released only during shutdown.
+    first = first->Next();
+    size_t erase_count = allocated_arenas_.erase(*temp);
+    DCHECK_EQ(erase_count, 1u);
+  }
+}
+
+}  // namespace art
diff --git a/runtime/base/gc_visited_arena_pool.h b/runtime/base/gc_visited_arena_pool.h
new file mode 100644
index 0000000000..e307147c9e
--- /dev/null
+++ b/runtime/base/gc_visited_arena_pool.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_
+#define ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_
+
+#include "base/casts.h"
+#include "base/arena_allocator.h"
+#include "base/locks.h"
+#include "base/mem_map.h"
+
+#include <set>
+
+namespace art {
+
+// GcVisitedArenaPool can be used for tracking allocations so that they can
+// be visited during GC to update the GC-roots inside them.
+
+// An Arena which tracks its allocations.
+class TrackedArena final : public Arena {
+ public:
+  // Used for searching in maps. Only arena's starting address is relevant.
+  explicit TrackedArena(uint8_t* addr) : pre_zygote_fork_(false) { memory_ = addr; }
+  TrackedArena(uint8_t* start, size_t size, bool pre_zygote_fork);
+
+  template <typename PageVisitor>
+  void VisitRoots(PageVisitor& visitor) const REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_ALIGNED(Size(), kPageSize);
+    DCHECK_ALIGNED(Begin(), kPageSize);
+    int nr_pages = Size() / kPageSize;
+    uint8_t* page_begin = Begin();
+    for (int i = 0; i < nr_pages && first_obj_array_[i] != nullptr; i++, page_begin += kPageSize) {
+      visitor(page_begin, first_obj_array_[i]);
+    }
+  }
+
+  // Return the page addr of the first page with first_obj set to nullptr.
+  uint8_t* GetLastUsedByte() const REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_ALIGNED(Begin(), kPageSize);
+    DCHECK_ALIGNED(End(), kPageSize);
+    // Jump past bytes-allocated for arenas which are not currently being used
+    // by arena-allocator. This helps in reducing loop iterations below.
+    uint8_t* last_byte = AlignUp(Begin() + GetBytesAllocated(), kPageSize);
+    DCHECK_LE(last_byte, End());
+    for (size_t i = (last_byte - Begin()) / kPageSize;
+         last_byte < End() && first_obj_array_[i] != nullptr;
+         last_byte += kPageSize, i++) {
+      // No body.
+    }
+    return last_byte;
+  }
+
+  uint8_t* GetFirstObject(uint8_t* addr) const REQUIRES_SHARED(Locks::mutator_lock_) {
+    DCHECK_LE(Begin(), addr);
+    DCHECK_GT(End(), addr);
+    return first_obj_array_[(addr - Begin()) / kPageSize];
+  }
+
+  // Set 'obj_begin' in first_obj_array_ in every element for which it's the
+  // first object.
+  void SetFirstObject(uint8_t* obj_begin, uint8_t* obj_end);
+
+  void Release() override;
+  bool IsPreZygoteForkArena() const { return pre_zygote_fork_; }
+
+ private:
+  // first_obj_array_[i] is the object that overlaps with the ith page's
+  // beginning, i.e. first_obj_array_[i] <= ith page_begin.
+  std::unique_ptr<uint8_t*[]> first_obj_array_;
+  const bool pre_zygote_fork_;
+};
+
+// An arena-pool wherein allocations can be tracked so that the GC can visit all
+// the GC roots. All the arenas are allocated in one sufficiently large memory
+// range to avoid multiple calls to mremapped/mprotected syscalls.
+class GcVisitedArenaPool final : public ArenaPool {
+ public:
+#if defined(__LP64__)
+  // Use a size in multiples of 1GB as that can utilize the optimized mremap
+  // page-table move.
+  static constexpr size_t kLinearAllocPoolSize = 1 * GB;
+  static constexpr size_t kLow4GBLinearAllocPoolSize = 32 * MB;
+#else
+  static constexpr size_t kLinearAllocPoolSize = 32 * MB;
+#endif
+
+  explicit GcVisitedArenaPool(bool low_4gb = false,
+                              bool is_zygote = false,
+                              const char* name = "LinearAlloc");
+  virtual ~GcVisitedArenaPool();
+  Arena* AllocArena(size_t size) override;
+  void FreeArenaChain(Arena* first) override;
+  size_t GetBytesAllocated() const override;
+  void ReclaimMemory() override {}
+  void LockReclaimMemory() override {}
+  void TrimMaps() override {}
+
+  bool Contains(void* ptr) {
+    std::lock_guard<std::mutex> lock(lock_);
+    for (auto& map : maps_) {
+      if (map.HasAddress(ptr)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  template <typename PageVisitor>
+  void VisitRoots(PageVisitor& visitor) REQUIRES_SHARED(Locks::mutator_lock_) {
+    std::lock_guard<std::mutex> lock(lock_);
+    for (auto& arena : allocated_arenas_) {
+      arena.VisitRoots(visitor);
+    }
+  }
+
+  template <typename Callback>
+  void ForEachAllocatedArena(Callback cb) REQUIRES_SHARED(Locks::mutator_lock_) {
+    std::lock_guard<std::mutex> lock(lock_);
+    for (auto& arena : allocated_arenas_) {
+      cb(arena);
+    }
+  }
+
+  // Called in Heap::PreZygoteFork(). All allocations after this are done in
+  // arena-pool which is visited by userfaultfd.
+  void SetupPostZygoteMode() {
+    std::lock_guard<std::mutex> lock(lock_);
+    DCHECK(pre_zygote_fork_);
+    pre_zygote_fork_ = false;
+  }
+
+  // For userfaultfd GC to be able to acquire the lock to avoid concurrent
+  // release of arenas when it is visiting them.
+  std::mutex& GetLock() { return lock_; }
+
+  // Find the given arena in allocated_arenas_. The function is called with
+  // lock_ acquired.
+  bool FindAllocatedArena(const TrackedArena* arena) const NO_THREAD_SAFETY_ANALYSIS {
+    for (auto& allocated_arena : allocated_arenas_) {
+      if (arena == &allocated_arena) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void ClearArenasFreed() {
+    std::lock_guard<std::mutex> lock(lock_);
+    arenas_freed_ = false;
+  }
+
+  // The function is called with lock_ acquired.
+  bool AreArenasFreed() const NO_THREAD_SAFETY_ANALYSIS { return arenas_freed_; }
+
+ private:
+  void FreeRangeLocked(uint8_t* range_begin, size_t range_size) REQUIRES(lock_);
+  // Add a map (to be visited by userfaultfd) to the pool of at least min_size
+  // and return its address.
+  uint8_t* AddMap(size_t min_size) REQUIRES(lock_);
+  // Add a private anonymous map prior to zygote fork to the pool and return its
+  // address.
+  uint8_t* AddPreZygoteForkMap(size_t size) REQUIRES(lock_);
+
+  class Chunk {
+   public:
+    Chunk(uint8_t* addr, size_t size) : addr_(addr), size_(size) {}
+    uint8_t* addr_;
+    size_t size_;
+  };
+
+  class LessByChunkAddr {
+   public:
+    bool operator()(const Chunk* a, const Chunk* b) const {
+      return std::less<uint8_t*>{}(a->addr_, b->addr_);
+    }
+  };
+
+  class LessByChunkSize {
+   public:
+    // Since two chunks could have the same size, use addr when that happens.
+    bool operator()(const Chunk* a, const Chunk* b) const {
+      return a->size_ < b->size_ ||
+             (a->size_ == b->size_ && std::less<uint8_t*>{}(a->addr_, b->addr_));
+    }
+  };
+
+  class LessByArenaAddr {
+   public:
+    bool operator()(const TrackedArena& a, const TrackedArena& b) const {
+      return std::less<uint8_t*>{}(a.Begin(), b.Begin());
+    }
+  };
+
+  // Use a std::mutex here as Arenas are second-from-the-bottom when using MemMaps, and MemMap
+  // itself uses std::mutex scoped to within an allocate/free only.
+  mutable std::mutex lock_;
+  std::vector<MemMap> maps_ GUARDED_BY(lock_);
+  std::set<Chunk*, LessByChunkSize> best_fit_allocs_ GUARDED_BY(lock_);
+  std::set<Chunk*, LessByChunkAddr> free_chunks_ GUARDED_BY(lock_);
+  // Set of allocated arenas. It's required to be able to find the arena
+  // corresponding to a given address.
+  // TODO: consider using HashSet, which is more memory efficient.
+  std::set<TrackedArena, LessByArenaAddr> allocated_arenas_ GUARDED_BY(lock_);
+  // Number of bytes allocated so far.
+  size_t bytes_allocated_ GUARDED_BY(lock_);
+  const char* name_;
+  // Flag to indicate that some arenas have been freed. This flag is used as an
+  // optimization by GC to know if it needs to find if the arena being visited
+  // has been freed or not. The flag is cleared in the compaction pause and read
+  // when linear-alloc space is concurrently visited updated to update GC roots.
+  bool arenas_freed_ GUARDED_BY(lock_);
+  const bool low_4gb_;
+  // Set to true in zygote process so that all linear-alloc allocations are in
+  // private-anonymous mappings and not on userfaultfd visited pages. At
+  // first zygote fork, it's set to false, after which all allocations are done
+  // in userfaultfd visited space.
+  bool pre_zygote_fork_ GUARDED_BY(lock_);
+
+  DISALLOW_COPY_AND_ASSIGN(GcVisitedArenaPool);
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_BASE_GC_VISITED_ARENA_POOL_H_
diff --git a/runtime/base/locks.h b/runtime/base/locks.h
index 829adff8ee..c15e5dee71 100644
--- a/runtime/base/locks.h
+++ b/runtime/base/locks.h
@@ -68,12 +68,12 @@ enum LockLevel : uint8_t {
   // Can be held while GC related work is done, and thus must be above kMarkSweepMarkStackLock
   kThreadWaitLock,
   kCHALock,
-  kJitCodeCacheLock,
   kRosAllocGlobalLock,
   kRosAllocBracketLock,
   kRosAllocBulkFreeLock,
   kAllocSpaceLock,
   kTaggingLockLevel,
+  kJitCodeCacheLock,
   kTransactionLogLock,
   kCustomTlsLock,
   kJniFunctionTableLock,
diff --git a/runtime/base/mem_map_arena_pool.cc b/runtime/base/mem_map_arena_pool.cc
index ae7db45024..fc1a61e8c8 100644
--- a/runtime/base/mem_map_arena_pool.cc
+++ b/runtime/base/mem_map_arena_pool.cc
@@ -57,13 +57,24 @@ MemMap MemMapArena::Allocate(size_t size, bool low_4gb, const char* name) {
   // and we want to be able to use all memory that we actually allocate.
   size = RoundUp(size, kPageSize);
   std::string error_msg;
-  MemMap map = MemMap::MapAnonymous(name,
-                                    size,
-                                    PROT_READ | PROT_WRITE,
-                                    low_4gb,
-                                    &error_msg);
-  CHECK(map.IsValid()) << error_msg;
-  return map;
+  // TODO(b/278665389): remove this retry logic if the root cause is found.
+  constexpr int MAX_RETRY_CNT = 3;
+  int retry_cnt = 0;
+  while (true) {
+    MemMap map = MemMap::MapAnonymous(name, size, PROT_READ | PROT_WRITE, low_4gb, &error_msg);
+    if (map.IsValid()) {
+      if (retry_cnt > 0) {
+        LOG(WARNING) << "Succeed with retry(cnt=" << retry_cnt << ")";
+      }
+      return map;
+    } else {
+      if (retry_cnt == MAX_RETRY_CNT) {
+        CHECK(map.IsValid()) << error_msg << "(retried " << retry_cnt << " times)";
+      }
+    }
+    retry_cnt++;
+    LOG(ERROR) << error_msg << " but retry(cnt=" << retry_cnt << ")";
+  }
 }
 
 MemMapArena::~MemMapArena() {
diff --git a/runtime/base/message_queue_test.cc b/runtime/base/message_queue_test.cc
index 7a788a9dfc..09dbc3271d 100644
--- a/runtime/base/message_queue_test.cc
+++ b/runtime/base/message_queue_test.cc
@@ -20,10 +20,16 @@
 
 #include "common_runtime_test.h"
 #include "thread-current-inl.h"
+#include "runtime.h"
 
 namespace art {
 
-class MessageQueueTest : public CommonRuntimeTest {};
+class MessageQueueTest : public CommonRuntimeTest {
+ protected:
+  MessageQueueTest() {
+    this->use_boot_image_ = true;  // Make the Runtime creation cheaper.
+  }
+};
 
 namespace {
 
@@ -81,6 +87,8 @@ TEST_F(MessageQueueTest, TestTimeout) {
 }
 
 TEST_F(MessageQueueTest, TwoWayMessaging) {
+  CHECK(Runtime::Current() != nullptr);  // Runtime is needed by Mutex.
+
   TestMessageQueue queue1;
   TestMessageQueue queue2;
 
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 5709333756..728dc842c2 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -28,6 +28,7 @@
 #include "base/systrace.h"
 #include "base/time_utils.h"
 #include "base/value_object.h"
+#include "monitor.h"
 #include "mutex-inl.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread-inl.h"
@@ -59,18 +60,19 @@ struct DumpStackLastTimeTLSData : public art::TLSData {
 };
 
 #if ART_USE_FUTEXES
+// Compute a relative timespec as *result_ts = lhs - rhs.
+// Return false (and produce an invalid *result_ts) if lhs < rhs.
 static bool ComputeRelativeTimeSpec(timespec* result_ts, const timespec& lhs, const timespec& rhs) {
   const int32_t one_sec = 1000 * 1000 * 1000;  // one second in nanoseconds.
+  static_assert(std::is_signed<decltype(result_ts->tv_sec)>::value);  // Signed on Linux.
   result_ts->tv_sec = lhs.tv_sec - rhs.tv_sec;
   result_ts->tv_nsec = lhs.tv_nsec - rhs.tv_nsec;
   if (result_ts->tv_nsec < 0) {
     result_ts->tv_sec--;
     result_ts->tv_nsec += one_sec;
-  } else if (result_ts->tv_nsec > one_sec) {
-    result_ts->tv_sec++;
-    result_ts->tv_nsec -= one_sec;
   }
-  return result_ts->tv_sec < 0;
+  DCHECK(result_ts->tv_nsec >= 0 && result_ts->tv_nsec < one_sec);
+  return result_ts->tv_sec >= 0;
 }
 #endif
 
@@ -462,7 +464,10 @@ void Mutex::ExclusiveLock(Thread* self) {
       do {
         timespec timeout_ts;
         timeout_ts.tv_sec = 0;
-        timeout_ts.tv_nsec = Runtime::Current()->GetMonitorTimeoutNs();
+        // NB: Some tests use the mutex without the runtime.
+        timeout_ts.tv_nsec = Runtime::Current() != nullptr
+                                 ? Runtime::Current()->GetMonitorTimeoutNs()
+                                 : Monitor::kDefaultMonitorTimeoutMs;
         if (futex(state_and_contenders_.Address(), FUTEX_WAIT_PRIVATE, cur_state,
                   enable_monitor_timeout_ ? &timeout_ts : nullptr, nullptr, 0) != 0) {
           // We only went to sleep after incrementing and contenders and checking that the
@@ -512,6 +517,7 @@ void Mutex::DumpStack(Thread* self, uint64_t wait_start_ms, uint64_t try_times) {
     Locks::thread_list_lock_->ExclusiveLock(self);
     std::string owner_stack_dump;
     pid_t owner_tid = GetExclusiveOwnerTid();
+    CHECK(Runtime::Current() != nullptr);
     Thread *owner = Runtime::Current()->GetThreadList()->FindThreadByTid(owner_tid);
     if (owner != nullptr) {
       if (IsDumpFrequent(owner, try_times)) {
@@ -852,7 +858,7 @@ bool ReaderWriterMutex::ExclusiveLockWithTimeout(Thread* self, int64_t ms, int32
     timespec now_abs_ts;
     InitTimeSpec(true, CLOCK_MONOTONIC, 0, 0, &now_abs_ts);
     timespec rel_ts;
-    if (ComputeRelativeTimeSpec(&rel_ts, end_abs_ts, now_abs_ts)) {
+    if (!ComputeRelativeTimeSpec(&rel_ts, end_abs_ts, now_abs_ts)) {
       return false;  // Timed out.
     }
     ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
@@ -869,6 +875,7 @@ bool ReaderWriterMutex::ExclusiveLockWithTimeout(Thread* self, int64_t ms, int32
         // EAGAIN and EINTR both indicate a spurious failure,
         // recompute the relative time out from now and try again.
         // We don't use TEMP_FAILURE_RETRY so we can recompute rel_ts;
+        num_contenders_.fetch_sub(1);  // Unlikely to matter.
         PLOG(FATAL) << "timed futex wait failed for " << name_;
       }
     }
diff --git a/runtime/base/mutex_test.cc b/runtime/base/mutex_test.cc
index 7eba50b49c..f1b4e49f69 100644
--- a/runtime/base/mutex_test.cc
+++ b/runtime/base/mutex_test.cc
@@ -21,7 +21,12 @@
 
 namespace art {
 
-class MutexTest : public CommonRuntimeTest {};
+class MutexTest : public CommonRuntimeTest {
+ protected:
+  MutexTest() {
+    use_boot_image_ = true;  // Make the Runtime creation cheaper.
+  }
+};
 
 struct MutexTester {
   static void AssertDepth(Mutex& mu, uint32_t expected_depth) {
@@ -37,6 +42,9 @@ struct MutexTester {
 };
 
 TEST_F(MutexTest, LockUnlock) {
+  // TODO: Remove `Mutex` dependency on `Runtime` or at least make sure it works
+  // without a `Runtime` with reasonable defaults (and without dumping stack for timeout).
+  ASSERT_TRUE(Runtime::Current() != nullptr);
   Mutex mu("test mutex");
   MutexTester::AssertDepth(mu, 0U);
   mu.Lock(Thread::Current());
diff --git a/runtime/base/timing_logger.cc b/runtime/base/timing_logger.cc
index abf4f58b2c..c39b44e93d 100644
--- a/runtime/base/timing_logger.cc
+++ b/runtime/base/timing_logger.cc
@@ -33,8 +33,6 @@
 
 namespace art {
 
-constexpr size_t TimingLogger::kIndexNotFound;
-
 CumulativeLogger::CumulativeLogger(const std::string& name)
     : name_(name),
       lock_name_("CumulativeLoggerLock" + name),
diff --git a/runtime/base/timing_logger_test.cc b/runtime/base/timing_logger_test.cc
index 6f8d8cdf72..38ae9a5a57 100644
--- a/runtime/base/timing_logger_test.cc
+++ b/runtime/base/timing_logger_test.cc
@@ -16,11 +16,11 @@
 
 #include "timing_logger.h"
 
-#include "common_runtime_test.h"
+#include "base/common_art_test.h"
 
 namespace art {
 
-class TimingLoggerTest : public CommonRuntimeTest {};
+class TimingLoggerTest : public CommonArtTest {};
 
 // TODO: Negative test cases (improper pairing of EndSplit, etc.)
 