Reduce lock usage in RosAlloc::BulkFree().

Read rosalloc page map entries without a lock.

Disabled for now.

This change speeds up Ritz MemAllocTest by ~25% on host and reduces
the GC sweep time, but it somehow slows the test down by ~5% on N4,
which is why it is disabled for now. TODO: investigate the N4
slowdown further.
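
For reference, the idea can be sketched outside of ART as follows. This is a
minimal standalone illustration, not code from this patch: the names
(PageType, BulkFreeSketch, ClassifyForFree) are made up, and the real RosAlloc
code keeps the locked path and only gates the lock-free read behind
kReadPageMapEntryWithoutLockInBulkFree. The lock-free branch is only valid
under the assumption stated in the new comment: an entry for a chunk that is
currently being freed cannot change underneath the reader.

  // Standalone sketch of reading a page-map entry with or without the lock.
  #include <cstdint>
  #include <cstdio>
  #include <mutex>
  #include <vector>

  enum PageType : uint8_t { kEmpty = 0, kRun = 1, kLargeObject = 2 };

  class BulkFreeSketch {
   public:
    explicit BulkFreeSketch(size_t num_pages) : page_map_(num_pages, kEmpty) {}

    void SetPage(size_t idx, PageType type) {
      std::lock_guard<std::mutex> lg(lock_);
      page_map_[idx] = type;
    }

    // If read_without_lock is true, the entry is read outside lock_,
    // relying on the caller's guarantee that a chunk being freed keeps
    // its page-map entry stable for the duration of the call.
    PageType ClassifyForFree(size_t page_idx, bool read_without_lock) {
      if (read_without_lock) {
        return static_cast<PageType>(page_map_[page_idx]);  // racy-by-design read
      }
      std::lock_guard<std::mutex> lg(lock_);
      return static_cast<PageType>(page_map_[page_idx]);
    }

   private:
    std::mutex lock_;
    std::vector<uint8_t> page_map_;  // one byte of type info per page
  };

  int main() {
    BulkFreeSketch s(4);
    s.SetPage(1, kRun);
    std::printf("locked read: %d, lock-free read: %d\n",
                s.ClassifyForFree(1, /*read_without_lock=*/false),
                s.ClassifyForFree(1, /*read_without_lock=*/true));
    return 0;
  }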

Bug: 8262791
Bug: 11790317
Change-Id: I936bbee9cfbd389e70d6343503bf0923865d2a2c
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index d02b851..e86bee6 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -40,15 +40,17 @@
 size_t RosAlloc::threadLocalFreeBitMapOffsets[kNumOfSizeBrackets];
 bool RosAlloc::initialized_ = false;
 
-RosAlloc::RosAlloc(void* base, size_t capacity,
+RosAlloc::RosAlloc(void* base, size_t capacity, size_t max_capacity,
                    PageReleaseMode page_release_mode, size_t page_release_size_threshold)
     : base_(reinterpret_cast<byte*>(base)), footprint_(capacity),
-      capacity_(capacity),
+      capacity_(capacity), max_capacity_(max_capacity),
       lock_("rosalloc global lock", kRosAllocGlobalLock),
       bulk_free_lock_("rosalloc bulk free lock", kRosAllocBulkFreeLock),
       page_release_mode_(page_release_mode),
       page_release_size_threshold_(page_release_size_threshold) {
   DCHECK(RoundUp(capacity, kPageSize) == capacity);
+  DCHECK(RoundUp(max_capacity, kPageSize) == max_capacity);
+  CHECK_LE(capacity, max_capacity);
   CHECK(IsAligned<kPageSize>(page_release_size_threshold_));
   if (!initialized_) {
     Initialize();
@@ -56,16 +58,24 @@
   VLOG(heap) << "RosAlloc base="
              << std::hex << (intptr_t)base_ << ", end="
              << std::hex << (intptr_t)(base_ + capacity_)
-             << ", capacity=" << std::dec << capacity_;
+             << ", capacity=" << std::dec << capacity_
+             << ", max_capacity=" << std::dec << max_capacity_;
   memset(current_runs_, 0, sizeof(current_runs_));
   for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
     size_bracket_locks_[i] = new Mutex("an rosalloc size bracket lock",
                                        kRosAllocBracketLock);
   }
-  size_t num_of_pages = capacity_ / kPageSize;
-  page_map_.resize(num_of_pages);
+  DCHECK_EQ(footprint_, capacity_);
+  size_t num_of_pages = footprint_ / kPageSize;
+  size_t max_num_of_pages = max_capacity_ / kPageSize;
+  std::string error_msg;
+  page_map_mem_map_.reset(MemMap::MapAnonymous("rosalloc page map", NULL, RoundUp(max_num_of_pages, kPageSize),
+                                               PROT_READ | PROT_WRITE, false, &error_msg));
+  CHECK(page_map_mem_map_.get() != NULL) << "Couldn't allocate the page map : " << error_msg;
+  page_map_ = page_map_mem_map_->Begin();
+  page_map_size_ = num_of_pages;
+  max_page_map_size_ = max_num_of_pages;
   free_page_run_size_map_.resize(num_of_pages);
-
   FreePageRun* free_pages = reinterpret_cast<FreePageRun*>(base_);
   if (kIsDebugBuild) {
     free_pages->magic_num_ = kMagicNumFree;
@@ -149,9 +159,10 @@
       DCHECK_EQ(increment % kPageSize, static_cast<size_t>(0));
       size_t new_footprint = footprint_ + increment;
       size_t new_num_of_pages = new_footprint / kPageSize;
-      DCHECK_LT(page_map_.size(), new_num_of_pages);
+      DCHECK_LT(page_map_size_, new_num_of_pages);
       DCHECK_LT(free_page_run_size_map_.size(), new_num_of_pages);
-      page_map_.resize(new_num_of_pages);
+      page_map_size_ = new_num_of_pages;
+      DCHECK_LE(page_map_size_, max_page_map_size_);
       free_page_run_size_map_.resize(new_num_of_pages);
       art_heap_rosalloc_morecore(this, increment);
       if (last_free_page_run_size > 0) {
@@ -265,7 +276,7 @@
 void RosAlloc::FreePages(Thread* self, void* ptr) {
   lock_.AssertHeld(self);
   size_t pm_idx = ToPageMapIndex(ptr);
-  DCHECK(pm_idx < page_map_.size());
+  DCHECK(pm_idx < page_map_size_);
   byte pm_type = page_map_[pm_idx];
   DCHECK(pm_type == kPageMapRun || pm_type == kPageMapLargeObject);
   byte pm_part_type;
@@ -287,7 +298,7 @@
   size_t num_pages = 1;
   page_map_[pm_idx] = kPageMapEmpty;
   size_t idx = pm_idx + 1;
-  size_t end = page_map_.size();
+  size_t end = page_map_size_;
   while (idx < end && page_map_[idx] == pm_part_type) {
     page_map_[idx] = kPageMapEmpty;
     num_pages++;
@@ -315,7 +326,7 @@
       LOG(INFO) << "RosAlloc::FreePages() : trying to coalesce a free page run 0x"
                 << std::hex << reinterpret_cast<uintptr_t>(fpr) << " [" << std::dec << pm_idx << "] -0x"
                 << std::hex << reinterpret_cast<uintptr_t>(fpr->End(this)) << " [" << std::dec
-                << (fpr->End(this) == End() ? page_map_.size() : ToPageMapIndex(fpr->End(this))) << "]";
+                << (fpr->End(this) == End() ? page_map_size_ : ToPageMapIndex(fpr->End(this))) << "]";
     }
     auto higher_it = free_page_runs_.upper_bound(fpr);
     if (higher_it != free_page_runs_.end()) {
@@ -326,7 +337,7 @@
           LOG(INFO) << "RosAlloc::FreePages() : trying to coalesce with a higher free page run 0x"
                     << std::hex << reinterpret_cast<uintptr_t>(h) << " [" << std::dec << ToPageMapIndex(h) << "] -0x"
                     << std::hex << reinterpret_cast<uintptr_t>(h->End(this)) << " [" << std::dec
-                    << (h->End(this) == End() ? page_map_.size() : ToPageMapIndex(h->End(this))) << "]";
+                    << (h->End(this) == End() ? page_map_size_ : ToPageMapIndex(h->End(this))) << "]";
         }
         if (fpr->End(this) == h->Begin()) {
           if (kTraceRosAlloc) {
@@ -364,7 +375,7 @@
           LOG(INFO) << "RosAlloc::FreePages() : trying to coalesce with a lower free page run 0x"
                     << std::hex << reinterpret_cast<uintptr_t>(l) << " [" << std::dec << ToPageMapIndex(l) << "] -0x"
                     << std::hex << reinterpret_cast<uintptr_t>(l->End(this)) << " [" << std::dec
-                    << (l->End(this) == End() ? page_map_.size() : ToPageMapIndex(l->End(this))) << "]";
+                    << (l->End(this) == End() ? page_map_size_ : ToPageMapIndex(l->End(this))) << "]";
         }
         if (l->End(this) == fpr->Begin()) {
           if (kTraceRosAlloc) {
@@ -450,7 +461,7 @@
   Run* run = NULL;
   {
     MutexLock mu(self, lock_);
-    DCHECK(pm_idx < page_map_.size());
+    DCHECK(pm_idx < page_map_size_);
     byte page_map_entry = page_map_[pm_idx];
     if (kTraceRosAlloc) {
       LOG(INFO) << "RosAlloc::FreeInternal() : " << std::hex << ptr << ", pm_idx=" << std::dec << pm_idx
@@ -1050,6 +1061,12 @@
   }
 }
 
+// If true, read the page map entries in BulkFree() without using the
+// lock for better performance, assuming that the existence of an
+// allocated chunk/pointer being freed in BulkFree() guarantees that
+// the page map entry won't change. Disabled for now.
+static constexpr bool kReadPageMapEntryWithoutLockInBulkFree = false;
+
 void RosAlloc::BulkFree(Thread* self, void** ptrs, size_t num_ptrs) {
   if (false) {
     // Used only to test Free() as GC uses only BulkFree().
@@ -1068,17 +1085,58 @@
 #else
   hash_set<Run*, hash_run, eq_run> runs;
 #endif
-  {
-    for (size_t i = 0; i < num_ptrs; i++) {
-      void* ptr = ptrs[i];
-      ptrs[i] = NULL;
-      DCHECK(base_ <= ptr && ptr < base_ + footprint_);
-      size_t pm_idx = RoundDownToPageMapIndex(ptr);
+  for (size_t i = 0; i < num_ptrs; i++) {
+    void* ptr = ptrs[i];
+    ptrs[i] = NULL;
+    DCHECK(base_ <= ptr && ptr < base_ + footprint_);
+    size_t pm_idx = RoundDownToPageMapIndex(ptr);
+    Run* run = NULL;
+    if (kReadPageMapEntryWithoutLockInBulkFree) {
+      // Read the page map entries without locking the lock.
+      byte page_map_entry = page_map_[pm_idx];
+      if (kTraceRosAlloc) {
+        LOG(INFO) << "RosAlloc::BulkFree() : " << std::hex << ptr << ", pm_idx="
+                  << std::dec << pm_idx
+                  << ", page_map_entry=" << static_cast<int>(page_map_entry);
+      }
+      if (LIKELY(page_map_entry == kPageMapRun)) {
+        run = reinterpret_cast<Run*>(base_ + pm_idx * kPageSize);
+        DCHECK(run->magic_num_ == kMagicNum);
+      } else if (LIKELY(page_map_entry == kPageMapRunPart)) {
+        size_t pi = pm_idx;
+        DCHECK(page_map_[pi] == kPageMapRun || page_map_[pi] == kPageMapRunPart);
+        // Find the beginning of the run.
+        while (page_map_[pi] != kPageMapRun) {
+          pi--;
+          DCHECK(pi < capacity_ / kPageSize);
+        }
+        DCHECK(page_map_[pi] == kPageMapRun);
+        run = reinterpret_cast<Run*>(base_ + pi * kPageSize);
+        DCHECK(run->magic_num_ == kMagicNum);
+      } else if (page_map_entry == kPageMapLargeObject) {
+        MutexLock mu(self, lock_);
+        FreePages(self, ptr);
+        continue;
+      } else {
+        LOG(FATAL) << "Unreachable - page map type: " << page_map_entry;
+      }
+      DCHECK(run != NULL);
+      // Set the bit in the bulk free bit map.
+      run->MarkBulkFreeBitMap(ptr);
+#ifdef HAVE_ANDROID_OS
+      if (!run->to_be_bulk_freed_) {
+        run->to_be_bulk_freed_ = true;
+        runs.push_back(run);
+      }
+#else
+      runs.insert(run);
+#endif
+    } else {
+      // Read the page map entries with a lock.
       bool free_from_run = false;
-      Run* run = NULL;
       {
         MutexLock mu(self, lock_);
-        DCHECK(pm_idx < page_map_.size());
+        DCHECK(pm_idx < page_map_size_);
         byte page_map_entry = page_map_[pm_idx];
         if (kTraceRosAlloc) {
           LOG(INFO) << "RosAlloc::BulkFree() : " << std::hex << ptr << ", pm_idx="
@@ -1243,7 +1301,7 @@
   std::ostringstream stream;
   stream << "RosAlloc PageMap: " << std::endl;
   lock_.AssertHeld(Thread::Current());
-  size_t end = page_map_.size();
+  size_t end = page_map_size_;
   FreePageRun* curr_fpr = NULL;
   size_t curr_fpr_size = 0;
   size_t remaining_curr_fpr_size = 0;
@@ -1340,7 +1398,7 @@
   case kPageMapLargeObject: {
     size_t num_pages = 1;
     size_t idx = pm_idx + 1;
-    size_t end = page_map_.size();
+    size_t end = page_map_size_;
     while (idx < end && page_map_[idx] == kPageMapLargeObjectPart) {
       num_pages++;
       idx++;
@@ -1390,9 +1448,21 @@
     size_t new_footprint = footprint_ - decrement;
     DCHECK_EQ(new_footprint % kPageSize, static_cast<size_t>(0));
     size_t new_num_of_pages = new_footprint / kPageSize;
-    DCHECK_GE(page_map_.size(), new_num_of_pages);
-    page_map_.resize(new_num_of_pages);
-    DCHECK_EQ(page_map_.size(), new_num_of_pages);
+    DCHECK_GE(page_map_size_, new_num_of_pages);
+    // Zero out the tail of the page map.
+    byte* zero_begin = page_map_ + new_num_of_pages;
+    byte* madvise_begin = AlignUp(zero_begin, kPageSize);
+    DCHECK_LE(madvise_begin, page_map_mem_map_->End());
+    size_t madvise_size = page_map_mem_map_->End() - madvise_begin;
+    if (madvise_size > 0) {
+      DCHECK_ALIGNED(madvise_begin, kPageSize);
+      DCHECK_EQ(RoundUp(madvise_size, kPageSize), madvise_size);
+      CHECK_EQ(madvise(madvise_begin, madvise_size, MADV_DONTNEED), 0);
+    }
+    if (madvise_begin - zero_begin) {
+      memset(zero_begin, 0, madvise_begin - zero_begin);
+    }
+    page_map_size_ = new_num_of_pages;
     free_page_run_size_map_.resize(new_num_of_pages);
     DCHECK_EQ(free_page_run_size_map_.size(), new_num_of_pages);
     art_heap_rosalloc_morecore(this, -(static_cast<intptr_t>(decrement)));
@@ -1415,7 +1485,7 @@
     return;
   }
   MutexLock mu(Thread::Current(), lock_);
-  size_t pm_end = page_map_.size();
+  size_t pm_end = page_map_size_;
   size_t i = 0;
   while (i < pm_end) {
     byte pm = page_map_[i];
@@ -1508,6 +1578,7 @@
   DCHECK_EQ(RoundUp(new_capacity, kPageSize), new_capacity);
   // Only growing is supported here. But Trim() is supported.
   if (capacity_ < new_capacity) {
+    CHECK_LE(new_capacity, max_capacity_);
     capacity_ = new_capacity;
     VLOG(heap) << "new capacity=" << capacity_;
   }
@@ -1688,7 +1759,7 @@
   std::vector<Run*> runs;
   {
     MutexLock mu(self, lock_);
-    size_t pm_end = page_map_.size();
+    size_t pm_end = page_map_size_;
     size_t i = 0;
     while (i < pm_end) {
       byte pm = page_map_[i];
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index dd2bb5d..4b0dd79 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -27,6 +27,8 @@
 #include "base/mutex.h"
 #include "base/logging.h"
 #include "globals.h"
+#include "mem_map.h"
+#include "UniquePtr.h"
 #include "utils.h"
 
 // A boilerplate to use hash_map/hash_set both on host and device.
@@ -51,6 +53,7 @@
 #endif  // HAVE_ANDROID_OS
 
 namespace art {
+
 namespace gc {
 namespace allocator {
 
@@ -428,9 +431,13 @@
   size_t footprint_;
 
   // The maximum footprint. The address, base_ + capacity_, indicates
-  // the end of the memory region that's managed by this allocator.
+  // the end of the memory region that's currently managed by this allocator.
   size_t capacity_;
 
+  // The maximum capacity. The address, base_ + max_capacity_, indicates
+  // the end of the memory region that's ever managed by this allocator.
+  size_t max_capacity_;
+
   // The run sets that hold the runs whose slots are not all
   // full. non_full_runs_[i] is guarded by size_bracket_locks_[i].
   std::set<Run*> non_full_runs_[kNumOfSizeBrackets];
@@ -454,7 +461,11 @@
     kPageMapLargeObjectPart = 4,  // The non-beginning part of a large object.
   };
   // The table that indicates what pages are currently used for.
-  std::vector<byte> page_map_ GUARDED_BY(lock_);
+  byte* page_map_;  // No GUARDED_BY(lock_) for kReadPageMapEntryWithoutLockInBulkFree.
+  size_t page_map_size_;
+  size_t max_page_map_size_;
+  UniquePtr<MemMap> page_map_mem_map_;
+
   // The table that indicates the size of free page runs. These sizes
   // are stored here to avoid storing in the free page header and
   // release backing pages.
@@ -501,7 +512,7 @@
   void* AllocLargeObject(Thread* self, size_t size, size_t* bytes_allocated) LOCKS_EXCLUDED(lock_);
 
  public:
-  RosAlloc(void* base, size_t capacity,
+  RosAlloc(void* base, size_t capacity, size_t max_capacity,
            PageReleaseMode page_release_mode,
            size_t page_release_size_threshold = kDefaultPageReleaseSizeThreshold);
   void* Alloc(Thread* self, size_t size, size_t* bytes_allocated)
diff --git a/runtime/gc/space/dlmalloc_space.h b/runtime/gc/space/dlmalloc_space.h
index 4bf16ce..76c4489 100644
--- a/runtime/gc/space/dlmalloc_space.h
+++ b/runtime/gc/space/dlmalloc_space.h
@@ -133,7 +133,7 @@
       EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   void* CreateAllocator(void* base, size_t morecore_start, size_t initial_size,
-                        bool /*low_memory_mode*/) OVERRIDE {
+                        size_t /*maximum_size*/, bool /*low_memory_mode*/) OVERRIDE {
     return CreateMspace(base, morecore_start, initial_size);
   }
   static void* CreateMspace(void* base, size_t morecore_start, size_t initial_size);
diff --git a/runtime/gc/space/malloc_space.cc b/runtime/gc/space/malloc_space.cc
index ee31112..61d1071 100644
--- a/runtime/gc/space/malloc_space.cc
+++ b/runtime/gc/space/malloc_space.cc
@@ -193,7 +193,7 @@
   UniquePtr<MemMap> mem_map(GetMemMap()->RemapAtEnd(end_, alloc_space_name,
                                                     PROT_READ | PROT_WRITE, &error_msg));
   CHECK(mem_map.get() != nullptr) << error_msg;
-  void* allocator = CreateAllocator(end_, starting_size, initial_size, low_memory_mode);
+  void* allocator = CreateAllocator(end_, starting_size, initial_size, capacity, low_memory_mode);
   // Protect memory beyond the initial size.
   byte* end = mem_map->Begin() + starting_size;
   if (capacity - initial_size > 0) {
diff --git a/runtime/gc/space/malloc_space.h b/runtime/gc/space/malloc_space.h
index 8e34fd0..30c7c45 100644
--- a/runtime/gc/space/malloc_space.h
+++ b/runtime/gc/space/malloc_space.h
@@ -137,7 +137,7 @@
   // When true the low memory mode argument specifies that the heap wishes the created allocator to
   // be more aggressive in releasing unused pages.
   virtual void* CreateAllocator(void* base, size_t morecore_start, size_t initial_size,
-                                bool low_memory_mode) = 0;
+                                size_t maximum_size, bool low_memory_mode) = 0;
 
   void RegisterRecentFree(mirror::Object* ptr)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
diff --git a/runtime/gc/space/rosalloc_space.cc b/runtime/gc/space/rosalloc_space.cc
index fb621ea..b13ac3d 100644
--- a/runtime/gc/space/rosalloc_space.cc
+++ b/runtime/gc/space/rosalloc_space.cc
@@ -50,7 +50,7 @@
                                                bool low_memory_mode) {
   DCHECK(mem_map != nullptr);
   allocator::RosAlloc* rosalloc = CreateRosAlloc(mem_map->Begin(), starting_size, initial_size,
-                                                 low_memory_mode);
+                                                 capacity, low_memory_mode);
   if (rosalloc == NULL) {
     LOG(ERROR) << "Failed to initialize rosalloc for alloc space (" << name << ")";
     return NULL;
@@ -109,14 +109,14 @@
 }
 
 allocator::RosAlloc* RosAllocSpace::CreateRosAlloc(void* begin, size_t morecore_start, size_t initial_size,
-                                                   bool low_memory_mode) {
+                                                   size_t maximum_size, bool low_memory_mode) {
   // clear errno to allow PLOG on error
   errno = 0;
   // create rosalloc using our backing storage starting at begin and
   // with a footprint of morecore_start. When morecore_start bytes of
   // memory is exhaused morecore will be called.
   allocator::RosAlloc* rosalloc = new art::gc::allocator::RosAlloc(
-      begin, morecore_start,
+      begin, morecore_start, maximum_size,
       low_memory_mode ?
           art::gc::allocator::RosAlloc::kPageReleaseModeAll :
           art::gc::allocator::RosAlloc::kPageReleaseModeSizeAndEnd);
diff --git a/runtime/gc/space/rosalloc_space.h b/runtime/gc/space/rosalloc_space.h
index 5bc425d..9f756aa 100644
--- a/runtime/gc/space/rosalloc_space.h
+++ b/runtime/gc/space/rosalloc_space.h
@@ -114,11 +114,11 @@
                               size_t* usable_size);
 
   void* CreateAllocator(void* base, size_t morecore_start, size_t initial_size,
-                        bool low_memory_mode) OVERRIDE {
-    return CreateRosAlloc(base, morecore_start, initial_size, low_memory_mode);
+                        size_t maximum_size, bool low_memory_mode) OVERRIDE {
+    return CreateRosAlloc(base, morecore_start, initial_size, maximum_size, low_memory_mode);
   }
   static allocator::RosAlloc* CreateRosAlloc(void* base, size_t morecore_start, size_t initial_size,
-                                             bool low_memory_mode);
+                                             size_t maximum_size, bool low_memory_mode);
 
   void InspectAllRosAlloc(void (*callback)(void *start, void *end, size_t num_bytes, void* callback_arg),
                           void* arg)