A custom 'runs-of-slots' memory allocator.

Bug: 9986565
Change-Id: I0eb73b9458752113f519483616536d219d5f798b
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
new file mode 100644
index 0000000..5dda80c
--- /dev/null
+++ b/runtime/gc/allocator/rosalloc.h
@@ -0,0 +1,480 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_ALLOCATOR_ROSALLOC_H_
+#define ART_RUNTIME_GC_ALLOCATOR_ROSALLOC_H_
+
+#include <set>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string>
+#include <sys/mman.h>
+#include <vector>
+
+#include "base/mutex.h"
+#include "base/logging.h"
+#include "globals.h"
+#include "utils.h"
+
+// Boilerplate to use hash_map/hash_set on both the host and the device.
+#ifdef HAVE_ANDROID_OS
+#include <hash_map>
+#include <hash_set>
+using std::hash_map;
+using std::hash_set;
+#else  // HAVE_ANDROID_OS
+#ifdef __DEPRECATED
+#define ROSALLOC_OLD__DEPRECATED __DEPRECATED
+#undef __DEPRECATED
+#endif
+#include <ext/hash_map>
+#include <ext/hash_set>
+#ifdef ROSALLOC_OLD__DEPRECATED
+#define __DEPRECATED ROSALLOC_OLD__DEPRECATED
+#undef ROSALLOC_OLD__DEPRECATED
+#endif
+using __gnu_cxx::hash_map;
+using __gnu_cxx::hash_set;
+#endif  // HAVE_ANDROID_OS
+
+namespace art {
+namespace gc {
+namespace allocator {
+
+// A Runs-of-slots memory allocator.
+class RosAlloc {
+ private:
+  // Represents a run of free pages.
+  class FreePageRun {
+   public:
+    byte magic_num_;  // The magic number used for debugging only.
+
+    bool IsFree() const {
+      if (kIsDebugBuild) {
+        return magic_num_ == kMagicNumFree;
+      }
+      return true;
+    }
+    size_t ByteSize(RosAlloc* rosalloc) const EXCLUSIVE_LOCKS_REQUIRED(rosalloc->lock_) {
+      const byte* fpr_base = reinterpret_cast<const byte*>(this);
+      size_t pm_idx = rosalloc->ToPageMapIndex(fpr_base);
+      size_t byte_size = rosalloc->free_page_run_size_map_[pm_idx];
+      DCHECK_GE(byte_size, static_cast<size_t>(0));
+      DCHECK_EQ(byte_size % kPageSize, static_cast<size_t>(0));
+      return byte_size;
+    }
+    void SetByteSize(RosAlloc* rosalloc, size_t byte_size)
+        EXCLUSIVE_LOCKS_REQUIRED(rosalloc->lock_) {
+      DCHECK_EQ(byte_size % kPageSize, static_cast<size_t>(0));
+      byte* fpr_base = reinterpret_cast<byte*>(this);
+      size_t pm_idx = rosalloc->ToPageMapIndex(fpr_base);
+      rosalloc->free_page_run_size_map_[pm_idx] = byte_size;
+    }
+    void* Begin() {
+      return reinterpret_cast<void*>(this);
+    }
+    void* End(RosAlloc* rosalloc) EXCLUSIVE_LOCKS_REQUIRED(rosalloc->lock_) {
+      byte* fpr_base = reinterpret_cast<byte*>(this);
+      byte* end = fpr_base + ByteSize(rosalloc);
+      return end;
+    }
+    void ReleasePages(RosAlloc* rosalloc) EXCLUSIVE_LOCKS_REQUIRED(rosalloc->lock_) {
+      size_t byte_size = ByteSize(rosalloc);
+      DCHECK_EQ(byte_size % kPageSize, static_cast<size_t>(0));
+      if (kIsDebugBuild) {
+        // Exclude the first page that stores the magic number.
+        DCHECK_GE(byte_size, static_cast<size_t>(kPageSize));
+        byte_size -= kPageSize;
+        if (byte_size > 0) {
+          madvise(reinterpret_cast<byte*>(this) + kPageSize, byte_size, MADV_DONTNEED);
+        }
+      } else {
+        madvise(this, byte_size, MADV_DONTNEED);
+      }
+    }
+  };
+
+  // Represents a run of memory slots of the same size.
+  //
+  // A run's memory layout:
+  //
+  // +-------------------+
+  // | magic_num         |
+  // +-------------------+
+  // | size_bracket_idx  |
+  // +-------------------+
+  // | is_thread_local   |
+  // +-------------------+
+  // | to_be_bulk_freed  |
+  // +-------------------+
+  // | top_slot_idx      |
+  // +-------------------+
+  // |                   |
+  // | alloc bit map     |
+  // |                   |
+  // +-------------------+
+  // |                   |
+  // | bulk free bit map |
+  // |                   |
+  // +-------------------+
+  // |                   |
+  // | thread-local free |
+  // | bit map           |
+  // |                   |
+  // +-------------------+
+  // | padding due to    |
+  // | alignment         |
+  // +-------------------+
+  // | slot 0            |
+  // +-------------------+
+  // | slot 1            |
+  // +-------------------+
+  // | slot 2            |
+  // +-------------------+
+  // ...
+  // +-------------------+
+  // | last slot         |
+  // +-------------------+
+  //
+  class Run {
+   public:
+    byte magic_num_;            // The magic number used for debugging.
+    byte size_bracket_idx_;     // The index of the size bracket of this run.
+    byte is_thread_local_;      // True if this run is used as a thread-local run.
+    byte to_be_bulk_freed_;     // Used within BulkFree() to flag a run that's involved with a bulk free.
+    uint32_t top_slot_idx_;     // The top slot index when this run is in bump index mode.
+    uint32_t alloc_bit_map_[0];  // The bit map that indicates whether each slot is in use.
+
+    // bulk_free_bit_map_[] : The bit map that is used for GC to
+    // temporarily mark the slots to free without using a lock. After
+    // all the slots to be freed in a run are marked, all those slots
+    // get freed in bulk with one locking per run, as opposed to one
+    // locking per slot to minimize the lock contention. This is used
+    // within BulkFree().
+
+    // thread_local_free_bit_map_[] : The bit map that is used for GC
+    // to temporarily mark the slots to free in a thread-local run
+    // without using a lock (without synchronizing the thread that
+    // owns the thread-local run.) When the thread-local run becomes
+    // full, the thread will check this bit map and update the
+    // allocation bit map of the run (that is, the slots get freed.)
+
+    // Returns the byte size of the header except for the bit maps.
+    static size_t fixed_header_size() {
+      Run temp;
+      size_t size = reinterpret_cast<byte*>(&temp.alloc_bit_map_) - reinterpret_cast<byte*>(&temp);
+      DCHECK_EQ(size, static_cast<size_t>(8));
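+      // The 8 bytes are the four 1-byte fields (magic_num_, size_bracket_idx_,
+      // is_thread_local_, to_be_bulk_freed_) followed by the 4-byte top_slot_idx_.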
+      return size;
+    }
+    // Returns the base address of the bulk free bit map.
+    uint32_t* bulk_free_bit_map() {
+      return reinterpret_cast<uint32_t*>(reinterpret_cast<byte*>(this) + bulkFreeBitMapOffsets[size_bracket_idx_]);
+    }
+    // Returns the base address of the thread local free bit map.
+    uint32_t* thread_local_free_bit_map() {
+      return reinterpret_cast<uint32_t*>(reinterpret_cast<byte*>(this) + threadLocalFreeBitMapOffsets[size_bracket_idx_]);
+    }
+    void* End() {
+      return reinterpret_cast<byte*>(this) + kPageSize * numOfPages[size_bracket_idx_];
+    }
+    // Frees the slots marked in the thread-local free bit map by
+    // clearing them in the allocation bit map. Used when a
+    // thread-local run becomes full.
+    bool MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_free_after_out);
+    // Frees the slots marked in the bulk free bit map by clearing
+    // them in the allocation bit map. Used in a bulk free.
+    void MergeBulkFreeBitMapIntoAllocBitMap();
+    // Unions the slots to be freed in the bulk free bit map into the
+    // thread-local free bit map. In a bulk free, as a two-step
+    // process, the GC first records all the slots to free in a run in
+    // the bulk free bit map, where it can write without a lock, and
+    // later acquires a lock once per run to union those bits into the
+    // thread-local free bit map.
+    void UnionBulkFreeBitMapToThreadLocalFreeBitMap();
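+    // For illustration, a bulk free of pointers that belong to this run
+    // proceeds roughly in two steps: each pointer is first marked via
+    // MarkBulkFreeBitMap() without holding a lock, and then, with the
+    // per-run lock held once, the marked bits are merged back via
+    // MergeBulkFreeBitMapIntoAllocBitMap() (or unioned into the
+    // thread-local free bit map for a thread-local run).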
+    // Allocates a slot in a run.
+    void* AllocSlot();
+    // Frees a slot in a run. This is used in a non-bulk free.
+    void FreeSlot(void* ptr);
+    // Marks the slots to free in the bulk free bit map.
+    void MarkBulkFreeBitMap(void* ptr);
+    // Marks the slots to free in the thread-local free bit map.
+    void MarkThreadLocalFreeBitMap(void* ptr);
+    // Returns true if all the slots in the run are not in use.
+    bool IsAllFree();
+    // Returns true if all the slots in the run are in use.
+    bool IsFull();
+    // Clears all the bit maps.
+    void ClearBitMaps();
+    // Iterates over all the slots and applies the given function.
+    void InspectAllSlots(void (*handler)(void* start, void* end, size_t used_bytes, void* callback_arg), void* arg);
+    // Dumps the run metadata for debugging.
+    void Dump();
+
+   private:
+    // The common part of MarkBulkFreeBitMap() and MarkThreadLocalFreeBitMap().
+    void MarkFreeBitMapShared(void* ptr, uint32_t* free_bit_map_base, const char* caller_name);
+  };
+
+  // The magic number for a run.
+  static const byte kMagicNum = 42;
+  // The magic number for free pages.
+  static const byte kMagicNumFree = 43;
+  // The number of size brackets. Sync this with the length of Thread::rosalloc_runs_.
+  static const size_t kNumOfSizeBrackets = 34;
+  // The number of smaller size brackets that are 16 bytes apart.
+  static const size_t kNumOfQuantumSizeBrackets = 32;
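+  // With the bracket scheme used by BracketSizeToIndex() and
+  // RoundToBracketSize() below, brackets 0..31 hold the quantum sizes
+  // 16, 32, ..., 512 bytes (16 bytes apart), bracket 32 holds 1 KB, and
+  // bracket 33 holds 2 KB.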
+  // The sizes (the slot sizes, in bytes) of the size brackets.
+  static size_t bracketSizes[kNumOfSizeBrackets];
+  // The numbers of pages that are used for runs for each size bracket.
+  static size_t numOfPages[kNumOfSizeBrackets];
+  // The numbers of slots of the runs for each size bracket.
+  static size_t numOfSlots[kNumOfSizeBrackets];
+  // The header sizes in bytes of the runs for each size bracket.
+  static size_t headerSizes[kNumOfSizeBrackets];
+  // The byte offsets of the bulk free bit maps of the runs for each size bracket.
+  static size_t bulkFreeBitMapOffsets[kNumOfSizeBrackets];
+  // The byte offsets of the thread-local free bit maps of the runs for each size bracket.
+  static size_t threadLocalFreeBitMapOffsets[kNumOfSizeBrackets];
+
+  // Initializes the run specs (the above arrays).
+  static void Initialize();
+  static bool initialized_;
+
+  // Returns the bracket size in bytes for the given index.
+  static size_t IndexToBracketSize(size_t idx) {
+    DCHECK(idx < kNumOfSizeBrackets);
+    return bracketSizes[idx];
+  }
+  // Returns the index of the size bracket from the bracket size.
+  static size_t BracketSizeToIndex(size_t size) {
+    DCHECK(16 <= size && ((size < 1 * KB && size % 16 == 0) || size == 1 * KB || size == 2 * KB));
+    size_t idx;
+    if (UNLIKELY(size == 1 * KB)) {
+      idx = kNumOfSizeBrackets - 2;
+    } else if (UNLIKELY(size == 2 * KB)) {
+      idx = kNumOfSizeBrackets - 1;
+    } else {
+      DCHECK(size < 1 * KB);
+      DCHECK_EQ(size % 16, static_cast<size_t>(0));
+      idx = size / 16 - 1;
+    }
+    DCHECK(bracketSizes[idx] == size);
+    return idx;
+  }
+  // Rounds up the size to the nearest bracket size.
+  static size_t RoundToBracketSize(size_t size) {
+    DCHECK(size <= kLargeSizeThreshold);
+    if (LIKELY(size <= 512)) {
+      return RoundUp(size, 16);
+    } else if (512 < size && size <= 1 * KB) {
+      return 1 * KB;
+    } else {
+      DCHECK(1 * KB < size && size <= 2 * KB);
+      return 2 * KB;
+    }
+  }
+  // Returns the size bracket index from the byte size with rounding.
+  static size_t SizeToIndex(size_t size) {
+    DCHECK(size <= kLargeSizeThreshold);
+    if (LIKELY(size <= 512)) {
+      return RoundUp(size, 16) / 16 - 1;
+    } else if (512 < size && size <= 1 * KB) {
+      return kNumOfSizeBrackets - 2;
+    } else {
+      DCHECK(1 * KB < size && size <= 2 * KB);
+      return kNumOfSizeBrackets - 1;
+    }
+  }
+  // A combination of SizeToIndex() and RoundToBracketSize().
+  static size_t SizeToIndexAndBracketSize(size_t size, size_t* bracket_size_out) {
+    DCHECK(size <= kLargeSizeThreshold);
+    if (LIKELY(size <= 512)) {
+      size_t bracket_size = RoundUp(size, 16);
+      *bracket_size_out = bracket_size;
+      size_t idx = bracket_size / 16 - 1;
+      DCHECK_EQ(bracket_size, IndexToBracketSize(idx));
+      return idx;
+    } else if (512 < size && size <= 1 * KB) {
+      size_t bracket_size = 1024;
+      *bracket_size_out = bracket_size;
+      size_t idx = kNumOfSizeBrackets - 2;
+      DCHECK_EQ(bracket_size, IndexToBracketSize(idx));
+      return idx;
+    } else {
+      DCHECK(1 * KB < size && size <= 2 * KB);
+      size_t bracket_size = 2048;
+      *bracket_size_out = bracket_size;
+      size_t idx = kNumOfSizeBrackets - 1;
+      DCHECK_EQ(bracket_size, IndexToBracketSize(idx));
+      return idx;
+    }
+  }
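+  // For example, a request of 100 bytes rounds up to the 112-byte bracket
+  // (index 6), and a request of 600 bytes rounds up to the 1 KB bracket
+  // (index kNumOfSizeBrackets - 2).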
+  // Returns the page map index from an address. Requires that the
+  // address is page size aligned.
+  size_t ToPageMapIndex(const void* addr) const {
+    DCHECK(base_ <= addr && addr < base_ + capacity_);
+    size_t byte_offset = reinterpret_cast<const byte*>(addr) - base_;
+    DCHECK_EQ(byte_offset % static_cast<size_t>(kPageSize), static_cast<size_t>(0));
+    return byte_offset / kPageSize;
+  }
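+  // For example, if addr == base_ + 3 * kPageSize, this returns 3.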
+  // Returns the page map index from an address, rounding down.
+  size_t RoundDownToPageMapIndex(void* addr) {
+    DCHECK(base_ <= addr && addr < reinterpret_cast<byte*>(base_) + capacity_);
+    return (reinterpret_cast<uintptr_t>(addr) - reinterpret_cast<uintptr_t>(base_)) / kPageSize;
+  }
+
+  // A memory allocation request larger than this size is treated as a large object and allocated
+  // at page granularity.
+  static const size_t kLargeSizeThreshold = 2048;
+
+  // We use thread-local runs for the size brackets whose indexes
+  // are less than or equal to this index. We use shared (current)
+  // runs for the rest.
+  static const size_t kMaxThreadLocalSizeBracketIdx = 10;
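+  // With the 16-byte quantum brackets above, index 10 corresponds to the
+  // 176-byte bracket, so allocations of up to 176 bytes are served from
+  // thread-local runs.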
+
+  struct hash_run {
+    size_t operator()(const RosAlloc::Run* r) const {
+      return reinterpret_cast<size_t>(r);
+    }
+  };
+
+  struct eq_run {
+    bool operator()(const RosAlloc::Run* r1, const RosAlloc::Run* r2) const {
+      return r1 == r2;
+    }
+  };
+
+  // The base address of the memory region that's managed by this allocator.
+  byte* base_;
+
+  // The footprint in bytes of the currently allocated portion of the
+  // memory region.
+  size_t footprint_;
+
+  // The maximum footprint. The address, base_ + capacity_, indicates
+  // the end of the memory region that's managed by this allocator.
+  size_t capacity_;
+
+  // The run sets that hold the runs whose slots are not all
+  // full. non_full_runs_[i] is guarded by size_bracket_locks_[i].
+  std::set<Run*> non_full_runs_[kNumOfSizeBrackets];
+  // The run sets that hold the runs whose slots are all full. This is
+  // debug only. full_runs_[i] is guarded by size_bracket_locks_[i].
+  hash_set<Run*, hash_run, eq_run> full_runs_[kNumOfSizeBrackets];
+  // The set of free pages.
+  std::set<FreePageRun*> free_page_runs_ GUARDED_BY(lock_);
+  // The free page run whose end address is the end of the memory
+  // region that's managed by this allocator, if any.
+  FreePageRun* last_free_page_run_;
+  // The current runs where allocations are first attempted for the
+  // size brackets that do not use thread-local runs.
+  // current_runs_[i] is guarded by size_bracket_locks_[i].
+  Run* current_runs_[kNumOfSizeBrackets];
+  // The mutexes, one per size bracket.
+  Mutex* size_bracket_locks_[kNumOfSizeBrackets];
+  // The types of page map entries.
+  enum {
+    kPageMapEmpty           = 0,  // Not allocated.
+    kPageMapRun             = 1,  // The beginning of a run.
+    kPageMapRunPart         = 2,  // The non-beginning part of a run.
+    kPageMapLargeObject     = 3,  // The beginning of a large object.
+    kPageMapLargeObjectPart = 4,  // The non-beginning part of a large object.
+  };
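+  // For example, a run that spans three pages is recorded in the page map
+  // as { kPageMapRun, kPageMapRunPart, kPageMapRunPart }.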
+  // The table that indicates what pages are currently used for.
+  std::vector<byte> page_map_ GUARDED_BY(lock_);
+  // The table that indicates the sizes of free page runs. The sizes
+  // are stored here, rather than in the free page headers, so that
+  // the backing pages can be released.
+  std::vector<size_t> free_page_run_size_map_ GUARDED_BY(lock_);
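+  // For example, a free page run that begins at page index 5 and spans
+  // three pages has free_page_run_size_map_[5] == 3 * kPageSize.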
+  // The global lock. Used to guard the page map, the free page set,
+  // and the footprint.
+  Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  // The reader-writer lock to allow one bulk free at a time while
+  // allowing multiple individual frees at the same time.
+  ReaderWriterMutex bulk_free_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+
+  // The base address of the memory region that's managed by this allocator.
+  byte* Begin() { return base_; }
+  // The end address of the memory region that's managed by this allocator.
+  byte* End() { return base_ + capacity_; }
+
+  // Page-granularity alloc/free
+  void* AllocPages(Thread* self, size_t num_pages, byte page_map_type)
+      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void FreePages(Thread* self, void* ptr) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Allocate/free a run slot.
+  void* AllocFromRun(Thread* self, size_t size, size_t* bytes_allocated)
+      LOCKS_EXCLUDED(lock_);
+  void FreeFromRun(Thread* self, void* ptr, Run* run)
+      LOCKS_EXCLUDED(lock_);
+
+  // Used to acquire a new/reused run for a size bracket. Used when a
+  // thread-local or current run gets full.
+  Run* RefillRun(Thread* self, size_t idx) LOCKS_EXCLUDED(lock_);
+
+  // The internal implementation of a non-bulk Free().
+  void FreeInternal(Thread* self, void* ptr) LOCKS_EXCLUDED(lock_);
+
+ public:
+  RosAlloc(void* base, size_t capacity);
+  void* Alloc(Thread* self, size_t size, size_t* bytes_allocated)
+      LOCKS_EXCLUDED(lock_);
+  void Free(Thread* self, void* ptr)
+      LOCKS_EXCLUDED(bulk_free_lock_);
+  void BulkFree(Thread* self, void** ptrs, size_t num_ptrs)
+      LOCKS_EXCLUDED(bulk_free_lock_);
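+  // A minimal usage sketch (illustrative only; 'self' is the calling Thread
+  // and 'rosalloc' an instance of this allocator):
+  //   size_t bytes_allocated;
+  //   void* p = rosalloc->Alloc(self, 24, &bytes_allocated);  // 32-byte bracket
+  //   rosalloc->Free(self, p);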
+  // Returns the size of the allocated slot for a given allocated memory chunk.
+  size_t UsableSize(void* ptr);
+  // Returns the size of the allocated slot for a given size.
+  size_t UsableSize(size_t bytes) {
+    if (UNLIKELY(bytes > kLargeSizeThreshold)) {
+      return RoundUp(bytes, kPageSize);
+    } else {
+      return RoundToBracketSize(bytes);
+    }
+  }
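+  // For example, assuming 4 KB pages, UsableSize(3000) returns 4 KB while
+  // UsableSize(100) returns 112.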
+  // Try to reduce the current footprint by releasing the free page
+  // run at the end of the memory region, if any.
+  bool Trim();
+  // Iterates over all the memory slots and applies the given function.
+  void InspectAll(void (*handler)(void* start, void* end, size_t used_bytes, void* callback_arg),
+                  void* arg)
+      LOCKS_EXCLUDED(lock_);
+  // Returns the current footprint.
+  size_t Footprint() LOCKS_EXCLUDED(lock_);
+  // Returns the current capacity (the maximum footprint).
+  size_t FootprintLimit() LOCKS_EXCLUDED(lock_);
+  // Updates the current capacity.
+  void SetFootprintLimit(size_t bytes) LOCKS_EXCLUDED(lock_);
+  // Releases the thread-local runs assigned to the given thread back to the common set of runs.
+  void RevokeThreadLocalRuns(Thread* thread);
+  // Releases the thread-local runs assigned to all the threads back to the common set of runs.
+  void RevokeAllThreadLocalRuns() LOCKS_EXCLUDED(Locks::thread_list_lock_);
+  // Dumps the page map for debugging.
+  void DumpPageMap(Thread* self);
+
+  // Callbacks for InspectAll that will count the number of bytes
+  // allocated and objects allocated, respectively.
+  static void BytesAllocatedCallback(void* start, void* end, size_t used_bytes, void* arg);
+  static void ObjectsAllocatedCallback(void* start, void* end, size_t used_bytes, void* arg);
+};
+
+}  // namespace allocator
+}  // namespace gc
+}  // namespace art
+
+#endif  // ART_RUNTIME_GC_ALLOCATOR_ROSALLOC_H_