Use free lists instead of bitmaps within rosalloc runs.

Speedups (CMS GC, Nexus 5):
BinaryTrees:  2008 -> 1694 ms (-16%)
MemAllocTest: 2303 -> 2076 ms (-10%)

TODO: Add assembly fast path code.
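
Roughly, the idea (an illustrative sketch only, not the actual rosalloc code;
names and details are simplified): each free slot now carries an intrusive
"next" pointer, so the allocation fast path becomes a constant-time list pop
instead of a scan of bitmap words for a clear bit.

  #include <cstddef>
  #include <cstdint>

  struct Slot { Slot* next; };

  // Old scheme (simplified): scan the allocation bitmap for a clear bit.
  void* BitmapAlloc(uint32_t* bitmap, size_t num_words,
                    uint8_t* slot_base, size_t bracket_size) {
    for (size_t i = 0; i < num_words; ++i) {
      uint32_t word = bitmap[i];
      if (word != 0xFFFFFFFFu) {
        size_t bit = __builtin_ctz(~word);   // Lowest clear bit = first free slot.
        bitmap[i] = word | (1u << bit);      // Mark the slot as allocated.
        return slot_base + (i * 32 + bit) * bracket_size;
      }
    }
    return nullptr;  // The run is full.
  }

  // New scheme (simplified): pop the head of a singly-linked free list threaded
  // through the free slots themselves; O(1), no word scanning.
  void* FreeListAlloc(Slot** head) {
    Slot* slot = *head;
    if (slot == nullptr) {
      return nullptr;  // The run is full.
    }
    *head = slot->next;
    slot->next = nullptr;
    return slot;
  }

The bulk and thread-local free lists additionally keep a tail pointer so a
whole list can be spliced into another in O(1) when merging, while the run's
own free list stays tailless to keep the allocation fast path minimal.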

Bug: 9986565

Change-Id: I9dd7cbfd8e1ae083a399e70abaf2064a959f24fa
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index a7f29af..87f1392 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -112,6 +112,198 @@
     DISALLOW_COPY_AND_ASSIGN(FreePageRun);
   };
 
+  // The slot header.
+  class Slot {
+   public:
+    Slot* Next() const {
+      return next_;
+    }
+    void SetNext(Slot* next) {
+      next_ = next;
+    }
+    // The slot right before this slot in terms of the address.
+    Slot* Left(size_t bracket_size) {
+      return reinterpret_cast<Slot*>(reinterpret_cast<uintptr_t>(this) - bracket_size);
+    }
+    void Clear() {
+      next_ = nullptr;
+    }
+
+   private:
+    Slot* next_;  // Next slot in the list.
+  };
+
+  // We use the tail (kUseTail == true) for the bulk and thread-local free lists so that merging
+  // free lists does not require traversing a list from the head to the tail.
+  // We don't use the tail (kUseTail == false) for the run's free list so that the allocation
+  // fast path does not have to maintain a tail pointer.
+  template<bool kUseTail = true>
+  class SlotFreeList {
+   public:
+    SlotFreeList() : head_(0U), tail_(0), size_(0) {}
+    Slot* Head() const {
+      return reinterpret_cast<Slot*>(head_);
+    }
+    Slot* Tail() const {
+      CHECK(kUseTail);
+      return reinterpret_cast<Slot*>(tail_);
+    }
+    size_t Size() const {
+      return size_;
+    }
+    // Removes a slot from the head of the free list and returns it (null if the list is empty).
+    Slot* Remove() {
+      Slot* slot;
+      if (kIsDebugBuild) {
+        Verify();
+      }
+      Slot** headp = reinterpret_cast<Slot**>(&head_);
+      Slot** tailp = kUseTail ? reinterpret_cast<Slot**>(&tail_) : nullptr;
+      Slot* old_head = *headp;
+      if (old_head == nullptr) {
+        // List was empty.
+        if (kUseTail) {
+          DCHECK(*tailp == nullptr);
+        }
+        return nullptr;
+      } else {
+        // List wasn't empty.
+        if (kUseTail) {
+          DCHECK(*tailp != nullptr);
+        }
+        Slot* old_head_next = old_head->Next();
+        slot = old_head;
+        *headp = old_head_next;
+        if (kUseTail && old_head_next == nullptr) {
+          // List becomes empty.
+          *tailp = nullptr;
+        }
+      }
+      slot->Clear();
+      --size_;
+      if (kIsDebugBuild) {
+        Verify();
+      }
+      return slot;
+    }
+    // Adds the given slot to the head of the free list; the slot's next pointer must be null.
+    void Add(Slot* slot) {
+      if (kIsDebugBuild) {
+        Verify();
+      }
+      DCHECK(slot != nullptr);
+      DCHECK(slot->Next() == nullptr);
+      Slot** headp = reinterpret_cast<Slot**>(&head_);
+      Slot** tailp = kUseTail ? reinterpret_cast<Slot**>(&tail_) : nullptr;
+      Slot* old_head = *headp;
+      if (old_head == nullptr) {
+        // List was empty.
+        if (kUseTail) {
+          DCHECK(*tailp == nullptr);
+        }
+        *headp = slot;
+        if (kUseTail) {
+          *tailp = slot;
+        }
+      } else {
+        // List wasn't empty.
+        if (kUseTail) {
+          DCHECK(*tailp != nullptr);
+        }
+        *headp = slot;
+        slot->SetNext(old_head);
+      }
+      ++size_;
+      if (kIsDebugBuild) {
+        Verify();
+      }
+    }
+    // Merge the given list into this list. Empty the given list.
+    // Deliberately support only a kUseTail == true SlotFreeList parameter because 1) we don't
+    // currently have a situation where we need a kUseTail == false SlotFreeList parameter, and 2)
+    // supporting the kUseTail == false parameter would require an O(n) linked list traversal to do
+    // the merge if 'this' SlotFreeList has kUseTail == false, which we'd like to avoid.
+    void Merge(SlotFreeList<true>* list) {
+      if (kIsDebugBuild) {
+        Verify();
+        CHECK(list != nullptr);
+        list->Verify();
+      }
+      if (list->Size() == 0) {
+        return;
+      }
+      Slot** headp = reinterpret_cast<Slot**>(&head_);
+      Slot** tailp = kUseTail ? reinterpret_cast<Slot**>(&tail_) : nullptr;
+      Slot* old_head = *headp;
+      if (old_head == nullptr) {
+        // List was empty.
+        *headp = list->Head();
+        if (kUseTail) {
+          *tailp = list->Tail();
+        }
+        size_ = list->Size();
+      } else {
+        // List wasn't empty.
+        DCHECK(list->Head() != nullptr);
+        *headp = list->Head();
+        DCHECK(list->Tail() != nullptr);
+        list->Tail()->SetNext(old_head);
+        // if kUseTail, no change to tailp.
+        size_ += list->Size();
+      }
+      list->Reset();
+      if (kIsDebugBuild) {
+        Verify();
+      }
+    }
+
+    void Reset() {
+      head_ = 0;
+      if (kUseTail) {
+        tail_ = 0;
+      }
+      size_ = 0;
+    }
+
+    void Verify() {
+      Slot* head = reinterpret_cast<Slot*>(head_);
+      Slot* tail = kUseTail ? reinterpret_cast<Slot*>(tail_) : nullptr;
+      if (size_ == 0) {
+        CHECK(head == nullptr);
+        if (kUseTail) {
+          CHECK(tail == nullptr);
+        }
+      } else {
+        CHECK(head != nullptr);
+        if (kUseTail) {
+          CHECK(tail != nullptr);
+        }
+        size_t count = 0;
+        for (Slot* slot = head; slot != nullptr; slot = slot->Next()) {
+          ++count;
+          if (kUseTail && slot->Next() == nullptr) {
+            CHECK_EQ(slot, tail);
+          }
+        }
+        CHECK_EQ(size_, count);
+      }
+    }
+
+   private:
+    // A pointer (Slot*) to the head of the list. Always 8 bytes so that we will have the same
+    // layout between 32 bit and 64 bit, which is not strictly necessary, but we do so for 1)
+    // uniformity, 2) we won't need to change this code if we move to a non-low 4G heap in the
+    // future, and 3) the space savings by using 32 bit fields in 32 bit would be lost in noise
+    // (won't open up enough space to cause an extra slot to be available).
+    uint64_t head_;
+    // A pointer (Slot*) to the tail of the list. Always 8 bytes so that we will have the same
+    // layout between 32 bit and 64 bit. The tail is stored to speed up merging of lists.
+    // Unused if kUseTail is false.
+    uint64_t tail_;
+    // The number of slots in the list. This is used to make it fast to check if a free list is all
+    // free without traversing the whole free list.
+    uint32_t size_;
+    uint32_t padding_ ATTRIBUTE_UNUSED;
+  };
+
   // Represents a run of memory slots of the same size.
   //
   // A run's memory layout:
@@ -125,19 +317,17 @@
   // +-------------------+
   // | to_be_bulk_freed  |
   // +-------------------+
-  // | top_bitmap_idx    |
-  // +-------------------+
   // |                   |
-  // | alloc bit map     |
+  // | free list         |
   // |                   |
   // +-------------------+
   // |                   |
-  // | bulk free bit map |
+  // | bulk free list    |
   // |                   |
   // +-------------------+
   // |                   |
   // | thread-local free |
-  // | bit map           |
+  // | list              |
   // |                   |
   // +-------------------+
   // | padding due to    |
@@ -160,94 +350,100 @@
     uint8_t size_bracket_idx_;          // The index of the size bracket of this run.
     uint8_t is_thread_local_;           // True if this run is used as a thread-local run.
     uint8_t to_be_bulk_freed_;          // Used within BulkFree() to flag a run that's involved with a bulk free.
-    uint32_t first_search_vec_idx_;  // The index of the first bitmap vector which may contain an available slot.
-    uint32_t alloc_bit_map_[0];      // The bit map that allocates if each slot is in use.
+    uint32_t padding_ ATTRIBUTE_UNUSED;
+    // Use a tailless free list for free_list_ so that the alloc fast path does not manage the tail.
+    SlotFreeList<false> free_list_;
+    SlotFreeList<true> bulk_free_list_;
+    SlotFreeList<true> thread_local_free_list_;
+    // Padding due to alignment
+    // Slot 0
+    // Slot 1
+    // ...
 
-    // bulk_free_bit_map_[] : The bit map that is used for GC to
-    // temporarily mark the slots to free without using a lock. After
-    // all the slots to be freed in a run are marked, all those slots
-    // get freed in bulk with one locking per run, as opposed to one
-    // locking per slot to minimize the lock contention. This is used
-    // within BulkFree().
-
-    // thread_local_free_bit_map_[] : The bit map that is used for GC
-    // to temporarily mark the slots to free in a thread-local run
-    // without using a lock (without synchronizing the thread that
-    // owns the thread-local run.) When the thread-local run becomes
-    // full, the thread will check this bit map and update the
-    // allocation bit map of the run (that is, the slots get freed.)
-
-    // Returns the byte size of the header except for the bit maps.
+    // Returns the byte size of the header.
     static size_t fixed_header_size() {
-      Run temp;
-      size_t size = reinterpret_cast<uint8_t*>(&temp.alloc_bit_map_) - reinterpret_cast<uint8_t*>(&temp);
-      DCHECK_EQ(size, static_cast<size_t>(8));
-      return size;
+      return sizeof(Run);
     }
-    // Returns the base address of the free bit map.
-    uint32_t* BulkFreeBitMap() {
-      return reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(this) + bulkFreeBitMapOffsets[size_bracket_idx_]);
+    Slot* FirstSlot() {
+      const uint8_t idx = size_bracket_idx_;
+      return reinterpret_cast<Slot*>(reinterpret_cast<uintptr_t>(this) + headerSizes[idx]);
     }
-    // Returns the base address of the thread local free bit map.
-    uint32_t* ThreadLocalFreeBitMap() {
-      return reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(this) + threadLocalFreeBitMapOffsets[size_bracket_idx_]);
+    Slot* LastSlot() {
+      const uint8_t idx = size_bracket_idx_;
+      const size_t bracket_size = bracketSizes[idx];
+      uintptr_t end = reinterpret_cast<uintptr_t>(End());
+      Slot* last_slot = reinterpret_cast<Slot*>(end - bracket_size);
+      DCHECK_LE(FirstSlot(), last_slot);
+      return last_slot;
+    }
+    SlotFreeList<false>* FreeList() {
+      return &free_list_;
+    }
+    SlotFreeList<true>* BulkFreeList() {
+      return &bulk_free_list_;
+    }
+    SlotFreeList<true>* ThreadLocalFreeList() {
+      return &thread_local_free_list_;
     }
     void* End() {
       return reinterpret_cast<uint8_t*>(this) + kPageSize * numOfPages[size_bracket_idx_];
     }
-    // Returns the number of bitmap words per run.
-    size_t NumberOfBitmapVectors() const {
-      return RoundUp(numOfSlots[size_bracket_idx_], 32) / 32;
-    }
     void SetIsThreadLocal(bool is_thread_local) {
       is_thread_local_  = is_thread_local ? 1 : 0;
     }
     bool IsThreadLocal() const {
       return is_thread_local_ != 0;
     }
-    // Frees slots in the allocation bit map with regard to the
-    // thread-local free bit map. Used when a thread-local run becomes
+    // Set up the free list for a new/empty run.
+    void InitFreeList() {
+      const uint8_t idx = size_bracket_idx_;
+      const size_t bracket_size = bracketSizes[idx];
+      Slot* first_slot = FirstSlot();
+      // Add backwards so the first slot is at the head of the list.
+      for (Slot* slot = LastSlot(); slot >= first_slot; slot = slot->Left(bracket_size)) {
+        free_list_.Add(slot);
+      }
+    }
+    // Merge the thread-local free list to the free list. Used when a thread-local run becomes
     // full.
-    bool MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_free_after_out);
-    // Frees slots in the allocation bit map with regard to the bulk
-    // free bit map. Used in a bulk free.
-    void MergeBulkFreeBitMapIntoAllocBitMap();
-    // Unions the slots to be freed in the free bit map into the
-    // thread-local free bit map. In a bulk free, as a two-step
-    // process, GC will first record all the slots to free in a run in
-    // the free bit map where it can write without a lock, and later
-    // acquire a lock once per run to union the bits of the free bit
-    // map to the thread-local free bit map.
-    void UnionBulkFreeBitMapToThreadLocalFreeBitMap();
+    bool MergeThreadLocalFreeListToFreeList(bool* is_all_free_after_out);
+    // Merge the bulk free list to the free list. Used in a bulk free.
+    void MergeBulkFreeListToFreeList();
+    // Merge the bulk free list to the thread local free list. In a bulk free, as a two-step
+    // process, GC will first record all the slots to free in a run in the bulk free list where it
+    // can write without a lock, and later acquire a lock once per run to merge the bulk free list
+    // to the thread-local free list.
+    void MergeBulkFreeListToThreadLocalFreeList();
     // Allocates a slot in a run.
-    void* AllocSlot();
+    ALWAYS_INLINE void* AllocSlot();
     // Frees a slot in a run. This is used in a non-bulk free.
     void FreeSlot(void* ptr);
-    // Marks the slots to free in the bulk free bit map. Returns the bracket size.
-    size_t MarkBulkFreeBitMap(void* ptr);
-    // Marks the slots to free in the thread-local free bit map.
-    void MarkThreadLocalFreeBitMap(void* ptr);
-    // Last word mask, all of the bits in the last word which aren't valid slots are set to
-    // optimize allocation path.
-    static uint32_t GetBitmapLastVectorMask(size_t num_slots, size_t num_vec);
+    // Add the given slot to the bulk free list. Returns the bracket size.
+    size_t AddToBulkFreeList(void* ptr);
+    // Add the given slot to the thread-local free list.
+    void AddToThreadLocalFreeList(void* ptr);
     // Returns true if all the slots in the run are not in use.
-    bool IsAllFree();
+    bool IsAllFree() const {
+      return free_list_.Size() == numOfSlots[size_bracket_idx_];
+    }
     // Returns the number of free slots.
-    size_t NumberOfFreeSlots();
+    size_t NumberOfFreeSlots() {
+      return free_list_.Size();
+    }
     // Returns true if all the slots in the run are in use.
     ALWAYS_INLINE bool IsFull();
-    // Returns true if the bulk free bit map is clean.
-    bool IsBulkFreeBitmapClean();
-    // Returns true if the thread local free bit map is clean.
-    bool IsThreadLocalFreeBitmapClean();
-    // Set the alloc_bit_map_ bits for slots that are past the end of the run.
-    void SetAllocBitMapBitsForInvalidSlots();
+    // Returns true if the bulk free list is empty.
+    bool IsBulkFreeListEmpty() const {
+      return bulk_free_list_.Size() == 0;
+    }
+    // Returns true if the thread local free list is empty.
+    bool IsThreadLocalFreeListEmpty() const {
+      return thread_local_free_list_.Size() == 0;
+    }
     // Zero the run's data.
     void ZeroData();
-    // Zero the run's header.
-    void ZeroHeader();
-    // Fill the alloc bitmap with 1s.
-    void FillAllocBitMap();
+    // Zero the run's header and the slot headers.
+    void ZeroHeaderAndSlotHeaders();
     // Iterate over all the slots and apply the given function.
     void InspectAllSlots(void (*handler)(void* start, void* end, size_t used_bytes, void* callback_arg), void* arg);
     // Dump the run metadata for debugging.
@@ -258,11 +454,24 @@
         REQUIRES(Locks::thread_list_lock_);
 
    private:
-    // The common part of MarkFreeBitMap() and MarkThreadLocalFreeBitMap(). Returns the bracket
+    // The common part of AddToBulkFreeList() and AddToThreadLocalFreeList(). Returns the bracket
     // size.
-    size_t MarkFreeBitMapShared(void* ptr, uint32_t* free_bit_map_base, const char* caller_name);
-    // Turns the bit map into a string for debugging.
-    static std::string BitMapToStr(uint32_t* bit_map_base, size_t num_vec);
+    size_t AddToFreeListShared(void* ptr, SlotFreeList<true>* free_list, const char* caller_name);
+    // Turns a FreeList into a string for debugging.
+    template<bool kUseTail>
+    std::string FreeListToStr(SlotFreeList<kUseTail>* free_list);
+    // Checks that a given pointer is a valid slot address and returns it as a Slot*.
+    Slot* ToSlot(void* ptr) {
+      const uint8_t idx = size_bracket_idx_;
+      const size_t bracket_size = bracketSizes[idx];
+      const size_t offset_from_slot_base = reinterpret_cast<uint8_t*>(ptr)
+          - reinterpret_cast<uint8_t*>(FirstSlot());
+      DCHECK_EQ(offset_from_slot_base % bracket_size, static_cast<size_t>(0));
+      size_t slot_idx = offset_from_slot_base / bracket_size;
+      DCHECK_LT(slot_idx, numOfSlots[idx]);
+      return reinterpret_cast<Slot*>(ptr);
+    }
+    size_t SlotIndex(Slot* slot);
 
     // TODO: DISALLOW_COPY_AND_ASSIGN(Run);
   };
@@ -283,10 +492,6 @@
   static size_t numOfSlots[kNumOfSizeBrackets];
   // The header sizes in bytes of the runs for each size bracket.
   static size_t headerSizes[kNumOfSizeBrackets];
-  // The byte offsets of the bulk free bit maps of the runs for each size bracket.
-  static size_t bulkFreeBitMapOffsets[kNumOfSizeBrackets];
-  // The byte offsets of the thread-local free bit maps of the runs for each size bracket.
-  static size_t threadLocalFreeBitMapOffsets[kNumOfSizeBrackets];
 
   // Initialize the run specs (the above arrays).
   static void Initialize();
@@ -493,7 +698,7 @@
   // The reader-writer lock to allow one bulk free at a time while
   // allowing multiple individual frees at the same time. Also, this
   // is used to avoid race conditions between BulkFree() and
-  // RevokeThreadLocalRuns() on the bulk free bitmaps.
+  // RevokeThreadLocalRuns() on the bulk free list.
   ReaderWriterMutex bulk_free_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
 
   // The page release mode.