Revert^2 "Stack maps: Interleave consecutive varints."

Reorder the layout of consecutive varints.  Store all the 'headers'
which define the varint size first and then store any large values.

The size is unchanged, but it makes the reading from memory faster.

This speeds up CodeInfo by 10%, and maps startup by 0.1%.
Change in size is negligible (the bits mostly just move).

This reverts commit 1b2a49b7aba39ed6663a69dfdf63d0df069f0d42.

Test: test.py -b --host --64 --optimizing
Change-Id: Ica7b42180ef2bae637445c0ce44fd3833ec0ecfc
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index 87702cc..87e15ba 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -307,12 +307,14 @@
 
   ScopedArenaVector<uint8_t> buffer(allocator_->Adapter(kArenaAllocStackMapStream));
   BitMemoryWriter<ScopedArenaVector<uint8_t>> out(&buffer);
-  out.WriteVarint(flags);
-  out.WriteVarint(packed_frame_size_);
-  out.WriteVarint(core_spill_mask_);
-  out.WriteVarint(fp_spill_mask_);
-  out.WriteVarint(num_dex_registers_);
-  out.WriteVarint(bit_table_flags);
+  out.WriteInterleavedVarints(std::array<uint32_t, CodeInfo::kNumHeaders>{
+    flags,
+    packed_frame_size_,
+    core_spill_mask_,
+    fp_spill_mask_,
+    num_dex_registers_,
+    bit_table_flags,
+  });
   ForEachBitTable([&out](size_t, auto bit_table) {
     if (bit_table->size() != 0) {  // Skip empty bit-tables.
       bit_table->Encode(out);
diff --git a/libartbase/base/bit_memory_region.h b/libartbase/base/bit_memory_region.h
index 50d132e..9f0d546 100644
--- a/libartbase/base/bit_memory_region.h
+++ b/libartbase/base/bit_memory_region.h
@@ -22,6 +22,8 @@
 #include "bit_utils.h"
 #include "memory_tool.h"
 
+#include <array>
+
 namespace art {
 
 // Bit memory region is a bit offset subregion of a normal memoryregion. This is useful for
@@ -37,11 +39,9 @@
   BitMemoryRegion() = default;
   ALWAYS_INLINE BitMemoryRegion(uint8_t* data, ssize_t bit_start, size_t bit_size) {
     // Normalize the data pointer. Note that bit_start may be negative.
-    uint8_t* aligned_data = AlignDown(data + (bit_start >> kBitsPerByteLog2), sizeof(uintptr_t));
-    data_ = reinterpret_cast<uintptr_t*>(aligned_data);
-    bit_start_ = bit_start + kBitsPerByte * (data - aligned_data);
+    data_ = AlignDown(data + (bit_start >> kBitsPerByteLog2), kPageSize);
+    bit_start_ = bit_start + kBitsPerByte * (data - data_);
     bit_size_ = bit_size;
-    DCHECK_LT(bit_start_, static_cast<size_t>(kBitsPerIntPtrT));
   }
   ALWAYS_INLINE explicit BitMemoryRegion(MemoryRegion region)
     : BitMemoryRegion(region.begin(), /* bit_start */ 0, region.size_in_bits()) {
@@ -55,7 +55,7 @@
 
   const uint8_t* data() const {
     DCHECK_ALIGNED(bit_start_, kBitsPerByte);
-    return reinterpret_cast<const uint8_t*>(data_) + bit_start_ / kBitsPerByte;
+    return data_ + bit_start_ / kBitsPerByte;
   }
 
   size_t size_in_bits() const {
@@ -87,42 +87,45 @@
   // significant bit in the first byte.
   ALWAYS_INLINE bool LoadBit(size_t bit_offset) const {
     DCHECK_LT(bit_offset, bit_size_);
-    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    return ((data[index] >> shift) & 1) != 0;
+    return ((data_[index] >> shift) & 1) != 0;
   }
 
   ALWAYS_INLINE void StoreBit(size_t bit_offset, bool value) {
     DCHECK_LT(bit_offset, bit_size_);
-    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    data[index] &= ~(1 << shift);  // Clear bit.
-    data[index] |= (value ? 1 : 0) << shift;  // Set bit.
+    data_[index] &= ~(1 << shift);  // Clear bit.
+    data_[index] |= (value ? 1 : 0) << shift;  // Set bit.
     DCHECK_EQ(value, LoadBit(bit_offset));
   }
 
   // Load `bit_length` bits from `data` starting at given `bit_offset`.
   // The least significant bit is stored in the smallest memory offset.
+  template<typename Result = size_t>
   ATTRIBUTE_NO_SANITIZE_ADDRESS  // We might touch extra bytes due to the alignment.
-  ALWAYS_INLINE uint32_t LoadBits(size_t bit_offset, size_t bit_length) const {
-    DCHECK(IsAligned<sizeof(uintptr_t)>(data_));
+  ALWAYS_INLINE Result LoadBits(size_t bit_offset, size_t bit_length) const {
+    static_assert(std::is_integral<Result>::value, "Result must be integral");
+    static_assert(std::is_unsigned<Result>::value, "Result must be unsigned");
+    DCHECK(IsAligned<sizeof(Result)>(data_));
     DCHECK_LE(bit_offset, bit_size_);
     DCHECK_LE(bit_length, bit_size_ - bit_offset);
-    DCHECK_LE(bit_length, BitSizeOf<uint32_t>());
+    DCHECK_LE(bit_length, BitSizeOf<Result>());
     if (bit_length == 0) {
       return 0;
     }
-    uintptr_t mask = std::numeric_limits<uintptr_t>::max() >> (kBitsPerIntPtrT - bit_length);
-    size_t index = (bit_start_ + bit_offset) / kBitsPerIntPtrT;
-    size_t shift = (bit_start_ + bit_offset) % kBitsPerIntPtrT;
-    uintptr_t value = data_[index] >> shift;
-    size_t finished_bits = kBitsPerIntPtrT - shift;
+    Result* data = reinterpret_cast<Result*>(data_);
+    size_t width = BitSizeOf<Result>();
+    Result clear = (std::numeric_limits<Result>::max() << 1) << (bit_length - 1);
+    size_t index = (bit_start_ + bit_offset) / width;
+    size_t shift = (bit_start_ + bit_offset) % width;
+    Result value = data[index] >> shift;
+    size_t finished_bits = width - shift;
     if (finished_bits < bit_length) {
-      value |= data_[index + 1] << finished_bits;
+      value |= data[index + 1] << finished_bits;
     }
-    return value & mask;
+    return value & ~clear;
   }
 
   // Store `bit_length` bits in `data` starting at given `bit_offset`.
@@ -137,16 +140,15 @@
     }
     // Write data byte by byte to avoid races with other threads
     // on bytes that do not overlap with this region.
-    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     uint32_t mask = std::numeric_limits<uint32_t>::max() >> (BitSizeOf<uint32_t>() - bit_length);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    data[index] &= ~(mask << shift);  // Clear bits.
-    data[index] |= (value << shift);  // Set bits.
+    data_[index] &= ~(mask << shift);  // Clear bits.
+    data_[index] |= (value << shift);  // Set bits.
     size_t finished_bits = kBitsPerByte - shift;
     for (int i = 1; finished_bits < bit_length; i++, finished_bits += kBitsPerByte) {
-      data[index + i] &= ~(mask >> finished_bits);  // Clear bits.
-      data[index + i] |= (value >> finished_bits);  // Set bits.
+      data_[index + i] &= ~(mask >> finished_bits);  // Clear bits.
+      data_[index + i] |= (value >> finished_bits);  // Set bits.
     }
     DCHECK_EQ(value, LoadBits(bit_offset, bit_length));
   }
@@ -201,14 +203,13 @@
   }
 
  private:
-  // The data pointer must be naturally aligned. This makes loading code faster.
-  uintptr_t* data_ = nullptr;
+  uint8_t* data_ = nullptr;  // The pointer is page aligned.
   size_t bit_start_ = 0;
   size_t bit_size_ = 0;
 };
 
-constexpr uint32_t kVarintHeaderBits = 4;
-constexpr uint32_t kVarintSmallValue = 11;  // Maximum value which is stored as-is.
+constexpr uint32_t kVarintBits = 4;  // Minimum number of bits used for varint.
+constexpr uint32_t kVarintMax = 11;  // Maximum value which is stored "inline".
 
 class BitMemoryReader {
  public:
@@ -232,8 +233,9 @@
     return finished_region_.Subregion(bit_offset, bit_length);
   }
 
-  ALWAYS_INLINE uint32_t ReadBits(size_t bit_length) {
-    return ReadRegion(bit_length).LoadBits(/* bit_offset */ 0, bit_length);
+  template<typename Result = size_t>
+  ALWAYS_INLINE Result ReadBits(size_t bit_length) {
+    return ReadRegion(bit_length).LoadBits<Result>(/* bit_offset */ 0, bit_length);
   }
 
   ALWAYS_INLINE bool ReadBit() {
@@ -245,35 +247,24 @@
   //   Values 0..11 represent the result as-is, with no further following bits.
   //   Values 12..15 mean the result is in the next 8/16/24/32-bits respectively.
   ALWAYS_INLINE uint32_t ReadVarint() {
-    uint32_t x = ReadBits(kVarintHeaderBits);
-    if (x > kVarintSmallValue) {
-      x = ReadBits((x - kVarintSmallValue) * kBitsPerByte);
-    }
-    return x;
+    uint32_t x = ReadBits(kVarintBits);
+    return (x <= kVarintMax) ? x : ReadBits((x - kVarintMax) * kBitsPerByte);
   }
 
-  // Optimized version to read several consecutive varints.
-  // It reads all the headers at once in a single bit read.
-  template<size_t N>  // Inference works only with ref-arrays.
-  ALWAYS_INLINE void ReadVarints(uint32_t (&varints)[N]) {
-    constexpr size_t kBatch = std::min(N, sizeof(uint32_t) * kBitsPerByte / kVarintHeaderBits);
-    uint32_t headers = ReadBits(kBatch * kVarintHeaderBits);
-    uint32_t* out = varints;
-    for (size_t i = 0; i < kBatch; out++) {
-      uint32_t header = BitFieldExtract(headers, (i++) * kVarintHeaderBits, kVarintHeaderBits);
-      if (LIKELY(header <= kVarintSmallValue)) {
-        // Fast-path: consume one of the headers and continue to the next varint.
-        *out = header;
-      } else {
-        // Slow-path: rollback reader, read large value, and read remaning headers.
-        finished_region_.Resize(finished_region_.size_in_bits() - (kBatch-i) * kVarintHeaderBits);
-        *out = ReadBits((header - kVarintSmallValue) * kBitsPerByte);
-        headers = ReadBits((kBatch-i) * kVarintHeaderBits) << (i * kVarintHeaderBits);
-      }
+  // Read N 'interleaved' varints (different to just reading consecutive varints).
+  // All small values are stored first and the large values are stored after them.
+  // This requires fewer bit-reads compared to indidually storing the varints.
+  template<size_t N>
+  ALWAYS_INLINE std::array<uint32_t, N> ReadInterleavedVarints() {
+    static_assert(N * kVarintBits <= sizeof(uint64_t) * kBitsPerByte, "N too big");
+    std::array<uint32_t, N> values;
+    // StackMap BitTable uses over 8 varints in the header, so we need uint64_t.
+    uint64_t data = ReadBits<uint64_t>(N * kVarintBits);
+    for (size_t i = 0; i < N; i++) {
+      uint32_t x = BitFieldExtract(data, i * kVarintBits, kVarintBits);
+      values[i] = LIKELY(x <= kVarintMax) ? x : ReadBits((x - kVarintMax) * kBitsPerByte);
     }
-    for (size_t i = kBatch; i < N; i++, out++) {
-      *out = ReadVarint();
-    }
+    return values;
   }
 
  private:
@@ -320,16 +311,26 @@
     Allocate(1).StoreBit(/* bit_offset */ 0, value);
   }
 
-  // Write variable-length bit-packed integer.
-  ALWAYS_INLINE void WriteVarint(uint32_t value) {
-    if (value <= kVarintSmallValue) {
-      WriteBits(value, kVarintHeaderBits);
-    } else {
-      uint32_t num_bits = RoundUp(MinimumBitsToStore(value), kBitsPerByte);
-      uint32_t header = kVarintSmallValue + num_bits / kBitsPerByte;
-      WriteBits(header, kVarintHeaderBits);
-      WriteBits(value, num_bits);
+  template<size_t N>
+  ALWAYS_INLINE void WriteInterleavedVarints(std::array<uint32_t, N> values) {
+    // Write small values (or the number of bytes needed for the large values).
+    for (uint32_t value : values) {
+      if (value > kVarintMax) {
+        WriteBits(kVarintMax + BitsToBytesRoundUp(MinimumBitsToStore(value)), kVarintBits);
+      } else {
+        WriteBits(value, kVarintBits);
+      }
     }
+    // Write large values.
+    for (uint32_t value : values) {
+      if (value > kVarintMax) {
+        WriteBits(value, BitsToBytesRoundUp(MinimumBitsToStore(value)) * kBitsPerByte);
+      }
+    }
+  }
+
+  ALWAYS_INLINE void WriteVarint(uint32_t value) {
+    WriteInterleavedVarints<1>({value});
   }
 
   ALWAYS_INLINE void ByteAlign() {
diff --git a/libartbase/base/bit_memory_region_test.cc b/libartbase/base/bit_memory_region_test.cc
index 02623bf..bf9c6d6 100644
--- a/libartbase/base/bit_memory_region_test.cc
+++ b/libartbase/base/bit_memory_region_test.cc
@@ -43,7 +43,7 @@
 
       BitMemoryReader reader(buffer.data(), start_bit_offset);
       uint32_t result = reader.ReadVarint();
-      uint32_t upper_bound = RoundUp(MinimumBitsToStore(value), kBitsPerByte) + kVarintHeaderBits;
+      uint32_t upper_bound = RoundUp(MinimumBitsToStore(value), kBitsPerByte) + kVarintBits;
       EXPECT_EQ(writer.NumberOfWrittenBits(), reader.NumberOfReadBits());
       EXPECT_EQ(value, result);
       EXPECT_GE(upper_bound, writer.NumberOfWrittenBits());
diff --git a/libartbase/base/bit_table.h b/libartbase/base/bit_table.h
index 1984265..5ec162d 100644
--- a/libartbase/base/bit_table.h
+++ b/libartbase/base/bit_table.h
@@ -49,8 +49,7 @@
 
   ALWAYS_INLINE void Decode(BitMemoryReader& reader) {
     // Decode row count and column sizes from the table header.
-    uint32_t header[1 + kNumColumns];
-    reader.ReadVarints(header);
+    std::array<uint32_t, 1+kNumColumns> header = reader.ReadInterleavedVarints<1+kNumColumns>();
     num_rows_ = header[0];
     column_offset_[0] = 0;
     for (uint32_t i = 0; i < kNumColumns; i++) {
@@ -335,7 +334,7 @@
   }
 
   // Calculate the column bit widths based on the current data.
-  void Measure(/*out*/ std::array<uint32_t, kNumColumns>* column_bits) const {
+  void Measure(/*out*/ uint32_t* column_bits) const {
     uint32_t max_column_value[kNumColumns];
     std::fill_n(max_column_value, kNumColumns, 0);
     for (uint32_t r = 0; r < size(); r++) {
@@ -344,7 +343,7 @@
       }
     }
     for (uint32_t c = 0; c < kNumColumns; c++) {
-      (*column_bits)[c] = MinimumBitsToStore(max_column_value[c]);
+      column_bits[c] = MinimumBitsToStore(max_column_value[c]);
     }
   }
 
@@ -353,14 +352,12 @@
   void Encode(BitMemoryWriter<Vector>& out) const {
     size_t initial_bit_offset = out.NumberOfWrittenBits();
 
-    std::array<uint32_t, kNumColumns> column_bits;
-    Measure(&column_bits);
-
     // Write table header.
-    out.WriteVarint(size());
-    for (uint32_t c = 0; c < kNumColumns; c++) {
-      out.WriteVarint(column_bits[c]);
-    }
+    std::array<uint32_t, 1 + kNumColumns> header;
+    header[0] = size();
+    uint32_t* column_bits = header.data() + 1;
+    Measure(column_bits);
+    out.WriteInterleavedVarints(header);
 
     // Write table data.
     for (uint32_t r = 0; r < size(); r++) {
@@ -444,8 +441,10 @@
     size_t initial_bit_offset = out.NumberOfWrittenBits();
 
     // Write table header.
-    out.WriteVarint(size());
-    out.WriteVarint(max_num_bits_);
+    out.WriteInterleavedVarints(std::array<uint32_t, 2>{
+      dchecked_integral_cast<uint32_t>(size()),
+      dchecked_integral_cast<uint32_t>(max_num_bits_),
+    });
 
     // Write table data.
     for (MemoryRegion row : rows_) {
diff --git a/runtime/oat.h b/runtime/oat.h
index c02ac0b..54d111c 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Stack maps: Handle special cases using flags.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '2', '\0' } };
+  // Last oat version changed reason: Optimize stack map decoding - interleave varints.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '3', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index 2300d1f..eebca85 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -33,14 +33,13 @@
 
 void CodeInfo::Decode(const uint8_t* data, DecodeFlags flags) {
   BitMemoryReader reader(data);
-  uint32_t header[kNumHeaders];
-  reader.ReadVarints(header);
+  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
   ForEachHeaderField([this, &header](size_t i, auto member_pointer) {
     this->*member_pointer = header[i];
   });
   ForEachBitTableField([this, &reader](size_t i, auto member_pointer) {
     auto& table = this->*member_pointer;
-    if (HasBitTable(i)) {
+    if (LIKELY(HasBitTable(i))) {
       if (UNLIKELY(IsBitTableDeduped(i))) {
         ssize_t bit_offset = reader.NumberOfReadBits() - reader.ReadVarint();
         BitMemoryReader reader2(reader.data(), bit_offset);  // The offset is negative.
@@ -60,12 +59,16 @@
   writer_.ByteAlign();
   size_t deduped_offset = writer_.NumberOfWrittenBits() / kBitsPerByte;
 
+  // The back-reference offset takes space so dedupe is not worth it for tiny tables.
+  constexpr size_t kMinDedupSize = 32;  // Assume 32-bit offset on average.
+
   // Read the existing code info and find (and keep) dedup-map iterator for each table.
   // The iterator stores BitMemoryRegion and bit_offset of previous identical BitTable.
   BitMemoryReader reader(code_info_data);
   CodeInfo code_info;  // Temporary storage for decoded data.
-  ForEachHeaderField([&reader, &code_info](size_t, auto member_pointer) {
-    code_info.*member_pointer = reader.ReadVarint();
+  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
+  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
+    code_info.*member_pointer = header[i];
   });
   std::map<BitMemoryRegion, uint32_t, BitMemoryRegion::Less>::iterator it[kNumBitTables];
   ForEachBitTableField([this, &reader, &code_info, &it](size_t i, auto member_pointer) {
@@ -75,16 +78,17 @@
       (code_info.*member_pointer).Decode(reader);
       BitMemoryRegion region = reader.GetReadRegion().Subregion(bit_table_start);
       it[i] = dedupe_map_.emplace(region, /* default bit_offset */ 0).first;
-      if (it[i]->second != 0 && region.size_in_bits() > 32) {  // Seen before and large?
+      if (it[i]->second != 0 && region.size_in_bits() > kMinDedupSize) {  // Seen before and large?
         code_info.SetBitTableDeduped(i);  // Mark as deduped before we write header.
       }
     }
   });
 
   // Write the code info back, but replace deduped tables with relative offsets.
-  ForEachHeaderField([this, &code_info](size_t, auto member_pointer) {
-    writer_.WriteVarint(code_info.*member_pointer);
+  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
+    header[i] = code_info.*member_pointer;
   });
+  writer_.WriteInterleavedVarints(header);
   ForEachBitTableField([this, &code_info, &it](size_t i, auto) {
     if (code_info.HasBitTable(i)) {
       uint32_t& bit_offset = it[i]->second;
@@ -106,7 +110,8 @@
         DCHECK_EQ(old_code_info.*member_pointer, new_code_info.*member_pointer);
       }
     });
-    ForEachBitTableField([&old_code_info, &new_code_info](size_t, auto member_pointer) {
+    ForEachBitTableField([&old_code_info, &new_code_info](size_t i, auto member_pointer) {
+      DCHECK_EQ(old_code_info.HasBitTable(i), new_code_info.HasBitTable(i));
       DCHECK((old_code_info.*member_pointer).Equals(new_code_info.*member_pointer));
     });
   }
@@ -203,8 +208,9 @@
   Stats* codeinfo_stats = parent->Child("CodeInfo");
   BitMemoryReader reader(code_info_data);
   CodeInfo code_info;  // Temporary storage for decoded tables.
-  ForEachHeaderField([&reader, &code_info](size_t, auto member_pointer) {
-    code_info.*member_pointer = reader.ReadVarint();
+  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
+  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
+    code_info.*member_pointer = header[i];
   });
   codeinfo_stats->Child("Header")->AddBits(reader.NumberOfReadBits());
   ForEachBitTableField([codeinfo_stats, &reader, &code_info](size_t i, auto member_pointer) {
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index c088eb6..b438074 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -443,8 +443,7 @@
 
   ALWAYS_INLINE static QuickMethodFrameInfo DecodeFrameInfo(const uint8_t* code_info_data) {
     BitMemoryReader reader(code_info_data);
-    uint32_t header[4];  // flags, packed_frame_size, core_spill_mask, fp_spill_mask.
-    reader.ReadVarints(header);
+    std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
     return QuickMethodFrameInfo(header[1] * kStackAlignment, header[2], header[3]);
   }