Revert "Stack maps: Interleave consecutive varints."

This reverts commit a2b34561a7faca95d0a4f8194ad155798e238e37.

Reason for revert: <INSERT REASONING HERE>

Change-Id: Ie5b220e429e101bb5fa2606665a9c8cb64308ad3

Bug: 135638469
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index 87e15ba..87702cc 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -307,14 +307,12 @@
 
   ScopedArenaVector<uint8_t> buffer(allocator_->Adapter(kArenaAllocStackMapStream));
   BitMemoryWriter<ScopedArenaVector<uint8_t>> out(&buffer);
-  out.WriteInterleavedVarints(std::array<uint32_t, CodeInfo::kNumHeaders>{
-    flags,
-    packed_frame_size_,
-    core_spill_mask_,
-    fp_spill_mask_,
-    num_dex_registers_,
-    bit_table_flags,
-  });
+  out.WriteVarint(flags);
+  out.WriteVarint(packed_frame_size_);
+  out.WriteVarint(core_spill_mask_);
+  out.WriteVarint(fp_spill_mask_);
+  out.WriteVarint(num_dex_registers_);
+  out.WriteVarint(bit_table_flags);
   ForEachBitTable([&out](size_t, auto bit_table) {
     if (bit_table->size() != 0) {  // Skip empty bit-tables.
       bit_table->Encode(out);
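
(Illustration, not part of the diff: with consecutive varints, each 4-bit header is immediately
followed by its payload, if any, whereas the interleaved scheme being reverted wrote all headers
before any payloads. For two hypothetical header values, 300 and 5:

  Consecutive (this CL):  [header 13][300 in 16 bits][header 5]
  Interleaved (reverted): [header 13][header 5][300 in 16 bits]

Header 13 = kVarintSmallValue + 2 announces a 2-byte payload; header 5 stores the value inline.)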
diff --git a/libartbase/base/bit_memory_region.h b/libartbase/base/bit_memory_region.h
index f19bcf1..50d132e 100644
--- a/libartbase/base/bit_memory_region.h
+++ b/libartbase/base/bit_memory_region.h
@@ -22,8 +22,6 @@
 #include "bit_utils.h"
 #include "memory_tool.h"
 
-#include <array>
-
 namespace art {
 
 // Bit memory region is a bit offset subregion of a normal memory region. This is useful for
@@ -39,9 +37,11 @@
   BitMemoryRegion() = default;
   ALWAYS_INLINE BitMemoryRegion(uint8_t* data, ssize_t bit_start, size_t bit_size) {
     // Normalize the data pointer. Note that bit_start may be negative.
-    data_ = AlignDown(data + (bit_start >> kBitsPerByteLog2), kPageSize);
-    bit_start_ = bit_start + kBitsPerByte * (data - data_);
+    uint8_t* aligned_data = AlignDown(data + (bit_start >> kBitsPerByteLog2), sizeof(uintptr_t));
+    data_ = reinterpret_cast<uintptr_t*>(aligned_data);
+    bit_start_ = bit_start + kBitsPerByte * (data - aligned_data);
     bit_size_ = bit_size;
+    DCHECK_LT(bit_start_, static_cast<size_t>(kBitsPerIntPtrT));
   }
   ALWAYS_INLINE explicit BitMemoryRegion(MemoryRegion region)
     : BitMemoryRegion(region.begin(), /* bit_start */ 0, region.size_in_bits()) {
@@ -55,7 +55,7 @@
 
   const uint8_t* data() const {
     DCHECK_ALIGNED(bit_start_, kBitsPerByte);
-    return data_ + bit_start_ / kBitsPerByte;
+    return reinterpret_cast<const uint8_t*>(data_) + bit_start_ / kBitsPerByte;
   }
 
   size_t size_in_bits() const {
@@ -87,45 +87,42 @@
   // significant bit in the first byte.
   ALWAYS_INLINE bool LoadBit(size_t bit_offset) const {
     DCHECK_LT(bit_offset, bit_size_);
+    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    return ((data_[index] >> shift) & 1) != 0;
+    return ((data[index] >> shift) & 1) != 0;
   }
 
   ALWAYS_INLINE void StoreBit(size_t bit_offset, bool value) {
     DCHECK_LT(bit_offset, bit_size_);
+    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    data_[index] &= ~(1 << shift);  // Clear bit.
-    data_[index] |= (value ? 1 : 0) << shift;  // Set bit.
+    data[index] &= ~(1 << shift);  // Clear bit.
+    data[index] |= (value ? 1 : 0) << shift;  // Set bit.
     DCHECK_EQ(value, LoadBit(bit_offset));
   }
 
   // Load `bit_length` bits from `data` starting at given `bit_offset`.
   // The least significant bit is stored in the smallest memory offset.
   ATTRIBUTE_NO_SANITIZE_ADDRESS  // We might touch extra bytes due to the alignment.
-  template<typename Result = size_t>
-  ALWAYS_INLINE Result LoadBits(size_t bit_offset, size_t bit_length) const {
-    static_assert(std::is_integral<Result>::value, "Result must be integral");
-    static_assert(std::is_unsigned<Result>::value, "Result must be unsigned");
-    DCHECK(IsAligned<sizeof(Result)>(data_));
+  ALWAYS_INLINE uint32_t LoadBits(size_t bit_offset, size_t bit_length) const {
+    DCHECK(IsAligned<sizeof(uintptr_t)>(data_));
     DCHECK_LE(bit_offset, bit_size_);
     DCHECK_LE(bit_length, bit_size_ - bit_offset);
-    DCHECK_LE(bit_length, BitSizeOf<Result>());
+    DCHECK_LE(bit_length, BitSizeOf<uint32_t>());
     if (bit_length == 0) {
       return 0;
     }
-    Result* data = reinterpret_cast<Result*>(data_);
-    size_t width = BitSizeOf<Result>();
-    Result clear = (std::numeric_limits<Result>::max() << 1) << (bit_length - 1);
-    size_t index = (bit_start_ + bit_offset) / width;
-    size_t shift = (bit_start_ + bit_offset) % width;
-    Result value = data[index] >> shift;
-    size_t finished_bits = width - shift;
+    uintptr_t mask = std::numeric_limits<uintptr_t>::max() >> (kBitsPerIntPtrT - bit_length);
+    size_t index = (bit_start_ + bit_offset) / kBitsPerIntPtrT;
+    size_t shift = (bit_start_ + bit_offset) % kBitsPerIntPtrT;
+    uintptr_t value = data_[index] >> shift;
+    size_t finished_bits = kBitsPerIntPtrT - shift;
     if (finished_bits < bit_length) {
-      value |= data[index + 1] << finished_bits;
+      value |= data_[index + 1] << finished_bits;
     }
-    return value & ~clear;
+    return value & mask;
   }
 
   // Store `bit_length` bits in `data` starting at given `bit_offset`.
@@ -140,15 +137,16 @@
     }
     // Write data byte by byte to avoid races with other threads
     // on bytes that do not overlap with this region.
+    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     uint32_t mask = std::numeric_limits<uint32_t>::max() >> (BitSizeOf<uint32_t>() - bit_length);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    data_[index] &= ~(mask << shift);  // Clear bits.
-    data_[index] |= (value << shift);  // Set bits.
+    data[index] &= ~(mask << shift);  // Clear bits.
+    data[index] |= (value << shift);  // Set bits.
     size_t finished_bits = kBitsPerByte - shift;
     for (int i = 1; finished_bits < bit_length; i++, finished_bits += kBitsPerByte) {
-      data_[index + i] &= ~(mask >> finished_bits);  // Clear bits.
-      data_[index + i] |= (value >> finished_bits);  // Set bits.
+      data[index + i] &= ~(mask >> finished_bits);  // Clear bits.
+      data[index + i] |= (value >> finished_bits);  // Set bits.
     }
     DCHECK_EQ(value, LoadBits(bit_offset, bit_length));
   }
@@ -203,13 +201,14 @@
   }
 
  private:
-  uint8_t* data_ = nullptr;  // The pointer is page aligned.
+  // The data pointer must be naturally aligned. This makes loading code faster.
+  uintptr_t* data_ = nullptr;
   size_t bit_start_ = 0;
   size_t bit_size_ = 0;
 };
 
-constexpr uint32_t kVarintBits = 4;  // Minimum number of bits used for varint.
-constexpr uint32_t kVarintMax = 11;  // Maximum value which is stored "inline".
+constexpr uint32_t kVarintHeaderBits = 4;
+constexpr uint32_t kVarintSmallValue = 11;  // Maximum value which is stored as-is.
 
 class BitMemoryReader {
  public:
@@ -233,9 +232,8 @@
     return finished_region_.Subregion(bit_offset, bit_length);
   }
 
-  template<typename Result = size_t>
-  ALWAYS_INLINE Result ReadBits(size_t bit_length) {
-    return ReadRegion(bit_length).LoadBits<Result>(/* bit_offset */ 0, bit_length);
+  ALWAYS_INLINE uint32_t ReadBits(size_t bit_length) {
+    return ReadRegion(bit_length).LoadBits(/* bit_offset */ 0, bit_length);
   }
 
   ALWAYS_INLINE bool ReadBit() {
@@ -247,24 +245,35 @@
   //   Values 0..11 represent the result as-is, with no further following bits.
   //   Values 12..15 mean the result is in the next 8/16/24/32-bits respectively.
   ALWAYS_INLINE uint32_t ReadVarint() {
-    uint32_t x = ReadBits(kVarintBits);
-    return (x <= kVarintMax) ? x : ReadBits((x - kVarintMax) * kBitsPerByte);
+    uint32_t x = ReadBits(kVarintHeaderBits);
+    if (x > kVarintSmallValue) {
+      x = ReadBits((x - kVarintSmallValue) * kBitsPerByte);
+    }
+    return x;
   }
 
-  // Read N 'interleaved' varints (different from just reading consecutive varints).
-  // All small values are stored first and the large values are stored after them.
-  // This requires fewer bit-reads compared to individually stored varints.
-  template<size_t N>
-  ALWAYS_INLINE std::array<uint32_t, N> ReadInterleavedVarints() {
-    static_assert(N * kVarintBits <= sizeof(uint64_t) * kBitsPerByte, "N too big");
-    std::array<uint32_t, N> values;
-    // StackMap BitTable uses over 8 varints in the header, so we need uint64_t.
-    uint64_t data = ReadBits<uint64_t>(N * kVarintBits);
-    for (size_t i = 0; i < N; i++) {
-      uint32_t x = BitFieldExtract(data, i * kVarintBits, kVarintBits);
-      values[i] = LIKELY(x <= kVarintMax) ? x : ReadBits((x - kVarintMax) * kBitsPerByte);
+  // Optimized version to read several consecutive varints.
+  // It reads all the headers at once in a single bit read.
+  template<size_t N>  // Template argument deduction works only for reference-to-array.
+  ALWAYS_INLINE void ReadVarints(uint32_t (&varints)[N]) {
+    constexpr size_t kBatch = std::min(N, sizeof(uint32_t) * kBitsPerByte / kVarintHeaderBits);
+    uint32_t headers = ReadBits(kBatch * kVarintHeaderBits);
+    uint32_t* out = varints;
+    for (size_t i = 0; i < kBatch; out++) {
+      uint32_t header = BitFieldExtract(headers, (i++) * kVarintHeaderBits, kVarintHeaderBits);
+      if (LIKELY(header <= kVarintSmallValue)) {
+        // Fast-path: consume one of the headers and continue to the next varint.
+        *out = header;
+      } else {
+        // Slow-path: roll back the reader, read the large value, then read the remaining headers.
+        finished_region_.Resize(finished_region_.size_in_bits() - (kBatch - i) * kVarintHeaderBits);
+        *out = ReadBits((header - kVarintSmallValue) * kBitsPerByte);
+        headers = ReadBits((kBatch - i) * kVarintHeaderBits) << (i * kVarintHeaderBits);
+      }
     }
-    return values;
+    for (size_t i = kBatch; i < N; i++, out++) {
+      *out = ReadVarint();
+    }
   }
 
  private:
@@ -311,26 +320,16 @@
     Allocate(1).StoreBit(/* bit_offset */ 0, value);
   }
 
-  template<size_t N>
-  ALWAYS_INLINE void WriteInterleavedVarints(std::array<uint32_t, N> values) {
-    // Write small values (or the number of bytes needed for the large values).
-    for (uint32_t value : values) {
-      if (value > kVarintMax) {
-        WriteBits(kVarintMax + BitsToBytesRoundUp(MinimumBitsToStore(value)), kVarintBits);
-      } else {
-        WriteBits(value, kVarintBits);
-      }
-    }
-    // Write large values.
-    for (uint32_t value : values) {
-      if (value > kVarintMax) {
-        WriteBits(value, BitsToBytesRoundUp(MinimumBitsToStore(value)) * kBitsPerByte);
-      }
-    }
-  }
-
+  // Write variable-length bit-packed integer.
   ALWAYS_INLINE void WriteVarint(uint32_t value) {
-    WriteInterleavedVarints<1>({value});
+    if (value <= kVarintSmallValue) {
+      WriteBits(value, kVarintHeaderBits);
+    } else {
+      uint32_t num_bits = RoundUp(MinimumBitsToStore(value), kBitsPerByte);
+      uint32_t header = kVarintSmallValue + num_bits / kBitsPerByte;
+      WriteBits(header, kVarintHeaderBits);
+      WriteBits(value, num_bits);
+    }
   }
 
   ALWAYS_INLINE void ByteAlign() {
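
(Illustration, not part of the diff: a minimal standalone sketch of the varint scheme restored
above -- a 4-bit header where values 0..11 are stored inline and values 12..15 announce a payload
in the next 1..4 bytes. BitWriter/BitReader here are simplified byte-at-a-time stand-ins, not
ART's BitMemoryWriter/BitMemoryReader; __builtin_clz assumes a GCC/Clang toolchain, as ART uses
Clang:

  #include <cstdint>
  #include <vector>

  struct BitWriter {  // Illustrative stand-in only.
    std::vector<uint8_t> buffer;
    size_t bit_offset = 0;
    void WriteBits(uint32_t value, size_t bit_length) {
      // Least significant bit goes to the smallest bit offset, as in BitMemoryRegion.
      for (size_t i = 0; i < bit_length; i++, bit_offset++) {
        if (bit_offset / 8 == buffer.size()) buffer.push_back(0);
        buffer[bit_offset / 8] |= ((value >> i) & 1u) << (bit_offset % 8);
      }
    }
    void WriteVarint(uint32_t value) {
      if (value <= 11) {
        WriteBits(value, 4);               // Small value: stored in the header itself.
      } else {
        uint32_t num_bytes = (32 - __builtin_clz(value) + 7) / 8;
        WriteBits(11 + num_bytes, 4);      // Header 12..15: payload size in bytes.
        WriteBits(value, num_bytes * 8);   // Payload immediately follows its header.
      }
    }
  };

  struct BitReader {  // Illustrative stand-in only.
    const uint8_t* data;
    size_t bit_offset = 0;
    uint32_t ReadBits(size_t bit_length) {
      uint32_t value = 0;
      for (size_t i = 0; i < bit_length; i++, bit_offset++) {
        value |= ((data[bit_offset / 8] >> (bit_offset % 8)) & 1u) << i;
      }
      return value;
    }
    uint32_t ReadVarint() {
      uint32_t header = ReadBits(4);
      return header <= 11 ? header : ReadBits((header - 11) * 8);
    }
  };

A round-trip writing 5, 300 and 1000000 and reading them back exercises the inline, 2-byte and
3-byte cases respectively.)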
diff --git a/libartbase/base/bit_memory_region_test.cc b/libartbase/base/bit_memory_region_test.cc
index bf9c6d6..02623bf 100644
--- a/libartbase/base/bit_memory_region_test.cc
+++ b/libartbase/base/bit_memory_region_test.cc
@@ -43,7 +43,7 @@
 
       BitMemoryReader reader(buffer.data(), start_bit_offset);
       uint32_t result = reader.ReadVarint();
-      uint32_t upper_bound = RoundUp(MinimumBitsToStore(value), kBitsPerByte) + kVarintBits;
+      uint32_t upper_bound = RoundUp(MinimumBitsToStore(value), kBitsPerByte) + kVarintHeaderBits;
       EXPECT_EQ(writer.NumberOfWrittenBits(), reader.NumberOfReadBits());
       EXPECT_EQ(value, result);
       EXPECT_GE(upper_bound, writer.NumberOfWrittenBits());
diff --git a/libartbase/base/bit_table.h b/libartbase/base/bit_table.h
index 5ec162d..1984265 100644
--- a/libartbase/base/bit_table.h
+++ b/libartbase/base/bit_table.h
@@ -49,7 +49,8 @@
 
   ALWAYS_INLINE void Decode(BitMemoryReader& reader) {
     // Decode row count and column sizes from the table header.
-    std::array<uint32_t, 1+kNumColumns> header = reader.ReadInterleavedVarints<1+kNumColumns>();
+    uint32_t header[1 + kNumColumns];
+    reader.ReadVarints(header);
     num_rows_ = header[0];
     column_offset_[0] = 0;
     for (uint32_t i = 0; i < kNumColumns; i++) {
@@ -334,7 +335,7 @@
   }
 
   // Calculate the column bit widths based on the current data.
-  void Measure(/*out*/ uint32_t* column_bits) const {
+  void Measure(/*out*/ std::array<uint32_t, kNumColumns>* column_bits) const {
     uint32_t max_column_value[kNumColumns];
     std::fill_n(max_column_value, kNumColumns, 0);
     for (uint32_t r = 0; r < size(); r++) {
@@ -343,7 +344,7 @@
       }
     }
     for (uint32_t c = 0; c < kNumColumns; c++) {
-      column_bits[c] = MinimumBitsToStore(max_column_value[c]);
+      (*column_bits)[c] = MinimumBitsToStore(max_column_value[c]);
     }
   }
 
@@ -352,12 +353,14 @@
   void Encode(BitMemoryWriter<Vector>& out) const {
     size_t initial_bit_offset = out.NumberOfWrittenBits();
 
+    std::array<uint32_t, kNumColumns> column_bits;
+    Measure(&column_bits);
+
     // Write table header.
-    std::array<uint32_t, 1 + kNumColumns> header;
-    header[0] = size();
-    uint32_t* column_bits = header.data() + 1;
-    Measure(column_bits);
-    out.WriteInterleavedVarints(header);
+    out.WriteVarint(size());
+    for (uint32_t c = 0; c < kNumColumns; c++) {
+      out.WriteVarint(column_bits[c]);
+    }
 
     // Write table data.
     for (uint32_t r = 0; r < size(); r++) {
@@ -441,10 +444,8 @@
     size_t initial_bit_offset = out.NumberOfWrittenBits();
 
     // Write table header.
-    out.WriteInterleavedVarints(std::array<uint32_t, 2>{
-      dchecked_integral_cast<uint32_t>(size()),
-      dchecked_integral_cast<uint32_t>(max_num_bits_),
-    });
+    out.WriteVarint(size());
+    out.WriteVarint(max_num_bits_);
 
     // Write table data.
     for (MemoryRegion row : rows_) {
diff --git a/runtime/oat.h b/runtime/oat.h
index 54d111c..c02ac0b 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Optimize stack map decoding - interleave varints.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '3', '\0' } };
+  // Last oat version changed reason: Stack maps: Handle special cases using flags.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '2', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index eebca85..2300d1f 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -33,13 +33,14 @@
 
 void CodeInfo::Decode(const uint8_t* data, DecodeFlags flags) {
   BitMemoryReader reader(data);
-  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
+  uint32_t header[kNumHeaders];
+  reader.ReadVarints(header);
   ForEachHeaderField([this, &header](size_t i, auto member_pointer) {
     this->*member_pointer = header[i];
   });
   ForEachBitTableField([this, &reader](size_t i, auto member_pointer) {
     auto& table = this->*member_pointer;
-    if (LIKELY(HasBitTable(i))) {
+    if (HasBitTable(i)) {
       if (UNLIKELY(IsBitTableDeduped(i))) {
         ssize_t bit_offset = reader.NumberOfReadBits() - reader.ReadVarint();
         BitMemoryReader reader2(reader.data(), bit_offset);  // The offset is negative.
@@ -59,16 +60,12 @@
   writer_.ByteAlign();
   size_t deduped_offset = writer_.NumberOfWrittenBits() / kBitsPerByte;
 
-  // The back-reference offset takes space so dedupe is not worth it for tiny tables.
-  constexpr size_t kMinDedupSize = 32;  // Assume 32-bit offset on average.
-
   // Read the existing code info and find (and keep) dedup-map iterator for each table.
   // The iterator stores BitMemoryRegion and bit_offset of previous identical BitTable.
   BitMemoryReader reader(code_info_data);
   CodeInfo code_info;  // Temporary storage for decoded data.
-  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
-  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
-    code_info.*member_pointer = header[i];
+  ForEachHeaderField([&reader, &code_info](size_t, auto member_pointer) {
+    code_info.*member_pointer = reader.ReadVarint();
   });
   std::map<BitMemoryRegion, uint32_t, BitMemoryRegion::Less>::iterator it[kNumBitTables];
   ForEachBitTableField([this, &reader, &code_info, &it](size_t i, auto member_pointer) {
@@ -78,17 +75,16 @@
       (code_info.*member_pointer).Decode(reader);
       BitMemoryRegion region = reader.GetReadRegion().Subregion(bit_table_start);
       it[i] = dedupe_map_.emplace(region, /* default bit_offset */ 0).first;
-      if (it[i]->second != 0 && region.size_in_bits() > kMinDedupSize) {  // Seen before and large?
+      if (it[i]->second != 0 && region.size_in_bits() > 32) {  // Seen before and large?
         code_info.SetBitTableDeduped(i);  // Mark as deduped before we write header.
       }
     }
   });
 
   // Write the code info back, but replace deduped tables with relative offsets.
-  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
-    header[i] = code_info.*member_pointer;
+  ForEachHeaderField([this, &code_info](size_t, auto member_pointer) {
+    writer_.WriteVarint(code_info.*member_pointer);
   });
-  writer_.WriteInterleavedVarints(header);
   ForEachBitTableField([this, &code_info, &it](size_t i, auto) {
     if (code_info.HasBitTable(i)) {
       uint32_t& bit_offset = it[i]->second;
@@ -110,8 +106,7 @@
         DCHECK_EQ(old_code_info.*member_pointer, new_code_info.*member_pointer);
       }
     });
-    ForEachBitTableField([&old_code_info, &new_code_info](size_t i, auto member_pointer) {
-      DCHECK_EQ(old_code_info.HasBitTable(i), new_code_info.HasBitTable(i));
+    ForEachBitTableField([&old_code_info, &new_code_info](size_t, auto member_pointer) {
       DCHECK((old_code_info.*member_pointer).Equals(new_code_info.*member_pointer));
     });
   }
@@ -208,9 +203,8 @@
   Stats* codeinfo_stats = parent->Child("CodeInfo");
   BitMemoryReader reader(code_info_data);
   CodeInfo code_info;  // Temporary storage for decoded tables.
-  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
-  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
-    code_info.*member_pointer = header[i];
+  ForEachHeaderField([&reader, &code_info](size_t, auto member_pointer) {
+    code_info.*member_pointer = reader.ReadVarint();
   });
   codeinfo_stats->Child("Header")->AddBits(reader.NumberOfReadBits());
   ForEachBitTableField([codeinfo_stats, &reader, &code_info](size_t i, auto member_pointer) {
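
(Aside, not part of the diff: the dedup path above replaces a repeated bit-table with a varint
holding the distance back to the first identical copy; on decode, IsBitTableDeduped() triggers
"bit_offset = NumberOfReadBits() - ReadVarint()" and the table is re-read from there. A simplified
sketch of that bookkeeping, keyed by plain byte strings rather than ART's BitMemoryRegion, with
hypothetical names:

  #include <cstdint>
  #include <map>
  #include <string>

  struct TableDeduper {  // Illustrative stand-in only.
    std::map<std::string, uint32_t> first_copy;  // Table contents -> bit offset of first copy.

    // Returns 0 if `table` is new (or too small for a back-reference to pay off) and
    // must be written in full; otherwise returns the backward distance in bits.
    uint32_t Dedupe(const std::string& table, uint32_t current_bit_offset) {
      auto result = first_copy.emplace(table, current_bit_offset);
      bool seen_before = !result.second;
      if (seen_before && table.size() * 8 > 32) {  // The offset itself takes space.
        return current_bit_offset - result.first->second;
      }
      return 0;
    }
  };

The size threshold mirrors the "size_in_bits() > 32" check above.)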
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index b438074..c088eb6 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -443,7 +443,8 @@
 
   ALWAYS_INLINE static QuickMethodFrameInfo DecodeFrameInfo(const uint8_t* code_info_data) {
     BitMemoryReader reader(code_info_data);
-    std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
+    uint32_t header[4];  // flags, packed_frame_size, core_spill_mask, fp_spill_mask.
+    reader.ReadVarints(header);
     return QuickMethodFrameInfo(header[1] * kStackAlignment, header[2], header[3]);
   }