| author | 2019-06-20 01:41:31 +0000 |
|---|---|
| committer | 2019-06-20 01:43:54 +0000 |
| commit | 1b2a49b7aba39ed6663a69dfdf63d0df069f0d42 (patch) |
| tree | 15a22f6390135758cb9eeaa1ef816f4634cc70f9 |
| parent | a2b34561a7faca95d0a4f8194ad155798e238e37 (diff) |
Revert "Stack maps: Interleave consecutive varints."
This reverts commit a2b34561a7faca95d0a4f8194ad155798e238e37.
Reason for revert: <INSERT REASONING HERE>
Change-Id: Ie5b220e429e101bb5fa2606665a9c8cb64308ad3
Bug: 135638469
| -rw-r--r-- | compiler/optimizing/stack_map_stream.cc | 14 |
|---|---|---|
| -rw-r--r-- | libartbase/base/bit_memory_region.h | 133 |
| -rw-r--r-- | libartbase/base/bit_memory_region_test.cc | 2 |
| -rw-r--r-- | libartbase/base/bit_table.h | 25 |
| -rw-r--r-- | runtime/oat.h | 4 |
| -rw-r--r-- | runtime/stack_map.cc | 28 |
| -rw-r--r-- | runtime/stack_map.h | 3 |
7 files changed, 101 insertions, 108 deletions
```diff
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index 87e15baa31..87702cc798 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -307,14 +307,12 @@ ScopedArenaVector<uint8_t> StackMapStream::Encode() {
 
   ScopedArenaVector<uint8_t> buffer(allocator_->Adapter(kArenaAllocStackMapStream));
   BitMemoryWriter<ScopedArenaVector<uint8_t>> out(&buffer);
-  out.WriteInterleavedVarints(std::array<uint32_t, CodeInfo::kNumHeaders>{
-      flags,
-      packed_frame_size_,
-      core_spill_mask_,
-      fp_spill_mask_,
-      num_dex_registers_,
-      bit_table_flags,
-  });
+  out.WriteVarint(flags);
+  out.WriteVarint(packed_frame_size_);
+  out.WriteVarint(core_spill_mask_);
+  out.WriteVarint(fp_spill_mask_);
+  out.WriteVarint(num_dex_registers_);
+  out.WriteVarint(bit_table_flags);
   ForEachBitTable([&out](size_t, auto bit_table) {
     if (bit_table->size() != 0) {  // Skip empty bit-tables.
       bit_table->Encode(out);
```
```diff
diff --git a/libartbase/base/bit_memory_region.h b/libartbase/base/bit_memory_region.h
index f19bcf1928..50d132e2fc 100644
--- a/libartbase/base/bit_memory_region.h
+++ b/libartbase/base/bit_memory_region.h
@@ -22,8 +22,6 @@
 #include "bit_utils.h"
 #include "memory_tool.h"
 
-#include <array>
-
 namespace art {
 
 // Bit memory region is a bit offset subregion of a normal memory region. This is useful for
@@ -39,9 +37,11 @@ class BitMemoryRegion final : public ValueObject {
   BitMemoryRegion() = default;
   ALWAYS_INLINE BitMemoryRegion(uint8_t* data, ssize_t bit_start, size_t bit_size) {
     // Normalize the data pointer. Note that bit_start may be negative.
-    data_ = AlignDown(data + (bit_start >> kBitsPerByteLog2), kPageSize);
-    bit_start_ = bit_start + kBitsPerByte * (data - data_);
+    uint8_t* aligned_data = AlignDown(data + (bit_start >> kBitsPerByteLog2), sizeof(uintptr_t));
+    data_ = reinterpret_cast<uintptr_t*>(aligned_data);
+    bit_start_ = bit_start + kBitsPerByte * (data - aligned_data);
     bit_size_ = bit_size;
+    DCHECK_LT(bit_start_, static_cast<size_t>(kBitsPerIntPtrT));
   }
   ALWAYS_INLINE explicit BitMemoryRegion(MemoryRegion region)
       : BitMemoryRegion(region.begin(), /* bit_start */ 0, region.size_in_bits()) {
@@ -55,7 +55,7 @@ class BitMemoryRegion final : public ValueObject {
 
   const uint8_t* data() const {
     DCHECK_ALIGNED(bit_start_, kBitsPerByte);
-    return data_ + bit_start_ / kBitsPerByte;
+    return reinterpret_cast<const uint8_t*>(data_) + bit_start_ / kBitsPerByte;
   }
 
   size_t size_in_bits() const {
@@ -87,45 +87,42 @@ class BitMemoryRegion final : public ValueObject {
   // significant bit in the first byte.
   ALWAYS_INLINE bool LoadBit(size_t bit_offset) const {
     DCHECK_LT(bit_offset, bit_size_);
+    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    return ((data_[index] >> shift) & 1) != 0;
+    return ((data[index] >> shift) & 1) != 0;
   }
 
   ALWAYS_INLINE void StoreBit(size_t bit_offset, bool value) {
     DCHECK_LT(bit_offset, bit_size_);
+    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    data_[index] &= ~(1 << shift);  // Clear bit.
-    data_[index] |= (value ? 1 : 0) << shift;  // Set bit.
+    data[index] &= ~(1 << shift);  // Clear bit.
+    data[index] |= (value ? 1 : 0) << shift;  // Set bit.
     DCHECK_EQ(value, LoadBit(bit_offset));
   }
 
   // Load `bit_length` bits from `data` starting at given `bit_offset`.
   // The least significant bit is stored in the smallest memory offset.
   ATTRIBUTE_NO_SANITIZE_ADDRESS  // We might touch extra bytes due to the alignment.
-  template<typename Result = size_t>
-  ALWAYS_INLINE Result LoadBits(size_t bit_offset, size_t bit_length) const {
-    static_assert(std::is_integral<Result>::value, "Result must be integral");
-    static_assert(std::is_unsigned<Result>::value, "Result must be unsigned");
-    DCHECK(IsAligned<sizeof(Result)>(data_));
+  ALWAYS_INLINE uint32_t LoadBits(size_t bit_offset, size_t bit_length) const {
+    DCHECK(IsAligned<sizeof(uintptr_t)>(data_));
     DCHECK_LE(bit_offset, bit_size_);
     DCHECK_LE(bit_length, bit_size_ - bit_offset);
-    DCHECK_LE(bit_length, BitSizeOf<Result>());
+    DCHECK_LE(bit_length, BitSizeOf<uint32_t>());
     if (bit_length == 0) {
       return 0;
     }
-    Result* data = reinterpret_cast<Result*>(data_);
-    size_t width = BitSizeOf<Result>();
-    Result clear = (std::numeric_limits<Result>::max() << 1) << (bit_length - 1);
-    size_t index = (bit_start_ + bit_offset) / width;
-    size_t shift = (bit_start_ + bit_offset) % width;
-    Result value = data[index] >> shift;
-    size_t finished_bits = width - shift;
+    uintptr_t mask = std::numeric_limits<uintptr_t>::max() >> (kBitsPerIntPtrT - bit_length);
+    size_t index = (bit_start_ + bit_offset) / kBitsPerIntPtrT;
+    size_t shift = (bit_start_ + bit_offset) % kBitsPerIntPtrT;
+    uintptr_t value = data_[index] >> shift;
+    size_t finished_bits = kBitsPerIntPtrT - shift;
    if (finished_bits < bit_length) {
-      value |= data[index + 1] << finished_bits;
+      value |= data_[index + 1] << finished_bits;
     }
-    return value & ~clear;
+    return value & mask;
   }
 
   // Store `bit_length` bits in `data` starting at given `bit_offset`.
@@ -140,15 +137,16 @@ class BitMemoryRegion final : public ValueObject {
     }
     // Write data byte by byte to avoid races with other threads
     // on bytes that do not overlap with this region.
+    uint8_t* data = reinterpret_cast<uint8_t*>(data_);
     uint32_t mask = std::numeric_limits<uint32_t>::max() >> (BitSizeOf<uint32_t>() - bit_length);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
-    data_[index] &= ~(mask << shift);  // Clear bits.
-    data_[index] |= (value << shift);  // Set bits.
+    data[index] &= ~(mask << shift);  // Clear bits.
+    data[index] |= (value << shift);  // Set bits.
     size_t finished_bits = kBitsPerByte - shift;
     for (int i = 1; finished_bits < bit_length; i++, finished_bits += kBitsPerByte) {
-      data_[index + i] &= ~(mask >> finished_bits);  // Clear bits.
-      data_[index + i] |= (value >> finished_bits);  // Set bits.
+      data[index + i] &= ~(mask >> finished_bits);  // Clear bits.
+      data[index + i] |= (value >> finished_bits);  // Set bits.
     }
     DCHECK_EQ(value, LoadBits(bit_offset, bit_length));
   }
@@ -203,13 +201,14 @@ class BitMemoryRegion final : public ValueObject {
   }
 
  private:
-  uint8_t* data_ = nullptr;  // The pointer is page aligned.
+  // The data pointer must be naturally aligned. This makes loading code faster.
+  uintptr_t* data_ = nullptr;
   size_t bit_start_ = 0;
   size_t bit_size_ = 0;
 };
 
-constexpr uint32_t kVarintBits = 4;  // Minimum number of bits used for varint.
-constexpr uint32_t kVarintMax = 11;  // Maximum value which is stored "inline".
+constexpr uint32_t kVarintHeaderBits = 4;
+constexpr uint32_t kVarintSmallValue = 11;  // Maximum value which is stored as-is.
 
 class BitMemoryReader {
  public:
@@ -233,9 +232,8 @@ class BitMemoryReader {
     return finished_region_.Subregion(bit_offset, bit_length);
   }
 
-  template<typename Result = size_t>
-  ALWAYS_INLINE Result ReadBits(size_t bit_length) {
-    return ReadRegion(bit_length).LoadBits<Result>(/* bit_offset */ 0, bit_length);
+  ALWAYS_INLINE uint32_t ReadBits(size_t bit_length) {
+    return ReadRegion(bit_length).LoadBits(/* bit_offset */ 0, bit_length);
   }
 
   ALWAYS_INLINE bool ReadBit() {
@@ -247,24 +245,35 @@ class BitMemoryReader {
   // Values 0..11 represent the result as-is, with no further following bits.
   // Values 12..15 mean the result is in the next 8/16/24/32-bits respectively.
   ALWAYS_INLINE uint32_t ReadVarint() {
-    uint32_t x = ReadBits(kVarintBits);
-    return (x <= kVarintMax) ? x : ReadBits((x - kVarintMax) * kBitsPerByte);
+    uint32_t x = ReadBits(kVarintHeaderBits);
+    if (x > kVarintSmallValue) {
+      x = ReadBits((x - kVarintSmallValue) * kBitsPerByte);
+    }
+    return x;
   }
 
-  // Read N 'interleaved' varints (different to just reading consecutive varints).
-  // All small values are stored first and the large values are stored after them.
-  // This requires fewer bit-reads compared to individually storing the varints.
-  template<size_t N>
-  ALWAYS_INLINE std::array<uint32_t, N> ReadInterleavedVarints() {
-    static_assert(N * kVarintBits <= sizeof(uint64_t) * kBitsPerByte, "N too big");
-    std::array<uint32_t, N> values;
-    // StackMap BitTable uses over 8 varints in the header, so we need uint64_t.
-    uint64_t data = ReadBits<uint64_t>(N * kVarintBits);
-    for (size_t i = 0; i < N; i++) {
-      uint32_t x = BitFieldExtract(data, i * kVarintBits, kVarintBits);
-      values[i] = LIKELY(x <= kVarintMax) ? x : ReadBits((x - kVarintMax) * kBitsPerByte);
+  // Optimized version to read several consecutive varints.
+  // It reads all the headers at once in a single bit read.
+  template<size_t N>  // Inference works only with ref-arrays.
+  ALWAYS_INLINE void ReadVarints(uint32_t (&varints)[N]) {
+    constexpr size_t kBatch = std::min(N, sizeof(uint32_t) * kBitsPerByte / kVarintHeaderBits);
+    uint32_t headers = ReadBits(kBatch * kVarintHeaderBits);
+    uint32_t* out = varints;
+    for (size_t i = 0; i < kBatch; out++) {
+      uint32_t header = BitFieldExtract(headers, (i++) * kVarintHeaderBits, kVarintHeaderBits);
+      if (LIKELY(header <= kVarintSmallValue)) {
+        // Fast-path: consume one of the headers and continue to the next varint.
+        *out = header;
+      } else {
+        // Slow-path: rollback reader, read large value, and read remaining headers.
+        finished_region_.Resize(finished_region_.size_in_bits() - (kBatch - i) * kVarintHeaderBits);
+        *out = ReadBits((header - kVarintSmallValue) * kBitsPerByte);
+        headers = ReadBits((kBatch - i) * kVarintHeaderBits) << (i * kVarintHeaderBits);
+      }
+    }
+    for (size_t i = kBatch; i < N; i++, out++) {
+      *out = ReadVarint();
     }
-    return values;
   }
 
  private:
@@ -311,26 +320,16 @@ class BitMemoryWriter {
     Allocate(1).StoreBit(/* bit_offset */ 0, value);
   }
 
-  template<size_t N>
-  ALWAYS_INLINE void WriteInterleavedVarints(std::array<uint32_t, N> values) {
-    // Write small values (or the number of bytes needed for the large values).
-    for (uint32_t value : values) {
-      if (value > kVarintMax) {
-        WriteBits(kVarintMax + BitsToBytesRoundUp(MinimumBitsToStore(value)), kVarintBits);
-      } else {
-        WriteBits(value, kVarintBits);
-      }
-    }
-    // Write large values.
-    for (uint32_t value : values) {
-      if (value > kVarintMax) {
-        WriteBits(value, BitsToBytesRoundUp(MinimumBitsToStore(value)) * kBitsPerByte);
-      }
-    }
-  }
-
+  // Write variable-length bit-packed integer.
   ALWAYS_INLINE void WriteVarint(uint32_t value) {
-    WriteInterleavedVarints<1>({value});
+    if (value <= kVarintSmallValue) {
+      WriteBits(value, kVarintHeaderBits);
+    } else {
+      uint32_t num_bits = RoundUp(MinimumBitsToStore(value), kBitsPerByte);
+      uint32_t header = kVarintSmallValue + num_bits / kBitsPerByte;
+      WriteBits(header, kVarintHeaderBits);
+      WriteBits(value, num_bits);
+    }
  }
 
   ALWAYS_INLINE void ByteAlign() {
```
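The varint format restored above is simple: each varint starts with a 4-bit header; headers 0..11 are the value itself, and headers 12..15 mean the value follows in the next 1, 2, 3, or 4 bytes. Below is a minimal, self-contained round-trip sketch of that layout; `BitWriter` and `BitReader` here are illustrative stand-ins for ART's `BitMemoryWriter`/`BitMemoryReader`, not the real classes.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

constexpr uint32_t kVarintHeaderBits = 4;
constexpr uint32_t kVarintSmallValue = 11;  // Values 0..11 fit in the header itself.

// LSB-first bit stream, matching BitMemoryRegion's bit order.
struct BitWriter {
  std::vector<uint8_t> buffer;
  size_t bit_count = 0;

  void WriteBits(uint32_t value, size_t bit_length) {
    for (size_t i = 0; i < bit_length; i++, bit_count++) {
      if (bit_count / 8 == buffer.size()) buffer.push_back(0);
      buffer[bit_count / 8] |= ((value >> i) & 1u) << (bit_count % 8);
    }
  }

  void WriteVarint(uint32_t value) {
    if (value <= kVarintSmallValue) {
      WriteBits(value, kVarintHeaderBits);  // Header 0..11 is the value itself.
    } else {
      uint32_t num_bytes = 1;  // Smallest byte count that can hold the value.
      while ((static_cast<uint64_t>(value) >> (num_bytes * 8)) != 0) num_bytes++;
      WriteBits(kVarintSmallValue + num_bytes, kVarintHeaderBits);  // Header 12..15.
      WriteBits(value, num_bytes * 8);  // Payload in the next 1..4 bytes worth of bits.
    }
  }
};

struct BitReader {
  const std::vector<uint8_t>& buffer;
  size_t bit_count = 0;

  uint32_t ReadBits(size_t bit_length) {
    uint32_t value = 0;
    for (size_t i = 0; i < bit_length; i++, bit_count++) {
      value |= static_cast<uint32_t>((buffer[bit_count / 8] >> (bit_count % 8)) & 1u) << i;
    }
    return value;
  }

  uint32_t ReadVarint() {
    uint32_t header = ReadBits(kVarintHeaderBits);
    return header <= kVarintSmallValue
               ? header
               : ReadBits((header - kVarintSmallValue) * 8);
  }
};

int main() {
  const uint32_t values[] = {0, 11, 12, 255, 65536, 0xffffffffu};
  BitWriter writer;
  for (uint32_t v : values) writer.WriteVarint(v);
  BitReader reader{writer.buffer};
  for (uint32_t v : values) assert(reader.ReadVarint() == v);
  return 0;
}
```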
```diff
diff --git a/libartbase/base/bit_memory_region_test.cc b/libartbase/base/bit_memory_region_test.cc
index bf9c6d62b3..02623bf040 100644
--- a/libartbase/base/bit_memory_region_test.cc
+++ b/libartbase/base/bit_memory_region_test.cc
@@ -43,7 +43,7 @@ TEST(BitMemoryRegion, TestVarint) {
       BitMemoryReader reader(buffer.data(), start_bit_offset);
       uint32_t result = reader.ReadVarint();
-      uint32_t upper_bound = RoundUp(MinimumBitsToStore(value), kBitsPerByte) + kVarintBits;
+      uint32_t upper_bound = RoundUp(MinimumBitsToStore(value), kBitsPerByte) + kVarintHeaderBits;
       EXPECT_EQ(writer.NumberOfWrittenBits(), reader.NumberOfReadBits());
       EXPECT_EQ(value, result);
       EXPECT_GE(upper_bound, writer.NumberOfWrittenBits());
```
```diff
diff --git a/libartbase/base/bit_table.h b/libartbase/base/bit_table.h
index 5ec162d1e8..1984265825 100644
--- a/libartbase/base/bit_table.h
+++ b/libartbase/base/bit_table.h
@@ -49,7 +49,8 @@ class BitTableBase {
 
   ALWAYS_INLINE void Decode(BitMemoryReader& reader) {
     // Decode row count and column sizes from the table header.
-    std::array<uint32_t, 1 + kNumColumns> header = reader.ReadInterleavedVarints<1 + kNumColumns>();
+    uint32_t header[1 + kNumColumns];
+    reader.ReadVarints(header);
     num_rows_ = header[0];
     column_offset_[0] = 0;
     for (uint32_t i = 0; i < kNumColumns; i++) {
@@ -334,7 +335,7 @@ class BitTableBuilderBase {
   }
 
   // Calculate the column bit widths based on the current data.
-  void Measure(/*out*/ uint32_t* column_bits) const {
+  void Measure(/*out*/ std::array<uint32_t, kNumColumns>* column_bits) const {
     uint32_t max_column_value[kNumColumns];
     std::fill_n(max_column_value, kNumColumns, 0);
     for (uint32_t r = 0; r < size(); r++) {
@@ -343,7 +344,7 @@ class BitTableBuilderBase {
       }
     }
     for (uint32_t c = 0; c < kNumColumns; c++) {
-      column_bits[c] = MinimumBitsToStore(max_column_value[c]);
+      (*column_bits)[c] = MinimumBitsToStore(max_column_value[c]);
     }
   }
 
@@ -352,12 +353,14 @@
   void Encode(BitMemoryWriter<Vector>& out) const {
     size_t initial_bit_offset = out.NumberOfWrittenBits();
 
+    std::array<uint32_t, kNumColumns> column_bits;
+    Measure(&column_bits);
+
     // Write table header.
-    std::array<uint32_t, 1 + kNumColumns> header;
-    header[0] = size();
-    uint32_t* column_bits = header.data() + 1;
-    Measure(column_bits);
-    out.WriteInterleavedVarints(header);
+    out.WriteVarint(size());
+    for (uint32_t c = 0; c < kNumColumns; c++) {
+      out.WriteVarint(column_bits[c]);
+    }
 
     // Write table data.
     for (uint32_t r = 0; r < size(); r++) {
@@ -441,10 +444,8 @@ class BitmapTableBuilder {
     size_t initial_bit_offset = out.NumberOfWrittenBits();
 
     // Write table header.
-    out.WriteInterleavedVarints(std::array<uint32_t, 2>{
-        dchecked_integral_cast<uint32_t>(size()),
-        dchecked_integral_cast<uint32_t>(max_num_bits_),
-    });
+    out.WriteVarint(size());
+    out.WriteVarint(max_num_bits_);
 
     // Write table data.
     for (MemoryRegion row : rows_) {
```
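`BitTableBuilderBase::Encode` above now measures the columns first and then writes the row count and each column width as plain consecutive varints. The measurement step is what keeps the tables compact: each column is stored with just enough bits for its largest value. A small sketch of that idea, using C++20 `std::bit_width` as a stand-in for ART's `MinimumBitsToStore` (the free-function shape here is a simplification, not ART's API):

```cpp
#include <algorithm>
#include <array>
#include <bit>
#include <cstdint>
#include <vector>

template <size_t kNumColumns>
std::array<uint32_t, kNumColumns> Measure(
    const std::vector<std::array<uint32_t, kNumColumns>>& rows) {
  std::array<uint32_t, kNumColumns> column_bits{};  // Zero-initialized widths.
  for (const auto& row : rows) {
    for (size_t c = 0; c < kNumColumns; c++) {
      // A column whose largest value is M needs bit_width(M) bits per row.
      column_bits[c] =
          std::max(column_bits[c], static_cast<uint32_t>(std::bit_width(row[c])));
    }
  }
  return column_bits;
}

int main() {
  // Two rows, two columns: the max values are 5 and 300, so the widths are 3 and 9 bits.
  std::vector<std::array<uint32_t, 2>> rows = {{1, 300}, {5, 7}};
  std::array<uint32_t, 2> widths = Measure<2>(rows);
  return (widths[0] == 3 && widths[1] == 9) ? 0 : 1;
}
```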
```diff
diff --git a/runtime/oat.h b/runtime/oat.h
index 54d111cc31..c02ac0b113 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,8 +32,8 @@ class InstructionSetFeatures;
 class PACKED(4) OatHeader {
  public:
   static constexpr std::array<uint8_t, 4> kOatMagic { { 'o', 'a', 't', '\n' } };
-  // Last oat version changed reason: Optimize stack map decoding - interleave varints.
-  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '3', '\0' } };
+  // Last oat version changed reason: Stack maps: Handle special cases using flags.
+  static constexpr std::array<uint8_t, 4> kOatVersion { { '1', '7', '2', '\0' } };
 
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
   static constexpr const char* kDebuggableKey = "debuggable";
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index eebca853c6..2300d1fc0d 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -33,13 +33,14 @@ CodeInfo::CodeInfo(const OatQuickMethodHeader* header, DecodeFlags flags)
 
 void CodeInfo::Decode(const uint8_t* data, DecodeFlags flags) {
   BitMemoryReader reader(data);
-  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
+  uint32_t header[kNumHeaders];
+  reader.ReadVarints(header);
   ForEachHeaderField([this, &header](size_t i, auto member_pointer) {
     this->*member_pointer = header[i];
   });
   ForEachBitTableField([this, &reader](size_t i, auto member_pointer) {
     auto& table = this->*member_pointer;
-    if (LIKELY(HasBitTable(i))) {
+    if (HasBitTable(i)) {
       if (UNLIKELY(IsBitTableDeduped(i))) {
         ssize_t bit_offset = reader.NumberOfReadBits() - reader.ReadVarint();
         BitMemoryReader reader2(reader.data(), bit_offset);  // The offset is negative.
@@ -59,16 +60,12 @@ size_t CodeInfo::Deduper::Dedupe(const uint8_t* code_info_data) {
   writer_.ByteAlign();
   size_t deduped_offset = writer_.NumberOfWrittenBits() / kBitsPerByte;
 
-  // The back-reference offset takes space so dedupe is not worth it for tiny tables.
-  constexpr size_t kMinDedupSize = 32;  // Assume 32-bit offset on average.
-
   // Read the existing code info and find (and keep) dedup-map iterator for each table.
   // The iterator stores BitMemoryRegion and bit_offset of previous identical BitTable.
   BitMemoryReader reader(code_info_data);
   CodeInfo code_info;  // Temporary storage for decoded data.
-  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
-  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
-    code_info.*member_pointer = header[i];
+  ForEachHeaderField([&reader, &code_info](size_t, auto member_pointer) {
+    code_info.*member_pointer = reader.ReadVarint();
   });
   std::map<BitMemoryRegion, uint32_t, BitMemoryRegion::Less>::iterator it[kNumBitTables];
   ForEachBitTableField([this, &reader, &code_info, &it](size_t i, auto member_pointer) {
@@ -78,17 +75,16 @@ size_t CodeInfo::Deduper::Dedupe(const uint8_t* code_info_data) {
       (code_info.*member_pointer).Decode(reader);
       BitMemoryRegion region = reader.GetReadRegion().Subregion(bit_table_start);
       it[i] = dedupe_map_.emplace(region, /* default bit_offset */ 0).first;
-      if (it[i]->second != 0 && region.size_in_bits() > kMinDedupSize) {  // Seen before and large?
+      if (it[i]->second != 0 && region.size_in_bits() > 32) {  // Seen before and large?
         code_info.SetBitTableDeduped(i);  // Mark as deduped before we write header.
       }
     }
   });
 
   // Write the code info back, but replace deduped tables with relative offsets.
-  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
-    header[i] = code_info.*member_pointer;
+  ForEachHeaderField([this, &code_info](size_t, auto member_pointer) {
+    writer_.WriteVarint(code_info.*member_pointer);
   });
-  writer_.WriteInterleavedVarints(header);
   ForEachBitTableField([this, &code_info, &it](size_t i, auto) {
     if (code_info.HasBitTable(i)) {
       uint32_t& bit_offset = it[i]->second;
@@ -110,8 +106,7 @@ size_t CodeInfo::Deduper::Dedupe(const uint8_t* code_info_data) {
       DCHECK_EQ(old_code_info.*member_pointer, new_code_info.*member_pointer);
     }
   });
-  ForEachBitTableField([&old_code_info, &new_code_info](size_t i, auto member_pointer) {
-    DCHECK_EQ(old_code_info.HasBitTable(i), new_code_info.HasBitTable(i));
+  ForEachBitTableField([&old_code_info, &new_code_info](size_t, auto member_pointer) {
     DCHECK((old_code_info.*member_pointer).Equals(new_code_info.*member_pointer));
   });
 }
@@ -208,9 +203,8 @@ void CodeInfo::CollectSizeStats(const uint8_t* code_info_data, /*out*/ Stats* pa
   Stats* codeinfo_stats = parent->Child("CodeInfo");
   BitMemoryReader reader(code_info_data);
   CodeInfo code_info;  // Temporary storage for decoded tables.
-  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
-  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
-    code_info.*member_pointer = header[i];
+  ForEachHeaderField([&reader, &code_info](size_t, auto member_pointer) {
+    code_info.*member_pointer = reader.ReadVarint();
   });
   codeinfo_stats->Child("Header")->AddBits(reader.NumberOfReadBits());
   ForEachBitTableField([codeinfo_stats, &reader, &code_info](size_t i, auto member_pointer) {
```
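The `Dedupe` pass above replaces a bit table that was already emitted with a varint back-reference to the first copy, but only when the table is large enough (more than 32 bits) to pay for the offset. A toy model of that decision, with byte strings standing in for bit regions (a hypothetical simplification, not ART's API):

```cpp
#include <cstddef>
#include <map>
#include <string>
#include <vector>

struct Entry {
  bool deduped;      // True: 'offset' points back at the first occurrence.
  std::string span;  // Payload, kept only when not deduped.
  size_t offset;     // Index of the first occurrence, used when deduped.
};

std::vector<Entry> Dedupe(const std::vector<std::string>& spans) {
  std::map<std::string, size_t> first_seen;  // span -> index of its first copy.
  std::vector<Entry> out;
  for (const std::string& span : spans) {
    auto [it, inserted] = first_seen.emplace(span, out.size());
    // Dedupe only spans seen before that are big enough to pay for the
    // back-reference (the real code requires more than 32 bits).
    if (!inserted && span.size() > 4) {
      out.push_back({/*deduped=*/true, "", it->second});
    } else {
      out.push_back({/*deduped=*/false, span, 0});
    }
  }
  return out;
}
```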
```diff
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index b438074d6d..c088eb6f80 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -443,7 +443,8 @@ class CodeInfo {
 
   ALWAYS_INLINE static QuickMethodFrameInfo DecodeFrameInfo(const uint8_t* code_info_data) {
     BitMemoryReader reader(code_info_data);
-    std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
+    uint32_t header[4];  // flags, packed_frame_size, core_spill_mask, fp_spill_mask.
+    reader.ReadVarints(header);
     return QuickMethodFrameInfo(header[1] * kStackAlignment, header[2], header[3]);
   }
```
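For contrast, the interleaved layout that this commit reverts away from wrote all the 4-bit headers first and the payloads of any large values afterwards, so a decoder could fetch every header in a single bit read. Both layouts occupy exactly the same number of bits; only the number of bit reads differs. A sketch of that encoder, reusing `BitWriter`, `kVarintHeaderBits`, and `kVarintSmallValue` from the round-trip sketch earlier (illustrative, not ART's `WriteInterleavedVarints`):

```cpp
#include <cstdint>
#include <vector>

uint32_t NumBytes(uint32_t value) {  // Smallest byte count that can hold value.
  uint32_t num_bytes = 1;
  while ((static_cast<uint64_t>(value) >> (num_bytes * 8)) != 0) num_bytes++;
  return num_bytes;
}

void WriteInterleavedVarints(BitWriter& out, const std::vector<uint32_t>& values) {
  // Pass 1: every 4-bit header up front (small values inline, large values
  // as a byte count), so all N headers can be fetched with one bit read.
  for (uint32_t value : values) {
    out.WriteBits(value <= kVarintSmallValue ? value
                                             : kVarintSmallValue + NumBytes(value),
                  kVarintHeaderBits);
  }
  // Pass 2: payloads of the large values only, in the same order.
  for (uint32_t value : values) {
    if (value > kVarintSmallValue) {
      out.WriteBits(value, NumBytes(value) * 8);
    }
  }
}
```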