Revert^2 "Faster deduplication of `CodeInfo` tables."

This reverts commit 8c7f649fff75ba98392931157292f06f7930f2b6.

Reason for revert: Add hwasan exclusion annotation to BitMemoryRegion::VisitChunks().

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Bug: 181943478
Change-Id: Ifc4dec165a2977a08654d7ae094fe1aa8a5bbbe5
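
For reference, a minimal usage sketch of the relocated deduper, mirroring the new test in
dex2oat/linker/code_info_table_deduper_test.cc; `memory` is assumed to hold an encoded
`CodeInfo` blob, e.g. produced by StackMapStream::Encode():

  std::vector<uint8_t> out;
  linker::CodeInfoTableDeduper deduper(&out);
  // Deduping the same CodeInfo twice shares its large bit tables within `out`.
  size_t offset1 = deduper.Dedupe(memory.data());
  size_t offset2 = deduper.Dedupe(memory.data());
  // CodeInfo reads transparently through the deduped (back-referenced) tables.
  CodeInfo code_info(out.data() + offset2);
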
diff --git a/compiler/optimizing/stack_map_test.cc b/compiler/optimizing/stack_map_test.cc
index e83d37e..f6a739e 100644
--- a/compiler/optimizing/stack_map_test.cc
+++ b/compiler/optimizing/stack_map_test.cc
@@ -724,46 +724,4 @@
             stack_map2.GetStackMaskIndex());
 }
 
-TEST(StackMapTest, TestDedupeBitTables) {
-  MallocArenaPool pool;
-  ArenaStack arena_stack(&pool);
-  ScopedArenaAllocator allocator(&arena_stack);
-  StackMapStream stream(&allocator, kRuntimeISA);
-  stream.BeginMethod(32, 0, 0, 2);
-
-  stream.BeginStackMapEntry(0, 64 * kPcAlign);
-  stream.AddDexRegisterEntry(Kind::kInStack, 0);
-  stream.AddDexRegisterEntry(Kind::kConstant, -2);
-  stream.EndStackMapEntry();
-
-  stream.EndMethod(64 * kPcAlign);
-  ScopedArenaVector<uint8_t> memory = stream.Encode();
-
-  std::vector<uint8_t> out;
-  CodeInfo::Deduper deduper(&out);
-  size_t deduped1 = deduper.Dedupe(memory.data());
-  size_t deduped2 = deduper.Dedupe(memory.data());
-
-  for (size_t deduped : { deduped1, deduped2 }) {
-    CodeInfo code_info(out.data() + deduped);
-    ASSERT_EQ(1u, code_info.GetNumberOfStackMaps());
-
-    StackMap stack_map = code_info.GetStackMapAt(0);
-    ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForDexPc(0)));
-    ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(64 * kPcAlign)));
-    ASSERT_EQ(0u, stack_map.GetDexPc());
-    ASSERT_EQ(64u * kPcAlign, stack_map.GetNativePcOffset(kRuntimeISA));
-
-    ASSERT_TRUE(stack_map.HasDexRegisterMap());
-    DexRegisterMap dex_register_map = code_info.GetDexRegisterMapOf(stack_map);
-
-    ASSERT_EQ(Kind::kInStack, dex_register_map[0].GetKind());
-    ASSERT_EQ(Kind::kConstant, dex_register_map[1].GetKind());
-    ASSERT_EQ(0, dex_register_map[0].GetStackOffsetInBytes());
-    ASSERT_EQ(-2, dex_register_map[1].GetConstant());
-  }
-
-  ASSERT_GT(memory.size() * 2, out.size());
-}
-
 }  // namespace art
diff --git a/dex2oat/Android.bp b/dex2oat/Android.bp
index a89c62f..6d94767 100644
--- a/dex2oat/Android.bp
+++ b/dex2oat/Android.bp
@@ -30,6 +30,7 @@
     srcs: [
         "dex/quick_compiler_callbacks.cc",
         "driver/compiler_driver.cc",
+        "linker/code_info_table_deduper.cc",
         "linker/elf_writer.cc",
         "linker/elf_writer_quick.cc",
         "linker/image_writer.cc",
@@ -494,6 +495,7 @@
         "dex2oat_vdex_test.cc",
         "dex2oat_image_test.cc",
         "driver/compiler_driver_test.cc",
+        "linker/code_info_table_deduper_test.cc",
         "linker/elf_writer_test.cc",
         "linker/image_test.cc",
         "linker/image_write_read_test.cc",
diff --git a/dex2oat/linker/code_info_table_deduper.cc b/dex2oat/linker/code_info_table_deduper.cc
new file mode 100644
index 0000000..5ffe004
--- /dev/null
+++ b/dex2oat/linker/code_info_table_deduper.cc
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "code_info_table_deduper.h"
+
+#include "stack_map.h"
+
+namespace art {
+namespace linker {
+
+size_t CodeInfoTableDeduper::Dedupe(const uint8_t* code_info_data) {
+  static constexpr size_t kNumHeaders = CodeInfo::kNumHeaders;
+  static constexpr size_t kNumBitTables = CodeInfo::kNumBitTables;
+
+  // The back-reference offset takes space so dedupe is not worth it for tiny tables.
+  constexpr size_t kMinDedupSize = 33;  // Assume 32-bit offset on average.
+
+  size_t start_bit_offset = writer_.NumberOfWrittenBits();
+  DCHECK_ALIGNED(start_bit_offset, kBitsPerByte);
+
+  // Reserve enough space in the `dedupe_set_` to avoid rehashing later in this
+  // function and allow using direct pointers to the `HashSet<>` entries.
+  size_t elements_until_expand = dedupe_set_.ElementsUntilExpand();
+  if (UNLIKELY(elements_until_expand - dedupe_set_.size() < kNumBitTables)) {
+    // When resizing, try to make the load factor close to the minimum load factor.
+    size_t required_capacity = dedupe_set_.size() + kNumBitTables;
+    double factor = dedupe_set_.GetMaxLoadFactor() / dedupe_set_.GetMinLoadFactor();
+    size_t reservation = required_capacity * factor;
+    DCHECK_GE(reservation, required_capacity);
+    dedupe_set_.reserve(reservation);
+    elements_until_expand = dedupe_set_.ElementsUntilExpand();
+    DCHECK_GE(elements_until_expand - dedupe_set_.size(), kNumBitTables);
+  }
+
+  // Read the existing code info and record bit table starts and end.
+  BitMemoryReader reader(code_info_data);
+  std::array<uint32_t, kNumHeaders> header = reader.ReadInterleavedVarints<kNumHeaders>();
+  CodeInfo code_info;
+  CodeInfo::ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
+    code_info.*member_pointer = header[i];
+  });
+  DCHECK(!code_info.HasDedupedBitTables());  // Input `CodeInfo` has no deduped tables.
+  std::array<uint32_t, kNumBitTables + 1u> bit_table_bit_starts;
+  CodeInfo::ForEachBitTableField([&](size_t i, auto member_pointer) {
+    bit_table_bit_starts[i] = dchecked_integral_cast<uint32_t>(reader.NumberOfReadBits());
+    DCHECK(!code_info.IsBitTableDeduped(i));
+    if (LIKELY(code_info.HasBitTable(i))) {
+      auto& table = code_info.*member_pointer;
+      table.Decode(reader);
+    }
+  });
+  bit_table_bit_starts[kNumBitTables] = dchecked_integral_cast<uint32_t>(reader.NumberOfReadBits());
+
+  // Copy the source data.
+  BitMemoryRegion read_region = reader.GetReadRegion();
+  writer_.WriteBytesAligned(code_info_data, BitsToBytesRoundUp(read_region.size_in_bits()));
+
+  // Insert entries for large tables into the `dedupe_set_` and check for duplicates.
+  std::array<DedupeSetEntry*, kNumBitTables> dedupe_entries;
+  std::fill(dedupe_entries.begin(), dedupe_entries.end(), nullptr);
+  CodeInfo::ForEachBitTableField([&](size_t i, auto member_pointer ATTRIBUTE_UNUSED) {
+    if (LIKELY(code_info.HasBitTable(i))) {
+      uint32_t table_bit_size = bit_table_bit_starts[i + 1u] - bit_table_bit_starts[i];
+      if (table_bit_size >= kMinDedupSize) {
+        uint32_t table_bit_start = start_bit_offset + bit_table_bit_starts[i];
+        BitMemoryRegion region(
+            const_cast<uint8_t*>(writer_.data()), table_bit_start, table_bit_size);
+        uint32_t hash = DataHash()(region);
+        DedupeSetEntry entry{table_bit_start, table_bit_size, hash};
+        auto [it, inserted] = dedupe_set_.InsertWithHash(entry, hash);
+        dedupe_entries[i] = &*it;
+        if (!inserted) {
+          code_info.SetBitTableDeduped(i);  // Mark as deduped before we write header.
+        }
+      }
+    }
+  });
+  DCHECK_EQ(elements_until_expand, dedupe_set_.ElementsUntilExpand()) << "Unexpected resizing!";
+
+  if (code_info.HasDedupedBitTables()) {
+    // Reset the writer to the original position. This makes new entries in the
+    // `dedupe_set_` effectively point to non-existent data. We shall write the
+    // new data again at the correct position and update these entries.
+    writer_.Truncate(start_bit_offset);
+    // Update bit table flags in the `header` and write the `header`.
+    header[kNumHeaders - 1u] = code_info.bit_table_flags_;
+    CodeInfo::ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
+      DCHECK_EQ(code_info.*member_pointer, header[i]);
+    });
+    writer_.WriteInterleavedVarints(header);
+    // Write bit tables and update offsets in `dedupe_set_` after encoding the `header`.
+    CodeInfo::ForEachBitTableField([&](size_t i, auto member_pointer ATTRIBUTE_UNUSED) {
+      if (code_info.HasBitTable(i)) {
+        size_t current_bit_offset = writer_.NumberOfWrittenBits();
+        if (code_info.IsBitTableDeduped(i)) {
+          DCHECK_GE(bit_table_bit_starts[i + 1u] - bit_table_bit_starts[i], kMinDedupSize);
+          DCHECK(dedupe_entries[i] != nullptr);
+          size_t deduped_offset = dedupe_entries[i]->bit_start;
+          writer_.WriteVarint(current_bit_offset - deduped_offset);
+        } else {
+          uint32_t table_bit_size = bit_table_bit_starts[i + 1u] - bit_table_bit_starts[i];
+          writer_.WriteRegion(read_region.Subregion(bit_table_bit_starts[i], table_bit_size));
+          if (table_bit_size >= kMinDedupSize) {
+            // Update offset in the `dedupe_set_` entry.
+            DCHECK(dedupe_entries[i] != nullptr);
+            dedupe_entries[i]->bit_start = current_bit_offset;
+          }
+        }
+      }
+    });
+    writer_.ByteAlign();
+  }  // else nothing to do - we already copied the data.
+
+  if (kIsDebugBuild) {
+    CodeInfo old_code_info(code_info_data);
+    CodeInfo new_code_info(writer_.data() + start_bit_offset / kBitsPerByte);
+    CodeInfo::ForEachHeaderField([&old_code_info, &new_code_info](size_t, auto member_pointer) {
+      if (member_pointer != &CodeInfo::bit_table_flags_) {  // Expected to differ.
+        DCHECK_EQ(old_code_info.*member_pointer, new_code_info.*member_pointer);
+      }
+    });
+    CodeInfo::ForEachBitTableField([&old_code_info, &new_code_info](size_t i, auto member_pointer) {
+      DCHECK_EQ(old_code_info.HasBitTable(i), new_code_info.HasBitTable(i));
+      DCHECK((old_code_info.*member_pointer).Equals(new_code_info.*member_pointer));
+    });
+  }
+
+  return start_bit_offset / kBitsPerByte;
+}
+
+}  // namespace linker
+}  // namespace art
diff --git a/dex2oat/linker/code_info_table_deduper.h b/dex2oat/linker/code_info_table_deduper.h
new file mode 100644
index 0000000..682aba3
--- /dev/null
+++ b/dex2oat/linker/code_info_table_deduper.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_DEX2OAT_LINKER_CODE_INFO_TABLE_DEDUPER_H_
+#define ART_DEX2OAT_LINKER_CODE_INFO_TABLE_DEDUPER_H_
+
+#include <vector>
+
+#include "base/bit_memory_region.h"
+#include "base/hash_set.h"
+
+namespace art {
+namespace linker {
+
+class CodeInfoTableDeduper {
+ public:
+  explicit CodeInfoTableDeduper(std::vector<uint8_t>* output)
+      : writer_(output),
+        dedupe_set_(DedupeSetEntryHash(), DedupeSetEntryEquals(output)) {
+    DCHECK_EQ(output->size(), 0u);
+  }
+
+  // Copy CodeInfo into output while de-duplicating the internal bit tables.
+  // It returns the byte offset of the copied CodeInfo within the output.
+  size_t Dedupe(const uint8_t* code_info);
+
+ private:
+  struct DedupeSetEntry {
+    uint32_t bit_start;
+    uint32_t bit_size;
+    uint32_t hash;
+  };
+
+  class DedupeSetEntryEmpty {
+   public:
+    void MakeEmpty(DedupeSetEntry& item) const {
+      item = {0u, 0u, 0u};
+    }
+    bool IsEmpty(const DedupeSetEntry& item) const {
+      return item.bit_size == 0u;
+    }
+  };
+
+  class DedupeSetEntryHash {
+   public:
+    uint32_t operator()(const DedupeSetEntry& item) const {
+      return item.hash;
+    }
+  };
+
+  class DedupeSetEntryEquals {
+   public:
+    explicit DedupeSetEntryEquals(std::vector<uint8_t>* output) : output_(output) {}
+
+    bool operator()(const DedupeSetEntry& lhs, const DedupeSetEntry& rhs) const {
+      DCHECK_NE(lhs.bit_size, 0u);
+      DCHECK_NE(rhs.bit_size, 0u);
+      return lhs.hash == rhs.hash &&
+             lhs.bit_size == rhs.bit_size &&
+             BitMemoryRegion::Equals(
+                 BitMemoryRegion(output_->data(), lhs.bit_start, lhs.bit_size),
+                 BitMemoryRegion(output_->data(), rhs.bit_start, rhs.bit_size));
+    }
+
+   private:
+    std::vector<uint8_t>* const output_;
+  };
+
+  using DedupeSet =
+      HashSet<DedupeSetEntry, DedupeSetEntryEmpty, DedupeSetEntryHash, DedupeSetEntryEquals>;
+
+  BitMemoryWriter<std::vector<uint8_t>> writer_;
+
+  // Deduplicate at BitTable level. Entries describe ranges in `output`, see constructor.
+  DedupeSet dedupe_set_;
+};
+
+}  // namespace linker
+}  // namespace art
+
+#endif  // ART_DEX2OAT_LINKER_CODE_INFO_TABLE_DEDUPER_H_
diff --git a/dex2oat/linker/code_info_table_deduper_test.cc b/dex2oat/linker/code_info_table_deduper_test.cc
new file mode 100644
index 0000000..8913b07
--- /dev/null
+++ b/dex2oat/linker/code_info_table_deduper_test.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "code_info_table_deduper.h"
+
+#include "arch/instruction_set.h"
+#include "base/malloc_arena_pool.h"
+#include "base/scoped_arena_allocator.h"
+#include "base/scoped_arena_containers.h"
+#include "optimizing/stack_map_stream.h"
+
+namespace art {
+namespace linker {
+
+TEST(StackMapTest, TestDedupeBitTables) {
+  constexpr static uint32_t kPcAlign = GetInstructionSetInstructionAlignment(kRuntimeISA);
+  using Kind = DexRegisterLocation::Kind;
+
+  MallocArenaPool pool;
+  ArenaStack arena_stack(&pool);
+  ScopedArenaAllocator allocator(&arena_stack);
+  StackMapStream stream(&allocator, kRuntimeISA);
+  stream.BeginMethod(32, 0, 0, 2);
+
+  stream.BeginStackMapEntry(0, 64 * kPcAlign);
+  stream.AddDexRegisterEntry(Kind::kInStack, 0);
+  stream.AddDexRegisterEntry(Kind::kConstant, -2);
+  stream.EndStackMapEntry();
+
+  stream.EndMethod(64 * kPcAlign);
+  ScopedArenaVector<uint8_t> memory = stream.Encode();
+
+  std::vector<uint8_t> out;
+  CodeInfoTableDeduper deduper(&out);
+  size_t deduped1 = deduper.Dedupe(memory.data());
+  size_t deduped2 = deduper.Dedupe(memory.data());
+
+  for (size_t deduped : { deduped1, deduped2 }) {
+    CodeInfo code_info(out.data() + deduped);
+    ASSERT_EQ(1u, code_info.GetNumberOfStackMaps());
+
+    StackMap stack_map = code_info.GetStackMapAt(0);
+    ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForDexPc(0)));
+    ASSERT_TRUE(stack_map.Equals(code_info.GetStackMapForNativePcOffset(64 * kPcAlign)));
+    ASSERT_EQ(0u, stack_map.GetDexPc());
+    ASSERT_EQ(64u * kPcAlign, stack_map.GetNativePcOffset(kRuntimeISA));
+
+    ASSERT_TRUE(stack_map.HasDexRegisterMap());
+    DexRegisterMap dex_register_map = code_info.GetDexRegisterMapOf(stack_map);
+
+    ASSERT_EQ(Kind::kInStack, dex_register_map[0].GetKind());
+    ASSERT_EQ(Kind::kConstant, dex_register_map[1].GetKind());
+    ASSERT_EQ(0, dex_register_map[0].GetStackOffsetInBytes());
+    ASSERT_EQ(-2, dex_register_map[1].GetConstant());
+  }
+
+  ASSERT_GT(memory.size() * 2, out.size());
+}
+
+}  // namespace linker
+}  // namespace art
diff --git a/dex2oat/linker/oat_writer.cc b/dex2oat/linker/oat_writer.cc
index 55cee6d..428ec81 100644
--- a/dex2oat/linker/oat_writer.cc
+++ b/dex2oat/linker/oat_writer.cc
@@ -36,6 +36,7 @@
 #include "base/zip_archive.h"
 #include "class_linker.h"
 #include "class_table-inl.h"
+#include "code_info_table_deduper.h"
 #include "compiled_method-inl.h"
 #include "debug/method_debug_info.h"
 #include "dex/art_dex_file_loader.h"
@@ -1551,7 +1552,7 @@
   SafeMap<const uint8_t*, size_t> dedupe_code_info_;
 
   // Deduplicate at BitTable level.
-  CodeInfo::Deduper dedupe_bit_table_;
+  CodeInfoTableDeduper dedupe_bit_table_;
 };
 
 class OatWriter::InitImageMethodVisitor final : public OatDexMethodVisitor {
diff --git a/dexlayout/compact_dex_writer.h b/dexlayout/compact_dex_writer.h
index d78f8be..d7a3f84 100644
--- a/dexlayout/compact_dex_writer.h
+++ b/dexlayout/compact_dex_writer.h
@@ -72,7 +72,7 @@
         // Hash function.
         size_t operator()(const HashedMemoryRange& range) const {
           DCHECK_LE(range.offset_ + range.length_, section_->Size());
-          return HashBytes(Data() + range.offset_, range.length_);
+          return DataHash::HashBytes(Data() + range.offset_, range.length_);
         }
 
         ALWAYS_INLINE uint8_t* Data() const {
diff --git a/libartbase/base/bit_memory_region.h b/libartbase/base/bit_memory_region.h
index 6168488..cca4217 100644
--- a/libartbase/base/bit_memory_region.h
+++ b/libartbase/base/bit_memory_region.h
@@ -30,12 +30,6 @@
 // abstracting away the bit start offset to avoid needing passing as an argument everywhere.
 class BitMemoryRegion final : public ValueObject {
  public:
-  struct Less {
-    bool operator()(const BitMemoryRegion& lhs, const BitMemoryRegion& rhs) const {
-      return Compare(lhs, rhs) < 0;
-    }
-  };
-
   BitMemoryRegion() = default;
   ALWAYS_INLINE BitMemoryRegion(uint8_t* data, ssize_t bit_start, size_t bit_size) {
     // Normalize the data pointer. Note that bit_start may be negative.
@@ -136,17 +130,17 @@
 
   // Store `bit_length` bits in `data` starting at given `bit_offset`.
   // The least significant bit is stored in the smallest memory offset.
-  ALWAYS_INLINE void StoreBits(size_t bit_offset, uint32_t value, size_t bit_length) {
+  ALWAYS_INLINE void StoreBits(size_t bit_offset, size_t value, size_t bit_length) {
     DCHECK_LE(bit_offset, bit_size_);
     DCHECK_LE(bit_length, bit_size_ - bit_offset);
-    DCHECK_LE(bit_length, BitSizeOf<uint32_t>());
-    DCHECK_LE(value, MaxInt<uint32_t>(bit_length));
+    DCHECK_LE(bit_length, BitSizeOf<size_t>());
+    DCHECK_LE(value, MaxInt<size_t>(bit_length));
     if (bit_length == 0) {
       return;
     }
     // Write data byte by byte to avoid races with other threads
     // on bytes that do not overlap with this region.
-    uint32_t mask = std::numeric_limits<uint32_t>::max() >> (BitSizeOf<uint32_t>() - bit_length);
+    size_t mask = std::numeric_limits<size_t>::max() >> (BitSizeOf<size_t>() - bit_length);
     size_t index = (bit_start_ + bit_offset) / kBitsPerByte;
     size_t shift = (bit_start_ + bit_offset) % kBitsPerByte;
     data_[index] &= ~(mask << shift);  // Clear bits.
@@ -159,91 +153,172 @@
     DCHECK_EQ(value, LoadBits(bit_offset, bit_length));
   }
 
-  // Store bits from other bit region.
-  ALWAYS_INLINE void StoreBits(size_t bit_offset, const BitMemoryRegion& src, size_t bit_length) {
-    DCHECK_LE(bit_offset, bit_size_);
-    DCHECK_LE(bit_length, bit_size_ - bit_offset);
-    size_t bit = 0;
-    constexpr size_t kNumBits = BitSizeOf<uint32_t>();
-    for (; bit + kNumBits <= bit_length; bit += kNumBits) {
-      StoreBits(bit_offset + bit, src.LoadBits(bit, kNumBits), kNumBits);
-    }
-    size_t num_bits = bit_length - bit;
-    StoreBits(bit_offset + bit, src.LoadBits(bit, num_bits), num_bits);
+  // Copy bits from other bit region.
+  ALWAYS_INLINE void CopyBits(const BitMemoryRegion& src) {
+    DCHECK_EQ(size_in_bits(), src.size_in_bits());
+    // Hopefully, the loads of the unused `value` shall be optimized away.
+    VisitChunks(
+        [this, &src](size_t offset, size_t num_bits, size_t value ATTRIBUTE_UNUSED) ALWAYS_INLINE {
+          StoreChunk(offset, src.LoadBits(offset, num_bits), num_bits);
+          return true;
+        });
+  }
+
+  // And bits from other bit region.
+  ALWAYS_INLINE void AndBits(const BitMemoryRegion& src) {
+    DCHECK_EQ(size_in_bits(), src.size_in_bits());
+    VisitChunks([this, &src](size_t offset, size_t num_bits, size_t value) ALWAYS_INLINE {
+      StoreChunk(offset, value & src.LoadBits(offset, num_bits), num_bits);
+      return true;
+    });
   }
 
   // Or bits from other bit region.
-  ALWAYS_INLINE void OrBits(size_t bit_offset, const BitMemoryRegion& src, size_t bit_length) {
-    // TODO: Load `size_t` chunks (instead of `uint32_t`) from aligned
-    // addresses except for the leading and trailing bits. Refactor to
-    // share code with StoreBits() and maybe other functions.
-    DCHECK_LE(bit_offset, bit_size_);
-    DCHECK_LE(bit_length, bit_size_ - bit_offset);
-    size_t bit = 0;
-    constexpr size_t kNumBits = BitSizeOf<uint32_t>();
-    for (; bit + kNumBits <= bit_length; bit += kNumBits) {
-      size_t old_bits = LoadBits(bit_offset + bit, kNumBits);
-      StoreBits(bit_offset + bit, old_bits | src.LoadBits(bit, kNumBits), kNumBits);
-    }
-    size_t num_bits = bit_length - bit;
-    size_t old_bits = LoadBits(bit_offset + bit, num_bits);
-    StoreBits(bit_offset + bit, old_bits | src.LoadBits(bit, num_bits), num_bits);
+  ALWAYS_INLINE void OrBits(const BitMemoryRegion& src) {
+    DCHECK_EQ(size_in_bits(), src.size_in_bits());
+    VisitChunks([this, &src](size_t offset, size_t num_bits, size_t value) ALWAYS_INLINE {
+      StoreChunk(offset, value | src.LoadBits(offset, num_bits), num_bits);
+      return true;
+    });
+  }
+
+  // Xor bits from other bit region.
+  ALWAYS_INLINE void XorBits(const BitMemoryRegion& src) {
+    DCHECK_EQ(size_in_bits(), src.size_in_bits());
+    VisitChunks([this, &src](size_t offset, size_t num_bits, size_t value) ALWAYS_INLINE {
+      StoreChunk(offset, value ^ src.LoadBits(offset, num_bits), num_bits);
+      return true;
+    });
+  }
+
+  // Count the number of set bits within this region.
+  ALWAYS_INLINE size_t PopCount() const {
+    size_t result = 0u;
+    VisitChunks([&](size_t offset ATTRIBUTE_UNUSED,
+                    size_t num_bits ATTRIBUTE_UNUSED,
+                    size_t value) ALWAYS_INLINE {
+                      result += POPCOUNT(value);
+                      return true;
+                    });
+    return result;
   }
 
   // Count the number of set bits within the given bit range.
   ALWAYS_INLINE size_t PopCount(size_t bit_offset, size_t bit_length) const {
-    DCHECK_LE(bit_offset, bit_size_);
-    DCHECK_LE(bit_length, bit_size_ - bit_offset);
-    size_t count = 0;
-    size_t bit = 0;
-    constexpr size_t kNumBits = BitSizeOf<uint32_t>();
-    for (; bit + kNumBits <= bit_length; bit += kNumBits) {
-      count += POPCOUNT(LoadBits(bit_offset + bit, kNumBits));
-    }
-    count += POPCOUNT(LoadBits(bit_offset + bit, bit_length - bit));
-    return count;
+    return Subregion(bit_offset, bit_length).PopCount();
+  }
+
+  // Check if this region has all bits clear.
+  ALWAYS_INLINE bool HasAllBitsClear() const {
+    return VisitChunks([](size_t offset ATTRIBUTE_UNUSED,
+                          size_t num_bits ATTRIBUTE_UNUSED,
+                          size_t value) ALWAYS_INLINE {
+                            return value == 0u;
+                          });
+  }
+
+  // Check if this region has any bit set.
+  ALWAYS_INLINE bool HasSomeBitSet() const {
+    return !HasAllBitsClear();
   }
 
   // Check if there is any bit set within the given bit range.
   ALWAYS_INLINE bool HasSomeBitSet(size_t bit_offset, size_t bit_length) const {
-    // TODO: Load `size_t` chunks (instead of `uint32_t`) from aligned
-    // addresses except for the leading and trailing bits. Refactor to
-    // share code with PopCount() and maybe also Compare().
-    DCHECK_LE(bit_offset, bit_size_);
-    DCHECK_LE(bit_length, bit_size_ - bit_offset);
-    size_t bit = 0;
-    constexpr size_t kNumBits = BitSizeOf<uint32_t>();
-    for (; bit + kNumBits <= bit_length; bit += kNumBits) {
-      if (LoadBits(bit_offset + bit, kNumBits) != 0u) {
-        return true;
-      }
-    }
-    return LoadBits(bit_offset + bit, bit_length - bit) != 0u;
+    return Subregion(bit_offset, bit_length).HasSomeBitSet();
   }
 
   static int Compare(const BitMemoryRegion& lhs, const BitMemoryRegion& rhs) {
     if (lhs.size_in_bits() != rhs.size_in_bits()) {
       return (lhs.size_in_bits() < rhs.size_in_bits()) ? -1 : 1;
     }
-    size_t bit = 0;
-    constexpr size_t kNumBits = BitSizeOf<uint32_t>();
-    for (; bit + kNumBits <= lhs.size_in_bits(); bit += kNumBits) {
-      uint32_t lhs_bits = lhs.LoadBits(bit, kNumBits);
-      uint32_t rhs_bits = rhs.LoadBits(bit, kNumBits);
-      if (lhs_bits != rhs_bits) {
-        return (lhs_bits < rhs_bits) ? -1 : 1;
-      }
+    int result = 0;
+    bool equals = lhs.VisitChunks(
+        [&](size_t offset, size_t num_bits, size_t lhs_value) ALWAYS_INLINE {
+          size_t rhs_value = rhs.LoadBits(offset, num_bits);
+          if (lhs_value == rhs_value) {
+            return true;
+          }
+          // We have found a difference. To avoid the comparison being dependent on how the region
+          // is split into chunks, check the lowest bit that differs. (Android is little-endian.)
+          int bit = CTZ(lhs_value ^ rhs_value);
+          result = ((rhs_value >> bit) & 1u) != 0u ? 1 : -1;
+          return false;  // Stop iterating.
+        });
+    DCHECK_EQ(equals, result == 0);
+    return result;
+  }
+
+  static bool Equals(const BitMemoryRegion& lhs, const BitMemoryRegion& rhs) {
+    if (lhs.size_in_bits() != rhs.size_in_bits()) {
+      return false;
     }
-    size_t num_bits = lhs.size_in_bits() - bit;
-    uint32_t lhs_bits = lhs.LoadBits(bit, num_bits);
-    uint32_t rhs_bits = rhs.LoadBits(bit, num_bits);
-    if (lhs_bits != rhs_bits) {
-      return (lhs_bits < rhs_bits) ? -1 : 1;
-    }
-    return 0;
+    return lhs.VisitChunks([&rhs](size_t offset, size_t num_bits, size_t lhs_value) ALWAYS_INLINE {
+      return lhs_value == rhs.LoadBits(offset, num_bits);
+    });
   }
 
  private:
+  // Visit the region in aligned `size_t` chunks. The first and last chunk may have fewer bits.
+  //
+  // Returns `true` if the iteration visited all chunks successfully, i.e. none of the
+  // calls to `visitor(offset, num_bits, value)` returned `false`; otherwise `false`.
+  template <typename VisitorType>
+  ATTRIBUTE_NO_SANITIZE_ADDRESS  // We might touch extra bytes due to the alignment.
+  ATTRIBUTE_NO_SANITIZE_HWADDRESS  // Hwasan uses a different attribute.
+  ALWAYS_INLINE bool VisitChunks(VisitorType&& visitor) const {
+    constexpr size_t kChunkSize = BitSizeOf<size_t>();
+    size_t remaining_bits = bit_size_;
+    if (remaining_bits == 0) {
+      return true;
+    }
+    DCHECK(IsAligned<sizeof(size_t)>(data_));
+    const size_t* data = reinterpret_cast<const size_t*>(data_);
+    size_t offset = 0u;
+    size_t bit_start = bit_start_;
+    data += bit_start / kChunkSize;
+    if ((bit_start % kChunkSize) != 0u) {
+      size_t leading_bits = kChunkSize - (bit_start % kChunkSize);
+      size_t value = (*data) >> (bit_start % kChunkSize);
+      if (leading_bits > remaining_bits) {
+        leading_bits = remaining_bits;
+        value = value & ~(std::numeric_limits<size_t>::max() << remaining_bits);
+      }
+      if (!visitor(offset, leading_bits, value)) {
+        return false;
+      }
+      offset += leading_bits;
+      remaining_bits -= leading_bits;
+      ++data;
+    }
+    while (remaining_bits >= kChunkSize) {
+      size_t value = *data;
+      if (!visitor(offset, kChunkSize, value)) {
+        return false;
+      }
+      offset += kChunkSize;
+      remaining_bits -= kChunkSize;
+      ++data;
+    }
+    if (remaining_bits != 0u) {
+      size_t value = (*data) & ~(std::numeric_limits<size_t>::max() << remaining_bits);
+      if (!visitor(offset, remaining_bits, value)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  ALWAYS_INLINE void StoreChunk(size_t bit_offset, size_t value, size_t bit_length) {
+    if (bit_length == BitSizeOf<size_t>()) {
+      DCHECK_ALIGNED(bit_start_ + bit_offset, BitSizeOf<size_t>());
+      uint8_t* data = data_ + (bit_start_ + bit_offset) / kBitsPerByte;
+      DCHECK_ALIGNED(data, sizeof(size_t));
+      reinterpret_cast<size_t*>(data)[0] = value;
+    } else {
+      StoreBits(bit_offset, value, bit_length);
+    }
+  }
+
   uint8_t* data_ = nullptr;  // The pointer is page aligned.
   size_t bit_start_ = 0;
   size_t bit_size_ = 0;
@@ -329,6 +404,14 @@
     DCHECK_EQ(NumberOfWrittenBits(), 0u);
   }
 
+  void Truncate(size_t bit_offset) {
+    DCHECK_GE(bit_offset, bit_start_);
+    DCHECK_LE(bit_offset, bit_offset_);
+    bit_offset_ = bit_offset;
+    DCHECK_LE(BitsToBytesRoundUp(bit_offset), out_->size());
+    out_->resize(BitsToBytesRoundUp(bit_offset));  // Shrink.
+  }
+
   BitMemoryRegion GetWrittenRegion() const {
     return BitMemoryRegion(out_->data(), bit_start_, bit_offset_ - bit_start_);
   }
@@ -346,7 +429,7 @@
   }
 
   ALWAYS_INLINE void WriteRegion(const BitMemoryRegion& region) {
-    Allocate(region.size_in_bits()).StoreBits(/* bit_offset */ 0, region, region.size_in_bits());
+    Allocate(region.size_in_bits()).CopyBits(region);
   }
 
   ALWAYS_INLINE void WriteBits(uint32_t value, size_t bit_length) {
@@ -379,9 +462,17 @@
     WriteInterleavedVarints<1>({value});
   }
 
+  void WriteBytesAligned(const uint8_t* bytes, size_t length) {
+    DCHECK_ALIGNED(bit_start_, kBitsPerByte);
+    DCHECK_ALIGNED(bit_offset_, kBitsPerByte);
+    DCHECK_EQ(BitsToBytesRoundUp(bit_offset_), out_->size());
+    out_->insert(out_->end(), bytes, bytes + length);
+    bit_offset_ += length * kBitsPerByte;
+  }
+
   ALWAYS_INLINE void ByteAlign() {
-    size_t end = bit_start_ + bit_offset_;
-    bit_offset_ += RoundUp(end, kBitsPerByte) - end;
+    DCHECK_ALIGNED(bit_start_, kBitsPerByte);
+    bit_offset_ = RoundUp(bit_offset_, kBitsPerByte);
   }
 
  private:
diff --git a/libartbase/base/bit_table.h b/libartbase/base/bit_table.h
index 07e6b27..227f5eb 100644
--- a/libartbase/base/bit_table.h
+++ b/libartbase/base/bit_table.h
@@ -92,7 +92,7 @@
   bool Equals(const BitTableBase& other) const {
     return num_rows_ == other.num_rows_ &&
         std::equal(column_offset_, column_offset_ + kNumColumns, other.column_offset_) &&
-        BitMemoryRegion::Compare(table_data_, other.table_data_) == 0;
+        BitMemoryRegion::Equals(table_data_, other.table_data_);
   }
 
  protected:
@@ -449,9 +449,10 @@
 
     // Write table data.
     for (MemoryRegion row : rows_) {
-      BitMemoryRegion src(row);
+      size_t bits_to_copy = std::min(max_num_bits_, row.size_in_bits());
+      BitMemoryRegion src(row, /*bit_offset=*/ 0u, bits_to_copy);
       BitMemoryRegion dst = out.Allocate(max_num_bits_);
-      dst.StoreBits(/* bit_offset */ 0, src, std::min(max_num_bits_, src.size_in_bits()));
+      dst.Subregion(/*bit_offset=*/ 0, bits_to_copy).CopyBits(src);
     }
 
     // Verify the written data.
diff --git a/libartbase/base/data_hash.h b/libartbase/base/data_hash.h
index 5ad7779..3399899 100644
--- a/libartbase/base/data_hash.h
+++ b/libartbase/base/data_hash.h
@@ -17,89 +17,169 @@
 #ifndef ART_LIBARTBASE_BASE_DATA_HASH_H_
 #define ART_LIBARTBASE_BASE_DATA_HASH_H_
 
+#include <type_traits>
+
+#include "base/globals.h"
 #include "base/macros.h"
 
 namespace art {
 
-// Hash bytes using a relatively fast hash.
-static inline size_t HashBytes(const uint8_t* data, size_t len) {
-  size_t hash = 0x811c9dc5;
-  for (uint32_t i = 0; i < len; ++i) {
-    hash = (hash * 16777619) ^ data[i];
-  }
-  hash += hash << 13;
-  hash ^= hash >> 7;
-  hash += hash << 3;
-  hash ^= hash >> 17;
-  hash += hash << 5;
-  return hash;
-}
+// Note: Touching this file or any #included file causes too many files to be rebuilt, so
+// we want to avoid #including any files that are not necessary. Therefore we use templates
+// (and std::enable_if<>) to avoid `#including` headers for `ArrayRef<>` or `BitMemoryRegion`.
+class BitMemoryRegion;
 
 class DataHash {
  private:
   static constexpr bool kUseMurmur3Hash = true;
 
  public:
-  template <class Container>
+  template <class Container,
+            typename = std::enable_if_t<!std::is_same_v<Container, BitMemoryRegion>>>
   size_t operator()(const Container& array) const {
     // Containers that provide the data() function use contiguous storage.
     const uint8_t* data = reinterpret_cast<const uint8_t*>(array.data());
-    uint32_t len = sizeof(typename Container::value_type) * array.size();
+    uint32_t length_in_bytes = sizeof(typename Container::value_type) * array.size();
     if (kUseMurmur3Hash) {
-      static constexpr uint32_t c1 = 0xcc9e2d51;
-      static constexpr uint32_t c2 = 0x1b873593;
-      static constexpr uint32_t r1 = 15;
-      static constexpr uint32_t r2 = 13;
-      static constexpr uint32_t m = 5;
-      static constexpr uint32_t n = 0xe6546b64;
+      uint32_t hash = Murmur3Start();
 
-      uint32_t hash = 0;
-
-      const int nblocks = len / 4;
+      const size_t nblocks = length_in_bytes / 4;
       typedef __attribute__((__aligned__(1))) uint32_t unaligned_uint32_t;
-      const unaligned_uint32_t *blocks = reinterpret_cast<const uint32_t*>(data);
-      int i;
-      for (i = 0; i < nblocks; i++) {
-        uint32_t k = blocks[i];
-        k *= c1;
-        k = (k << r1) | (k >> (32 - r1));
-        k *= c2;
-
-        hash ^= k;
-        hash = ((hash << r2) | (hash >> (32 - r2))) * m + n;
+      const unaligned_uint32_t* blocks = reinterpret_cast<const unaligned_uint32_t*>(data);
+      for (size_t i = 0; i != nblocks; ++i) {
+        hash = Murmur3Update(hash, blocks[i]);
       }
 
-      const uint8_t *tail = reinterpret_cast<const uint8_t*>(data + nblocks * 4);
-      uint32_t k1 = 0;
+      const uint8_t* tail = reinterpret_cast<const uint8_t*>(data + nblocks * 4);
+      uint32_t last_block = 0;
 
-      switch (len & 3) {
+      switch (length_in_bytes & 3) {
         case 3:
-          k1 ^= tail[2] << 16;
+          last_block ^= tail[2] << 16;
           FALLTHROUGH_INTENDED;
         case 2:
-          k1 ^= tail[1] << 8;
+          last_block ^= tail[1] << 8;
           FALLTHROUGH_INTENDED;
         case 1:
-          k1 ^= tail[0];
-
-          k1 *= c1;
-          k1 = (k1 << r1) | (k1 >> (32 - r1));
-          k1 *= c2;
-          hash ^= k1;
+          last_block ^= tail[0];
+          hash = Murmur3UpdatePartial(hash, last_block);
       }
 
-      hash ^= len;
-      hash ^= (hash >> 16);
-      hash *= 0x85ebca6b;
-      hash ^= (hash >> 13);
-      hash *= 0xc2b2ae35;
-      hash ^= (hash >> 16);
-
+      hash = Murmur3Finish(hash, length_in_bytes);
       return hash;
     } else {
-      return HashBytes(data, len);
+      return HashBytes(data, length_in_bytes);
     }
   }
+
+  // Hash bytes using a relatively fast hash.
+  static inline size_t HashBytes(const uint8_t* data, size_t length_in_bytes) {
+    size_t hash = HashBytesStart();
+    for (uint32_t i = 0; i != length_in_bytes; ++i) {
+      hash = HashBytesUpdate(hash, data[i]);
+    }
+    return HashBytesFinish(hash);
+  }
+
+  template <typename BMR,
+            typename = std::enable_if_t<std::is_same_v<BMR, BitMemoryRegion>>>
+  size_t operator()(BMR region) const {
+    if (kUseMurmur3Hash) {
+      size_t num_full_blocks = region.size_in_bits() / kMurmur3BlockBits;
+      size_t num_end_bits = region.size_in_bits() % kMurmur3BlockBits;
+      size_t hash = Murmur3Start();
+      for (uint32_t i = 0; i != num_full_blocks; ++i) {
+        uint32_t block = region.LoadBits(i * kMurmur3BlockBits, kMurmur3BlockBits);
+        hash = Murmur3Update(hash, block);
+      }
+      if (num_end_bits != 0u) {
+        uint32_t end_bits = region.LoadBits(num_full_blocks * kMurmur3BlockBits, num_end_bits);
+        hash = Murmur3UpdatePartial(hash, end_bits);
+      }
+      return HashBytesFinish(hash);
+    } else {
+      size_t num_full_bytes = region.size_in_bits() / kBitsPerByte;
+      size_t num_end_bits = region.size_in_bits() % kBitsPerByte;
+      size_t hash = HashBytesStart();
+      for (uint32_t i = 0; i != num_full_bytes; ++i) {
+        uint8_t byte = region.LoadBits(i * kBitsPerByte, kBitsPerByte);
+        hash = HashBytesUpdate(hash, byte);
+      }
+      if (num_end_bits != 0u) {
+        uint32_t end_bits = region.LoadBits(num_full_bytes * kBitsPerByte, num_end_bits);
+        hash = HashBytesUpdate(hash, end_bits);
+      }
+      return HashBytesFinish(hash);
+    }
+  }
+
+ private:
+  ALWAYS_INLINE
+  static constexpr size_t HashBytesStart() {
+    return 0x811c9dc5;
+  }
+
+  ALWAYS_INLINE
+  static constexpr size_t HashBytesUpdate(size_t hash, uint8_t value) {
+    return (hash * 16777619) ^ value;
+  }
+
+  ALWAYS_INLINE
+  static constexpr size_t HashBytesFinish(size_t hash) {
+    hash += hash << 13;
+    hash ^= hash >> 7;
+    hash += hash << 3;
+    hash ^= hash >> 17;
+    hash += hash << 5;
+    return hash;
+  }
+
+  static constexpr uint32_t kMurmur3Seed = 0u;
+  static constexpr uint32_t kMurmur3BlockBits = 32u;
+  static constexpr uint32_t kMurmur3C1 = 0xcc9e2d51;
+  static constexpr uint32_t kMurmur3C2 = 0x1b873593;
+  static constexpr uint32_t kMurmur3R1 = 15;
+  static constexpr uint32_t kMurmur3R2 = 13;
+  static constexpr uint32_t kMurmur3M = 5;
+  static constexpr uint32_t kMurmur3N = 0xe6546b64;
+
+  ALWAYS_INLINE
+  static constexpr uint32_t Murmur3Start() {
+    return kMurmur3Seed;
+  }
+
+  ALWAYS_INLINE
+  static constexpr uint32_t Murmur3Update(uint32_t hash, uint32_t block) {
+    uint32_t k = block;
+    k *= kMurmur3C1;
+    k = (k << kMurmur3R1) | (k >> (32 - kMurmur3R1));
+    k *= kMurmur3C2;
+    hash ^= k;
+    hash = ((hash << kMurmur3R2) | (hash >> (32 - kMurmur3R2))) * kMurmur3M + kMurmur3N;
+    return hash;
+  }
+
+  ALWAYS_INLINE
+  static constexpr uint32_t Murmur3UpdatePartial(uint32_t hash, uint32_t block) {
+    uint32_t k = block;
+    k *= kMurmur3C1;
+    k = (k << kMurmur3R1) | (k >> (32 - kMurmur3R1));
+    k *= kMurmur3C2;
+    hash ^= k;
+    // Note: Unlike the full-block update, the partial block omits `hash = hash * M + N`.
+    return hash;
+  }
+
+  ALWAYS_INLINE
+  static constexpr uint32_t Murmur3Finish(uint32_t hash, uint32_t length_in_bytes) {
+    hash ^= length_in_bytes;
+    hash ^= (hash >> 16);
+    hash *= 0x85ebca6b;
+    hash ^= (hash >> 13);
+    hash *= 0xc2b2ae35;
+    hash ^= (hash >> 16);
+    return hash;
+  }
 };
 
 }  // namespace art
diff --git a/libprofile/profile/profile_compilation_info.cc b/libprofile/profile/profile_compilation_info.cc
index 7565a0b..b5cc6b7 100644
--- a/libprofile/profile/profile_compilation_info.cc
+++ b/libprofile/profile/profile_compilation_info.cc
@@ -2596,7 +2596,7 @@
     if ((method_flags & flag) != 0u) {
       size_t index = FlagBitmapIndex(static_cast<MethodHotness::Flag>(flag));
       BitMemoryRegion src = method_bitmap.Subregion(index * num_method_ids, num_method_ids);
-      saved_bitmap.StoreBits(saved_bitmap_index * num_method_ids, src, num_method_ids);
+      saved_bitmap.Subregion(saved_bitmap_index * num_method_ids, num_method_ids).CopyBits(src);
       ++saved_bitmap_index;
     }
     return true;
@@ -2704,7 +2704,7 @@
       size_t index = FlagBitmapIndex(static_cast<MethodHotness::Flag>(flag));
       BitMemoryRegion src =
           saved_bitmap.Subregion(saved_bitmap_index * num_method_ids, num_method_ids);
-      method_bitmap.OrBits(index * num_method_ids, src, num_method_ids);
+      method_bitmap.Subregion(index * num_method_ids, num_method_ids).OrBits(src);
       ++saved_bitmap_index;
     }
     return true;
diff --git a/libprofile/profile/profile_compilation_info.h b/libprofile/profile/profile_compilation_info.h
index fe978d5..4366078 100644
--- a/libprofile/profile/profile_compilation_info.h
+++ b/libprofile/profile/profile_compilation_info.h
@@ -807,7 +807,7 @@
           num_method_ids == other.num_method_ids &&
           method_map == other.method_map &&
           class_set == other.class_set &&
-          (BitMemoryRegion::Compare(method_bitmap, other.method_bitmap) == 0);
+          BitMemoryRegion::Equals(method_bitmap, other.method_bitmap);
     }
 
     // Mark a method as executed at least once.
diff --git a/runtime/stack_map.cc b/runtime/stack_map.cc
index 19a854b..591cc6d 100644
--- a/runtime/stack_map.cc
+++ b/runtime/stack_map.cc
@@ -80,59 +80,6 @@
   return copy;
 }
 
-size_t CodeInfo::Deduper::Dedupe(const uint8_t* code_info_data) {
-  writer_.ByteAlign();
-  size_t deduped_offset = writer_.NumberOfWrittenBits() / kBitsPerByte;
-
-  // The back-reference offset takes space so dedupe is not worth it for tiny tables.
-  constexpr size_t kMinDedupSize = 32;  // Assume 32-bit offset on average.
-
-  // Read the existing code info and find (and keep) dedup-map iterator for each table.
-  // The iterator stores BitMemoryRegion and bit_offset of previous identical BitTable.
-  std::map<BitMemoryRegion, uint32_t, BitMemoryRegion::Less>::iterator it[kNumBitTables];
-  CodeInfo code_info(code_info_data, nullptr, [&](size_t i, auto*, BitMemoryRegion region) {
-    it[i] = dedupe_map_.emplace(region, /*bit_offset=*/0).first;
-    if (it[i]->second != 0 && region.size_in_bits() > kMinDedupSize) {  // Seen before and large?
-      code_info.SetBitTableDeduped(i);  // Mark as deduped before we write header.
-    }
-  });
-
-  // Write the code info back, but replace deduped tables with relative offsets.
-  std::array<uint32_t, kNumHeaders> header;
-  ForEachHeaderField([&code_info, &header](size_t i, auto member_pointer) {
-    header[i] = code_info.*member_pointer;
-  });
-  writer_.WriteInterleavedVarints(header);
-  ForEachBitTableField([this, &code_info, &it](size_t i, auto) {
-    if (code_info.HasBitTable(i)) {
-      uint32_t& bit_offset = it[i]->second;
-      if (code_info.IsBitTableDeduped(i)) {
-        DCHECK_NE(bit_offset, 0u);
-        writer_.WriteVarint(writer_.NumberOfWrittenBits() - bit_offset);
-      } else {
-        bit_offset = writer_.NumberOfWrittenBits();  // Store offset in dedup map.
-        writer_.WriteRegion(it[i]->first);
-      }
-    }
-  });
-
-  if (kIsDebugBuild) {
-    CodeInfo old_code_info(code_info_data);
-    CodeInfo new_code_info(writer_.data() + deduped_offset);
-    ForEachHeaderField([&old_code_info, &new_code_info](size_t, auto member_pointer) {
-      if (member_pointer != &CodeInfo::bit_table_flags_) {  // Expected to differ.
-        DCHECK_EQ(old_code_info.*member_pointer, new_code_info.*member_pointer);
-      }
-    });
-    ForEachBitTableField([&old_code_info, &new_code_info](size_t i, auto member_pointer) {
-      DCHECK_EQ(old_code_info.HasBitTable(i), new_code_info.HasBitTable(i));
-      DCHECK((old_code_info.*member_pointer).Equals(new_code_info.*member_pointer));
-    });
-  }
-
-  return deduped_offset;
-}
-
 StackMap CodeInfo::GetStackMapForNativePcOffset(uintptr_t pc, InstructionSet isa) const {
   uint32_t packed_pc = StackMap::PackNativePc(pc, isa);
   // Binary search.  All catch stack maps are stored separately at the end.
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index a86d638..7a13dbd 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -23,8 +23,6 @@
 #include "base/bit_memory_region.h"
 #include "base/bit_table.h"
 #include "base/bit_utils.h"
-#include "base/bit_vector.h"
-#include "base/leb128.h"
 #include "base/memory_region.h"
 #include "dex/dex_file_types.h"
 #include "dex_register_location.h"
@@ -32,6 +30,10 @@
 
 namespace art {
 
+namespace linker {
+class CodeInfoTableDeduper;
+}  // namespace linker
+
 class OatQuickMethodHeader;
 class VariableIndentationOutputStream;
 
@@ -281,23 +283,6 @@
  */
 class CodeInfo {
  public:
-  class Deduper {
-   public:
-    explicit Deduper(std::vector<uint8_t>* output) : writer_(output) {
-      DCHECK_EQ(output->size(), 0u);
-    }
-
-    // Copy CodeInfo into output while de-duplicating the internal bit tables.
-    // It returns the byte offset of the copied CodeInfo within the output.
-    size_t Dedupe(const uint8_t* code_info);
-
-   private:
-    BitMemoryWriter<std::vector<uint8_t>> writer_;
-
-    // Deduplicate at BitTable level. The value is bit offset within the output.
-    std::map<BitMemoryRegion, uint32_t, BitMemoryRegion::Less> dedupe_map_;
-  };
-
   ALWAYS_INLINE CodeInfo() {}
   ALWAYS_INLINE explicit CodeInfo(const uint8_t* data, size_t* num_read_bits = nullptr);
   ALWAYS_INLINE explicit CodeInfo(const OatQuickMethodHeader* header);
@@ -505,6 +490,7 @@
   bool HasBitTable(size_t i) { return ((bit_table_flags_ >> i) & 1) != 0; }
   bool IsBitTableDeduped(size_t i) { return ((bit_table_flags_ >> (kNumBitTables + i)) & 1) != 0; }
   void SetBitTableDeduped(size_t i) { bit_table_flags_ |= 1 << (kNumBitTables + i); }
+  bool HasDedupedBitTables() { return (bit_table_flags_ >> kNumBitTables) != 0u; }
 
   enum Flags {
     kHasInlineInfo = 1 << 0,
@@ -533,6 +519,7 @@
   BitTable<DexRegisterMapInfo> dex_register_maps_;
   BitTable<DexRegisterInfo> dex_register_catalog_;
 
+  friend class linker::CodeInfoTableDeduper;
   friend class StackMapStream;
 };