Refactor and optimize memory region bit functions

Move optimized bit reading from FieldEncoding to MemoryRegion,
added optimized StoreBits to MemoryRegion.

Compilation of a large app on host:
Before:
Time -j1: 31.897s
2.00% art::MemoryRegion::StoreBits(unsigned long, unsigned int, unsigned long)

After:
Time -j1: 29.620s
0.39% art::MemoryRegion::StoreBits(unsigned long, unsigned int, unsigned long)

Bug: 34621054

Test: test-art-host

Change-Id: I0509613da83cc5741d5cfada3f8a8af503784e9e
diff --git a/runtime/memory_region.cc b/runtime/memory_region.cc
index a5c70c3..5bf0f40 100644
--- a/runtime/memory_region.cc
+++ b/runtime/memory_region.cc
@@ -33,4 +33,36 @@
           from.pointer(), from.size());
 }
 
+void MemoryRegion::StoreBits(uintptr_t bit_offset, uint32_t value, size_t length) {
+  DCHECK_LE(value, MaxInt<uint32_t>(length));
+  DCHECK_LE(length, BitSizeOf<uint32_t>());
+  DCHECK_LE(bit_offset + length, size_in_bits());
+  if (length == 0) {
+    return;
+  }
+  // Bits are stored in this order {7 6 5 4 3 2 1 0}.
+  // How many remaining bits in current byte is (bit_offset % kBitsPerByte) + 1.
+  uint8_t* out = ComputeInternalPointer<uint8_t>(bit_offset >> kBitsPerByteLog2);
+  auto orig_len = length;
+  auto orig_value = value;
+  uintptr_t bit_remainder = bit_offset % kBitsPerByte;
+  while (true) {
+    const uintptr_t remaining_bits = kBitsPerByte - bit_remainder;
+    if (length <= remaining_bits) {
+      // Length is smaller than all of remainder bits.
+      size_t mask = ((1 << length) - 1) << bit_remainder;
+      *out = (*out & ~mask) | (value << bit_remainder);
+      break;
+    }
+    // Copy remaining bits in current byte.
+    size_t value_mask = (1 << remaining_bits) - 1;
+    *out = (*out & ~(value_mask << bit_remainder)) | ((value & value_mask) << bit_remainder);
+    value >>= remaining_bits;
+    bit_remainder = 0;
+    length -= remaining_bits;
+    ++out;
+  }
+  DCHECK_EQ(LoadBits(bit_offset, orig_len), orig_value) << bit_offset << " " << orig_len;
+}
+
 }  // namespace art
diff --git a/runtime/memory_region.h b/runtime/memory_region.h
index fe3f917..f55dff7 100644
--- a/runtime/memory_region.h
+++ b/runtime/memory_region.h
@@ -124,11 +124,35 @@
   // The bit at the smallest offset is the least significant bit in the
   // loaded value.  `length` must not be larger than the number of bits
   // contained in the return value (32).
-  uint32_t LoadBits(uintptr_t bit_offset, size_t length) const {
-    CHECK_LE(length, sizeof(uint32_t) * kBitsPerByte);
-    uint32_t value = 0u;
+  ALWAYS_INLINE uint32_t LoadBits(uintptr_t bit_offset, size_t length) const {
+    DCHECK_LE(length, BitSizeOf<uint32_t>());
+    DCHECK_LE(bit_offset + length, size_in_bits());
+    if (UNLIKELY(length == 0)) {
+      // Do not touch any memory if the range is empty.
+      return 0;
+    }
+    const uint8_t* address = start() + bit_offset / kBitsPerByte;
+    const uint32_t shift = bit_offset & (kBitsPerByte - 1);
+    // Load the value (reading only the strictly needed bytes).
+    const uint32_t load_bit_count = shift + length;
+    uint32_t value = address[0] >> shift;
+    if (load_bit_count > 8) {
+      value |= static_cast<uint32_t>(address[1]) << (8 - shift);
+      if (load_bit_count > 16) {
+        value |= static_cast<uint32_t>(address[2]) << (16 - shift);
+        if (load_bit_count > 24) {
+          value |= static_cast<uint32_t>(address[3]) << (24 - shift);
+          if (load_bit_count > 32) {
+            value |= static_cast<uint32_t>(address[4]) << (32 - shift);
+          }
+        }
+      }
+    }
+    // Clear unwanted most significant bits.
+    uint32_t clear_bit_count = BitSizeOf(value) - length;
+    value = (value << clear_bit_count) >> clear_bit_count;
     for (size_t i = 0; i < length; ++i) {
-      value |= LoadBit(bit_offset + i) << i;
+      DCHECK_EQ((value >> i) & 1, LoadBit(bit_offset + i));
     }
     return value;
   }
@@ -137,13 +161,7 @@
   // `bit_offset`.  The bit at the smallest offset is the least significant
   // bit of the stored `value`.  `value` must not be larger than `length`
   // bits.
-  void StoreBits(uintptr_t bit_offset, uint32_t value, size_t length) {
-    CHECK_LE(value, MaxInt<uint32_t>(length));
-    for (size_t i = 0; i < length; ++i) {
-      bool ith_bit = value & (1 << i);
-      StoreBit(bit_offset + i, ith_bit);
-    }
-  }
+  void StoreBits(uintptr_t bit_offset, uint32_t value, size_t length);
 
   void CopyFrom(size_t offset, const MemoryRegion& from) const;
 
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index cd9a3f0..5782521 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -667,32 +667,7 @@
 
   ALWAYS_INLINE int32_t Load(const MemoryRegion& region) const {
     DCHECK_LE(end_offset_, region.size_in_bits());
-    const size_t bit_count = BitSize();
-    if (bit_count == 0) {
-      // Do not touch any memory if the range is empty.
-      return min_value_;
-    }
-    uint8_t* address = region.start() + start_offset_ / kBitsPerByte;
-    const uint32_t shift = start_offset_ & (kBitsPerByte - 1);
-    // Load the value (reading only the strictly needed bytes).
-    const uint32_t load_bit_count = shift + bit_count;
-    uint32_t value = *address++ >> shift;
-    if (load_bit_count > 8) {
-      value |= static_cast<uint32_t>(*address++) << (8 - shift);
-      if (load_bit_count > 16) {
-        value |= static_cast<uint32_t>(*address++) << (16 - shift);
-        if (load_bit_count > 24) {
-          value |= static_cast<uint32_t>(*address++) << (24 - shift);
-          if (load_bit_count > 32) {
-            value |= static_cast<uint32_t>(*address++) << (32 - shift);
-          }
-        }
-      }
-    }
-    // Clear unwanted most significant bits.
-    uint32_t clear_bit_count = 32 - bit_count;
-    value = (value << clear_bit_count) >> clear_bit_count;
-    return value + min_value_;
+    return static_cast<int32_t>(region.LoadBits(start_offset_, BitSize())) + min_value_;
   }
 
   ALWAYS_INLINE void Store(MemoryRegion region, int32_t value) const {