Use 28 bits for the type check bit string and reverse the order
of the fields in Class::status_. This avoids an increase in
generated code size:
- placing ClassStatus in the high bits allows the class
  initialization check to use
  "status_high_byte < (kInitialized << 4)",
  which is unaffected by the low 4 bits of the LHS, instead of
  needing to extract the status bits,
- placing the type check bit string in the bottom bits instead
  of somewhere in the middle allows the comparison on ARM to be
  done with the same code size as with the old layout in most
  cases (except when the compared value is 9-16 bits wide and
  not a modified immediate: 2 bytes less for 9-12 bits and
  sometimes 2 bytes more for 13-16 bits; the latter could be
  worked around using LDRH if the second character's boundary
  is at 16 bits).
Add one of the extra bits to the 2nd character to push its
boundary to 16 bits so that we can test an implementation using
16-bit loads in a subsequent CL; arbitrarily add the other three
bits to the 3rd character. This CL only makes those bits
available and allows testing; determining how to use the
additional bits for the best impact (whether to have a 4th
character or to distribute them differently among the three
characters) is left for later.
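
For reference, a minimal standalone sketch (plain C++, using a
hypothetical kInitialized value and field offset rather than the
actual ART definitions) of how the byte offset and shifted
comparison value used by the code generators below follow from
placing the 28-bit type check bit string in the low bits:

  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  int main() {
    constexpr size_t kBitsPerByte = 8;
    // The type check bit string occupies the low 28 bits, so the
    // status bits start at bit 28.
    constexpr size_t status_lsb_position = 28;
    constexpr uint32_t kInitializedValue = 14;  // hypothetical enum value
    constexpr size_t kStatusOffset = 112;       // hypothetical field offset

    // On a little-endian target, byte 3 of the 32-bit status_ word
    // is its high byte, so a single byte load reads all status bits.
    constexpr size_t status_byte_offset =
        kStatusOffset + status_lsb_position / kBitsPerByte;
    // Within that byte the status occupies the top 4 bits, so the
    // compared constant is shifted by the remaining 4 bits.
    constexpr uint32_t shifted_initialized_value =
        kInitializedValue << (status_lsb_position % kBitsPerByte);

    // Prints: byte offset 115, compare against kInitializedValue << 4.
    std::printf("byte offset %zu, compare against 0x%x\n",
                status_byte_offset, shifted_initialized_value);
    return 0;
  }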
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: Pixel 2 XL boots.
Test: testrunner.py --target --optimizing
Bug: 64692057
Change-Id: I38c59837e3df3accb813fb1e04dc42e9afcd2d73
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 13886b3..28f4816 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -2097,13 +2097,17 @@
Register class_reg) {
UseScratchRegisterScope temps(GetVIXLAssembler());
Register temp = temps.AcquireW();
- size_t status_offset = mirror::Class::StatusOffset().SizeValue();
+ constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
+ const size_t status_byte_offset =
+ mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+ constexpr uint32_t shifted_initialized_value =
+ enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
// Even if the initialized flag is set, we need to ensure consistent memory ordering.
// TODO(vixl): Let the MacroAssembler handle MemOperand.
- __ Add(temp, class_reg, status_offset);
+ __ Add(temp, class_reg, status_byte_offset);
__ Ldarb(temp, HeapOperand(temp));
- __ Cmp(temp, enum_cast<>(ClassStatus::kInitialized));
+ __ Cmp(temp, shifted_initialized_value);
__ B(lo, slow_path->GetEntryLabel());
__ Bind(slow_path->GetExitLabel());
}
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 7c6a5fd..f1ad4e1 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -7172,11 +7172,14 @@
LoadClassSlowPathARMVIXL* slow_path, vixl32::Register class_reg) {
UseScratchRegisterScope temps(GetVIXLAssembler());
vixl32::Register temp = temps.Acquire();
- GetAssembler()->LoadFromOffset(kLoadUnsignedByte,
- temp,
- class_reg,
- mirror::Class::StatusOffset().Int32Value());
- __ Cmp(temp, enum_cast<>(ClassStatus::kInitialized));
+ constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
+ const size_t status_byte_offset =
+ mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+ constexpr uint32_t shifted_initialized_value =
+ enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+
+ GetAssembler()->LoadFromOffset(kLoadUnsignedByte, temp, class_reg, status_byte_offset);
+ __ Cmp(temp, shifted_initialized_value);
__ B(lo, slow_path->GetEntryLabel());
// Even if the initialized flag is set, we may be in a situation where caches are not synced
// properly. Therefore, we do a memory fence.
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index ebe252a..c8bd5d4 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1915,8 +1915,14 @@
void InstructionCodeGeneratorMIPS::GenerateClassInitializationCheck(SlowPathCodeMIPS* slow_path,
Register class_reg) {
- __ LoadFromOffset(kLoadUnsignedByte, TMP, class_reg, mirror::Class::StatusOffset().Int32Value());
- __ LoadConst32(AT, enum_cast<>(ClassStatus::kInitialized));
+ constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
+ const size_t status_byte_offset =
+ mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+ constexpr uint32_t shifted_initialized_value =
+ enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+
+ __ LoadFromOffset(kLoadUnsignedByte, TMP, class_reg, status_byte_offset);
+ __ LoadConst32(AT, shifted_initialized_value);
__ Bltu(TMP, AT, slow_path->GetEntryLabel());
// Even if the initialized flag is set, we need to ensure consistent memory ordering.
__ Sync(0);
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 3ea7b82..bbdc3be 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -1761,8 +1761,14 @@
void InstructionCodeGeneratorMIPS64::GenerateClassInitializationCheck(SlowPathCodeMIPS64* slow_path,
GpuRegister class_reg) {
- __ LoadFromOffset(kLoadUnsignedByte, TMP, class_reg, mirror::Class::StatusOffset().Int32Value());
- __ LoadConst32(AT, enum_cast<>(ClassStatus::kInitialized));
+ constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
+ const size_t status_byte_offset =
+ mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+ constexpr uint32_t shifted_initialized_value =
+ enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+
+ __ LoadFromOffset(kLoadUnsignedByte, TMP, class_reg, status_byte_offset);
+ __ LoadConst32(AT, shifted_initialized_value);
__ Bltuc(TMP, AT, slow_path->GetEntryLabel());
// Even if the initialized flag is set, we need to ensure consistent memory ordering.
__ Sync(0);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 6853238..537e97a 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6219,8 +6219,13 @@
void InstructionCodeGeneratorX86::GenerateClassInitializationCheck(
SlowPathCode* slow_path, Register class_reg) {
- __ cmpb(Address(class_reg, mirror::Class::StatusOffset().Int32Value()),
- Immediate(enum_cast<>(ClassStatus::kInitialized)));
+ constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
+ const size_t status_byte_offset =
+ mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+ constexpr uint32_t shifted_initialized_value =
+ enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+
+ __ cmpb(Address(class_reg, status_byte_offset), Immediate(shifted_initialized_value));
__ j(kBelow, slow_path->GetEntryLabel());
__ Bind(slow_path->GetExitLabel());
// No need for memory fence, thanks to the X86 memory model.
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 1f8d822..4a64285 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -5425,8 +5425,13 @@
void InstructionCodeGeneratorX86_64::GenerateClassInitializationCheck(
SlowPathCode* slow_path, CpuRegister class_reg) {
- __ cmpb(Address(class_reg, mirror::Class::StatusOffset().Int32Value()),
- Immediate(enum_cast<>(ClassStatus::kInitialized)));
+ constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
+ const size_t status_byte_offset =
+ mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+ constexpr uint32_t shifted_initialized_value =
+ enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+
+ __ cmpb(Address(class_reg, status_byte_offset), Immediate(shifted_initialized_value));
__ j(kBelow, slow_path->GetEntryLabel());
__ Bind(slow_path->GetExitLabel());
// No need for memory fence, thanks to the x86-64 memory model.