ARM64: Change code emitted by ClinitCheck.
Change the code from MVN+CBNZ to CMP+B.LO. The latter is
handled more efficiently by ARM64 CPUs. To avoid increasing
code size, the preceding load is changed from LDR to LDRB,
reading only the high byte of the 32-bit status field.
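As a rough sketch (not code from the tree), the byte offset
and the compare immediate follow from the field layout,
assuming the 4-bit status sits in bits [31:28] of the 32-bit
field (as the 0xf0000000 constant below implies) and
little-endian byte order:

    #include <cstdint>

    // Sketch of the offset/immediate arithmetic used by the patch.
    constexpr uint32_t kStatusLsbPosition = 28;     // assumed from 0xf0000000
    constexpr uint32_t kBitsPerByte = 8;
    constexpr uint32_t kVisiblyInitialized = 0xfu;  // all four status bits set

    // Byte holding the status, relative to the start of the field; this is
    // the high byte on little-endian ARM64: 28 / 8 == 3.
    constexpr uint32_t kStatusByteOffset = kStatusLsbPosition / kBitsPerByte;

    // Immediate for the byte-wide compare: 0xf << (28 % 8) == 0xf0. Unlike
    // 0xf0000000, this fits a CMP (immediate) encoding directly.
    constexpr uint32_t kShiftedVisiblyInitialized =
        kVisiblyInitialized << (kStatusLsbPosition % kBitsPerByte);

    static_assert(kStatusByteOffset == 3u && kShiftedVisiblyInitialized == 0xf0u,
                  "expected byte offset 3 and compare immediate 0xf0");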
This shows a small but measurable improvement on a few Golem
benchmarks, for example MicroLambda, KotlinAutoReversiBench
and KotlinImgProc-GaussianBlurOpt.
Test: testrunner.py --target --optimizing
Bug: 36692143
Change-Id: Ia73f791d7026220ef38e73bd5ee19fcc4877564d
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index adc98ab..3a2988f 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1761,14 +1761,19 @@
UseScratchRegisterScope temps(GetVIXLAssembler());
Register temp = temps.AcquireW();
constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
- constexpr uint32_t visibly_initialized = enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized);
- static_assert(visibly_initialized == MaxInt<uint32_t>(32u - status_lsb_position),
- "kVisiblyInitialized must have all bits set");
+ const size_t status_byte_offset =
+ mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+ constexpr uint32_t shifted_visibly_initialized_value =
+ enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << (status_lsb_position % kBitsPerByte);
- const size_t status_offset = mirror::Class::StatusOffset().SizeValue();
- __ Ldr(temp, HeapOperand(class_reg, status_offset));
- __ Mvn(temp, Operand(temp, ASR, status_lsb_position)); // Were all the bits of the status set?
- __ Cbnz(temp, slow_path->GetEntryLabel()); // If not, go to slow path.
+ // CMP (immediate) is limited to imm12 or imm12<<12, so we would need to materialize
+ // the constant 0xf0000000 for comparison with the full 32-bit field. To reduce the code
+ // size, load only the high byte of the field and compare with 0xf0.
+ // Note: The same code size could be achieved with LDR+MVN(asr #28)+CBNZ but benchmarks
+ // show that this pattern is slower (tested on little cores).
+ __ Ldrb(temp, HeapOperand(class_reg, status_byte_offset));
+ __ Cmp(temp, shifted_visibly_initialized_value);
+ __ B(lo, slow_path->GetEntryLabel());
__ Bind(slow_path->GetExitLabel());
}
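For reference, a hypothetical C++ model (not part of the
patch) of what the new LDRB+CMP+B.LO sequence decides, under
the same assumption that the status occupies the top four
bits of the field:

    #include <cstdint>

    // Hypothetical helper: true when the ClinitCheck must take the slow path,
    // i.e. the status is still below kVisiblyInitialized (0xf).
    inline bool ClinitCheckNeedsSlowPath(uint32_t status_field) {
      const uint32_t status_byte = status_field >> 24;  // high byte, as LDRB loads it
      return status_byte < 0xf0u;                       // CMP #0xf0; B.LO -> slow path
    }

The unsigned compare is what makes the single-byte load
sufficient: any status below kVisiblyInitialized leaves that
byte below 0xf0, regardless of the SubtypeCheck bits that
share its low nibble.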