ARM64: Change code emitted by ClinitCheck.

Change the code from MVN+CBNZ to CMP+BLO. The latter pattern is
better optimized by ARM64 CPUs. To avoid increasing code size, the
preceding load is changed from an LDR of the full 32-bit status field
to an LDRB of the single byte that holds the class status bits.
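
As an illustrative sketch (not part of this change), the two sequences
perform the same check under the layout implied by the code comment in
the diff: the class status occupies the top four bits of a 32-bit
little-endian field, i.e. status_lsb_position == 28 and
ClassStatus::kVisiblyInitialized == 0xf. The names in this standalone
model are made up:

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kStatusLsbPosition = 28u;    // Assumed SubtypeCheckBits::BitStructSizeOf().
    constexpr uint32_t kVisiblyInitialized = 0xfu;  // Assumed: all four status bits set.

    // Old sequence: LDR of the full word, MVN(ASR #28), CBNZ -> slow path.
    bool OldNeedsSlowPath(uint32_t status_word) {
      // Arithmetic shift (ASR); well-defined for negative values since C++20.
      int32_t shifted = static_cast<int32_t>(status_word) >> kStatusLsbPosition;
      return ~static_cast<uint32_t>(shifted) != 0u;  // CBNZ on the MVN result.
    }

    // New sequence: LDRB of the high byte (StatusOffset + 3 on little-endian),
    // CMP #0xf0, B.LO -> slow path.
    bool NewNeedsSlowPath(uint32_t status_word) {
      uint32_t status_byte = (status_word >> 24) & 0xffu;
      return status_byte < (kVisiblyInitialized << (kStatusLsbPosition % 8u));  // 0xf0
    }

    int main() {
      for (uint32_t high_byte = 0u; high_byte != 256u; ++high_byte) {
        // The low bits (SubtypeCheckBits) do not affect either check.
        uint32_t status_word = (high_byte << 24) | 0x00123456u;
        assert(OldNeedsSlowPath(status_word) == NewNeedsSlowPath(status_word));
      }
      return 0;
    }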

This shows a small but measurable improvement on a few Golem
benchmarks, for example MicroLambda, KotlinAutoReversiBench and
KotlinImgProc-GaussianBlurOpt.

Test: testrunner.py --target --optimizing
Bug: 36692143
Change-Id: Ia73f791d7026220ef38e73bd5ee19fcc4877564d
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index adc98ab..3a2988f 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1761,14 +1761,19 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register temp = temps.AcquireW();
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
-  constexpr uint32_t visibly_initialized = enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized);
-  static_assert(visibly_initialized == MaxInt<uint32_t>(32u - status_lsb_position),
-                "kVisiblyInitialized must have all bits set");
+  const size_t status_byte_offset =
+      mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+  constexpr uint32_t shifted_visibly_initialized_value =
+      enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << (status_lsb_position % kBitsPerByte);
 
-  const size_t status_offset = mirror::Class::StatusOffset().SizeValue();
-  __ Ldr(temp, HeapOperand(class_reg, status_offset));
-  __ Mvn(temp, Operand(temp, ASR, status_lsb_position));  // Were all the bits of the status set?
-  __ Cbnz(temp, slow_path->GetEntryLabel());              // If not, go to slow path.
+  // CMP (immediate) is limited to imm12 or imm12<<12, so we would need to materialize
+  // the constant 0xf0000000 for comparison with the full 32-bit field. To reduce the code
+  // size, load only the high byte of the field and compare with 0xf0.
+  // Note: The same code size could be achieved with LDR+MVN(asr #28)+CBNZ but benchmarks
+  // show that this pattern is slower (tested on little cores).
+  __ Ldrb(temp, HeapOperand(class_reg, status_byte_offset));
+  __ Cmp(temp, shifted_visibly_initialized_value);
+  __ B(lo, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }