ARM64: Change code emitted by ClinitCheck.

Change the code from MVN+CBNZ to CMP+BLO. The latter pattern is
better optimized by ARM64 CPUs. To avoid increasing code size, the
preceding load is changed from an LDR of the full 32-bit status field
to an LDRB of the single byte that holds the class status bits.
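
As an illustrative sketch (not part of this change), the two sequences
perform the same check under the layout implied by the code comment in
the diff: the class status occupies the top four bits of a 32-bit
little-endian field, i.e. status_lsb_position == 28 and
ClassStatus::kVisiblyInitialized == 0xf. The names in this standalone
model are made up:

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kStatusLsbPosition = 28u;    // Assumed SubtypeCheckBits::BitStructSizeOf().
    constexpr uint32_t kVisiblyInitialized = 0xfu;  // Assumed: all four status bits set.

    // Old sequence: LDR of the full word, MVN(ASR #28), CBNZ -> slow path.
    bool OldNeedsSlowPath(uint32_t status_word) {
      // Arithmetic shift (ASR); well-defined for negative values since C++20.
      int32_t shifted = static_cast<int32_t>(status_word) >> kStatusLsbPosition;
      return ~static_cast<uint32_t>(shifted) != 0u;  // CBNZ on the MVN result.
    }

    // New sequence: LDRB of the high byte (StatusOffset + 3 on little-endian),
    // CMP #0xf0, B.LO -> slow path.
    bool NewNeedsSlowPath(uint32_t status_word) {
      uint32_t status_byte = (status_word >> 24) & 0xffu;
      return status_byte < (kVisiblyInitialized << (kStatusLsbPosition % 8u));  // 0xf0
    }

    int main() {
      for (uint32_t high_byte = 0u; high_byte != 256u; ++high_byte) {
        // The low bits (SubtypeCheckBits) do not affect either check.
        uint32_t status_word = (high_byte << 24) | 0x00123456u;
        assert(OldNeedsSlowPath(status_word) == NewNeedsSlowPath(status_word));
      }
      return 0;
    }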

This shows a small but measurable improvement on a few Golem
benchmarks, for example MicroLambda, KotlinAutoReversiBench and
KotlinImgProc-GaussianBlurOpt.

Test: testrunner.py --target --optimizing
Bug: 36692143
Change-Id: Ia73f791d7026220ef38e73bd5ee19fcc4877564d
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index adc98ab..3a2988f 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1761,14 +1761,19 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register temp = temps.AcquireW();
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
-  constexpr uint32_t visibly_initialized = enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized);
-  static_assert(visibly_initialized == MaxInt<uint32_t>(32u - status_lsb_position),
-                "kVisiblyInitialized must have all bits set");
+  const size_t status_byte_offset =
+      mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
+  constexpr uint32_t shifted_visibly_initialized_value =
+      enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << (status_lsb_position % kBitsPerByte);
 
-  const size_t status_offset = mirror::Class::StatusOffset().SizeValue();
-  __ Ldr(temp, HeapOperand(class_reg, status_offset));
-  __ Mvn(temp, Operand(temp, ASR, status_lsb_position));  // Were all the bits of the status set?
-  __ Cbnz(temp, slow_path->GetEntryLabel());              // If not, go to slow path.
+  // CMP (immediate) is limited to imm12 or imm12<<12, so we would need to materialize
+  // the constant 0xf0000000 for comparison with the full 32-bit field. To reduce the code
+  // size, load only the high byte of the field and compare with 0xf0.
+  // Note: The same code size could be achieved with LDR+MVN(asr #28)+CBNZ but benchmarks
+  // show that this pattern is slower (tested on little cores).
+  __ Ldrb(temp, HeapOperand(class_reg, status_byte_offset));
+  __ Cmp(temp, shifted_visibly_initialized_value);
+  __ B(lo, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }