Implement ClassStatus::kVisiblyInitialized.

Previously, all class initialization checks involved a memory
barrier to ensure appropriate memory visibility. We change
that by introducing the kVisiblyInitialized status, which can
be checked without a memory barrier. Before we mark a class
as visibly initialized, we run a checkpoint on all threads
to ensure memory visibility. This is done in batches of up
to 32 classes to reduce the overhead.
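
The fast path thus becomes a plain load and compare. A minimal
C++ sketch of the idea, assuming the 4-bit status field in the
top bits of the 32-bit status word (names approximate the ART
ones; this is not the actual runtime code):

  #include <atomic>
  #include <cstdint>

  enum class ClassStatus : uint32_t {
    kInitialized = 14,
    kVisiblyInitialized = 15,  // Highest status; all 4 bits set.
  };

  // The status occupies bits [31:28] of the status word.
  constexpr uint32_t kStatusLsbPosition = 28u;

  bool IsVisiblyInitialized(const std::atomic<uint32_t>& status_word) {
    // Relaxed load, no acquire barrier: a checkpoint already ran on
    // all threads before the status was raised to kVisiblyInitialized,
    // so observing this value implies seeing the initialized fields.
    uint32_t shifted =
        status_word.load(std::memory_order_relaxed) >> kStatusLsbPosition;
    return shifted == static_cast<uint32_t>(ClassStatus::kVisiblyInitialized);
  }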

Avoiding memory barriers in the compiled code reduces code
size and improves performance. This is also the first step
toward fixing the long-standing synchronization bug 18161648.
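
The batching mentioned above can be sketched as follows, with
hypothetical names (the real code lives in the class linker and
uses the runtime's thread checkpoint mechanism):

  #include <cstddef>
  #include <vector>

  enum class Status { kInitialized, kVisiblyInitialized };
  struct Klass { Status status = Status::kInitialized; };

  // Placeholder: in ART a checkpoint runs a closure on every thread
  // and acts as a memory synchronization point for all of them.
  void RunCheckpointOnAllThreads() {}

  class VisiblyInitializedBatch {
   public:
    static constexpr size_t kMaxClasses = 32;

    void Add(Klass* klass) {
      classes_.push_back(klass);
      if (classes_.size() == kMaxClasses) {
        Flush();
      }
    }

    void Flush() {
      if (classes_.empty()) {
        return;
      }
      // One checkpoint covers the whole batch; once every thread has
      // run it, all threads see the initialized state of these classes.
      RunCheckpointOnAllThreads();
      for (Klass* klass : classes_) {
        klass->status = Status::kVisiblyInitialized;
      }
      classes_.clear();
    }

   private:
    std::vector<Klass*> classes_;
  };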

Prebuilt sizes for aosp_taimen-userdebug:
 - before:
   arm/boot*.oat: 19150696
   arm64/boot*.oat: 22574336
   oat/arm64/services.odex: 21929800
 - after:
   arm/boot*.oat: 19134508 (-16KiB)
   arm64/boot*.oat: 22553664 (-20KiB)
   oat/arm64/services.odex: 21888760 (-40KiB)

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots
Test: run-gtests.sh -j4
Test: testrunner.py --target --optimizing
Test: Manually diff `m dump-oat-boot` output from before this
      CL against output after it with the codegen changes
      reverted, using `sed` replacements for the class status
      values. Check that only checksums and the oatdump
      runtime values of DexCache.dexFile differ.
Bug: 18161648
Bug: 36692143
Change-Id: Ida10439d347e680a0abf4674546923374ffaa957
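
For reference, the constants that the backends below compare against
work out as follows, assuming the 4-bit status in bits [31:28] (this
matches the static_assert added in the arm64 change):

  kVisiblyInitialized = 15 = 0b1111 is the maximum 4-bit value.
  arm:    the full status word is >= (15u << 28) == 0xF0000000 exactly
          when the status is 15, so one unsigned compare plus B(lo)
          dispatches to the slow path.
  x86/64: little-endian byte 3 holds the status in its high nibble;
          that byte is unsigned-below (15 << 4) == 0xF0 exactly when
          the status is below kVisiblyInitialized.
  arm64:  ASR #28 yields all ones iff bits [31:28] are all ones, so
          MVN of the result is zero only for kVisiblyInitialized and
          a single CBNZ branches to the slow path otherwise.
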
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index cf596c7..43d466b 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1756,17 +1756,14 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register temp = temps.AcquireW();
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
-  const size_t status_byte_offset =
-      mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
-  constexpr uint32_t shifted_initialized_value =
-      enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+  constexpr uint32_t visibly_initialized = enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized);
+  static_assert(visibly_initialized == MaxInt<uint32_t>(32u - status_lsb_position),
+                "kVisiblyInitialized must have all bits set");
 
-  // Even if the initialized flag is set, we need to ensure consistent memory ordering.
-  // TODO(vixl): Let the MacroAssembler handle MemOperand.
-  __ Add(temp, class_reg, status_byte_offset);
-  __ Ldarb(temp, HeapOperand(temp));
-  __ Cmp(temp, shifted_initialized_value);
-  __ B(lo, slow_path->GetEntryLabel());
+  const size_t status_offset = mirror::Class::StatusOffset().SizeValue();
+  __ Ldr(temp, HeapOperand(class_reg, status_offset));
+  __ Mvn(temp, Operand(temp, ASR, status_lsb_position));  // Were all the bits of the status set?
+  __ Cbnz(temp, slow_path->GetEntryLabel());              // If not, go to slow path.
   __ Bind(slow_path->GetExitLabel());
 }
 
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 49e7695..b72a1a0 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -7182,17 +7182,13 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   vixl32::Register temp = temps.Acquire();
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
-  const size_t status_byte_offset =
-      mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
-  constexpr uint32_t shifted_initialized_value =
-      enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+  constexpr uint32_t shifted_visibly_initialized_value =
+      enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << status_lsb_position;
 
-  GetAssembler()->LoadFromOffset(kLoadUnsignedByte, temp, class_reg, status_byte_offset);
-  __ Cmp(temp, shifted_initialized_value);
+  const size_t status_offset = mirror::Class::StatusOffset().SizeValue();
+  GetAssembler()->LoadFromOffset(kLoadWord, temp, class_reg, status_offset);
+  __ Cmp(temp, shifted_visibly_initialized_value);
   __ B(lo, slow_path->GetEntryLabel());
-  // Even if the initialized flag is set, we may be in a situation where caches are not synced
-  // properly. Therefore, we do a memory fence.
-  __ Dmb(ISH);
   __ Bind(slow_path->GetExitLabel());
 }
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 7f7e3a5..5159553 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6709,13 +6709,12 @@
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
   const size_t status_byte_offset =
       mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
-  constexpr uint32_t shifted_initialized_value =
-      enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+  constexpr uint32_t shifted_visibly_initialized_value =
+      enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << (status_lsb_position % kBitsPerByte);
 
-  __ cmpb(Address(class_reg,  status_byte_offset), Immediate(shifted_initialized_value));
+  __ cmpb(Address(class_reg,  status_byte_offset), Immediate(shifted_visibly_initialized_value));
   __ j(kBelow, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
-  // No need for memory fence, thanks to the X86 memory model.
 }
 
 void InstructionCodeGeneratorX86::GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 8067b9c..8c8b5e6 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -5837,13 +5837,12 @@
   constexpr size_t status_lsb_position = SubtypeCheckBits::BitStructSizeOf();
   const size_t status_byte_offset =
       mirror::Class::StatusOffset().SizeValue() + (status_lsb_position / kBitsPerByte);
-  constexpr uint32_t shifted_initialized_value =
-      enum_cast<uint32_t>(ClassStatus::kInitialized) << (status_lsb_position % kBitsPerByte);
+  constexpr uint32_t shifted_visibly_initialized_value =
+      enum_cast<uint32_t>(ClassStatus::kVisiblyInitialized) << (status_lsb_position % kBitsPerByte);
 
-  __ cmpb(Address(class_reg,  status_byte_offset), Immediate(shifted_initialized_value));
+  __ cmpb(Address(class_reg,  status_byte_offset), Immediate(shifted_visibly_initialized_value));
   __ j(kBelow, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
-  // No need for memory fence, thanks to the x86-64 memory model.
 }
 
 void InstructionCodeGeneratorX86_64::GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check,