Change type initialization entrypoints to kSaveEverything.

Also avoid the unnecessary read barriers for boot image classes with
kBssEntry or kJitTableAddress (the kBssEntry and JIT work missed the
`read_barrier_option` flag), fix the bit-rotten non-Baker read
barriers on ARM and ARM64, and fix the bit-rotten ARM64 relative
patcher's IsAdrpPatch() used for the erratum 843419 workaround.
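
For reference, a minimal standalone sketch of the reworked
IsAdrpPatch() predicate, using mock types rather than the real
art::linker::LinkerPatch API (the actual change is in
compiler/linker/arm64/relative_patcher_arm64.cc below): any
PC-relative patch other than a relative call is treated as an ADRP
patch when its literal offset points at the ADRP itself.

  // Mock types only; the names below are illustrative, not the ART API.
  #include <cassert>

  enum class PatchType {
    kCallRelative,   // BL to a method; PC-relative but never an ADRP
    kStringRelative,
    kTypeRelative,
    kStringBssEntry,
    kTypeBssEntry,
    kDexCacheArray,  // also covered by the generic PC-relative check
  };

  struct MockPatch {
    PatchType type;
    unsigned literal_offset;  // offset of the instruction being patched
    unsigned pc_insn_offset;  // offset of the ADRP the patch is relative to

    PatchType GetType() const { return type; }
    bool IsPcRelative() const { return true; }  // every type above is PC-relative
    unsigned LiteralOffset() const { return literal_offset; }
    unsigned PcInsnOffset() const { return pc_insn_offset; }
  };

  // Mirrors the new predicate: PC-relative, not a call, and the patched
  // instruction is the ADRP itself (LiteralOffset == PcInsnOffset).
  inline bool IsAdrpPatch(const MockPatch& patch) {
    return (patch.IsPcRelative() && patch.GetType() != PatchType::kCallRelative) &&
           patch.LiteralOffset() == patch.PcInsnOffset();
  }

  int main() {
    MockPatch adrp{PatchType::kTypeBssEntry, /* literal */ 8u, /* adrp */ 8u};
    MockPatch ldr{PatchType::kTypeBssEntry, /* literal */ 12u, /* adrp */ 8u};
    MockPatch call{PatchType::kCallRelative, /* literal */ 16u, /* adrp */ 16u};
    assert(IsAdrpPatch(adrp));   // the ADRP of an ADRP+LDR .bss entry load
    assert(!IsAdrpPatch(ldr));   // the LDR that follows the ADRP
    assert(!IsAdrpPatch(call));  // a relative BL is never an ADRP patch
    return 0;
  }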

aosp_angler-userdebug with CC:
  before:
    arm boot*.oat: 35440420
    arm64 boot*.oat: 43504952
  after:
    arm boot*.oat: 35222292 (-218128, -0.62%)
    arm64 boot*.oat: 43389048 (-115904, -0.26%)

aosp_angler-userdebug without CC:
  before:
    arm boot*.oat: 31927412
    arm64 boot*.oat: 39340512
  after:
    arm boot*.oat: 31708736 (-218676, -0.68%)
    arm64 boot*.oat: 39211768 (-128744, -0.33%)

Test: m test-art-host (non-CC, Baker CC, table lookup CC)
Test: m test-art-target on Nexus 6P (non-CC, Baker CC, table lookup CC)
Test: Nexus 6P boots (non-CC, Baker CC, table lookup CC)
Bug: 30627598
Change-Id: Ida5bbce414844de9e4273e40334165d4494230d4
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index 79e1785..9ddf200 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -31,9 +31,7 @@
 namespace {
 
 inline bool IsAdrpPatch(const LinkerPatch& patch) {
-  LinkerPatch::Type type = patch.GetType();
-  return
-      (type == LinkerPatch::Type::kStringRelative || type == LinkerPatch::Type::kDexCacheArray) &&
+  return (patch.IsPcRelative() && patch.GetType() != LinkerPatch::Type::kCallRelative) &&
       patch.LiteralOffset() == patch.PcInsnOffset();
 }
 
@@ -214,11 +212,11 @@
         DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative ||
                patch.GetType() == LinkerPatch::Type::kTypeRelative) << patch.GetType();
       } else {
-        // With the read barrier (non-Baker) enabled, it could be kDexCacheArray in the
-        // HLoadString::LoadKind::kDexCachePcRelative case of VisitLoadString().
+        // With the read barrier (non-Baker) enabled, it could be kStringBssEntry or kTypeBssEntry.
         DCHECK(patch.GetType() == LinkerPatch::Type::kStringRelative ||
                patch.GetType() == LinkerPatch::Type::kTypeRelative ||
-               patch.GetType() == LinkerPatch::Type::kDexCacheArray) << patch.GetType();
+               patch.GetType() == LinkerPatch::Type::kStringBssEntry ||
+               patch.GetType() == LinkerPatch::Type::kTypeBssEntry) << patch.GetType();
       }
       shift = 0u;  // No shift for ADD.
     } else {
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 20cdae3..06e164f 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -367,22 +367,37 @@
 
 class LoadClassSlowPathARM : public SlowPathCodeARM {
  public:
-  LoadClassSlowPathARM(HLoadClass* cls,
-                       HInstruction* at,
-                       uint32_t dex_pc,
-                       bool do_clinit)
+  LoadClassSlowPathARM(HLoadClass* cls, HInstruction* at, uint32_t dex_pc, bool do_clinit)
       : SlowPathCodeARM(at), cls_(cls), dex_pc_(dex_pc), do_clinit_(do_clinit) {
     DCHECK(at->IsLoadClass() || at->IsClinitCheck());
   }
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
+    Location out = locations->Out();
+    constexpr bool call_saves_everything_except_r0 = (!kUseReadBarrier || kUseBakerReadBarrier);
 
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConvention calling_convention;
+    // For HLoadClass/kBssEntry/kSaveEverything, make sure we preserve the address of the entry.
+    DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
+    bool is_load_class_bss_entry =
+        (cls_ == instruction_) && (cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry);
+    Register entry_address = kNoRegister;
+    if (is_load_class_bss_entry && call_saves_everything_except_r0) {
+      Register temp = locations->GetTemp(0).AsRegister<Register>();
+      // In the unlucky case that the `temp` is R0, we preserve the address in `out` across
+      // the kSaveEverything call.
+      bool temp_is_r0 = (temp == calling_convention.GetRegisterAt(0));
+      entry_address = temp_is_r0 ? out.AsRegister<Register>() : temp;
+      DCHECK_NE(entry_address, calling_convention.GetRegisterAt(0));
+      if (temp_is_r0) {
+        __ mov(entry_address, ShifterOperand(temp));
+      }
+    }
     dex::TypeIndex type_index = cls_->GetTypeIndex();
     __ LoadImmediate(calling_convention.GetRegisterAt(0), type_index.index_);
     QuickEntrypointEnum entrypoint = do_clinit_ ? kQuickInitializeStaticStorage
@@ -394,30 +409,31 @@
       CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
     }
 
+    // For HLoadClass/kBssEntry, store the resolved Class to the BSS entry.
+    if (is_load_class_bss_entry) {
+      if (call_saves_everything_except_r0) {
+        // The class entry address was preserved in `entry_address` thanks to kSaveEverything.
+        __ str(R0, Address(entry_address));
+      } else {
+        // For non-Baker read barrier, we need to re-calculate the address of the type entry.
+        Register temp = IP;
+        CodeGeneratorARM::PcRelativePatchInfo* labels =
+            arm_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index);
+        __ BindTrackedLabel(&labels->movw_label);
+        __ movw(temp, /* placeholder */ 0u);
+        __ BindTrackedLabel(&labels->movt_label);
+        __ movt(temp, /* placeholder */ 0u);
+        __ BindTrackedLabel(&labels->add_pc_label);
+        __ add(temp, temp, ShifterOperand(PC));
+        __ str(R0, Address(temp));
+      }
+    }
     // Move the class to the desired location.
-    Location out = locations->Out();
     if (out.IsValid()) {
       DCHECK(out.IsRegister() && !locations->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
       arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0));
     }
     RestoreLiveRegisters(codegen, locations);
-    // For HLoadClass/kBssEntry, store the resolved Class to the BSS entry.
-    DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
-    if (cls_ == instruction_ && cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) {
-      DCHECK(out.IsValid());
-      // TODO: Change art_quick_initialize_type/art_quick_initialize_static_storage to
-      // kSaveEverything and use a temporary for the .bss entry address in the fast path,
-      // so that we can avoid another calculation here.
-      CodeGeneratorARM::PcRelativePatchInfo* labels =
-          arm_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index);
-      __ BindTrackedLabel(&labels->movw_label);
-      __ movw(IP, /* placeholder */ 0u);
-      __ BindTrackedLabel(&labels->movt_label);
-      __ movt(IP, /* placeholder */ 0u);
-      __ BindTrackedLabel(&labels->add_pc_label);
-      __ add(IP, IP, ShifterOperand(PC));
-      __ str(locations->Out().AsRegister<Register>(), Address(IP));
-    }
     __ b(GetExitLabel());
   }
 
@@ -441,12 +457,13 @@
   explicit LoadStringSlowPathARM(HLoadString* instruction) : SlowPathCodeARM(instruction) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    DCHECK(instruction_->IsLoadString());
+    DCHECK_EQ(instruction_->AsLoadString()->GetLoadKind(), HLoadString::LoadKind::kBssEntry);
     LocationSummary* locations = instruction_->GetLocations();
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg()));
     HLoadString* load = instruction_->AsLoadString();
     const dex::StringIndex string_index = load->GetStringIndex();
     Register out = locations->Out().AsRegister<Register>();
-    Register temp = locations->GetTemp(0).AsRegister<Register>();
     constexpr bool call_saves_everything_except_r0 = (!kUseReadBarrier || kUseBakerReadBarrier);
 
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
@@ -455,12 +472,16 @@
 
     InvokeRuntimeCallingConvention calling_convention;
     // In the unlucky case that the `temp` is R0, we preserve the address in `out` across
-    // the kSaveEverything call (or use `out` for the address after non-kSaveEverything call).
-    bool temp_is_r0 = (temp == calling_convention.GetRegisterAt(0));
-    Register entry_address = temp_is_r0 ? out : temp;
-    DCHECK_NE(entry_address, calling_convention.GetRegisterAt(0));
-    if (call_saves_everything_except_r0 && temp_is_r0) {
-      __ mov(entry_address, ShifterOperand(temp));
+    // the kSaveEverything call.
+    Register entry_address = kNoRegister;
+    if (call_saves_everything_except_r0) {
+      Register temp = locations->GetTemp(0).AsRegister<Register>();
+      bool temp_is_r0 = (temp == calling_convention.GetRegisterAt(0));
+      entry_address = temp_is_r0 ? out : temp;
+      DCHECK_NE(entry_address, calling_convention.GetRegisterAt(0));
+      if (temp_is_r0) {
+        __ mov(entry_address, ShifterOperand(temp));
+      }
     }
 
     __ LoadImmediate(calling_convention.GetRegisterAt(0), string_index.index_);
@@ -473,15 +494,16 @@
       __ str(R0, Address(entry_address));
     } else {
       // For non-Baker read barrier, we need to re-calculate the address of the string entry.
+      Register temp = IP;
       CodeGeneratorARM::PcRelativePatchInfo* labels =
           arm_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index);
       __ BindTrackedLabel(&labels->movw_label);
-      __ movw(entry_address, /* placeholder */ 0u);
+      __ movw(temp, /* placeholder */ 0u);
       __ BindTrackedLabel(&labels->movt_label);
-      __ movt(entry_address, /* placeholder */ 0u);
+      __ movt(temp, /* placeholder */ 0u);
       __ BindTrackedLabel(&labels->add_pc_label);
-      __ add(entry_address, entry_address, ShifterOperand(PC));
-      __ str(R0, Address(entry_address));
+      __ add(temp, temp, ShifterOperand(PC));
+      __ str(R0, Address(temp));
     }
 
     arm_codegen->Move32(locations->Out(), Location::RegisterLocation(R0));
@@ -5755,6 +5777,7 @@
         cls,
         Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
         Location::RegisterLocation(R0));
+    DCHECK_EQ(calling_convention.GetRegisterAt(0), R0);
     return;
   }
   DCHECK(!cls->NeedsAccessCheck());
@@ -5772,6 +5795,22 @@
     locations->SetInAt(0, Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister());
+  if (load_kind == HLoadClass::LoadKind::kBssEntry) {
+    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      // Rely on the type resolution or initialization and marking to save everything we need.
+      // Note that IP may be clobbered by saving/restoring the live register (only one thanks
+      // to the custom calling convention) or by marking, so we request a different temp.
+      locations->AddTemp(Location::RequiresRegister());
+      RegisterSet caller_saves = RegisterSet::Empty();
+      InvokeRuntimeCallingConvention calling_convention;
+      caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+      // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK()
+      // that the kPrimNot result register is the same as the first argument register.
+      locations->SetCustomSlowPathCallerSaves(caller_saves);
+    } else {
+      // For non-Baker read barrier we have a temp-clobbering call.
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
@@ -5834,15 +5873,18 @@
       break;
     }
     case HLoadClass::LoadKind::kBssEntry: {
+      Register temp = (!kUseReadBarrier || kUseBakerReadBarrier)
+          ? locations->GetTemp(0).AsRegister<Register>()
+          : out;
       CodeGeneratorARM::PcRelativePatchInfo* labels =
           codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex());
       __ BindTrackedLabel(&labels->movw_label);
-      __ movw(out, /* placeholder */ 0u);
+      __ movw(temp, /* placeholder */ 0u);
       __ BindTrackedLabel(&labels->movt_label);
-      __ movt(out, /* placeholder */ 0u);
+      __ movt(temp, /* placeholder */ 0u);
       __ BindTrackedLabel(&labels->add_pc_label);
-      __ add(out, out, ShifterOperand(PC));
-      GenerateGcRootFieldLoad(cls, out_loc, out, 0, kCompilerReadBarrierOption);
+      __ add(temp, temp, ShifterOperand(PC));
+      GenerateGcRootFieldLoad(cls, out_loc, temp, /* offset */ 0, read_barrier_option);
       generate_null_check = true;
       break;
     }
@@ -5851,7 +5893,7 @@
                                                                cls->GetTypeIndex(),
                                                                cls->GetClass()));
       // /* GcRoot<mirror::Class> */ out = *out
-      GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, kCompilerReadBarrierOption);
+      GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, read_barrier_option);
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
@@ -5938,9 +5980,9 @@
     locations->SetOut(Location::RequiresRegister());
     if (load_kind == HLoadString::LoadKind::kBssEntry) {
       if (!kUseReadBarrier || kUseBakerReadBarrier) {
-        // Rely on the pResolveString and/or marking to save everything, including temps.
-        // Note that IP may theoretically be clobbered by saving/restoring the live register
-        // (only one thanks to the custom calling convention), so we request a different temp.
+        // Rely on the pResolveString and marking to save everything we need, including temps.
+        // Note that IP may be clobbered by saving/restoring the live register (only one thanks
+        // to the custom calling convention) or by marking, so we request a different temp.
         locations->AddTemp(Location::RequiresRegister());
         RegisterSet caller_saves = RegisterSet::Empty();
         InvokeRuntimeCallingConvention calling_convention;
@@ -5991,7 +6033,9 @@
     }
     case HLoadString::LoadKind::kBssEntry: {
       DCHECK(!codegen_->GetCompilerOptions().IsBootImage());
-      Register temp = locations->GetTemp(0).AsRegister<Register>();
+      Register temp = (!kUseReadBarrier || kUseBakerReadBarrier)
+          ? locations->GetTemp(0).AsRegister<Register>()
+          : out;
       CodeGeneratorARM::PcRelativePatchInfo* labels =
           codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
       __ BindTrackedLabel(&labels->movw_label);
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 598be47..248cee1 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -275,15 +275,37 @@
   LoadClassSlowPathARM64(HLoadClass* cls,
                          HInstruction* at,
                          uint32_t dex_pc,
-                         bool do_clinit)
-      : SlowPathCodeARM64(at), cls_(cls), dex_pc_(dex_pc), do_clinit_(do_clinit) {
+                         bool do_clinit,
+                         vixl::aarch64::Register bss_entry_temp = vixl::aarch64::Register(),
+                         vixl::aarch64::Label* bss_entry_adrp_label = nullptr)
+      : SlowPathCodeARM64(at),
+        cls_(cls),
+        dex_pc_(dex_pc),
+        do_clinit_(do_clinit),
+        bss_entry_temp_(bss_entry_temp),
+        bss_entry_adrp_label_(bss_entry_adrp_label) {
     DCHECK(at->IsLoadClass() || at->IsClinitCheck());
   }
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
+    Location out = locations->Out();
+    constexpr bool call_saves_everything_except_r0_ip0 = (!kUseReadBarrier || kUseBakerReadBarrier);
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
 
+    // For HLoadClass/kBssEntry/kSaveEverything, make sure we preserve the page address of
+    // the entry which is in a scratch register. Make sure it's not used for saving/restoring
+    // registers. Exclude the scratch register also for non-Baker read barrier for simplicity.
+    DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
+    bool is_load_class_bss_entry =
+        (cls_ == instruction_) && (cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry);
+    UseScratchRegisterScope temps(arm64_codegen->GetVIXLAssembler());
+    if (is_load_class_bss_entry) {
+      // This temp is a scratch register.
+      DCHECK(bss_entry_temp_.IsValid());
+      temps.Exclude(bss_entry_temp_);
+    }
+
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
@@ -300,7 +322,6 @@
     }
 
     // Move the class to the desired location.
-    Location out = locations->Out();
     if (out.IsValid()) {
       DCHECK(out.IsRegister() && !locations->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
       Primitive::Type type = instruction_->GetType();
@@ -308,25 +329,23 @@
     }
     RestoreLiveRegisters(codegen, locations);
     // For HLoadClass/kBssEntry, store the resolved Class to the BSS entry.
-    DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
-    if (cls_ == instruction_ && cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) {
+    if (is_load_class_bss_entry) {
       DCHECK(out.IsValid());
-      UseScratchRegisterScope temps(arm64_codegen->GetVIXLAssembler());
-      Register temp = temps.AcquireX();
       const DexFile& dex_file = cls_->GetDexFile();
-      // TODO: Change art_quick_initialize_type/art_quick_initialize_static_storage to
-      // kSaveEverything and use a temporary for the ADRP in the fast path, so that we
-      // can avoid the ADRP here.
-      vixl::aarch64::Label* adrp_label =
-          arm64_codegen->NewBssEntryTypePatch(dex_file, type_index);
-      arm64_codegen->EmitAdrpPlaceholder(adrp_label, temp);
+      if (call_saves_everything_except_r0_ip0) {
+        // The class entry page address was preserved in bss_entry_temp_ thanks to kSaveEverything.
+      } else {
+        // For non-Baker read barrier, we need to re-calculate the address of the class entry page.
+        bss_entry_adrp_label_ = arm64_codegen->NewBssEntryTypePatch(dex_file, type_index);
+        arm64_codegen->EmitAdrpPlaceholder(bss_entry_adrp_label_, bss_entry_temp_);
+      }
       vixl::aarch64::Label* strp_label =
-          arm64_codegen->NewBssEntryTypePatch(dex_file, type_index, adrp_label);
+          arm64_codegen->NewBssEntryTypePatch(dex_file, type_index, bss_entry_adrp_label_);
       {
         SingleEmissionCheckScope guard(arm64_codegen->GetVIXLAssembler());
         __ Bind(strp_label);
         __ str(RegisterFrom(locations->Out(), Primitive::kPrimNot),
-               MemOperand(temp, /* offset placeholder */ 0));
+               MemOperand(bss_entry_temp_, /* offset placeholder */ 0));
       }
     }
     __ B(GetExitLabel());
@@ -344,6 +363,10 @@
   // Whether to initialize the class.
   const bool do_clinit_;
 
+  // For HLoadClass/kBssEntry, the temp register and the label of the ADRP where it was loaded.
+  vixl::aarch64::Register bss_entry_temp_;
+  vixl::aarch64::Label* bss_entry_adrp_label_;
+
   DISALLOW_COPY_AND_ASSIGN(LoadClassSlowPathARM64);
 };
 
@@ -4393,6 +4416,7 @@
         cls,
         LocationFrom(calling_convention.GetRegisterAt(0)),
         LocationFrom(vixl::aarch64::x0));
+    DCHECK(calling_convention.GetRegisterAt(0).Is(vixl::aarch64::x0));
     return;
   }
   DCHECK(!cls->NeedsAccessCheck());
@@ -4410,6 +4434,22 @@
     locations->SetInAt(0, Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister());
+  if (cls->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) {
+    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      // Rely on the type resolution or initialization and marking to save everything we need.
+      // Note that IP0 may be clobbered by saving/restoring the live register (only one thanks
+      // to the custom calling convention) or by marking, so we shall use IP1.
+      RegisterSet caller_saves = RegisterSet::Empty();
+      InvokeRuntimeCallingConvention calling_convention;
+      caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode()));
+      DCHECK_EQ(calling_convention.GetRegisterAt(0).GetCode(),
+                RegisterFrom(calling_convention.GetReturnLocation(Primitive::kPrimNot),
+                             Primitive::kPrimNot).GetCode());
+      locations->SetCustomSlowPathCallerSaves(caller_saves);
+    } else {
+      // For non-Baker read barrier we have a temp-clobbering call.
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
@@ -4424,6 +4464,8 @@
 
   Location out_loc = cls->GetLocations()->Out();
   Register out = OutputRegister(cls);
+  Register bss_entry_temp;
+  vixl::aarch64::Label* bss_entry_adrp_label = nullptr;
 
   const ReadBarrierOption read_barrier_option = cls->IsInBootImage()
       ? kWithoutReadBarrier
@@ -4473,18 +4515,23 @@
       // Add ADRP with its PC-relative Class .bss entry patch.
       const DexFile& dex_file = cls->GetDexFile();
       dex::TypeIndex type_index = cls->GetTypeIndex();
-      vixl::aarch64::Label* adrp_label = codegen_->NewBssEntryTypePatch(dex_file, type_index);
-      codegen_->EmitAdrpPlaceholder(adrp_label, out.X());
+      // We can go to slow path even with non-zero reference and in that case marking
+      // can clobber IP0, so we need to use IP1 which shall be preserved.
+      bss_entry_temp = ip1;
+      UseScratchRegisterScope temps(codegen_->GetVIXLAssembler());
+      temps.Exclude(bss_entry_temp);
+      bss_entry_adrp_label = codegen_->NewBssEntryTypePatch(dex_file, type_index);
+      codegen_->EmitAdrpPlaceholder(bss_entry_adrp_label, bss_entry_temp);
       // Add LDR with its PC-relative Class patch.
       vixl::aarch64::Label* ldr_label =
-          codegen_->NewBssEntryTypePatch(dex_file, type_index, adrp_label);
+          codegen_->NewBssEntryTypePatch(dex_file, type_index, bss_entry_adrp_label);
       // /* GcRoot<mirror::Class> */ out = *(base_address + offset)  /* PC-relative */
       GenerateGcRootFieldLoad(cls,
-                              cls->GetLocations()->Out(),
-                              out.X(),
-                              /* placeholder */ 0u,
+                              out_loc,
+                              bss_entry_temp,
+                              /* offset placeholder */ 0u,
                               ldr_label,
-                              kCompilerReadBarrierOption);
+                              read_barrier_option);
       generate_null_check = true;
       break;
     }
@@ -4497,7 +4544,7 @@
                               out.X(),
                               /* offset */ 0,
                               /* fixup_label */ nullptr,
-                              kCompilerReadBarrierOption);
+                              read_barrier_option);
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
@@ -4506,10 +4553,11 @@
       UNREACHABLE();
   }
 
-  if (generate_null_check || cls->MustGenerateClinitCheck()) {
+  bool do_clinit = cls->MustGenerateClinitCheck();
+  if (generate_null_check || do_clinit) {
     DCHECK(cls->CanCallRuntime());
     SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadClassSlowPathARM64(
-        cls, cls, cls->GetDexPc(), cls->MustGenerateClinitCheck());
+        cls, cls, cls->GetDexPc(), do_clinit, bss_entry_temp, bss_entry_adrp_label);
     codegen_->AddSlowPath(slow_path);
     if (generate_null_check) {
       __ Cbz(out, slow_path->GetEntryLabel());
@@ -4577,7 +4625,9 @@
     locations->SetOut(Location::RequiresRegister());
     if (load->GetLoadKind() == HLoadString::LoadKind::kBssEntry) {
       if (!kUseReadBarrier || kUseBakerReadBarrier) {
-        // Rely on the pResolveString and/or marking to save everything, including temps.
+        // Rely on the pResolveString and marking to save everything we need.
+        // Note that IP0 may be clobbered by saving/restoring the live register (only one thanks
+        // to the custom calling convention) or by marking, so we shall use IP1.
         RegisterSet caller_saves = RegisterSet::Empty();
         InvokeRuntimeCallingConvention calling_convention;
         caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0).GetCode()));
@@ -4628,8 +4678,11 @@
       const DexFile& dex_file = load->GetDexFile();
       const dex::StringIndex string_index = load->GetStringIndex();
       DCHECK(!codegen_->GetCompilerOptions().IsBootImage());
+      // We could use IP0 as the marking shall not clobber IP0 if the reference is null and
+      // that's when we need the slow path. But let's not rely on such details and use IP1.
+      Register temp = ip1;
       UseScratchRegisterScope temps(codegen_->GetVIXLAssembler());
-      Register temp = temps.AcquireX();
+      temps.Exclude(temp);
       vixl::aarch64::Label* adrp_label = codegen_->NewPcRelativeStringPatch(dex_file, string_index);
       codegen_->EmitAdrpPlaceholder(adrp_label, temp);
       // Add LDR with its PC-relative String patch.
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index e189608..5c4ca5b 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -400,12 +400,30 @@
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     LocationSummary* locations = instruction_->GetLocations();
+    Location out = locations->Out();
+    constexpr bool call_saves_everything_except_r0 = (!kUseReadBarrier || kUseBakerReadBarrier);
 
     CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
     __ Bind(GetEntryLabel());
     SaveLiveRegisters(codegen, locations);
 
     InvokeRuntimeCallingConventionARMVIXL calling_convention;
+    // For HLoadClass/kBssEntry/kSaveEverything, make sure we preserve the address of the entry.
+    DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
+    bool is_load_class_bss_entry =
+        (cls_ == instruction_) && (cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry);
+    vixl32::Register entry_address;
+    if (is_load_class_bss_entry && call_saves_everything_except_r0) {
+      vixl32::Register temp = RegisterFrom(locations->GetTemp(0));
+      // In the unlucky case that the `temp` is R0, we preserve the address in `out` across
+      // the kSaveEverything call.
+      bool temp_is_r0 = temp.Is(calling_convention.GetRegisterAt(0));
+      entry_address = temp_is_r0 ? RegisterFrom(out) : temp;
+      DCHECK(!entry_address.Is(calling_convention.GetRegisterAt(0)));
+      if (temp_is_r0) {
+        __ Mov(entry_address, temp);
+      }
+    }
     dex::TypeIndex type_index = cls_->GetTypeIndex();
     __ Mov(calling_convention.GetRegisterAt(0), type_index.index_);
     QuickEntrypointEnum entrypoint = do_clinit_ ? kQuickInitializeStaticStorage
@@ -417,27 +435,28 @@
       CheckEntrypointTypes<kQuickInitializeType, void*, uint32_t>();
     }
 
+    // For HLoadClass/kBssEntry, store the resolved Class to the BSS entry.
+    if (is_load_class_bss_entry) {
+      if (call_saves_everything_except_r0) {
+        // The class entry address was preserved in `entry_address` thanks to kSaveEverything.
+        __ Str(r0, MemOperand(entry_address));
+      } else {
+        // For non-Baker read barrier, we need to re-calculate the address of the type entry.
+        UseScratchRegisterScope temps(
+            down_cast<CodeGeneratorARMVIXL*>(codegen)->GetVIXLAssembler());
+        vixl32::Register temp = temps.Acquire();
+        CodeGeneratorARMVIXL::PcRelativePatchInfo* labels =
+            arm_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index);
+        arm_codegen->EmitMovwMovtPlaceholder(labels, temp);
+        __ Str(r0, MemOperand(temp));
+      }
+    }
     // Move the class to the desired location.
-    Location out = locations->Out();
     if (out.IsValid()) {
       DCHECK(out.IsRegister() && !locations->GetLiveRegisters()->ContainsCoreRegister(out.reg()));
       arm_codegen->Move32(locations->Out(), LocationFrom(r0));
     }
     RestoreLiveRegisters(codegen, locations);
-    // For HLoadClass/kBssEntry, store the resolved Class to the BSS entry.
-    DCHECK_EQ(instruction_->IsLoadClass(), cls_ == instruction_);
-    if (cls_ == instruction_ && cls_->GetLoadKind() == HLoadClass::LoadKind::kBssEntry) {
-      DCHECK(out.IsValid());
-      // TODO: Change art_quick_initialize_type/art_quick_initialize_static_storage to
-      // kSaveEverything and use a temporary for the .bss entry address in the fast path,
-      // so that we can avoid another calculation here.
-      UseScratchRegisterScope temps(down_cast<CodeGeneratorARMVIXL*>(codegen)->GetVIXLAssembler());
-      vixl32::Register temp = temps.Acquire();
-      CodeGeneratorARMVIXL::PcRelativePatchInfo* labels =
-          arm_codegen->NewTypeBssEntryPatch(cls_->GetDexFile(), type_index);
-      arm_codegen->EmitMovwMovtPlaceholder(labels, temp);
-      __ Str(OutputRegister(cls_), MemOperand(temp));
-    }
     __ B(GetExitLabel());
   }
 
@@ -462,12 +481,13 @@
       : SlowPathCodeARMVIXL(instruction) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    DCHECK(instruction_->IsLoadString());
+    DCHECK_EQ(instruction_->AsLoadString()->GetLoadKind(), HLoadString::LoadKind::kBssEntry);
     LocationSummary* locations = instruction_->GetLocations();
     DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(locations->Out().reg()));
     HLoadString* load = instruction_->AsLoadString();
     const dex::StringIndex string_index = load->GetStringIndex();
     vixl32::Register out = OutputRegister(load);
-    vixl32::Register temp = RegisterFrom(locations->GetTemp(0));
     constexpr bool call_saves_everything_except_r0 = (!kUseReadBarrier || kUseBakerReadBarrier);
 
     CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
@@ -476,12 +496,16 @@
 
     InvokeRuntimeCallingConventionARMVIXL calling_convention;
     // In the unlucky case that the `temp` is R0, we preserve the address in `out` across
-    // the kSaveEverything call (or use `out` for the address after non-kSaveEverything call).
-    bool temp_is_r0 = (temp.Is(calling_convention.GetRegisterAt(0)));
-    vixl32::Register entry_address = temp_is_r0 ? out : temp;
-    DCHECK(!entry_address.Is(calling_convention.GetRegisterAt(0)));
-    if (call_saves_everything_except_r0 && temp_is_r0) {
-      __ Mov(entry_address, temp);
+    // the kSaveEverything call.
+    vixl32::Register entry_address;
+    if (call_saves_everything_except_r0) {
+      vixl32::Register temp = RegisterFrom(locations->GetTemp(0));
+      bool temp_is_r0 = (temp.Is(calling_convention.GetRegisterAt(0)));
+      entry_address = temp_is_r0 ? out : temp;
+      DCHECK(!entry_address.Is(calling_convention.GetRegisterAt(0)));
+      if (temp_is_r0) {
+        __ Mov(entry_address, temp);
+      }
     }
 
     __ Mov(calling_convention.GetRegisterAt(0), string_index.index_);
@@ -494,10 +518,13 @@
       __ Str(r0, MemOperand(entry_address));
     } else {
       // For non-Baker read barrier, we need to re-calculate the address of the string entry.
+      UseScratchRegisterScope temps(
+          down_cast<CodeGeneratorARMVIXL*>(codegen)->GetVIXLAssembler());
+      vixl32::Register temp = temps.Acquire();
       CodeGeneratorARMVIXL::PcRelativePatchInfo* labels =
           arm_codegen->NewPcRelativeStringPatch(load->GetDexFile(), string_index);
-      arm_codegen->EmitMovwMovtPlaceholder(labels, out);
-      __ Str(r0, MemOperand(entry_address));
+      arm_codegen->EmitMovwMovtPlaceholder(labels, temp);
+      __ Str(r0, MemOperand(temp));
     }
 
     arm_codegen->Move32(locations->Out(), LocationFrom(r0));
@@ -5832,6 +5859,7 @@
         cls,
         LocationFrom(calling_convention.GetRegisterAt(0)),
         LocationFrom(r0));
+    DCHECK(calling_convention.GetRegisterAt(0).Is(r0));
     return;
   }
   DCHECK(!cls->NeedsAccessCheck());
@@ -5849,6 +5877,22 @@
     locations->SetInAt(0, Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister());
+  if (load_kind == HLoadClass::LoadKind::kBssEntry) {
+    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      // Rely on the type resolution or initialization and marking to save everything we need.
+      // Note that IP may be clobbered by saving/restoring the live register (only one thanks
+      // to the custom calling convention) or by marking, so we request a different temp.
+      locations->AddTemp(Location::RequiresRegister());
+      RegisterSet caller_saves = RegisterSet::Empty();
+      InvokeRuntimeCallingConventionARMVIXL calling_convention;
+      caller_saves.Add(LocationFrom(calling_convention.GetRegisterAt(0)));
+      // TODO: Add GetReturnLocation() to the calling convention so that we can DCHECK()
+      // that the kPrimNot result register is the same as the first argument register.
+      locations->SetCustomSlowPathCallerSaves(caller_saves);
+    } else {
+      // For non-Baker read barrier we have a temp-clobbering call.
+    }
+  }
 }
 
 // NO_THREAD_SAFETY_ANALYSIS as we manipulate handles whose internal object we know does not
@@ -5906,10 +5950,13 @@
       break;
     }
     case HLoadClass::LoadKind::kBssEntry: {
+      vixl32::Register temp = (!kUseReadBarrier || kUseBakerReadBarrier)
+          ? RegisterFrom(locations->GetTemp(0))
+          : out;
       CodeGeneratorARMVIXL::PcRelativePatchInfo* labels =
           codegen_->NewTypeBssEntryPatch(cls->GetDexFile(), cls->GetTypeIndex());
-      codegen_->EmitMovwMovtPlaceholder(labels, out);
-      GenerateGcRootFieldLoad(cls, out_loc, out, 0, kCompilerReadBarrierOption);
+      codegen_->EmitMovwMovtPlaceholder(labels, temp);
+      GenerateGcRootFieldLoad(cls, out_loc, temp, /* offset */ 0, read_barrier_option);
       generate_null_check = true;
       break;
     }
@@ -5918,7 +5965,7 @@
                                                        cls->GetTypeIndex(),
                                                        cls->GetClass()));
       // /* GcRoot<mirror::Class> */ out = *out
-      GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, kCompilerReadBarrierOption);
+      GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, read_barrier_option);
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
@@ -6012,9 +6059,9 @@
     locations->SetOut(Location::RequiresRegister());
     if (load_kind == HLoadString::LoadKind::kBssEntry) {
       if (!kUseReadBarrier || kUseBakerReadBarrier) {
-        // Rely on the pResolveString and/or marking to save everything, including temps.
-        // Note that IP may theoretically be clobbered by saving/restoring the live register
-        // (only one thanks to the custom calling convention), so we request a different temp.
+        // Rely on the pResolveString and marking to save everything we need, including temps.
+        // Note that IP may be clobbered by saving/restoring the live register (only one thanks
+        // to the custom calling convention) or by marking, so we request a different temp.
         locations->AddTemp(Location::RequiresRegister());
         RegisterSet caller_saves = RegisterSet::Empty();
         InvokeRuntimeCallingConventionARMVIXL calling_convention;
@@ -6059,7 +6106,9 @@
     }
     case HLoadString::LoadKind::kBssEntry: {
       DCHECK(!codegen_->GetCompilerOptions().IsBootImage());
-      vixl32::Register temp = RegisterFrom(locations->GetTemp(0));
+      vixl32::Register temp = (!kUseReadBarrier || kUseBakerReadBarrier)
+          ? RegisterFrom(locations->GetTemp(0))
+          : out;
       CodeGeneratorARMVIXL::PcRelativePatchInfo* labels =
           codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
       codegen_->EmitMovwMovtPlaceholder(labels, temp);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 137b554..48a82b8 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -6057,6 +6057,7 @@
         cls,
         Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
         Location::RegisterLocation(EAX));
+    DCHECK_EQ(calling_convention.GetRegisterAt(0), EAX);
     return;
   }
   DCHECK(!cls->NeedsAccessCheck());
@@ -6076,6 +6077,17 @@
     locations->SetInAt(0, Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister());
+  if (load_kind == HLoadClass::LoadKind::kBssEntry) {
+    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      // Rely on the type resolution and/or initialization to save everything.
+      RegisterSet caller_saves = RegisterSet::Empty();
+      InvokeRuntimeCallingConvention calling_convention;
+      caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
+      locations->SetCustomSlowPathCallerSaves(caller_saves);
+    } else {
+      // For non-Baker read barrier we have a temp-clobbering call.
+    }
+  }
 }
 
 Label* CodeGeneratorX86::NewJitRootClassPatch(const DexFile& dex_file,
@@ -6158,7 +6170,7 @@
       Label* fixup_label = codegen_->NewJitRootClassPatch(
           cls->GetDexFile(), cls->GetTypeIndex(), cls->GetClass());
       // /* GcRoot<mirror::Class> */ out = *address
-      GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, kCompilerReadBarrierOption);
+      GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, read_barrier_option);
       break;
     }
     case HLoadClass::LoadKind::kDexCacheViaMethod:
@@ -6250,7 +6262,7 @@
     locations->SetOut(Location::RequiresRegister());
     if (load_kind == HLoadString::LoadKind::kBssEntry) {
       if (!kUseReadBarrier || kUseBakerReadBarrier) {
-        // Rely on the pResolveString and/or marking to save everything.
+        // Rely on the pResolveString to save everything.
         RegisterSet caller_saves = RegisterSet::Empty();
         InvokeRuntimeCallingConvention calling_convention;
         caller_saves.Add(Location::RegisterLocation(calling_convention.GetRegisterAt(0)));
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index c5367ce..c71f5e9 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -245,9 +245,8 @@
 
     SaveLiveRegisters(codegen, locations);
 
-    InvokeRuntimeCallingConvention calling_convention;
-    __ movl(CpuRegister(calling_convention.GetRegisterAt(0)),
-            Immediate(cls_->GetTypeIndex().index_));
+    // Custom calling convention: RAX serves as both input and output.
+    __ movl(CpuRegister(RAX), Immediate(cls_->GetTypeIndex().index_));
     x86_64_codegen->InvokeRuntime(do_clinit_ ? kQuickInitializeStaticStorage : kQuickInitializeType,
                                   instruction_,
                                   dex_pc_,
@@ -5456,10 +5455,10 @@
 void LocationsBuilderX86_64::VisitLoadClass(HLoadClass* cls) {
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
   if (load_kind == HLoadClass::LoadKind::kDexCacheViaMethod) {
-    InvokeRuntimeCallingConvention calling_convention;
+    // Custom calling convention: RAX serves as both input and output.
     CodeGenerator::CreateLoadClassRuntimeCallLocationSummary(
         cls,
-        Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+        Location::RegisterLocation(RAX),
         Location::RegisterLocation(RAX));
     return;
   }
@@ -5478,6 +5477,17 @@
     locations->SetInAt(0, Location::RequiresRegister());
   }
   locations->SetOut(Location::RequiresRegister());
+  if (load_kind == HLoadClass::LoadKind::kBssEntry) {
+    if (!kUseReadBarrier || kUseBakerReadBarrier) {
+      // Rely on the type resolution and/or initialization to save everything.
+      // Custom calling convention: RAX serves as both input and output.
+      RegisterSet caller_saves = RegisterSet::Empty();
+      caller_saves.Add(Location::RegisterLocation(RAX));
+      locations->SetCustomSlowPathCallerSaves(caller_saves);
+    } else {
+      // For non-Baker read barrier we have a temp-clobbering call.
+    }
+  }
 }
 
 Label* CodeGeneratorX86_64::NewJitRootClassPatch(const DexFile& dex_file,
@@ -5553,7 +5563,7 @@
       Label* fixup_label =
           codegen_->NewJitRootClassPatch(cls->GetDexFile(), cls->GetTypeIndex(), cls->GetClass());
       // /* GcRoot<mirror::Class> */ out = *address
-      GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, kCompilerReadBarrierOption);
+      GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, read_barrier_option);
       break;
     }
     default:
@@ -5629,7 +5639,7 @@
     locations->SetOut(Location::RequiresRegister());
     if (load->GetLoadKind() == HLoadString::LoadKind::kBssEntry) {
       if (!kUseReadBarrier || kUseBakerReadBarrier) {
-        // Rely on the pResolveString and/or marking to save everything.
+        // Rely on the pResolveString to save everything.
         // Custom calling convention: RAX serves as both input and output.
         RegisterSet caller_saves = RegisterSet::Empty();
         caller_saves.Add(Location::RegisterLocation(RAX));