Allocate dex cache arrays at startup.

And deallocate them post startup.

Local app startup measurements, averaged over 1000 runs,
on a Pixel 5 running master:
- youtube: 1035 -> 985
- maps: 1142 -> 877

Test: test.py
Change-Id: I4fb6e7e2cecdea8bf977ef3b59d47efd0b8b9b74
diff --git a/runtime/linear_alloc.h b/runtime/linear_alloc.h
index fe92d19..7353721 100644
--- a/runtime/linear_alloc.h
+++ b/runtime/linear_alloc.h
@@ -61,7 +61,6 @@
 
 std::ostream& operator<<(std::ostream& os, LinearAllocKind value);
 
-// TODO: Support freeing if we add class unloading.
 class LinearAlloc {
  public:
   static constexpr size_t kAlignment = 8u;
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 0b6bb14..5a44fff 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -54,7 +54,7 @@
 }
 
 template<typename T>
-T* DexCache::AllocArray(MemberOffset obj_offset, size_t num, LinearAllocKind kind) {
+T* DexCache::AllocArray(MemberOffset obj_offset, size_t num, LinearAllocKind kind, bool startup) {
   Thread* self = Thread::Current();
   mirror::DexCache* dex_cache = this;
   if (gUseReadBarrier && self->GetIsGcMarking()) {
@@ -63,8 +63,14 @@
     dex_cache = reinterpret_cast<DexCache*>(ReadBarrier::Mark(this));
   }
   // DON'T USE 'this' from now on.
-  ClassLinker* linker = Runtime::Current()->GetClassLinker();
-  LinearAlloc* alloc = linker->GetOrCreateAllocatorForClassLoader(GetClassLoader());
+  Runtime* runtime = Runtime::Current();
+  // Note: in the 1002-notify-startup test, the startup linear alloc can become null
+  // concurrently, even if the runtime is marked as being at startup. Therefore we
+  // should only fetch it once here.
+  LinearAlloc* startup_linear_alloc = runtime->GetStartupLinearAlloc();
+  LinearAlloc* alloc = (startup && startup_linear_alloc != nullptr)
+      ? startup_linear_alloc
+      : runtime->GetClassLinker()->GetOrCreateAllocatorForClassLoader(GetClassLoader());
   MutexLock mu(self, *Locks::dex_cache_lock_);  // Avoid allocation by multiple threads.
   T* array = dex_cache->GetFieldPtr64<T*>(obj_offset);
   if (array != nullptr) {
diff --git a/runtime/mirror/dex_cache.cc b/runtime/mirror/dex_cache.cc
index 7d0c97f..d758c97 100644
--- a/runtime/mirror/dex_cache.cc
+++ b/runtime/mirror/dex_cache.cc
@@ -165,5 +165,21 @@
   return GetFieldObject<ClassLoader>(OFFSET_OF_OBJECT_MEMBER(DexCache, class_loader_));
 }
 
+bool DexCache::AtStartup() {
+  return !Runtime::Current()->GetStartupCompleted();
+}
+
+void DexCache::UnlinkStartupCaches() {
+  if (GetDexFile() == nullptr) {
+    // Unused dex cache.
+    return;
+  }
+  UnlinkStringsArrayIfStartup();
+  UnlinkResolvedFieldsArrayIfStartup();
+  UnlinkResolvedMethodsArrayIfStartup();
+  UnlinkResolvedTypesArrayIfStartup();
+  UnlinkResolvedMethodTypesArrayIfStartup();
+}
+
 }  // namespace mirror
 }  // namespace art
diff --git a/runtime/mirror/dex_cache.h b/runtime/mirror/dex_cache.h
index 4c0c35d..86eb2de 100644
--- a/runtime/mirror/dex_cache.h
+++ b/runtime/mirror/dex_cache.h
@@ -365,6 +365,10 @@
   void VisitNativeRoots(const Visitor& visitor)
       REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
 
+  // Resets to null the dex cache array fields that were allocated with the
+  // startup allocator.
+  void UnlinkStartupCaches() REQUIRES_SHARED(Locks::mutator_lock_);
+
 // NOLINTBEGIN(bugprone-macro-parentheses)
 #define DEFINE_ARRAY(name, array_kind, getter_setter, type, ids, alloc_kind) \
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags> \
@@ -380,10 +384,10 @@
   static constexpr MemberOffset getter_setter ##Offset() { \
     return OFFSET_OF_OBJECT_MEMBER(DexCache, name); \
   } \
-  array_kind* Allocate ##getter_setter() \
+  array_kind* Allocate ##getter_setter(bool startup = false) \
       REQUIRES_SHARED(Locks::mutator_lock_) { \
     return reinterpret_cast<array_kind*>(AllocArray<type>( \
-        getter_setter ##Offset(), GetDexFile()->ids(), alloc_kind)); \
+        getter_setter ##Offset(), GetDexFile()->ids(), alloc_kind, startup)); \
   } \
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags> \
   size_t Num ##getter_setter() REQUIRES_SHARED(Locks::mutator_lock_) { \
@@ -443,8 +447,9 @@
     } else { \
       auto* pairs = Get ##getter_setter(); \
       if (pairs == nullptr) { \
-        if (GetDexFile()->ids() <= pair_size) { \
-          array = Allocate ##getter_setter ##Array(); \
+        bool should_allocate_full_array = ShouldAllocateFullArray(GetDexFile()->ids(), pair_size); \
+        if (AtStartup() || should_allocate_full_array) { \
+          array = Allocate ##getter_setter ##Array(!should_allocate_full_array); \
           array->Set(index, resolved); \
         } else { \
           pairs = Allocate ##getter_setter(); \
@@ -454,6 +459,12 @@
         pairs->Set(index, resolved); \
       } \
     } \
+  } \
+  void Unlink ##getter_setter ##ArrayIfStartup() \
+      REQUIRES_SHARED(Locks::mutator_lock_) { \
+    if (!ShouldAllocateFullArray(GetDexFile()->ids(), pair_size)) { \
+        Set ##getter_setter ##Array(nullptr); \
+    } \
   }
 
   DEFINE_ARRAY(resolved_call_sites_,
@@ -523,7 +534,7 @@
  private:
   // Allocate new array in linear alloc and save it in the given fields.
   template<typename T>
-  T* AllocArray(MemberOffset obj_offset, size_t num, LinearAllocKind kind)
+  T* AllocArray(MemberOffset obj_offset, size_t num, LinearAllocKind kind, bool startup = false)
      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Visit instance fields of the dex cache as well as its associated arrays.
@@ -534,6 +545,15 @@
   void VisitReferences(ObjPtr<Class> klass, const Visitor& visitor)
       REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
 
+  // Returns whether the runtime is currently in startup mode.
+  static bool AtStartup();
+
+  // Returns whether we should allocate a full array given the number of
+  // elements.
+  static bool ShouldAllocateFullArray(size_t number_of_elements, size_t dex_cache_size) {
+    return number_of_elements <= dex_cache_size;
+  }
+
   HeapReference<ClassLoader> class_loader_;
   HeapReference<String> location_;
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 8752125..977c916 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -525,6 +525,7 @@
   // Destroy allocators before shutting down the MemMap because they may use it.
   java_vm_.reset();
   linear_alloc_.reset();
+  startup_linear_alloc_.reset();
   linear_alloc_arena_pool_.reset();
   arena_pool_.reset();
   jit_arena_pool_.reset();
@@ -1748,6 +1749,7 @@
     linear_alloc_arena_pool_.reset(new MemMapArenaPool(low_4gb));
   }
   linear_alloc_.reset(CreateLinearAlloc());
+  startup_linear_alloc_.reset(CreateLinearAlloc());
 
   small_irt_allocator_ = new SmallIrtAllocator();
 
@@ -3306,6 +3308,14 @@
   startup_completed_.store(false, std::memory_order_seq_cst);
 }
 
+class UnlinkStartupDexCacheVisitor : public DexCacheVisitor {
+ public:
+  void Visit(ObjPtr<mirror::DexCache> dex_cache)
+      REQUIRES_SHARED(Locks::dex_lock_, Locks::mutator_lock_) override {
+    dex_cache->UnlinkStartupCaches();
+  }
+};
+
 class Runtime::NotifyStartupCompletedTask : public gc::HeapTask {
  public:
   NotifyStartupCompletedTask() : gc::HeapTask(/*target_run_time=*/ NanoTime()) {}
@@ -3313,11 +3323,25 @@
   void Run(Thread* self) override {
     VLOG(startup) << "NotifyStartupCompletedTask running";
     Runtime* const runtime = Runtime::Current();
+    // Fetch the startup linear alloc before the checkpoint to play nice with
+    // 1002-notify-startup test which resets the startup state.
+    std::unique_ptr<LinearAlloc> startup_linear_alloc(runtime->ReleaseStartupLinearAlloc());
     {
-      ScopedTrace trace("Releasing app image spaces metadata");
+      ScopedTrace trace("Releasing dex caches and app image spaces metadata");
       ScopedObjectAccess soa(Thread::Current());
-      // Request empty checkpoints to make sure no threads are accessing the image space metadata
-      // section when we madvise it. Use GC exclusion to prevent deadlocks that may happen if
+
+      {
+        // Unlink dex caches that were allocated with the startup linear alloc.
+        UnlinkStartupDexCacheVisitor visitor;
+        ReaderMutexLock mu(self, *Locks::dex_lock_);
+        runtime->GetClassLinker()->VisitDexCaches(&visitor);
+      }
+
+      // Request empty checkpoints to make sure no threads are:
+      // - accessing the image space metadata section when we madvise it
+      // - accessing dex caches when we free them
+      //
+      // Use GC exclusion to prevent deadlocks that may happen if
       // multiple threads are attempting to run empty checkpoints at the same time.
       {
         // Avoid using ScopedGCCriticalSection since that does not allow thread suspension. This is
@@ -3328,6 +3352,7 @@
                                                        gc::kCollectorTypeCriticalSection);
         runtime->GetThreadList()->RunEmptyCheckpoint();
       }
+
       for (gc::space::ContinuousSpace* space : runtime->GetHeap()->GetContinuousSpaces()) {
         if (space->IsImageSpace()) {
           gc::space::ImageSpace* image_space = space->AsImageSpace();
@@ -3343,6 +3368,13 @@
       ScopedTrace trace2("Delete thread pool");
       runtime->DeleteThreadPool();
     }
+
+    {
+      // We know that after the checkpoint, there is no thread that can hold
+      // the startup linear alloc, so it's safe to delete it now.
+      ScopedTrace trace2("Delete startup linear alloc");
+      startup_linear_alloc.reset();
+    }
   }
 };
 
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 10ee4ae..0cebdab 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -820,6 +820,10 @@
     return linear_alloc_.get();
   }
 
+  LinearAlloc* GetStartupLinearAlloc() {
+    return startup_linear_alloc_.get();
+  }
+
   jit::JitOptions* GetJITOptions() {
     return jit_options_.get();
   }
@@ -1062,6 +1066,10 @@
     ThreadPool* const thread_pool_;
   };
 
+  LinearAlloc* ReleaseStartupLinearAlloc() {
+    return startup_linear_alloc_.release();
+  }
+
   bool LoadAppImageStartupCache() const {
     return load_app_image_startup_cache_;
   }
@@ -1278,6 +1286,10 @@
   // Shared linear alloc for now.
   std::unique_ptr<LinearAlloc> linear_alloc_;
 
+  // Linear alloc used for allocations during startup. Will be deleted after
+  // startup.
+  std::unique_ptr<LinearAlloc> startup_linear_alloc_;
+
   // The number of spins that are done before thread suspension is used to forcibly inflate.
   size_t max_spins_before_thin_lock_inflation_;
   MonitorList* monitor_list_;