-rw-r--r--  runtime/alloc_instrumentation.md  | 33
-rw-r--r--  runtime/gc/heap-inl.h             | 27
-rw-r--r--  runtime/gc/heap.h                 | 11
3 files changed, 58 insertions, 13 deletions
diff --git a/runtime/alloc_instrumentation.md b/runtime/alloc_instrumentation.md
new file mode 100644
index 0000000000..513bbe3809
--- /dev/null
+++ b/runtime/alloc_instrumentation.md
@@ -0,0 +1,33 @@
+In order to buy some performance on the common, uninstrumented, fast path, we replace repeated
+checks for both allocation instrumentation and allocator changes with a single function table
+dispatch, plus templatized allocation code that can be used to generate either instrumented
+or uninstrumented versions of allocation routines.
+
+When we call an allocation routine, we always indirect through a thread-local function table that
+points to either instrumented or uninstrumented allocation routines. The instrumented code has a
+`kInstrumented` = true template argument (or `kIsInstrumented` in some places); the uninstrumented
+code has `kInstrumented` = false.
+
+The function table is thread-local. There appears to be no logical necessity for that; it just
+makes it easier to access from compiled Java code.
+
+- The function table is switched out by `InstrumentQuickAllocEntryPoints[Locked]`, and a
+corresponding `UninstrumentQuickAlloc`... function.
+
+- These in turn are called by `SetStatsEnabled()`, `SetAllocationListener()`, et al., which
+require that the mutator lock not be held.
+
+- With a started runtime, `SetEntrypointsInstrumented()` calls `ScopedSuspendAll()` before updating
+  the function table.
+
+Mutual exclusion on the dispatch table is thus ensured by the fact that it is only updated while
+all other threads are suspended, and is only accessed with the mutator lock logically held,
+which inhibits suspension.
+
+To ensure correctness, we thus must:
+
+1. Suspend all threads when swapping out the dispatch table, and
+2. Make sure that we hold the mutator lock when accessing it.
+3. Not trust `kInstrumented` once we've given up the mutator lock, since it could have changed in
+   the interim.
+
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 4cfbe2d4bd..3fb2f45440 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -66,7 +66,7 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
     self->PoisonObjectPointers();
   }
   auto pre_object_allocated = [&]() REQUIRES_SHARED(Locks::mutator_lock_)
-      REQUIRES(!Roles::uninterruptible_) {
+      REQUIRES(!Roles::uninterruptible_ /* only suspends if kInstrumented */) {
     if constexpr (kInstrumented) {
       AllocationListener* l = alloc_listener_.load(std::memory_order_seq_cst);
       if (UNLIKELY(l != nullptr) && UNLIKELY(l->HasPreAlloc())) {
@@ -88,14 +88,13 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
   // non-TLAB object allocations. Only set for non-thread-local allocation,
   size_t bytes_tl_bulk_allocated = 0u;
   // Do the initial pre-alloc
+  // TODO: Consider what happens if the allocator is switched while suspended here.
   pre_object_allocated();
-  ScopedAssertNoThreadSuspension ants("Called PreObjectAllocated, no suspend until alloc");
   // Need to check that we aren't the large object allocator since the large object allocation
   // code path includes this function. If we didn't check we would have an infinite loop.
   if (kCheckLargeObject && UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
     // AllocLargeObject can suspend and will recall PreObjectAllocated if needed.
-    ScopedAllowThreadSuspension ats;
     obj = AllocLargeObject<kInstrumented, PreFenceVisitor>(self, &klass, byte_count,
                                                            pre_fence_visitor);
     if (obj != nullptr) {
@@ -107,8 +106,12 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
     // If the large object allocation failed, try to use the normal spaces (main space,
     // non moving space). This can happen if there is significant virtual address space
     // fragmentation.
-    pre_object_allocated();
+    // kInstrumented may be out of date, so recurse without large object checking, rather than
+    // continue.
+    return AllocObjectWithAllocator</*kInstrumented=*/ true, /*kCheckLargeObject=*/ false>
+        (self, klass, byte_count, GetUpdatedAllocator(allocator), pre_fence_visitor);
   }
+  ScopedAssertNoThreadSuspension ants("Called PreObjectAllocated, no suspend until alloc");
   if (IsTLABAllocator(allocator)) {
     byte_count = RoundUp(byte_count, space::BumpPointerSpace::kAlignment);
   }
@@ -140,9 +143,9 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
      obj = TryToAllocate<kInstrumented, false>(self, allocator, byte_count, &bytes_allocated,
                                                &usable_size, &bytes_tl_bulk_allocated);
      if (UNLIKELY(obj == nullptr)) {
-        // AllocateInternalWithGc can cause thread suspension, if someone instruments the
-        // entrypoints or changes the allocator in a suspend point here, we need to retry the
-        // allocation. It will send the pre-alloc event again.
+        // AllocateInternalWithGc internally re-allows, and can cause, thread suspension. If
+        // someone instruments the entrypoints or changes the allocator at a suspend point here,
+        // we need to retry the allocation. It will send the pre-alloc event again.
        obj = AllocateInternalWithGc(self,
                                     allocator,
                                     kInstrumented,
@@ -159,13 +162,15 @@ inline mirror::Object* Heap::AllocObjectWithAllocator(Thread* self,
          ScopedAllowThreadSuspension ats;
          // AllocObject will pick up the new allocator type, and instrumented as true is the safe
          // default.
-          return AllocObject</*kInstrumented=*/true>(self,
-                                                     klass,
-                                                     byte_count,
-                                                     pre_fence_visitor);
+          return AllocObjectWithAllocator</*kInstrumented=*/true>(self,
+                                                                  klass,
+                                                                  byte_count,
+                                                                  GetUpdatedAllocator(allocator),
+                                                                  pre_fence_visitor);
        }
        return nullptr;
      }
+      // Non-null result implies neither instrumentation nor allocator changed.
    }
    DCHECK_GT(bytes_allocated, 0u);
    DCHECK_GT(usable_size, 0u);
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index d21aafa087..5d0ba8ff89 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -287,6 +287,11 @@ class Heap {
     return current_non_moving_allocator_;
   }
 
+  AllocatorType GetUpdatedAllocator(AllocatorType old_allocator) {
+    return (old_allocator == kAllocatorTypeNonMoving) ?
+        GetCurrentNonMovingAllocator() : GetCurrentAllocator();
+  }
+
   // Visit all of the live objects in the heap.
   template <typename Visitor>
   ALWAYS_INLINE void VisitObjects(Visitor&& visitor)
@@ -1030,8 +1035,10 @@
       REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !*backtrace_lock_,
                !process_state_update_lock_);
 
-  // Handles Allocate()'s slow allocation path with GC involved after
-  // an initial allocation attempt failed.
+  // Handles Allocate()'s slow allocation path with GC involved after an initial allocation
+  // attempt failed.
+  // Called with thread suspension disallowed; re-enables it internally, and may suspend.
+  // Returns null if instrumentation or the allocator changed.
   mirror::Object* AllocateInternalWithGc(Thread* self,
                                          AllocatorType allocator,
                                          bool instrumented,
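
For readers outside the ART tree, here is a minimal standalone sketch of the dispatch scheme that the new `runtime/alloc_instrumentation.md` describes: one template generates both the instrumented and uninstrumented allocation paths, and a thread-local table of function pointers selects between them. All names in the sketch (`AllocEntrypoints`, `SwitchAllocTable`, `AllocObjectImpl`, and so on) are invented for illustration; they are not ART's real entrypoint structures, and in ART the switch happens under `ScopedSuspendAll` via `InstrumentQuickAllocEntryPoints[Locked]`.

```cpp
// Standalone illustration only; not ART code.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

using AllocFn = void* (*)(size_t bytes);

// One template generates both variants. kInstrumented is a compile-time flag,
// so the uninstrumented fast path pays nothing for the listener check.
template <bool kInstrumented>
void* AllocObjectImpl(size_t bytes) {
  if constexpr (kInstrumented) {
    std::printf("pre-alloc event: %zu bytes\n", bytes);  // stands in for an AllocationListener
  }
  return std::malloc(bytes);
}

struct AllocEntrypoints {
  AllocFn alloc_object;
};

constexpr AllocEntrypoints kUninstrumentedTable{&AllocObjectImpl<false>};
constexpr AllocEntrypoints kInstrumentedTable{&AllocObjectImpl<true>};

// Thread-local, like the real entrypoint table, so compiled code can reach it
// cheaply off the thread pointer.
thread_local const AllocEntrypoints* tls_alloc_table = &kUninstrumentedTable;

// In ART this runs while all other threads are suspended, so no thread is
// inside an allocation routine while the table is being swapped.
void SwitchAllocTable(bool instrumented) {
  tls_alloc_table = instrumented ? &kInstrumentedTable : &kUninstrumentedTable;
}

int main() {
  void* a = tls_alloc_table->alloc_object(16);  // uninstrumented fast path
  SwitchAllocTable(/*instrumented=*/true);      // e.g. after SetAllocationListener()
  void* b = tls_alloc_table->alloc_object(32);  // now reports the pre-alloc event
  std::free(a);
  std::free(b);
  return 0;
}
```

The point of the template flag is that the common path never branches on instrumentation at runtime; the choice is baked into which function the table points at.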
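The `heap-inl.h` and `heap.h` hunks hinge on the third rule above: once a slow path may have suspended, `kInstrumented` and the captured allocator can be stale, so a null result from the GC path triggers a re-dispatch through the instrumented (always safe) variant with a freshly read allocator via the new `GetUpdatedAllocator()`. Below is a hedged sketch of that retry idiom under simplified assumptions; `AllocateWithGcMaySuspend`, the two-value `AllocatorType` enum, and the simulated allocator switch are stand-ins, not ART code.

```cpp
// Standalone illustration of the retry-on-change idiom; not ART code.
#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdlib>

enum class AllocatorType { kBumpPointer, kNonMoving };

namespace {
std::atomic<AllocatorType> g_current_allocator{AllocatorType::kBumpPointer};
std::atomic<bool> g_pending_change{true};  // simulate one allocator switch at a suspend point
}  // namespace

AllocatorType GetUpdatedAllocator(AllocatorType old_allocator) {
  // Mirrors the shape of Heap::GetUpdatedAllocator(): a non-moving request stays
  // non-moving, anything else re-reads whatever allocator is current now.
  return old_allocator == AllocatorType::kNonMoving ? AllocatorType::kNonMoving
                                                    : g_current_allocator.load();
}

// Stand-in for the GC slow path: may "suspend", and returns null when the
// allocator (or instrumentation) changed underneath the caller.
void* AllocateWithGcMaySuspend(AllocatorType allocator, size_t bytes) {
  (void)allocator;
  if (g_pending_change.exchange(false)) {
    g_current_allocator = AllocatorType::kNonMoving;  // the switch happened "while suspended"
    return nullptr;                                   // caller must re-dispatch
  }
  return std::malloc(bytes);
}

template <bool kInstrumented>
void* AllocObject(AllocatorType allocator, size_t bytes) {
  if constexpr (kInstrumented) {
    // Pre-alloc event would be reported here.
  }
  void* obj = AllocateWithGcMaySuspend(allocator, bytes);
  if (obj == nullptr) {
    // Something changed at a suspend point: kInstrumented may be stale, so retry
    // through the instrumented (always-safe) variant with a refreshed allocator.
    return AllocObject</*kInstrumented=*/true>(GetUpdatedAllocator(allocator), bytes);
  }
  return obj;
}

int main() {
  void* obj = AllocObject</*kInstrumented=*/false>(AllocatorType::kBumpPointer, 64);
  assert(obj != nullptr);
  std::free(obj);
  return 0;
}
```

In the sketch the forced failure happens exactly once, so the recursion terminates after a single retry; the refreshed allocator and the `kInstrumented=true` default are what make the retry safe regardless of what changed during suspension.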