Clean up allocation accounting

Add a "RACING_DCHECK" for counter comparisons that are read with
memory_order_relaxed, and thus might legitimately be inconsistent.
This is a hack, but so long as it's only used for DCHECKs, it seems
better than other available options.
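
For reference, the new macro rechecks once after a short sleep before failing,
and only in debug builds; it is used like an ordinary DCHECK, as in this line
from the heap.cc hunk below:

  RACING_DCHECK_LE(freed_bytes,
                   static_cast<int64_t>(num_bytes_allocated_.load(std::memory_order_relaxed)));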

Several counters used inconsistent memory_order specifications.
Generally move those to memory_order_relaxed for informational counters
where the value does not affect code execution in a way that matters
for correctness, and where performance might be an issue.
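
As an illustrative sketch of the pattern (the counter and function names below
are invented for the example, not taken from this change): an informational
counter only needs the increment itself to be atomic, not ordered against any
other memory accesses.

  #include <atomic>
  #include <cstddef>

  // Purely informational counter: read only for logging and heuristics.
  std::atomic<size_t> bytes_allocated_counter{0};

  void RecordAllocationForStats(size_t bytes) {
    // Relaxed ordering: the update must be atomic, but nothing else is
    // synchronized through this counter, so no ordering constraint is needed.
    bytes_allocated_counter.fetch_add(bytes, std::memory_order_relaxed);
  }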

In cases in which performance is clearly not an issue, just remove
the memory_order specifications, thus no longer erroneously implying
that we've actually thought about ordering.
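
Concretely, dropping the explicit argument falls back to the default ordering
(sequentially consistent for std::atomic, on which ART's Atomic<> is built);
the sketch below is illustrative, with a stand-in counter name:

  #include <atomic>
  #include <cstdint>

  std::atomic<uint64_t> backtrace_count{0};  // stand-in for a stats counter

  uint64_t ReadCount() {
    // Same as load(std::memory_order_seq_cst); writing it without an argument
    // simply stops advertising a deliberate ordering choice.
    return backtrace_count.load();
  }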

Improve comments in a few places where I found them confusing.

Bug: 79921586

Test: build and boot AOSP.
Change-Id: I8d0115817a9ff47708ed5e92e8c9caca7e73f899
diff --git a/runtime/gc/allocator_type.h b/runtime/gc/allocator_type.h
index 2f1f577..992c32a 100644
--- a/runtime/gc/allocator_type.h
+++ b/runtime/gc/allocator_type.h
@@ -23,15 +23,19 @@
 namespace gc {
 
 // Different types of allocators.
+// Those marked with * have fast path entrypoints callable from generated code.
 enum AllocatorType {
-  kAllocatorTypeBumpPointer,  // Use BumpPointer allocator, has entrypoints.
-  kAllocatorTypeTLAB,  // Use TLAB allocator, has entrypoints.
-  kAllocatorTypeRosAlloc,  // Use RosAlloc allocator, has entrypoints.
-  kAllocatorTypeDlMalloc,  // Use dlmalloc allocator, has entrypoints.
-  kAllocatorTypeNonMoving,  // Special allocator for non moving objects, doesn't have entrypoints.
-  kAllocatorTypeLOS,  // Large object space, also doesn't have entrypoints.
-  kAllocatorTypeRegion,
-  kAllocatorTypeRegionTLAB,
+  // BumpPointer spaces are currently only used for ZygoteSpace construction.
+  kAllocatorTypeBumpPointer,  // Use global CAS-based BumpPointer allocator. (*)
+  kAllocatorTypeTLAB,  // Use TLAB allocator within BumpPointer space. (*)
+  kAllocatorTypeRosAlloc,  // Use RosAlloc (segregated size, free list) allocator. (*)
+  kAllocatorTypeDlMalloc,  // Use dlmalloc (well-known C malloc) allocator. (*)
+  kAllocatorTypeNonMoving,  // Special allocator for non-moving objects.
+  kAllocatorTypeLOS,  // Large object space.
+  // The following differ from the BumpPointer allocators primarily in that memory is
+  // allocated from multiple regions, instead of a single contiguous space.
+  kAllocatorTypeRegion,  // Use CAS-based contiguous bump-pointer allocation within a region. (*)
+  kAllocatorTypeRegionTLAB,  // Use region pieces as TLABs. Default for most small objects. (*)
 };
 std::ostream& operator<<(std::ostream& os, const AllocatorType& rhs);
 
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index eede5a5..376f87b 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -406,7 +406,7 @@
         concurrent_copying_->region_space_->RevokeThreadLocalBuffers(thread);
         reinterpret_cast<Atomic<size_t>*>(
             &concurrent_copying_->from_space_num_objects_at_first_pause_)->
-                fetch_add(thread_local_objects, std::memory_order_seq_cst);
+                fetch_add(thread_local_objects, std::memory_order_relaxed);
       } else {
         concurrent_copying_->region_space_->RevokeThreadLocalBuffers(thread);
       }
@@ -2013,9 +2013,9 @@
     const uint64_t from_objects = region_space_->GetObjectsAllocatedInFromSpace();
     const uint64_t unevac_from_bytes = region_space_->GetBytesAllocatedInUnevacFromSpace();
     const uint64_t unevac_from_objects = region_space_->GetObjectsAllocatedInUnevacFromSpace();
-    uint64_t to_bytes = bytes_moved_.load(std::memory_order_seq_cst) + bytes_moved_gc_thread_;
+    uint64_t to_bytes = bytes_moved_.load(std::memory_order_relaxed) + bytes_moved_gc_thread_;
     cumulative_bytes_moved_.fetch_add(to_bytes, std::memory_order_relaxed);
-    uint64_t to_objects = objects_moved_.load(std::memory_order_seq_cst) + objects_moved_gc_thread_;
+    uint64_t to_objects = objects_moved_.load(std::memory_order_relaxed) + objects_moved_gc_thread_;
     cumulative_objects_moved_.fetch_add(to_objects, std::memory_order_relaxed);
     if (kEnableFromSpaceAccountingCheck) {
       CHECK_EQ(from_space_num_objects_at_first_pause_, from_objects + unevac_from_objects);
@@ -2047,12 +2047,12 @@
                 << " unevac_from_space size=" << region_space_->UnevacFromSpaceSize()
                 << " to_space size=" << region_space_->ToSpaceSize();
       LOG(INFO) << "(before) num_bytes_allocated="
-                << heap_->num_bytes_allocated_.load(std::memory_order_seq_cst);
+                << heap_->num_bytes_allocated_.load();
     }
     RecordFree(ObjectBytePair(freed_objects, freed_bytes));
     if (kVerboseMode) {
       LOG(INFO) << "(after) num_bytes_allocated="
-                << heap_->num_bytes_allocated_.load(std::memory_order_seq_cst);
+                << heap_->num_bytes_allocated_.load();
     }
   }
 
@@ -2729,17 +2729,17 @@
         region_space_->RecordAlloc(to_ref);
       }
       bytes_allocated = region_space_alloc_size;
-      heap_->num_bytes_allocated_.fetch_sub(bytes_allocated, std::memory_order_seq_cst);
-      to_space_bytes_skipped_.fetch_sub(bytes_allocated, std::memory_order_seq_cst);
-      to_space_objects_skipped_.fetch_sub(1, std::memory_order_seq_cst);
+      heap_->num_bytes_allocated_.fetch_sub(bytes_allocated, std::memory_order_relaxed);
+      to_space_bytes_skipped_.fetch_sub(bytes_allocated, std::memory_order_relaxed);
+      to_space_objects_skipped_.fetch_sub(1, std::memory_order_relaxed);
     } else {
       // Fall back to the non-moving space.
       fall_back_to_non_moving = true;
       if (kVerboseMode) {
         LOG(INFO) << "Out of memory in the to-space. Fall back to non-moving. skipped_bytes="
-                  << to_space_bytes_skipped_.load(std::memory_order_seq_cst)
+                  << to_space_bytes_skipped_.load(std::memory_order_relaxed)
                   << " skipped_objects="
-                  << to_space_objects_skipped_.load(std::memory_order_seq_cst);
+                  << to_space_objects_skipped_.load(std::memory_order_relaxed);
       }
       to_ref = heap_->non_moving_space_->Alloc(self, obj_size,
                                                &non_moving_space_bytes_allocated, nullptr, &dummy);
@@ -2791,9 +2791,9 @@
           region_space_->FreeLarge</*kForEvac*/ true>(to_ref, bytes_allocated);
         } else {
           // Record the lost copy for later reuse.
-          heap_->num_bytes_allocated_.fetch_add(bytes_allocated, std::memory_order_seq_cst);
-          to_space_bytes_skipped_.fetch_add(bytes_allocated, std::memory_order_seq_cst);
-          to_space_objects_skipped_.fetch_add(1, std::memory_order_seq_cst);
+          heap_->num_bytes_allocated_.fetch_add(bytes_allocated, std::memory_order_relaxed);
+          to_space_bytes_skipped_.fetch_add(bytes_allocated, std::memory_order_relaxed);
+          to_space_objects_skipped_.fetch_add(1, std::memory_order_relaxed);
           MutexLock mu(self, skipped_blocks_lock_);
           skipped_blocks_map_.insert(std::make_pair(bytes_allocated,
                                                     reinterpret_cast<uint8_t*>(to_ref)));
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 791d037..af9000b 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -108,7 +108,8 @@
     pre_fence_visitor(obj, usable_size);
     QuasiAtomic::ThreadFenceForConstructor();
   } else {
-    // Bytes allocated that takes bulk thread-local buffer allocations into account.
+    // Bytes allocated, including bulk thread-local buffer allocations in addition to direct
+    // non-TLAB object allocations.
     size_t bytes_tl_bulk_allocated = 0u;
     obj = TryToAllocate<kInstrumented, false>(self, allocator, byte_count, &bytes_allocated,
                                               &usable_size, &bytes_tl_bulk_allocated);
@@ -156,10 +157,10 @@
     }
     pre_fence_visitor(obj, usable_size);
     QuasiAtomic::ThreadFenceForConstructor();
-    size_t num_bytes_allocated_before =
-        num_bytes_allocated_.fetch_add(bytes_tl_bulk_allocated, std::memory_order_relaxed);
-    new_num_bytes_allocated = num_bytes_allocated_before + bytes_tl_bulk_allocated;
     if (bytes_tl_bulk_allocated > 0) {
+      size_t num_bytes_allocated_before =
+          num_bytes_allocated_.fetch_add(bytes_tl_bulk_allocated, std::memory_order_relaxed);
+      new_num_bytes_allocated = num_bytes_allocated_before + bytes_tl_bulk_allocated;
       // Only trace when we get an increase in the number of bytes allocated. This happens when
       // obtaining a new TLAB and isn't often enough to hurt performance according to golem.
       TraceHeapSize(new_num_bytes_allocated);
@@ -212,6 +213,8 @@
   // optimized out. And for the other allocators, AllocatorMayHaveConcurrentGC is a constant since
   // the allocator_type should be constant propagated.
   if (AllocatorMayHaveConcurrentGC(allocator) && IsGcConcurrent()) {
+    // new_num_bytes_allocated is zero if we didn't update num_bytes_allocated_.
+    // That's fine.
     CheckConcurrentGC(self, new_num_bytes_allocated, &obj);
   }
   VerifyObject(obj);
@@ -394,7 +397,7 @@
 inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
                                             size_t alloc_size,
                                             bool grow) {
-  size_t new_footprint = num_bytes_allocated_.load(std::memory_order_seq_cst) + alloc_size;
+  size_t new_footprint = num_bytes_allocated_.load(std::memory_order_relaxed) + alloc_size;
   if (UNLIKELY(new_footprint > max_allowed_footprint_)) {
     if (UNLIKELY(new_footprint > growth_limit_)) {
       return true;
@@ -411,6 +414,8 @@
   return false;
 }
 
+// Request a GC if new_num_bytes_allocated is sufficiently large.
+// A call with new_num_bytes_allocated == 0 is a fast no-op.
 inline void Heap::CheckConcurrentGC(Thread* self,
                                     size_t new_num_bytes_allocated,
                                     ObjPtr<mirror::Object>* obj) {
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index e76d35d..82a1a68 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -51,6 +51,7 @@
 #include "gc/collector/partial_mark_sweep.h"
 #include "gc/collector/semi_space.h"
 #include "gc/collector/sticky_mark_sweep.h"
+#include "gc/racing_check.h"
 #include "gc/reference_processor.h"
 #include "gc/scoped_gc_critical_section.h"
 #include "gc/space/bump_pointer_space.h"
@@ -1199,8 +1200,8 @@
   delete thread_flip_lock_;
   delete pending_task_lock_;
   delete backtrace_lock_;
-  uint64_t unique_count = unique_backtrace_count_.load(std::memory_order_relaxed);
-  uint64_t seen_count = seen_backtrace_count_.load(std::memory_order_relaxed);
+  uint64_t unique_count = unique_backtrace_count_.load();
+  uint64_t seen_count = seen_backtrace_count_.load();
   if (unique_count != 0 || seen_count != 0) {
     LOG(INFO) << "gc stress unique=" << unique_count << " total=" << (unique_count + seen_count);
   }
@@ -1582,10 +1583,10 @@
   // Use signed comparison since freed bytes can be negative when background compaction foreground
   // transitions occurs. This is caused by the moving objects from a bump pointer space to a
   // free list backed space typically increasing memory footprint due to padding and binning.
-  DCHECK_LE(freed_bytes,
-            static_cast<int64_t>(num_bytes_allocated_.load(std::memory_order_relaxed)));
+  RACING_DCHECK_LE(freed_bytes,
+                   static_cast<int64_t>(num_bytes_allocated_.load(std::memory_order_relaxed)));
   // Note: This relies on 2s complement for handling negative freed_bytes.
-  num_bytes_allocated_.fetch_sub(static_cast<ssize_t>(freed_bytes));
+  num_bytes_allocated_.fetch_sub(static_cast<ssize_t>(freed_bytes), std::memory_order_relaxed);
   if (Runtime::Current()->HasStatsEnabled()) {
     RuntimeStats* thread_stats = Thread::Current()->GetStats();
     thread_stats->freed_objects += freed_objects;
@@ -1602,10 +1603,10 @@
   // ahead-of-time, bulk counting of bytes allocated in rosalloc thread-local buffers.
   // If there's a concurrent revoke, ok to not necessarily reset num_bytes_freed_revoke_
   // all the way to zero exactly as the remainder will be subtracted at the next GC.
-  size_t bytes_freed = num_bytes_freed_revoke_.load();
-  CHECK_GE(num_bytes_freed_revoke_.fetch_sub(bytes_freed),
+  size_t bytes_freed = num_bytes_freed_revoke_.load(std::memory_order_relaxed);
+  CHECK_GE(num_bytes_freed_revoke_.fetch_sub(bytes_freed, std::memory_order_relaxed),
            bytes_freed) << "num_bytes_freed_revoke_ underflow";
-  CHECK_GE(num_bytes_allocated_.fetch_sub(bytes_freed),
+  CHECK_GE(num_bytes_allocated_.fetch_sub(bytes_freed, std::memory_order_relaxed),
            bytes_freed) << "num_bytes_allocated_ underflow";
   GetCurrentGcIteration()->SetFreedRevoke(bytes_freed);
 }
@@ -2030,7 +2031,7 @@
   VLOG(heap) << "TransitionCollector: " << static_cast<int>(collector_type_)
              << " -> " << static_cast<int>(collector_type);
   uint64_t start_time = NanoTime();
-  uint32_t before_allocated = num_bytes_allocated_.load();
+  uint32_t before_allocated = num_bytes_allocated_.load(std::memory_order_relaxed);
   Runtime* const runtime = Runtime::Current();
   Thread* const self = Thread::Current();
   ScopedThreadStateChange tsc(self, kWaitingPerformingGc);
@@ -2166,7 +2167,7 @@
     ScopedObjectAccess soa(self);
     soa.Vm()->UnloadNativeLibraries();
   }
-  int32_t after_allocated = num_bytes_allocated_.load(std::memory_order_seq_cst);
+  int32_t after_allocated = num_bytes_allocated_.load(std::memory_order_relaxed);
   int32_t delta_allocated = before_allocated - after_allocated;
   std::string saved_str;
   if (delta_allocated >= 0) {
@@ -3799,7 +3800,7 @@
 
 void Heap::IncrementNumberOfBytesFreedRevoke(size_t freed_bytes_revoke) {
   size_t previous_num_bytes_freed_revoke =
-      num_bytes_freed_revoke_.fetch_add(freed_bytes_revoke, std::memory_order_seq_cst);
+      num_bytes_freed_revoke_.fetch_add(freed_bytes_revoke, std::memory_order_relaxed);
   // Check the updated value is less than the number of bytes allocated. There is a risk of
   // execution being suspended between the increment above and the CHECK below, leading to
   // the use of previous_num_bytes_freed_revoke in the comparison.
@@ -4018,9 +4019,9 @@
       StackHandleScope<1> hs(self);
       auto h = hs.NewHandleWrapper(obj);
       CollectGarbage(/* clear_soft_references */ false);
-      unique_backtrace_count_.fetch_add(1, std::memory_order_seq_cst);
+      unique_backtrace_count_.fetch_add(1);
     } else {
-      seen_backtrace_count_.fetch_add(1, std::memory_order_seq_cst);
+      seen_backtrace_count_.fetch_add(1);
     }
   }
 }
@@ -4201,7 +4202,7 @@
   explicit TriggerPostForkCCGcTask(uint64_t target_time) : HeapTask(target_time) {}
   void Run(Thread* self) override {
     gc::Heap* heap = Runtime::Current()->GetHeap();
-    // Trigger a GC, if not already done. The first GC after fork, whenever
+    // Trigger a GC, if not already done. The first GC after fork, whenever it
     // takes place, will adjust the thresholds to normal levels.
     if (heap->max_allowed_footprint_ == heap->growth_limit_) {
       heap->RequestConcurrentGC(self, kGcCauseBackground, false);
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 90bac20..6c4b936 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -477,8 +477,9 @@
   void AddFinalizerReference(Thread* self, ObjPtr<mirror::Object>* object);
 
   // Returns the number of bytes currently allocated.
+  // The result should be treated as an approximation if it is being concurrently updated.
   size_t GetBytesAllocated() const {
-    return num_bytes_allocated_.load(std::memory_order_seq_cst);
+    return num_bytes_allocated_.load(std::memory_order_relaxed);
   }
 
   // Returns the number of objects currently allocated.
@@ -506,7 +507,7 @@
   // were specified. Android apps start with a growth limit (small heap size) which is
   // cleared/extended for large apps.
   size_t GetMaxMemory() const {
-    // There is some race conditions in the allocation code that can cause bytes allocated to
+    // There are some race conditions in the allocation code that can cause bytes allocated to
     // become larger than growth_limit_ in rare cases.
     return std::max(GetBytesAllocated(), growth_limit_);
   }
@@ -528,7 +529,7 @@
   // Returns how much free memory we have until we need to grow the heap to perform an allocation.
   // Similar to GetFreeMemoryUntilGC. Implements java.lang.Runtime.freeMemory.
   size_t GetFreeMemory() const {
-    size_t byte_allocated = num_bytes_allocated_.load(std::memory_order_seq_cst);
+    size_t byte_allocated = num_bytes_allocated_.load(std::memory_order_relaxed);
     size_t total_memory = GetTotalMemory();
     // Make sure we don't get a negative number.
     return total_memory - std::min(total_memory, byte_allocated);
@@ -1222,7 +1223,8 @@
   // Since the heap was created, how many objects have been freed.
   uint64_t total_objects_freed_ever_;
 
-  // Number of bytes allocated.  Adjusted after each allocation and free.
+  // Number of bytes currently allocated and not yet reclaimed. Includes active
+  // TLABs in their entirety, even if they have not yet been parceled out.
   Atomic<size_t> num_bytes_allocated_;
 
   // Number of registered native bytes allocated since the last time GC was
diff --git a/runtime/gc/racing_check.h b/runtime/gc/racing_check.h
new file mode 100644
index 0000000..a81a513
--- /dev/null
+++ b/runtime/gc/racing_check.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_RACING_CHECK_H_
+#define ART_RUNTIME_GC_RACING_CHECK_H_
+
+#include <unistd.h>
+#include <android-base/logging.h>
+
+// For checking purposes, we occasionally compare global counter values.
+// These counters are generally updated without ordering constraints, and hence
+// we may actually see inconsistent values when checking. To minimize spurious
+// failures, try twice with an intervening short sleep. This is a hack not used
+// in production builds.
+#define RACING_DCHECK_LE(x, y) \
+  if (::android::base::kEnableDChecks && ((x) > (y))) { usleep(1000); CHECK_LE(x, y); }
+
+#endif  // ART_RUNTIME_GC_RACING_CHECK_H_
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 4c58549..20f7a93 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -83,8 +83,8 @@
 inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) {
   mirror::Object* ret = AllocNonvirtualWithoutAccounting(num_bytes);
   if (ret != nullptr) {
-    objects_allocated_.fetch_add(1, std::memory_order_seq_cst);
-    bytes_allocated_.fetch_add(num_bytes, std::memory_order_seq_cst);
+    objects_allocated_.fetch_add(1, std::memory_order_relaxed);
+    bytes_allocated_.fetch_add(num_bytes, std::memory_order_relaxed);
   }
   return ret;
 }
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 42453f5..80af700 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -206,8 +206,8 @@
 }
 
 void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
-  objects_allocated_.fetch_add(thread->GetThreadLocalObjectsAllocated(), std::memory_order_seq_cst);
-  bytes_allocated_.fetch_add(thread->GetThreadLocalBytesAllocated(), std::memory_order_seq_cst);
+  objects_allocated_.fetch_add(thread->GetThreadLocalObjectsAllocated(), std::memory_order_relaxed);
+  bytes_allocated_.fetch_add(thread->GetThreadLocalBytesAllocated(), std::memory_order_relaxed);
   thread->SetTlab(nullptr, nullptr, nullptr);
 }
 
diff --git a/runtime/gc/space/bump_pointer_space.h b/runtime/gc/space/bump_pointer_space.h
index 02e84b5..59d4d27 100644
--- a/runtime/gc/space/bump_pointer_space.h
+++ b/runtime/gc/space/bump_pointer_space.h
@@ -155,8 +155,8 @@
 
   // Record objects / bytes freed.
   void RecordFree(int32_t objects, int32_t bytes) {
-    objects_allocated_.fetch_sub(objects, std::memory_order_seq_cst);
-    bytes_allocated_.fetch_sub(bytes, std::memory_order_seq_cst);
+    objects_allocated_.fetch_sub(objects, std::memory_order_relaxed);
+    bytes_allocated_.fetch_sub(bytes, std::memory_order_relaxed);
   }
 
   void LogFragmentationAllocFailure(std::ostream& os, size_t failed_alloc_bytes) override
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 3999e27..464d03b 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -624,7 +624,7 @@
     // Loaded the map, use the image header from the file now in case we patch it with
     // RelocateInPlace.
     image_header = reinterpret_cast<ImageHeader*>(map.Begin());
-    const uint32_t bitmap_index = ImageSpace::bitmap_index_.fetch_add(1, std::memory_order_seq_cst);
+    const uint32_t bitmap_index = ImageSpace::bitmap_index_.fetch_add(1);
     std::string bitmap_name(StringPrintf("imagespace %s live-bitmap %u",
                                          image_filename,
                                          bitmap_index));
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 09d0251..b783cfe 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -108,8 +108,10 @@
   mark_bitmap_->SetName(temp_name);
 }
 
-LargeObjectSpace::LargeObjectSpace(const std::string& name, uint8_t* begin, uint8_t* end)
+LargeObjectSpace::LargeObjectSpace(const std::string& name, uint8_t* begin, uint8_t* end,
+                                   const char* lock_name)
     : DiscontinuousSpace(name, kGcRetentionPolicyAlwaysCollect),
+      lock_(lock_name, kAllocSpaceLock),
       num_bytes_allocated_(0), num_objects_allocated_(0), total_bytes_allocated_(0),
       total_objects_allocated_(0), begin_(begin), end_(end) {
 }
@@ -120,8 +122,7 @@
 }
 
 LargeObjectMapSpace::LargeObjectMapSpace(const std::string& name)
-    : LargeObjectSpace(name, nullptr, nullptr),
-      lock_("large object map space lock", kAllocSpaceLock) {}
+    : LargeObjectSpace(name, nullptr, nullptr, "large object map space lock") {}
 
 LargeObjectMapSpace* LargeObjectMapSpace::Create(const std::string& name) {
   if (Runtime::Current()->IsRunningOnMemoryTool()) {
@@ -362,9 +363,8 @@
                              MemMap&& mem_map,
                              uint8_t* begin,
                              uint8_t* end)
-    : LargeObjectSpace(name, begin, end),
-      mem_map_(std::move(mem_map)),
-      lock_("free list space lock", kAllocSpaceLock) {
+    : LargeObjectSpace(name, begin, end, "free list space lock"),
+      mem_map_(std::move(mem_map)) {
   const size_t space_capacity = end - begin;
   free_end_ = space_capacity;
   CHECK_ALIGNED(space_capacity, kAlignment);
diff --git a/runtime/gc/space/large_object_space.h b/runtime/gc/space/large_object_space.h
index 26c6463..47167fa 100644
--- a/runtime/gc/space/large_object_space.h
+++ b/runtime/gc/space/large_object_space.h
@@ -22,6 +22,7 @@
 #include "base/tracking_safe_map.h"
 #include "dlmalloc_space.h"
 #include "space.h"
+#include "thread-current-inl.h"
 
 #include <set>
 #include <vector>
@@ -50,15 +51,19 @@
   virtual ~LargeObjectSpace() {}
 
   uint64_t GetBytesAllocated() override {
+    MutexLock mu(Thread::Current(), lock_);
     return num_bytes_allocated_;
   }
   uint64_t GetObjectsAllocated() override {
+    MutexLock mu(Thread::Current(), lock_);
     return num_objects_allocated_;
   }
   uint64_t GetTotalBytesAllocated() const {
+    MutexLock mu(Thread::Current(), lock_);
     return total_bytes_allocated_;
   }
   uint64_t GetTotalObjectsAllocated() const {
+    MutexLock mu(Thread::Current(), lock_);
     return total_objects_allocated_;
   }
   size_t FreeList(Thread* self, size_t num_ptrs, mirror::Object** ptrs) override;
@@ -110,14 +115,26 @@
   virtual std::pair<uint8_t*, uint8_t*> GetBeginEndAtomic() const = 0;
 
  protected:
-  explicit LargeObjectSpace(const std::string& name, uint8_t* begin, uint8_t* end);
+  explicit LargeObjectSpace(const std::string& name, uint8_t* begin, uint8_t* end,
+                            const char* lock_name);
   static void SweepCallback(size_t num_ptrs, mirror::Object** ptrs, void* arg);
 
-  // Approximate number of bytes which have been allocated into the space.
-  uint64_t num_bytes_allocated_;
-  uint64_t num_objects_allocated_;
-  uint64_t total_bytes_allocated_;
-  uint64_t total_objects_allocated_;
+  // Used to ensure mutual exclusion when the allocation space's data structures,
+  // including the allocation counters below, are being modified.
+  mutable Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+
+  // Number of bytes which have been allocated into the space and not yet freed. The count is also
+  // included in the identically named field in Heap. Counts actual allocated sizes (after
+  // rounding), not requested sizes. TODO: It would be cheaper to just maintain total allocated
+  // and total free counts.
+  uint64_t num_bytes_allocated_ GUARDED_BY(lock_);
+  uint64_t num_objects_allocated_ GUARDED_BY(lock_);
+
+  // Totals for large objects ever allocated, including those that have since been deallocated.
+  // Never decremented.
+  uint64_t total_bytes_allocated_ GUARDED_BY(lock_);
+  uint64_t total_objects_allocated_ GUARDED_BY(lock_);
+
   // Begin and end, may change as more large objects are allocated.
   uint8_t* begin_;
   uint8_t* end_;
@@ -157,8 +174,6 @@
   bool IsZygoteLargeObject(Thread* self, mirror::Object* obj) const override REQUIRES(!lock_);
   void SetAllLargeObjectsAsZygoteObjects(Thread* self) override REQUIRES(!lock_);
 
-  // Used to ensure mutual exclusion when the allocation spaces data structures are being modified.
-  mutable Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   AllocationTrackingSafeMap<mirror::Object*, LargeObject, kAllocatorTagLOSMaps> large_objects_
       GUARDED_BY(lock_);
 };
@@ -215,7 +230,6 @@
   MemMap allocation_info_map_;
   AllocationInfo* allocation_info_;
 
-  mutable Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   // Free bytes at the end of the space.
   size_t free_end_ GUARDED_BY(lock_);
   FreeBlocks free_blocks_ GUARDED_BY(lock_);
diff --git a/runtime/gc/space/region_space-inl.h b/runtime/gc/space/region_space-inl.h
index e048515..bda1f1c 100644
--- a/runtime/gc/space/region_space-inl.h
+++ b/runtime/gc/space/region_space-inl.h
@@ -60,7 +60,8 @@
       return obj;
     }
     MutexLock mu(Thread::Current(), region_lock_);
-    // Retry with current region since another thread may have updated it.
+    // Retry with current region since another thread may have updated
+    // current_region_ or evac_region_.  TODO: fix race.
     obj = (kForEvac ? evac_region_ : current_region_)->Alloc(num_bytes,
                                                              bytes_allocated,
                                                              usable_size,
diff --git a/runtime/gc/space/region_space.cc b/runtime/gc/space/region_space.cc
index f74fa86..608b48e 100644
--- a/runtime/gc/space/region_space.cc
+++ b/runtime/gc/space/region_space.cc
@@ -726,7 +726,7 @@
 void RegionSpace::RecordAlloc(mirror::Object* ref) {
   CHECK(ref != nullptr);
   Region* r = RefToRegion(ref);
-  r->objects_allocated_.fetch_add(1, std::memory_order_seq_cst);
+  r->objects_allocated_.fetch_add(1, std::memory_order_relaxed);
 }
 
 bool RegionSpace::AllocNewTlab(Thread* self, size_t min_bytes) {
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index 0bf4f38..c200776 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -580,6 +580,8 @@
     // (large region + one or more large tail regions).
     Atomic<uint8_t*> top_;              // The current position of the allocation.
     uint8_t* end_;                      // The end address of the region.
+    // objects_allocated_ is accessed using memory_order_relaxed. Treat as approximate when there
+    // are concurrent updates.
     Atomic<size_t> objects_allocated_;  // The number of objects allocated.
     uint32_t alloc_time_;               // The allocation time of the region.
     // Note that newly allocated and evacuated regions use -1 as
diff --git a/runtime/gc/space/zygote_space.cc b/runtime/gc/space/zygote_space.cc
index ed85b06..f482466 100644
--- a/runtime/gc/space/zygote_space.cc
+++ b/runtime/gc/space/zygote_space.cc
@@ -127,7 +127,7 @@
     // Need to mark the card since this will update the mod-union table next GC cycle.
     card_table->MarkCard(ptrs[i]);
   }
-  zygote_space->objects_allocated_.fetch_sub(num_ptrs, std::memory_order_seq_cst);
+  zygote_space->objects_allocated_.fetch_sub(num_ptrs);
 }
 
 }  // namespace space
diff --git a/runtime/gc/space/zygote_space.h b/runtime/gc/space/zygote_space.h
index 1f73577..03e2ec8 100644
--- a/runtime/gc/space/zygote_space.h
+++ b/runtime/gc/space/zygote_space.h
@@ -68,7 +68,7 @@
   }
 
   uint64_t GetObjectsAllocated() {
-    return objects_allocated_.load(std::memory_order_seq_cst);
+    return objects_allocated_.load();
   }
 
   void Clear() override;