ART/Perfetto Java Heap Profiler

Adding a Sampling Java Heap Profiler to ART and its interface to Perfetto.
This CL is the first (CL1) in a series of CLs, described below, that implement the full ART/Perfetto Java Heap Profiler.
CL1: ART Java Heap Profiler. This is the main ART sampling profiler code. Tested using the ART testrunner as below as well as VLOG.
CL2: Uncomment APEX code ART side. CL3: Add APEX code Perfetto side.
CL2 and CL3 will be submitted simultaneously to avoid build failures and to add APEX dependencies.
CL4: Uncomment Perfetto API code. To be reviewed by fmayer@ (Perfetto Team). Further Testing, could be full feature testing including Perfetto at this point.
CL5: Further tests and/or optimizations can be added as needed.

Test: Passing Tests
test/testrunner/testrunner.py --host --debug -b
test/testrunner/testrunner.py --host --debug -b --64 -t 004-ThreadStress
test/testrunner/testrunner.py --host --runtime-option=-XX:PerfettoJavaHeapStackProf=true --debug -b
test/testrunner/testrunner.py --host --runtime-option=-XX:PerfettoJavaHeapStackProf=true --debug -b --64 -t 004-ThreadStress
Individual targeted test runs and VLOG-based verification were also performed.

Bug: 160214819

Change-Id: I2be4c4e715ce8c3c8ac545e3e14332198b9c2295
(cherry picked from commit 7b149d585b4627ebb389e987c14fe808f2fe698b)
Merged-In: I2be4c4e715ce8c3c8ac545e3e14332198b9c2295
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 4a03e61..4ca6bf7 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -94,6 +94,7 @@
 #include "obj_ptr-inl.h"
 #include "reflection.h"
 #include "runtime.h"
+#include "javaheapprof/javaheapsampler.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread_list.h"
 #include "verify_object-inl.h"
@@ -348,6 +349,7 @@
                                         kGcCountRateMaxBucketCount),
       alloc_tracking_enabled_(false),
       alloc_record_depth_(AllocRecordObjectMap::kDefaultAllocStackDepth),
+      perfetto_javaheapprof_heapid_(0),
       backtrace_lock_(nullptr),
       seen_backtrace_count_(0u),
       unique_backtrace_count_(0u),
@@ -742,6 +744,15 @@
       LOG(FATAL) << "There's a gap between the image space and the non-moving space";
     }
   }
+  // Perfetto Java Heap Profiler Support.
+  if (runtime->IsPerfettoJavaHeapStackProfEnabled()) {
+    // Perfetto Plugin is loaded and enabled, initialize the Java Heap Profiler.
+    InitPerfettoJavaHeapProf();
+  } else {
+    // Disable the Java Heap Profiler.
+    GetHeapSampler().DisableHeapSampler(/*disable_ptr=*/nullptr, /*disable_info_ptr=*/nullptr);
+  }
+
   instrumentation::Instrumentation* const instrumentation = runtime->GetInstrumentation();
   if (gc_stress_mode_) {
     backtrace_lock_ = new Mutex("GC complete lock");
@@ -4031,6 +4042,71 @@
   }
 }
 
+// Perfetto Java Heap Profiler Support.
+
+// Perfetto initialization.
+// Sets the heap id used by the profiler and enables sampling. The actual
+// Perfetto heap registration appears to be deferred to a later CL (the
+// Perfetto API code is still commented out per the commit message), so a
+// placeholder heap id of 1 is installed here for now.
+void Heap::InitPerfettoJavaHeapProf() {
+  // Register the heap and create the heapid.
+  // Use a heap name = "HeapSampler".
+  // Initialize Perfetto Heap info and Heap id.
+  static uint32_t heap_id = 1;  // Initialize to 1, to be overwritten by Perfetto heap id.
+  SetPerfettoJavaHeapProfHeapID(heap_id);
+  // Enable the Java Heap Profiler.
+  GetHeapSampler().EnableHeapSampler(/*enable_ptr=*/nullptr, /*enable_info_ptr=*/nullptr);
+  // Set the Enable Callback, there is no callback data ("nullptr").
+  // Set the Disable Callback.
+  VLOG(heap) << "Java Heap Profiler Initialized";
+}
+
+// Check if the Java Heap Profiler is enabled and initialized.
+// Returns nonzero when the sampler is enabled; callers below use it as a
+// boolean guard. NOTE(review): the int return presumably mirrors
+// HeapSampler::IsEnabled() — confirm against javaheapsampler.h.
+int Heap::CheckPerfettoJHPEnabled() {
+  return GetHeapSampler().IsEnabled();
+}
+
+// Sample a successful non-TLAB allocation if the Java Heap Profiler is on.
+// 'self' supplies the current TLAB position offset, 'ret' is the freshly
+// allocated object (nullptr means the allocation failed, in which case
+// nothing is sampled), and 'alloc_size' is the allocation size in bytes.
+void Heap::JHPCheckNonTlabSampleAllocation(Thread* self, mirror::Object* ret, size_t alloc_size) {
+  bool take_sample = false;
+  size_t bytes_until_sample = 0;
+  HeapSampler& prof_heap_sampler = GetHeapSampler();
+  if (ret != nullptr && prof_heap_sampler.IsEnabled()) {
+    // An allocation occurred, sample it, even if non-Tlab.
+    // In case take_sample is already set from the previous GetSampleOffset
+    // because we tried the Tlab allocation first, we will not use this value.
+    // A new value is generated below. Also bytes_until_sample will be updated.
+    // Note that we are not using the return value from the GetSampleOffset in
+    // the NonTlab case here.
+    prof_heap_sampler.GetSampleOffset(alloc_size,
+                                      self->GetTlabPosOffset(),
+                                      &take_sample,
+                                      &bytes_until_sample);
+    // Persist the freshly computed countdown so subsequent allocations see it.
+    prof_heap_sampler.SetBytesUntilSample(bytes_until_sample);
+    if (take_sample) {
+      prof_heap_sampler.ReportSample(ret, alloc_size);
+    }
+    VLOG(heap) << "JHP:NonTlab:AllocNonvirtual";
+  }
+}
+
+// Compute the size of the next TLAB. When the Java Heap Profiler is enabled,
+// the TLAB is capped at the next sample point (min of the sample offset and
+// the default size 'jhp_def_tlab_size') so the sample boundary falls inside
+// the new TLAB; otherwise the default size is returned unchanged.
+// The out-params (*take_sample, *bytes_until_sample) are written only when
+// the profiler is enabled — callers initialize them before the call.
+size_t Heap::JHPCalculateNextTlabSize(Thread* self,
+                                      size_t jhp_def_tlab_size,
+                                      size_t alloc_size,
+                                      bool* take_sample,
+                                      size_t* bytes_until_sample) {
+  size_t next_tlab_size = jhp_def_tlab_size;
+  if (CheckPerfettoJHPEnabled()) {
+    size_t next_sample_point =
+        GetHeapSampler().GetSampleOffset(alloc_size,
+                                         self->GetTlabPosOffset(),
+                                         take_sample,
+                                         bytes_until_sample);
+    next_tlab_size = std::min(next_sample_point, jhp_def_tlab_size);
+  }
+  return next_tlab_size;
+}
+
+// Forward an adjustment (in bytes) of the next sample point to the
+// HeapSampler. NOTE(review): no callers are visible in this diff — the
+// adjustment semantics live in javaheapsampler.h; confirm there.
+void Heap::AdjustSampleOffset(size_t adjustment) {
+  GetHeapSampler().AdjustSampleOffset(adjustment);
+}
+
 void Heap::CheckGcStressMode(Thread* self, ObjPtr<mirror::Object>* obj) {
   DCHECK(gc_stress_mode_);
   auto* const runtime = Runtime::Current();
@@ -4117,14 +4193,23 @@
                                        size_t* bytes_allocated,
                                        size_t* usable_size,
                                        size_t* bytes_tl_bulk_allocated) {
+  mirror::Object* ret = nullptr;
+  bool take_sample = false;
+  size_t bytes_until_sample = 0;
+
   if (kUsePartialTlabs && alloc_size <= self->TlabRemainingCapacity()) {
     DCHECK_GT(alloc_size, self->TlabSize());
     // There is enough space if we grow the TLAB. Lets do that. This increases the
     // TLAB bytes.
     const size_t min_expand_size = alloc_size - self->TlabSize();
+    size_t next_tlab_size = JHPCalculateNextTlabSize(self,
+                                                     kPartialTlabSize,
+                                                     alloc_size,
+                                                     &take_sample,
+                                                     &bytes_until_sample);
     const size_t expand_bytes = std::max(
         min_expand_size,
-        std::min(self->TlabRemainingCapacity() - self->TlabSize(), kPartialTlabSize));
+        std::min(self->TlabRemainingCapacity() - self->TlabSize(), next_tlab_size));
     if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, expand_bytes, grow))) {
       return nullptr;
     }
@@ -4133,7 +4218,12 @@
     DCHECK_LE(alloc_size, self->TlabSize());
   } else if (allocator_type == kAllocatorTypeTLAB) {
     DCHECK(bump_pointer_space_ != nullptr);
-    const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
+    size_t next_tlab_size = JHPCalculateNextTlabSize(self,
+                                                     kDefaultTLABSize,
+                                                     alloc_size,
+                                                     &take_sample,
+                                                     &bytes_until_sample);
+    const size_t new_tlab_size = alloc_size + next_tlab_size;
     if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) {
       return nullptr;
     }
@@ -4143,6 +4233,9 @@
       return nullptr;
     }
     *bytes_tl_bulk_allocated = new_tlab_size;
+    if (CheckPerfettoJHPEnabled()) {
+      VLOG(heap) << "JHP:kAllocatorTypeTLAB, New Tlab bytes allocated= " << new_tlab_size;
+    }
   } else {
     DCHECK(allocator_type == kAllocatorTypeRegionTLAB);
     DCHECK(region_space_ != nullptr);
@@ -4151,25 +4244,37 @@
       if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type,
                                             space::RegionSpace::kRegionSize,
                                             grow))) {
+        size_t def_pr_tlab_size = kUsePartialTlabs
+                                      ? kPartialTlabSize
+                                      : gc::space::RegionSpace::kRegionSize;
+        size_t next_pr_tlab_size = JHPCalculateNextTlabSize(self,
+                                                            def_pr_tlab_size,
+                                                            alloc_size,
+                                                            &take_sample,
+                                                            &bytes_until_sample);
         const size_t new_tlab_size = kUsePartialTlabs
-            ? std::max(alloc_size, kPartialTlabSize)
-            : gc::space::RegionSpace::kRegionSize;
+            ? std::max(alloc_size, next_pr_tlab_size)
+            : next_pr_tlab_size;
         // Try to allocate a tlab.
         if (!region_space_->AllocNewTlab(self, new_tlab_size, bytes_tl_bulk_allocated)) {
           // Failed to allocate a tlab. Try non-tlab.
-          return region_space_->AllocNonvirtual<false>(alloc_size,
-                                                       bytes_allocated,
-                                                       usable_size,
-                                                       bytes_tl_bulk_allocated);
+          ret = region_space_->AllocNonvirtual<false>(alloc_size,
+                                                      bytes_allocated,
+                                                      usable_size,
+                                                      bytes_tl_bulk_allocated);
+          JHPCheckNonTlabSampleAllocation(self, ret, alloc_size);
+          return ret;
         }
         // Fall-through to using the TLAB below.
       } else {
         // Check OOME for a non-tlab allocation.
         if (!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow)) {
-          return region_space_->AllocNonvirtual<false>(alloc_size,
-                                                       bytes_allocated,
-                                                       usable_size,
-                                                       bytes_tl_bulk_allocated);
+          ret = region_space_->AllocNonvirtual<false>(alloc_size,
+                                                      bytes_allocated,
+                                                      usable_size,
+                                                      bytes_tl_bulk_allocated);
+          JHPCheckNonTlabSampleAllocation(self, ret, alloc_size);
+          return ret;
         }
         // Neither tlab or non-tlab works. Give up.
         return nullptr;
@@ -4177,19 +4282,34 @@
     } else {
       // Large. Check OOME.
       if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow))) {
-        return region_space_->AllocNonvirtual<false>(alloc_size,
-                                                     bytes_allocated,
-                                                     usable_size,
-                                                     bytes_tl_bulk_allocated);
+        ret = region_space_->AllocNonvirtual<false>(alloc_size,
+                                                    bytes_allocated,
+                                                    usable_size,
+                                                    bytes_tl_bulk_allocated);
+        JHPCheckNonTlabSampleAllocation(self, ret, alloc_size);
+        return ret;
       }
       return nullptr;
     }
   }
   // Refilled TLAB, return.
-  mirror::Object* ret = self->AllocTlab(alloc_size);
+  ret = self->AllocTlab(alloc_size);
   DCHECK(ret != nullptr);
   *bytes_allocated = alloc_size;
   *usable_size = alloc_size;
+
+  // JavaHeapProfiler: Send the thread information about this allocation in case a sample is
+  // requested.
+  // This is the fallthrough from both the if and else if above cases => Cases that use TLAB.
+  if (CheckPerfettoJHPEnabled()) {
+    if (take_sample) {
+      GetHeapSampler().ReportSample(ret, alloc_size);
+      // Update the bytes_until_sample now that the allocation is already done.
+      GetHeapSampler().SetBytesUntilSample(bytes_until_sample);
+    }
+    VLOG(heap) << "JHP:Fallthrough Tlab allocation";
+  }
+
   return ret;
 }