ART/Perfetto Java Heap Profiler

Add a sampling Java heap profiler to ART and its interface to Perfetto.
This CL is the first (CL1) in the series below, which together implement the full ART/Perfetto Java Heap Profiler.
CL1: ART Java Heap Profiler. The main ART sampling profiler code. Tested with the ART testrunner (commands below) as well as via VLOG.
CL2: Uncomment the APEX code on the ART side. CL3: Add the APEX code on the Perfetto side.
CL2 and CL3 will be submitted together so that the APEX dependencies are added without build failures.
CL4: Uncomment the Perfetto API code. To be reviewed by fmayer@ (Perfetto team). Further testing, potentially full feature testing including Perfetto, can happen at this point.
CL5: Further tests and/or optimizations as needed.

Test: The testrunner invocations below pass:
test/testrunner/testrunner.py --host --debug -b
test/testrunner/testrunner.py --host --debug -b --64 -t 004-ThreadStress
test/testrunner/testrunner.py --host --runtime-option=-XX:PerfettoJavaHeapStackProf=true --debug -b
test/testrunner/testrunner.py --host --runtime-option=-XX:PerfettoJavaHeapStackProf=true --debug -b --64 -t 004-ThreadStress
Additional individualized test runs and VLOG-based verification.

Bug: 160214819

Change-Id: I2be4c4e715ce8c3c8ac545e3e14332198b9c2295
diff --git a/runtime/Android.bp b/runtime/Android.bp
index 8ad00ad..ca82dcf 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -142,6 +142,7 @@
         "interpreter/shadow_frame.cc",
         "interpreter/unstarted_runtime.cc",
         "java_frame_root_info.cc",
+        "javaheapprof/javaheapsampler.cc",
         "jit/debugger_interface.cc",
         "jit/jit.cc",
         "jit/jit_code_cache.cc",
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 4a03e61..4ca6bf7 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -94,6 +94,7 @@
 #include "obj_ptr-inl.h"
 #include "reflection.h"
 #include "runtime.h"
+#include "javaheapprof/javaheapsampler.h"
 #include "scoped_thread_state_change-inl.h"
 #include "thread_list.h"
 #include "verify_object-inl.h"
@@ -348,6 +349,7 @@
                                         kGcCountRateMaxBucketCount),
       alloc_tracking_enabled_(false),
       alloc_record_depth_(AllocRecordObjectMap::kDefaultAllocStackDepth),
+      perfetto_javaheapprof_heapid_(0),
       backtrace_lock_(nullptr),
       seen_backtrace_count_(0u),
       unique_backtrace_count_(0u),
@@ -742,6 +744,15 @@
       LOG(FATAL) << "There's a gap between the image space and the non-moving space";
     }
   }
+  // Perfetto Java Heap Profiler Support.
+  if (runtime->IsPerfettoJavaHeapStackProfEnabled()) {
+    // Perfetto Plugin is loaded and enabled, initialize the Java Heap Profiler.
+    InitPerfettoJavaHeapProf();
+  } else {
+    // Disable the Java Heap Profiler.
+    GetHeapSampler().DisableHeapSampler(/*disable_ptr=*/nullptr, /*disable_info_ptr=*/nullptr);
+  }
+
   instrumentation::Instrumentation* const instrumentation = runtime->GetInstrumentation();
   if (gc_stress_mode_) {
     backtrace_lock_ = new Mutex("GC complete lock");
@@ -4031,6 +4042,71 @@
   }
 }
 
+// Perfetto Java Heap Profiler Support.
+
+// Perfetto initialization.
+void Heap::InitPerfettoJavaHeapProf() {
+  // Register the heap with Perfetto and create the heap id.
+  // The heap name used is "HeapSampler".
+  // The actual Perfetto registration is added in a later CL; use a placeholder id until then.
+  static uint32_t heap_id = 1;  // Initialized to 1; overwritten by the Perfetto heap id later.
+  SetPerfettoJavaHeapProfHeapID(heap_id);
+  // Enable the Java Heap Profiler.
+  GetHeapSampler().EnableHeapSampler(/*enable_ptr=*/nullptr, /*enable_info_ptr=*/nullptr);
+  // The enable/disable callbacks (and their callback data) will be registered with the
+  // Perfetto API in a later CL; until then nullptr is passed here.
+  VLOG(heap) << "Java Heap Profiler Initialized";
+}
+
+// Check if the Java Heap Profiler is enabled and initialized.
+int Heap::CheckPerfettoJHPEnabled() {
+  return GetHeapSampler().IsEnabled();
+}
+
+void Heap::JHPCheckNonTlabSampleAllocation(Thread* self, mirror::Object* ret, size_t alloc_size) {
+  bool take_sample = false;
+  size_t bytes_until_sample = 0;
+  HeapSampler& prof_heap_sampler = GetHeapSampler();
+  if (ret != nullptr && prof_heap_sampler.IsEnabled()) {
+    // An allocation occurred; sample it even though it is a non-TLAB allocation.
+    // Any take_sample value already set by an earlier GetSampleOffset call (from the
+    // TLAB allocation attempt) is ignored; a fresh value is computed below, and
+    // bytes_until_sample is updated accordingly.
+    // Note that the return value of GetSampleOffset (the next TLAB size) is not
+    // needed in the non-TLAB case.
+    prof_heap_sampler.GetSampleOffset(alloc_size,
+                                      self->GetTlabPosOffset(),
+                                      &take_sample,
+                                      &bytes_until_sample);
+    prof_heap_sampler.SetBytesUntilSample(bytes_until_sample);
+    if (take_sample) {
+      prof_heap_sampler.ReportSample(ret, alloc_size);
+    }
+    VLOG(heap) << "JHP:NonTlab:AllocNonvirtual";
+  }
+}
+
+size_t Heap::JHPCalculateNextTlabSize(Thread* self,
+                                      size_t jhp_def_tlab_size,
+                                      size_t alloc_size,
+                                      bool* take_sample,
+                                      size_t* bytes_until_sample) {
+  size_t next_tlab_size = jhp_def_tlab_size;
+  if (CheckPerfettoJHPEnabled()) {
+    size_t next_sample_point =
+        GetHeapSampler().GetSampleOffset(alloc_size,
+                                         self->GetTlabPosOffset(),
+                                         take_sample,
+                                         bytes_until_sample);
+    next_tlab_size = std::min(next_sample_point, jhp_def_tlab_size);
+  }
+  return next_tlab_size;
+}
+
+void Heap::AdjustSampleOffset(size_t adjustment) {
+  GetHeapSampler().AdjustSampleOffset(adjustment);
+}
+
 void Heap::CheckGcStressMode(Thread* self, ObjPtr<mirror::Object>* obj) {
   DCHECK(gc_stress_mode_);
   auto* const runtime = Runtime::Current();
@@ -4117,14 +4193,23 @@
                                        size_t* bytes_allocated,
                                        size_t* usable_size,
                                        size_t* bytes_tl_bulk_allocated) {
+  mirror::Object* ret = nullptr;
+  bool take_sample = false;
+  size_t bytes_until_sample = 0;
+
   if (kUsePartialTlabs && alloc_size <= self->TlabRemainingCapacity()) {
     DCHECK_GT(alloc_size, self->TlabSize());
     // There is enough space if we grow the TLAB. Lets do that. This increases the
     // TLAB bytes.
     const size_t min_expand_size = alloc_size - self->TlabSize();
+    size_t next_tlab_size = JHPCalculateNextTlabSize(self,
+                                                     kPartialTlabSize,
+                                                     alloc_size,
+                                                     &take_sample,
+                                                     &bytes_until_sample);
     const size_t expand_bytes = std::max(
         min_expand_size,
-        std::min(self->TlabRemainingCapacity() - self->TlabSize(), kPartialTlabSize));
+        std::min(self->TlabRemainingCapacity() - self->TlabSize(), next_tlab_size));
     if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, expand_bytes, grow))) {
       return nullptr;
     }
@@ -4133,7 +4218,12 @@
     DCHECK_LE(alloc_size, self->TlabSize());
   } else if (allocator_type == kAllocatorTypeTLAB) {
     DCHECK(bump_pointer_space_ != nullptr);
-    const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
+    size_t next_tlab_size = JHPCalculateNextTlabSize(self,
+                                                     kDefaultTLABSize,
+                                                     alloc_size,
+                                                     &take_sample,
+                                                     &bytes_until_sample);
+    const size_t new_tlab_size = alloc_size + next_tlab_size;
     if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) {
       return nullptr;
     }
@@ -4143,6 +4233,9 @@
       return nullptr;
     }
     *bytes_tl_bulk_allocated = new_tlab_size;
+    if (CheckPerfettoJHPEnabled()) {
+      VLOG(heap) << "JHP:kAllocatorTypeTLAB, New Tlab bytes allocated= " << new_tlab_size;
+    }
   } else {
     DCHECK(allocator_type == kAllocatorTypeRegionTLAB);
     DCHECK(region_space_ != nullptr);
@@ -4151,25 +4244,37 @@
       if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type,
                                             space::RegionSpace::kRegionSize,
                                             grow))) {
+        size_t def_pr_tlab_size = kUsePartialTlabs
+                                      ? kPartialTlabSize
+                                      : gc::space::RegionSpace::kRegionSize;
+        size_t next_pr_tlab_size = JHPCalculateNextTlabSize(self,
+                                                            def_pr_tlab_size,
+                                                            alloc_size,
+                                                            &take_sample,
+                                                            &bytes_until_sample);
         const size_t new_tlab_size = kUsePartialTlabs
-            ? std::max(alloc_size, kPartialTlabSize)
-            : gc::space::RegionSpace::kRegionSize;
+            ? std::max(alloc_size, next_pr_tlab_size)
+            : next_pr_tlab_size;
         // Try to allocate a tlab.
         if (!region_space_->AllocNewTlab(self, new_tlab_size, bytes_tl_bulk_allocated)) {
           // Failed to allocate a tlab. Try non-tlab.
-          return region_space_->AllocNonvirtual<false>(alloc_size,
-                                                       bytes_allocated,
-                                                       usable_size,
-                                                       bytes_tl_bulk_allocated);
+          ret = region_space_->AllocNonvirtual<false>(alloc_size,
+                                                      bytes_allocated,
+                                                      usable_size,
+                                                      bytes_tl_bulk_allocated);
+          JHPCheckNonTlabSampleAllocation(self, ret, alloc_size);
+          return ret;
         }
         // Fall-through to using the TLAB below.
       } else {
         // Check OOME for a non-tlab allocation.
         if (!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow)) {
-          return region_space_->AllocNonvirtual<false>(alloc_size,
-                                                       bytes_allocated,
-                                                       usable_size,
-                                                       bytes_tl_bulk_allocated);
+          ret = region_space_->AllocNonvirtual<false>(alloc_size,
+                                                      bytes_allocated,
+                                                      usable_size,
+                                                      bytes_tl_bulk_allocated);
+          JHPCheckNonTlabSampleAllocation(self, ret, alloc_size);
+          return ret;
         }
         // Neither tlab or non-tlab works. Give up.
         return nullptr;
@@ -4177,19 +4282,34 @@
     } else {
       // Large. Check OOME.
       if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow))) {
-        return region_space_->AllocNonvirtual<false>(alloc_size,
-                                                     bytes_allocated,
-                                                     usable_size,
-                                                     bytes_tl_bulk_allocated);
+        ret = region_space_->AllocNonvirtual<false>(alloc_size,
+                                                    bytes_allocated,
+                                                    usable_size,
+                                                    bytes_tl_bulk_allocated);
+        JHPCheckNonTlabSampleAllocation(self, ret, alloc_size);
+        return ret;
       }
       return nullptr;
     }
   }
   // Refilled TLAB, return.
-  mirror::Object* ret = self->AllocTlab(alloc_size);
+  ret = self->AllocTlab(alloc_size);
   DCHECK(ret != nullptr);
   *bytes_allocated = alloc_size;
   *usable_size = alloc_size;
+
+  // JavaHeapProfiler: report this allocation to the heap sampler if a sample was requested,
+  // and store the updated bytes_until_sample now that the allocation has completed.
+  // This is the fall-through path from the TLAB-using cases of the if/else-if above.
+  if (CheckPerfettoJHPEnabled()) {
+    if (take_sample) {
+      GetHeapSampler().ReportSample(ret, alloc_size);
+      // Update the bytes_until_sample now that the allocation is already done.
+      GetHeapSampler().SetBytesUntilSample(bytes_until_sample);
+    }
+    VLOG(heap) << "JHP:Fallthrough Tlab allocation";
+  }
+
   return ret;
 }
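
Note (illustration only, not part of the CL): the interplay between the sample point and the
TLAB size above comes down to the clamping done in JHPCalculateNextTlabSize. A minimal
standalone sketch with made-up sizes and a hypothetical helper name:

    // Sketch of the TLAB-size clamping in Heap::JHPCalculateNextTlabSize: the new TLAB never
    // extends past the next sample point, so the allocation that crosses the sample point is
    // forced back into AllocWithNewTLAB, where the sample can be taken and reported.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    size_t NextTlabSize(size_t default_tlab_size, size_t next_sample_point) {
      return std::min(next_sample_point, default_tlab_size);
    }

    int main() {
      constexpr size_t kDemoDefaultTlabSize = 32 * 1024;  // hypothetical default
      // Sample point far away: the full default TLAB size is used.
      std::cout << NextTlabSize(kDemoDefaultTlabSize, 100 * 1024) << "\n";  // 32768
      // Sample due 9000 bytes ahead: the TLAB is cut short at the sample point.
      std::cout << NextTlabSize(kDemoDefaultTlabSize, 9000) << "\n";        // 9000
    }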
 
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 99da008..7cc06a7 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -834,6 +834,38 @@
   void DumpGcCountRateHistogram(std::ostream& os) const REQUIRES(!*gc_complete_lock_);
   void DumpBlockingGcCountRateHistogram(std::ostream& os) const REQUIRES(!*gc_complete_lock_);
 
+  // Perfetto Java Heap Profiler support.
+  // The heap id identifies this heap to the Perfetto API and is used when reporting
+  // allocations to it.
+  void SetPerfettoJavaHeapProfHeapID(uint32_t heapid) {
+    perfetto_javaheapprof_heapid_ = heapid;
+  }
+
+  uint32_t GetPerfettoJavaHeapProfHeapID() const {
+    return perfetto_javaheapprof_heapid_;
+  }
+
+  HeapSampler& GetHeapSampler() {
+    return heap_sampler_;
+  }
+
+  void InitPerfettoJavaHeapProf();
+  int CheckPerfettoJHPEnabled();
+  // In the non-TLAB case: check whether this allocation should be sampled and, if so,
+  // report it. Also updates the sampler state (bytes_until_sample).
+  void JHPCheckNonTlabSampleAllocation(Thread* self,
+                                       mirror::Object* ret,
+                                       size_t alloc_size);
+  // In the TLAB case: calculate the next TLAB size (i.e. the position of the next sample
+  // point) and whether a sample should be taken.
+  size_t JHPCalculateNextTlabSize(Thread* self,
+                                  size_t jhp_def_tlab_size,
+                                  size_t alloc_size,
+                                  bool* take_sample,
+                                  size_t* bytes_until_sample);
+  // Reduce the number of bytes to the next sample position by this adjustment.
+  void AdjustSampleOffset(size_t adjustment);
+
   // Allocation tracking support
   // Callers to this function use double-checked locking to ensure safety on allocation_records_
   bool IsAllocTrackingEnabled() const {
@@ -1571,6 +1603,10 @@
   std::unique_ptr<AllocRecordObjectMap> allocation_records_;
   size_t alloc_record_depth_;
 
+  // Perfetto Java Heap Profiler support.
+  uint32_t perfetto_javaheapprof_heapid_;
+  HeapSampler heap_sampler_;
+
   // GC stress related data structures.
   Mutex* backtrace_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   // Debugging variables, seen backtraces vs unique backtraces.
diff --git a/runtime/javaheapprof/javaheapsampler.cc b/runtime/javaheapprof/javaheapsampler.cc
new file mode 100644
index 0000000..a1c58d8
--- /dev/null
+++ b/runtime/javaheapprof/javaheapsampler.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "base/atomic.h"
+#include "base/locks.h"
+#include "gc/heap.h"
+#include "javaheapprof/javaheapsampler.h"
+#include "runtime.h"
+
+namespace art {
+
+size_t HeapSampler::NextGeoDistRandSample() {
+  // rng_ and geo_dist_ are not thread safe on their own, so take the lock while using them.
+  art::MutexLock mu(art::Thread::Current(), geo_dist_rng_lock_);
+  size_t nsample = geo_dist_(rng_);
+  if (nsample == 0) {
+    // The geometric distribution can return zero.
+    // Map zero to 1 so the next sample is always at least one byte away.
+    nsample = 1;
+  }
+  return nsample;
+}
+
+size_t HeapSampler::PickAndAdjustNextSample(size_t sample_adjust_bytes) {
+  size_t bytes_until_sample;
+  if (GetSamplingInterval() == 1) {
+    bytes_until_sample = 1;
+    return bytes_until_sample;
+  }
+  bytes_until_sample = NextGeoDistRandSample();
+  VLOG(heap) << "JHP:PickAndAdjustNextSample, sample_adjust_bytes: "
+             << sample_adjust_bytes
+             << " bytes_until_sample: " << bytes_until_sample;
+  // Adjust the next sample point down by sample_adjust_bytes (the overshoot past the last sample).
+  if (sample_adjust_bytes > 0 && bytes_until_sample > sample_adjust_bytes) {
+    bytes_until_sample -= sample_adjust_bytes;
+    VLOG(heap) << "JHP:PickAndAdjustNextSample, final bytes_until_sample: "
+               << bytes_until_sample;
+  }
+  return bytes_until_sample;
+}
+
+// Report an allocation sample to Perfetto.
+// Samples can only be reported after the allocation has completed.
+// Likewise bytes_until_sample can only be updated after the allocation (and its reporting)
+// is done, so the next bytes_until_sample is computed up front by GetSampleOffset (to derive
+// the next TLAB size) and stored afterwards by the caller via SetBytesUntilSample.
+void HeapSampler::ReportSample(art::mirror::Object* obj ATTRIBUTE_UNUSED, size_t allocation_size) {
+  VLOG(heap) << "JHP:***Report Perfetto Allocation: alloc_size: " << allocation_size;
+}
+
+// Check whether a sample should be taken at this allocation, and calculate the sample
+// offset to use in the expand-TLAB calculation, i.e. the offset from the current position
+// to the next sample.
+// tlab_used = pos - start
+size_t HeapSampler::GetSampleOffset(size_t alloc_size,
+                                    size_t tlab_used,
+                                    bool* take_sample,
+                                    size_t* temp_bytes_until_sample) {
+  size_t exhausted_size = alloc_size + tlab_used;
+  VLOG(heap) << "JHP:GetSampleOffset: exhausted_size = " << exhausted_size;
+  // Note bytes_until_sample is used as an offset from the start point
+  size_t bytes_until_sample = *GetBytesUntilSample();
+  ssize_t diff = bytes_until_sample - exhausted_size;
+  VLOG(heap) << "JHP:GetSampleOffset: diff = " << diff << " bytes_until_sample = "
+             << bytes_until_sample;
+  if (diff <= 0) {
+    *take_sample = true;
+    // Compute a new bytes_until_sample
+    size_t sample_adj_bytes = -diff;
+    size_t next_bytes_until_sample = PickAndAdjustNextSample(sample_adj_bytes);
+    VLOG(heap) << "JHP:GetSampleOffset: Take sample, next_bytes_until_sample = "
+               << next_bytes_until_sample;
+    next_bytes_until_sample += tlab_used;
+    VLOG(heap) << "JHP:GetSampleOffset:Next sample offset = "
+               << (next_bytes_until_sample - tlab_used);
+    // This function is called before the actual allocation happens, so bytes_until_sample
+    // cannot be updated yet; stash the new value in temp_bytes_until_sample and let the
+    // caller store it once the allocation is done.
+    *temp_bytes_until_sample = next_bytes_until_sample;
+    return (next_bytes_until_sample - tlab_used);
+    // Note: temp_bytes_until_sample holds the value relative to the TLAB start, not the returned offset.
+  } else {
+    *take_sample = false;
+    // The following two lines are needed for the non-TLAB case but have no effect in the
+    // TLAB case: there the caller only consumes temp_bytes_until_sample (via
+    // SetBytesUntilSample, after this function returns) when take_sample is true, which
+    // it is not on this path.
+    size_t next_bytes_until_sample = bytes_until_sample - alloc_size;
+    *temp_bytes_until_sample = next_bytes_until_sample;
+    VLOG(heap) << "JHP:GetSampleOffset: No sample, next_bytes_until_sample= "
+               << next_bytes_until_sample << " alloc= " << alloc_size;
+    return diff;
+  }
+}
+
+// Sample positions are tracked as offsets from the start of the TLAB, so the stored offset
+// must be adjusted whenever the TLAB is reset (ResetTlab).
+// The adjustment is the change in the reference position, usually the new pos - start.
+void HeapSampler::AdjustSampleOffset(size_t adjustment) {
+  size_t* bytes_until_sample = GetBytesUntilSample();
+  size_t cur_bytes_until_sample = *bytes_until_sample;
+  if (cur_bytes_until_sample < adjustment) {
+    VLOG(heap) << "JHP:AdjustSampleOffset:No Adjustment";
+    return;
+  }
+  size_t next_bytes_until_sample = cur_bytes_until_sample - adjustment;
+  *bytes_until_sample = next_bytes_until_sample;
+  VLOG(heap) << "JHP:AdjustSampleOffset: adjustment = " << adjustment
+             << " next_bytes_until_sample = " << next_bytes_until_sample;
+}
+
+// Enable the heap sampler and initialize/set the sampling interval.
+void HeapSampler::EnableHeapSampler(void* enable_ptr ATTRIBUTE_UNUSED,
+                                    const void* enable_info_ptr ATTRIBUTE_UNUSED) {
+  // Until the Perfetto API code is uncommented (in a later CL), use a fixed 4 KiB interval;
+  // it will then be replaced by the sampling interval from AHeapProfileSessionInfo.
+  uint64_t interval = 4 * 1024;
+  if (interval > 0) {
+    // rng_ and geo_dist_ are not thread safe on their own, so take the lock while setting
+    // the sampling interval.
+    art::MutexLock mu(art::Thread::Current(), geo_dist_rng_lock_);
+    SetSamplingInterval(interval);
+  }
+  // Otherwise the default 4 KiB sampling interval stays in effect. This should not happen with
+  // the Perfetto API: AHeapProfileEnableCallbackInfo_getSamplingInterval always returns the
+  // requested sampling interval as a uint64_t, which is checked for != 0.
+  // Do not call heap->GetPerfettoJavaHeapProfHeapID() here, even as a temporary: it builds,
+  // but the test run then fails silently because the Heap is not fully constructed yet.
+  perfetto_heap_id_ = 1;  // Placeholder; to be set through the Perfetto API.
+  enabled_.store(true, std::memory_order_release);
+}
+
+bool HeapSampler::IsEnabled() {
+  return enabled_.load(std::memory_order_acquire);
+}
+
+void HeapSampler::DisableHeapSampler(void* disable_ptr ATTRIBUTE_UNUSED,
+                                     const void* disable_info_ptr ATTRIBUTE_UNUSED) {
+  enabled_.store(false, std::memory_order_release);
+}
+
+int HeapSampler::GetSamplingInterval() {
+  return p_sampling_interval_.load(std::memory_order_acquire);
+}
+
+void HeapSampler::SetSamplingInterval(int sampling_interval) {
+  p_sampling_interval_.store(sampling_interval, std::memory_order_release);
+  geo_dist_.param(std::geometric_distribution<size_t>::param_type(1.0/p_sampling_interval_));
+}
+
+void HeapSampler::SetSessionInfo(void* info) {
+  perfetto_session_info_ = info;
+}
+
+void* HeapSampler::GetSessionInfo() {
+  return perfetto_session_info_;
+}
+
+}  // namespace art
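
Note (illustration only, not part of the CL): a standalone walk-through of the GetSampleOffset
arithmetic above with made-up numbers; as in the comments, bytes_until_sample is an offset from
the TLAB start and tlab_used = pos - start.

    #include <cstddef>
    #include <iostream>

    int main() {
      size_t bytes_until_sample = 5000;  // next sample due 5000 bytes past the TLAB start
      size_t tlab_used = 4200;           // bytes already allocated in this TLAB
      size_t alloc_size = 1000;          // size of the allocation being attempted

      size_t exhausted_size = alloc_size + tlab_used;  // 5200
      // diff <= 0 means this allocation reaches or crosses the sample point.
      std::ptrdiff_t diff = static_cast<std::ptrdiff_t>(bytes_until_sample) -
                            static_cast<std::ptrdiff_t>(exhausted_size);  // -200
      std::cout << "take_sample = " << (diff <= 0) << "\n";               // 1
      // The 200-byte overshoot becomes sample_adjust_bytes in PickAndAdjustNextSample,
      // shortening the next geometrically drawn interval by the bytes already consumed
      // past the old sample point.
      std::cout << "sample_adjust_bytes = " << -diff << "\n";             // 200
    }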
diff --git a/runtime/javaheapprof/javaheapsampler.h b/runtime/javaheapprof/javaheapsampler.h
new file mode 100644
index 0000000..02cb7b7
--- /dev/null
+++ b/runtime/javaheapprof/javaheapsampler.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_JAVAHEAPPROF_JAVAHEAPSAMPLER_H_
+#define ART_RUNTIME_JAVAHEAPPROF_JAVAHEAPSAMPLER_H_
+
+#include <random>
+#include "base/locks.h"
+#include "base/mutex.h"
+#include "mirror/object.h"
+
+namespace art {
+
+class HeapSampler {
+ public:
+  HeapSampler() : rng_(/*seed=*/std::minstd_rand::default_seed),
+                  geo_dist_(1.0 / /*expected value=4KB*/ 4096),
+                  geo_dist_rng_lock_("Heap Sampler RNG Geometric Dist lock",
+                                     art::LockLevel::kGenericBottomLock) {}
+
+  // Set the bytes until sample.
+  void SetBytesUntilSample(size_t bytes) {
+    *GetBytesUntilSample() = bytes;
+  }
+  // Get the bytes until sample.
+  size_t* GetBytesUntilSample() {
+    // The thread-local is initialized once, the first time each thread calls this function,
+    // but a slot for it is allocated at thread creation.
+    thread_local size_t bytes_until_sample = 0;
+    return &bytes_until_sample;
+  }
+  // Report a sample to Perfetto.
+  void ReportSample(art::mirror::Object* obj, size_t allocation_size);
+  // Check whether we should take a sample or not at this allocation, and return the
+  // number of bytes from current pos to the next sample to use in the expand Tlab
+  // calculation.
+  // Update state of both take_sample and temp_bytes_until_sample.
+  // tlab_used = pos - start
+  // Note: bytes_until_sample is not updated here; it is saved after the allocation happens,
+  // since this function can be called before the actual allocation.
+  size_t GetSampleOffset(size_t alloc_size,
+                         size_t tlab_used,
+                         bool* take_sample,
+                         size_t* temp_bytes_until_sample) REQUIRES(!geo_dist_rng_lock_);
+  // Adjust the sample offset value by the given adjustment, usually (pos - start)
+  // of the new TLAB after a Reset.
+  void AdjustSampleOffset(size_t adjustment);
+  // Is heap sampler enabled?
+  bool IsEnabled();
+  void EnableHeapSampler(void* enable_ptr, const void* enable_info_ptr);
+  void DisableHeapSampler(void* disable_ptr, const void* disable_info_ptr);
+  // Set the sampling interval.
+  void SetSamplingInterval(int sampling_interval) REQUIRES(geo_dist_rng_lock_);
+  // Return the sampling interval.
+  int GetSamplingInterval();
+  // Set the Perfetto Session Info.
+  void SetSessionInfo(void* info);
+  // Get the Perfetto Session Info.
+  void* GetSessionInfo();
+
+ private:
+  size_t NextGeoDistRandSample() REQUIRES(!geo_dist_rng_lock_);
+  // Pick and return the number of bytes until the next sample, decreased by
+  // sample_adj_bytes (the overshoot past the previous sample point) when applicable.
+  size_t PickAndAdjustNextSample(size_t sample_adj_bytes = 0) REQUIRES(!geo_dist_rng_lock_);
+
+  std::atomic<bool> enabled_{false};
+  // Default sampling interval is 4 KiB (4096 bytes).
+  // Writes guarded by geo_dist_rng_lock_.
+  std::atomic<int> p_sampling_interval_{4 * 1024};
+  void* perfetto_session_info_ = nullptr;
+  uint32_t perfetto_heap_id_ = 0;
+  // std random number generator.
+  std::minstd_rand rng_ GUARDED_BY(geo_dist_rng_lock_);  // Holds the state
+  // std geometric distribution
+  std::geometric_distribution</*result_type=*/size_t> geo_dist_ GUARDED_BY(geo_dist_rng_lock_);
+  // Multiple threads can access the geometric distribution and the random number
+  // generator concurrently and thus geo_dist_rng_lock_ is used for thread safety.
+  art::Mutex geo_dist_rng_lock_;
+};
+
+}  // namespace art
+
+#endif  // ART_RUNTIME_JAVAHEAPPROF_JAVAHEAPSAMPLER_H_
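
Note (illustration only, not part of the CL): the header above fixes the expected sampling
interval at 4096 bytes via a geometric distribution. A minimal standalone sketch of that scheme,
using an illustrative stream of fixed-size allocations, showing that roughly one allocation per
4 KiB of allocated bytes gets sampled:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <random>

    int main() {
      constexpr size_t kSamplingInterval = 4096;  // bytes; matches the default above
      std::minstd_rand rng(std::minstd_rand::default_seed);
      // Each allocated byte is a sample point with probability 1/interval, so the gap
      // between samples is geometrically distributed with mean ~interval.
      std::geometric_distribution<size_t> geo(1.0 / kSamplingInterval);

      size_t bytes_until_sample = std::max<size_t>(geo(rng), 1);
      size_t sampled = 0;
      for (int i = 0; i < 10000; ++i) {
        size_t alloc_size = 64;  // illustrative fixed-size allocations
        if (alloc_size >= bytes_until_sample) {
          ++sampled;  // the real code would call ReportSample(obj, alloc_size) here
          bytes_until_sample = std::max<size_t>(geo(rng), 1);
        } else {
          bytes_until_sample -= alloc_size;
        }
      }
      // Expect roughly 10000 * 64 / 4096 ≈ 156 samples.
      std::cout << "samples: " << sampled << "\n";
    }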
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 6dd121c..3aa85cd 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -439,6 +439,10 @@
           .WithType<bool>()
           .WithValueMap({{"false", false}, {"true", true}})
           .IntoKey(M::PerfettoHprof)
+      .Define("-XX:PerfettoJavaHeapStackProf=_")
+          .WithType<bool>()
+          .WithValueMap({{"false", false}, {"true", true}})
+          .IntoKey(M::PerfettoJavaHeapStackProf)
       .Ignore({
           "-ea", "-da", "-enableassertions", "-disableassertions", "--runtime-arg", "-esa",
           "-dsa", "-enablesystemassertions", "-disablesystemassertions", "-Xrs", "-Xint:_",
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index ae54453..aeaa175 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -296,7 +296,8 @@
       zygote_no_threads_(false),
       verifier_logging_threshold_ms_(100),
       verifier_missing_kthrow_fatal_(false),
-      perfetto_hprof_enabled_(false) {
+      perfetto_hprof_enabled_(false),
+      perfetto_javaheapprof_enabled_(false) {
   static_assert(Runtime::kCalleeSaveSize ==
                     static_cast<uint32_t>(CalleeSaveType::kLastCalleeSaveType), "Unexpected size");
   CheckConstants();
@@ -1085,6 +1086,16 @@
       LOG(WARNING) << "Failed to load perfetto_hprof: " << err;
     }
   }
+  if (IsPerfettoJavaHeapStackProfEnabled() &&
+      (Dbg::IsJdwpAllowed() || IsProfileableFromShell() || IsJavaDebuggable() ||
+       Runtime::Current()->IsSystemServer())) {
+    std::string err;
+    ScopedTrace tr("perfetto_javaheapprof init.");
+    ScopedThreadSuspension sts(Thread::Current(), ThreadState::kNative);
+    if (!EnsurePerfettoJavaHeapProfPlugin(&err)) {
+      LOG(WARNING) << "Failed to load perfetto_javaheapprof: " << err;
+    }
+  }
   if (LIKELY(automatically_set_jni_ids_indirection_) && CanSetJniIdType()) {
     if (IsJavaDebuggable()) {
       SetJniIdType(JniIdType::kIndices);
@@ -1217,6 +1228,7 @@
 
   verifier_missing_kthrow_fatal_ = runtime_options.GetOrDefault(Opt::VerifierMissingKThrowFatal);
   perfetto_hprof_enabled_ = runtime_options.GetOrDefault(Opt::PerfettoHprof);
+  perfetto_javaheapprof_enabled_ = runtime_options.GetOrDefault(Opt::PerfettoJavaHeapStackProf);
 
   // Try to reserve a dedicated fault page. This is allocated for clobbered registers and sentinels.
   // If we cannot reserve it, log a warning.
@@ -1807,6 +1819,14 @@
     // subsequent dlopens for the library no-ops.
     dlopen(plugin_name, RTLD_NOW | RTLD_LOCAL);
   }
+  if (IsZygote() && IsPerfettoJavaHeapStackProfEnabled()) {
+    // There is no debug build of heapprofd_client_api.so currently.
+    // Add debug build .so when available.
+    constexpr const char* jhp_plugin_name = "heapprofd_client_api.so";
+    // Load eagerly in Zygote to improve app startup times. This will make
+    // subsequent dlopens for the library no-ops.
+    dlopen(jhp_plugin_name, RTLD_NOW | RTLD_LOCAL);
+  }
 
   VLOG(startup) << "Runtime::Init exiting";
 
@@ -1847,6 +1867,13 @@
   return EnsurePluginLoaded(plugin_name, error_msg);
 }
 
+bool Runtime::EnsurePerfettoJavaHeapProfPlugin(std::string* error_msg) {
+  // There is no debug build of heapprofd_client_api.so currently.
+  // Add debug build .so when available.
+  constexpr const char* jhp_plugin_name = "heapprofd_client_api.so";
+  return EnsurePluginLoaded(jhp_plugin_name, error_msg);
+}
+
 static bool EnsureJvmtiPlugin(Runtime* runtime,
                               std::string* error_msg) {
   // TODO Rename Dbg::IsJdwpAllowed is IsDebuggingAllowed.
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 662238e..0054403 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -132,6 +132,7 @@
 
   bool EnsurePluginLoaded(const char* plugin_name, std::string* error_msg);
   bool EnsurePerfettoPlugin(std::string* error_msg);
+  bool EnsurePerfettoJavaHeapProfPlugin(std::string* error_msg);
 
   // IsAotCompiler for compilers that don't have a running runtime. Only dex2oat currently.
   bool IsAotCompiler() const {
@@ -957,6 +958,10 @@
     return perfetto_hprof_enabled_;
   }
 
+  bool IsPerfettoJavaHeapStackProfEnabled() const {
+    return perfetto_javaheapprof_enabled_;
+  }
+
   // Return true if we should load oat files as executable or not.
   bool GetOatFilesExecutable() const;
 
@@ -1311,6 +1316,7 @@
 
   bool verifier_missing_kthrow_fatal_;
   bool perfetto_hprof_enabled_;
+  bool perfetto_javaheapprof_enabled_;
 
   metrics::ArtMetrics metrics_;
   std::unique_ptr<metrics::MetricsReporter> metrics_reporter_;
diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def
index 6aec33c..1961113 100644
--- a/runtime/runtime_options.def
+++ b/runtime/runtime_options.def
@@ -183,6 +183,9 @@
 // This is set to true in frameworks/base/core/jni/AndroidRuntime.cpp.
 RUNTIME_OPTIONS_KEY (bool,                PerfettoHprof,                  false)
 
+// Enables/disables the Perfetto Java Heap Stack Profiler.
+RUNTIME_OPTIONS_KEY (bool,                PerfettoJavaHeapStackProf,      false)
+
 // Whether to dump ART metrics to logcat
 RUNTIME_OPTIONS_KEY (Unit,                WriteMetricsToLog)
 RUNTIME_OPTIONS_KEY (std::string,         WriteMetricsToFile)
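
Note (not part of the CL): the new PerfettoJavaHeapStackProf key defaults to false and is parsed
in parsed_options.cc above; for local runs it can be enabled the same way as in the testrunner
commands in the commit message, e.g. (illustrative): dalvikvm -XX:PerfettoJavaHeapStackProf=true
-cp <dex file> <main class>.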
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 7f80691..8f999f6 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -4154,6 +4154,15 @@
 }
 
 void Thread::ResetTlab() {
+  gc::Heap* const heap = Runtime::Current()->GetHeap();
+  if (heap->GetHeapSampler().IsEnabled()) {
+    // Note: We always ResetTlab before SetTlab, therefore we can do the sample
+    // offset adjustment here.
+    heap->AdjustSampleOffset(GetTlabPosOffset());
+    VLOG(heap) << "JHP: ResetTlab, Tid: " << GetTid()
+               << " adjustment = "
+               << (tlsPtr_.thread_local_pos - tlsPtr_.thread_local_start);
+  }
   SetTlab(nullptr, nullptr, nullptr);
 }
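
Note (illustration only, not part of the CL): a tiny standalone example, with made-up numbers,
of why ResetTlab calls AdjustSampleOffset with GetTlabPosOffset():

    #include <cstddef>
    #include <iostream>

    int main() {
      // bytes_until_sample is relative to the TLAB start, so when the TLAB is reset the bytes
      // already consumed in the old TLAB are subtracted to keep the sample point in place.
      size_t bytes_until_sample = 10000;  // sample due 10000 bytes past the old TLAB start
      size_t tlab_pos_offset = 6000;      // pos - start of the TLAB being reset
      if (bytes_until_sample >= tlab_pos_offset) {  // mirrors the guard in AdjustSampleOffset
        bytes_until_sample -= tlab_pos_offset;
      }
      std::cout << bytes_until_sample << "\n";  // 4000 bytes remain until the next sample
    }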
 
diff --git a/runtime/thread.h b/runtime/thread.h
index b23f647..7475681 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -36,6 +36,7 @@
 #include "handle.h"
 #include "handle_scope.h"
 #include "interpreter/interpreter_cache.h"
+#include "javaheapprof/javaheapsampler.h"
 #include "jvalue.h"
 #include "managed_stack.h"
 #include "offsets.h"
@@ -1157,6 +1158,11 @@
     return tlsPtr_.thread_local_end - tlsPtr_.thread_local_pos;
   }
 
+  // Returns pos offset from start.
+  size_t GetTlabPosOffset() const {
+    return tlsPtr_.thread_local_pos - tlsPtr_.thread_local_start;
+  }
+
   // Returns the remaining space in the TLAB if we were to expand it to maximum capacity.
   size_t TlabRemainingCapacity() const {
     return tlsPtr_.thread_local_limit - tlsPtr_.thread_local_pos;