Diffstat (limited to 'runtime/trace.cc')
| -rw-r--r-- | runtime/trace.cc | 826 |
1 file changed, 548 insertions, 278 deletions
diff --git a/runtime/trace.cc b/runtime/trace.cc
index ec61726ff2..429a5ae591 100644
--- a/runtime/trace.cc
+++ b/runtime/trace.cc
@@ -82,33 +82,147 @@ static TraceAction DecodeTraceAction(uint32_t tmid) {
   return static_cast<TraceAction>(tmid & kTraceMethodActionMask);
 }
 
+namespace {
+// Scaling factor to convert timestamp counter values into wall clock time reported in
+// microseconds. This is initialized at the start of tracing using the timestamp counter update
+// frequency. See InitializeTimestampCounters for more details.
+double tsc_to_microsec_scaling_factor = -1.0;
+
+uint64_t GetTimestamp() {
+  uint64_t t = 0;
+#if defined(__arm__)
+  // See Architecture Reference Manual ARMv7-A and ARMv7-R edition section B4.1.34.
+  // Q and R specify that the value should be written to the lower and upper halves of the 64-bit
+  // operand. See: https://llvm.org/docs/LangRef.html#asm-template-argument-modifiers
+  asm volatile("mrrc p15, 1, %Q0, %R0, c14" : "=r"(t));
+#elif defined(__aarch64__)
+  // See Arm Architecture Registers Armv8 section System Registers.
+  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
+#elif defined(__i386__) || defined(__x86_64__)
+  // rdtsc returns two 32-bit values in rax and rdx even on 64-bit architectures.
+  unsigned int lo, hi;
+  asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
+  t = (static_cast<uint64_t>(hi) << 32) | lo;
+#else
+  t = MicroTime();
+#endif
+  return t;
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+// Here we compute the scaling factor by sleeping for a millisecond. Alternatively, we could
+// record a raw timestamp counter reading together with clock_gettime at both the start and the
+// end of the trace, and compute the frequency of timestamp counter updates in a post-processing
+// step from those two samples. However, that would require a change in Android Studio, which is
+// the main consumer of these profiles. For now, just compute the frequency of tsc updates here.
+double computeScalingFactor() {
+  uint64_t start = MicroTime();
+  uint64_t start_tsc = GetTimestamp();
+  // Sleep for one millisecond.
+  usleep(1000);
+  uint64_t diff_tsc = GetTimestamp() - start_tsc;
+  uint64_t diff_time = MicroTime() - start;
+  double scaling_factor = static_cast<double>(diff_time) / diff_tsc;
+  DCHECK(scaling_factor > 0.0) << scaling_factor;
+  return scaling_factor;
+}
+
+double GetScalingFactorForX86() {
+  uint32_t eax, ebx, ecx;
+  asm volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx) : "a"(0x0), "c"(0));
+  if (eax < 0x15) {
+    // There is no 15H - Timestamp counter and core crystal clock information
+    // leaf. Just compute the frequency.
+    return computeScalingFactor();
+  }
+
+  // From the Intel architecture-instruction-set-extensions-programming-reference:
+  // EBX[31:0]/EAX[31:0] indicates the ratio of the TSC frequency and the
+  // core crystal clock frequency.
+  // If EBX[31:0] is 0, the TSC and "core crystal clock" ratio is not enumerated.
+  // If ECX is 0, the nominal core crystal clock frequency is not enumerated.
+  // "TSC frequency" = "core crystal clock frequency" * EBX/EAX.
+  // The core crystal clock may differ from the reference clock, bus clock, or core clock
+  // frequencies.
+  // EAX Bits 31-00: An unsigned integer which is the denominator of the
+  //                 TSC/"core crystal clock" ratio.
+  // EBX Bits 31-00: An unsigned integer which is the numerator of the
+  //                 TSC/"core crystal clock" ratio.
+  // ECX Bits 31-00: An unsigned integer which is the nominal frequency of the core
+  //                 crystal clock in Hz.
+  // EDX Bits 31-00: Reserved = 0.
+ asm volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx) : "a"(0x15), "c"(0)); + if (ebx == 0 || ecx == 0) { + return computeScalingFactor(); + } + double coreCrystalFreq = ecx; + // frequency = coreCrystalFreq * (ebx / eax) + // scaling_factor = seconds_to_microseconds / frequency + // = seconds_to_microseconds * eax / (coreCrystalFreq * ebx) + double seconds_to_microseconds = 1000 * 1000; + double scaling_factor = (seconds_to_microseconds * eax) / (coreCrystalFreq * ebx); + return scaling_factor; +} +#endif + +void InitializeTimestampCounters() { + // It is sufficient to initialize this once for the entire execution. Just return if it is + // already initialized. + if (tsc_to_microsec_scaling_factor > 0.0) { + return; + } + +#if defined(__arm__) + double seconds_to_microseconds = 1000 * 1000; + uint64_t freq = 0; + // See Architecture Reference Manual ARMv7-A and ARMv7-R edition section B4.1.21 + asm volatile("mrc p15, 0, %0, c14, c0, 0" : "=r"(freq)); + tsc_to_microsec_scaling_factor = seconds_to_microseconds / static_cast<double>(freq); +#elif defined(__aarch64__) + double seconds_to_microseconds = 1000 * 1000; + uint64_t freq = 0; + // See Arm Architecture Registers Armv8 section System Registers + asm volatile("mrs %0, cntfrq_el0" : "=r"(freq)); + tsc_to_microsec_scaling_factor = seconds_to_microseconds / static_cast<double>(freq); +#elif defined(__i386__) || defined(__x86_64__) + tsc_to_microsec_scaling_factor = GetScalingFactorForX86(); +#else + tsc_to_microsec_scaling_factor = 1.0; +#endif +} + +ALWAYS_INLINE uint64_t GetMicroTime(uint64_t counter) { + DCHECK(tsc_to_microsec_scaling_factor > 0.0) << tsc_to_microsec_scaling_factor; + return tsc_to_microsec_scaling_factor * counter; +} + +} // namespace + ArtMethod* Trace::DecodeTraceMethod(uint32_t tmid) { - MutexLock mu(Thread::Current(), *unique_methods_lock_); - return unique_methods_[tmid >> TraceActionBits]; + uint32_t method_index = tmid >> TraceActionBits; + // This is used only for logging which is usually needed only for debugging ART. So it's not + // performance critical. + for (auto const& entry : art_method_id_map_) { + if (method_index == entry.second) { + return entry.first; + } + } + return nullptr; } uint32_t Trace::EncodeTraceMethod(ArtMethod* method) { - MutexLock mu(Thread::Current(), *unique_methods_lock_); - uint32_t idx; + uint32_t idx = 0; auto it = art_method_id_map_.find(method); if (it != art_method_id_map_.end()) { idx = it->second; } else { - unique_methods_.push_back(method); - idx = unique_methods_.size() - 1; + idx = current_method_index_; art_method_id_map_.emplace(method, idx); + current_method_index_++; } - DCHECK_LT(idx, unique_methods_.size()); - DCHECK_EQ(unique_methods_[idx], method); return idx; } -uint32_t Trace::EncodeTraceMethodAndAction(ArtMethod* method, TraceAction action) { - uint32_t tmid = (EncodeTraceMethod(method) << TraceActionBits) | action; - DCHECK_EQ(method, DecodeTraceMethod(tmid)); - return tmid; -} - std::vector<ArtMethod*>* Trace::AllocStackTrace() { return (temp_stack_trace_.get() != nullptr) ? temp_stack_trace_.release() : new std::vector<ArtMethod*>(); @@ -154,7 +268,7 @@ void Trace::MeasureClockOverhead() { Thread::Current()->GetCpuMicroTime(); } if (UseWallClock()) { - MicroTime(); + GetTimestamp(); } } @@ -237,14 +351,13 @@ void Trace::CompareAndUpdateStackTrace(Thread* thread, thread->SetStackTraceSample(stack_trace); // Read timer clocks to use for all events in this trace. 
@@ -237,14 +351,13 @@ void Trace::CompareAndUpdateStackTrace(Thread* thread,
   thread->SetStackTraceSample(stack_trace);
   // Read timer clocks to use for all events in this trace.
   uint32_t thread_clock_diff = 0;
-  uint32_t wall_clock_diff = 0;
-  ReadClocks(thread, &thread_clock_diff, &wall_clock_diff);
+  uint64_t timestamp_counter = 0;
+  ReadClocks(thread, &thread_clock_diff, &timestamp_counter);
   if (old_stack_trace == nullptr) {
     // If there's no previous stack trace sample for this thread, log an entry event for all
     // methods in the trace.
     for (auto rit = stack_trace->rbegin(); rit != stack_trace->rend(); ++rit) {
-      LogMethodTraceEvent(thread, *rit, instrumentation::Instrumentation::kMethodEntered,
-                          thread_clock_diff, wall_clock_diff);
+      LogMethodTraceEvent(thread, *rit, kTraceMethodEnter, thread_clock_diff, timestamp_counter);
     }
   } else {
     // If there's a previous stack trace for this thread, diff the traces and emit entry and exit
@@ -258,13 +371,11 @@ void Trace::CompareAndUpdateStackTrace(Thread* thread,
     }
     // Iterate top-down over the old trace until the point where they differ, emitting exit events.
     for (auto old_it = old_stack_trace->begin(); old_it != old_rit.base(); ++old_it) {
-      LogMethodTraceEvent(thread, *old_it, instrumentation::Instrumentation::kMethodExited,
-                          thread_clock_diff, wall_clock_diff);
+      LogMethodTraceEvent(thread, *old_it, kTraceMethodExit, thread_clock_diff, timestamp_counter);
     }
     // Iterate bottom-up over the new trace from the point where they differ, emitting entry events.
     for (; rit != stack_trace->rend(); ++rit) {
-      LogMethodTraceEvent(thread, *rit, instrumentation::Instrumentation::kMethodEntered,
-                          thread_clock_diff, wall_clock_diff);
+      LogMethodTraceEvent(thread, *rit, kTraceMethodEnter, thread_clock_diff, timestamp_counter);
     }
     FreeStackTrace(old_stack_trace);
   }
@@ -285,7 +396,7 @@ void* Trace::RunSamplingThread(void* arg) {
     {
       MutexLock mu(self, *Locks::trace_lock_);
       the_trace = the_trace_;
-      if (the_trace == nullptr) {
+      if (the_trace_->stop_tracing_) {
         break;
       }
     }
@@ -388,22 +499,21 @@ void Trace::Start(std::unique_ptr<File>&& trace_file_in,
     return;
   }
 
+  // Initialize the frequency of timestamp counter updates here. This is needed
+  // to get wall clock time from timestamp counter values.
+  InitializeTimestampCounters();
+
   Runtime* runtime = Runtime::Current();
 
   // Enable count of allocs if specified in the flags.
   bool enable_stats = false;
 
-  if (runtime->GetJit() != nullptr) {
-    // TODO b/110263880 It would be better if we didn't need to do this.
-    // Since we need to hold the method entrypoint across a suspend to ensure instrumentation
-    // hooks are called correctly we have to disable jit-gc to ensure that the entrypoint doesn't
-    // go away. Furthermore we need to leave this off permanently since one could get the same
-    // effect by causing this to be toggled on and off.
-    runtime->GetJit()->GetCodeCache()->SetGarbageCollectCode(false);
-  }
-
   // Create Trace object.
   {
+    // Suspend the JIT here since we are switching the runtime to debuggable. Debuggable runtimes
+    // cannot use JITed code compiled before the switch, so we need to invalidate all JITed code
+    // here. Entering a suspend-JIT scope prevents any races with ongoing JIT compilations.
+    jit::ScopedJitSuspend suspend_jit;
     // Required since EnableMethodTracing calls ConfigureStubs which visits class linker classes.
     gc::ScopedGCCriticalSection gcs(self,
                                     gc::kGcCauseInstrumentation,
@@ -421,6 +531,17 @@ void Trace::Start(std::unique_ptr<File>&& trace_file_in,
                                           "Sampling profiler thread");
       the_trace_->interval_us_ = interval_us;
     } else {
+      if (!runtime->IsJavaDebuggable()) {
+        art::jit::Jit* jit = runtime->GetJit();
+        if (jit != nullptr) {
+          jit->GetCodeCache()->InvalidateAllCompiledCode();
+          jit->GetCodeCache()->TransitionToDebuggable();
+          jit->GetJitCompiler()->SetDebuggableCompilerOption(true);
+        }
+        runtime->SetRuntimeDebugState(art::Runtime::RuntimeDebugState::kJavaDebuggable);
+        runtime->GetInstrumentation()->UpdateEntrypointsForDebuggable();
+        runtime->DeoptimizeBootImage();
+      }
       runtime->GetInstrumentation()->AddListener(
           the_trace_,
           instrumentation::Instrumentation::kMethodEntered |
@@ -431,8 +552,9 @@ void Trace::Start(std::unique_ptr<File>&& trace_file_in,
       // we know that inlining and other problematic optimizations are disabled. We might just
       // want to use the trampolines anyway since it is faster. It makes the story with disabling
      // jit-gc more complex though.
-      runtime->GetInstrumentation()->EnableMethodTracing(
-          kTracerInstrumentationKey, /*needs_interpreter=*/!runtime->IsJavaDebuggable());
+      runtime->GetInstrumentation()->EnableMethodTracing(kTracerInstrumentationKey,
+                                                         the_trace_,
+                                                         /*needs_interpreter=*/false);
     }
   }
 }
@@ -443,78 +565,129 @@ void Trace::Start(std::unique_ptr<File>&& trace_file_in,
   }
 }
 
+void Trace::UpdateThreadsList(Thread* thread) {
+  // TODO(mythria): Clean this up and update threads_list_ when recording the trace event, similar
+  // to what we do in the streaming case.
+  std::string name;
+  thread->GetThreadName(name);
+  // In tests, we destroy the VM after already detaching the current thread. When a thread is
+  // detached we record its information in threads_list_. The current thread is then re-attached
+  // as a "Shutdown thread" while the runtime shuts down, so don't record information about
+  // shutdown threads.
+  if (name.compare("Shutdown thread") == 0) {
+    return;
+  }
+
+  // There can be races between unregistering a thread and stopping the trace, so it is possible
+  // to update the list twice. For example, this information is updated both here when stopping
+  // tracing and when a thread is detaching. In thread detach, we first update this information
+  // and then remove the thread from the list of active threads. If tracing was stopped in between
+  // these two events, we can see two updates for the same thread. Since we need trace_lock_, it
+  // isn't easy to prevent this race (for example, by updating this information while holding
+  // thread_list_lock_). It is harmless to do two updates, so just use Overwrite here.
+  threads_list_.Overwrite(thread->GetTid(), name);
+}
+
 void Trace::StopTracing(bool finish_tracing, bool flush_file) {
-  bool stop_alloc_counting = false;
   Runtime* const runtime = Runtime::Current();
-  Trace* the_trace = nullptr;
   Thread* const self = Thread::Current();
   pthread_t sampling_pthread = 0U;
   {
     MutexLock mu(self, *Locks::trace_lock_);
     if (the_trace_ == nullptr) {
       LOG(ERROR) << "Trace stop requested, but no trace currently running";
-    } else {
-      the_trace = the_trace_;
-      the_trace_ = nullptr;
-      sampling_pthread = sampling_pthread_;
+      return;
     }
+    // Tell sampling_pthread_ to stop tracing.
+    the_trace_->stop_tracing_ = true;
+    sampling_pthread = sampling_pthread_;
   }
+
   // Make sure that we join before we delete the trace since we don't want to have
   // the sampling thread access a stale pointer. This finishes since the sampling thread exits when
   // the_trace_ is null.
   if (sampling_pthread != 0U) {
     CHECK_PTHREAD_CALL(pthread_join, (sampling_pthread, nullptr), "sampling thread shutdown");
-    sampling_pthread_ = 0U;
   }
 
-  if (the_trace != nullptr) {
-    stop_alloc_counting = (the_trace->flags_ & Trace::kTraceCountAllocs) != 0;
-    // Stop the trace sources adding more entries to the trace buffer and synchronise stores.
-    {
-      gc::ScopedGCCriticalSection gcs(self,
-                                      gc::kGcCauseInstrumentation,
-                                      gc::kCollectorTypeInstrumentation);
-      ScopedSuspendAll ssa(__FUNCTION__);
+  // Make a copy of the_trace_ so it can be flushed later. We want to reset
+  // the_trace_ to nullptr in suspend-all scope to prevent any races.
+  Trace* the_trace = the_trace_;
+  bool stop_alloc_counting = (the_trace->flags_ & Trace::kTraceCountAllocs) != 0;
+  // Stop the trace sources adding more entries to the trace buffer and synchronise stores.
+  {
+    gc::ScopedGCCriticalSection gcs(
+        self, gc::kGcCauseInstrumentation, gc::kCollectorTypeInstrumentation);
+    jit::ScopedJitSuspend suspend_jit;
+    ScopedSuspendAll ssa(__FUNCTION__);
 
-      if (the_trace->trace_mode_ == TraceMode::kSampling) {
-        MutexLock mu(self, *Locks::thread_list_lock_);
-        runtime->GetThreadList()->ForEach(ClearThreadStackTraceAndClockBase, nullptr);
-      } else {
-        runtime->GetInstrumentation()->RemoveListener(
-            the_trace,
-            instrumentation::Instrumentation::kMethodEntered |
-            instrumentation::Instrumentation::kMethodExited |
-            instrumentation::Instrumentation::kMethodUnwind);
-        runtime->GetInstrumentation()->DisableMethodTracing(kTracerInstrumentationKey);
-      }
-    }
-    // At this point, code may read buf_ as it's writers are shutdown
-    // and the ScopedSuspendAll above has ensured all stores to buf_
-    // are now visible.
-    if (finish_tracing) {
-      the_trace->FinishTracing();
+    if (the_trace->trace_mode_ == TraceMode::kSampling) {
+      MutexLock mu(self, *Locks::thread_list_lock_);
+      runtime->GetThreadList()->ForEach(ClearThreadStackTraceAndClockBase, nullptr);
+    } else {
+      runtime->GetInstrumentation()->RemoveListener(
+          the_trace,
+          instrumentation::Instrumentation::kMethodEntered |
+          instrumentation::Instrumentation::kMethodExited |
+          instrumentation::Instrumentation::kMethodUnwind);
+      runtime->GetInstrumentation()->DisableMethodTracing(kTracerInstrumentationKey);
+      runtime->GetInstrumentation()->MaybeSwitchRuntimeDebugState(self);
     }
-    if (the_trace->trace_file_.get() != nullptr) {
-      // Do not try to erase, so flush and close explicitly.
-      if (flush_file) {
-        if (the_trace->trace_file_->Flush() != 0) {
-          PLOG(WARNING) << "Could not flush trace file.";
+
+    // Flush the thread-specific buffers from all threads before resetting the_trace_ to nullptr.
+    // We also flush the buffer when destroying a thread, which expects the_trace_ to be valid, so
+    // make sure the per-thread buffers are reset before resetting the_trace_.
+    {
+      MutexLock tl_lock(Thread::Current(), *Locks::thread_list_lock_);
+      for (Thread* thread : Runtime::Current()->GetThreadList()->GetList()) {
+        if (thread->GetMethodTraceBuffer() != nullptr) {
+          the_trace_->FlushStreamingBuffer(thread);
+          thread->ResetMethodTraceBuffer();
         }
-      } else {
-        the_trace->trace_file_->MarkUnchecked();  // Do not trigger guard.
+        // Record threads here before resetting the_trace_ to prevent any races between
+        // unregistering the thread and resetting the_trace_.
+        the_trace->UpdateThreadsList(thread);
       }
-      if (the_trace->trace_file_->Close() != 0) {
-        PLOG(ERROR) << "Could not close trace file.";
+    }
+
+    // Reset the_trace_ while holding trace_lock_.
+    MutexLock mu(self, *Locks::trace_lock_);
+    the_trace_ = nullptr;
+    sampling_pthread_ = 0U;
+  }
+
+  // At this point, code may read buf_ as its writers are shutdown
+  // and the ScopedSuspendAll above has ensured all stores to buf_
+  // are now visible.
+  if (finish_tracing) {
+    the_trace->FinishTracing();
+  }
+  if (the_trace->trace_file_.get() != nullptr) {
+    // Do not try to erase, so flush and close explicitly.
+    if (flush_file) {
+      if (the_trace->trace_file_->Flush() != 0) {
+        PLOG(WARNING) << "Could not flush trace file.";
       }
+    } else {
+      the_trace->trace_file_->MarkUnchecked();  // Do not trigger guard.
+    }
+    if (the_trace->trace_file_->Close() != 0) {
+      PLOG(ERROR) << "Could not close trace file.";
     }
-    delete the_trace;
   }
+  delete the_trace;
+
   if (stop_alloc_counting) {
     // Can be racy since SetStatsEnabled is not guarded by any locks.
     runtime->SetStatsEnabled(false);
   }
 }
 
+void Trace::FlushThreadBuffer(Thread* self) {
+  MutexLock mu(self, *Locks::trace_lock_);
+  the_trace_->FlushStreamingBuffer(self);
+}
+
 void Trace::Abort() {
   // Do not write anything anymore.
   StopTracing(false, false);
@@ -548,6 +721,28 @@ TracingMode Trace::GetMethodTracingMode() {
 }
 
 static constexpr size_t kMinBufSize = 18U;  // Trace header is up to 18B.
+// Size of the per-thread buffer. The value is chosen arbitrarily. This value
+// should be greater than kMinBufSize.
+static constexpr size_t kPerThreadBufSize = 512 * 1024;
+static_assert(kPerThreadBufSize > kMinBufSize);
+
+namespace {
+
+TraceClockSource GetClockSourceFromFlags(int flags) {
+  bool need_wall = flags & Trace::TraceFlag::kTraceClockSourceWallClock;
+  bool need_thread_cpu = flags & Trace::TraceFlag::kTraceClockSourceThreadCpu;
+  if (need_wall && need_thread_cpu) {
+    return TraceClockSource::kDual;
+  } else if (need_wall) {
+    return TraceClockSource::kWall;
+  } else if (need_thread_cpu) {
+    return TraceClockSource::kThreadCpu;
+  } else {
+    return kDefaultTraceClockSource;
+  }
+}
+
+}  // namespace
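GetClockSourceFromFlags above reduces to a two-bit truth table. A standalone sketch of the same decision, under the assumption that the two flags are independent bits (the enum and flag values below are stand-ins for illustration, not ART's actual definitions):

    #include <cassert>

    enum class ClockSource { kThreadCpu, kWall, kDual, kDefault };
    constexpr int kWallFlag = 0x010;       // stand-in for kTraceClockSourceWallClock
    constexpr int kThreadCpuFlag = 0x100;  // stand-in for kTraceClockSourceThreadCpu

    ClockSource FromFlags(int flags) {
      bool wall = (flags & kWallFlag) != 0;
      bool thread_cpu = (flags & kThreadCpuFlag) != 0;
      if (wall && thread_cpu) return ClockSource::kDual;
      if (wall) return ClockSource::kWall;
      if (thread_cpu) return ClockSource::kThreadCpu;
      return ClockSource::kDefault;  // neither bit set: fall back to the default source
    }

    int main() {
      assert(FromFlags(kWallFlag | kThreadCpuFlag) == ClockSource::kDual);
      assert(FromFlags(kWallFlag) == ClockSource::kWall);
      assert(FromFlags(kThreadCpuFlag) == ClockSource::kThreadCpu);
      assert(FromFlags(0) == ClockSource::kDefault);
      return 0;
    }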
 
 Trace::Trace(File* trace_file,
              size_t buffer_size,
@@ -556,12 +751,17 @@ Trace::Trace(File* trace_file,
              TraceMode trace_mode)
     : trace_file_(trace_file),
       buf_(new uint8_t[std::max(kMinBufSize, buffer_size)]()),
-      flags_(flags), trace_output_mode_(output_mode), trace_mode_(trace_mode),
-      clock_source_(default_clock_source_),
+      flags_(flags),
+      trace_output_mode_(output_mode),
+      trace_mode_(trace_mode),
+      clock_source_(GetClockSourceFromFlags(flags)),
       buffer_size_(std::max(kMinBufSize, buffer_size)),
-      start_time_(MicroTime()), clock_overhead_ns_(GetClockOverheadNanoSeconds()),
-      overflow_(false), interval_us_(0), streaming_lock_(nullptr),
-      unique_methods_lock_(new Mutex("unique methods lock", kTracingUniqueMethodsLock)) {
+      start_time_(GetMicroTime(GetTimestamp())),
+      clock_overhead_ns_(GetClockOverheadNanoSeconds()),
+      overflow_(false),
+      interval_us_(0),
+      stop_tracing_(false),
+      tracing_lock_("tracing lock", LockLevel::kTracingStreamingLock) {
   CHECK_IMPLIES(trace_file == nullptr, output_mode == TraceOutputMode::kDDMS);
 
   uint16_t trace_version = GetTraceVersion(clock_source_);
@@ -583,16 +783,15 @@ Trace::Trace(File* trace_file,
   cur_offset_.store(kTraceHeaderLength, std::memory_order_relaxed);
 
   if (output_mode == TraceOutputMode::kStreaming) {
-    streaming_lock_ = new Mutex("tracing lock", LockLevel::kTracingStreamingLock);
-    seen_threads_.reset(new ThreadIDBitSet());
+    // Flush the header information to the file. We use a per-thread buffer, so
+    // it is easier to just write the header information directly to the file.
+    if (!trace_file_->WriteFully(buf_.get(), kTraceHeaderLength)) {
+      PLOG(WARNING) << "Failed streaming a tracing event.";
+    }
+    cur_offset_.store(0, std::memory_order_relaxed);
   }
 }
 
-Trace::~Trace() {
-  delete streaming_lock_;
-  delete unique_methods_lock_;
-}
-
 static uint64_t ReadBytes(uint8_t* buf, size_t bytes) {
   uint64_t ret = 0;
   for (size_t i = 0; i < bytes; ++i) {
@@ -605,6 +804,7 @@ void Trace::DumpBuf(uint8_t* buf, size_t buf_size, TraceClockSource clock_source
   uint8_t* ptr = buf + kTraceHeaderLength;
   uint8_t* end = buf + buf_size;
 
+  MutexLock mu(Thread::Current(), tracing_lock_);
   while (ptr < end) {
     uint32_t tmid = ReadBytes(ptr + 2, sizeof(tmid));
     ArtMethod* method = DecodeTraceMethod(tmid);
@@ -616,18 +816,12 @@ void Trace::DumpBuf(uint8_t* buf, size_t buf_size, TraceClockSource clock_source
 
 void Trace::FinishTracing() {
   size_t final_offset = 0;
-  std::set<ArtMethod*> visited_methods;
-  if (trace_output_mode_ == TraceOutputMode::kStreaming) {
-    // Clean up.
-    MutexLock mu(Thread::Current(), *streaming_lock_);
-    STLDeleteValues(&seen_methods_);
-  } else {
+  if (trace_output_mode_ != TraceOutputMode::kStreaming) {
     final_offset = cur_offset_.load(std::memory_order_relaxed);
-    GetVisitedMethods(final_offset, &visited_methods);
   }
 
   // Compute elapsed time.
-  uint64_t elapsed = MicroTime() - start_time_;
+  uint64_t elapsed = GetMicroTime(GetTimestamp()) - start_time_;
 
   std::ostringstream os;
 
@@ -659,25 +853,25 @@ void Trace::FinishTracing() {
   os << StringPrintf("%cthreads\n", kTraceTokenChar);
   DumpThreadList(os);
   os << StringPrintf("%cmethods\n", kTraceTokenChar);
-  DumpMethodList(os, visited_methods);
+  DumpMethodList(os);
   os << StringPrintf("%cend\n", kTraceTokenChar);
   std::string header(os.str());
 
   if (trace_output_mode_ == TraceOutputMode::kStreaming) {
-    // Protect access to buf_ and satisfy sanitizer for calls to WriteBuf / FlushBuf.
-    MutexLock mu(Thread::Current(), *streaming_lock_);
+    // It is expected that this method is called when all other threads are suspended, so there
+    // cannot be any writes to trace_file_ after finishing tracing.
     // Write a special token to mark the end of trace records and the start of
     // trace summary.
     uint8_t buf[7];
     Append2LE(buf, 0);
     buf[2] = kOpTraceSummary;
     Append4LE(buf + 3, static_cast<uint32_t>(header.length()));
-    WriteToBuf(buf, sizeof(buf));
     // Write the trace summary. The summary is identical to the file header when
     // the output mode is not streaming (except for methods).
-    WriteToBuf(reinterpret_cast<const uint8_t*>(header.c_str()), header.length());
-    // Flush the buffer, which may include some trace records before the summary.
-    FlushBuf();
+    if (!trace_file_->WriteFully(buf, sizeof(buf)) ||
+        !trace_file_->WriteFully(header.c_str(), header.length())) {
+      PLOG(WARNING) << "Failed streaming a tracing event.";
+    }
   } else {
     if (trace_file_.get() == nullptr) {
       std::vector<uint8_t> data;
@@ -736,10 +930,9 @@ void Trace::FieldWritten(Thread* thread ATTRIBUTE_UNUSED,
 
 void Trace::MethodEntered(Thread* thread, ArtMethod* method) {
   uint32_t thread_clock_diff = 0;
-  uint32_t wall_clock_diff = 0;
-  ReadClocks(thread, &thread_clock_diff, &wall_clock_diff);
-  LogMethodTraceEvent(thread, method, instrumentation::Instrumentation::kMethodEntered,
-                      thread_clock_diff, wall_clock_diff);
+  uint64_t timestamp_counter = 0;
+  ReadClocks(thread, &thread_clock_diff, &timestamp_counter);
+  LogMethodTraceEvent(thread, method, kTraceMethodEnter, thread_clock_diff, timestamp_counter);
 }
 
 void Trace::MethodExited(Thread* thread,
@@ -747,24 +940,18 @@ void Trace::MethodExited(Thread* thread,
                          instrumentation::OptionalFrame frame ATTRIBUTE_UNUSED,
                          JValue& return_value ATTRIBUTE_UNUSED) {
   uint32_t thread_clock_diff = 0;
-  uint32_t wall_clock_diff = 0;
-  ReadClocks(thread, &thread_clock_diff, &wall_clock_diff);
-  LogMethodTraceEvent(thread,
-                      method,
-                      instrumentation::Instrumentation::kMethodExited,
-                      thread_clock_diff,
-                      wall_clock_diff);
+  uint64_t timestamp_counter = 0;
+  ReadClocks(thread, &thread_clock_diff, &timestamp_counter);
+  LogMethodTraceEvent(thread, method, kTraceMethodExit, thread_clock_diff, timestamp_counter);
 }
 
 void Trace::MethodUnwind(Thread* thread,
-                         Handle<mirror::Object> this_object ATTRIBUTE_UNUSED,
                          ArtMethod* method,
                          uint32_t dex_pc ATTRIBUTE_UNUSED) {
   uint32_t thread_clock_diff = 0;
-  uint32_t wall_clock_diff = 0;
-  ReadClocks(thread, &thread_clock_diff, &wall_clock_diff);
-  LogMethodTraceEvent(thread, method, instrumentation::Instrumentation::kMethodUnwind,
-                      thread_clock_diff, wall_clock_diff);
+  uint64_t timestamp_counter = 0;
+  ReadClocks(thread, &thread_clock_diff, &timestamp_counter);
+  LogMethodTraceEvent(thread, method, kTraceUnroll, thread_clock_diff, timestamp_counter);
 }
 
 void Trace::ExceptionThrown(Thread* thread ATTRIBUTE_UNUSED,
@@ -790,7 +977,7 @@ void Trace::WatchedFramePop(Thread* self ATTRIBUTE_UNUSED,
   LOG(ERROR) << "Unexpected WatchedFramePop event in tracing";
 }
 
-void Trace::ReadClocks(Thread* thread, uint32_t* thread_clock_diff, uint32_t* wall_clock_diff) {
+void Trace::ReadClocks(Thread* thread, uint32_t* thread_clock_diff, uint64_t* timestamp_counter) {
   if (UseThreadCpuClock()) {
     uint64_t clock_base = thread->GetTraceClockBase();
     if (UNLIKELY(clock_base == 0)) {
@@ -802,90 +989,186 @@ void Trace::ReadClocks(Thread* thread, uint32_t* thread_clock_diff, uint32_t* wa
     }
   }
   if (UseWallClock()) {
-    *wall_clock_diff = MicroTime() - start_time_;
+    *timestamp_counter = GetTimestamp();
   }
 }
 
-bool Trace::RegisterMethod(ArtMethod* method) {
-  const DexFile* dex_file = method->GetDexFile();
-  if (seen_methods_.find(dex_file) == seen_methods_.end()) {
-    seen_methods_.insert(std::make_pair(dex_file, new DexIndexBitSet()));
-  }
-  DexIndexBitSet* bit_set = seen_methods_.find(dex_file)->second;
-  if (!(*bit_set)[method->GetDexMethodIndex()]) {
-    bit_set->set(method->GetDexMethodIndex());
-    return true;
-  }
-  return false;
+std::string Trace::GetMethodLine(ArtMethod* method, uint32_t method_index) {
+  method = method->GetInterfaceMethodIfProxy(kRuntimePointerSize);
+  return StringPrintf("%#x\t%s\t%s\t%s\t%s\n",
+                      (method_index << TraceActionBits),
+                      PrettyDescriptor(method->GetDeclaringClassDescriptor()).c_str(),
+                      method->GetName(),
+                      method->GetSignature().ToString().c_str(),
+                      method->GetDeclaringClassSourceFile());
 }
 
-bool Trace::RegisterThread(Thread* thread) {
-  pid_t tid = thread->GetTid();
-  CHECK_LT(0U, static_cast<uint32_t>(tid));
-  CHECK_LT(static_cast<uint32_t>(tid), kMaxThreadIdNumber);
+void Trace::RecordStreamingMethodEvent(Thread* thread,
+                                       ArtMethod* method,
+                                       TraceAction action,
+                                       uint32_t thread_clock_diff,
+                                       uint64_t timestamp_counter) {
+  uintptr_t* method_trace_buffer = thread->GetMethodTraceBuffer();
+  size_t* current_offset = thread->GetMethodTraceIndexPtr();
+  // Initialize the buffer lazily. It's just simpler to keep the creation in one place.
+  if (method_trace_buffer == nullptr) {
+    method_trace_buffer = new uintptr_t[std::max(kMinBufSize, kPerThreadBufSize)]();
+    thread->SetMethodTraceBuffer(method_trace_buffer);
+    *current_offset = 0;
+
+    // This is the first event from this thread, so first record information about the thread.
+    std::string thread_name;
+    thread->GetThreadName(thread_name);
+    static constexpr size_t kThreadNameHeaderSize = 7;
+    uint8_t header[kThreadNameHeaderSize];
+    Append2LE(header, 0);
+    header[2] = kOpNewThread;
+    // We use only 16 bits to encode the thread id. On Android, we don't expect to need more than
+    // 16 bits for a tid. For 32-bit platforms it is always ensured we use fewer than 16 bits
+    // (see __check_max_thread_id in bionic for more details), and even on 64-bit the maximum
+    // number of threads is currently less than 65536.
+    // TODO(mythria): On host, we know thread ids can be greater than 16 bits. Consider adding
+    // a map similar to method ids.
+    DCHECK(!kIsTargetBuild || thread->GetTid() < (1 << 16));
+    Append2LE(header + 3, static_cast<uint16_t>(thread->GetTid()));
+    Append2LE(header + 5, static_cast<uint16_t>(thread_name.length()));
+
+    {
+      MutexLock mu(Thread::Current(), tracing_lock_);
+      if (!trace_file_->WriteFully(header, kThreadNameHeaderSize) ||
+          !trace_file_->WriteFully(reinterpret_cast<const uint8_t*>(thread_name.c_str()),
+                                   thread_name.length())) {
+        PLOG(WARNING) << "Failed streaming a tracing event.";
+      }
+    }
+  }
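The thread record written above is a fixed 7-byte header followed by the name bytes. A sketch of that layout, with a local little-endian helper standing in for ART's Append2LE (the kOpNewThread value here is a placeholder, not the real constant):

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    // Little-endian 16-bit append, mirroring what the Append2LE helper above does.
    void PutU16LE(uint8_t* buf, uint16_t v) {
      buf[0] = static_cast<uint8_t>(v);
      buf[1] = static_cast<uint8_t>(v >> 8);
    }

    std::vector<uint8_t> MakeThreadRecord(uint16_t tid, const std::string& name) {
      const uint8_t kOpNewThread = 2;  // placeholder opcode, for illustration only
      std::vector<uint8_t> rec(7 + name.length());
      PutU16LE(rec.data(), 0);         // method id 0 marks a metadata record
      rec[2] = kOpNewThread;           // opcode byte
      PutU16LE(rec.data() + 3, tid);   // 16-bit thread id
      PutU16LE(rec.data() + 5, static_cast<uint16_t>(name.length()));
      memcpy(rec.data() + 7, name.data(), name.length());  // name bytes follow the header
      return rec;
    }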
+
+  size_t required_entries = (clock_source_ == TraceClockSource::kDual) ? 4 : 3;
+  if (*current_offset + required_entries >= kPerThreadBufSize) {
+    // We don't have space for further entries. Flush the contents of the buffer, reuse the buffer
+    // to store new contents, and reset the index to the start of the buffer.
+    FlushStreamingBuffer(thread);
+    *current_offset = 0;
+  }
 
-  if (!(*seen_threads_)[tid]) {
-    seen_threads_->set(tid);
-    return true;
+  // Record the entry in the per-thread trace buffer.
+  int current_index = *current_offset;
+  method_trace_buffer[current_index++] = reinterpret_cast<uintptr_t>(method);
+  // TODO(mythria): We only need two bits to record the action. Consider merging
+  // it with the method entry to save space.
+  method_trace_buffer[current_index++] = action;
+  if (UseThreadCpuClock()) {
+    method_trace_buffer[current_index++] = thread_clock_diff;
+  }
+  if (UseWallClock()) {
+    if (art::kRuntimePointerSize == PointerSize::k32) {
+      // On 32-bit architectures, store the timestamp counter as two 32-bit values.
+      method_trace_buffer[current_index++] = timestamp_counter >> 32;
+      method_trace_buffer[current_index++] = static_cast<uint32_t>(timestamp_counter);
+    } else {
+      method_trace_buffer[current_index++] = timestamp_counter;
+    }
   }
-  return false;
+  *current_offset = current_index;
 }
 
-std::string Trace::GetMethodLine(ArtMethod* method) {
-  method = method->GetInterfaceMethodIfProxy(kRuntimePointerSize);
-  return StringPrintf("%#x\t%s\t%s\t%s\t%s\n", (EncodeTraceMethod(method) << TraceActionBits),
-      PrettyDescriptor(method->GetDeclaringClassDescriptor()).c_str(), method->GetName(),
-      method->GetSignature().ToString().c_str(), method->GetDeclaringClassSourceFile());
-}
-
-void Trace::WriteToBuf(const uint8_t* src, size_t src_size) {
-  // Updates to cur_offset_ are done under the streaming_lock_ here as in streaming mode.
-  int32_t old_offset = cur_offset_.load(std::memory_order_relaxed);
-  int32_t new_offset = old_offset + static_cast<int32_t>(src_size);
-  if (dchecked_integral_cast<size_t>(new_offset) > buffer_size_) {
-    // Flush buffer.
-    if (!trace_file_->WriteFully(buf_.get(), old_offset)) {
+void Trace::WriteToBuf(uint8_t* header,
+                       size_t header_size,
+                       const std::string& data,
+                       size_t* current_index,
+                       uint8_t* buffer,
+                       size_t buffer_size) {
+  EnsureSpace(buffer, current_index, buffer_size, header_size);
+  memcpy(buffer + *current_index, header, header_size);
+  *current_index += header_size;
+
+  EnsureSpace(buffer, current_index, buffer_size, data.length());
+  if (data.length() < buffer_size) {
+    memcpy(buffer + *current_index, reinterpret_cast<const uint8_t*>(data.c_str()), data.length());
+    *current_index += data.length();
+  } else {
+    // The data is larger than the buffer, so write directly to the file. EnsureSpace should have
+    // flushed any data remaining in the buffer.
+    DCHECK_EQ(*current_index, 0U);
+    if (!trace_file_->WriteFully(reinterpret_cast<const uint8_t*>(data.c_str()), data.length())) {
       PLOG(WARNING) << "Failed streaming a tracing event.";
     }
+  }
+}
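On 32-bit targets a single uintptr_t slot cannot hold the 64-bit timestamp, so the writer above stores it high word first and the reader in FlushStreamingBuffer below joins the two slots again. A minimal round-trip sketch of that convention (slot layout assumed as in this change; not part of the diff itself):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t timestamp = 0x123456789abcdef0ULL;
      // Writer side: high word first, then low word, as in RecordStreamingMethodEvent.
      uint32_t slots[2];
      slots[0] = static_cast<uint32_t>(timestamp >> 32);
      slots[1] = static_cast<uint32_t>(timestamp);
      // Reader side: widen the first slot, shift it up, then OR in the second slot.
      uint64_t joined = (static_cast<uint64_t>(slots[0]) << 32) | slots[1];
      assert(joined == timestamp);
      return 0;
    }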
 
-  // Check whether the data is too large for the buffer, then write immediately.
-  if (src_size >= buffer_size_) {
-    if (!trace_file_->WriteFully(src, src_size)) {
+void Trace::FlushStreamingBuffer(Thread* thread) {
+  // Take tracing_lock_ to serialize writes across threads. We also need to allocate a unique
+  // method id for each method, which we do by maintaining a map from method to id for each newly
+  // seen method; tracing_lock_ is required to serialize these as well.
+  MutexLock mu(Thread::Current(), tracing_lock_);
+  uintptr_t* method_trace_buffer = thread->GetMethodTraceBuffer();
+  // Create a temporary buffer to encode the trace events from the specified thread.
+  size_t buffer_size = kPerThreadBufSize;
+  size_t current_index = 0;
+  std::unique_ptr<uint8_t[]> buffer(new uint8_t[std::max(kMinBufSize, buffer_size)]);
+
+  size_t num_entries = *(thread->GetMethodTraceIndexPtr());
+  for (size_t entry_index = 0; entry_index < num_entries;) {
+    ArtMethod* method = reinterpret_cast<ArtMethod*>(method_trace_buffer[entry_index++]);
+    TraceAction action = DecodeTraceAction(method_trace_buffer[entry_index++]);
+    uint32_t thread_time = 0;
+    uint32_t wall_time = 0;
+    if (UseThreadCpuClock()) {
+      thread_time = method_trace_buffer[entry_index++];
+    }
+    if (UseWallClock()) {
+      uint64_t timestamp = method_trace_buffer[entry_index++];
+      if (art::kRuntimePointerSize == PointerSize::k32) {
+        // On 32-bit architectures, the timestamp is stored as two 32-bit values.
+        timestamp = (timestamp << 32 | method_trace_buffer[entry_index++]);
+      }
+      wall_time = GetMicroTime(timestamp) - start_time_;
+    }
+
+    auto it = art_method_id_map_.find(method);
+    uint32_t method_index = 0;
+    // If we haven't seen this method before, record information about the method.
+    if (it == art_method_id_map_.end()) {
+      art_method_id_map_.emplace(method, current_method_index_);
+      method_index = current_method_index_;
+      current_method_index_++;
+      // Write a special block with the name.
+      std::string method_line(GetMethodLine(method, method_index));
+      static constexpr size_t kMethodNameHeaderSize = 5;
+      uint8_t method_header[kMethodNameHeaderSize];
+      DCHECK_LT(kMethodNameHeaderSize, kPerThreadBufSize);
+      Append2LE(method_header, 0);
+      method_header[2] = kOpNewMethod;
+      Append2LE(method_header + 3, static_cast<uint16_t>(method_line.length()));
+      WriteToBuf(method_header,
+                 kMethodNameHeaderSize,
+                 method_line,
+                 &current_index,
+                 buffer.get(),
+                 buffer_size);
+    } else {
+      method_index = it->second;
     }
-    old_offset = 0;
-    new_offset = static_cast<int32_t>(src_size);
+    const size_t record_size = GetRecordSize(clock_source_);
+    DCHECK_LT(record_size, kPerThreadBufSize);
+    EnsureSpace(buffer.get(), &current_index, buffer_size, record_size);
+    EncodeEventEntry(
+        buffer.get() + current_index, thread, method_index, action, thread_time, wall_time);
+    current_index += record_size;
   }
-  cur_offset_.store(new_offset, std::memory_order_relaxed);
-  // Fill in data.
-  memcpy(buf_.get() + old_offset, src, src_size);
-}
 
-void Trace::FlushBuf() {
-  // Updates to cur_offset_ are done under the streaming_lock_ here as in streaming mode.
-  int32_t offset = cur_offset_.load(std::memory_order_relaxed);
-  if (!trace_file_->WriteFully(buf_.get(), offset)) {
-    PLOG(WARNING) << "Failed flush the remaining data in streaming.";
+  // Flush the contents of the buffer to the file.
+  if (!trace_file_->WriteFully(buffer.get(), current_index)) {
+    PLOG(WARNING) << "Failed streaming a tracing event.";
   }
-  cur_offset_.store(0, std::memory_order_relaxed);
 }
 
-void Trace::LogMethodTraceEvent(Thread* thread, ArtMethod* method,
-                                instrumentation::Instrumentation::InstrumentationEvent event,
-                                uint32_t thread_clock_diff, uint32_t wall_clock_diff) {
-  // This method is called in both tracing modes (method and
-  // sampling). In sampling mode, this method is only called by the
-  // sampling thread. In method tracing mode, it can be called
-  // concurrently.
-
-  // Ensure we always use the non-obsolete version of the method so that entry/exit events have the
-  // same pointer value.
-  method = method->GetNonObsoleteMethod();
-
+void Trace::RecordMethodEvent(Thread* thread,
+                              ArtMethod* method,
+                              TraceAction action,
+                              uint32_t thread_clock_diff,
+                              uint64_t timestamp_counter) {
   // Advance cur_offset_ atomically.
   int32_t new_offset;
   int32_t old_offset = 0;
@@ -893,37 +1176,18 @@ void Trace::LogMethodTraceEvent(Thread* thread, ArtMethod* method,
   // In the non-streaming case, we do a busy loop here trying to get
   // an offset to write our record and advance cur_offset_ for the
   // next use.
-  if (trace_output_mode_ != TraceOutputMode::kStreaming) {
-    // Although multiple threads can call this method concurrently,
-    // the compare_exchange_weak here is still atomic (by definition).
-    // A succeeding update is visible to other cores when they pass
-    // through this point.
-    old_offset = cur_offset_.load(std::memory_order_relaxed);  // Speculative read
-    do {
-      new_offset = old_offset + GetRecordSize(clock_source_);
-      if (static_cast<size_t>(new_offset) > buffer_size_) {
-        overflow_ = true;
-        return;
-      }
-    } while (!cur_offset_.compare_exchange_weak(old_offset, new_offset, std::memory_order_relaxed));
-  }
-
-  TraceAction action = kTraceMethodEnter;
-  switch (event) {
-    case instrumentation::Instrumentation::kMethodEntered:
-      action = kTraceMethodEnter;
-      break;
-    case instrumentation::Instrumentation::kMethodExited:
-      action = kTraceMethodExit;
-      break;
-    case instrumentation::Instrumentation::kMethodUnwind:
-      action = kTraceUnroll;
-      break;
-    default:
-      UNIMPLEMENTED(FATAL) << "Unexpected event: " << event;
-  }
-
-  uint32_t method_value = EncodeTraceMethodAndAction(method, action);
+  // Although multiple threads can call this method concurrently,
+  // the compare_exchange_weak here is still atomic (by definition).
+  // A succeeding update is visible to other cores when they pass
+  // through this point.
+  old_offset = cur_offset_.load(std::memory_order_relaxed);  // Speculative read
+  do {
+    new_offset = old_offset + GetRecordSize(clock_source_);
+    if (static_cast<size_t>(new_offset) > buffer_size_) {
+      overflow_ = true;
+      return;
+    }
+  } while (!cur_offset_.compare_exchange_weak(old_offset, new_offset, std::memory_order_relaxed));
 
   // Write data into the tracing buffer (if not streaming) or into a
   // small buffer on the stack (if streaming) which we'll put into the
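The reservation loop above is a lock-free bump allocation: each writer claims the half-open range [old_offset, old_offset + record size) and retries if another thread won the race, since compare_exchange_weak reloads old_offset on failure. A standalone sketch of the pattern (the buffer and record sizes are arbitrary values for illustration):

    #include <atomic>
    #include <cstdint>

    std::atomic<int32_t> cur_offset{0};
    constexpr int32_t kBufferSize = 1024;
    constexpr int32_t kRecordSize = 14;

    // Returns the claimed offset, or -1 when the buffer is full (the tracer's overflow case).
    int32_t ReserveRecord() {
      int32_t old_offset = cur_offset.load(std::memory_order_relaxed);  // speculative read
      int32_t new_offset;
      do {
        new_offset = old_offset + kRecordSize;
        if (new_offset > kBufferSize) {
          return -1;  // no space left; the caller drops the event
        }
        // On failure, compare_exchange_weak stores the current value into old_offset; retry.
      } while (!cur_offset.compare_exchange_weak(old_offset, new_offset,
                                                 std::memory_order_relaxed));
      return old_offset;  // bytes [old_offset, new_offset) now belong to this thread
    }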
@@ -934,14 +1198,41 @@ void Trace::LogMethodTraceEvent(Thread* thread, ArtMethod* method,
   // of FinishTracing() acquire locks and (implicitly) synchronise
   // the buffer memory.
   uint8_t* ptr;
-  static constexpr size_t kPacketSize = 14U;  // The maximum size of data in a packet.
-  uint8_t stack_buf[kPacketSize];  // Space to store a packet when in streaming mode.
+  ptr = buf_.get() + old_offset;
+  uint32_t wall_clock_diff = GetMicroTime(timestamp_counter) - start_time_;
+  MutexLock mu(Thread::Current(), tracing_lock_);
+  EncodeEventEntry(
+      ptr, thread, EncodeTraceMethod(method), action, thread_clock_diff, wall_clock_diff);
+}
+
+void Trace::LogMethodTraceEvent(Thread* thread,
+                                ArtMethod* method,
+                                TraceAction action,
+                                uint32_t thread_clock_diff,
+                                uint64_t timestamp_counter) {
+  // This method is called in both tracing modes (method and sampling). In sampling mode, this
+  // method is only called by the sampling thread. In method tracing mode, it can be called
+  // concurrently.
+
+  // Ensure we always use the non-obsolete version of the method so that entry/exit events have the
+  // same pointer value.
+  method = method->GetNonObsoleteMethod();
+
   if (trace_output_mode_ == TraceOutputMode::kStreaming) {
-    ptr = stack_buf;
+    RecordStreamingMethodEvent(thread, method, action, thread_clock_diff, timestamp_counter);
   } else {
-    ptr = buf_.get() + old_offset;
+    RecordMethodEvent(thread, method, action, thread_clock_diff, timestamp_counter);
   }
+}
 
+void Trace::EncodeEventEntry(uint8_t* ptr,
+                             Thread* thread,
+                             uint32_t method_index,
+                             TraceAction action,
+                             uint32_t thread_clock_diff,
+                             uint32_t wall_clock_diff) {
+  static constexpr size_t kPacketSize = 14U;  // The maximum size of data in a packet.
+  uint32_t method_value = (method_index << TraceActionBits) | action;
   Append2LE(ptr, thread->GetTid());
   Append4LE(ptr + 2, method_value);
   ptr += 6;
@@ -954,79 +1245,46 @@ void Trace::LogMethodTraceEvent(Thread* thread, ArtMethod* method,
     Append4LE(ptr, wall_clock_diff);
   }
   static_assert(kPacketSize == 2 + 4 + 4 + 4, "Packet size incorrect.");
-
-  if (trace_output_mode_ == TraceOutputMode::kStreaming) {
-    MutexLock mu(Thread::Current(), *streaming_lock_);  // To serialize writing.
-    if (RegisterMethod(method)) {
-      // Write a special block with the name.
-      std::string method_line(GetMethodLine(method));
-      uint8_t buf2[5];
-      Append2LE(buf2, 0);
-      buf2[2] = kOpNewMethod;
-      Append2LE(buf2 + 3, static_cast<uint16_t>(method_line.length()));
-      WriteToBuf(buf2, sizeof(buf2));
-      WriteToBuf(reinterpret_cast<const uint8_t*>(method_line.c_str()), method_line.length());
-    }
-    if (RegisterThread(thread)) {
-      // It might be better to postpone this. Threads might not have received names...
-      std::string thread_name;
-      thread->GetThreadName(thread_name);
-      uint8_t buf2[7];
-      Append2LE(buf2, 0);
-      buf2[2] = kOpNewThread;
-      Append2LE(buf2 + 3, static_cast<uint16_t>(thread->GetTid()));
-      Append2LE(buf2 + 5, static_cast<uint16_t>(thread_name.length()));
-      WriteToBuf(buf2, sizeof(buf2));
-      WriteToBuf(reinterpret_cast<const uint8_t*>(thread_name.c_str()), thread_name.length());
-    }
-    WriteToBuf(stack_buf, sizeof(stack_buf));
-  }
 }
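EncodeEventEntry above fixes the event packet at up to 14 bytes: a 2-byte thread id, a 4-byte word holding the method index shifted left by TraceActionBits with the action in the low bits, then one or two optional 4-byte clock values depending on the clock source. A sketch of the dual-clock layout with local little-endian helpers (stand-ins for ART's Append2LE/Append4LE; TraceActionBits = 2 is assumed here):

    #include <cstdint>

    void PutU16LE(uint8_t* b, uint16_t v) {
      b[0] = static_cast<uint8_t>(v);
      b[1] = static_cast<uint8_t>(v >> 8);
    }

    void PutU32LE(uint8_t* b, uint32_t v) {
      PutU16LE(b, static_cast<uint16_t>(v));
      PutU16LE(b + 2, static_cast<uint16_t>(v >> 16));
    }

    // Dual-clock record: tid(2) | method|action(4) | thread clock(4) | wall clock(4) = 14 bytes.
    void EncodeDualClockEvent(uint8_t* ptr, uint16_t tid, uint32_t method_index,
                              uint32_t action, uint32_t thread_clock_diff,
                              uint32_t wall_clock_diff) {
      constexpr uint32_t kTraceActionBits = 2;  // assumed: two low bits hold the action
      PutU16LE(ptr, tid);
      PutU32LE(ptr + 2, (method_index << kTraceActionBits) | action);
      PutU32LE(ptr + 6, thread_clock_diff);
      PutU32LE(ptr + 10, wall_clock_diff);
    }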
 
-void Trace::GetVisitedMethods(size_t buf_size,
-                              std::set<ArtMethod*>* visited_methods) {
-  uint8_t* ptr = buf_.get() + kTraceHeaderLength;
-  uint8_t* end = buf_.get() + buf_size;
-
-  while (ptr < end) {
-    uint32_t tmid = ReadBytes(ptr + 2, sizeof(tmid));
-    ArtMethod* method = DecodeTraceMethod(tmid);
-    visited_methods->insert(method);
-    ptr += GetRecordSize(clock_source_);
+void Trace::EnsureSpace(uint8_t* buffer,
+                        size_t* current_index,
+                        size_t buffer_size,
+                        size_t required_size) {
+  if (*current_index + required_size < buffer_size) {
+    return;
   }
-}
 
-void Trace::DumpMethodList(std::ostream& os, const std::set<ArtMethod*>& visited_methods) {
-  for (const auto& method : visited_methods) {
-    os << GetMethodLine(method);
+  if (!trace_file_->WriteFully(buffer, *current_index)) {
+    PLOG(WARNING) << "Failed streaming a tracing event.";
   }
+  *current_index = 0;
 }
 
-static void DumpThread(Thread* t, void* arg) {
-  std::ostream& os = *reinterpret_cast<std::ostream*>(arg);
-  std::string name;
-  t->GetThreadName(name);
-  os << t->GetTid() << "\t" << name << "\n";
+void Trace::DumpMethodList(std::ostream& os) {
+  MutexLock mu(Thread::Current(), tracing_lock_);
+  for (auto const& entry : art_method_id_map_) {
+    os << GetMethodLine(entry.first, entry.second);
+  }
 }
 
 void Trace::DumpThreadList(std::ostream& os) {
-  Thread* self = Thread::Current();
-  for (const auto& it : exited_threads_) {
-    os << it.first << "\t" << it.second << "\n";
+  for (const auto& it : threads_list_) {
+    // We use only 16 bits to encode the thread id. On Android, we don't expect to need more than
+    // 16 bits for a tid. For 32-bit platforms it is always ensured we use fewer than 16 bits
+    // (see __check_max_thread_id in bionic for more details), and even on 64-bit the maximum
+    // number of threads is currently less than 65536.
+    // TODO(mythria): On host, we know thread ids can be greater than 16 bits. Consider adding
+    // a map similar to method ids.
+    DCHECK(!kIsTargetBuild || it.first < (1 << 16));
+    os << static_cast<uint16_t>(it.first) << "\t" << it.second << "\n";
   }
-  Locks::thread_list_lock_->AssertNotHeld(self);
-  MutexLock mu(self, *Locks::thread_list_lock_);
-  Runtime::Current()->GetThreadList()->ForEach(DumpThread, &os);
 }
 
 void Trace::StoreExitingThreadInfo(Thread* thread) {
   MutexLock mu(thread, *Locks::trace_lock_);
   if (the_trace_ != nullptr) {
-    std::string name;
-    thread->GetThreadName(name);
-    // The same thread/tid may be used multiple times. As SafeMap::Put does not allow to override
-    // a previous mapping, use SafeMap::Overwrite.
-    the_trace_->exited_threads_.Overwrite(thread->GetTid(), name);
+    the_trace_->UpdateThreadsList(thread);
   }
 }
 
@@ -1042,9 +1300,21 @@ Trace::TraceMode Trace::GetMode() {
   return the_trace_->trace_mode_;
 }
 
+int Trace::GetFlags() {
+  MutexLock mu(Thread::Current(), *Locks::trace_lock_);
+  CHECK(the_trace_ != nullptr) << "Trace flags requested, but no trace currently running";
+  return the_trace_->flags_;
+}
+
+int Trace::GetIntervalInMillis() {
+  MutexLock mu(Thread::Current(), *Locks::trace_lock_);
+  CHECK(the_trace_ != nullptr) << "Trace interval requested, but no trace currently running";
+  return the_trace_->interval_us_;
+}
+
 size_t Trace::GetBufferSize() {
   MutexLock mu(Thread::Current(), *Locks::trace_lock_);
-  CHECK(the_trace_ != nullptr) << "Trace mode requested, but no trace currently running";
+  CHECK(the_trace_ != nullptr) << "Trace buffer size requested, but no trace currently running";
   return the_trace_->buffer_size_;
 }