author     2024-12-27 17:06:24 +0000
committer  2025-01-21 05:28:45 -0800
commit     761ea222e35e0ffacb2149ce22a92b3c6a4bdf79 (patch)
tree       147bf1438749db83ca6693df93a0bb72c0a1dc00
parent     612ee65f6ac572ed60097c7bd9a7016cce37da41 (diff)
Add support for collecting long-running methods
This CL extends low-overhead tracing to support collecting long-running
methods. The feature is disabled by default and is implemented behind
the always_enable_profile_code flag. Turning on the flag enables the
support but doesn't start tracing; tracing has to be started via the
StartTraceLongRunningMethods interface in VMDebug. If a trace is
running, the collected data is included in the output generated by the
SIGQUIT handler, which helps when analyzing ANR reports. The trace can
also be dumped using the API provided in VMDebug.
Bug: 352518093
Test: art/test.py
Change-Id: I407560c45cd31bfa06906bb2286c455eef7ceec8
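For context, a minimal usage sketch (not part of this CL; the helper function name is invented for illustration) of the runtime-internal API added below. It assumes the always_enable_profile_code flag is on and uses only TraceProfiler methods introduced by this change:

```cpp
#include "trace_profile.h"  // art::TraceProfiler, extended by this CL

// Sketch only: trace long-running methods for ~10 seconds, then dump the result.
void SketchLongRunningTrace(int fd) {
  // Duration is passed in nanoseconds; a TraceStopTask ends the trace when it elapses.
  art::TraceProfiler::StartTraceLongRunningMethods(
      /*trace_duration_ns=*/ 10ULL * 1000 * 1000 * 1000);
  // ... workload runs; the ARM64 entrypoints keep only methods whose runtime
  // exceeds kLongRunningMethodThreshold timestamp-counter ticks ...
  art::TraceProfiler::Dump(fd);  // same path used by VMDebug_dumpLowOverheadTraceFdImpl
  art::TraceProfiler::Stop();
}
```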
-rw-r--r-- | runtime/arch/arm/entrypoints_init_arm.cc | 3
-rw-r--r-- | runtime/arch/arm64/entrypoints_init_arm64.cc | 25
-rw-r--r-- | runtime/arch/arm64/quick_entrypoints_arm64.S | 81
-rw-r--r-- | runtime/arch/riscv64/entrypoints_init_riscv64.cc | 3
-rw-r--r-- | runtime/arch/x86/entrypoints_init_x86.cc | 3
-rw-r--r-- | runtime/arch/x86_64/entrypoints_init_x86_64.cc | 3
-rw-r--r-- | runtime/entrypoints/quick/quick_trampoline_entrypoints.cc | 10
-rw-r--r-- | runtime/native/dalvik_system_VMDebug.cc | 9
-rw-r--r-- | runtime/signal_catcher.cc | 8
-rw-r--r-- | runtime/thread.cc | 8
-rw-r--r-- | runtime/thread.h | 3
-rw-r--r-- | runtime/trace.cc | 169
-rw-r--r-- | runtime/trace.h | 1
-rw-r--r-- | runtime/trace_common.h | 168
-rw-r--r-- | runtime/trace_profile.cc | 376
-rw-r--r-- | runtime/trace_profile.h | 122
-rw-r--r-- | tools/cpp-define-generator/thread.def | 1
17 files changed, 776 insertions, 217 deletions
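Before the diff itself: the ARM64 stubs added in quick_entrypoints_arm64.S write fixed-size records into the per-thread buffer referenced by TRACE_BUFFER_CURRENT_OFFSET. The C++ sketch below is illustrative only (not code from the CL) and summarizes that encoding as it is later read back by DumpLongRunningMethodBuffer:

```cpp
#include <cstdint>

// The buffer holds kAlwaysOnTraceBufSize uintptr_t slots and fills from the
// high end downward; `current` points at the most recently written slot.
inline void SketchRecordEntry(uintptr_t* buffer_start, uintptr_t*& current,
                              uintptr_t art_method, uint64_t timestamp) {
  // An entry event takes two slots: [timestamp with bit 0 cleared][ArtMethod*].
  if (current - 2 < buffer_start) {
    return;  // overflow: the real stub calls artRecordLongRunningMethodTraceEvent to flush
  }
  current -= 2;
  current[0] = static_cast<uintptr_t>(timestamp) & ~uintptr_t{1};
  current[1] = art_method;
}

inline void SketchRecordExit(uintptr_t*& current, uint64_t timestamp) {
  // An exit event takes one slot: timestamp with bit 0 set (a bare 0x1 is the
  // placeholder exit used for methods already on the stack at trace start).
  // The real exit stub instead erases the pending entry event when the elapsed
  // ticks are below LONG_RUNNING_METHOD_THRESHOLD.
  current -= 1;
  *current = static_cast<uintptr_t>(timestamp) | 1;
}
```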
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc index 4fb4774a6d..26664cd94b 100644 --- a/runtime/arch/arm/entrypoints_init_arm.cc +++ b/runtime/arch/arm/entrypoints_init_arm.cc @@ -28,6 +28,7 @@ #include "entrypoints/quick/runtime_entrypoints_list.h" #include "entrypoints/runtime_asm_entrypoints.h" #include "interpreter/interpreter.h" +#include "trace_profile.h" namespace art HIDDEN { @@ -197,7 +198,7 @@ void InitEntryPoints(JniEntryPoints* jpoints, } void UpdateLowOverheadTraceEntrypoints([[maybe_unused]] QuickEntryPoints* qpoints, - [[maybe_unused]] bool enable) { + [[maybe_unused]] LowOverheadTraceType trace_type) { // This is a nop on this architecture. Low overhead tracing is only implemented for ARM64. } diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc index 3360708a4c..dbcb2c7273 100644 --- a/runtime/arch/arm64/entrypoints_init_arm64.cc +++ b/runtime/arch/arm64/entrypoints_init_arm64.cc @@ -29,6 +29,7 @@ #include "entrypoints/quick/runtime_entrypoints_list.h" #include "entrypoints/runtime_asm_entrypoints.h" #include "interpreter/interpreter.h" +#include "trace_profile.h" namespace art_flags = com::android::art::flags; @@ -79,6 +80,9 @@ extern "C" mirror::Object* art_quick_read_barrier_mark_introspection_gc_roots(mi extern "C" void art_quick_record_entry_trace_event(); extern "C" void art_quick_record_exit_trace_event(); +extern "C" void art_quick_record_long_running_entry_trace_event(); +extern "C" void art_quick_record_long_running_exit_trace_event(); + extern "C" void art_quick_nop_record_entry_trace_event() { return; } @@ -218,13 +222,20 @@ void InitEntryPoints(JniEntryPoints* jpoints, } } -void UpdateLowOverheadTraceEntrypoints(QuickEntryPoints* qpoints, bool enable) { - if (enable) { - qpoints->SetRecordEntryTraceEvent(art_quick_record_entry_trace_event); - qpoints->SetRecordExitTraceEvent(art_quick_record_exit_trace_event); - } else { - qpoints->SetRecordEntryTraceEvent(art_quick_nop_record_entry_trace_event); - qpoints->SetRecordExitTraceEvent(art_quick_nop_record_exit_trace_event); +void UpdateLowOverheadTraceEntrypoints(QuickEntryPoints* qpoints, LowOverheadTraceType type) { + switch (type) { + case LowOverheadTraceType::kAllMethods: + qpoints->SetRecordEntryTraceEvent(art_quick_record_entry_trace_event); + qpoints->SetRecordExitTraceEvent(art_quick_record_exit_trace_event); + break; + case LowOverheadTraceType::kLongRunningMethods: + qpoints->SetRecordEntryTraceEvent(art_quick_record_long_running_entry_trace_event); + qpoints->SetRecordExitTraceEvent(art_quick_record_long_running_exit_trace_event); + break; + case LowOverheadTraceType::kNone: + qpoints->SetRecordEntryTraceEvent(art_quick_nop_record_entry_trace_event); + qpoints->SetRecordExitTraceEvent(art_quick_nop_record_exit_trace_event); + break; } } diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S index 1302b5b036..01336f9f97 100644 --- a/runtime/arch/arm64/quick_entrypoints_arm64.S +++ b/runtime/arch/arm64/quick_entrypoints_arm64.S @@ -2576,6 +2576,87 @@ ENTRY art_quick_record_exit_trace_event ret END art_quick_record_exit_trace_event +ENTRY art_quick_record_long_running_entry_trace_event + ldr xIP1, [xSELF, #TRACE_BUFFER_CURRENT_OFFSET] + sub xIP1, xIP1, 16 + // xIP0 has the trace buffer pointer. This is loaded on the fast path before + // checking if we need to call this method. This will be still valid here. 
+ cmp xIP0, xIP1 + bhi .Lhandle_overflow + // Store the method pointer + str x0, [xIP1, 8] + // Store the timestamp with the last bit set to 0 to indicate method entry event + mrs xIP0, cntvct_el0 + bfc xIP0, 0, 1 + str xIP0, [xIP1] + str xIP1, [xSELF, #TRACE_BUFFER_CURRENT_OFFSET] + ret +.Lhandle_overflow: + // Call runtime to flush buffer. We expect the frequency of this case to be low. For ANR it + // might be interesting to record the past data instead of overwriting with new entries. + SETUP_SAVE_EVERYTHING_FRAME + ldr x0, [sp, #FRAME_SIZE_SAVE_EVERYTHING] // pass ArtMethod* + mov x1, xSELF // pass Thread::Current + mov x2, 1 // set to true for entry events + bl artRecordLongRunningMethodTraceEvent // (ArtMethod*, Thread*, is_entry) + RESTORE_SAVE_EVERYTHING_FRAME // Note: will restore xSELF + REFRESH_MARKING_REGISTER + ret +END art_quick_record_long_running_entry_trace_event + +ENTRY art_quick_record_long_running_exit_trace_event + ldr xIP1, [xSELF, #TRACE_BUFFER_CURRENT_OFFSET] + // xIP0 has the trace buffer pointer. This is loaded on the fast path before checking if we + // need to call this method. This will be still valid here. + // We just need one entry for the method exit. We only record the timestamp. Method will be + // available from the corresponding method entry event. + cmp xIP0, xIP1 + bhs .Lhandle_overflow_exit + // Get the timestamp of the method exit event. + mrs xIP0, cntvct_el0 + // Load the previous event. + ldr xIP1, [xIP1] + // If lsb is set for the previous event, then the previous event is an exit event. This means + // there was a long running method earlier on the call stack. Record the exit event for this + // method to construct the call chain. + tst xIP1, 1 + bne .Lrecord_exit_event + // The previous event was an entry event. This exit corresponds to the previous method entry. + // Check if the method is long running by getting the current timestamp and comparing with the + // previous event's timestamp. + mrs xIP0, cntvct_el0 + sub xIP1, xIP0, xIP1 + cmp xIP1, #LONG_RUNNING_METHOD_THRESHOLD + bhs .Lrecord_exit_event + // This wasn't a long running method. Erase the previously recorded method entry event. We + // don't need to record entry / exit for this method. + ldr xIP1, [xSELF, #TRACE_BUFFER_CURRENT_OFFSET] + add xIP1, xIP1, 16 + str xIP1, [xSELF, #TRACE_BUFFER_CURRENT_OFFSET] + ret +.Lrecord_exit_event: + // For method exits we only record the current timestamp. We can infer the method from the + // corresponding method entry event. + ldr xIP1, [xSELF, #TRACE_BUFFER_CURRENT_OFFSET] + sub xIP1, xIP1, 8 + // Set the lsb of the timestamp to 1 to indicate a method exit event. + orr xIP0, xIP0, #1 + str xIP0, [xIP1] + str xIP1, [xSELF, #TRACE_BUFFER_CURRENT_OFFSET] + ret +.Lhandle_overflow_exit: + // Call runtime to flush buffer. We expect the frequency of this case to be low. For ANR it + // might be interesting to record the past data instead of overwriting with new entries. + SETUP_SAVE_EVERYTHING_FRAME + ldr xzr, [sp, #FRAME_SIZE_SAVE_EVERYTHING] // pass nullptr. 
ArtMethod* isn't required for exits + mov x1, xSELF // pass Thread::Current + mov x2, 0 // set to false for exit events + bl artRecordLongRunningMethodTraceEvent // (ArtMethod*, Thread*, is_entry) + RESTORE_SAVE_EVERYTHING_FRAME // Note: will restore xSELF + REFRESH_MARKING_REGISTER + ret +END art_quick_record_long_running_exit_trace_event + .extern artMethodEntryHook ENTRY art_quick_method_entry_hook SETUP_SAVE_EVERYTHING_FRAME diff --git a/runtime/arch/riscv64/entrypoints_init_riscv64.cc b/runtime/arch/riscv64/entrypoints_init_riscv64.cc index d8424cc3ee..a7487483ec 100644 --- a/runtime/arch/riscv64/entrypoints_init_riscv64.cc +++ b/runtime/arch/riscv64/entrypoints_init_riscv64.cc @@ -19,6 +19,7 @@ #include "entrypoints/quick/quick_default_init_entrypoints.h" #include "entrypoints/quick/quick_entrypoints.h" #include "entrypoints/quick/runtime_entrypoints_list.h" +#include "trace_profile.h" namespace art HIDDEN { @@ -151,7 +152,7 @@ void InitEntryPoints(JniEntryPoints* jpoints, } void UpdateLowOverheadTraceEntrypoints([[maybe_unused]] QuickEntryPoints* qpoints, - [[maybe_unused]] bool enable) { + [[maybe_unused]] LowOverheadTraceType trace_type) { // This is a nop on this architecture. Low overhead tracing is only implemented for ARM64. } diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc index 237a55ce9d..c38accb5fe 100644 --- a/runtime/arch/x86/entrypoints_init_x86.cc +++ b/runtime/arch/x86/entrypoints_init_x86.cc @@ -23,6 +23,7 @@ #include "entrypoints/quick/quick_entrypoints.h" #include "entrypoints/runtime_asm_entrypoints.h" #include "interpreter/interpreter.h" +#include "trace_profile.h" namespace art HIDDEN { @@ -129,7 +130,7 @@ void InitEntryPoints(JniEntryPoints* jpoints, } void UpdateLowOverheadTraceEntrypoints([[maybe_unused]] QuickEntryPoints* qpoints, - [[maybe_unused]] bool enable) { + [[maybe_unused]] LowOverheadTraceType trace_type) { // This is a nop on this architecture. Low overhead tracing is only implemented for ARM64. } diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc index c00ecbd203..7502174821 100644 --- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc +++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc @@ -26,6 +26,7 @@ #include "entrypoints/quick/runtime_entrypoints_list.h" #include "entrypoints/runtime_asm_entrypoints.h" #include "interpreter/interpreter.h" +#include "trace_profile.h" namespace art HIDDEN { @@ -148,7 +149,7 @@ void InitEntryPoints(JniEntryPoints* jpoints, } void UpdateLowOverheadTraceEntrypoints([[maybe_unused]] QuickEntryPoints* qpoints, - [[maybe_unused]] bool enable) { + [[maybe_unused]] LowOverheadTraceType trace_tpe) { // This is a nop on this architecture. Low overhead tracing is only implemented for ARM64. 
} diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc index 90c0aea478..767b0c7b8b 100644 --- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc @@ -23,8 +23,8 @@ #include "base/globals.h" #include "base/pointer_size.h" #include "callee_save_frame.h" -#include "common_throws.h" #include "class_root-inl.h" +#include "common_throws.h" #include "debug_print.h" #include "debugger.h" #include "dex/dex_file-inl.h" @@ -57,12 +57,13 @@ #include "oat/oat_quick_method_header.h" #include "quick_exception_handler.h" #include "runtime.h" +#include "runtime_entrypoints_list.h" #include "scoped_thread_state_change-inl.h" #include "stack.h" #include "thread-inl.h" +#include "trace_profile.h" #include "var_handles.h" #include "well_known_classes.h" -#include "runtime_entrypoints_list.h" namespace art HIDDEN { @@ -2795,4 +2796,9 @@ extern "C" Context* artMethodExitHook(Thread* self, return nullptr; } +extern "C" void artRecordLongRunningMethodTraceEvent(ArtMethod* method, Thread* self, bool is_entry) + REQUIRES_SHARED(Locks::mutator_lock_) { + TraceProfiler::FlushBufferAndRecordTraceEvent(method, self, is_entry); +} + } // namespace art diff --git a/runtime/native/dalvik_system_VMDebug.cc b/runtime/native/dalvik_system_VMDebug.cc index 9440610f36..1b9a53e1d6 100644 --- a/runtime/native/dalvik_system_VMDebug.cc +++ b/runtime/native/dalvik_system_VMDebug.cc @@ -188,10 +188,14 @@ static void VMDebug_dumpLowOverheadTraceFdImpl(JNIEnv*, jclass, jint originalFd) TraceProfiler::Dump(fd); } -static void VMDebug_startLowOverheadTraceImpl(JNIEnv*, jclass) { +static void VMDebug_startLowOverheadTraceForAllMethodsImpl(JNIEnv*, jclass) { TraceProfiler::Start(); } +static void VMDebug_startLowOverheadTraceForLongMethodsImpl(JNIEnv*, jclass, jlong trace_duration) { + TraceProfiler::StartTraceLongRunningMethods(trace_duration); +} + static jboolean VMDebug_isDebuggerConnected(JNIEnv*, jclass) { // This function will be replaced by the debugger when it's connected. See // external/oj-libjdwp/src/share/vmDebug.c for implementation when debugger is connected. 
@@ -698,7 +702,8 @@ static JNINativeMethod gMethods[] = { NATIVE_METHOD(VMDebug, addApplication, "(Ljava/lang/String;)V"), NATIVE_METHOD(VMDebug, removeApplication, "(Ljava/lang/String;)V"), NATIVE_METHOD(VMDebug, setUserId, "(I)V"), - NATIVE_METHOD(VMDebug, startLowOverheadTraceImpl, "()V"), + NATIVE_METHOD(VMDebug, startLowOverheadTraceForAllMethodsImpl, "()V"), + NATIVE_METHOD(VMDebug, startLowOverheadTraceForLongMethodsImpl, "(J)V"), NATIVE_METHOD(VMDebug, stopLowOverheadTraceImpl, "()V"), NATIVE_METHOD(VMDebug, dumpLowOverheadTraceImpl, "(Ljava/lang/String;)V"), NATIVE_METHOD(VMDebug, dumpLowOverheadTraceFdImpl, "(I)V"), diff --git a/runtime/signal_catcher.cc b/runtime/signal_catcher.cc index 09a4b2d85e..c7e297f3b7 100644 --- a/runtime/signal_catcher.cc +++ b/runtime/signal_catcher.cc @@ -37,6 +37,7 @@ #include "base/time_utils.h" #include "base/utils.h" #include "class_linker.h" +#include "com_android_art_flags.h" #include "gc/heap.h" #include "jit/profile_saver.h" #include "palette/palette.h" @@ -45,6 +46,9 @@ #include "signal_set.h" #include "thread.h" #include "thread_list.h" +#include "trace_profile.h" + +namespace art_flags = com::android::art::flags; namespace art HIDDEN { @@ -138,6 +142,10 @@ void SignalCatcher::HandleSigQuit() { os << "Debug Store: " << DebugStoreGetString() << "\n"; + if (art_flags::always_enable_profile_code()) { + os << "LongRunningMethods: " << TraceProfiler::GetLongRunningMethodsString() << "\n"; + } + runtime->DumpForSigQuit(os); if ((false)) { diff --git a/runtime/thread.cc b/runtime/thread.cc index a94dbd697d..50f6bfc499 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -166,10 +166,10 @@ void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints, bool monitor_jni_entry_exit); void UpdateReadBarrierEntrypoints(QuickEntryPoints* qpoints, bool is_active); -void UpdateLowOverheadTraceEntrypoints(QuickEntryPoints* qpoints, bool enable); +void UpdateLowOverheadTraceEntrypoints(QuickEntryPoints* qpoints, LowOverheadTraceType trace_type); -void Thread::UpdateTlsLowOverheadTraceEntrypoints(bool enable) { - UpdateLowOverheadTraceEntrypoints(&tlsPtr_.quick_entrypoints, enable); +void Thread::UpdateTlsLowOverheadTraceEntrypoints(LowOverheadTraceType trace_type) { + UpdateLowOverheadTraceEntrypoints(&tlsPtr_.quick_entrypoints, trace_type); } void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) { @@ -1072,7 +1072,7 @@ bool Thread::Init(ThreadList* thread_list, JavaVMExt* java_vm, JNIEnvExt* jni_en ScopedTrace trace3("ThreadList::Register"); thread_list->Register(this); if (art_flags::always_enable_profile_code()) { - UpdateTlsLowOverheadTraceEntrypoints(!Trace::IsTracingEnabled()); + UpdateTlsLowOverheadTraceEntrypoints(TraceProfiler::GetTraceType()); } return true; } diff --git a/runtime/thread.h b/runtime/thread.h index 3ccf7c3c97..be4ffd0bc2 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -105,6 +105,7 @@ class StackedShadowFrameRecord; class Thread; class ThreadList; enum VisitRootFlags : uint8_t; +enum class LowOverheadTraceType; // A piece of data that can be held in the CustomTls. The destructor will be called during thread // shutdown. 
The thread the destructor is called on is not necessarily the same thread it was stored @@ -1427,7 +1428,7 @@ class EXPORT Thread { } } - void UpdateTlsLowOverheadTraceEntrypoints(bool enable); + void UpdateTlsLowOverheadTraceEntrypoints(LowOverheadTraceType type); uint64_t GetTraceClockBase() const { return tls64_.trace_clock_base; diff --git a/runtime/trace.cc b/runtime/trace.cc index 0fc41b6886..13a7dd26d4 100644 --- a/runtime/trace.cc +++ b/runtime/trace.cc @@ -32,8 +32,8 @@ #include "base/unix_file/fd_file.h" #include "base/utils.h" #include "class_linker.h" -#include "common_throws.h" #include "com_android_art_flags.h" +#include "common_throws.h" #include "debugger.h" #include "dex/descriptors_names.h" #include "dex/dex_file-inl.h" @@ -51,6 +51,7 @@ #include "stack.h" #include "thread.h" #include "thread_list.h" +#include "trace_common.h" #include "trace_profile.h" namespace art_flags = com::android::art::flags; @@ -94,6 +95,8 @@ static constexpr size_t kScalingFactorEncodedEntries = 6; // The key identifying the tracer to update instrumentation. static constexpr const char* kTracerInstrumentationKey = "Tracer"; +double TimestampCounter::tsc_to_microsec_scaling_factor = -1; + Trace* Trace::the_trace_ = nullptr; pthread_t Trace::sampling_pthread_ = 0U; std::unique_ptr<std::vector<ArtMethod*>> Trace::temp_stack_trace_; @@ -104,131 +107,6 @@ static TraceAction DecodeTraceAction(uint32_t tmid) { } namespace { -// Scaling factor to convert timestamp counter into wall clock time reported in micro seconds. -// This is initialized at the start of tracing using the timestamp counter update frequency. -// See InitializeTimestampCounters for more details. -double tsc_to_microsec_scaling_factor = -1.0; - -uint64_t GetTimestamp() { - uint64_t t = 0; -#if defined(__arm__) - // On ARM 32 bit, we don't always have access to the timestamp counters from user space. There is - // no easy way to check if it is safe to read the timestamp counters. There is HWCAP_EVTSTRM which - // is set when generic timer is available but not necessarily from the user space. Kernel disables - // access to generic timer when there are known problems on the target CPUs. Sometimes access is - // disabled only for 32-bit processes even when 64-bit processes can accesses the timer from user - // space. These are not reflected in the HWCAP_EVTSTRM capability.So just fallback to - // clock_gettime on these processes. See b/289178149 for more discussion. - t = MicroTime(); -#elif defined(__aarch64__) - // See Arm Architecture Registers Armv8 section System Registers - asm volatile("mrs %0, cntvct_el0" : "=r"(t)); -#elif defined(__i386__) || defined(__x86_64__) - // rdtsc returns two 32-bit values in rax and rdx even on 64-bit architectures. - unsigned int lo, hi; - asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); - t = (static_cast<uint64_t>(hi) << 32) | lo; -#elif defined(__riscv) - asm volatile("rdtime %0" : "=r"(t)); -#else - t = MicroTime(); -#endif - return t; -} - -#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) -// Here we compute the scaling factor by sleeping for a millisecond. Alternatively, we could -// generate raw timestamp counter and also time using clock_gettime at the start and the end of the -// trace. We can compute the frequency of timestamp counter upadtes in the post processing step -// using these two samples. However, that would require a change in Android Studio which is the main -// consumer of these profiles. For now, just compute the frequency of tsc updates here. 
-double computeScalingFactor() { - uint64_t start = MicroTime(); - uint64_t start_tsc = GetTimestamp(); - // Sleep for one millisecond. - usleep(1000); - uint64_t diff_tsc = GetTimestamp() - start_tsc; - uint64_t diff_time = MicroTime() - start; - double scaling_factor = static_cast<double>(diff_time) / diff_tsc; - DCHECK(scaling_factor > 0.0) << scaling_factor; - return scaling_factor; -} -#endif - -#if defined(__i386__) || defined(__x86_64__) -double GetScalingFactorForX86() { - uint32_t eax, ebx, ecx; - asm volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx) : "a"(0x0), "c"(0)); - if (eax < 0x15) { - // There is no 15H - Timestamp counter and core crystal clock information - // leaf. Just compute the frequency. - return computeScalingFactor(); - } - - // From Intel architecture-instruction-set-extensions-programming-reference: - // EBX[31:0]/EAX[31:0] indicates the ratio of the TSC frequency and the - // core crystal clock frequency. - // If EBX[31:0] is 0, the TSC and "core crystal clock" ratio is not enumerated. - // If ECX is 0, the nominal core crystal clock frequency is not enumerated. - // "TSC frequency" = "core crystal clock frequency" * EBX/EAX. - // The core crystal clock may differ from the reference clock, bus clock, or core clock - // frequencies. - // EAX Bits 31 - 00: An unsigned integer which is the denominator of the - // TSC/"core crystal clock" ratio. - // EBX Bits 31 - 00: An unsigned integer which is the numerator of the - // TSC/"core crystal clock" ratio. - // ECX Bits 31 - 00: An unsigned integer which is the nominal frequency of the core - // crystal clock in Hz. - // EDX Bits 31 - 00: Reserved = 0. - asm volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx) : "a"(0x15), "c"(0)); - if (ebx == 0 || ecx == 0) { - return computeScalingFactor(); - } - double coreCrystalFreq = ecx; - // frequency = coreCrystalFreq * (ebx / eax) - // scaling_factor = seconds_to_microseconds / frequency - // = seconds_to_microseconds * eax / (coreCrystalFreq * ebx) - double seconds_to_microseconds = 1000 * 1000; - double scaling_factor = (seconds_to_microseconds * eax) / (coreCrystalFreq * ebx); - return scaling_factor; -} -#endif - -void InitializeTimestampCounters() { - // It is sufficient to initialize this once for the entire execution. Just return if it is - // already initialized. - if (tsc_to_microsec_scaling_factor > 0.0) { - return; - } - -#if defined(__arm__) - // On ARM 32 bit, we don't always have access to the timestamp counters from - // user space. Seem comment in GetTimestamp for more details. - tsc_to_microsec_scaling_factor = 1.0; -#elif defined(__aarch64__) - double seconds_to_microseconds = 1000 * 1000; - uint64_t freq = 0; - // See Arm Architecture Registers Armv8 section System Registers - asm volatile("mrs %0, cntfrq_el0" : "=r"(freq)); - if (freq == 0) { - // It is expected that cntfrq_el0 is correctly setup during system initialization but some - // devices don't do this. In such cases fall back to computing the frequency. See b/315139000. 
- tsc_to_microsec_scaling_factor = computeScalingFactor(); - } else { - tsc_to_microsec_scaling_factor = seconds_to_microseconds / static_cast<double>(freq); - } -#elif defined(__i386__) || defined(__x86_64__) - tsc_to_microsec_scaling_factor = GetScalingFactorForX86(); -#else - tsc_to_microsec_scaling_factor = 1.0; -#endif -} - -ALWAYS_INLINE uint64_t GetMicroTime(uint64_t counter) { - DCHECK(tsc_to_microsec_scaling_factor > 0.0) << tsc_to_microsec_scaling_factor; - return tsc_to_microsec_scaling_factor * counter; -} - TraceClockSource GetClockSourceFromFlags(int flags) { bool need_wall = flags & Trace::TraceFlag::kTraceClockSourceWallClock; bool need_thread_cpu = flags & Trace::TraceFlag::kTraceClockSourceThreadCpu; @@ -402,7 +280,7 @@ bool UseFastTraceListeners(TraceClockSource clock_source) { bool is_fast_trace = !UseThreadCpuClock(clock_source); #if defined(__arm__) // On ARM 32 bit, we don't always have access to the timestamp counters from - // user space. See comment in GetTimestamp for more details. + // user space. See comment in TimestampCounter::GetTimestamp for more details. is_fast_trace = false; #endif return is_fast_trace; @@ -413,7 +291,7 @@ void Trace::MeasureClockOverhead() { Thread::Current()->GetCpuMicroTime(); } if (UseWallClock(clock_source_)) { - GetTimestamp(); + TimestampCounter::GetTimestamp(); } } @@ -766,7 +644,7 @@ void Trace::Start(std::unique_ptr<File>&& trace_file_in, // Initialize the frequency of timestamp counter updates here. This is needed // to get wallclock time from timestamp counter values. - InitializeTimestampCounters(); + TimestampCounter::InitializeTimestampCounters(); Runtime* runtime = Runtime::Current(); @@ -838,14 +716,6 @@ void Trace::Start(std::unique_ptr<File>&& trace_file_in, the_trace_, /*needs_interpreter=*/false); } - - if (art_flags::always_enable_profile_code()) { - // Reset the trace low overhead trace entry points to be a nop. - MutexLock thread_list_mutex(self, *Locks::thread_list_lock_); - for (Thread* thread : Runtime::Current()->GetThreadList()->GetList()) { - thread->UpdateTlsLowOverheadTraceEntrypoints(/*enable= */ false); - } - } } // Can't call this when holding the mutator lock. @@ -925,10 +795,6 @@ void Trace::StopTracing(bool flush_entries) { the_trace->trace_writer_->FlushBuffer( thread, /* is_sync= */ false, /* free_buffer= */ true); } - - if (art_flags::always_enable_profile_code()) { - thread->UpdateTlsLowOverheadTraceEntrypoints(/*enable= */ true); - } } the_trace_ = nullptr; sampling_pthread_ = 0U; @@ -1037,7 +903,7 @@ TraceWriter::TraceWriter(File* trace_file, buf_(new uint8_t[std::max(kMinBufSize, buffer_size)]()), buffer_size_(std::max(kMinBufSize, buffer_size)), trace_format_version_(trace_format_version), - start_time_(GetMicroTime(GetTimestamp())), + start_time_(TimestampCounter::GetMicroTime(TimestampCounter::GetTimestamp())), overflow_(false), num_records_(0), clock_overhead_ns_(clock_overhead_ns), @@ -1053,7 +919,9 @@ TraceWriter::TraceWriter(File* trace_file, // We record monotonic time at the start of the trace, because Android Studio // fetches the monotonic timer from other places and matches these times to // construct a cpu profile. See b/318052824 for more context. 
- uint64_t start_time_monotonic = start_time_ + (MicroTime() - GetMicroTime(GetTimestamp())); + uint64_t start_time_monotonic = + start_time_ + + (MicroTime() - TimestampCounter::GetMicroTime(TimestampCounter::GetTimestamp())); uint16_t trace_version = GetTraceVersion(clock_source_, trace_format_version_); if (output_mode == TraceOutputMode::kStreaming) { trace_version |= 0xF0U; @@ -1136,7 +1004,7 @@ Trace::Trace(File* trace_file, std::string TraceWriter::CreateSummary(int flags) { std::ostringstream os; // Compute elapsed time. - uint64_t elapsed = GetMicroTime(GetTimestamp()) - start_time_; + uint64_t elapsed = TimestampCounter::GetMicroTime(TimestampCounter::GetTimestamp()) - start_time_; os << StringPrintf("%cversion\n", kTraceTokenChar); os << StringPrintf("%d\n", GetTraceVersion(clock_source_, trace_format_version_)); os << StringPrintf("data-file-overflow=%s\n", overflow_ ? "true" : "false"); @@ -1363,7 +1231,7 @@ void Trace::ReadClocks(Thread* thread, uint32_t* thread_clock_diff, uint64_t* ti } } if (UseWallClock(clock_source_)) { - *timestamp_counter = GetTimestamp(); + *timestamp_counter = TimestampCounter::GetTimestamp(); } } @@ -1371,15 +1239,6 @@ std::string TraceWriter::GetMethodLine(const std::string& method_line, uint32_t return StringPrintf("%#x\t%s", (method_index << TraceActionBits), method_line.c_str()); } -std::string TraceWriter::GetMethodInfoLine(ArtMethod* method) { - method = method->GetInterfaceMethodIfProxy(kRuntimePointerSize); - return StringPrintf("%s\t%s\t%s\t%s\n", - PrettyDescriptor(method->GetDeclaringClassDescriptor()).c_str(), - method->GetName(), - method->GetSignature().ToString().c_str(), - method->GetDeclaringClassSourceFile()); -} - void TraceWriter::RecordThreadInfo(Thread* thread) { // This is the first event from this thread, so first record information about the thread. std::string thread_name; @@ -1691,7 +1550,7 @@ void TraceWriter::ReadValuesFromRecord(uintptr_t* method_trace_entries, uint64_t high_timestamp = method_trace_entries[record_index++]; timestamp = (high_timestamp << 32 | timestamp); } - record.wall_clock_time = GetMicroTime(timestamp) - start_time_; + record.wall_clock_time = TimestampCounter::GetMicroTime(timestamp) - start_time_; } } diff --git a/runtime/trace.h b/runtime/trace.h index ffd70aa8e6..63c96f927c 100644 --- a/runtime/trace.h +++ b/runtime/trace.h @@ -330,7 +330,6 @@ class TraceWriter { // Get the information about the method. std::string GetMethodLine(const std::string& method_line, uint32_t method_id); - std::string GetMethodInfoLine(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock_); // Helper function to record method information when processing the events. These are used by // streaming output mode. Non-streaming modes dump the methods and threads list at the end of diff --git a/runtime/trace_common.h b/runtime/trace_common.h new file mode 100644 index 0000000000..93ffb9b0da --- /dev/null +++ b/runtime/trace_common.h @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ART_RUNTIME_TRACE_COMMON_H_ +#define ART_RUNTIME_TRACE_COMMON_H_ + +#include "android-base/stringprintf.h" +#include "art_method-inl.h" +#include "dex/descriptors_names.h" +#include "oat/oat_quick_method_header.h" + +using android::base::StringPrintf; + +namespace art HIDDEN { + +static std::string GetMethodInfoLine(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock_) { + return StringPrintf("%s\t%s\t%s\t%s\n", + PrettyDescriptor(method->GetDeclaringClassDescriptor()).c_str(), + method->GetName(), + method->GetSignature().ToString().c_str(), + method->GetDeclaringClassSourceFile()); +} + +class TimestampCounter { + public: + static uint64_t GetTimestamp() { + uint64_t t = 0; +#if defined(__arm__) + // On ARM 32 bit, we don't always have access to the timestamp counters from user space. There + // is no easy way to check if it is safe to read the timestamp counters. There is HWCAP_EVTSTRM + // which is set when generic timer is available but not necessarily from the user space. Kernel + // disables access to generic timer when there are known problems on the target CPUs. Sometimes + // access is disabled only for 32-bit processes even when 64-bit processes can accesses the + // timer from user space. These are not reflected in the HWCAP_EVTSTRM capability.So just + // fallback to clock_gettime on these processes. See b/289178149 for more discussion. + t = MicroTime(); +#elif defined(__aarch64__) + // See Arm Architecture Registers Armv8 section System Registers + asm volatile("mrs %0, cntvct_el0" : "=r"(t)); +#elif defined(__i386__) || defined(__x86_64__) + // rdtsc returns two 32-bit values in rax and rdx even on 64-bit architectures. + unsigned int lo, hi; + asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); + t = (static_cast<uint64_t>(hi) << 32) | lo; +#elif defined(__riscv) + asm volatile("rdtime %0" : "=r"(t)); +#else + t = MicroTime(); +#endif + return t; + } + + static void InitializeTimestampCounters() { + // It is sufficient to initialize this once for the entire execution. Just return if it is + // already initialized. + if (tsc_to_microsec_scaling_factor > 0.0) { + return; + } + +#if defined(__arm__) + // On ARM 32 bit, we don't always have access to the timestamp counters from + // user space. Seem comment in GetTimestamp for more details. + tsc_to_microsec_scaling_factor = 1.0; +#elif defined(__aarch64__) + double seconds_to_microseconds = 1000 * 1000; + uint64_t freq = 0; + // See Arm Architecture Registers Armv8 section System Registers + asm volatile("mrs %0, cntfrq_el0" : "=r"(freq)); + if (freq == 0) { + // It is expected that cntfrq_el0 is correctly setup during system initialization but some + // devices don't do this. In such cases fall back to computing the frequency. See b/315139000. 
+ tsc_to_microsec_scaling_factor = computeScalingFactor(); + } else { + tsc_to_microsec_scaling_factor = seconds_to_microseconds / static_cast<double>(freq); + } +#elif defined(__i386__) || defined(__x86_64__) + tsc_to_microsec_scaling_factor = GetScalingFactorForX86(); +#else + tsc_to_microsec_scaling_factor = 1.0; +#endif + } + + static ALWAYS_INLINE uint64_t GetMicroTime(uint64_t counter) { + DCHECK(tsc_to_microsec_scaling_factor > 0.0) << tsc_to_microsec_scaling_factor; + return tsc_to_microsec_scaling_factor * counter; + } + + private: +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) + // Here we compute the scaling factor by sleeping for a millisecond. Alternatively, we could + // generate raw timestamp counter and also time using clock_gettime at the start and the end of + // the trace. We can compute the frequency of timestamp counter upadtes in the post processing + // step using these two samples. However, that would require a change in Android Studio which is + // the main consumer of these profiles. For now, just compute the frequency of tsc updates here. + static double computeScalingFactor() { + uint64_t start = MicroTime(); + uint64_t start_tsc = GetTimestamp(); + // Sleep for one millisecond. + usleep(1000); + uint64_t diff_tsc = GetTimestamp() - start_tsc; + uint64_t diff_time = MicroTime() - start; + double scaling_factor = static_cast<double>(diff_time) / diff_tsc; + DCHECK(scaling_factor > 0.0) << scaling_factor; + return scaling_factor; + } +#endif + +#if defined(__i386__) || defined(__x86_64__) + static double GetScalingFactorForX86() { + uint32_t eax, ebx, ecx; + asm volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx) : "a"(0x0), "c"(0)); + if (eax < 0x15) { + // There is no 15H - Timestamp counter and core crystal clock information + // leaf. Just compute the frequency. + return computeScalingFactor(); + } + + // From Intel architecture-instruction-set-extensions-programming-reference: + // EBX[31:0]/EAX[31:0] indicates the ratio of the TSC frequency and the + // core crystal clock frequency. + // If EBX[31:0] is 0, the TSC and "core crystal clock" ratio is not enumerated. + // If ECX is 0, the nominal core crystal clock frequency is not enumerated. + // "TSC frequency" = "core crystal clock frequency" * EBX/EAX. + // The core crystal clock may differ from the reference clock, bus clock, or core clock + // frequencies. + // EAX Bits 31 - 00: An unsigned integer which is the denominator of the + // TSC/"core crystal clock" ratio. + // EBX Bits 31 - 00: An unsigned integer which is the numerator of the + // TSC/"core crystal clock" ratio. + // ECX Bits 31 - 00: An unsigned integer which is the nominal frequency of the core + // crystal clock in Hz. + // EDX Bits 31 - 00: Reserved = 0. + asm volatile("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx) : "a"(0x15), "c"(0)); + if (ebx == 0 || ecx == 0) { + return computeScalingFactor(); + } + double coreCrystalFreq = ecx; + // frequency = coreCrystalFreq * (ebx / eax) + // scaling_factor = seconds_to_microseconds / frequency + // = seconds_to_microseconds * eax / (coreCrystalFreq * ebx) + double seconds_to_microseconds = 1000 * 1000; + double scaling_factor = (seconds_to_microseconds * eax) / (coreCrystalFreq * ebx); + return scaling_factor; + } +#endif + + // Scaling factor to convert timestamp counter into wall clock time reported in micro seconds. + // This is initialized at the start of tracing using the timestamp counter update frequency. + // See InitializeTimestampCounters for more details. 
+ static double tsc_to_microsec_scaling_factor; +}; + +} // namespace art + +#endif // ART_RUNTIME_TRACE_COMMON_H_ diff --git a/runtime/trace_profile.cc b/runtime/trace_profile.cc index 6907e28972..6aa09303d4 100644 --- a/runtime/trace_profile.cc +++ b/runtime/trace_profile.cc @@ -17,17 +17,22 @@ #include "trace_profile.h" #include "android-base/stringprintf.h" +#include "arch/context.h" #include "art_method-inl.h" #include "base/leb128.h" #include "base/mutex.h" #include "base/unix_file/fd_file.h" #include "com_android_art_flags.h" #include "dex/descriptors_names.h" +#include "gc/task_processor.h" +#include "oat/oat_quick_method_header.h" #include "runtime.h" +#include "stack.h" #include "thread-current-inl.h" #include "thread.h" #include "thread_list.h" #include "trace.h" +#include "trace_common.h" namespace art_flags = com::android::art::flags; @@ -40,6 +45,8 @@ using android::base::StringPrintf; // sizeof(uintptr_t). static constexpr size_t kMaxBytesPerTraceEntry = sizeof(uintptr_t); +static constexpr size_t kMaxEntriesAfterFlush = kAlwaysOnTraceBufSize / 2; + // We don't handle buffer overflows when processing the raw trace entries. We have a maximum of // kAlwaysOnTraceBufSize raw entries and we need a maximum of kMaxBytesPerTraceEntry to encode // each entry. To avoid overflow, we ensure that there are at least kMinBufSizeForEncodedData @@ -55,6 +62,21 @@ static constexpr size_t kAlwaysOnTraceHeaderSize = 8; bool TraceProfiler::profile_in_progress_ = false; +int TraceProfiler::num_trace_stop_tasks_ = 0; + +TraceData* TraceProfiler::trace_data_ = nullptr; + +void TraceData::AddTracedThread(Thread* thread) { + size_t thread_id = thread->GetTid(); + if (traced_threads_.find(thread_id) != traced_threads_.end()) { + return; + } + + std::string thread_name; + thread->GetThreadName(thread_name); + traced_threads_.emplace(thread_id, thread_name); +} + void TraceProfiler::AllocateBuffer(Thread* thread) { if (!art_flags::always_enable_profile_code()) { return; @@ -71,7 +93,82 @@ void TraceProfiler::AllocateBuffer(Thread* thread) { thread->SetMethodTraceBuffer(buffer, kAlwaysOnTraceBufSize); } -void TraceProfiler::Start() { +LowOverheadTraceType TraceProfiler::GetTraceType() { + MutexLock mu(Thread::Current(), *Locks::trace_lock_); + // LowOverhead trace entry points are configured based on the trace type. When trace_data_ is null + // then there is no low overhead tracing running, so we use nop entry points. + if (trace_data_ == nullptr) { + return LowOverheadTraceType::kNone; + } + + return trace_data_->GetTraceType(); +} + +namespace { +void RecordMethodsOnThreadStack(Thread* thread, TraceData* trace_data) + REQUIRES(Locks::mutator_lock_) { + Locks::mutator_lock_->AssertExclusiveHeld(Thread::Current()); + + struct MethodEntryStackVisitor final : public StackVisitor { + MethodEntryStackVisitor(Thread* thread_in, Context* context) + : StackVisitor(thread_in, context, StackVisitor::StackWalkKind::kSkipInlinedFrames) {} + + bool VisitFrame() override REQUIRES_SHARED(Locks::mutator_lock_) { + ArtMethod* m = GetMethod(); + if (m != nullptr && !m->IsRuntimeMethod()) { + if (GetCurrentShadowFrame() != nullptr) { + // TODO(mythria): Support low-overhead tracing for the switch interpreter. + } else { + const OatQuickMethodHeader* method_header = GetCurrentOatQuickMethodHeader(); + if (method_header == nullptr) { + // TODO(mythria): Consider low-overhead tracing support for the GenericJni stubs. + } else { + // Ignore nterp methods. We don't support recording trace events in nterp. 
+ if (!method_header->IsNterpMethodHeader()) { + stack_methods_.push_back(m); + } + } + } + } + return true; + } + + std::vector<ArtMethod*> stack_methods_; + }; + + std::unique_ptr<Context> context(Context::Create()); + MethodEntryStackVisitor visitor(thread, context.get()); + visitor.WalkStack(true); + + // Create method entry events for all methods currently on the thread's stack. + // Annotate them with an 'S' to indicate they are methods at startup and the entry timestamp + // isn't accurate. + uintptr_t* method_trace_buffer = thread->GetMethodTraceBuffer(); + uint64_t init_time = TimestampCounter::GetMicroTime(TimestampCounter::GetTimestamp()); + std::ostringstream os; + os << "Thread:" << thread->GetTid() << "\n"; + for (auto smi = visitor.stack_methods_.rbegin(); smi != visitor.stack_methods_.rend(); smi++) { + os << "S->" << *smi << " " << init_time << "\n"; + trace_data->AddTracedMethod(*smi); + } + + // Record a placeholder method exit event into the buffer so we record method exits for the + // methods that are currently on stack. + method_trace_buffer[kAlwaysOnTraceBufSize - 1] = 0x1; + thread->SetMethodTraceBufferCurrentEntry(kAlwaysOnTraceBufSize - 1); + trace_data->AppendToLongRunningMethods(os.str()); + trace_data->AddTracedThread(thread); +} +} // namespace + +class TraceStopTask : public gc::HeapTask { + public: + explicit TraceStopTask(uint64_t target_run_time) : gc::HeapTask(target_run_time) {} + + void Run([[maybe_unused]] Thread* self) override { TraceProfiler::TraceTimeElapsed(); } +}; + +void TraceProfiler::Start(LowOverheadTraceType trace_type, uint64_t trace_duration_ns) { if (!art_flags::always_enable_profile_code()) { LOG(ERROR) << "Feature not supported. Please build with ART_ALWAYS_ENABLE_PROFILE_CODE."; return; @@ -89,17 +186,38 @@ void TraceProfiler::Start() { return; } + TimestampCounter::InitializeTimestampCounters(); profile_in_progress_ = true; + trace_data_ = new TraceData(trace_type); + + { + ScopedSuspendAll ssa(__FUNCTION__); + MutexLock tl(self, *Locks::thread_list_lock_); + for (Thread* thread : Runtime::Current()->GetThreadList()->GetList()) { + auto buffer = new uintptr_t[kAlwaysOnTraceBufSize]; + memset(buffer, 0, kAlwaysOnTraceBufSize * sizeof(uintptr_t)); + thread->SetMethodTraceBuffer(buffer, kAlwaysOnTraceBufSize); + if (trace_type == LowOverheadTraceType::kLongRunningMethods) { + // Record methods that are currently on stack. + RecordMethodsOnThreadStack(thread, trace_data_); + thread->UpdateTlsLowOverheadTraceEntrypoints(LowOverheadTraceType::kLongRunningMethods); + } else { + thread->UpdateTlsLowOverheadTraceEntrypoints(LowOverheadTraceType::kAllMethods); + } + } + } - ScopedSuspendAll ssa(__FUNCTION__); - MutexLock tl(self, *Locks::thread_list_lock_); - for (Thread* thread : Runtime::Current()->GetThreadList()->GetList()) { - auto buffer = new uintptr_t[kAlwaysOnTraceBufSize]; - memset(buffer, 0, kAlwaysOnTraceBufSize * sizeof(uintptr_t)); - thread->SetMethodTraceBuffer(buffer, kAlwaysOnTraceBufSize); + if (trace_type == LowOverheadTraceType::kLongRunningMethods) { + // Add a Task that stops the tracing after trace_duration. + Runtime::Current()->GetHeap()->AddHeapTask(new TraceStopTask(NanoTime() + trace_duration_ns)); + num_trace_stop_tasks_++; } } +void TraceProfiler::Start() { + TraceProfiler::Start(LowOverheadTraceType::kAllMethods, /* trace_duration_ns= */ 0); +} + void TraceProfiler::Stop() { if (!art_flags::always_enable_profile_code()) { LOG(ERROR) << "Feature not supported. 
Please build with ART_ALWAYS_ENABLE_PROFILE_CODE."; @@ -108,22 +226,49 @@ void TraceProfiler::Stop() { Thread* self = Thread::Current(); MutexLock mu(self, *Locks::trace_lock_); + TraceProfiler::StopLocked(); +} + +void TraceProfiler::StopLocked() { if (!profile_in_progress_) { LOG(ERROR) << "No Profile in progress but a stop was requested"; return; } - ScopedSuspendAll ssa(__FUNCTION__); - MutexLock tl(self, *Locks::thread_list_lock_); - for (Thread* thread : Runtime::Current()->GetThreadList()->GetList()) { + FunctionClosure reset_buffer([](Thread* thread) { auto buffer = thread->GetMethodTraceBuffer(); if (buffer != nullptr) { delete[] buffer; thread->SetMethodTraceBuffer(/* buffer= */ nullptr, /* offset= */ 0); } - } + thread->UpdateTlsLowOverheadTraceEntrypoints(LowOverheadTraceType::kNone); + }); + Runtime::Current()->GetThreadList()->RunCheckpoint(&reset_buffer); profile_in_progress_ = false; + DCHECK_NE(trace_data_, nullptr); + delete trace_data_; + trace_data_ = nullptr; +} + +void TraceProfiler::DumpThreadMethodInfo( + const std::unordered_map<size_t, std::string>& traced_threads, + const std::unordered_set<ArtMethod*>& traced_methods, + std::ostringstream& os) { + // Dump data about thread information. + os << "\n*threads\n"; + for (const auto& it : traced_threads) { + os << it.first << "\t" << it.second << "\n"; + } + + // Dump data about method information. + os << "*methods\n"; + for (ArtMethod* method : traced_methods) { + ArtMethod* method_ptr = reinterpret_cast<ArtMethod*>(method); + os << method_ptr << "\t" << GetMethodInfoLine(method); + } + + os << "*end"; } uint8_t* TraceProfiler::DumpBuffer(uint32_t thread_id, @@ -186,14 +331,6 @@ void TraceProfiler::Dump(int fd) { Dump(std::move(trace_file)); } -std::string TraceProfiler::GetMethodInfoLine(ArtMethod* method) { - return StringPrintf("%s\t%s\t%s\t%s\n", - PrettyDescriptor(method->GetDeclaringClassDescriptor()).c_str(), - method->GetName(), - method->GetSignature().ToString().c_str(), - method->GetDeclaringClassSourceFile()); -} - void TraceProfiler::Dump(const char* filename) { if (!art_flags::always_enable_profile_code()) { LOG(ERROR) << "Feature not supported. Please build with ART_ALWAYS_ENABLE_PROFILE_CODE."; @@ -210,15 +347,23 @@ void TraceProfiler::Dump(const char* filename) { } void TraceProfiler::Dump(std::unique_ptr<File>&& trace_file) { - Thread* self = Thread::Current(); - std::unordered_set<ArtMethod*> traced_methods; - std::unordered_map<size_t, std::string> traced_threads; - MutexLock mu(self, *Locks::trace_lock_); + MutexLock mu(Thread::Current(), *Locks::trace_lock_); if (!profile_in_progress_) { LOG(ERROR) << "No Profile in progress. 
Nothing to dump."; return; } + if (trace_data_->GetTraceType() == LowOverheadTraceType::kAllMethods) { + DumpTrace(std::move(trace_file)); + } else { + DumpLongRunningMethods(std::move(trace_file)); + } +} + +void TraceProfiler::DumpTrace(std::unique_ptr<File>&& trace_file) { + Thread* self = Thread::Current(); + std::unordered_set<ArtMethod*> traced_methods; + std::unordered_map<size_t, std::string> traced_threads; uint8_t* buffer_ptr = new uint8_t[kBufSizeForEncodedData]; uint8_t* curr_buffer_ptr = buffer_ptr; @@ -237,7 +382,7 @@ void TraceProfiler::Dump(std::unique_ptr<File>&& trace_file) { std::string thread_name; thread->GetThreadName(thread_name); - traced_threads.emplace(thread->GetThreadId(), thread_name); + traced_threads.emplace(thread->GetTid(), thread_name); size_t offset = curr_buffer_ptr - buffer_ptr; if (offset >= kMinBufSizeForEncodedData) { @@ -264,21 +409,7 @@ void TraceProfiler::Dump(std::unique_ptr<File>&& trace_file) { } std::ostringstream os; - // Dump data about thread information. - os << "\n*threads\n"; - for (const auto& it : traced_threads) { - os << it.first << "\t" << it.second << "\n"; - } - - // Dump data about method information. - os << "*methods\n"; - for (ArtMethod* method : traced_methods) { - uint64_t method_id = reinterpret_cast<uint64_t>(method); - os << method_id << "\t" << GetMethodInfoLine(method); - } - - os << "*end"; - + DumpThreadMethodInfo(traced_threads, traced_methods, os); std::string info = os.str(); if (!trace_file->WriteFully(info.c_str(), info.length())) { PLOG(WARNING) << "Failed writing information to file"; @@ -304,4 +435,171 @@ bool TraceProfiler::IsTraceProfileInProgress() { return profile_in_progress_; } +void TraceProfiler::StartTraceLongRunningMethods(uint64_t trace_duration_ns) { + TraceProfiler::Start(LowOverheadTraceType::kLongRunningMethods, trace_duration_ns); +} + +void TraceProfiler::TraceTimeElapsed() { + MutexLock mu(Thread::Current(), *Locks::trace_lock_); + num_trace_stop_tasks_--; + if (num_trace_stop_tasks_ == 0) { + // Only stop the trace if this event corresponds to the currently running trace. + TraceProfiler::StopLocked(); + } +} + +void TraceProfiler::DumpLongRunningMethodBuffer(uint32_t thread_id, + uintptr_t* method_trace_entries, + uintptr_t* end_trace_entries, + std::unordered_set<ArtMethod*>& methods, + std::ostringstream& os) { + os << "Thread:" << thread_id << "\n"; + for (uintptr_t* ptr = method_trace_entries + kAlwaysOnTraceBufSize - 1; + ptr >= end_trace_entries;) { + uintptr_t entry = *ptr; + if (entry == 0x1) { + // This is the special placeholder exit we added to record all methods on the stack at the + // start of the trace. Just ignore this entry. + } else if (entry & 0x1) { + // Method exit + os << "<-" << TimestampCounter::GetMicroTime(entry & ~1) << "\n"; + } else { + // Method entry + ArtMethod* method = reinterpret_cast<ArtMethod*>(entry); + ptr--; + CHECK(ptr >= end_trace_entries); + os << "->" << method << " " << TimestampCounter::GetMicroTime(*ptr) << "\n"; + methods.insert(method); + } + ptr--; + } +} + +void TraceProfiler::FlushBufferAndRecordTraceEvent(ArtMethod* method, + Thread* thread, + bool is_entry) { + uint64_t timestamp = TimestampCounter::GetTimestamp(); + std::ostringstream os; + std::unordered_set<ArtMethod*> traced_methods; + uintptr_t* method_trace_entries = thread->GetMethodTraceBuffer(); + DCHECK(method_trace_entries != nullptr); + uintptr_t** method_trace_curr_ptr = thread->GetTraceBufferCurrEntryPtr(); + + // Find the last method exit event. 
We can flush all the entries before this event. We cannot + // flush remaining events because we haven't determined if they are long running or not. + uintptr_t* processed_events_ptr = nullptr; + for (uintptr_t* ptr = *method_trace_curr_ptr; + ptr < method_trace_entries + kAlwaysOnTraceBufSize;) { + if (*ptr & 0x1) { + // Method exit. We need to keep events until (including this method exit) here. + processed_events_ptr = ptr + 1; + break; + } + ptr += 2; + } + + size_t num_occupied_entries = (processed_events_ptr - *method_trace_curr_ptr); + size_t index = kAlwaysOnTraceBufSize; + if (num_occupied_entries > kMaxEntriesAfterFlush) { + // If we don't have sufficient space just record a placeholder exit and flush all the existing + // events. We have accurate timestamps to filter out these events in a post-processing step. + // This would happen only when we have very deeply (~1024) nested code. + DumpLongRunningMethodBuffer( + thread->GetTid(), method_trace_entries, *method_trace_curr_ptr, traced_methods, os); + + // Encode a placeholder exit event. This will be ignored when dumping the methods. + method_trace_entries[--index] = 0x1; + } else { + // Flush all the entries till the method exit event. + DumpLongRunningMethodBuffer( + thread->GetTid(), method_trace_entries, processed_events_ptr, traced_methods, os); + + // Move the remaining events to the start of the buffer. + for (uintptr_t* ptr = processed_events_ptr - 1; ptr >= *method_trace_curr_ptr; ptr--) { + method_trace_entries[--index] = *ptr; + } + } + + // Record new entry + if (is_entry) { + method_trace_entries[--index] = reinterpret_cast<uintptr_t>(method); + method_trace_entries[--index] = timestamp & ~1; + } else { + if (method_trace_entries[index] & 0x1) { + method_trace_entries[--index] = timestamp | 1; + } else { + size_t prev_timestamp = method_trace_entries[index]; + if (timestamp - prev_timestamp < kLongRunningMethodThreshold) { + index += 2; + DCHECK_LT(index, kAlwaysOnTraceBufSize); + } else { + method_trace_entries[--index] = timestamp | 1; + } + } + } + *method_trace_curr_ptr = method_trace_entries + index; + + MutexLock mu(Thread::Current(), *Locks::trace_lock_); + trace_data_->AppendToLongRunningMethods(os.str()); + trace_data_->AddTracedMethods(traced_methods); + trace_data_->AddTracedThread(thread); +} + +std::string TraceProfiler::GetLongRunningMethodsString() { + if (!art_flags::always_enable_profile_code()) { + return std::string(); + } + + MutexLock mu(Thread::Current(), *Locks::trace_lock_); + if (!profile_in_progress_) { + return std::string(); + } + + return GetLongRunningMethodsStringLocked(); +} + +std::string TraceProfiler::GetLongRunningMethodsStringLocked() { + Thread* self = Thread::Current(); + std::ostringstream os; + ScopedSuspendAll ssa(__FUNCTION__); + MutexLock tl(self, *Locks::thread_list_lock_); + + // Get any data that was previously flushed. 
+ const std::string& old_data = trace_data_->GetLongRunningMethods(); + if (old_data.length() > 0) { + os << trace_data_->GetLongRunningMethods(); + } + + // Collect long running methods from all the threads; + std::unordered_set<ArtMethod*> traced_methods; + for (Thread* thread : Runtime::Current()->GetThreadList()->GetList()) { + auto method_trace_entries = thread->GetMethodTraceBuffer(); + if (method_trace_entries == nullptr) { + continue; + } + + trace_data_->AddTracedThread(thread); + uintptr_t* method_trace_curr_ptr = *(thread->GetTraceBufferCurrEntryPtr()); + DumpLongRunningMethodBuffer( + thread->GetTid(), method_trace_entries, method_trace_curr_ptr, traced_methods, os); + } + + trace_data_->AddTracedMethods(traced_methods); + + // Dump the information about traced_methods and threads + DumpThreadMethodInfo(trace_data_->GetTracedThreads(), trace_data_->GetTracedMethods(), os); + return os.str(); +} + +void TraceProfiler::DumpLongRunningMethods(std::unique_ptr<File>&& trace_file) { + std::string info = GetLongRunningMethodsStringLocked(); + if (!trace_file->WriteFully(info.c_str(), info.length())) { + PLOG(WARNING) << "Failed writing information to file"; + } + + if (!trace_file->Close()) { + PLOG(WARNING) << "Failed to close file."; + } +} + } // namespace art diff --git a/runtime/trace_profile.h b/runtime/trace_profile.h index 694eb7ac99..c71cb2aee6 100644 --- a/runtime/trace_profile.h +++ b/runtime/trace_profile.h @@ -31,6 +31,67 @@ class ArtMethod; // entries required in the buffer. static constexpr size_t kAlwaysOnTraceBufSize = 2048; +// The typical frequency at which the timestamp counters are updated is 24576000. +// 2^22 (4194304) corresponds to about 170ms at that frequency. +static constexpr size_t kLongRunningMethodThreshold = 1 << 22; + +enum class LowOverheadTraceType { + kLongRunningMethods, + kAllMethods, + kNone +}; + +class TraceData { + public: + explicit TraceData(LowOverheadTraceType trace_type) : trace_type_(trace_type) { + } + + LowOverheadTraceType GetTraceType() const { + return trace_type_; + } + + const std::string& GetLongRunningMethods() const { + return long_running_methods_; + } + + const std::unordered_set<ArtMethod*>& GetTracedMethods() const { + return traced_methods_; + } + + const std::unordered_map<size_t, std::string>& GetTracedThreads() const { + return traced_threads_; + } + + void AppendToLongRunningMethods(const std::string& str) { + long_running_methods_.append(str); + } + + void AddTracedMethods(std::unordered_set<ArtMethod*>& methods) { + traced_methods_.merge(methods); + } + + void AddTracedMethod(ArtMethod* method) { + traced_methods_.insert(method); + } + + void AddTracedThread(Thread* thread); + + private: + // This is used to hold the initial methods on stack and also long running methods when there is a + // buffer overflow. + std::string long_running_methods_; + + LowOverheadTraceType trace_type_; + + // These hold the methods and threads see so far. These are used to generate information about + // the methods and threads. + std::unordered_set<ArtMethod*> traced_methods_; + + // Threads might exit before we dump the data, so record thread id and name when we see a new + // thread. + std::unordered_map<size_t, std::string> traced_threads_; +}; + // This class implements low-overhead tracing. This feature is available only when // always_enable_profile_code is enabled which is a build time flag defined in // build/flags/art-flags.aconfig. 
When this flag is enabled, AOT and JITed code can record events @@ -42,6 +103,10 @@ class TraceProfiler { // Starts profiling by allocating a per-thread buffer for all the threads. static void Start(); + // Starts recording long running methods. A long running method means any + // method that executes for more than kLongRunningMethodDuration. + static void StartTraceLongRunningMethods(uint64_t trace_duration_ns); + // Releases all the buffers. static void Stop(); @@ -49,6 +114,10 @@ class TraceProfiler { static void Dump(int fd); static void Dump(const char* trace_filename); + // Get the long running methods as a string. This is used in the sigquit handler to record + // information about long running methods. + static std::string GetLongRunningMethodsString(); + // Called when thread is exiting to release the allocated buffer. static void ReleaseThreadBuffer(Thread* self) REQUIRES(Locks::trace_lock_); @@ -57,10 +126,40 @@ class TraceProfiler { // Allocates a buffer for the specified thread. static void AllocateBuffer(Thread* thread); + // Used to flush the long running method buffer when it is full. This method flushes all methods + // that have already seen an exit and records them into a string. If we don't have sufficient free + // entries after this processing (for example: if we have a really deep call stack) then we record + // a placeholder method exit event and flush all events. + static void FlushBufferAndRecordTraceEvent(ArtMethod* method, Thread* thread, bool is_entry); + + static LowOverheadTraceType GetTraceType(); + + // Callback that is run when the specified duration for the long running trace has elapsed. If the + // trace is still running then tracing is stopped and all buffers are released. If the trace + // has already stopped then this request is ignored. + static void TraceTimeElapsed(); + private: - // Dumps the events from all threads into the trace_file. + // Starts tracing. + static void Start(LowOverheadTraceType trace_type, uint64_t trace_duration_ns); + + // Dumps the tracing data into the specified trace_file static void Dump(std::unique_ptr<File>&& trace_file); + // Stops tracing. + static void StopLocked() REQUIRES(Locks::trace_lock_); + + // Returns the information about long running methods as a string. Used both by Dump + // and GetLongRunningMethodsString. + static std::string GetLongRunningMethodsStringLocked() REQUIRES(Locks::trace_lock_); + + // Dumps the events from all threads into the trace_file. + static void DumpTrace(std::unique_ptr<File>&& trace_file) REQUIRES(Locks::trace_lock_); + + // Dumps the long running methods from all threads into the trace_file. + static void DumpLongRunningMethods(std::unique_ptr<File>&& trace_file) + REQUIRES(Locks::trace_lock_); + // This method goes over all the events in the thread_buffer and stores the encoded event in the // buffer. It returns the pointer to the next free entry in the buffer. // This also records the ArtMethods from the events in the thread_buffer in a set. This set is @@ -71,9 +170,28 @@ class TraceProfiler { uint8_t* buffer /* out */, std::unordered_set<ArtMethod*>& methods /* out */); - static std::string GetMethodInfoLine(ArtMethod* method) REQUIRES(Locks::mutator_lock_); + // Dumps all the events in the buffer into the file. Also records the ArtMethods from the events + // which is then used to record information about these methods. 
+ static void DumpLongRunningMethodBuffer(uint32_t thread_id, + uintptr_t* thread_buffer, + uintptr_t* end_buffer, + std::unordered_set<ArtMethod*>& methods /* out */, + std::ostringstream& os); + + // Records the thread and method info. + static void DumpThreadMethodInfo(const std::unordered_map<size_t, std::string>& traced_threads, + const std::unordered_set<ArtMethod*>& traced_methods, + std::ostringstream& os) REQUIRES(Locks::mutator_lock_); static bool profile_in_progress_ GUARDED_BY(Locks::trace_lock_); + + // Keeps track of number of outstanding trace stop tasks. We should only stop a trace when the + // count is 0. If a trace was already stopped and a new trace has started before the time elapsed + // for the previous one we shouldn't stop the new trace. + static int num_trace_stop_tasks_ GUARDED_BY(Locks::trace_lock_); + + static TraceData* trace_data_ GUARDED_BY(Locks::trace_lock_); + DISALLOW_COPY_AND_ASSIGN(TraceProfiler); }; diff --git a/tools/cpp-define-generator/thread.def b/tools/cpp-define-generator/thread.def index aa621697c8..f5eefad3b6 100644 --- a/tools/cpp-define-generator/thread.def +++ b/tools/cpp-define-generator/thread.def @@ -77,3 +77,4 @@ ASM_DEFINE(TRACE_BUFFER_INIT_OFFSET, ASM_DEFINE(TRACE_BUFFER_CURRENT_OFFSET, art::Thread::TraceBufferCurrPtrOffset<art::kRuntimePointerSize>().Int32Value()) ASM_DEFINE(TRACE_BUFFER_SIZE, (art::kAlwaysOnTraceBufSize - 1) * sizeof(uintptr_t)) +ASM_DEFINE(LONG_RUNNING_METHOD_THRESHOLD, art::kLongRunningMethodThreshold) |
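A closing note on the new LONG_RUNNING_METHOD_THRESHOLD define: kLongRunningMethodThreshold is expressed in raw timestamp-counter ticks, so the wall-clock cutoff depends on the counter frequency. A quick, illustrative check of the arithmetic from the trace_profile.h comment:

```cpp
#include <cstdio>

int main() {
  // Values taken from the comment in runtime/trace_profile.h in this CL.
  constexpr double kTypicalCounterHz = 24576000.0;  // typical cntvct_el0 frequency
  constexpr double kThresholdTicks = 1 << 22;       // kLongRunningMethodThreshold
  // 4194304 ticks / 24576000 Hz ≈ 0.17 s, i.e. the ~170 ms cited in the comment.
  std::printf("long-running cutoff ≈ %.1f ms\n",
              kThresholdTicks / kTypicalCounterHz * 1000.0);
  return 0;
}
```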