Rewrite JVMTI method tracing to use trampolines

Now that we have working instrumentation trampolines, we can rewrite
the MethodEntry and MethodExit events to use them instead of forcing
the interpreter. This improves the performance of traced code by
letting method bodies keep running their optimized JIT code.
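
For background, events in this mode are consumed by an ordinary JVMTI
agent that registers MethodEntry/MethodExit callbacks. A minimal sketch
of such an agent follows; the callback, capability, and function names
come from the JVMTI specification, but the agent itself is purely
illustrative and not part of this change:

    #include <jvmti.h>
    #include <cstdio>

    // Print the name of every method as it is entered.
    static void JNICALL OnMethodEntry(jvmtiEnv* jvmti, JNIEnv* jni,
                                      jthread thread, jmethodID method) {
      char* name = nullptr;
      if (jvmti->GetMethodName(method, &name, nullptr, nullptr) == JVMTI_ERROR_NONE) {
        std::printf("enter %s\n", name);
        jvmti->Deallocate(reinterpret_cast<unsigned char*>(name));
      }
    }

    extern "C" JNIEXPORT jint JNICALL Agent_OnLoad(JavaVM* vm, char* options, void* reserved) {
      jvmtiEnv* jvmti = nullptr;
      if (vm->GetEnv(reinterpret_cast<void**>(&jvmti), JVMTI_VERSION_1_2) != JNI_OK) {
        return 1;
      }
      jvmtiCapabilities caps = {};
      caps.can_generate_method_entry_events = 1;  // Required before enabling the event.
      jvmti->AddCapabilities(&caps);
      jvmtiEventCallbacks callbacks = {};
      callbacks.MethodEntry = OnMethodEntry;
      jvmti->SetEventCallbacks(&callbacks, sizeof(callbacks));
      jvmti->SetEventNotificationMode(JVMTI_ENABLE, JVMTI_EVENT_METHOD_ENTRY, nullptr);
      return JNI_OK;
    }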

Test: ./test/testrunner/testrunner.py --host --runtime-option=-Xjitthreshold:0 --jit
Test: ./test/testrunner/testrunner.py --host --jit --debug --runtime-option=-Xplugin:libtracefast-trampolined.so
Test: ./test/testrunner/testrunner.py --host --jit --debug --runtime-option=-Xjitthreshold:0 --runtime-option=-Xplugin:libtracefast-trampolined.so
Test: ./test/testrunner/testrunner.py --host --debuggable --run-test-option='--with-agent libtifast.so=MethodEntry,MethodExit'
Test: ./test/testrunner/testrunner.py --host --debuggable --jit --runtime-option=-Xjitthreshold:0 --run-test-option='--with-agent libtifast.so=MethodEntry,MethodExit'
Test: adb shell am start-activity -S --attach-agent /data/local/tmp/libtifast.so=MethodEntry,MethodExit <some app>

Change-Id: I37c2ece14b1b12a79390da7d668ebbefa2966f87
diff --git a/openjdkjvmti/deopt_manager.cc b/openjdkjvmti/deopt_manager.cc
index 2f24d7e..b444cc7 100644
--- a/openjdkjvmti/deopt_manager.cc
+++ b/openjdkjvmti/deopt_manager.cc
@@ -40,7 +40,9 @@
 #include "dex/dex_file_annotations.h"
 #include "dex/modifiers.h"
 #include "events-inl.h"
+#include "intrinsics_list.h"
 #include "jit/jit.h"
+#include "jit/jit_code_cache.h"
 #include "jni/jni_internal.h"
 #include "mirror/class-inl.h"
 #include "mirror/object_array-inl.h"
@@ -84,11 +86,13 @@
     deoptimization_condition_("JVMTI_DeoptimizationCondition", deoptimization_status_lock_),
     performing_deoptimization_(false),
     global_deopt_count_(0),
+    global_interpreter_deopt_count_(0),
     deopter_count_(0),
     breakpoint_status_lock_("JVMTI_BreakpointStatusLock",
                             static_cast<art::LockLevel>(art::LockLevel::kAbortLock + 1)),
     inspection_callback_(this),
-    set_local_variable_called_(false) { }
+    set_local_variable_called_(false),
+    already_disabled_intrinsics_(false) { }
 
 void DeoptManager::Setup() {
   art::ScopedThreadStateChange stsc(art::Thread::Current(),
@@ -159,18 +163,18 @@
   return elem != breakpoint_status_.end() && elem->second != 0;
 }
 
-void DeoptManager::RemoveDeoptimizeAllMethods() {
+void DeoptManager::RemoveDeoptimizeAllMethods(FullDeoptRequirement req) {
   art::Thread* self = art::Thread::Current();
   art::ScopedThreadSuspension sts(self, art::kSuspended);
   deoptimization_status_lock_.ExclusiveLock(self);
-  RemoveDeoptimizeAllMethodsLocked(self);
+  RemoveDeoptimizeAllMethodsLocked(self, req);
 }
 
-void DeoptManager::AddDeoptimizeAllMethods() {
+void DeoptManager::AddDeoptimizeAllMethods(FullDeoptRequirement req) {
   art::Thread* self = art::Thread::Current();
   art::ScopedThreadSuspension sts(self, art::kSuspended);
   deoptimization_status_lock_.ExclusiveLock(self);
-  AddDeoptimizeAllMethodsLocked(self);
+  AddDeoptimizeAllMethodsLocked(self, req);
 }
 
 void DeoptManager::AddMethodBreakpoint(art::ArtMethod* method) {
@@ -207,7 +211,7 @@
     deoptimization_status_lock_.ExclusiveUnlock(self);
     return;
   } else if (is_default) {
-    AddDeoptimizeAllMethodsLocked(self);
+    AddDeoptimizeAllMethodsLocked(self, FullDeoptRequirement::kInterpreter);
   } else {
     PerformLimitedDeoptimization(self, method);
   }
@@ -244,7 +248,7 @@
     return;
   } else if (is_last_breakpoint) {
     if (UNLIKELY(is_default)) {
-      RemoveDeoptimizeAllMethodsLocked(self);
+      RemoveDeoptimizeAllMethodsLocked(self, FullDeoptRequirement::kInterpreter);
     } else {
       PerformLimitedUndeoptimization(self, method);
     }
@@ -272,13 +276,22 @@
       RELEASE(deopt->deoptimization_status_lock_)
       ACQUIRE(art::Locks::mutator_lock_)
       ACQUIRE(art::Roles::uninterruptible_)
-      : self_(self), deopt_(deopt), uninterruptible_cause_(nullptr) {
+      : self_(self),
+        deopt_(deopt),
+        uninterruptible_cause_(nullptr),
+        jit_(art::Runtime::Current()->GetJit()) {
     deopt_->WaitForDeoptimizationToFinishLocked(self_);
     DCHECK(!deopt->performing_deoptimization_)
         << "Already performing deoptimization on another thread!";
     // Use performing_deoptimization_ to keep track of the lock.
     deopt_->performing_deoptimization_ = true;
     deopt_->deoptimization_status_lock_.Unlock(self_);
+    // Stop the jit. We might need to disable all intrinsics, which requires the jit to be
+    // stopped, and this is the only place we can do that. Since this path is not entered often
+    // it should be fine to always stop the jit here.
+    if (jit_ != nullptr) {
+      jit_->Stop();
+    }
     art::Runtime::Current()->GetThreadList()->SuspendAll("JMVTI Deoptimizing methods",
                                                          /*long_suspend*/ false);
     uninterruptible_cause_ = self_->StartAssertNoThreadSuspension("JVMTI deoptimizing methods");
@@ -291,6 +304,10 @@
     self_->EndAssertNoThreadSuspension(uninterruptible_cause_);
     // Release the mutator lock.
     art::Runtime::Current()->GetThreadList()->ResumeAll();
+    // Let the jit start again.
+    if (jit_ != nullptr) {
+      jit_->Start();
+    }
     // Let other threads know it's fine to proceed.
     art::MutexLock lk(self_, deopt_->deoptimization_status_lock_);
     deopt_->performing_deoptimization_ = false;
@@ -301,22 +318,44 @@
   art::Thread* self_;
   DeoptManager* deopt_;
   const char* uninterruptible_cause_;
+  art::jit::Jit* jit_;
 };
 
-void DeoptManager::AddDeoptimizeAllMethodsLocked(art::Thread* self) {
+void DeoptManager::AddDeoptimizeAllMethodsLocked(art::Thread* self, FullDeoptRequirement req) {
+  DCHECK_GE(global_deopt_count_, global_interpreter_deopt_count_);
   global_deopt_count_++;
+  if (req == FullDeoptRequirement::kInterpreter) {
+    global_interpreter_deopt_count_++;
+  }
   if (global_deopt_count_ == 1) {
-    PerformGlobalDeoptimization(self);
+    PerformGlobalDeoptimization(self,
+                                /*needs_interpreter*/ global_interpreter_deopt_count_ > 0,
+                                /*disable_intrinsics*/ global_interpreter_deopt_count_ == 0);
+  } else if (req == FullDeoptRequirement::kInterpreter && global_interpreter_deopt_count_ == 1) {
+    // First kInterpreter request.
+    PerformGlobalDeoptimization(self,
+                                /*needs_interpreter*/true,
+                                /*disable_intrinsics*/false);
   } else {
     WaitForDeoptimizationToFinish(self);
   }
 }
 
-void DeoptManager::RemoveDeoptimizeAllMethodsLocked(art::Thread* self) {
+void DeoptManager::RemoveDeoptimizeAllMethodsLocked(art::Thread* self, FullDeoptRequirement req) {
   DCHECK_GT(global_deopt_count_, 0u) << "Request to remove non-existent global deoptimization!";
+  DCHECK_GE(global_deopt_count_, global_interpreter_deopt_count_);
   global_deopt_count_--;
+  if (req == FullDeoptRequirement::kInterpreter) {
+    global_interpreter_deopt_count_--;
+  }
   if (global_deopt_count_ == 0) {
-    PerformGlobalUndeoptimization(self);
+    PerformGlobalUndeoptimization(self,
+                                  /*still_needs_stubs*/ false,
+                                  /*disable_intrinsics*/ false);
+  } else if (req == FullDeoptRequirement::kInterpreter && global_interpreter_deopt_count_ == 0) {
+    PerformGlobalUndeoptimization(self,
+                                  /*still_needs_stubs*/ global_deopt_count_ > 0,
+                                  /*disable_intrinsics*/ global_deopt_count_ > 0);
   } else {
     WaitForDeoptimizationToFinish(self);
   }
@@ -332,18 +371,85 @@
   art::Runtime::Current()->GetInstrumentation()->Undeoptimize(method);
 }
 
-void DeoptManager::PerformGlobalDeoptimization(art::Thread* self) {
+void DeoptManager::PerformGlobalDeoptimization(art::Thread* self,
+                                               bool needs_interpreter,
+                                               bool disable_intrinsics) {
   ScopedDeoptimizationContext sdc(self, this);
-  art::Runtime::Current()->GetInstrumentation()->DeoptimizeEverything(
-      kDeoptManagerInstrumentationKey);
+  art::Runtime::Current()->GetInstrumentation()->EnableMethodTracing(
+      kDeoptManagerInstrumentationKey, needs_interpreter);
+  MaybeDisableIntrinsics(disable_intrinsics);
 }
 
-void DeoptManager::PerformGlobalUndeoptimization(art::Thread* self) {
+void DeoptManager::PerformGlobalUndeoptimization(art::Thread* self,
+                                                 bool still_needs_stubs,
+                                                 bool disable_intrinsics) {
   ScopedDeoptimizationContext sdc(self, this);
-  art::Runtime::Current()->GetInstrumentation()->UndeoptimizeEverything(
-      kDeoptManagerInstrumentationKey);
+  if (still_needs_stubs) {
+    art::Runtime::Current()->GetInstrumentation()->EnableMethodTracing(
+        kDeoptManagerInstrumentationKey, /*needs_interpreter*/false);
+    MaybeDisableIntrinsics(disable_intrinsics);
+  } else {
+    art::Runtime::Current()->GetInstrumentation()->DisableMethodTracing(
+        kDeoptManagerInstrumentationKey);
+    // We shouldn't care about intrinsics if we don't need tracing anymore.
+    DCHECK(!disable_intrinsics);
+  }
 }
 
+static void DisableSingleIntrinsic(const char* class_name,
+                                   const char* method_name,
+                                   const char* signature)
+    REQUIRES(art::Locks::mutator_lock_, art::Roles::uninterruptible_) {
+  // Since the classes owning these intrinsics are all loaded during runtime startup this cannot
+  // fail and will not suspend.
+  art::Thread* self = art::Thread::Current();
+  art::ClassLinker* class_linker = art::Runtime::Current()->GetClassLinker();
+  art::ObjPtr<art::mirror::Class> cls = class_linker->FindSystemClass(self, class_name);
+
+  if (cls == nullptr) {
+    LOG(FATAL) << "Could not find class of intrinsic "
+               << class_name << "->" << method_name << signature;
+  }
+
+  art::ArtMethod* method = cls->FindClassMethod(method_name, signature, art::kRuntimePointerSize);
+  if (method == nullptr || method->GetDeclaringClass() != cls) {
+    LOG(FATAL) << "Could not find method of intrinsic "
+               << class_name << "->" << method_name << signature;
+  }
+
+  if (LIKELY(method->IsIntrinsic())) {
+    method->SetNotIntrinsic();
+  } else {
+    LOG(WARNING) << method->PrettyMethod() << " was already marked as non-intrinsic!";
+  }
+}
+
+void DeoptManager::MaybeDisableIntrinsics(bool do_disable) {
+  if (!do_disable || already_disabled_intrinsics_) {
+    // Don't toggle intrinsics on and off: doing so would purge the jit too often and would
+    // require us to keep track of the intrinsic status of every method.
+    return;
+  }
+  already_disabled_intrinsics_ = true;
+  // First just mark all intrinsic methods as no longer intrinsics.
+#define DISABLE_INTRINSIC(_1, _2, _3, _4, _5, decl_class_name, meth_name, meth_desc) \
+    DisableSingleIntrinsic(decl_class_name, meth_name, meth_desc);
+  INTRINSICS_LIST(DISABLE_INTRINSIC);
+#undef DISABLE_INTRINSIC
+  // Next tell the jit to throw away all of its compiled code (there might be intrinsic code in it).
+  // TODO it would be nice to be more selective.
+  art::jit::Jit* jit = art::Runtime::Current()->GetJit();
+  if (jit != nullptr) {
+    jit->GetCodeCache()->ClearAllCompiledDexCode();
+  }
+  art::MutexLock mu(art::Thread::Current(), *art::Locks::thread_list_lock_);
+  // Now force all threads into the interpreter.
+  art::Runtime::Current()->GetThreadList()->ForEach(
+      [](art::Thread* thr, void* ctx) REQUIRES(art::Locks::mutator_lock_) {
+        reinterpret_cast<DeoptManager*>(ctx)->DeoptimizeThread(thr);
+      },
+      this);
+}
 
 void DeoptManager::RemoveDeoptimizationRequester() {
   art::Thread* self = art::Thread::Current();
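
The DISABLE_INTRINSIC expansion above relies on ART's INTRINSICS_LIST
X-macro, which invokes the supplied macro once per intrinsic with that
intrinsic's metadata. Below is a standalone sketch of the pattern; the
list contents are invented and carry fewer fields than ART's real
eight-field entries:

    #include <cstdio>

    // Hypothetical X-macro list: V(name, class descriptor, method name, signature).
    #define MY_INTRINSICS_LIST(V)                            \
      V(MathAbsInt, "Ljava/lang/Math;", "abs", "(I)I")       \
      V(StringLength, "Ljava/lang/String;", "length", "()I")

    int main() {
      // Expand the list into one statement per entry, as DISABLE_INTRINSIC does.
    #define PRINT_INTRINSIC(name, klass, meth, sig) \
      std::printf("disable %s->%s%s\n", klass, meth, sig);
      MY_INTRINSICS_LIST(PRINT_INTRINSIC)
    #undef PRINT_INTRINSIC
      return 0;
    }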
diff --git a/openjdkjvmti/deopt_manager.h b/openjdkjvmti/deopt_manager.h
index 6e991de..1a13c0f 100644
--- a/openjdkjvmti/deopt_manager.h
+++ b/openjdkjvmti/deopt_manager.h
@@ -51,6 +51,11 @@
 
 namespace openjdkjvmti {
 
+enum class FullDeoptRequirement {
+  kStubs,
+  kInterpreter,
+};
+
 class DeoptManager;
 
 struct JvmtiMethodInspectionCallback : public art::MethodInspectionCallback {
@@ -94,11 +99,11 @@
       REQUIRES(!deoptimization_status_lock_, !art::Roles::uninterruptible_)
       REQUIRES_SHARED(art::Locks::mutator_lock_);
 
-  void AddDeoptimizeAllMethods()
+  void AddDeoptimizeAllMethods(FullDeoptRequirement requirement)
       REQUIRES(!deoptimization_status_lock_, !art::Roles::uninterruptible_)
       REQUIRES_SHARED(art::Locks::mutator_lock_);
 
-  void RemoveDeoptimizeAllMethods()
+  void RemoveDeoptimizeAllMethods(FullDeoptRequirement requirement)
       REQUIRES(!deoptimization_status_lock_, !art::Roles::uninterruptible_)
       REQUIRES_SHARED(art::Locks::mutator_lock_);
 
@@ -132,19 +137,23 @@
   void WaitForDeoptimizationToFinishLocked(art::Thread* self)
       REQUIRES(deoptimization_status_lock_, !art::Locks::mutator_lock_);
 
-  void AddDeoptimizeAllMethodsLocked(art::Thread* self)
+  void AddDeoptimizeAllMethodsLocked(art::Thread* self, FullDeoptRequirement req)
       RELEASE(deoptimization_status_lock_)
       REQUIRES(!art::Roles::uninterruptible_, !art::Locks::mutator_lock_);
 
-  void RemoveDeoptimizeAllMethodsLocked(art::Thread* self)
+  void RemoveDeoptimizeAllMethodsLocked(art::Thread* self, FullDeoptRequirement req)
       RELEASE(deoptimization_status_lock_)
       REQUIRES(!art::Roles::uninterruptible_, !art::Locks::mutator_lock_);
 
-  void PerformGlobalDeoptimization(art::Thread* self)
+  void PerformGlobalDeoptimization(art::Thread* self,
+                                   bool needs_interpreter,
+                                   bool disable_intrinsics)
       RELEASE(deoptimization_status_lock_)
       REQUIRES(!art::Roles::uninterruptible_, !art::Locks::mutator_lock_);
 
-  void PerformGlobalUndeoptimization(art::Thread* self)
+  void PerformGlobalUndeoptimization(art::Thread* self,
+                                     bool still_needs_stubs,
+                                     bool disable_intrinsics)
       RELEASE(deoptimization_status_lock_)
       REQUIRES(!art::Roles::uninterruptible_, !art::Locks::mutator_lock_);
 
@@ -156,15 +165,25 @@
       RELEASE(deoptimization_status_lock_)
       REQUIRES(!art::Roles::uninterruptible_, !art::Locks::mutator_lock_);
 
+  // Disables intrinsics and purges the jit code cache if needed.
+  void MaybeDisableIntrinsics(bool do_disable)
+      REQUIRES(art::Locks::mutator_lock_,
+               !deoptimization_status_lock_,
+               art::Roles::uninterruptible_);
+
   static constexpr const char* kDeoptManagerInstrumentationKey = "JVMTI_DeoptManager";
 
   art::Mutex deoptimization_status_lock_ ACQUIRED_BEFORE(art::Locks::classlinker_classes_lock_);
   art::ConditionVariable deoptimization_condition_ GUARDED_BY(deoptimization_status_lock_);
   bool performing_deoptimization_ GUARDED_BY(deoptimization_status_lock_);
 
-  // Number of times we have gotten requests to deopt everything.
+  // Number of times we have been asked to deopt everything, counting both requests that require
+  // the interpreter and those that do not.
   uint32_t global_deopt_count_ GUARDED_BY(deoptimization_status_lock_);
 
+  // Number of deopt-everything requests that require interpreter.
+  uint32_t global_interpreter_deopt_count_ GUARDED_BY(deoptimization_status_lock_);
+
   // Number of users of deoptimization there currently are.
   uint32_t deopter_count_ GUARDED_BY(deoptimization_status_lock_);
 
@@ -182,6 +201,10 @@
   // OSR after this.
   std::atomic<bool> set_local_variable_called_;
 
+  // Whether we have already disabled intrinsics. Since doing so throws out all JIT code we will
+  // only ever do it once and never undo it.
+  bool already_disabled_intrinsics_ GUARDED_BY(art::Locks::mutator_lock_);
+
   // Helper for setting up/tearing-down for deoptimization.
   friend class ScopedDeoptimizationContext;
 };
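
Taken together, the two counters decide three things: stubs stay
installed while global_deopt_count_ > 0, the interpreter is forced
while global_interpreter_deopt_count_ > 0, and intrinsics need to be
disabled only when stubs are used without the interpreter (intrinsic
dispatch can bypass the entry/exit trampolines, while interpreted code
never uses intrinsics). A toy model of that decision logic, ignoring
the one-way stickiness of already_disabled_intrinsics_:

    #include <cassert>
    #include <cstdint>

    // Toy model of the DeoptManager counters and the state they imply.
    struct DeoptCounters {
      uint32_t total = 0;        // Models global_deopt_count_.
      uint32_t interpreter = 0;  // Models global_interpreter_deopt_count_.

      bool NeedsStubs() const { return total > 0; }
      bool NeedsInterpreter() const { return interpreter > 0; }
      bool NeedsIntrinsicsDisabled() const { return total > 0 && interpreter == 0; }
    };

    int main() {
      DeoptCounters c;
      c.total++;  // AddDeoptimizeAllMethods(FullDeoptRequirement::kStubs)
      assert(c.NeedsStubs() && !c.NeedsInterpreter() && c.NeedsIntrinsicsDisabled());
      c.total++;
      c.interpreter++;  // AddDeoptimizeAllMethods(FullDeoptRequirement::kInterpreter)
      assert(c.NeedsStubs() && c.NeedsInterpreter() && !c.NeedsIntrinsicsDisabled());
      return 0;
    }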
diff --git a/openjdkjvmti/events.cc b/openjdkjvmti/events.cc
index f71a5dc..10a4923 100644
--- a/openjdkjvmti/events.cc
+++ b/openjdkjvmti/events.cc
@@ -888,16 +888,29 @@
     case ArtJvmtiEvent::kBreakpoint:
     case ArtJvmtiEvent::kException:
       return false;
-    // TODO We should support more of these or at least do something to make them discriminate by
-    // thread.
+    default:
+      return true;
+  }
+}
+
+static FullDeoptRequirement GetFullDeoptRequirement(ArtJvmtiEvent event) {
+  switch (event) {
+    // TODO We should support more of these as Limited or at least do something to make them
+    // discriminate by thread.
     case ArtJvmtiEvent::kMethodEntry:
-    case ArtJvmtiEvent::kExceptionCatch:
     case ArtJvmtiEvent::kMethodExit:
+      // For these we only need the MethodEntered and MethodExited instrumentation events, so
+      // stubs (trampolines) are sufficient. We will need to disable intrinsics though.
+      // TODO Offer a version of this without disabling intrinsics.
+      return FullDeoptRequirement::kStubs;
+    case ArtJvmtiEvent::kExceptionCatch:
     case ArtJvmtiEvent::kFieldModification:
     case ArtJvmtiEvent::kFieldAccess:
     case ArtJvmtiEvent::kSingleStep:
+    // NB If we ever implement this with stubs or some other mechanism we will need to be careful
+    // that it does not end up requiring intrinsics to be disabled.
     case ArtJvmtiEvent::kFramePop:
-      return true;
+      return FullDeoptRequirement::kInterpreter;
     default:
       LOG(FATAL) << "Unexpected event type!";
       UNREACHABLE();
@@ -907,19 +920,18 @@
 void EventHandler::SetupTraceListener(JvmtiMethodTraceListener* listener,
                                       ArtJvmtiEvent event,
                                       bool enable) {
-  bool needs_full_deopt = EventNeedsFullDeopt(event);
   // Make sure we can deopt.
   {
     art::ScopedObjectAccess soa(art::Thread::Current());
     DeoptManager* deopt_manager = DeoptManager::Get();
     if (enable) {
       deopt_manager->AddDeoptimizationRequester();
-      if (needs_full_deopt) {
-        deopt_manager->AddDeoptimizeAllMethods();
+      if (EventNeedsFullDeopt(event)) {
+        deopt_manager->AddDeoptimizeAllMethods(GetFullDeoptRequirement(event));
       }
     } else {
-      if (needs_full_deopt) {
-        deopt_manager->RemoveDeoptimizeAllMethods();
+      if (EventNeedsFullDeopt(event)) {
+        deopt_manager->RemoveDeoptimizeAllMethods(GetFullDeoptRequirement(event));
       }
       deopt_manager->RemoveDeoptimizationRequester();
     }
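
In summary, the event-to-requirement mapping introduced above is:

    Event(s)                                    Full-deopt requirement
    MethodEntry, MethodExit                     kStubs (trampolines; intrinsics disabled)
    ExceptionCatch, FieldAccess,
    FieldModification, SingleStep, FramePop     kInterpreter
    Breakpoint, Exception                       none (limited, per-method deoptimization)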
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index ed449b5..86f0606 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -768,13 +768,20 @@
 void Jit::Stop() {
   Thread* self = Thread::Current();
   // TODO(ngeoffray): change API to not require calling WaitForCompilationToFinish twice.
+  // During shutdown and startup the thread-pool can be null.
+  if (GetThreadPool() == nullptr) {
+    return;
+  }
   WaitForCompilationToFinish(self);
   GetThreadPool()->StopWorkers(self);
   WaitForCompilationToFinish(self);
 }
 
 void Jit::Start() {
-  GetThreadPool()->StartWorkers(Thread::Current());
+  // During shutdown and startup the thread-pool can be null.
+  if (GetThreadPool() != nullptr) {
+    GetThreadPool()->StartWorkers(Thread::Current());
+  }
 }
 
 ScopedJitSuspend::ScopedJitSuspend() {
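
The null-pool guards make Stop()/Start() safe to call during startup
and shutdown. Callers are expected to pair the two, as the
ScopedDeoptimizationContext constructor/destructor above do; a generic
RAII sketch of that pairing (hypothetical helper, not ART code):

    #include <functional>

    // Runs `stop` immediately and `start` when the scope is left, so the
    // two calls stay balanced even on early returns.
    class ScopedStop {
     public:
      ScopedStop(const std::function<void()>& stop, std::function<void()> start)
          : start_(std::move(start)) {
        stop();
      }
      ~ScopedStop() { start_(); }

     private:
      std::function<void()> start_;
    };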
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index e635ad9..52262f4 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -362,6 +362,21 @@
   return nullptr;
 }
 
+void JitCodeCache::ClearAllCompiledDexCode() {
+  MutexLock mu(Thread::Current(), lock_);
+  // Get rid of OSR code waiting to be put on a thread.
+  osr_code_map_.clear();
+
+  // We don't clear out or even touch method_code_map_ since that is what we use to map from
+  // currently-running code back to the method it came from. Getting rid of it would break the
+  // jit-gc, stack-walking and signal handling. Since we never use it to go the other way (from
+  // method -> code) everything is fine.
+
+  for (ProfilingInfo* p : profiling_infos_) {
+    p->SetSavedEntryPoint(nullptr);
+  }
+}
+
 const void* JitCodeCache::FindCompiledCodeForInstrumentation(ArtMethod* method) {
   if (LIKELY(!GetGarbageCollectCode())) {
     return nullptr;
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index f7829de..19de978 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -215,6 +215,8 @@
       REQUIRES(!lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  void ClearAllCompiledDexCode() REQUIRES(!lock_, Locks::mutator_lock_);
+
   void CopyInlineCacheInto(const InlineCache& ic, Handle<mirror::ObjectArray<mirror::Class>> array)
       REQUIRES(!lock_)
       REQUIRES_SHARED(Locks::mutator_lock_);