Remove race on cached thread-name deletion

Manage the thread name using an RCU-like scheme, in which the name
is an atomic pointer to an immutable string, and readers explicitly
maintain a count of active readers. Writers wait until there are
no readers before deallocating a name that was just replaced.

Currently this potentially blocks writers, but readers should be very
fast, and relatively infrequent, so I can't imagine this is a real
issue.

This also replaces JVM_SetNativeThreadName with a stub. This part of
the CL came from Nicolas' aosp/1706702, which this is otherwise
intended to replace.

Test: Build and boot AOSP.
Bug: 37970289
Change-Id: I7e1d13d24adc10b4009d60ee68bc9e837ce78b34
diff --git a/compiler/utils/assembler_thumb_test_expected.cc.inc b/compiler/utils/assembler_thumb_test_expected.cc.inc
index 2d1de97..b6c6025 100644
--- a/compiler/utils/assembler_thumb_test_expected.cc.inc
+++ b/compiler/utils/assembler_thumb_test_expected.cc.inc
@@ -76,7 +76,7 @@
   "      f2: bb f1 00 0f   cmp.w r11, #0\n"
   "      f6: 18 bf         it ne\n"
   "      f8: e3 46         movne r11, r12\n"
-  "      fa: d9 f8 8c c0   ldr.w r12, [r9, #140]\n"
+  "      fa: d9 f8 94 c0   ldr.w r12, [r9, #148]\n"
   "      fe: bc f1 00 0f   cmp.w r12, #0\n"
   "     102: 71 d1         bne 0x1e8     @ imm = #226\n"
   "     104: cd f8 ff c7   str.w r12, [sp, #2047]\n"
@@ -152,8 +152,8 @@
   "     218: bd e8 e0 4d   pop.w {r5, r6, r7, r8, r10, r11, lr}\n"
   "     21c: d9 f8 24 80   ldr.w r8, [r9, #36]\n"
   "     220: 70 47         bx lr\n"
-  "     222: d9 f8 8c 00   ldr.w r0, [r9, #140]\n"
-  "     226: d9 f8 c0 e2   ldr.w lr, [r9, #704]\n"
+  "     222: d9 f8 94 00   ldr.w r0, [r9, #148]\n"
+  "     226: d9 f8 c8 e2   ldr.w lr, [r9, #712]\n"
   "     22a: f0 47         blx lr\n"
 };
 
diff --git a/openjdkjvm/OpenjdkJvm.cc b/openjdkjvm/OpenjdkJvm.cc
index a9fa0fc6c..63503f4 100644
--- a/openjdkjvm/OpenjdkJvm.cc
+++ b/openjdkjvm/OpenjdkJvm.cc
@@ -406,38 +406,12 @@
   return soa.Self()->HoldsLock(object);
 }
 
-JNIEXPORT void JVM_SetNativeThreadName(JNIEnv* env, jobject jthread, jstring java_name) {
-  ScopedUtfChars name(env, java_name);
-  {
-    art::ScopedObjectAccess soa(env);
-    if (soa.Decode<art::mirror::Object>(jthread) == soa.Self()->GetPeer()) {
-      soa.Self()->SetThreadName(name.c_str());
-      return;
-    }
-  }
-  // Suspend thread to avoid it from killing itself while we set its name. We don't just hold the
-  // thread list lock to avoid this, as setting the thread name causes mutator to lock/unlock
-  // in the DDMS send code.
-  art::ThreadList* thread_list = art::Runtime::Current()->GetThreadList();
-  bool timed_out;
-  // Take suspend thread lock to avoid races with threads trying to suspend this one.
-  art::Thread* thread;
-  {
-    thread = thread_list->SuspendThreadByPeer(jthread,
-                                              art::SuspendReason::kInternal,
-                                              &timed_out);
-  }
-  if (thread != nullptr) {
-    {
-      art::ScopedObjectAccess soa(env);
-      thread->SetThreadName(name.c_str());
-    }
-    bool resumed = thread_list->Resume(thread, art::SuspendReason::kInternal);
-    DCHECK(resumed);
-  } else if (timed_out) {
-    LOG(ERROR) << "Trying to set thread name to '" << name.c_str() << "' failed as the thread "
-        "failed to suspend within a generous timeout.";
-  }
+JNIEXPORT __attribute__((noreturn)) void JVM_SetNativeThreadName(
+    JNIEnv* env ATTRIBUTE_UNUSED,
+    jobject jthread ATTRIBUTE_UNUSED,
+    jstring java_name ATTRIBUTE_UNUSED) {
+  UNIMPLEMENTED(FATAL) << "JVM_SetNativeThreadName is not implemented";
+  UNREACHABLE();
 }
 
 JNIEXPORT __attribute__((noreturn)) jint JVM_IHashCode(JNIEnv* env ATTRIBUTE_UNUSED,
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 8b2a411..0a3c09a 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -19,6 +19,7 @@
 #include <limits.h>  // for INT_MAX
 #include <pthread.h>
 #include <signal.h>
+#include <stdlib.h>
 #include <sys/resource.h>
 #include <sys/time.h>
 
@@ -29,6 +30,7 @@
 #endif
 
 #include <algorithm>
+#include <atomic>
 #include <bitset>
 #include <cerrno>
 #include <iostream>
@@ -1066,7 +1068,7 @@
     } else {
       // These aren't necessary, but they improve diagnostics for unit tests & command-line tools.
       if (thread_name != nullptr) {
-        self->tlsPtr_.name->assign(thread_name);
+        self->SetCachedThreadName(thread_name);
         ::art::SetThreadName(thread_name);
       } else if (self->GetJniEnv()->IsCheckJniEnabled()) {
         LOG(WARNING) << *Thread::Current() << " attached without supplying a name";
@@ -1232,8 +1234,27 @@
       SetInt<kTransactionActive>(peer, thread_priority);
 }
 
+void Thread::SetCachedThreadName(const char* name) {
+  DCHECK(name != kThreadNameDuringStartup);
+  const char* old_name = tlsPtr_.name.exchange(name == nullptr ? nullptr : strdup(name));
+  if (old_name != nullptr && old_name !=  kThreadNameDuringStartup) {
+    // Deallocate it, carefully. Note that the load has to be ordered wrt the store of the xchg.
+    for (uint32_t i = 0; UNLIKELY(tls32_.num_name_readers.load(std::memory_order_seq_cst) != 0);
+         ++i) {
+      static constexpr uint32_t kNumSpins = 1000;
+      // Ugly, but keeps us from having to do anything on the reader side.
+      if (i > kNumSpins) {
+        usleep(500);
+      }
+    }
+    // We saw the reader count drop to zero since we replaced the name; old one is now safe to
+    // deallocate.
+    free(const_cast<char *>(old_name));
+  }
+}
+
 void Thread::SetThreadName(const char* name) {
-  tlsPtr_.name->assign(name);
+  SetCachedThreadName(name);
   ::art::SetThreadName(name);
   Dbg::DdmSendThreadNotification(this, CHUNK_TYPE("THNM"));
 }
@@ -1359,11 +1380,14 @@
     os << GetThreadId()
        << ",tid=" << GetTid() << ',';
   }
+  tls32_.num_name_readers.fetch_add(1, std::memory_order_seq_cst);
+  const char* name = tlsPtr_.name.load();
   os << GetState()
      << ",Thread*=" << this
      << ",peer=" << tlsPtr_.opeer
-     << ",\"" << (tlsPtr_.name != nullptr ? *tlsPtr_.name : "null") << "\""
+     << ",\"" << (name == nullptr ? "null" : name) << "\""
      << "]";
+  tls32_.num_name_readers.fetch_sub(1 /* at least memory_order_release */);
 }
 
 void Thread::Dump(std::ostream& os, bool dump_native_stack, BacktraceMap* backtrace_map,
@@ -1382,7 +1406,10 @@
 }
 
 void Thread::GetThreadName(std::string& name) const {
-  name.assign(*tlsPtr_.name);
+  tls32_.num_name_readers.fetch_add(1, std::memory_order_seq_cst);
+  // The store part of the increment has to be ordered with respect to the following load.
+  name.assign(tlsPtr_.name.load(std::memory_order_seq_cst));
+  tls32_.num_name_readers.fetch_sub(1 /* at least memory_order_release */);
 }
 
 uint64_t Thread::GetCpuMicroTime() const {
@@ -1941,7 +1968,9 @@
   }
 
   if (thread != nullptr) {
-    os << '"' << *thread->tlsPtr_.name << '"';
+    thread->tls32_.num_name_readers.fetch_add(1, std::memory_order_seq_cst);
+    os << '"' << thread->tlsPtr_.name.load() << '"';
+    thread->tls32_.num_name_readers.fetch_sub(1 /* at least memory_order_release */);
     if (is_daemon) {
       os << " daemon";
     }
@@ -2382,7 +2411,7 @@
   DCHECK(tlsPtr_.mutator_lock != nullptr);
   tlsPtr_.instrumentation_stack =
       new std::map<uintptr_t, instrumentation::InstrumentationStackFrame>;
-  tlsPtr_.name = new std::string(kThreadNameDuringStartup);
+  tlsPtr_.name.store(kThreadNameDuringStartup, std::memory_order_relaxed);
 
   static_assert((sizeof(Thread) % 4) == 0U,
                 "art::Thread has a size which is not a multiple of 4.");
@@ -2419,7 +2448,7 @@
   // It turns out that the last thing to change is the thread name; that's a good proxy for "has
   // this thread _ever_ entered kRunnable".
   return (tlsPtr_.jpeer == nullptr && tlsPtr_.opeer == nullptr) ||
-      (*tlsPtr_.name == kThreadNameDuringStartup);
+      (tlsPtr_.name.load() == kThreadNameDuringStartup);
 }
 
 void Thread::AssertPendingException() const {
@@ -2572,7 +2601,7 @@
   }
 
   delete tlsPtr_.instrumentation_stack;
-  delete tlsPtr_.name;
+  SetCachedThreadName(nullptr);  // Deallocate name.
   delete tlsPtr_.deps_or_stack_trace_sample.stack_trace_sample;
 
   Runtime::Current()->GetHeap()->AssertThreadLocalBuffersAreRevoked(this);
diff --git a/runtime/thread.h b/runtime/thread.h
index 7eb57e9..b64de25 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1545,6 +1545,8 @@
 
   void ReleaseLongJumpContextInternal();
 
+  void SetCachedThreadName(const char* name);
+
   // Helper class for manipulating the 32 bits of atomically changed state and flags.
   class StateAndFlags {
    public:
@@ -1690,7 +1692,8 @@
           user_code_suspend_count(0),
           force_interpreter_count(0),
           make_visibly_initialized_counter(0),
-          define_class_counter(0) {}
+          define_class_counter(0),
+          num_name_readers(0) {}
 
     // The state and flags field must be changed atomically so that flag values aren't lost.
     // See `StateAndFlags` for bit assignments of `ThreadFlag` and `ThreadState` values.
@@ -1779,6 +1782,11 @@
     // Counter for how many nested define-classes are ongoing in this thread. Used to allow waiting
     // for threads to be done with class-definition work.
     uint32_t define_class_counter;
+
+    // A count of the number of readers of tlsPtr_.name that may still be looking at a string they
+    // retrieved.
+    mutable std::atomic<uint32_t> num_name_readers;
+    static_assert(std::atomic<uint32_t>::is_always_lock_free);
   } tls32_;
 
   struct PACKED(8) tls_64bit_sized_values {
@@ -1924,8 +1932,11 @@
     // set local values there first.
     FrameIdToShadowFrame* frame_id_to_shadow_frame;
 
-    // A cached copy of the java.lang.Thread's name.
-    std::string* name;
+    // A cached copy of the java.lang.Thread's (modified UTF-8) name.
+    // If this is not null or kThreadNameDuringStartup, then it owns the malloc memory holding
+    // the string. Updated in an RCU-like manner.
+    std::atomic<const char*> name;
+    static_assert(std::atomic<const char*>::is_always_lock_free);
 
     // A cached pthread_t for the pthread underlying this Thread*.
     pthread_t pthread_self;