author David Srbecky <dsrbecky@google.com> 2021-12-01 17:09:08 +0000
committer David Srbecky <dsrbecky@google.com> 2022-01-15 22:04:52 +0000
commit fa40e6e318b21d4a1885a6ffea6efc3c0b5cc1cd (patch)
tree ea74f6947c5c6caaaf7a6fb3ece3369aec3a62c3
parent c5f41cd1a39472dbe2480c09a1ce20aeced2a9f6 (diff)
Add thread-shared interpreter cache
The thread-local interpreter cache handles around 75% of method/field
lookups from the interpreter.

Add a thread-shared interpreter cache which can be much bigger (since we
pay the memory cost just once rather than per thread). This increases the
cache hit rate to 90%.

This effectively halves the amount of time we spend in 'NterpGetMethod'
(including DexCache lookups), which is the single most expensive method
during startup. Furthermore, it also reduces the time spent resolving
methods by around 25%, since DexCache entries get evicted less often.

The shared cache increases memory use by 256k per process, so also reduce
the fixed-size DexCache fields array, which balances the memory use back.

Test: test.py --host
Change-Id: I3cd369613d47de117ab69d5bee00d4cf89b87913
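For readers skimming the diff below, the scheme this commit describes boils down
to a two-level direct-mapped cache: try a small per-thread array first, fall back
to one large process-wide array of atomically accessed key/value pairs, and
promote shared hits into the local array. The following is a minimal standalone
sketch of that idea, not the ART code itself; the names (Entry, CacheGet,
CacheSet, tls_cache, shared_cache) and the main() driver are illustrative, and it
omits ART specifics such as the weak-ref-access check around GC, the dex-pc hash
shift, and the lock-free AtomicPair helpers added in runtime/base/atomic_pair.h.

// Standalone sketch of a two-level (thread-local + process-shared) cache.
// Build (roughly): g++ -std=c++17 cache_sketch.cc -latomic
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct Entry {
  std::uintptr_t key;    // e.g. the address of a dex instruction; 0 means "empty"
  std::uintptr_t value;  // interpretation depends on the opcode behind the key
};

constexpr std::size_t kLocalSize = 256;         // paid once per thread
constexpr std::size_t kSharedSize = 16 * 1024;  // paid once per process

// Level 1: owned by a single thread, so no atomics are needed.
thread_local std::array<Entry, kLocalSize> tls_cache{};

// Level 2: shared by all threads; each slot is a two-word atomic.
// (std::atomic<Entry> may fall back to a lock on some targets; ART adds a
// dedicated lock-free AtomicPair helper for exactly this reason.)
std::array<std::atomic<Entry>, kSharedSize> shared_cache{};

bool CacheGet(std::uintptr_t key, std::uintptr_t* value) {
  Entry& local = tls_cache[key % kLocalSize];
  if (local.key == key) {  // Fast path; ~75% of lookups hit here in ART.
    *value = local.value;
    return true;
  }
  Entry shared = shared_cache[key % kSharedSize].load(std::memory_order_acquire);
  if (shared.key == key) {  // Second level lifts the overall hit rate to ~90%.
    local = shared;         // Promote so future lookups stay thread-local.
    *value = shared.value;
    return true;
  }
  return false;  // Miss: the caller falls back to slow resolution.
}

void CacheSet(std::uintptr_t key, std::uintptr_t value) {
  tls_cache[key % kLocalSize] = {key, value};
  shared_cache[key % kSharedSize].store({key, value}, std::memory_order_release);
}

int main() {
  std::uintptr_t v = 0;
  CacheSet(0x1000, 42);
  std::printf("hit=%d value=%d\n", CacheGet(0x1000, &v), static_cast<int>(v));
  std::printf("hit=%d\n", CacheGet(0x2000, &v));
  return 0;
}

In the actual change below, the per-thread array lives directly inside the Thread
object so the assembly fast path can reach it at a fixed offset, while the shared
array is a static member of InterpreterCache that the GC sweeps via
ForEachSharedEntry.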
-rw-r--r--  openjdkjvmti/ti_redefine.cc                    3
-rw-r--r--  runtime/base/atomic_pair.h                     85
-rw-r--r--  runtime/interpreter/interpreter_cache-inl.h    67
-rw-r--r--  runtime/interpreter/interpreter_cache.cc       24
-rw-r--r--  runtime/interpreter/interpreter_cache.h        80
-rw-r--r--  runtime/interpreter/interpreter_common.h       5
-rw-r--r--  runtime/interpreter/mterp/nterp.cc             64
-rw-r--r--  runtime/mirror/dex_cache-inl.h                 26
-rw-r--r--  runtime/mirror/dex_cache.cc                    18
-rw-r--r--  runtime/mirror/dex_cache.h                     57
-rw-r--r--  runtime/runtime.cc                             2
-rw-r--r--  runtime/thread.cc                              27
-rw-r--r--  runtime/thread.h                               7
-rw-r--r--  runtime/thread_list.cc                         7
-rw-r--r--  runtime/thread_list.h                          4
-rw-r--r--  tools/cpp-define-generator/thread.def          7
16 files changed, 304 insertions, 179 deletions
diff --git a/openjdkjvmti/ti_redefine.cc b/openjdkjvmti/ti_redefine.cc
index 37a61d3c54..16baf935db 100644
--- a/openjdkjvmti/ti_redefine.cc
+++ b/openjdkjvmti/ti_redefine.cc
@@ -2929,8 +2929,9 @@ void Redefiner::ClassRedefinition::UpdateClassStructurally(const RedefinitionDat
// TODO We might be able to avoid doing this but given the rather unstructured nature of the
// interpreter cache it's probably not worth the effort.
art::MutexLock mu(driver_->self_, *art::Locks::thread_list_lock_);
+ art::InterpreterCache::ClearShared();
driver_->runtime_->GetThreadList()->ForEach(
- [](art::Thread* t) { t->GetInterpreterCache()->Clear(t); });
+ [](art::Thread* t) { t->GetInterpreterCache()->ClearThreadLocal(t); });
}
if (art::kIsDebugBuild) {
diff --git a/runtime/base/atomic_pair.h b/runtime/base/atomic_pair.h
new file mode 100644
index 0000000000..4e43d3062c
--- /dev/null
+++ b/runtime/base/atomic_pair.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_BASE_ATOMIC_PAIR_H_
+#define ART_RUNTIME_BASE_ATOMIC_PAIR_H_
+
+#include "base/macros.h"
+
+#include <type_traits>
+
+namespace art {
+
+// std::pair<> is not trivially copyable and as such it is unsuitable for atomic operations.
+template <typename IntType>
+struct PACKED(2 * sizeof(IntType)) AtomicPair {
+ static_assert(std::is_integral<IntType>::value);
+
+ constexpr AtomicPair() : first(0), second(0) { }
+ AtomicPair(IntType f, IntType s) : first(f), second(s) { }
+ AtomicPair(const AtomicPair&) = default;
+ AtomicPair& operator=(const AtomicPair&) = default;
+
+ IntType first;
+ IntType second;
+};
+
+template <typename IntType>
+ALWAYS_INLINE static inline AtomicPair<IntType> AtomicPairLoadAcquire(
+ std::atomic<AtomicPair<IntType>>* target) {
+ static_assert(std::atomic<AtomicPair<IntType>>::is_always_lock_free);
+ return target->load(std::memory_order_acquire);
+}
+
+template <typename IntType>
+ALWAYS_INLINE static inline void AtomicPairStoreRelease(
+ std::atomic<AtomicPair<IntType>>* target, AtomicPair<IntType> value) {
+ static_assert(std::atomic<AtomicPair<IntType>>::is_always_lock_free);
+ target->store(value, std::memory_order_release);
+}
+
+// llvm does not implement 16-byte atomic operations on x86-64.
+#if defined(__x86_64__)
+ALWAYS_INLINE static inline AtomicPair<uint64_t> AtomicPairLoadAcquire(
+ std::atomic<AtomicPair<uint64_t>>* target) {
+ uint64_t first, second;
+ __asm__ __volatile__(
+ "lock cmpxchg16b (%2)"
+ : "=&a"(first), "=&d"(second)
+ : "r"(target), "a"(0), "d"(0), "b"(0), "c"(0)
+ : "cc");
+ return {first, second};
+}
+
+ALWAYS_INLINE static inline void AtomicPairStoreRelease(
+ std::atomic<AtomicPair<uint64_t>>* target, AtomicPair<uint64_t> value) {
+ uint64_t first, second;
+ __asm__ __volatile__ (
+ "movq (%2), %%rax\n\t"
+ "movq 8(%2), %%rdx\n\t"
+ "1:\n\t"
+ "lock cmpxchg16b (%2)\n\t"
+ "jnz 1b"
+ : "=&a"(first), "=&d"(second)
+ : "r"(target), "b"(value.first), "c"(value.second)
+ : "cc");
+}
+#endif // defined(__x86_64__)
+
+} // namespace art
+
+#endif // ART_RUNTIME_BASE_ATOMIC_PAIR_H_
+
diff --git a/runtime/interpreter/interpreter_cache-inl.h b/runtime/interpreter/interpreter_cache-inl.h
new file mode 100644
index 0000000000..249df23b27
--- /dev/null
+++ b/runtime/interpreter/interpreter_cache-inl.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2022 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
+
+#include "interpreter_cache.h"
+#include "thread.h"
+
+namespace art {
+
+template<bool kSkipThreadLocal>
+inline bool InterpreterCache::Get(Thread* self, const void* dex_instr, /* out */ size_t* value) {
+ DCHECK(self->GetInterpreterCache() == this) << "Must be called from owning thread";
+ size_t key = reinterpret_cast<size_t>(dex_instr);
+ Entry& local_entry = thread_local_array_[IndexOf<kThreadLocalSize>(key)];
+ if (kSkipThreadLocal) {
+ DCHECK_NE(local_entry.first, key) << "Expected cache miss";
+ } else {
+ if (LIKELY(local_entry.first == key)) {
+ *value = local_entry.second;
+ return true;
+ }
+ }
+ Entry shared_entry = AtomicPairLoadAcquire(&shared_array_[IndexOf<kSharedSize>(key)]);
+ if (LIKELY(shared_entry.first == key)) {
+ // For simplicity, only update the cache if weak ref accesses are enabled. If
+ // they are disabled, this means the GC is processing the cache, and is
+ // reading it concurrently.
+ if (self->GetWeakRefAccessEnabled()) {
+ local_entry = shared_entry; // Copy to local array to make future lookup faster.
+ }
+ *value = shared_entry.second;
+ return true;
+ }
+ return false;
+}
+
+inline void InterpreterCache::Set(Thread* self, const void* dex_instr, size_t value) {
+ DCHECK(self->GetInterpreterCache() == this) << "Must be called from owning thread";
+
+ // For simplicity, only update the cache if weak ref accesses are enabled. If
+ // they are disabled, this means the GC is processing the cache, and is
+ // reading it concurrently.
+ if (self->GetWeakRefAccessEnabled()) {
+ size_t key = reinterpret_cast<size_t>(dex_instr);
+ thread_local_array_[IndexOf<kThreadLocalSize>(key)] = {key, value};
+ AtomicPairStoreRelease(&shared_array_[IndexOf<kSharedSize>(key)], {key, value});
+ }
+}
+
+} // namespace art
+
+#endif // ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
diff --git a/runtime/interpreter/interpreter_cache.cc b/runtime/interpreter/interpreter_cache.cc
index e43fe318cc..1a35c038e9 100644
--- a/runtime/interpreter/interpreter_cache.cc
+++ b/runtime/interpreter/interpreter_cache.cc
@@ -19,14 +19,30 @@
namespace art {
-void InterpreterCache::Clear(Thread* owning_thread) {
+std::array<std::atomic<InterpreterCache::Entry>,
+ InterpreterCache::kSharedSize> InterpreterCache::shared_array_;
+
+InterpreterCache::InterpreterCache() {
+ // We can not use the ClearThreadLocal method since the constructor will not
+ // be called from the owning thread.
+ thread_local_array_.fill(Entry{});
+}
+
+void InterpreterCache::ClearThreadLocal(Thread* owning_thread) {
+ // Must be called from the owning thread or when the owning thread is suspended.
DCHECK(owning_thread->GetInterpreterCache() == this);
DCHECK(owning_thread == Thread::Current() || owning_thread->IsSuspended());
- data_.fill(Entry{});
+
+ thread_local_array_.fill(Entry{});
}
-bool InterpreterCache::IsCalledFromOwningThread() {
- return Thread::Current()->GetInterpreterCache() == this;
+void InterpreterCache::ClearShared() {
+ // Can be called from any thread since the writes are atomic.
+ // The static shared cache isn't bound to a specific thread in the first place.
+
+ for (std::atomic<Entry>& entry : shared_array_) {
+ AtomicPairStoreRelease(&entry, Entry{});
+ }
}
} // namespace art
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h
index 0ada562438..af025cecbd 100644
--- a/runtime/interpreter/interpreter_cache.h
+++ b/runtime/interpreter/interpreter_cache.h
@@ -20,17 +20,22 @@
#include <array>
#include <atomic>
+#include "base/atomic_pair.h"
#include "base/bit_utils.h"
#include "base/macros.h"
namespace art {
+class Instruction;
class Thread;
// Small fast thread-local cache for the interpreter.
-// It can hold arbitrary pointer-sized key-value pair.
-// The interpretation of the value depends on the key.
+//
+// The key is an absolute pointer to a dex instruction.
+//
+// The value depends on the opcode of the dex instruction.
// Presence of entry might imply some pre-conditions.
+//
// All operations must be done from the owning thread,
// or at a point when the owning thread is suspended.
//
@@ -46,52 +51,61 @@ class Thread;
// from assembly (it ensures that the offset is valid immediate value).
class ALIGNED(16) InterpreterCache {
public:
- // Aligned since we load the whole entry in single assembly instruction.
- typedef std::pair<const void*, size_t> Entry ALIGNED(2 * sizeof(size_t));
+ using Entry = AtomicPair<size_t>;
- // 2x size increase/decrease corresponds to ~0.5% interpreter performance change.
- // Value of 256 has around 75% cache hit rate.
- static constexpr size_t kSize = 256;
+ static constexpr size_t kThreadLocalSize = 256; // Value of 256 has around 75% cache hit rate.
+ static constexpr size_t kSharedSize = 16 * 1024; // Value of 16k has around 90% cache hit rate.
+ static constexpr size_t kHashShift = 2; // Number of trailing dex pc bits to drop.
- InterpreterCache() {
- // We can not use the Clear() method since the constructor will not
- // be called from the owning thread.
- data_.fill(Entry{});
- }
+ InterpreterCache();
- // Clear the whole cache. It requires the owning thread for DCHECKs.
- void Clear(Thread* owning_thread);
+ void ClearThreadLocal(Thread* owning_thread);
- ALWAYS_INLINE bool Get(const void* key, /* out */ size_t* value) {
- DCHECK(IsCalledFromOwningThread());
- Entry& entry = data_[IndexOf(key)];
- if (LIKELY(entry.first == key)) {
- *value = entry.second;
- return true;
- }
- return false;
- }
+ static void ClearShared();
- ALWAYS_INLINE void Set(const void* key, size_t value) {
- DCHECK(IsCalledFromOwningThread());
- data_[IndexOf(key)] = Entry{key, value};
+ template<bool kSkipThreadLocal = false>
+ ALWAYS_INLINE bool Get(Thread* self, const void* dex_instruction, /* out */ size_t* value);
+
+ ALWAYS_INLINE void Set(Thread* self, const void* dex_instruction, size_t value);
+
+ template<typename Callback>
+ void ForEachTheadLocalEntry(Callback&& callback) {
+ for (Entry& entry : thread_local_array_) {
+ callback(reinterpret_cast<const Instruction*>(entry.first), entry.second);
+ }
}
- std::array<Entry, kSize>& GetArray() {
- return data_;
+ template<typename Callback>
+ static void ForEachSharedEntry(Callback&& callback) {
+ for (std::atomic<Entry>& atomic_entry : shared_array_) {
+ Entry old_entry = AtomicPairLoadAcquire(&atomic_entry);
+ Entry new_entry = old_entry;
+ callback(reinterpret_cast<const Instruction*>(new_entry.first), new_entry.second);
+ if (old_entry.second != new_entry.second) {
+ AtomicPairStoreRelease(&atomic_entry, new_entry);
+ }
+ }
}
private:
- bool IsCalledFromOwningThread();
-
- static ALWAYS_INLINE size_t IndexOf(const void* key) {
+ template<size_t kSize>
+ static ALWAYS_INLINE size_t IndexOf(size_t key) {
static_assert(IsPowerOfTwo(kSize), "Size must be power of two");
- size_t index = (reinterpret_cast<uintptr_t>(key) >> 2) & (kSize - 1);
+ size_t index = (key >> kHashShift) & (kSize - 1);
DCHECK_LT(index, kSize);
return index;
}
- std::array<Entry, kSize> data_;
+ // Small cache of fixed size which is always present for every thread.
+ // It is stored directly (without indirection) inside the Thread object.
+ // This makes it as fast as possible to access from assembly fast-path.
+ std::array<Entry, kThreadLocalSize> thread_local_array_;
+
+ // Larger cache which is shared by all threads.
+ // It is used as next cache level if lookup in the local array fails.
+ // It needs to be accessed using atomic operations, and is contended,
+ // but the sharing allows it to be larger than the per-thread cache.
+ static std::array<std::atomic<Entry>, kSharedSize> shared_array_;
};
} // namespace art
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 1809227f2c..4cbe81d0c2 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -46,6 +46,7 @@
#include "dex/dex_instruction-inl.h"
#include "entrypoints/entrypoint_utils-inl.h"
#include "handle_scope-inl.h"
+#include "interpreter_cache-inl.h"
#include "interpreter_switch_impl.h"
#include "jit/jit-inl.h"
#include "mirror/call_site.h"
@@ -238,7 +239,7 @@ static ALWAYS_INLINE bool DoInvoke(Thread* self,
InterpreterCache* tls_cache = self->GetInterpreterCache();
size_t tls_value;
ArtMethod* resolved_method;
- if (!IsNterpSupported() && LIKELY(tls_cache->Get(inst, &tls_value))) {
+ if (!IsNterpSupported() && LIKELY(tls_cache->Get(self, inst, &tls_value))) {
resolved_method = reinterpret_cast<ArtMethod*>(tls_value);
} else {
ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
@@ -252,7 +253,7 @@ static ALWAYS_INLINE bool DoInvoke(Thread* self,
return false;
}
if (!IsNterpSupported()) {
- tls_cache->Set(inst, reinterpret_cast<size_t>(resolved_method));
+ tls_cache->Set(self, inst, reinterpret_cast<size_t>(resolved_method));
}
}
diff --git a/runtime/interpreter/mterp/nterp.cc b/runtime/interpreter/mterp/nterp.cc
index 670ae1b0c2..d49260e56a 100644
--- a/runtime/interpreter/mterp/nterp.cc
+++ b/runtime/interpreter/mterp/nterp.cc
@@ -24,6 +24,7 @@
#include "dex/dex_instruction_utils.h"
#include "debugger.h"
#include "entrypoints/entrypoint_utils-inl.h"
+#include "interpreter/interpreter_cache-inl.h"
#include "interpreter/interpreter_common.h"
#include "interpreter/interpreter_intrinsics.h"
#include "interpreter/shadow_frame-inl.h"
@@ -93,12 +94,7 @@ inline void UpdateHotness(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock
template<typename T>
inline void UpdateCache(Thread* self, uint16_t* dex_pc_ptr, T value) {
DCHECK(kUseReadBarrier) << "Nterp only works with read barriers";
- // For simplicity, only update the cache if weak ref accesses are enabled. If
- // they are disabled, this means the GC is processing the cache, and is
- // reading it concurrently.
- if (self->GetWeakRefAccessEnabled()) {
- self->GetInterpreterCache()->Set(dex_pc_ptr, value);
- }
+ self->GetInterpreterCache()->Set(self, dex_pc_ptr, value);
}
template<typename T>
@@ -252,7 +248,7 @@ extern "C" const char* NterpGetShortyFromInvokeCustom(ArtMethod* caller, uint16_
}
FLATTEN
-extern "C" size_t NterpGetMethod(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
+extern "C" size_t NterpGetMethodSlowPath(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
REQUIRES_SHARED(Locks::mutator_lock_) {
UpdateHotness(caller);
const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -410,6 +406,16 @@ extern "C" size_t NterpGetMethod(Thread* self, ArtMethod* caller, uint16_t* dex_
}
}
+extern "C" size_t NterpGetMethod(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
+ REQUIRES_SHARED(Locks::mutator_lock_) {
+ InterpreterCache* cache = self->GetInterpreterCache();
+ size_t cached_value;
+ if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
+ return cached_value;
+ }
+ return NterpGetMethodSlowPath(self, caller, dex_pc_ptr); // Tail call.
+}
+
FLATTEN
static ArtField* ResolveFieldWithAccessChecks(Thread* self,
ClassLinker* class_linker,
@@ -459,10 +465,10 @@ static ArtField* ResolveFieldWithAccessChecks(Thread* self,
return resolved_field;
}
-extern "C" size_t NterpGetStaticField(Thread* self,
- ArtMethod* caller,
- uint16_t* dex_pc_ptr,
- size_t resolve_field_type) // Resolve if not zero
+extern "C" size_t NterpGetStaticFieldSlowPath(Thread* self,
+ ArtMethod* caller,
+ uint16_t* dex_pc_ptr,
+ size_t resolve_field_type) // Resolve if not zero
REQUIRES_SHARED(Locks::mutator_lock_) {
UpdateHotness(caller);
const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -502,10 +508,25 @@ extern "C" size_t NterpGetStaticField(Thread* self,
}
}
-extern "C" uint32_t NterpGetInstanceFieldOffset(Thread* self,
- ArtMethod* caller,
- uint16_t* dex_pc_ptr,
- size_t resolve_field_type) // Resolve if not zero
+extern "C" size_t NterpGetStaticField(Thread* self,
+ ArtMethod* caller,
+ uint16_t* dex_pc_ptr,
+ size_t resolve_field_type) // Resolve if not zero
+ REQUIRES_SHARED(Locks::mutator_lock_) {
+ InterpreterCache* cache = self->GetInterpreterCache();
+ size_t cached_value;
+ if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
+ return cached_value;
+ }
+ return NterpGetStaticFieldSlowPath(self, caller, dex_pc_ptr, resolve_field_type);
+}
+
+
+extern "C" uint32_t NterpGetInstanceFieldOffsetSlowPath(
+ Thread* self,
+ ArtMethod* caller,
+ uint16_t* dex_pc_ptr,
+ size_t resolve_field_type) // Resolve if not zero
REQUIRES_SHARED(Locks::mutator_lock_) {
UpdateHotness(caller);
const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -532,6 +553,19 @@ extern "C" uint32_t NterpGetInstanceFieldOffset(Thread* self,
return resolved_field->GetOffset().Uint32Value();
}
+extern "C" uint32_t NterpGetInstanceFieldOffset(Thread* self,
+ ArtMethod* caller,
+ uint16_t* dex_pc_ptr,
+ size_t resolve_field_type) // Resolve if not zero
+ REQUIRES_SHARED(Locks::mutator_lock_) {
+ InterpreterCache* cache = self->GetInterpreterCache();
+ size_t cached_value;
+ if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
+ return cached_value;
+ }
+ return NterpGetInstanceFieldOffsetSlowPath(self, caller, dex_pc_ptr, resolve_field_type);
+}
+
extern "C" mirror::Object* NterpGetClassOrAllocateObject(Thread* self,
ArtMethod* caller,
uint16_t* dex_pc_ptr)
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 31f2bd2d7b..8a1ed71197 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -357,32 +357,18 @@ inline void DexCache::SetResolvedMethod(uint32_t method_idx, ArtMethod* method)
template <typename T>
NativeDexCachePair<T> DexCache::GetNativePair(std::atomic<NativeDexCachePair<T>>* pair_array,
size_t idx) {
- if (kRuntimePointerSize == PointerSize::k64) {
- auto* array = reinterpret_cast<std::atomic<ConversionPair64>*>(pair_array);
- ConversionPair64 value = AtomicLoadRelaxed16B(&array[idx]);
- return NativeDexCachePair<T>(reinterpret_cast64<T*>(value.first),
- dchecked_integral_cast<size_t>(value.second));
- } else {
- auto* array = reinterpret_cast<std::atomic<ConversionPair32>*>(pair_array);
- ConversionPair32 value = array[idx].load(std::memory_order_relaxed);
- return NativeDexCachePair<T>(reinterpret_cast32<T*>(value.first), value.second);
- }
+ auto* array = reinterpret_cast<std::atomic<AtomicPair<size_t>>*>(pair_array);
+ AtomicPair<size_t> value = AtomicPairLoadAcquire(&array[idx]);
+ return NativeDexCachePair<T>(reinterpret_cast<T*>(value.first), value.second);
}
template <typename T>
void DexCache::SetNativePair(std::atomic<NativeDexCachePair<T>>* pair_array,
size_t idx,
NativeDexCachePair<T> pair) {
- if (kRuntimePointerSize == PointerSize::k64) {
- auto* array = reinterpret_cast<std::atomic<ConversionPair64>*>(pair_array);
- ConversionPair64 v(reinterpret_cast64<uint64_t>(pair.object), pair.index);
- AtomicStoreRelease16B(&array[idx], v);
- } else {
- auto* array = reinterpret_cast<std::atomic<ConversionPair32>*>(pair_array);
- ConversionPair32 v(reinterpret_cast32<uint32_t>(pair.object),
- dchecked_integral_cast<uint32_t>(pair.index));
- array[idx].store(v, std::memory_order_release);
- }
+ auto* array = reinterpret_cast<std::atomic<AtomicPair<size_t>>*>(pair_array);
+ AtomicPair<size_t> v(reinterpret_cast<size_t>(pair.object), pair.index);
+ AtomicPairStoreRelease(&array[idx], v);
}
template <typename T,
diff --git a/runtime/mirror/dex_cache.cc b/runtime/mirror/dex_cache.cc
index c80f9dfe2f..b7f8ee7a07 100644
--- a/runtime/mirror/dex_cache.cc
+++ b/runtime/mirror/dex_cache.cc
@@ -126,23 +126,5 @@ ObjPtr<ClassLoader> DexCache::GetClassLoader() {
return GetFieldObject<ClassLoader>(OFFSET_OF_OBJECT_MEMBER(DexCache, class_loader_));
}
-#if !defined(__aarch64__) && !defined(__x86_64__)
-static pthread_mutex_t dex_cache_slow_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-DexCache::ConversionPair64 DexCache::AtomicLoadRelaxed16B(std::atomic<ConversionPair64>* target) {
- pthread_mutex_lock(&dex_cache_slow_atomic_mutex);
- DexCache::ConversionPair64 value = *reinterpret_cast<ConversionPair64*>(target);
- pthread_mutex_unlock(&dex_cache_slow_atomic_mutex);
- return value;
-}
-
-void DexCache::AtomicStoreRelease16B(std::atomic<ConversionPair64>* target,
- ConversionPair64 value) {
- pthread_mutex_lock(&dex_cache_slow_atomic_mutex);
- *reinterpret_cast<ConversionPair64*>(target) = value;
- pthread_mutex_unlock(&dex_cache_slow_atomic_mutex);
-}
-#endif
-
} // namespace mirror
} // namespace art
diff --git a/runtime/mirror/dex_cache.h b/runtime/mirror/dex_cache.h
index 26fc520cd7..19197095aa 100644
--- a/runtime/mirror/dex_cache.h
+++ b/runtime/mirror/dex_cache.h
@@ -149,7 +149,7 @@ class MANAGED DexCache final : public Object {
"String dex cache size is not a power of 2.");
// Size of field dex cache. Needs to be a power of 2 for entrypoint assumptions to hold.
- static constexpr size_t kDexCacheFieldCacheSize = 1024;
+ static constexpr size_t kDexCacheFieldCacheSize = 512;
static_assert(IsPowerOfTwo(kDexCacheFieldCacheSize),
"Field dex cache size is not a power of 2.");
@@ -448,19 +448,6 @@ class MANAGED DexCache final : public Object {
T* AllocArray(MemberOffset obj_offset, MemberOffset num_offset, size_t num)
REQUIRES_SHARED(Locks::mutator_lock_);
- // std::pair<> is not trivially copyable and as such it is unsuitable for atomic operations,
- // so we use a custom pair class for loading and storing the NativeDexCachePair<>.
- template <typename IntType>
- struct PACKED(2 * sizeof(IntType)) ConversionPair {
- ConversionPair(IntType f, IntType s) : first(f), second(s) { }
- ConversionPair(const ConversionPair&) = default;
- ConversionPair& operator=(const ConversionPair&) = default;
- IntType first;
- IntType second;
- };
- using ConversionPair32 = ConversionPair<uint32_t>;
- using ConversionPair64 = ConversionPair<uint64_t>;
-
// Visit instance fields of the dex cache as well as its associated arrays.
template <bool kVisitNativeRoots,
VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
@@ -469,48 +456,6 @@ class MANAGED DexCache final : public Object {
void VisitReferences(ObjPtr<Class> klass, const Visitor& visitor)
REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
- // Due to lack of 16-byte atomics support, we use hand-crafted routines.
-#if defined(__aarch64__)
- // 16-byte atomics are supported on aarch64.
- ALWAYS_INLINE static ConversionPair64 AtomicLoadRelaxed16B(
- std::atomic<ConversionPair64>* target) {
- return target->load(std::memory_order_relaxed);
- }
-
- ALWAYS_INLINE static void AtomicStoreRelease16B(
- std::atomic<ConversionPair64>* target, ConversionPair64 value) {
- target->store(value, std::memory_order_release);
- }
-#elif defined(__x86_64__)
- ALWAYS_INLINE static ConversionPair64 AtomicLoadRelaxed16B(
- std::atomic<ConversionPair64>* target) {
- uint64_t first, second;
- __asm__ __volatile__(
- "lock cmpxchg16b (%2)"
- : "=&a"(first), "=&d"(second)
- : "r"(target), "a"(0), "d"(0), "b"(0), "c"(0)
- : "cc");
- return ConversionPair64(first, second);
- }
-
- ALWAYS_INLINE static void AtomicStoreRelease16B(
- std::atomic<ConversionPair64>* target, ConversionPair64 value) {
- uint64_t first, second;
- __asm__ __volatile__ (
- "movq (%2), %%rax\n\t"
- "movq 8(%2), %%rdx\n\t"
- "1:\n\t"
- "lock cmpxchg16b (%2)\n\t"
- "jnz 1b"
- : "=&a"(first), "=&d"(second)
- : "r"(target), "b"(value.first), "c"(value.second)
- : "cc");
- }
-#else
- static ConversionPair64 AtomicLoadRelaxed16B(std::atomic<ConversionPair64>* target);
- static void AtomicStoreRelease16B(std::atomic<ConversionPair64>* target, ConversionPair64 value);
-#endif
-
HeapReference<ClassLoader> class_loader_;
HeapReference<String> location_;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 54e9d38b3c..2a3afa8c99 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -739,7 +739,7 @@ void Runtime::SweepSystemWeaks(IsMarkedVisitor* visitor) {
// from mutators. See b/32167580.
GetJit()->GetCodeCache()->SweepRootTables(visitor);
}
- thread_list_->SweepInterpreterCaches(visitor);
+ Thread::SweepInterpreterCaches(visitor);
// All other generic system-weak holders.
for (gc::AbstractSystemWeakHolder* holder : system_weak_holders_) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 25d493f0a5..2b3f47298e 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -4240,27 +4240,28 @@ void Thread::VisitRoots(RootVisitor* visitor) {
}
#pragma GCC diagnostic pop
-void Thread::SweepInterpreterCache(IsMarkedVisitor* visitor) {
- for (InterpreterCache::Entry& entry : GetInterpreterCache()->GetArray()) {
- const Instruction* inst = reinterpret_cast<const Instruction*>(entry.first);
+void Thread::SweepInterpreterCaches(IsMarkedVisitor* visitor) {
+ Thread* self = Thread::Current();
+ auto visit = [visitor, self](const Instruction* inst, size_t& value) {
+ Locks::mutator_lock_->AssertSharedHeld(self);
if (inst != nullptr) {
if (inst->Opcode() == Instruction::NEW_INSTANCE ||
inst->Opcode() == Instruction::CHECK_CAST ||
inst->Opcode() == Instruction::INSTANCE_OF ||
inst->Opcode() == Instruction::NEW_ARRAY ||
inst->Opcode() == Instruction::CONST_CLASS) {
- mirror::Class* cls = reinterpret_cast<mirror::Class*>(entry.second);
+ mirror::Class* cls = reinterpret_cast<mirror::Class*>(value);
if (cls == nullptr || cls == Runtime::GetWeakClassSentinel()) {
// Entry got deleted in a previous sweep.
- continue;
+ return;
}
Runtime::ProcessWeakClass(
- reinterpret_cast<GcRoot<mirror::Class>*>(&entry.second),
+ reinterpret_cast<GcRoot<mirror::Class>*>(&value),
visitor,
Runtime::GetWeakClassSentinel());
} else if (inst->Opcode() == Instruction::CONST_STRING ||
inst->Opcode() == Instruction::CONST_STRING_JUMBO) {
- mirror::Object* object = reinterpret_cast<mirror::Object*>(entry.second);
+ mirror::Object* object = reinterpret_cast<mirror::Object*>(value);
mirror::Object* new_object = visitor->IsMarked(object);
// We know the string is marked because it's a strongly-interned string that
// is always alive (see b/117621117 for trying to make those strings weak).
@@ -4268,11 +4269,16 @@ void Thread::SweepInterpreterCache(IsMarkedVisitor* visitor) {
// null for newly allocated objects, but we know those haven't moved. Therefore,
// only update the entry if we get a different non-null string.
if (new_object != nullptr && new_object != object) {
- entry.second = reinterpret_cast<size_t>(new_object);
+ value = reinterpret_cast<size_t>(new_object);
}
}
}
- }
+ };
+ InterpreterCache::ForEachSharedEntry(visit);
+ MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+ Runtime::Current()->GetThreadList()->ForEach([&visit](Thread* thread) {
+ thread->GetInterpreterCache()->ForEachTheadLocalEntry(visit);
+ });
}
// FIXME: clang-r433403 reports the below function exceeds frame size limit.
@@ -4487,9 +4493,10 @@ void Thread::SetReadBarrierEntrypoints() {
void Thread::ClearAllInterpreterCaches() {
static struct ClearInterpreterCacheClosure : Closure {
void Run(Thread* thread) override {
- thread->GetInterpreterCache()->Clear(thread);
+ thread->GetInterpreterCache()->ClearThreadLocal(thread);
}
} closure;
+ InterpreterCache::ClearShared();
Runtime::Current()->GetThreadList()->RunCheckpoint(&closure);
}
diff --git a/runtime/thread.h b/runtime/thread.h
index 1085a563c4..ec971a564f 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1348,10 +1348,6 @@ class Thread {
return ThreadOffset<pointer_size>(OFFSETOF_MEMBER(Thread, interpreter_cache_));
}
- static constexpr int InterpreterCacheSizeLog2() {
- return WhichPowerOf2(InterpreterCache::kSize);
- }
-
static constexpr uint32_t AllThreadFlags() {
return enum_cast<uint32_t>(ThreadFlag::kLastFlag) |
(enum_cast<uint32_t>(ThreadFlag::kLastFlag) - 1u);
@@ -1526,7 +1522,8 @@ class Thread {
template <bool kPrecise>
void VisitRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
- void SweepInterpreterCache(IsMarkedVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
+ static void SweepInterpreterCaches(IsMarkedVisitor* visitor)
+ REQUIRES_SHARED(Locks::mutator_lock_);
static bool IsAotCompiler();
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 4e3b40ba7a..6482e72417 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -1407,13 +1407,6 @@ void ThreadList::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) const {
}
}
-void ThreadList::SweepInterpreterCaches(IsMarkedVisitor* visitor) const {
- MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
- for (const auto& thread : list_) {
- thread->SweepInterpreterCache(visitor);
- }
-}
-
void ThreadList::VisitReflectiveTargets(ReflectiveValueVisitor *visitor) const {
MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
for (const auto& thread : list_) {
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index f5b58a0c54..29b0c52186 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -179,10 +179,6 @@ class ThreadList {
return empty_checkpoint_barrier_.get();
}
- void SweepInterpreterCaches(IsMarkedVisitor* visitor) const
- REQUIRES(!Locks::thread_list_lock_)
- REQUIRES_SHARED(Locks::mutator_lock_);
-
void WaitForOtherNonDaemonThreadsToExit(bool check_no_birth = true)
REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_,
!Locks::mutator_lock_);
diff --git a/tools/cpp-define-generator/thread.def b/tools/cpp-define-generator/thread.def
index ec8e28b977..07ca8841f8 100644
--- a/tools/cpp-define-generator/thread.def
+++ b/tools/cpp-define-generator/thread.def
@@ -30,11 +30,12 @@ ASM_DEFINE(THREAD_ID_OFFSET,
ASM_DEFINE(THREAD_INTERPRETER_CACHE_OFFSET,
art::Thread::InterpreterCacheOffset<art::kRuntimePointerSize>().Int32Value())
ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_LOG2,
- art::Thread::InterpreterCacheSizeLog2())
+ art::WhichPowerOf2(art::InterpreterCache::kThreadLocalSize))
ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_MASK,
- (sizeof(art::InterpreterCache::Entry) * (art::InterpreterCache::kSize - 1)))
+ (sizeof(art::InterpreterCache::Entry) * (art::InterpreterCache::kThreadLocalSize - 1)))
ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_SHIFT,
- 2)
+ (art::WhichPowerOf2(sizeof(art::InterpreterCache::Entry)) -
+ art::InterpreterCache::kHashShift))
ASM_DEFINE(THREAD_IS_GC_MARKING_OFFSET,
art::Thread::IsGcMarkingOffset<art::kRuntimePointerSize>().Int32Value())
ASM_DEFINE(THREAD_LOCAL_ALLOC_STACK_END_OFFSET,