author     2022-01-17 01:32:55 +0000
committer  2022-01-17 01:32:55 +0000
commit     3d2f148fe040b60452d5d9be7d08dec693132078 (patch)
tree       6a8a0aa66c68e8a4c49833b2a93b263985259aed
parent     fa40e6e318b21d4a1885a6ffea6efc3c0b5cc1cd (diff)
Revert "Add thread-shared interpreter cache"
This reverts commit fa40e6e318b21d4a1885a6ffea6efc3c0b5cc1cd.
Reason for revert: We are seeing several different failures that appear to be related, both test failures and b/214850618. It also appears that a potentially significant unresolved review comment was overlooked.
Change-Id: I2b5260ac7f2168831f0d1b0d7c76b70ecc1fb77d
-rw-r--r-- | openjdkjvmti/ti_redefine.cc                  |   3
-rw-r--r-- | runtime/base/atomic_pair.h                   |  85
-rw-r--r-- | runtime/interpreter/interpreter_cache-inl.h  |  67
-rw-r--r-- | runtime/interpreter/interpreter_cache.cc     |  24
-rw-r--r-- | runtime/interpreter/interpreter_cache.h      |  80
-rw-r--r-- | runtime/interpreter/interpreter_common.h     |   5
-rw-r--r-- | runtime/interpreter/mterp/nterp.cc           |  64
-rw-r--r-- | runtime/mirror/dex_cache-inl.h               |  26
-rw-r--r-- | runtime/mirror/dex_cache.cc                  |  18
-rw-r--r-- | runtime/mirror/dex_cache.h                   |  57
-rw-r--r-- | runtime/runtime.cc                           |   2
-rw-r--r-- | runtime/thread.cc                            |  27
-rw-r--r-- | runtime/thread.h                             |   7
-rw-r--r-- | runtime/thread_list.cc                       |   7
-rw-r--r-- | runtime/thread_list.h                        |   4
-rw-r--r-- | tools/cpp-define-generator/thread.def        |   7
16 files changed, 179 insertions, 304 deletions
diff --git a/openjdkjvmti/ti_redefine.cc b/openjdkjvmti/ti_redefine.cc
index 16baf935db..37a61d3c54 100644
--- a/openjdkjvmti/ti_redefine.cc
+++ b/openjdkjvmti/ti_redefine.cc
@@ -2929,9 +2929,8 @@ void Redefiner::ClassRedefinition::UpdateClassStructurally(const RedefinitionDat
     // TODO We might be able to avoid doing this but given the rather unstructured nature of the
     // interpreter cache it's probably not worth the effort.
     art::MutexLock mu(driver_->self_, *art::Locks::thread_list_lock_);
-    art::InterpreterCache::ClearShared();
     driver_->runtime_->GetThreadList()->ForEach(
-        [](art::Thread* t) { t->GetInterpreterCache()->ClearThreadLocal(t); });
+        [](art::Thread* t) { t->GetInterpreterCache()->Clear(t); });
   }
   if (art::kIsDebugBuild) {
diff --git a/runtime/base/atomic_pair.h b/runtime/base/atomic_pair.h
deleted file mode 100644
index 4e43d3062c..0000000000
--- a/runtime/base/atomic_pair.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_RUNTIME_BASE_ATOMIC_PAIR_H_
-#define ART_RUNTIME_BASE_ATOMIC_PAIR_H_
-
-#include "base/macros.h"
-
-#include <type_traits>
-
-namespace art {
-
-// std::pair<> is not trivially copyable and as such it is unsuitable for atomic operations.
-template <typename IntType>
-struct PACKED(2 * sizeof(IntType)) AtomicPair {
-  static_assert(std::is_integral<IntType>::value);
-
-  constexpr AtomicPair() : first(0), second(0) { }
-  AtomicPair(IntType f, IntType s) : first(f), second(s) { }
-  AtomicPair(const AtomicPair&) = default;
-  AtomicPair& operator=(const AtomicPair&) = default;
-
-  IntType first;
-  IntType second;
-};
-
-template <typename IntType>
-ALWAYS_INLINE static inline AtomicPair<IntType> AtomicPairLoadAcquire(
-    std::atomic<AtomicPair<IntType>>* target) {
-  static_assert(std::atomic<AtomicPair<IntType>>::is_always_lock_free);
-  return target->load(std::memory_order_acquire);
-}
-
-template <typename IntType>
-ALWAYS_INLINE static inline void AtomicPairStoreRelease(
-    std::atomic<AtomicPair<IntType>>* target, AtomicPair<IntType> value) {
-  static_assert(std::atomic<AtomicPair<IntType>>::is_always_lock_free);
-  target->store(value, std::memory_order_release);
-}
-
-// llvm does not implement 16-byte atomic operations on x86-64.
-#if defined(__x86_64__)
-ALWAYS_INLINE static inline AtomicPair<uint64_t> AtomicPairLoadAcquire(
-    std::atomic<AtomicPair<uint64_t>>* target) {
-  uint64_t first, second;
-  __asm__ __volatile__(
-      "lock cmpxchg16b (%2)"
-      : "=&a"(first), "=&d"(second)
-      : "r"(target), "a"(0), "d"(0), "b"(0), "c"(0)
-      : "cc");
-  return {first, second};
-}
-
-ALWAYS_INLINE static inline void AtomicPairStoreRelease(
-    std::atomic<AtomicPair<uint64_t>>* target, AtomicPair<uint64_t> value) {
-  uint64_t first, second;
-  __asm__ __volatile__ (
-      "movq (%2), %%rax\n\t"
-      "movq 8(%2), %%rdx\n\t"
-      "1:\n\t"
-      "lock cmpxchg16b (%2)\n\t"
-      "jnz 1b"
-      : "=&a"(first), "=&d"(second)
-      : "r"(target), "b"(value.first), "c"(value.second)
-      : "cc");
-}
-#endif  // defined(__x86_64__)
-
-}  // namespace art
-
-#endif  // ART_RUNTIME_BASE_ATOMIC_PAIR_H_
diff --git a/runtime/interpreter/interpreter_cache-inl.h b/runtime/interpreter/interpreter_cache-inl.h
deleted file mode 100644
index 249df23b27..0000000000
--- a/runtime/interpreter/interpreter_cache-inl.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2022 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
-#define ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
-
-#include "interpreter_cache.h"
-#include "thread.h"
-
-namespace art {
-
-template<bool kSkipThreadLocal>
-inline bool InterpreterCache::Get(Thread* self, const void* dex_instr, /* out */ size_t* value) {
-  DCHECK(self->GetInterpreterCache() == this) << "Must be called from owning thread";
-  size_t key = reinterpret_cast<size_t>(dex_instr);
-  Entry& local_entry = thread_local_array_[IndexOf<kThreadLocalSize>(key)];
-  if (kSkipThreadLocal) {
-    DCHECK_NE(local_entry.first, key) << "Expected cache miss";
-  } else {
-    if (LIKELY(local_entry.first == key)) {
-      *value = local_entry.second;
-      return true;
-    }
-  }
-  Entry shared_entry = AtomicPairLoadAcquire(&shared_array_[IndexOf<kSharedSize>(key)]);
-  if (LIKELY(shared_entry.first == key)) {
-    // For simplicity, only update the cache if weak ref accesses are enabled. If
-    // they are disabled, this means the GC is processing the cache, and is
-    // reading it concurrently.
-    if (self->GetWeakRefAccessEnabled()) {
-      local_entry = shared_entry;  // Copy to local array to make future lookup faster.
-    }
-    *value = shared_entry.second;
-    return true;
-  }
-  return false;
-}
-
-inline void InterpreterCache::Set(Thread* self, const void* dex_instr, size_t value) {
-  DCHECK(self->GetInterpreterCache() == this) << "Must be called from owning thread";
-
-  // For simplicity, only update the cache if weak ref accesses are enabled. If
-  // they are disabled, this means the GC is processing the cache, and is
-  // reading it concurrently.
-  if (self->GetWeakRefAccessEnabled()) {
-    size_t key = reinterpret_cast<size_t>(dex_instr);
-    thread_local_array_[IndexOf<kThreadLocalSize>(key)] = {key, value};
-    AtomicPairStoreRelease(&shared_array_[IndexOf<kSharedSize>(key)], {key, value});
-  }
-}
-
-}  // namespace art
-
-#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
diff --git a/runtime/interpreter/interpreter_cache.cc b/runtime/interpreter/interpreter_cache.cc
index 1a35c038e9..e43fe318cc 100644
--- a/runtime/interpreter/interpreter_cache.cc
+++ b/runtime/interpreter/interpreter_cache.cc
@@ -19,30 +19,14 @@
 namespace art {
 
-std::array<std::atomic<InterpreterCache::Entry>,
-           InterpreterCache::kSharedSize> InterpreterCache::shared_array_;
-
-InterpreterCache::InterpreterCache() {
-  // We can not use the ClearThreadLocal method since the constructor will not
-  // be called from the owning thread.
-  thread_local_array_.fill(Entry{});
-}
-
-void InterpreterCache::ClearThreadLocal(Thread* owning_thread) {
-  // Must be called from the owning thread or when the owning thread is suspended.
+void InterpreterCache::Clear(Thread* owning_thread) {
   DCHECK(owning_thread->GetInterpreterCache() == this);
   DCHECK(owning_thread == Thread::Current() || owning_thread->IsSuspended());
-
-  thread_local_array_.fill(Entry{});
+  data_.fill(Entry{});
 }
 
-void InterpreterCache::ClearShared() {
-  // Can be called from any thread since the writes are atomic.
-  // The static shared cache isn't bound to specific thread in the first place.
-
-  for (std::atomic<Entry>& entry : shared_array_) {
-    AtomicPairStoreRelease(&entry, Entry{});
-  }
+bool InterpreterCache::IsCalledFromOwningThread() {
+  return Thread::Current()->GetInterpreterCache() == this;
 }
 
 }  // namespace art
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h
index af025cecbd..0ada562438 100644
--- a/runtime/interpreter/interpreter_cache.h
+++ b/runtime/interpreter/interpreter_cache.h
@@ -20,22 +20,17 @@
 #include <array>
 #include <atomic>
 
-#include "base/atomic_pair.h"
 #include "base/bit_utils.h"
 #include "base/macros.h"
 
 namespace art {
 
-class Instruction;
 class Thread;
 
 // Small fast thread-local cache for the interpreter.
-//
-// The key is an absolute pointer to a dex instruction.
-//
-// The value depends on the opcode of the dex instruction.
+// It can hold arbitrary pointer-sized key-value pair.
+// The interpretation of the value depends on the key.
 // Presence of entry might imply some pre-conditions.
-//
 // All operations must be done from the owning thread,
 // or at a point when the owning thread is suspended.
 //
@@ -51,61 +46,52 @@ class Thread;
 // from assembly (it ensures that the offset is valid immediate value).
 class ALIGNED(16) InterpreterCache {
  public:
-  using Entry = AtomicPair<size_t>;
-
-  static constexpr size_t kThreadLocalSize = 256;  // Value of 256 has around 75% cache hit rate.
-  static constexpr size_t kSharedSize = 16 * 1024;  // Value of 16k has around 90% cache hit rate.
-  static constexpr size_t kHashShift = 2;  // Number of tailing dex pc bits to drop.
-
-  InterpreterCache();
-
-  void ClearThreadLocal(Thread* owning_thread);
+  // Aligned since we load the whole entry in single assembly instruction.
+  typedef std::pair<const void*, size_t> Entry ALIGNED(2 * sizeof(size_t));
 
-  static void ClearShared();
+  // 2x size increase/decrease corresponds to ~0.5% interpreter performance change.
+  // Value of 256 has around 75% cache hit rate.
+  static constexpr size_t kSize = 256;
 
-  template<bool kSkipThreadLocal = false>
-  ALWAYS_INLINE bool Get(Thread* self, const void* dex_instruction, /* out */ size_t* value);
+  InterpreterCache() {
+    // We can not use the Clear() method since the constructor will not
+    // be called from the owning thread.
+    data_.fill(Entry{});
+  }
 
-  ALWAYS_INLINE void Set(Thread* self, const void* dex_instruction, size_t value);
+  // Clear the whole cache. It requires the owning thread for DCHECKs.
+  void Clear(Thread* owning_thread);
 
-  template<typename Callback>
-  void ForEachTheadLocalEntry(Callback&& callback) {
-    for (Entry& entry : thread_local_array_) {
-      callback(reinterpret_cast<const Instruction*>(entry.first), entry.second);
+  ALWAYS_INLINE bool Get(const void* key, /* out */ size_t* value) {
+    DCHECK(IsCalledFromOwningThread());
+    Entry& entry = data_[IndexOf(key)];
+    if (LIKELY(entry.first == key)) {
+      *value = entry.second;
+      return true;
     }
+    return false;
   }
 
-  template<typename Callback>
-  static void ForEachSharedEntry(Callback&& callback) {
-    for (std::atomic<Entry>& atomic_entry : shared_array_) {
-      Entry old_entry = AtomicPairLoadAcquire(&atomic_entry);
-      Entry new_entry = old_entry;
-      callback(reinterpret_cast<const Instruction*>(new_entry.first), new_entry.second);
-      if (old_entry.second != new_entry.second) {
-        AtomicPairStoreRelease(&atomic_entry, new_entry);
-      }
-    }
+  ALWAYS_INLINE void Set(const void* key, size_t value) {
+    DCHECK(IsCalledFromOwningThread());
+    data_[IndexOf(key)] = Entry{key, value};
+  }
+
+  std::array<Entry, kSize>& GetArray() {
+    return data_;
   }
 
  private:
-  template<size_t kSize>
-  static ALWAYS_INLINE size_t IndexOf(size_t key) {
+  bool IsCalledFromOwningThread();
+
+  static ALWAYS_INLINE size_t IndexOf(const void* key) {
     static_assert(IsPowerOfTwo(kSize), "Size must be power of two");
-    size_t index = (key >> kHashShift) & (kSize - 1);
+    size_t index = (reinterpret_cast<uintptr_t>(key) >> 2) & (kSize - 1);
     DCHECK_LT(index, kSize);
     return index;
   }
 
-  // Small cache of fixed size which is always present for every thread.
-  // It is stored directly (without indrection) inside the Thread object.
-  // This makes it as fast as possible to access from assembly fast-path.
-  std::array<Entry, kThreadLocalSize> thread_local_array_;
-
-  // Larger cache which is shared by all threads.
-  // It is used as next cache level if lookup in the local array fails.
-  // It needs to be accessed using atomic operations, and is contended,
-  // but the sharing allows it to be larger then the per-thread cache.
-  static std::array<std::atomic<Entry>, kSharedSize> shared_array_;
+  std::array<Entry, kSize> data_;
 };
 
 }  // namespace art
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 4cbe81d0c2..1809227f2c 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -46,7 +46,6 @@
 #include "dex/dex_instruction-inl.h"
 #include "entrypoints/entrypoint_utils-inl.h"
 #include "handle_scope-inl.h"
-#include "interpreter_cache-inl.h"
 #include "interpreter_switch_impl.h"
 #include "jit/jit-inl.h"
 #include "mirror/call_site.h"
@@ -239,7 +238,7 @@ static ALWAYS_INLINE bool DoInvoke(Thread* self,
   InterpreterCache* tls_cache = self->GetInterpreterCache();
   size_t tls_value;
   ArtMethod* resolved_method;
-  if (!IsNterpSupported() && LIKELY(tls_cache->Get(self, inst, &tls_value))) {
+  if (!IsNterpSupported() && LIKELY(tls_cache->Get(inst, &tls_value))) {
     resolved_method = reinterpret_cast<ArtMethod*>(tls_value);
   } else {
     ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
@@ -253,7 +252,7 @@ static ALWAYS_INLINE bool DoInvoke(Thread* self,
       return false;
     }
     if (!IsNterpSupported()) {
-      tls_cache->Set(self, inst, reinterpret_cast<size_t>(resolved_method));
+      tls_cache->Set(inst, reinterpret_cast<size_t>(resolved_method));
     }
   }
diff --git a/runtime/interpreter/mterp/nterp.cc b/runtime/interpreter/mterp/nterp.cc
index d49260e56a..670ae1b0c2 100644
--- a/runtime/interpreter/mterp/nterp.cc
+++ b/runtime/interpreter/mterp/nterp.cc
@@ -24,7 +24,6 @@
 #include "dex/dex_instruction_utils.h"
 #include "debugger.h"
 #include "entrypoints/entrypoint_utils-inl.h"
-#include "interpreter/interpreter_cache-inl.h"
 #include "interpreter/interpreter_common.h"
 #include "interpreter/interpreter_intrinsics.h"
 #include "interpreter/shadow_frame-inl.h"
@@ -94,7 +93,12 @@ inline void UpdateHotness(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock
 template<typename T>
 inline void UpdateCache(Thread* self, uint16_t* dex_pc_ptr, T value) {
   DCHECK(kUseReadBarrier) << "Nterp only works with read barriers";
-  self->GetInterpreterCache()->Set(self, dex_pc_ptr, value);
+  // For simplicity, only update the cache if weak ref accesses are enabled. If
+  // they are disabled, this means the GC is processing the cache, and is
+  // reading it concurrently.
+  if (self->GetWeakRefAccessEnabled()) {
+    self->GetInterpreterCache()->Set(dex_pc_ptr, value);
+  }
 }
 
 template<typename T>
@@ -248,7 +252,7 @@ extern "C" const char* NterpGetShortyFromInvokeCustom(ArtMethod* caller, uint16_
 }
 
 FLATTEN
-extern "C" size_t NterpGetMethodSlowPath(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
+extern "C" size_t NterpGetMethod(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   UpdateHotness(caller);
   const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -406,16 +410,6 @@ extern "C" size_t NterpGetMethodSlowPath(Thread* self, ArtMethod* caller, uint16
   }
 }
 
-extern "C" size_t NterpGetMethod(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  InterpreterCache* cache = self->GetInterpreterCache();
-  size_t cached_value;
-  if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
-    return cached_value;
-  }
-  return NterpGetMethodSlowPath(self, caller, dex_pc_ptr);  // Tail call.
-}
-
 FLATTEN
 static ArtField* ResolveFieldWithAccessChecks(Thread* self,
                                               ClassLinker* class_linker,
@@ -465,10 +459,10 @@ static ArtField* ResolveFieldWithAccessChecks(Thread* self,
   return resolved_field;
 }
 
-extern "C" size_t NterpGetStaticFieldSlowPath(Thread* self,
-                                              ArtMethod* caller,
-                                              uint16_t* dex_pc_ptr,
-                                              size_t resolve_field_type)  // Resolve if not zero
+extern "C" size_t NterpGetStaticField(Thread* self,
+                                      ArtMethod* caller,
+                                      uint16_t* dex_pc_ptr,
+                                      size_t resolve_field_type)  // Resolve if not zero
     REQUIRES_SHARED(Locks::mutator_lock_) {
   UpdateHotness(caller);
   const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -508,25 +502,10 @@ extern "C" size_t NterpGetStaticFieldSlowPath(Thread* self,
   }
 }
 
-extern "C" size_t NterpGetStaticField(Thread* self,
-                                      ArtMethod* caller,
-                                      uint16_t* dex_pc_ptr,
-                                      size_t resolve_field_type)  // Resolve if not zero
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  InterpreterCache* cache = self->GetInterpreterCache();
-  size_t cached_value;
-  if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
-    return cached_value;
-  }
-  return NterpGetStaticFieldSlowPath(self, caller, dex_pc_ptr, resolve_field_type);
-}
-
-
-extern "C" uint32_t NterpGetInstanceFieldOffsetSlowPath(
-    Thread* self,
-    ArtMethod* caller,
-    uint16_t* dex_pc_ptr,
-    size_t resolve_field_type)  // Resolve if not zero
+extern "C" uint32_t NterpGetInstanceFieldOffset(Thread* self,
+                                                ArtMethod* caller,
+                                                uint16_t* dex_pc_ptr,
+                                                size_t resolve_field_type)  // Resolve if not zero
     REQUIRES_SHARED(Locks::mutator_lock_) {
   UpdateHotness(caller);
   const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -553,19 +532,6 @@ extern "C" uint32_t NterpGetInstanceFieldOffsetSlowPath(
   return resolved_field->GetOffset().Uint32Value();
 }
 
-extern "C" uint32_t NterpGetInstanceFieldOffset(Thread* self,
-                                                ArtMethod* caller,
-                                                uint16_t* dex_pc_ptr,
-                                                size_t resolve_field_type)  // Resolve if not zero
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  InterpreterCache* cache = self->GetInterpreterCache();
-  size_t cached_value;
-  if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
-    return cached_value;
-  }
-  return NterpGetInstanceFieldOffsetSlowPath(self, caller, dex_pc_ptr, resolve_field_type);
-}
-
 extern "C" mirror::Object* NterpGetClassOrAllocateObject(Thread* self,
                                                          ArtMethod* caller,
                                                          uint16_t* dex_pc_ptr)
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 8a1ed71197..31f2bd2d7b 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -357,18 +357,32 @@ inline void DexCache::SetResolvedMethod(uint32_t method_idx, ArtMethod* method)
 template <typename T>
 NativeDexCachePair<T> DexCache::GetNativePair(std::atomic<NativeDexCachePair<T>>* pair_array,
                                               size_t idx) {
-  auto* array = reinterpret_cast<std::atomic<AtomicPair<size_t>>*>(pair_array);
-  AtomicPair<size_t> value = AtomicPairLoadAcquire(&array[idx]);
-  return NativeDexCachePair<T>(reinterpret_cast<T*>(value.first), value.second);
+  if (kRuntimePointerSize == PointerSize::k64) {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair64>*>(pair_array);
+    ConversionPair64 value = AtomicLoadRelaxed16B(&array[idx]);
+    return NativeDexCachePair<T>(reinterpret_cast64<T*>(value.first),
+                                 dchecked_integral_cast<size_t>(value.second));
+  } else {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair32>*>(pair_array);
+    ConversionPair32 value = array[idx].load(std::memory_order_relaxed);
+    return NativeDexCachePair<T>(reinterpret_cast32<T*>(value.first), value.second);
+  }
 }
 
 template <typename T>
 void DexCache::SetNativePair(std::atomic<NativeDexCachePair<T>>* pair_array,
                              size_t idx,
                              NativeDexCachePair<T> pair) {
-  auto* array = reinterpret_cast<std::atomic<AtomicPair<size_t>>*>(pair_array);
-  AtomicPair<size_t> v(reinterpret_cast<size_t>(pair.object), pair.index);
-  AtomicPairStoreRelease(&array[idx], v);
+  if (kRuntimePointerSize == PointerSize::k64) {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair64>*>(pair_array);
+    ConversionPair64 v(reinterpret_cast64<uint64_t>(pair.object), pair.index);
+    AtomicStoreRelease16B(&array[idx], v);
+  } else {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair32>*>(pair_array);
+    ConversionPair32 v(reinterpret_cast32<uint32_t>(pair.object),
+                       dchecked_integral_cast<uint32_t>(pair.index));
+    array[idx].store(v, std::memory_order_release);
+  }
 }
 
 template <typename T,
diff --git a/runtime/mirror/dex_cache.cc b/runtime/mirror/dex_cache.cc
index b7f8ee7a07..c80f9dfe2f 100644
--- a/runtime/mirror/dex_cache.cc
+++ b/runtime/mirror/dex_cache.cc
@@ -126,5 +126,23 @@ ObjPtr<ClassLoader> DexCache::GetClassLoader() {
   return GetFieldObject<ClassLoader>(OFFSET_OF_OBJECT_MEMBER(DexCache, class_loader_));
 }
 
+#if !defined(__aarch64__) && !defined(__x86_64__)
+static pthread_mutex_t dex_cache_slow_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+DexCache::ConversionPair64 DexCache::AtomicLoadRelaxed16B(std::atomic<ConversionPair64>* target) {
+  pthread_mutex_lock(&dex_cache_slow_atomic_mutex);
+  DexCache::ConversionPair64 value = *reinterpret_cast<ConversionPair64*>(target);
+  pthread_mutex_unlock(&dex_cache_slow_atomic_mutex);
+  return value;
+}
+
+void DexCache::AtomicStoreRelease16B(std::atomic<ConversionPair64>* target,
+                                     ConversionPair64 value) {
+  pthread_mutex_lock(&dex_cache_slow_atomic_mutex);
+  *reinterpret_cast<ConversionPair64*>(target) = value;
+  pthread_mutex_unlock(&dex_cache_slow_atomic_mutex);
+}
+#endif
+
 }  // namespace mirror
 }  // namespace art
diff --git a/runtime/mirror/dex_cache.h b/runtime/mirror/dex_cache.h
index 19197095aa..26fc520cd7 100644
--- a/runtime/mirror/dex_cache.h
+++ b/runtime/mirror/dex_cache.h
@@ -149,7 +149,7 @@ class MANAGED DexCache final : public Object {
                 "String dex cache size is not a power of 2.");
 
   // Size of field dex cache. Needs to be a power of 2 for entrypoint assumptions to hold.
-  static constexpr size_t kDexCacheFieldCacheSize = 512;
+  static constexpr size_t kDexCacheFieldCacheSize = 1024;
   static_assert(IsPowerOfTwo(kDexCacheFieldCacheSize),
                 "Field dex cache size is not a power of 2.");
 
@@ -448,6 +448,19 @@ class MANAGED DexCache final : public Object {
   T* AllocArray(MemberOffset obj_offset, MemberOffset num_offset, size_t num)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // std::pair<> is not trivially copyable and as such it is unsuitable for atomic operations,
+  // so we use a custom pair class for loading and storing the NativeDexCachePair<>.
+  template <typename IntType>
+  struct PACKED(2 * sizeof(IntType)) ConversionPair {
+    ConversionPair(IntType f, IntType s) : first(f), second(s) { }
+    ConversionPair(const ConversionPair&) = default;
+    ConversionPair& operator=(const ConversionPair&) = default;
+    IntType first;
+    IntType second;
+  };
+  using ConversionPair32 = ConversionPair<uint32_t>;
+  using ConversionPair64 = ConversionPair<uint64_t>;
+
   // Visit instance fields of the dex cache as well as its associated arrays.
   template <bool kVisitNativeRoots,
             VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
@@ -456,6 +469,48 @@ class MANAGED DexCache final : public Object {
   void VisitReferences(ObjPtr<Class> klass, const Visitor& visitor)
       REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
 
+  // Due to lack of 16-byte atomics support, we use hand-crafted routines.
+#if defined(__aarch64__)
+  // 16-byte atomics are supported on aarch64.
+  ALWAYS_INLINE static ConversionPair64 AtomicLoadRelaxed16B(
+      std::atomic<ConversionPair64>* target) {
+    return target->load(std::memory_order_relaxed);
+  }
+
+  ALWAYS_INLINE static void AtomicStoreRelease16B(
+      std::atomic<ConversionPair64>* target, ConversionPair64 value) {
+    target->store(value, std::memory_order_release);
+  }
+#elif defined(__x86_64__)
+  ALWAYS_INLINE static ConversionPair64 AtomicLoadRelaxed16B(
+      std::atomic<ConversionPair64>* target) {
+    uint64_t first, second;
+    __asm__ __volatile__(
+        "lock cmpxchg16b (%2)"
+        : "=&a"(first), "=&d"(second)
+        : "r"(target), "a"(0), "d"(0), "b"(0), "c"(0)
+        : "cc");
+    return ConversionPair64(first, second);
+  }
+
+  ALWAYS_INLINE static void AtomicStoreRelease16B(
+      std::atomic<ConversionPair64>* target, ConversionPair64 value) {
+    uint64_t first, second;
+    __asm__ __volatile__ (
+        "movq (%2), %%rax\n\t"
+        "movq 8(%2), %%rdx\n\t"
+        "1:\n\t"
+        "lock cmpxchg16b (%2)\n\t"
+        "jnz 1b"
+        : "=&a"(first), "=&d"(second)
+        : "r"(target), "b"(value.first), "c"(value.second)
+        : "cc");
+  }
+#else
+  static ConversionPair64 AtomicLoadRelaxed16B(std::atomic<ConversionPair64>* target);
+  static void AtomicStoreRelease16B(std::atomic<ConversionPair64>* target, ConversionPair64 value);
+#endif
+
   HeapReference<ClassLoader> class_loader_;
   HeapReference<String> location_;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 2a3afa8c99..54e9d38b3c 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -739,7 +739,7 @@ void Runtime::SweepSystemWeaks(IsMarkedVisitor* visitor) {
     // from mutators. See b/32167580.
     GetJit()->GetCodeCache()->SweepRootTables(visitor);
   }
-  Thread::SweepInterpreterCaches(visitor);
+  thread_list_->SweepInterpreterCaches(visitor);
 
   // All other generic system-weak holders.
   for (gc::AbstractSystemWeakHolder* holder : system_weak_holders_) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 2b3f47298e..25d493f0a5 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -4240,28 +4240,27 @@ void Thread::VisitRoots(RootVisitor* visitor) {
 }
 #pragma GCC diagnostic pop
 
-void Thread::SweepInterpreterCaches(IsMarkedVisitor* visitor) {
-  Thread* self = Thread::Current();
-  auto visit = [visitor, self](const Instruction* inst, size_t& value) {
-    Locks::mutator_lock_->AssertSharedHeld(self);
+void Thread::SweepInterpreterCache(IsMarkedVisitor* visitor) {
+  for (InterpreterCache::Entry& entry : GetInterpreterCache()->GetArray()) {
+    const Instruction* inst = reinterpret_cast<const Instruction*>(entry.first);
     if (inst != nullptr) {
      if (inst->Opcode() == Instruction::NEW_INSTANCE ||
          inst->Opcode() == Instruction::CHECK_CAST ||
          inst->Opcode() == Instruction::INSTANCE_OF ||
          inst->Opcode() == Instruction::NEW_ARRAY ||
          inst->Opcode() == Instruction::CONST_CLASS) {
-        mirror::Class* cls = reinterpret_cast<mirror::Class*>(value);
+        mirror::Class* cls = reinterpret_cast<mirror::Class*>(entry.second);
        if (cls == nullptr || cls == Runtime::GetWeakClassSentinel()) {
          // Entry got deleted in a previous sweep.
-          return;
+          continue;
        }
        Runtime::ProcessWeakClass(
-            reinterpret_cast<GcRoot<mirror::Class>*>(&value),
+            reinterpret_cast<GcRoot<mirror::Class>*>(&entry.second),
            visitor,
            Runtime::GetWeakClassSentinel());
      } else if (inst->Opcode() == Instruction::CONST_STRING ||
                 inst->Opcode() == Instruction::CONST_STRING_JUMBO) {
-        mirror::Object* object = reinterpret_cast<mirror::Object*>(value);
+        mirror::Object* object = reinterpret_cast<mirror::Object*>(entry.second);
        mirror::Object* new_object = visitor->IsMarked(object);
        // We know the string is marked because it's a strongly-interned string that
        // is always alive (see b/117621117 for trying to make those strings weak).
@@ -4269,16 +4268,11 @@ void Thread::SweepInterpreterCaches(IsMarkedVisitor* visitor) {
        // null for newly allocated objects, but we know those haven't moved. Therefore,
        // only update the entry if we get a different non-null string.
        if (new_object != nullptr && new_object != object) {
-          value = reinterpret_cast<size_t>(new_object);
+          entry.second = reinterpret_cast<size_t>(new_object);
        }
      }
    }
-  };
-  InterpreterCache::ForEachSharedEntry(visit);
-  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
-  Runtime::Current()->GetThreadList()->ForEach([&visit](Thread* thread) {
-    thread->GetInterpreterCache()->ForEachTheadLocalEntry(visit);
-  });
+  }
 }
 
 // FIXME: clang-r433403 reports the below function exceeds frame size limit.
@@ -4493,10 +4487,9 @@ void Thread::SetReadBarrierEntrypoints() {
 void Thread::ClearAllInterpreterCaches() {
   static struct ClearInterpreterCacheClosure : Closure {
     void Run(Thread* thread) override {
-      thread->GetInterpreterCache()->ClearThreadLocal(thread);
+      thread->GetInterpreterCache()->Clear(thread);
     }
   } closure;
-  InterpreterCache::ClearShared();
   Runtime::Current()->GetThreadList()->RunCheckpoint(&closure);
 }
diff --git a/runtime/thread.h b/runtime/thread.h
index ec971a564f..1085a563c4 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1348,6 +1348,10 @@ class Thread {
     return ThreadOffset<pointer_size>(OFFSETOF_MEMBER(Thread, interpreter_cache_));
   }
 
+  static constexpr int InterpreterCacheSizeLog2() {
+    return WhichPowerOf2(InterpreterCache::kSize);
+  }
+
   static constexpr uint32_t AllThreadFlags() {
     return enum_cast<uint32_t>(ThreadFlag::kLastFlag) |
            (enum_cast<uint32_t>(ThreadFlag::kLastFlag) - 1u);
@@ -1522,8 +1526,7 @@ class Thread {
   template <bool kPrecise>
   void VisitRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
 
-  static void SweepInterpreterCaches(IsMarkedVisitor* visitor)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+  void SweepInterpreterCache(IsMarkedVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
 
   static bool IsAotCompiler();
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 6482e72417..4e3b40ba7a 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -1407,6 +1407,13 @@ void ThreadList::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) const {
   }
 }
 
+void ThreadList::SweepInterpreterCaches(IsMarkedVisitor* visitor) const {
+  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+  for (const auto& thread : list_) {
+    thread->SweepInterpreterCache(visitor);
+  }
+}
+
 void ThreadList::VisitReflectiveTargets(ReflectiveValueVisitor *visitor) const {
   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
   for (const auto& thread : list_) {
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 29b0c52186..f5b58a0c54 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -179,6 +179,10 @@ class ThreadList {
     return empty_checkpoint_barrier_.get();
   }
 
+  void SweepInterpreterCaches(IsMarkedVisitor* visitor) const
+      REQUIRES(!Locks::thread_list_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   void WaitForOtherNonDaemonThreadsToExit(bool check_no_birth = true)
       REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_,
                !Locks::mutator_lock_);
diff --git a/tools/cpp-define-generator/thread.def b/tools/cpp-define-generator/thread.def
index 07ca8841f8..ec8e28b977 100644
--- a/tools/cpp-define-generator/thread.def
+++ b/tools/cpp-define-generator/thread.def
@@ -30,12 +30,11 @@ ASM_DEFINE(THREAD_ID_OFFSET,
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_OFFSET,
            art::Thread::InterpreterCacheOffset<art::kRuntimePointerSize>().Int32Value())
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_LOG2,
-           art::WhichPowerOf2(art::InterpreterCache::kThreadLocalSize))
+           art::Thread::InterpreterCacheSizeLog2())
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_MASK,
-           (sizeof(art::InterpreterCache::Entry) * (art::InterpreterCache::kThreadLocalSize - 1)))
+           (sizeof(art::InterpreterCache::Entry) * (art::InterpreterCache::kSize - 1)))
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_SHIFT,
-           (art::WhichPowerOf2(sizeof(art::InterpreterCache::Entry)) -
-            art::InterpreterCache::kHashShift))
+           2)
 ASM_DEFINE(THREAD_IS_GC_MARKING_OFFSET,
            art::Thread::IsGcMarkingOffset<art::kRuntimePointerSize>().Int32Value())
 ASM_DEFINE(THREAD_LOCAL_ALLOC_STACK_END_OFFSET,
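The THREAD_INTERPRETER_CACHE_* constants restored in thread.def encode the same indexing scheme as InterpreterCache::IndexOf above: a slot is chosen by dropping the two low bits of the dex pc and wrapping to the 256-entry table. Below is a minimal standalone sketch (not ART code) of that arithmetic. It assumes a 64-bit target and assumes the assembly fast path addresses an entry at byte offset (dex_pc << SIZE_SHIFT) & SIZE_MASK; the local names kSizeShift, kSizeMask, and IndexOf merely mirror the definitions quoted in the diff.

```cpp
// Sketch: check that IndexOf(dex_pc) scaled by sizeof(Entry) equals the
// shift-and-mask form suggested by the restored assembly constants.
// Assumptions (not taken from the change itself): 64-bit target, and the
// fast path computing the byte offset as (dex_pc << kSizeShift) & kSizeMask.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <utility>

using Entry = std::pair<const void*, size_t>;  // 16 bytes on 64-bit targets.
static_assert(sizeof(void*) == 8, "sketch assumes a 64-bit target");

constexpr size_t kSize = 256;                              // InterpreterCache::kSize
constexpr size_t kSizeShift = 2;                           // THREAD_INTERPRETER_CACHE_SIZE_SHIFT
constexpr size_t kSizeMask = sizeof(Entry) * (kSize - 1);  // THREAD_INTERPRETER_CACHE_SIZE_MASK

// Mirrors InterpreterCache::IndexOf: drop the two low bits of the dex pc
// pointer, then wrap to the table size.
constexpr size_t IndexOf(uintptr_t key) { return (key >> 2) & (kSize - 1); }

int main() {
  // Dex instructions are 2-byte units, so step by 2.
  for (uintptr_t dex_pc = 0; dex_pc < (1u << 16); dex_pc += 2) {
    size_t offset_via_index = IndexOf(dex_pc) * sizeof(Entry);
    size_t offset_via_shift_and_mask = (dex_pc << kSizeShift) & kSizeMask;
    assert(offset_via_index == offset_via_shift_and_mask);
  }
  return 0;
}
```

With 16-byte entries, shifting the dex pc left by 2 is the same as computing the index and scaling it by sizeof(Entry), which is presumably why the shift can be hard-coded to 2 once the shared cache and its kHashShift constant are reverted.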