diff options
| author | 2022-01-17 01:32:55 +0000 | |
|---|---|---|
| committer | 2022-01-17 01:32:55 +0000 | |
| commit | 3d2f148fe040b60452d5d9be7d08dec693132078 (patch) | |
| tree | 6a8a0aa66c68e8a4c49833b2a93b263985259aed /runtime/interpreter/interpreter_cache.h | |
| parent | fa40e6e318b21d4a1885a6ffea6efc3c0b5cc1cd (diff) | |
Revert "Add thread-shared interpreter cache"
This reverts commit fa40e6e318b21d4a1885a6ffea6efc3c0b5cc1cd.
Reason for revert: Seeing several different failures that appear related, both test failures and b/214850618. And it appears a potentially significant unresolved comment was overlooked.
Change-Id: I2b5260ac7f2168831f0d1b0d7c76b70ecc1fb77d
Diffstat (limited to 'runtime/interpreter/interpreter_cache.h')
| -rw-r--r-- | runtime/interpreter/interpreter_cache.h | 80 |
1 file changed, 33 insertions, 47 deletions
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h index af025cecbd..0ada562438 100644 --- a/runtime/interpreter/interpreter_cache.h +++ b/runtime/interpreter/interpreter_cache.h @@ -20,22 +20,17 @@ #include <array> #include <atomic> -#include "base/atomic_pair.h" #include "base/bit_utils.h" #include "base/macros.h" namespace art { -class Instruction; class Thread; // Small fast thread-local cache for the interpreter. -// -// The key is an absolute pointer to a dex instruction. -// -// The value depends on the opcode of the dex instruction. +// It can hold arbitrary pointer-sized key-value pair. +// The interpretation of the value depends on the key. // Presence of entry might imply some pre-conditions. -// // All operations must be done from the owning thread, // or at a point when the owning thread is suspended. // @@ -51,61 +46,52 @@ class Thread; // from assembly (it ensures that the offset is valid immediate value). class ALIGNED(16) InterpreterCache { public: - using Entry = AtomicPair<size_t>; - - static constexpr size_t kThreadLocalSize = 256; // Value of 256 has around 75% cache hit rate. - static constexpr size_t kSharedSize = 16 * 1024; // Value of 16k has around 90% cache hit rate. - static constexpr size_t kHashShift = 2; // Number of tailing dex pc bits to drop. - - InterpreterCache(); - - void ClearThreadLocal(Thread* owning_thread); + // Aligned since we load the whole entry in single assembly instruction. + typedef std::pair<const void*, size_t> Entry ALIGNED(2 * sizeof(size_t)); - static void ClearShared(); + // 2x size increase/decrease corresponds to ~0.5% interpreter performance change. + // Value of 256 has around 75% cache hit rate. 
+ static constexpr size_t kSize = 256; - template<bool kSkipThreadLocal = false> - ALWAYS_INLINE bool Get(Thread* self, const void* dex_instruction, /* out */ size_t* value); + InterpreterCache() { + // We can not use the Clear() method since the constructor will not + // be called from the owning thread. + data_.fill(Entry{}); + } - ALWAYS_INLINE void Set(Thread* self, const void* dex_instruction, size_t value); + // Clear the whole cache. It requires the owning thread for DCHECKs. + void Clear(Thread* owning_thread); - template<typename Callback> - void ForEachTheadLocalEntry(Callback&& callback) { - for (Entry& entry : thread_local_array_) { - callback(reinterpret_cast<const Instruction*>(entry.first), entry.second); + ALWAYS_INLINE bool Get(const void* key, /* out */ size_t* value) { + DCHECK(IsCalledFromOwningThread()); + Entry& entry = data_[IndexOf(key)]; + if (LIKELY(entry.first == key)) { + *value = entry.second; + return true; } + return false; } - template<typename Callback> - static void ForEachSharedEntry(Callback&& callback) { - for (std::atomic<Entry>& atomic_entry : shared_array_) { - Entry old_entry = AtomicPairLoadAcquire(&atomic_entry); - Entry new_entry = old_entry; - callback(reinterpret_cast<const Instruction*>(new_entry.first), new_entry.second); - if (old_entry.second != new_entry.second) { - AtomicPairStoreRelease(&atomic_entry, new_entry); - } - } + ALWAYS_INLINE void Set(const void* key, size_t value) { + DCHECK(IsCalledFromOwningThread()); + data_[IndexOf(key)] = Entry{key, value}; + } + + std::array<Entry, kSize>& GetArray() { + return data_; } private: - template<size_t kSize> - static ALWAYS_INLINE size_t IndexOf(size_t key) { + bool IsCalledFromOwningThread(); + + static ALWAYS_INLINE size_t IndexOf(const void* key) { static_assert(IsPowerOfTwo(kSize), "Size must be power of two"); - size_t index = (key >> kHashShift) & (kSize - 1); + size_t index = (reinterpret_cast<uintptr_t>(key) >> 2) & (kSize - 1); DCHECK_LT(index, kSize); 
return index; } - // Small cache of fixed size which is always present for every thread. - // It is stored directly (without indrection) inside the Thread object. - // This makes it as fast as possible to access from assembly fast-path. - std::array<Entry, kThreadLocalSize> thread_local_array_; - - // Larger cache which is shared by all threads. - // It is used as next cache level if lookup in the local array fails. - // It needs to be accessed using atomic operations, and is contended, - // but the sharing allows it to be larger then the per-thread cache. - static std::array<std::atomic<Entry>, kSharedSize> shared_array_; + std::array<Entry, kSize> data_; }; } // namespace art |