1 files changed, 47 insertions, 33 deletions
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h
index 0ada562438..af025cecbd 100644
--- a/runtime/interpreter/interpreter_cache.h
+++ b/runtime/interpreter/interpreter_cache.h
@@ -20,17 +20,22 @@
 #include <array>
 #include <atomic>
 
+#include "base/atomic_pair.h"
 #include "base/bit_utils.h"
 #include "base/macros.h"
 
 namespace art {
 
+class Instruction;
 class Thread;
 
 // Small fast thread-local cache for the interpreter.
-// It can hold arbitrary pointer-sized key-value pair.
-// The interpretation of the value depends on the key.
+//
+// The key is an absolute pointer to a dex instruction.
+//
+// The value depends on the opcode of the dex instruction.
 // Presence of entry might imply some pre-conditions.
+//
 // All operations must be done from the owning thread,
 // or at a point when the owning thread is suspended.
 //
@@ -46,52 +51,61 @@ class Thread;
 // from assembly (it ensures that the offset is valid immediate value).
 class ALIGNED(16) InterpreterCache {
  public:
-  // Aligned since we load the whole entry in single assembly instruction.
-  typedef std::pair<const void*, size_t> Entry ALIGNED(2 * sizeof(size_t));
+  using Entry = AtomicPair<size_t>;
 
-  // 2x size increase/decrease corresponds to ~0.5% interpreter performance change.
-  // Value of 256 has around 75% cache hit rate.
-  static constexpr size_t kSize = 256;
+  static constexpr size_t kThreadLocalSize = 256;   // Value of 256 has around 75% cache hit rate.
+  static constexpr size_t kSharedSize = 16 * 1024;  // Value of 16k has around 90% cache hit rate.
+  static constexpr size_t kHashShift = 2;           // Number of trailing dex pc bits to drop.
 
-  InterpreterCache() {
-    // We can not use the Clear() method since the constructor will not
-    // be called from the owning thread.
-    data_.fill(Entry{});
-  }
+  InterpreterCache();
 
-  // Clear the whole cache. It requires the owning thread for DCHECKs.
-  void Clear(Thread* owning_thread);
+  void ClearThreadLocal(Thread* owning_thread);
 
-  ALWAYS_INLINE bool Get(const void* key, /* out */ size_t* value) {
-    DCHECK(IsCalledFromOwningThread());
-    Entry& entry = data_[IndexOf(key)];
-    if (LIKELY(entry.first == key)) {
-      *value = entry.second;
-      return true;
-    }
-    return false;
-  }
+  static void ClearShared();
 
-  ALWAYS_INLINE void Set(const void* key, size_t value) {
-    DCHECK(IsCalledFromOwningThread());
-    data_[IndexOf(key)] = Entry{key, value};
+  template<bool kSkipThreadLocal = false>
+  ALWAYS_INLINE bool Get(Thread* self, const void* dex_instruction, /* out */ size_t* value);
+
+  ALWAYS_INLINE void Set(Thread* self, const void* dex_instruction, size_t value);
+
+  template<typename Callback>
+  void ForEachTheadLocalEntry(Callback&& callback) {
+    for (Entry& entry : thread_local_array_) {
+      callback(reinterpret_cast<const Instruction*>(entry.first), entry.second);
+    }
   }
 
-  std::array<Entry, kSize>& GetArray() {
-    return data_;
+  template<typename Callback>
+  static void ForEachSharedEntry(Callback&& callback) {
+    for (std::atomic<Entry>& atomic_entry : shared_array_) {
+      Entry old_entry = AtomicPairLoadAcquire(&atomic_entry);
+      Entry new_entry = old_entry;
+      callback(reinterpret_cast<const Instruction*>(new_entry.first), new_entry.second);
+      if (old_entry.second != new_entry.second) {
+        AtomicPairStoreRelease(&atomic_entry, new_entry);
+      }
+    }
   }
 
  private:
-  bool IsCalledFromOwningThread();
-
-  static ALWAYS_INLINE size_t IndexOf(const void* key) {
+  template<size_t kSize>
+  static ALWAYS_INLINE size_t IndexOf(size_t key) {
     static_assert(IsPowerOfTwo(kSize), "Size must be power of two");
-    size_t index = (reinterpret_cast<uintptr_t>(key) >> 2) & (kSize - 1);
+    size_t index = (key >> kHashShift) & (kSize - 1);
     DCHECK_LT(index, kSize);
     return index;
   }
 
-  std::array<Entry, kSize> data_;
+  // Small cache of fixed size which is always present for every thread.
+  // It is stored directly (without indirection) inside the Thread object.
+  // This makes it as fast as possible to access from assembly fast-path.
+  std::array<Entry, kThreadLocalSize> thread_local_array_;
+
+  // Larger cache which is shared by all threads.
+  // It is used as next cache level if lookup in the local array fails.
+  // It needs to be accessed using atomic operations, and is contended,
+  // but the sharing allows it to be larger than the per-thread cache.
+  static std::array<std::atomic<Entry>, kSharedSize> shared_array_;
 };
 
 }  // namespace art