diff options
Diffstat (limited to 'runtime/interpreter/interpreter_cache.h')
-rw-r--r-- | runtime/interpreter/interpreter_cache.h | 80 |
1 files changed, 47 insertions, 33 deletions
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h index 0ada562438..af025cecbd 100644 --- a/runtime/interpreter/interpreter_cache.h +++ b/runtime/interpreter/interpreter_cache.h @@ -20,17 +20,22 @@ #include <array> #include <atomic> +#include "base/atomic_pair.h" #include "base/bit_utils.h" #include "base/macros.h" namespace art { +class Instruction; class Thread; // Small fast thread-local cache for the interpreter. -// It can hold arbitrary pointer-sized key-value pair. -// The interpretation of the value depends on the key. +// +// The key is an absolute pointer to a dex instruction. +// +// The value depends on the opcode of the dex instruction. // Presence of entry might imply some pre-conditions. +// // All operations must be done from the owning thread, // or at a point when the owning thread is suspended. // @@ -46,52 +51,61 @@ class Thread; // from assembly (it ensures that the offset is valid immediate value). class ALIGNED(16) InterpreterCache { public: - // Aligned since we load the whole entry in single assembly instruction. - typedef std::pair<const void*, size_t> Entry ALIGNED(2 * sizeof(size_t)); + using Entry = AtomicPair<size_t>; - // 2x size increase/decrease corresponds to ~0.5% interpreter performance change. - // Value of 256 has around 75% cache hit rate. - static constexpr size_t kSize = 256; + static constexpr size_t kThreadLocalSize = 256; // Value of 256 has around 75% cache hit rate. + static constexpr size_t kSharedSize = 16 * 1024; // Value of 16k has around 90% cache hit rate. + static constexpr size_t kHashShift = 2; // Number of tailing dex pc bits to drop. - InterpreterCache() { - // We can not use the Clear() method since the constructor will not - // be called from the owning thread. - data_.fill(Entry{}); - } + InterpreterCache(); - // Clear the whole cache. It requires the owning thread for DCHECKs. - void Clear(Thread* owning_thread); + void ClearThreadLocal(Thread* owning_thread); - ALWAYS_INLINE bool Get(const void* key, /* out */ size_t* value) { - DCHECK(IsCalledFromOwningThread()); - Entry& entry = data_[IndexOf(key)]; - if (LIKELY(entry.first == key)) { - *value = entry.second; - return true; - } - return false; - } + static void ClearShared(); - ALWAYS_INLINE void Set(const void* key, size_t value) { - DCHECK(IsCalledFromOwningThread()); - data_[IndexOf(key)] = Entry{key, value}; + template<bool kSkipThreadLocal = false> + ALWAYS_INLINE bool Get(Thread* self, const void* dex_instruction, /* out */ size_t* value); + + ALWAYS_INLINE void Set(Thread* self, const void* dex_instruction, size_t value); + + template<typename Callback> + void ForEachTheadLocalEntry(Callback&& callback) { + for (Entry& entry : thread_local_array_) { + callback(reinterpret_cast<const Instruction*>(entry.first), entry.second); + } } - std::array<Entry, kSize>& GetArray() { - return data_; + template<typename Callback> + static void ForEachSharedEntry(Callback&& callback) { + for (std::atomic<Entry>& atomic_entry : shared_array_) { + Entry old_entry = AtomicPairLoadAcquire(&atomic_entry); + Entry new_entry = old_entry; + callback(reinterpret_cast<const Instruction*>(new_entry.first), new_entry.second); + if (old_entry.second != new_entry.second) { + AtomicPairStoreRelease(&atomic_entry, new_entry); + } + } } private: - bool IsCalledFromOwningThread(); - - static ALWAYS_INLINE size_t IndexOf(const void* key) { + template<size_t kSize> + static ALWAYS_INLINE size_t IndexOf(size_t key) { static_assert(IsPowerOfTwo(kSize), "Size must be power of two"); - size_t index = (reinterpret_cast<uintptr_t>(key) >> 2) & (kSize - 1); + size_t index = (key >> kHashShift) & (kSize - 1); DCHECK_LT(index, kSize); return index; } - std::array<Entry, kSize> data_; + // Small cache of fixed size which is always present for every thread. + // It is stored directly (without indrection) inside the Thread object. + // This makes it as fast as possible to access from assembly fast-path. + std::array<Entry, kThreadLocalSize> thread_local_array_; + + // Larger cache which is shared by all threads. + // It is used as next cache level if lookup in the local array fails. + // It needs to be accessed using atomic operations, and is contended, + // but the sharing allows it to be larger then the per-thread cache. + static std::array<std::atomic<Entry>, kSharedSize> shared_array_; }; } // namespace art |