author     2022-01-17 01:32:55 +0000
committer  2022-01-17 01:32:55 +0000
commit     3d2f148fe040b60452d5d9be7d08dec693132078 (patch)
tree       6a8a0aa66c68e8a4c49833b2a93b263985259aed
parent     fa40e6e318b21d4a1885a6ffea6efc3c0b5cc1cd (diff)
Revert "Add thread-shared interpreter cache"
This reverts commit fa40e6e318b21d4a1885a6ffea6efc3c0b5cc1cd.
Reason for revert: We are seeing several different failures that appear to be related, both test failures and b/214850618. It also appears that a potentially significant unresolved review comment was overlooked.
Change-Id: I2b5260ac7f2168831f0d1b0d7c76b70ecc1fb77d
-rw-r--r-- | openjdkjvmti/ti_redefine.cc                  |   3
-rw-r--r-- | runtime/base/atomic_pair.h                   |  85
-rw-r--r-- | runtime/interpreter/interpreter_cache-inl.h  |  67
-rw-r--r-- | runtime/interpreter/interpreter_cache.cc     |  24
-rw-r--r-- | runtime/interpreter/interpreter_cache.h      |  80
-rw-r--r-- | runtime/interpreter/interpreter_common.h     |   5
-rw-r--r-- | runtime/interpreter/mterp/nterp.cc           |  64
-rw-r--r-- | runtime/mirror/dex_cache-inl.h               |  26
-rw-r--r-- | runtime/mirror/dex_cache.cc                  |  18
-rw-r--r-- | runtime/mirror/dex_cache.h                   |  57
-rw-r--r-- | runtime/runtime.cc                           |   2
-rw-r--r-- | runtime/thread.cc                            |  27
-rw-r--r-- | runtime/thread.h                             |   7
-rw-r--r-- | runtime/thread_list.cc                       |   7
-rw-r--r-- | runtime/thread_list.h                        |   4
-rw-r--r-- | tools/cpp-define-generator/thread.def        |   7
16 files changed, 179 insertions, 304 deletions
diff --git a/openjdkjvmti/ti_redefine.cc b/openjdkjvmti/ti_redefine.cc
index 16baf935db..37a61d3c54 100644
--- a/openjdkjvmti/ti_redefine.cc
+++ b/openjdkjvmti/ti_redefine.cc
@@ -2929,9 +2929,8 @@ void Redefiner::ClassRedefinition::UpdateClassStructurally(const RedefinitionDat
     // TODO We might be able to avoid doing this but given the rather unstructured nature of the
     // interpreter cache it's probably not worth the effort.
     art::MutexLock mu(driver_->self_, *art::Locks::thread_list_lock_);
-    art::InterpreterCache::ClearShared();
     driver_->runtime_->GetThreadList()->ForEach(
-        [](art::Thread* t) { t->GetInterpreterCache()->ClearThreadLocal(t); });
+        [](art::Thread* t) { t->GetInterpreterCache()->Clear(t); });
   }
   if (art::kIsDebugBuild) {
diff --git a/runtime/base/atomic_pair.h b/runtime/base/atomic_pair.h
deleted file mode 100644
index 4e43d3062c..0000000000
--- a/runtime/base/atomic_pair.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2021 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_RUNTIME_BASE_ATOMIC_PAIR_H_
-#define ART_RUNTIME_BASE_ATOMIC_PAIR_H_
-
-#include "base/macros.h"
-
-#include <type_traits>
-
-namespace art {
-
-// std::pair<> is not trivially copyable and as such it is unsuitable for atomic operations.
-template <typename IntType>
-struct PACKED(2 * sizeof(IntType)) AtomicPair {
-  static_assert(std::is_integral<IntType>::value);
-
-  constexpr AtomicPair() : first(0), second(0) { }
-  AtomicPair(IntType f, IntType s) : first(f), second(s) { }
-  AtomicPair(const AtomicPair&) = default;
-  AtomicPair& operator=(const AtomicPair&) = default;
-
-  IntType first;
-  IntType second;
-};
-
-template <typename IntType>
-ALWAYS_INLINE static inline AtomicPair<IntType> AtomicPairLoadAcquire(
-    std::atomic<AtomicPair<IntType>>* target) {
-  static_assert(std::atomic<AtomicPair<IntType>>::is_always_lock_free);
-  return target->load(std::memory_order_acquire);
-}
-
-template <typename IntType>
-ALWAYS_INLINE static inline void AtomicPairStoreRelease(
-    std::atomic<AtomicPair<IntType>>* target, AtomicPair<IntType> value) {
-  static_assert(std::atomic<AtomicPair<IntType>>::is_always_lock_free);
-  target->store(value, std::memory_order_release);
-}
-
-// llvm does not implement 16-byte atomic operations on x86-64.
-#if defined(__x86_64__)
-ALWAYS_INLINE static inline AtomicPair<uint64_t> AtomicPairLoadAcquire(
-    std::atomic<AtomicPair<uint64_t>>* target) {
-  uint64_t first, second;
-  __asm__ __volatile__(
-      "lock cmpxchg16b (%2)"
-      : "=&a"(first), "=&d"(second)
-      : "r"(target), "a"(0), "d"(0), "b"(0), "c"(0)
-      : "cc");
-  return {first, second};
-}
-
-ALWAYS_INLINE static inline void AtomicPairStoreRelease(
-    std::atomic<AtomicPair<uint64_t>>* target, AtomicPair<uint64_t> value) {
-  uint64_t first, second;
-  __asm__ __volatile__ (
-      "movq (%2), %%rax\n\t"
-      "movq 8(%2), %%rdx\n\t"
-      "1:\n\t"
-      "lock cmpxchg16b (%2)\n\t"
-      "jnz 1b"
-      : "=&a"(first), "=&d"(second)
-      : "r"(target), "b"(value.first), "c"(value.second)
-      : "cc");
-}
-#endif  // defined(__x86_64__)
-
-}  // namespace art
-
-#endif  // ART_RUNTIME_BASE_ATOMIC_PAIR_H_
diff --git a/runtime/interpreter/interpreter_cache-inl.h b/runtime/interpreter/interpreter_cache-inl.h
deleted file mode 100644
index 249df23b27..0000000000
--- a/runtime/interpreter/interpreter_cache-inl.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) 2022 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
-#define ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
-
-#include "interpreter_cache.h"
-#include "thread.h"
-
-namespace art {
-
-template<bool kSkipThreadLocal>
-inline bool InterpreterCache::Get(Thread* self, const void* dex_instr, /* out */ size_t* value) {
-  DCHECK(self->GetInterpreterCache() == this) << "Must be called from owning thread";
-  size_t key = reinterpret_cast<size_t>(dex_instr);
-  Entry& local_entry = thread_local_array_[IndexOf<kThreadLocalSize>(key)];
-  if (kSkipThreadLocal) {
-    DCHECK_NE(local_entry.first, key) << "Expected cache miss";
-  } else {
-    if (LIKELY(local_entry.first == key)) {
-      *value = local_entry.second;
-      return true;
-    }
-  }
-  Entry shared_entry = AtomicPairLoadAcquire(&shared_array_[IndexOf<kSharedSize>(key)]);
-  if (LIKELY(shared_entry.first == key)) {
-    // For simplicity, only update the cache if weak ref accesses are enabled. If
-    // they are disabled, this means the GC is processing the cache, and is
-    // reading it concurrently.
-    if (self->GetWeakRefAccessEnabled()) {
-      local_entry = shared_entry;  // Copy to local array to make future lookup faster.
-    }
-    *value = shared_entry.second;
-    return true;
-  }
-  return false;
-}
-
-inline void InterpreterCache::Set(Thread* self, const void* dex_instr, size_t value) {
-  DCHECK(self->GetInterpreterCache() == this) << "Must be called from owning thread";
-
-  // For simplicity, only update the cache if weak ref accesses are enabled. If
-  // they are disabled, this means the GC is processing the cache, and is
-  // reading it concurrently.
-  if (self->GetWeakRefAccessEnabled()) {
-    size_t key = reinterpret_cast<size_t>(dex_instr);
-    thread_local_array_[IndexOf<kThreadLocalSize>(key)] = {key, value};
-    AtomicPairStoreRelease(&shared_array_[IndexOf<kSharedSize>(key)], {key, value});
-  }
-}
-
-}  // namespace art
-
-#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_CACHE_INL_H_
diff --git a/runtime/interpreter/interpreter_cache.cc b/runtime/interpreter/interpreter_cache.cc
index 1a35c038e9..e43fe318cc 100644
--- a/runtime/interpreter/interpreter_cache.cc
+++ b/runtime/interpreter/interpreter_cache.cc
@@ -19,30 +19,14 @@
 namespace art {
 
-std::array<std::atomic<InterpreterCache::Entry>,
-           InterpreterCache::kSharedSize> InterpreterCache::shared_array_;
-
-InterpreterCache::InterpreterCache() {
-  // We can not use the ClearThreadLocal method since the constructor will not
-  // be called from the owning thread.
-  thread_local_array_.fill(Entry{});
-}
-
-void InterpreterCache::ClearThreadLocal(Thread* owning_thread) {
-  // Must be called from the owning thread or when the owning thread is suspended.
+void InterpreterCache::Clear(Thread* owning_thread) {
   DCHECK(owning_thread->GetInterpreterCache() == this);
   DCHECK(owning_thread == Thread::Current() || owning_thread->IsSuspended());
-
-  thread_local_array_.fill(Entry{});
+  data_.fill(Entry{});
 }
 
-void InterpreterCache::ClearShared() {
-  // Can be called from any thread since the writes are atomic.
-  // The static shared cache isn't bound to specific thread in the first place.
-
-  for (std::atomic<Entry>& entry : shared_array_) {
-    AtomicPairStoreRelease(&entry, Entry{});
-  }
+bool InterpreterCache::IsCalledFromOwningThread() {
+  return Thread::Current()->GetInterpreterCache() == this;
 }
 
 }  // namespace art
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h
index af025cecbd..0ada562438 100644
--- a/runtime/interpreter/interpreter_cache.h
+++ b/runtime/interpreter/interpreter_cache.h
@@ -20,22 +20,17 @@
 #include <array>
 #include <atomic>
 
-#include "base/atomic_pair.h"
 #include "base/bit_utils.h"
 #include "base/macros.h"
 
 namespace art {
 
-class Instruction;
 class Thread;
 
 // Small fast thread-local cache for the interpreter.
-//
-// The key is an absolute pointer to a dex instruction.
-//
-// The value depends on the opcode of the dex instruction.
+// It can hold arbitrary pointer-sized key-value pair.
+// The interpretation of the value depends on the key.
 // Presence of entry might imply some pre-conditions.
-//
 // All operations must be done from the owning thread,
 // or at a point when the owning thread is suspended.
 //
@@ -51,61 +46,52 @@ class Thread;
 // from assembly (it ensures that the offset is valid immediate value).
 class ALIGNED(16) InterpreterCache {
  public:
-  using Entry = AtomicPair<size_t>;
-
-  static constexpr size_t kThreadLocalSize = 256;  // Value of 256 has around 75% cache hit rate.
-  static constexpr size_t kSharedSize = 16 * 1024;  // Value of 16k has around 90% cache hit rate.
-  static constexpr size_t kHashShift = 2;  // Number of tailing dex pc bits to drop.
-
-  InterpreterCache();
-
-  void ClearThreadLocal(Thread* owning_thread);
+  // Aligned since we load the whole entry in single assembly instruction.
+  typedef std::pair<const void*, size_t> Entry ALIGNED(2 * sizeof(size_t));
 
-  static void ClearShared();
+  // 2x size increase/decrease corresponds to ~0.5% interpreter performance change.
+  // Value of 256 has around 75% cache hit rate.
+  static constexpr size_t kSize = 256;
 
-  template<bool kSkipThreadLocal = false>
-  ALWAYS_INLINE bool Get(Thread* self, const void* dex_instruction, /* out */ size_t* value);
+  InterpreterCache() {
+    // We can not use the Clear() method since the constructor will not
+    // be called from the owning thread.
+    data_.fill(Entry{});
+  }
 
-  ALWAYS_INLINE void Set(Thread* self, const void* dex_instruction, size_t value);
+  // Clear the whole cache. It requires the owning thread for DCHECKs.
+  void Clear(Thread* owning_thread);
 
-  template<typename Callback>
-  void ForEachTheadLocalEntry(Callback&& callback) {
-    for (Entry& entry : thread_local_array_) {
-      callback(reinterpret_cast<const Instruction*>(entry.first), entry.second);
+  ALWAYS_INLINE bool Get(const void* key, /* out */ size_t* value) {
+    DCHECK(IsCalledFromOwningThread());
+    Entry& entry = data_[IndexOf(key)];
+    if (LIKELY(entry.first == key)) {
+      *value = entry.second;
+      return true;
     }
+    return false;
   }
 
-  template<typename Callback>
-  static void ForEachSharedEntry(Callback&& callback) {
-    for (std::atomic<Entry>& atomic_entry : shared_array_) {
-      Entry old_entry = AtomicPairLoadAcquire(&atomic_entry);
-      Entry new_entry = old_entry;
-      callback(reinterpret_cast<const Instruction*>(new_entry.first), new_entry.second);
-      if (old_entry.second != new_entry.second) {
-        AtomicPairStoreRelease(&atomic_entry, new_entry);
-      }
-    }
+  ALWAYS_INLINE void Set(const void* key, size_t value) {
+    DCHECK(IsCalledFromOwningThread());
+    data_[IndexOf(key)] = Entry{key, value};
+  }
+
+  std::array<Entry, kSize>& GetArray() {
+    return data_;
   }
 
  private:
-  template<size_t kSize>
-  static ALWAYS_INLINE size_t IndexOf(size_t key) {
+  bool IsCalledFromOwningThread();
+
+  static ALWAYS_INLINE size_t IndexOf(const void* key) {
     static_assert(IsPowerOfTwo(kSize), "Size must be power of two");
-    size_t index = (key >> kHashShift) & (kSize - 1);
+    size_t index = (reinterpret_cast<uintptr_t>(key) >> 2) & (kSize - 1);
     DCHECK_LT(index, kSize);
     return index;
   }
 
-  // Small cache of fixed size which is always present for every thread.
-  // It is stored directly (without indrection) inside the Thread object.
-  // This makes it as fast as possible to access from assembly fast-path.
-  std::array<Entry, kThreadLocalSize> thread_local_array_;
-
-  // Larger cache which is shared by all threads.
-  // It is used as next cache level if lookup in the local array fails.
-  // It needs to be accessed using atomic operations, and is contended,
-  // but the sharing allows it to be larger then the per-thread cache.
-  static std::array<std::atomic<Entry>, kSharedSize> shared_array_;
+  std::array<Entry, kSize> data_;
 };
 
 }  // namespace art
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 4cbe81d0c2..1809227f2c 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -46,7 +46,6 @@
 #include "dex/dex_instruction-inl.h"
 #include "entrypoints/entrypoint_utils-inl.h"
 #include "handle_scope-inl.h"
-#include "interpreter_cache-inl.h"
 #include "interpreter_switch_impl.h"
 #include "jit/jit-inl.h"
 #include "mirror/call_site.h"
@@ -239,7 +238,7 @@ static ALWAYS_INLINE bool DoInvoke(Thread* self,
   InterpreterCache* tls_cache = self->GetInterpreterCache();
   size_t tls_value;
   ArtMethod* resolved_method;
-  if (!IsNterpSupported() && LIKELY(tls_cache->Get(self, inst, &tls_value))) {
+  if (!IsNterpSupported() && LIKELY(tls_cache->Get(inst, &tls_value))) {
     resolved_method = reinterpret_cast<ArtMethod*>(tls_value);
   } else {
     ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
@@ -253,7 +252,7 @@ static ALWAYS_INLINE bool DoInvoke(Thread* self,
       return false;
     }
     if (!IsNterpSupported()) {
-      tls_cache->Set(self, inst, reinterpret_cast<size_t>(resolved_method));
+      tls_cache->Set(inst, reinterpret_cast<size_t>(resolved_method));
     }
   }
diff --git a/runtime/interpreter/mterp/nterp.cc b/runtime/interpreter/mterp/nterp.cc
index d49260e56a..670ae1b0c2 100644
--- a/runtime/interpreter/mterp/nterp.cc
+++ b/runtime/interpreter/mterp/nterp.cc
@@ -24,7 +24,6 @@
 #include "dex/dex_instruction_utils.h"
 #include "debugger.h"
 #include "entrypoints/entrypoint_utils-inl.h"
-#include "interpreter/interpreter_cache-inl.h"
 #include "interpreter/interpreter_common.h"
 #include "interpreter/interpreter_intrinsics.h"
 #include "interpreter/shadow_frame-inl.h"
@@ -94,7 +93,12 @@ inline void UpdateHotness(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock
 template<typename T>
 inline void UpdateCache(Thread* self, uint16_t* dex_pc_ptr, T value) {
   DCHECK(kUseReadBarrier) << "Nterp only works with read barriers";
-  self->GetInterpreterCache()->Set(self, dex_pc_ptr, value);
+  // For simplicity, only update the cache if weak ref accesses are enabled. If
+  // they are disabled, this means the GC is processing the cache, and is
+  // reading it concurrently.
+  if (self->GetWeakRefAccessEnabled()) {
+    self->GetInterpreterCache()->Set(dex_pc_ptr, value);
+  }
 }
 
 template<typename T>
@@ -248,7 +252,7 @@ extern "C" const char* NterpGetShortyFromInvokeCustom(ArtMethod* caller, uint16_
 }
 
 FLATTEN
-extern "C" size_t NterpGetMethodSlowPath(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
+extern "C" size_t NterpGetMethod(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   UpdateHotness(caller);
   const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -406,16 +410,6 @@ extern "C" size_t NterpGetMethodSlowPath(Thread* self, ArtMethod* caller, uint16
   }
 }
 
-extern "C" size_t NterpGetMethod(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  InterpreterCache* cache = self->GetInterpreterCache();
-  size_t cached_value;
-  if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
-    return cached_value;
-  }
-  return NterpGetMethodSlowPath(self, caller, dex_pc_ptr);  // Tail call.
-}
-
 FLATTEN
 static ArtField* ResolveFieldWithAccessChecks(Thread* self,
                                               ClassLinker* class_linker,
@@ -465,10 +459,10 @@ static ArtField* ResolveFieldWithAccessChecks(Thread* self,
   return resolved_field;
 }
 
-extern "C" size_t NterpGetStaticFieldSlowPath(Thread* self,
-                                              ArtMethod* caller,
-                                              uint16_t* dex_pc_ptr,
-                                              size_t resolve_field_type)  // Resolve if not zero
+extern "C" size_t NterpGetStaticField(Thread* self,
+                                      ArtMethod* caller,
+                                      uint16_t* dex_pc_ptr,
+                                      size_t resolve_field_type)  // Resolve if not zero
     REQUIRES_SHARED(Locks::mutator_lock_) {
   UpdateHotness(caller);
   const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -508,25 +502,10 @@ extern "C" size_t NterpGetStaticFieldSlowPath(Thread* self,
   }
 }
 
-extern "C" size_t NterpGetStaticField(Thread* self,
-                                      ArtMethod* caller,
-                                      uint16_t* dex_pc_ptr,
-                                      size_t resolve_field_type)  // Resolve if not zero
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  InterpreterCache* cache = self->GetInterpreterCache();
-  size_t cached_value;
-  if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
-    return cached_value;
-  }
-  return NterpGetStaticFieldSlowPath(self, caller, dex_pc_ptr, resolve_field_type);
-}
-
-
-extern "C" uint32_t NterpGetInstanceFieldOffsetSlowPath(
-    Thread* self,
-    ArtMethod* caller,
-    uint16_t* dex_pc_ptr,
-    size_t resolve_field_type)  // Resolve if not zero
+extern "C" uint32_t NterpGetInstanceFieldOffset(Thread* self,
+                                                ArtMethod* caller,
+                                                uint16_t* dex_pc_ptr,
+                                                size_t resolve_field_type)  // Resolve if not zero
     REQUIRES_SHARED(Locks::mutator_lock_) {
   UpdateHotness(caller);
   const Instruction* inst = Instruction::At(dex_pc_ptr);
@@ -553,19 +532,6 @@ extern "C" uint32_t NterpGetInstanceFieldOffsetSlowPath(
   return resolved_field->GetOffset().Uint32Value();
 }
 
-extern "C" uint32_t NterpGetInstanceFieldOffset(Thread* self,
-                                                ArtMethod* caller,
-                                                uint16_t* dex_pc_ptr,
-                                                size_t resolve_field_type)  // Resolve if not zero
-    REQUIRES_SHARED(Locks::mutator_lock_) {
-  InterpreterCache* cache = self->GetInterpreterCache();
-  size_t cached_value;
-  if (LIKELY(cache->Get</*kSkipThreadLocal=*/true>(self, dex_pc_ptr, &cached_value))) {
-    return cached_value;
-  }
-  return NterpGetInstanceFieldOffsetSlowPath(self, caller, dex_pc_ptr, resolve_field_type);
-}
-
 extern "C" mirror::Object* NterpGetClassOrAllocateObject(Thread* self,
                                                          ArtMethod* caller,
                                                          uint16_t* dex_pc_ptr)
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 8a1ed71197..31f2bd2d7b 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -357,18 +357,32 @@ inline void DexCache::SetResolvedMethod(uint32_t method_idx, ArtMethod* method)
 template <typename T>
 NativeDexCachePair<T> DexCache::GetNativePair(std::atomic<NativeDexCachePair<T>>* pair_array,
                                               size_t idx) {
-  auto* array = reinterpret_cast<std::atomic<AtomicPair<size_t>>*>(pair_array);
-  AtomicPair<size_t> value = AtomicPairLoadAcquire(&array[idx]);
-  return NativeDexCachePair<T>(reinterpret_cast<T*>(value.first), value.second);
+  if (kRuntimePointerSize == PointerSize::k64) {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair64>*>(pair_array);
+    ConversionPair64 value = AtomicLoadRelaxed16B(&array[idx]);
+    return NativeDexCachePair<T>(reinterpret_cast64<T*>(value.first),
+                                 dchecked_integral_cast<size_t>(value.second));
+  } else {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair32>*>(pair_array);
+    ConversionPair32 value = array[idx].load(std::memory_order_relaxed);
+    return NativeDexCachePair<T>(reinterpret_cast32<T*>(value.first), value.second);
+  }
 }
 
 template <typename T>
 void DexCache::SetNativePair(std::atomic<NativeDexCachePair<T>>* pair_array,
                              size_t idx,
                              NativeDexCachePair<T> pair) {
-  auto* array = reinterpret_cast<std::atomic<AtomicPair<size_t>>*>(pair_array);
-  AtomicPair<size_t> v(reinterpret_cast<size_t>(pair.object), pair.index);
-  AtomicPairStoreRelease(&array[idx], v);
+  if (kRuntimePointerSize == PointerSize::k64) {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair64>*>(pair_array);
+    ConversionPair64 v(reinterpret_cast64<uint64_t>(pair.object), pair.index);
+    AtomicStoreRelease16B(&array[idx], v);
+  } else {
+    auto* array = reinterpret_cast<std::atomic<ConversionPair32>*>(pair_array);
+    ConversionPair32 v(reinterpret_cast32<uint32_t>(pair.object),
+                       dchecked_integral_cast<uint32_t>(pair.index));
+    array[idx].store(v, std::memory_order_release);
+  }
 }
 
 template <typename T,
diff --git a/runtime/mirror/dex_cache.cc b/runtime/mirror/dex_cache.cc
index b7f8ee7a07..c80f9dfe2f 100644
--- a/runtime/mirror/dex_cache.cc
+++ b/runtime/mirror/dex_cache.cc
@@ -126,5 +126,23 @@ ObjPtr<ClassLoader> DexCache::GetClassLoader() {
   return GetFieldObject<ClassLoader>(OFFSET_OF_OBJECT_MEMBER(DexCache, class_loader_));
 }
 
+#if !defined(__aarch64__) && !defined(__x86_64__)
+static pthread_mutex_t dex_cache_slow_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+DexCache::ConversionPair64 DexCache::AtomicLoadRelaxed16B(std::atomic<ConversionPair64>* target) {
+  pthread_mutex_lock(&dex_cache_slow_atomic_mutex);
+  DexCache::ConversionPair64 value = *reinterpret_cast<ConversionPair64*>(target);
+  pthread_mutex_unlock(&dex_cache_slow_atomic_mutex);
+  return value;
+}
+
+void DexCache::AtomicStoreRelease16B(std::atomic<ConversionPair64>* target,
+                                     ConversionPair64 value) {
+  pthread_mutex_lock(&dex_cache_slow_atomic_mutex);
+  *reinterpret_cast<ConversionPair64*>(target) = value;
+  pthread_mutex_unlock(&dex_cache_slow_atomic_mutex);
+}
+#endif
+
 }  // namespace mirror
 }  // namespace art
diff --git a/runtime/mirror/dex_cache.h b/runtime/mirror/dex_cache.h
index 19197095aa..26fc520cd7 100644
--- a/runtime/mirror/dex_cache.h
+++ b/runtime/mirror/dex_cache.h
@@ -149,7 +149,7 @@ class MANAGED DexCache final : public Object {
                 "String dex cache size is not a power of 2.");
 
   // Size of field dex cache. Needs to be a power of 2 for entrypoint assumptions to hold.
-  static constexpr size_t kDexCacheFieldCacheSize = 512;
+  static constexpr size_t kDexCacheFieldCacheSize = 1024;
   static_assert(IsPowerOfTwo(kDexCacheFieldCacheSize),
                 "Field dex cache size is not a power of 2.");
 
@@ -448,6 +448,19 @@ class MANAGED DexCache final : public Object {
   T* AllocArray(MemberOffset obj_offset, MemberOffset num_offset, size_t num)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // std::pair<> is not trivially copyable and as such it is unsuitable for atomic operations,
+  // so we use a custom pair class for loading and storing the NativeDexCachePair<>.
+  template <typename IntType>
+  struct PACKED(2 * sizeof(IntType)) ConversionPair {
+    ConversionPair(IntType f, IntType s) : first(f), second(s) { }
+    ConversionPair(const ConversionPair&) = default;
+    ConversionPair& operator=(const ConversionPair&) = default;
+    IntType first;
+    IntType second;
+  };
+  using ConversionPair32 = ConversionPair<uint32_t>;
+  using ConversionPair64 = ConversionPair<uint64_t>;
+
   // Visit instance fields of the dex cache as well as its associated arrays.
   template <bool kVisitNativeRoots,
             VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
@@ -456,6 +469,48 @@ class MANAGED DexCache final : public Object {
   void VisitReferences(ObjPtr<Class> klass, const Visitor& visitor)
       REQUIRES_SHARED(Locks::mutator_lock_) REQUIRES(Locks::heap_bitmap_lock_);
 
+  // Due to lack of 16-byte atomics support, we use hand-crafted routines.
+#if defined(__aarch64__)
+  // 16-byte atomics are supported on aarch64.
+  ALWAYS_INLINE static ConversionPair64 AtomicLoadRelaxed16B(
+      std::atomic<ConversionPair64>* target) {
+    return target->load(std::memory_order_relaxed);
+  }
+
+  ALWAYS_INLINE static void AtomicStoreRelease16B(
+      std::atomic<ConversionPair64>* target, ConversionPair64 value) {
+    target->store(value, std::memory_order_release);
+  }
+#elif defined(__x86_64__)
+  ALWAYS_INLINE static ConversionPair64 AtomicLoadRelaxed16B(
+      std::atomic<ConversionPair64>* target) {
+    uint64_t first, second;
+    __asm__ __volatile__(
+        "lock cmpxchg16b (%2)"
+        : "=&a"(first), "=&d"(second)
+        : "r"(target), "a"(0), "d"(0), "b"(0), "c"(0)
+        : "cc");
+    return ConversionPair64(first, second);
+  }
+
+  ALWAYS_INLINE static void AtomicStoreRelease16B(
+      std::atomic<ConversionPair64>* target, ConversionPair64 value) {
+    uint64_t first, second;
+    __asm__ __volatile__ (
+        "movq (%2), %%rax\n\t"
+        "movq 8(%2), %%rdx\n\t"
+        "1:\n\t"
+        "lock cmpxchg16b (%2)\n\t"
+        "jnz 1b"
+        : "=&a"(first), "=&d"(second)
+        : "r"(target), "b"(value.first), "c"(value.second)
+        : "cc");
+  }
+#else
+  static ConversionPair64 AtomicLoadRelaxed16B(std::atomic<ConversionPair64>* target);
+  static void AtomicStoreRelease16B(std::atomic<ConversionPair64>* target, ConversionPair64 value);
+#endif
+
   HeapReference<ClassLoader> class_loader_;
   HeapReference<String> location_;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 2a3afa8c99..54e9d38b3c 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -739,7 +739,7 @@ void Runtime::SweepSystemWeaks(IsMarkedVisitor* visitor) {
     // from mutators. See b/32167580.
     GetJit()->GetCodeCache()->SweepRootTables(visitor);
   }
-  Thread::SweepInterpreterCaches(visitor);
+  thread_list_->SweepInterpreterCaches(visitor);
 
   // All other generic system-weak holders.
   for (gc::AbstractSystemWeakHolder* holder : system_weak_holders_) {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 2b3f47298e..25d493f0a5 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -4240,28 +4240,27 @@ void Thread::VisitRoots(RootVisitor* visitor) {
 }
 #pragma GCC diagnostic pop
 
-void Thread::SweepInterpreterCaches(IsMarkedVisitor* visitor) {
-  Thread* self = Thread::Current();
-  auto visit = [visitor, self](const Instruction* inst, size_t& value) {
-    Locks::mutator_lock_->AssertSharedHeld(self);
+void Thread::SweepInterpreterCache(IsMarkedVisitor* visitor) {
+  for (InterpreterCache::Entry& entry : GetInterpreterCache()->GetArray()) {
+    const Instruction* inst = reinterpret_cast<const Instruction*>(entry.first);
     if (inst != nullptr) {
      if (inst->Opcode() == Instruction::NEW_INSTANCE ||
          inst->Opcode() == Instruction::CHECK_CAST ||
          inst->Opcode() == Instruction::INSTANCE_OF ||
          inst->Opcode() == Instruction::NEW_ARRAY ||
          inst->Opcode() == Instruction::CONST_CLASS) {
-        mirror::Class* cls = reinterpret_cast<mirror::Class*>(value);
+        mirror::Class* cls = reinterpret_cast<mirror::Class*>(entry.second);
        if (cls == nullptr || cls == Runtime::GetWeakClassSentinel()) {
          // Entry got deleted in a previous sweep.
-          return;
+          continue;
        }
        Runtime::ProcessWeakClass(
-            reinterpret_cast<GcRoot<mirror::Class>*>(&value),
+            reinterpret_cast<GcRoot<mirror::Class>*>(&entry.second),
            visitor,
            Runtime::GetWeakClassSentinel());
      } else if (inst->Opcode() == Instruction::CONST_STRING ||
                 inst->Opcode() == Instruction::CONST_STRING_JUMBO) {
-        mirror::Object* object = reinterpret_cast<mirror::Object*>(value);
+        mirror::Object* object = reinterpret_cast<mirror::Object*>(entry.second);
        mirror::Object* new_object = visitor->IsMarked(object);
        // We know the string is marked because it's a strongly-interned string that
        // is always alive (see b/117621117 for trying to make those strings weak).
@@ -4269,16 +4268,11 @@ void Thread::SweepInterpreterCaches(IsMarkedVisitor* visitor) {
        // null for newly allocated objects, but we know those haven't moved. Therefore,
        // only update the entry if we get a different non-null string.
        if (new_object != nullptr && new_object != object) {
-          value = reinterpret_cast<size_t>(new_object);
+          entry.second = reinterpret_cast<size_t>(new_object);
        }
      }
    }
-  };
-  InterpreterCache::ForEachSharedEntry(visit);
-  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
-  Runtime::Current()->GetThreadList()->ForEach([&visit](Thread* thread) {
-    thread->GetInterpreterCache()->ForEachTheadLocalEntry(visit);
-  });
+  }
 }
 
 // FIXME: clang-r433403 reports the below function exceeds frame size limit.
@@ -4493,10 +4487,9 @@ void Thread::SetReadBarrierEntrypoints() {
 void Thread::ClearAllInterpreterCaches() {
   static struct ClearInterpreterCacheClosure : Closure {
     void Run(Thread* thread) override {
-      thread->GetInterpreterCache()->ClearThreadLocal(thread);
+      thread->GetInterpreterCache()->Clear(thread);
     }
   } closure;
-  InterpreterCache::ClearShared();
   Runtime::Current()->GetThreadList()->RunCheckpoint(&closure);
 }
diff --git a/runtime/thread.h b/runtime/thread.h
index ec971a564f..1085a563c4 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1348,6 +1348,10 @@ class Thread {
     return ThreadOffset<pointer_size>(OFFSETOF_MEMBER(Thread, interpreter_cache_));
   }
 
+  static constexpr int InterpreterCacheSizeLog2() {
+    return WhichPowerOf2(InterpreterCache::kSize);
+  }
+
   static constexpr uint32_t AllThreadFlags() {
     return enum_cast<uint32_t>(ThreadFlag::kLastFlag) |
            (enum_cast<uint32_t>(ThreadFlag::kLastFlag) - 1u);
@@ -1522,8 +1526,7 @@ class Thread {
   template <bool kPrecise>
   void VisitRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
 
-  static void SweepInterpreterCaches(IsMarkedVisitor* visitor)
-      REQUIRES_SHARED(Locks::mutator_lock_);
+  void SweepInterpreterCache(IsMarkedVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
 
   static bool IsAotCompiler();
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 6482e72417..4e3b40ba7a 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -1407,6 +1407,13 @@ void ThreadList::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) const {
   }
 }
 
+void ThreadList::SweepInterpreterCaches(IsMarkedVisitor* visitor) const {
+  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+  for (const auto& thread : list_) {
+    thread->SweepInterpreterCache(visitor);
+  }
+}
+
 void ThreadList::VisitReflectiveTargets(ReflectiveValueVisitor *visitor) const {
   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
   for (const auto& thread : list_) {
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 29b0c52186..f5b58a0c54 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -179,6 +179,10 @@ class ThreadList {
     return empty_checkpoint_barrier_.get();
   }
 
+  void SweepInterpreterCaches(IsMarkedVisitor* visitor) const
+      REQUIRES(!Locks::thread_list_lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   void WaitForOtherNonDaemonThreadsToExit(bool check_no_birth = true)
       REQUIRES(!Locks::thread_list_lock_, !Locks::thread_suspend_count_lock_,
                !Locks::mutator_lock_);
diff --git a/tools/cpp-define-generator/thread.def b/tools/cpp-define-generator/thread.def
index 07ca8841f8..ec8e28b977 100644
--- a/tools/cpp-define-generator/thread.def
+++ b/tools/cpp-define-generator/thread.def
@@ -30,12 +30,11 @@ ASM_DEFINE(THREAD_ID_OFFSET,
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_OFFSET,
            art::Thread::InterpreterCacheOffset<art::kRuntimePointerSize>().Int32Value())
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_LOG2,
-           art::WhichPowerOf2(art::InterpreterCache::kThreadLocalSize))
+           art::Thread::InterpreterCacheSizeLog2())
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_MASK,
-           (sizeof(art::InterpreterCache::Entry) * (art::InterpreterCache::kThreadLocalSize - 1)))
+           (sizeof(art::InterpreterCache::Entry) * (art::InterpreterCache::kSize - 1)))
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_SHIFT,
-           (art::WhichPowerOf2(sizeof(art::InterpreterCache::Entry)) -
-            art::InterpreterCache::kHashShift))
+           2)
 ASM_DEFINE(THREAD_IS_GC_MARKING_OFFSET,
            art::Thread::IsGcMarkingOffset<art::kRuntimePointerSize>().Int32Value())
 ASM_DEFINE(THREAD_LOCAL_ALLOC_STACK_END_OFFSET,
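The THREAD_INTERPRETER_CACHE_* constants restored in thread.def encode the same indexing scheme as InterpreterCache::IndexOf above: a slot is chosen by dropping the two low bits of the dex pc and wrapping to the 256-entry table. Below is a minimal standalone sketch (not ART code) of that arithmetic. It assumes a 64-bit target and assumes the assembly fast path addresses an entry at byte offset (dex_pc << SIZE_SHIFT) & SIZE_MASK; the local names kSizeShift, kSizeMask, and IndexOf merely mirror the definitions quoted in the diff.

```cpp
// Sketch: check that IndexOf(dex_pc) scaled by sizeof(Entry) equals the
// shift-and-mask form suggested by the restored assembly constants.
// Assumptions (not taken from the change itself): 64-bit target, and the
// fast path computing the byte offset as (dex_pc << kSizeShift) & kSizeMask.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <utility>

using Entry = std::pair<const void*, size_t>;  // 16 bytes on 64-bit targets.
static_assert(sizeof(void*) == 8, "sketch assumes a 64-bit target");

constexpr size_t kSize = 256;                              // InterpreterCache::kSize
constexpr size_t kSizeShift = 2;                           // THREAD_INTERPRETER_CACHE_SIZE_SHIFT
constexpr size_t kSizeMask = sizeof(Entry) * (kSize - 1);  // THREAD_INTERPRETER_CACHE_SIZE_MASK

// Mirrors InterpreterCache::IndexOf: drop the two low bits of the dex pc
// pointer, then wrap to the table size.
constexpr size_t IndexOf(uintptr_t key) { return (key >> 2) & (kSize - 1); }

int main() {
  // Dex instructions are 2-byte units, so step by 2.
  for (uintptr_t dex_pc = 0; dex_pc < (1u << 16); dex_pc += 2) {
    size_t offset_via_index = IndexOf(dex_pc) * sizeof(Entry);
    size_t offset_via_shift_and_mask = (dex_pc << kSizeShift) & kSizeMask;
    assert(offset_via_index == offset_via_shift_and_mask);
  }
  return 0;
}
```

With 16-byte entries, shifting the dex pc left by 2 is the same as computing the index and scaling it by sizeof(Entry), which is presumably why the shift can be hard-coded to 2 once the shared cache and its kHashShift constant are reverted.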