Use the small thread-local cache for mterp field accesses.
This reduces the overhead of non-quickened code from 10% to 7.5%.
(measured on golem benchmarks for arm64)
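
For context, a minimal standalone sketch of the lookup pattern this change
relies on: a small per-thread, direct-mapped cache keyed by the dex
instruction pointer, consulted before the regular field-resolution path.
The class name, size, hash, and the ResolveFieldOffsetSlow() placeholder
below are assumptions for illustration only, not the actual
InterpreterCache implementation:

  #include <array>
  #include <cstddef>
  #include <cstdint>

  // Direct-mapped cache: each key hashes to exactly one slot, and a new
  // entry simply overwrites whatever occupied that slot before.
  class TinyInstructionCache {
   public:
    // Returns true and writes the cached value if the key hits its slot.
    bool Get(const void* key, size_t* value) const {
      const Entry& entry = entries_[IndexOf(key)];
      if (entry.key == key) {
        *value = entry.value;
        return true;
      }
      return false;
    }

    void Set(const void* key, size_t value) {
      entries_[IndexOf(key)] = Entry{key, value};
    }

   private:
    struct Entry {
      const void* key = nullptr;
      size_t value = 0;
    };
    static constexpr size_t kSize = 256;  // Assumed; must be a power of two.

    static size_t IndexOf(const void* key) {
      // Dex instructions are 2-byte aligned, so drop the low bit before masking.
      return (reinterpret_cast<uintptr_t>(key) >> 1) & (kSize - 1);
    }

    std::array<Entry, kSize> entries_{};
  };

  // One cache per thread, keyed by the dex instruction pointer, so no
  // locking is needed: all accesses come from the owning thread.
  thread_local TinyInstructionCache tls_cache;

  // Placeholder for the slow path (field resolution); assumed for the sketch.
  size_t ResolveFieldOffsetSlow(const void* inst) { return 0; }

  size_t GetFieldOffset(const void* inst) {
    size_t offset;
    if (tls_cache.Get(inst, &offset)) {
      return offset;                        // Hit: skip field resolution.
    }
    offset = ResolveFieldOffsetSlow(inst);  // Miss: resolve the field...
    tls_cache.Set(inst, offset);            // ...and remember it for next time.
    return offset;
  }

On a miss the regular resolution path runs and then populates the cache,
which is how MterpFieldAccessFast in the diff below is structured.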
Test: ./art/test.py -b -r --interpreter
Change-Id: Icce9183eb60c62ac30a0c6ff57e32c796c807f03
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h
index c25222e..b4966fd 100644
--- a/runtime/interpreter/interpreter_cache.h
+++ b/runtime/interpreter/interpreter_cache.h
@@ -35,6 +35,10 @@
// All operations must be done from the owning thread,
// or at a point when the owning thread is suspended.
//
+// The values currently stored in the cache are:
+// iget/iput: The field offset. The field must be non-volatile.
+// sget/sput: The ArtField* pointer. The field must be non-volatile.
+//
// Aligned to 16-bytes to make it easier to get the address of the cache
// from assembly (it ensures that the offset is valid immediate value).
class ALIGNED(16) InterpreterCache {
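
A hedged, self-contained illustration of the value encoding described in the
comment above; MockArtField and the numbers are made up for the example and
do not correspond to ART types beyond the analogy:

  #include <cassert>
  #include <cstddef>
  #include <cstdint>

  // Stands in for art::ArtField in this sketch only.
  struct MockArtField {
    uint32_t offset;  // Byte offset of the field within the object/class.
  };

  // sget/sput entries cache the field pointer (the declaring class is also
  // needed); iget/iput entries cache just the raw offset.
  uint32_t DecodeCachedOffset(size_t cached_value, bool is_static_opcode) {
    return is_static_opcode
        ? reinterpret_cast<MockArtField*>(cached_value)->offset
        : static_cast<uint32_t>(cached_value);
  }

  int main() {
    MockArtField field{16};
    assert(DecodeCachedOffset(reinterpret_cast<size_t>(&field), /*is_static=*/ true) == 16);
    assert(DecodeCachedOffset(8, /*is_static=*/ false) == 8);
    return 0;
  }

The real consumer of these values is MterpFieldAccessFast in the mterp.cc
hunk below.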
diff --git a/runtime/interpreter/mterp/mterp.cc b/runtime/interpreter/mterp/mterp.cc
index a7423c8..fbc96f7 100644
--- a/runtime/interpreter/mterp/mterp.cc
+++ b/runtime/interpreter/mterp/mterp.cc
@@ -748,6 +748,10 @@
return true;
}
+// This method is called from assembly to handle field access instructions.
+//
+// This method is fairly hot. It is long, but it has been carefully optimized.
+// It contains only fully inlined methods -> no spills -> no prologue/epilogue.
template<typename PrimType, FindFieldType kAccessType>
ALWAYS_INLINE bool MterpFieldAccessFast(Instruction* inst,
uint16_t inst_data,
@@ -756,8 +760,32 @@
REQUIRES_SHARED(Locks::mutator_lock_) {
constexpr bool kIsStatic = (kAccessType & FindFieldFlags::StaticBit) != 0;
+ // Try to find the field in the small thread-local cache first.
+ InterpreterCache* tls_cache = self->GetInterpreterCache();
+ size_t tls_value;
+ if (LIKELY(tls_cache->Get(inst, &tls_value))) {
+ // The meaning of the cache value is opcode-specific.
+ // It is the ArtField* for static fields and the raw offset for instance fields.
+ size_t offset = kIsStatic
+ ? reinterpret_cast<ArtField*>(tls_value)->GetOffset().SizeValue()
+ : tls_value;
+ if (kIsDebugBuild) {
+ uint32_t field_idx = kIsStatic ? inst->VRegB_21c() : inst->VRegC_22c();
+ ArtField* field = FindFieldFromCode<kAccessType, /* access_checks */ false>(
+ field_idx, shadow_frame->GetMethod(), self, sizeof(PrimType));
+ DCHECK_EQ(offset, field->GetOffset().SizeValue());
+ }
+ ObjPtr<mirror::Object> obj = kIsStatic
+ ? reinterpret_cast<ArtField*>(tls_value)->GetDeclaringClass()
+ : MakeObjPtr(shadow_frame->GetVRegReference(inst->VRegB_22c(inst_data)));
+ if (LIKELY(obj != nullptr)) {
+ MterpFieldAccess<PrimType, kAccessType>(
+ inst, inst_data, shadow_frame, obj, MemberOffset(offset), /* is_volatile */ false);
+ return true;
+ }
+ }
+
// This effectively inlines the fast path from ArtMethod::GetDexCache.
- // It avoids non-inlined call which in turn allows elimination of the prologue and epilogue.
ArtMethod* referrer = shadow_frame->GetMethod();
if (LIKELY(!referrer->IsObsolete())) {
// Avoid read barriers, since we need only the pointer to the native (non-movable)
@@ -777,6 +805,14 @@
? field->GetDeclaringClass().Ptr()
: shadow_frame->GetVRegReference(inst->VRegB_22c(inst_data));
if (LIKELY(kIsStatic || obj != nullptr)) {
+ // Only non-volatile fields are allowed in the thread-local cache.
+ if (LIKELY(!field->IsVolatile())) {
+ if (kIsStatic) {
+ tls_cache->Set(inst, reinterpret_cast<uintptr_t>(field));
+ } else {
+ tls_cache->Set(inst, field->GetOffset().SizeValue());
+ }
+ }
MterpFieldAccess<PrimType, kAccessType>(
inst, inst_data, shadow_frame, obj, field->GetOffset(), field->IsVolatile());
return true;