Use the small thread-local cache for mterp field accesses.

This reduces the overhead of non-quickened code from 10% to 7.5%.
(measured on golem benchmarks for arm64)
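
For reference, a simplified sketch of the intended cache protocol (not
the actual mterp code below; 'field' stands for the ArtField* resolved
on the slow path, kIsStatic distinguishes sget/sput from iget/iput,
and only non-volatile fields are cached):

  InterpreterCache* cache = self->GetInterpreterCache();
  size_t value;
  if (cache->Get(inst, &value)) {
    // Cache hit: 'value' is the field offset for iget/iput,
    // or the ArtField* pointer for sget/sput.
  } else if (!field->IsVolatile()) {
    // Cache miss: after resolving 'field' the slow way, remember it.
    cache->Set(inst, kIsStatic
                         ? reinterpret_cast<uintptr_t>(field)
                         : field->GetOffset().SizeValue());
  }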

Test: ./art/test.py -b -r --interpreter
Change-Id: Icce9183eb60c62ac30a0c6ff57e32c796c807f03
diff --git a/runtime/interpreter/interpreter_cache.h b/runtime/interpreter/interpreter_cache.h
index c25222e..b4966fd 100644
--- a/runtime/interpreter/interpreter_cache.h
+++ b/runtime/interpreter/interpreter_cache.h
@@ -35,6 +35,10 @@
 // All operations must be done from the owning thread,
 // or at a point when the owning thread is suspended.
 //
+// The values currently stored in the cache for these opcodes are:
+//   iget/iput: The field offset. The field must be non-volatile.
+//   sget/sput: The ArtField* pointer. The field must be non-volatile.
+//
 // Aligned to 16-bytes to make it easier to get the address of the cache
 // from assembly (it ensures that the offset is valid immediate value).
 class ALIGNED(16) InterpreterCache {
diff --git a/runtime/interpreter/mterp/mterp.cc b/runtime/interpreter/mterp/mterp.cc
index a7423c8..fbc96f7 100644
--- a/runtime/interpreter/mterp/mterp.cc
+++ b/runtime/interpreter/mterp/mterp.cc
@@ -748,6 +748,10 @@
   return true;
 }
 
+// This method is called from assembly to handle field access instructions.
+//
+// This method is fairly hot.  It is long, but it has been carefully optimized.
+// It contains only fully inlined methods -> no spills -> no prologue/epilogue.
 template<typename PrimType, FindFieldType kAccessType>
 ALWAYS_INLINE bool MterpFieldAccessFast(Instruction* inst,
                                         uint16_t inst_data,
@@ -756,8 +760,32 @@
     REQUIRES_SHARED(Locks::mutator_lock_) {
   constexpr bool kIsStatic = (kAccessType & FindFieldFlags::StaticBit) != 0;
 
+  // Try to find the field in the small thread-local cache first.
+  InterpreterCache* tls_cache = self->GetInterpreterCache();
+  size_t tls_value;
+  if (LIKELY(tls_cache->Get(inst, &tls_value))) {
+    // The meaning of the cache value is opcode-specific.
+    // It is the ArtField* for static fields and the raw field offset for instance fields.
+    size_t offset = kIsStatic
+        ? reinterpret_cast<ArtField*>(tls_value)->GetOffset().SizeValue()
+        : tls_value;
+    if (kIsDebugBuild) {
+      uint32_t field_idx = kIsStatic ? inst->VRegB_21c() : inst->VRegC_22c();
+      ArtField* field = FindFieldFromCode<kAccessType, /* access_checks */ false>(
+          field_idx, shadow_frame->GetMethod(), self, sizeof(PrimType));
+      DCHECK_EQ(offset, field->GetOffset().SizeValue());
+    }
+    ObjPtr<mirror::Object> obj = kIsStatic
+        ? reinterpret_cast<ArtField*>(tls_value)->GetDeclaringClass()
+        : MakeObjPtr(shadow_frame->GetVRegReference(inst->VRegB_22c(inst_data)));
+    if (LIKELY(obj != nullptr)) {
+      MterpFieldAccess<PrimType, kAccessType>(
+          inst, inst_data, shadow_frame, obj, MemberOffset(offset), /* is_volatile */ false);
+      return true;
+    }
+  }
+
   // This effectively inlines the fast path from ArtMethod::GetDexCache.
-  // It avoids non-inlined call which in turn allows elimination of the prologue and epilogue.
   ArtMethod* referrer = shadow_frame->GetMethod();
   if (LIKELY(!referrer->IsObsolete())) {
     // Avoid read barriers, since we need only the pointer to the native (non-movable)
@@ -777,6 +805,14 @@
             ? field->GetDeclaringClass().Ptr()
             : shadow_frame->GetVRegReference(inst->VRegB_22c(inst_data));
         if (LIKELY(kIsStatic || obj != nullptr)) {
+          // Only non-volatile fields are allowed in the thread-local cache.
+          if (LIKELY(!field->IsVolatile())) {
+            if (kIsStatic) {
+              tls_cache->Set(inst, reinterpret_cast<uintptr_t>(field));
+            } else {
+              tls_cache->Set(inst, field->GetOffset().SizeValue());
+            }
+          }
           MterpFieldAccess<PrimType, kAccessType>(
               inst, inst_data, shadow_frame, obj, field->GetOffset(), field->IsVolatile());
           return true;