Add an implementation of Nterp for x64.

Enable it on x64 when the runtime and ArtMethod requirements are met
(see nterp.cc).

Test: test.py
Bug: 112676029
Change-Id: I772cd20a20fdc0ff99529df7495801d773091584
diff --git a/runtime/Android.bp b/runtime/Android.bp
index baa921d..e09d828 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -344,8 +344,9 @@
                 // Note that the fault_handler_x86.cc is not a mistake.  This file is
                 // shared between the x86 and x86_64 architectures.
                 "interpreter/mterp/mterp.cc",
-                "interpreter/mterp/nterp_stub.cc",
+                "interpreter/mterp/nterp.cc",
                 ":libart_mterp.x86_64",
+                ":libart_mterp.x86_64ng",
                 "arch/x86_64/context_x86_64.cc",
                 "arch/x86_64/entrypoints_init_x86_64.cc",
                 "arch/x86_64/jni_entrypoints_x86_64.S",
@@ -806,3 +807,16 @@
     ],
     cmd: "$(location interpreter/mterp/gen_mterp.py) $(out) $(in)",
 }
+
+genrule {
+    name: "libart_mterp.x86_64ng",
+    out: ["mterp_x86_64ng.S"],
+    srcs: ["interpreter/mterp/x86_64ng/*.S",
+           "interpreter/mterp/x86_64/arithmetic.S",
+           "interpreter/mterp/x86_64/floating_point.S"],
+    tool_files: [
+        "interpreter/mterp/gen_mterp.py",
+        "interpreter/mterp/common/gen_setup.py",
+    ],
+    cmd: "$(location interpreter/mterp/gen_mterp.py) $(out) $(in)",
+}
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index 596e468..2b50cdb 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -82,6 +82,7 @@
     // The restored CFA state should match the CFA state during CFI_REMEMBER_STATE.
     // `objdump -Wf libart.so | egrep "_cfa|_state"` is useful to audit the opcodes.
     #define CFI_RESTORE_STATE_AND_DEF_CFA(reg,off) .cfi_restore_state .cfi_def_cfa reg,off
+    #define CFI_RESTORE_STATE .cfi_restore_state
 #else
    // Mac OS doesn't like cfi_* directives.
     #define CFI_STARTPROC
@@ -93,6 +94,7 @@
     #define CFI_REL_OFFSET(reg,size)
     #define CFI_REMEMBER_STATE
    #define CFI_RESTORE_STATE_AND_DEF_CFA(reg,off)
+    #define CFI_RESTORE_STATE
 #endif
 
     // Symbols.
diff --git a/runtime/arch/x86_64/context_x86_64.h b/runtime/arch/x86_64/context_x86_64.h
index ab38614..1e2658c 100644
--- a/runtime/arch/x86_64/context_x86_64.h
+++ b/runtime/arch/x86_64/context_x86_64.h
@@ -45,6 +45,10 @@
     rip_ = new_pc;
   }
 
+  void SetNterpDexPC(uintptr_t dex_pc_ptr) override {
+    SetGPR(R12, dex_pc_ptr);
+  }
+
   void SetArg0(uintptr_t new_arg0_value) override {
     SetGPR(RDI, new_arg0_value);
   }
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index a0a2365..ffa772e 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -143,6 +143,8 @@
 #include "verifier/class_verifier.h"
 #include "well_known_classes.h"
 
+#include "interpreter/mterp/mterp.h"
+
 namespace art {
 
 using android::base::StringPrintf;
@@ -224,16 +226,25 @@
 // Ensures that methods have the kAccSkipAccessChecks bit set. We use the
 // kAccVerificationAttempted bit on the class access flags to determine whether this has been done
 // before.
-template <bool kNeedsVerified = false>
 static void EnsureSkipAccessChecksMethods(Handle<mirror::Class> klass, PointerSize pointer_size)
     REQUIRES_SHARED(Locks::mutator_lock_) {
-  if (kNeedsVerified) {
-    // To not fail access-flags access checks, push a minimal state.
-    mirror::Class::SetStatus(klass, ClassStatus::kVerified, Thread::Current());
-  }
+  Runtime* runtime = Runtime::Current();
+  ClassLinker* class_linker = runtime->GetClassLinker();
   if (!klass->WasVerificationAttempted()) {
     klass->SetSkipAccessChecksFlagOnAllMethods(pointer_size);
     klass->SetVerificationAttempted();
+    // Now that the class has passed verification, try to set nterp entrypoints
+    // to methods that currently use the switch interpreter.
+    if (interpreter::CanRuntimeUseNterp()) {
+      for (ArtMethod& m : klass->GetMethods(pointer_size)) {
+        if (class_linker->IsQuickToInterpreterBridge(m.GetEntryPointFromQuickCompiledCode()) &&
+            interpreter::CanMethodUseNterp(&m)) {
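+          // Methods that still need a clinit check only get the nterp
+          // entrypoint once their class is visibly initialized (presumably
+          // because the nterp entry does not perform that check itself).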
+          if (klass->IsVisiblyInitialized() || !NeedsClinitCheckBeforeCall(&m)) {
+            runtime->GetInstrumentation()->UpdateMethodsCode(&m, interpreter::GetNterpEntryPoint());
+          }
+        }
+      }
+    }
   }
 }
 
@@ -3681,6 +3692,11 @@
     // No code and native? Use generic trampoline.
     return GetQuickGenericJniStub();
   }
+
+  if (interpreter::CanRuntimeUseNterp() && interpreter::CanMethodUseNterp(method)) {
+    return interpreter::GetNterpEntryPoint();
+  }
+
   return GetQuickToInterpreterBridge();
 }
 
@@ -3778,27 +3794,41 @@
       // Only update static methods.
       continue;
     }
-    if (!IsQuickResolutionStub(method->GetEntryPointFromQuickCompiledCode())) {
-      // Only update methods whose entrypoint is the resolution stub.
-      continue;
-    }
     const void* quick_code = nullptr;
+
+    // In order:
+    // 1) Check if we have AOT code.
+    // 2) Check if we have JIT code.
+    // 3) Check if we can use nterp.
     if (has_oat_class) {
       OatFile::OatMethod oat_method = oat_class.GetOatMethod(method_index);
       quick_code = oat_method.GetQuickCode();
     }
-    // Check if we have JIT compiled code for it.
+
     jit::Jit* jit = runtime->GetJit();
     if (quick_code == nullptr && jit != nullptr) {
       quick_code = jit->GetCodeCache()->GetSavedEntryPointOfPreCompiledMethod(method);
     }
+
+    if (quick_code == nullptr &&
+        interpreter::CanRuntimeUseNterp() &&
+        interpreter::CanMethodUseNterp(method)) {
+      quick_code = interpreter::GetNterpEntryPoint();
+    }
+
     // Check whether the method is native, in which case it's generic JNI.
     if (quick_code == nullptr && method->IsNative()) {
       quick_code = GetQuickGenericJniStub();
     } else if (ShouldUseInterpreterEntrypoint(method, quick_code)) {
       // Use interpreter entry point.
+      if (IsQuickToInterpreterBridge(method->GetEntryPointFromQuickCompiledCode())) {
+        // The entrypoint is already the interpreter bridge, so there is no
+        // need to update; this avoids dirtying boot image memory.
+        continue;
+      }
       quick_code = GetQuickToInterpreterBridge();
     }
+    CHECK(quick_code != nullptr);
     runtime->GetInstrumentation()->UpdateMethodsCode(method, quick_code);
   }
   // Ignore virtual methods on the iterator.
diff --git a/runtime/interpreter/cfi_asm_support.h b/runtime/interpreter/cfi_asm_support.h
index c1e5fb5..04812e1 100644
--- a/runtime/interpreter/cfi_asm_support.h
+++ b/runtime/interpreter/cfi_asm_support.h
@@ -44,9 +44,16 @@
     0x0c /* DW_OP_const4u */, 0x44, 0x45, 0x58, 0x31, /* magic = "DEX1" */     \
     0x13 /* DW_OP_drop */,                                                     \
     0x92 /* DW_OP_bregx */, dexReg, (dexOffset & 0x7F) /* 1-byte SLEB128 */
+
+  #define CFI_DEFINE_CFA_DEREF(reg, offset, size) .cfi_escape                 \
+    0x0f /* DW_CFA_def_cfa_expression */, 6 /* expression length */,          \
+    0x92 /* DW_OP_bregx */, reg, (offset & 0x7F) /* 1-byte SLEB128 */,        \
+    0x06 /* DW_OP_deref */,                                                   \
+    0x23 /* DW_OP_plus_uconst */, size
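+  // This encodes CFA := *(reg + offset) + size: the unwinder dereferences
+  // (reg + offset) to recover a saved pointer, then adds size.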
 #else
   // Mac OS doesn't like cfi_* directives.
   #define CFI_DEFINE_DEX_PC_WITH_OFFSET(tmpReg, dexReg, dexOffset)
+  #define CFI_DEFINE_CFA_DEREF(reg, offset, size)
 #endif
 
 #endif  // ART_RUNTIME_INTERPRETER_CFI_ASM_SUPPORT_H_
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 9fccde9..d7fee02 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -245,13 +245,14 @@
   const uint32_t vregC = (is_range) ? inst->VRegC_3rc() : inst->VRegC_35c();
   ArtMethod* sf_method = shadow_frame.GetMethod();
 
-  // Try to find the method in small thread-local cache first.
+  // Try to find the method in the small thread-local cache first. The cache
+  // is only used here when nterp is not supported, as mterp and nterp use it
+  // in incompatible ways.
   InterpreterCache* tls_cache = self->GetInterpreterCache();
   size_t tls_value;
   ArtMethod* resolved_method;
   if (is_quick) {
     resolved_method = nullptr;  // We don't know/care what the original method was.
-  } else if (LIKELY(tls_cache->Get(inst, &tls_value))) {
+  } else if (!IsNterpSupported() && LIKELY(tls_cache->Get(inst, &tls_value))) {
     resolved_method = reinterpret_cast<ArtMethod*>(tls_value);
   } else {
     ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
@@ -264,7 +265,9 @@
       result->SetJ(0);
       return false;
     }
-    tls_cache->Set(inst, reinterpret_cast<size_t>(resolved_method));
+    if (!IsNterpSupported()) {
+      tls_cache->Set(inst, reinterpret_cast<size_t>(resolved_method));
+    }
   }
 
   // Null pointer check and virtual method resolution.
diff --git a/runtime/interpreter/mterp/mterp.h b/runtime/interpreter/mterp/mterp.h
index dfbba29..7813fca 100644
--- a/runtime/interpreter/mterp/mterp.h
+++ b/runtime/interpreter/mterp/mterp.h
@@ -20,6 +20,8 @@
 #include <cstddef>
 #include <cstdint>
 
+#include "base/globals.h"
+
 /*
  * Mterp assembly handler bases
  */
@@ -31,6 +33,7 @@
 
 namespace art {
 
+class ArtMethod;
 class Thread;
 
 namespace interpreter {
@@ -40,6 +43,8 @@
 void CheckNterpAsmConstants();
 bool CanUseMterp();
 bool IsNterpSupported();
+bool CanRuntimeUseNterp();
+bool CanMethodUseNterp(ArtMethod* method);
 const void* GetNterpEntryPoint();
 
 // Poison value for TestExportPC.  If we segfault with this value, it means that a mterp
@@ -51,6 +56,9 @@
 
 constexpr size_t kMterpHandlerSize = 128;
 
+// The maximum size we allow for an nterp frame.
+constexpr size_t kMaxNterpFrame = 3 * KB;
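+// Methods whose frame would be larger are not given the nterp entrypoint
+// (see CanMethodUseNterp in nterp.cc).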
+
 }  // namespace interpreter
 }  // namespace art
 
diff --git a/runtime/interpreter/mterp/nterp.cc b/runtime/interpreter/mterp/nterp.cc
new file mode 100644
index 0000000..1e52492
--- /dev/null
+++ b/runtime/interpreter/mterp/nterp.cc
@@ -0,0 +1,598 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Nterp entry point and support functions.
+ */
+#include "mterp.h"
+
+#include "base/quasi_atomic.h"
+#include "dex/dex_instruction_utils.h"
+#include "debugger.h"
+#include "entrypoints/entrypoint_utils-inl.h"
+#include "interpreter/interpreter_common.h"
+#include "interpreter/interpreter_intrinsics.h"
+#include "interpreter/shadow_frame-inl.h"
+#include "mirror/string-alloc-inl.h"
+#include "nterp_helpers.h"
+
+namespace art {
+namespace interpreter {
+
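+// Nterp's assembly fast paths assume unpoisoned heap references and rely on
+// read-barrier gray-bit checks (see e.g. the aget-object handler).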
+bool IsNterpSupported() {
+  return !kPoisonHeapReferences && kUseReadBarrier;
+}
+
+bool CanRuntimeUseNterp() REQUIRES_SHARED(Locks::mutator_lock_) {
+  // Nterp has the same restrictions as Mterp.
+  return CanUseMterp();
+}
+
+bool CanMethodUseNterp(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock_) {
+  return method->SkipAccessChecks() &&
+      !method->IsNative() &&
+      method->GetDexFile()->IsStandardDexFile() &&
+      NterpGetFrameSize(method) < kMaxNterpFrame;
+}
+
+const void* GetNterpEntryPoint() {
+  return reinterpret_cast<const void*>(interpreter::ExecuteNterpImpl);
+}
+
+/*
+ * Verify some constants used by the nterp interpreter.
+ */
+void CheckNterpAsmConstants() {
+  /*
+   * If we're using computed goto instruction transitions, make sure
+   * none of the handlers overflows the byte limit.  This won't tell
+   * which one did, but if any one is too big the total size will
+   * overflow.
+   */
+  const int width = kMterpHandlerSize;
+  ptrdiff_t interp_size = reinterpret_cast<uintptr_t>(artNterpAsmInstructionEnd) -
+                          reinterpret_cast<uintptr_t>(artNterpAsmInstructionStart);
+  if ((interp_size == 0) || (interp_size != (art::kNumPackedOpcodes * width))) {
+    LOG(FATAL) << "ERROR: unexpected asm interp size " << interp_size
+               << " (did an instruction handler exceed " << width << " bytes?)";
+  }
+}
+
+template<typename T>
+inline void UpdateCache(Thread* self, uint16_t* dex_pc_ptr, T value) {
+  DCHECK(kUseReadBarrier) << "Nterp only works with read barriers";
+  // For simplicity, only update the cache if weak ref accesses are enabled. If
+  // they are disabled, this means the GC is processing the cache, and is
+  // reading it concurrently.
+  if (self->GetWeakRefAccessEnabled()) {
+    self->GetInterpreterCache()->Set(dex_pc_ptr, value);
+  }
+}
+
+template<typename T>
+inline void UpdateCache(Thread* self, uint16_t* dex_pc_ptr, T* value) {
+  UpdateCache(self, dex_pc_ptr, reinterpret_cast<size_t>(value));
+}
+
+extern "C" const dex::CodeItem* NterpGetCodeItem(ArtMethod* method)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  ScopedAssertNoThreadSuspension sants("In nterp");
+  return method->GetCodeItem();
+}
+
+extern "C" const char* NterpGetShorty(ArtMethod* method)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  ScopedAssertNoThreadSuspension sants("In nterp");
+  return method->GetInterfaceMethodIfProxy(kRuntimePointerSize)->GetShorty();
+}
+
+extern "C" const char* NterpGetShortyFromMethodId(ArtMethod* caller, uint32_t method_index)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  ScopedAssertNoThreadSuspension sants("In nterp");
+  return caller->GetDexFile()->GetMethodShorty(method_index);
+}
+
+extern "C" const char* NterpGetShortyFromInvokePolymorphic(ArtMethod* caller, uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  ScopedAssertNoThreadSuspension sants("In nterp");
+  const Instruction* inst = Instruction::At(dex_pc_ptr);
+  dex::ProtoIndex proto_idx(inst->Opcode() == Instruction::INVOKE_POLYMORPHIC
+      ? inst->VRegH_45cc()
+      : inst->VRegH_4rcc());
+  return caller->GetDexFile()->GetShorty(proto_idx);
+}
+
+extern "C" const char* NterpGetShortyFromInvokeCustom(ArtMethod* caller, uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  ScopedAssertNoThreadSuspension sants("In nterp");
+  const Instruction* inst = Instruction::At(dex_pc_ptr);
+  uint16_t call_site_index = (inst->Opcode() == Instruction::INVOKE_CUSTOM
+      ? inst->VRegB_35c()
+      : inst->VRegB_3rc());
+  const DexFile* dex_file = caller->GetDexFile();
+  dex::ProtoIndex proto_idx = dex_file->GetProtoIndexForCallSite(call_site_index);
+  return dex_file->GetShorty(proto_idx);
+}
+
+extern "C" size_t NterpGetMethod(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  const Instruction* inst = Instruction::At(dex_pc_ptr);
+  InvokeType invoke_type = kStatic;
+  uint16_t method_index = 0;
+  switch (inst->Opcode()) {
+    case Instruction::INVOKE_DIRECT: {
+      method_index = inst->VRegB_35c();
+      invoke_type = kDirect;
+      break;
+    }
+
+    case Instruction::INVOKE_INTERFACE: {
+      method_index = inst->VRegB_35c();
+      invoke_type = kInterface;
+      break;
+    }
+
+    case Instruction::INVOKE_STATIC: {
+      method_index = inst->VRegB_35c();
+      invoke_type = kStatic;
+      break;
+    }
+
+    case Instruction::INVOKE_SUPER: {
+      method_index = inst->VRegB_35c();
+      invoke_type = kSuper;
+      break;
+    }
+
+    case Instruction::INVOKE_VIRTUAL: {
+      method_index = inst->VRegB_35c();
+      invoke_type = kVirtual;
+      break;
+    }
+
+    case Instruction::INVOKE_DIRECT_RANGE: {
+      method_index = inst->VRegB_3rc();
+      invoke_type = kDirect;
+      break;
+    }
+
+    case Instruction::INVOKE_INTERFACE_RANGE: {
+      method_index = inst->VRegB_3rc();
+      invoke_type = kInterface;
+      break;
+    }
+
+    case Instruction::INVOKE_STATIC_RANGE: {
+      method_index = inst->VRegB_3rc();
+      invoke_type = kStatic;
+      break;
+    }
+
+    case Instruction::INVOKE_SUPER_RANGE: {
+      method_index = inst->VRegB_3rc();
+      invoke_type = kSuper;
+      break;
+    }
+
+    case Instruction::INVOKE_VIRTUAL_RANGE: {
+      method_index = inst->VRegB_3rc();
+      invoke_type = kVirtual;
+      break;
+    }
+
+    default:
+      LOG(FATAL) << "Unknown instruction " << inst->Opcode();
+  }
+
+  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
+  ArtMethod* resolved_method = caller->SkipAccessChecks()
+      ? class_linker->ResolveMethod<ClassLinker::ResolveMode::kNoChecks>(
+            self, method_index, caller, invoke_type)
+      : class_linker->ResolveMethod<ClassLinker::ResolveMode::kCheckICCEAndIAE>(
+            self, method_index, caller, invoke_type);
+  if (resolved_method == nullptr) {
+    DCHECK(self->IsExceptionPending());
+    return 0;
+  }
+
+  // ResolveMethod returns the method based on the method_id. For super invokes
+  // we must use the executing class's context to find the right method.
+  if (invoke_type == kSuper) {
+    ObjPtr<mirror::Class> executing_class = caller->GetDeclaringClass();
+    ObjPtr<mirror::Class> referenced_class = class_linker->LookupResolvedType(
+        executing_class->GetDexFile().GetMethodId(method_index).class_idx_,
+        executing_class->GetDexCache(),
+        executing_class->GetClassLoader());
+    DCHECK(referenced_class != nullptr);  // We have already resolved a method from this class.
+    if (!referenced_class->IsAssignableFrom(executing_class)) {
+      // We cannot determine the target method.
+      ThrowNoSuchMethodError(invoke_type,
+                             resolved_method->GetDeclaringClass(),
+                             resolved_method->GetName(),
+                             resolved_method->GetSignature());
+      return 0;
+    }
+    if (referenced_class->IsInterface()) {
+      resolved_method = referenced_class->FindVirtualMethodForInterfaceSuper(
+          resolved_method, class_linker->GetImagePointerSize());
+    } else {
+      uint16_t vtable_index = resolved_method->GetMethodIndex();
+      ObjPtr<mirror::Class> super_class = executing_class->GetSuperClass();
+      if (super_class == nullptr ||
+          !super_class->HasVTable() ||
+          vtable_index >= static_cast<uint32_t>(super_class->GetVTableLength())) {
+        // Behavior to agree with that of the verifier.
+        ThrowNoSuchMethodError(invoke_type,
+                               resolved_method->GetDeclaringClass(),
+                               resolved_method->GetName(),
+                               resolved_method->GetSignature());
+        return 0;
+      } else {
+        resolved_method = executing_class->GetSuperClass()->GetVTableEntry(
+            vtable_index, class_linker->GetImagePointerSize());
+      }
+    }
+  }
+
+  if (invoke_type == kInterface) {
+    UpdateCache(self, dex_pc_ptr, resolved_method->GetImtIndex());
+    return resolved_method->GetImtIndex();
+  } else if (resolved_method->GetDeclaringClass()->IsStringClass()
+             && !resolved_method->IsStatic()
+             && resolved_method->IsConstructor()) {
+    resolved_method = WellKnownClasses::StringInitToStringFactory(resolved_method);
+    // Or the result with 1 to notify nterp this is a String.<init> method. We
+    // also don't cache the result, as we don't want nterp's fast path to
+    // always check for it, and we expect a lot more regular calls than
+    // String.<init> calls.
+    return reinterpret_cast<size_t>(resolved_method) | 1;
+  } else if (invoke_type == kVirtual) {
+    UpdateCache(self, dex_pc_ptr, resolved_method->GetMethodIndex());
+    return resolved_method->GetMethodIndex();
+  } else {
+    UpdateCache(self, dex_pc_ptr, resolved_method);
+    return reinterpret_cast<size_t>(resolved_method);
+  }
+}
+
+static ArtField* ResolveFieldWithAccessChecks(Thread* self,
+                                              ClassLinker* class_linker,
+                                              uint16_t field_index,
+                                              ArtMethod* caller,
+                                              bool is_static,
+                                              bool is_put)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  if (caller->SkipAccessChecks()) {
+    return class_linker->ResolveField(field_index, caller, is_static);
+  }
+
+  caller = caller->GetInterfaceMethodIfProxy(kRuntimePointerSize);
+
+  StackHandleScope<2> hs(self);
+  Handle<mirror::DexCache> h_dex_cache(hs.NewHandle(caller->GetDexCache()));
+  Handle<mirror::ClassLoader> h_class_loader(hs.NewHandle(caller->GetClassLoader()));
+
+  ArtField* resolved_field = class_linker->ResolveFieldJLS(field_index,
+                                                           h_dex_cache,
+                                                           h_class_loader);
+  if (resolved_field == nullptr) {
+    return nullptr;
+  }
+
+  ObjPtr<mirror::Class> fields_class = resolved_field->GetDeclaringClass();
+  if (UNLIKELY(resolved_field->IsStatic() != is_static)) {
+    ThrowIncompatibleClassChangeErrorField(resolved_field, is_static, caller);
+    return nullptr;
+  }
+  ObjPtr<mirror::Class> referring_class = caller->GetDeclaringClass();
+  if (UNLIKELY(!referring_class->CheckResolvedFieldAccess(fields_class,
+                                                          resolved_field,
+                                                          caller->GetDexCache(),
+                                                          field_index))) {
+    return nullptr;
+  }
+  if (UNLIKELY(is_put && resolved_field->IsFinal() && (fields_class != referring_class))) {
+    ThrowIllegalAccessErrorFinalField(caller, resolved_field);
+    return nullptr;
+  }
+  return resolved_field;
+}
+
+extern "C" size_t NterpGetStaticField(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  const Instruction* inst = Instruction::At(dex_pc_ptr);
+  uint16_t field_index = inst->VRegB_21c();
+  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
+  ArtField* resolved_field = ResolveFieldWithAccessChecks(
+      self,
+      class_linker,
+      field_index,
+      caller,
+      /* is_static= */ true,
+      /* is_put= */ IsInstructionSPut(inst->Opcode()));
+
+  if (resolved_field == nullptr) {
+    DCHECK(self->IsExceptionPending());
+    return 0;
+  }
+  if (UNLIKELY(!resolved_field->GetDeclaringClass()->IsVisiblyInitialized())) {
+    StackHandleScope<1> hs(self);
+    Handle<mirror::Class> h_class(hs.NewHandle(resolved_field->GetDeclaringClass()));
+    if (UNLIKELY(!class_linker->EnsureInitialized(
+                      self, h_class, /*can_init_fields=*/ true, /*can_init_parents=*/ true))) {
+      DCHECK(self->IsExceptionPending());
+      return 0;
+    }
+    DCHECK(h_class->IsInitializing());
+  }
+  if (resolved_field->IsVolatile()) {
+    // Or the result with 1 to notify nterp this is a volatile field. We also
+    // don't cache the result, as we don't want nterp's fast path to always
+    // check for it.
+    return reinterpret_cast<size_t>(resolved_field) | 1;
+  } else {
+    UpdateCache(self, dex_pc_ptr, resolved_field);
+    return reinterpret_cast<size_t>(resolved_field);
+  }
+}
+
+extern "C" uint32_t NterpGetInstanceFieldOffset(Thread* self,
+                                                ArtMethod* caller,
+                                                uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  const Instruction* inst = Instruction::At(dex_pc_ptr);
+  uint16_t field_index = inst->VRegC_22c();
+  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
+  ArtField* resolved_field = ResolveFieldWithAccessChecks(
+      self,
+      class_linker,
+      field_index,
+      caller,
+      /* is_static= */ false,
+      /* is_put= */ IsInstructionIPut(inst->Opcode()));
+  if (resolved_field == nullptr) {
+    DCHECK(self->IsExceptionPending());
+    return 0;
+  }
+  if (resolved_field->IsVolatile()) {
+    // Don't cache for a volatile field, and return a negative offset as marker
+    // of volatile.
+    return -resolved_field->GetOffset().Uint32Value();
+  }
+  UpdateCache(self, dex_pc_ptr, resolved_field->GetOffset().Uint32Value());
+  return resolved_field->GetOffset().Uint32Value();
+}
+
+extern "C" mirror::Object* NterpGetClassOrAllocateObject(Thread* self,
+                                                         ArtMethod* caller,
+                                                         uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  const Instruction* inst = Instruction::At(dex_pc_ptr);
+  dex::TypeIndex index;
+  switch (inst->Opcode()) {
+    case Instruction::NEW_INSTANCE:
+      index = dex::TypeIndex(inst->VRegB_21c());
+      break;
+    case Instruction::CHECK_CAST:
+      index = dex::TypeIndex(inst->VRegB_21c());
+      break;
+    case Instruction::INSTANCE_OF:
+      index = dex::TypeIndex(inst->VRegC_22c());
+      break;
+    case Instruction::CONST_CLASS:
+      index = dex::TypeIndex(inst->VRegB_21c());
+      break;
+    case Instruction::NEW_ARRAY:
+      index = dex::TypeIndex(inst->VRegC_22c());
+      break;
+    default:
+      LOG(FATAL) << "Unreachable";
+  }
+  ObjPtr<mirror::Class> c =
+      ResolveVerifyAndClinit(index,
+                             caller,
+                             self,
+                             /* can_run_clinit= */ false,
+                             /* verify_access= */ !caller->SkipAccessChecks());
+  if (c == nullptr) {
+    DCHECK(self->IsExceptionPending());
+    return nullptr;
+  }
+
+  if (inst->Opcode() == Instruction::NEW_INSTANCE) {
+    gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
+    if (UNLIKELY(c->IsStringClass())) {
+      // We don't cache the class for strings as we need to special case their
+      // allocation.
+      return mirror::String::AllocEmptyString(self, allocator_type).Ptr();
+    } else {
+      if (!c->IsFinalizable() && c->IsInstantiable()) {
+        // Cache non-finalizable classes for next calls.
+        UpdateCache(self, dex_pc_ptr, c.Ptr());
+      }
+      return AllocObjectFromCode(c, self, allocator_type).Ptr();
+    }
+  } else {
+    // For all other cases, cache the class.
+    UpdateCache(self, dex_pc_ptr, c.Ptr());
+  }
+  return c.Ptr();
+}
+
+extern "C" mirror::Object* NterpLoadObject(Thread* self, ArtMethod* caller, uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  const Instruction* inst = Instruction::At(dex_pc_ptr);
+  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
+  switch (inst->Opcode()) {
+    case Instruction::CONST_STRING:
+    case Instruction::CONST_STRING_JUMBO: {
+      dex::StringIndex string_index(
+          (inst->Opcode() == Instruction::CONST_STRING)
+              ? inst->VRegB_21c()
+              : inst->VRegB_31c());
+      ObjPtr<mirror::String> str = class_linker->ResolveString(string_index, caller);
+      if (str == nullptr) {
+        DCHECK(self->IsExceptionPending());
+        return nullptr;
+      }
+      UpdateCache(self, dex_pc_ptr, str.Ptr());
+      return str.Ptr();
+    }
+    case Instruction::CONST_METHOD_HANDLE: {
+      // Don't cache: we don't expect this to be performance sensitive, and we
+      // don't want the cache to conflict with a performance sensitive entry.
+      return class_linker->ResolveMethodHandle(self, inst->VRegB_21c(), caller).Ptr();
+    }
+    case Instruction::CONST_METHOD_TYPE: {
+      // Don't cache: we don't expect this to be performance sensitive, and we
+      // don't want the cache to conflict with a performance sensitive entry.
+      return class_linker->ResolveMethodType(
+          self, dex::ProtoIndex(inst->VRegB_21c()), caller).Ptr();
+    }
+    default:
+      LOG(FATAL) << "Unreachable";
+  }
+  return nullptr;
+}
+
+extern "C" void NterpUnimplemented() {
+  LOG(FATAL) << "Unimplemented";
+}
+
+static mirror::Object* DoFilledNewArray(Thread* self,
+                                        ArtMethod* caller,
+                                        uint16_t* dex_pc_ptr,
+                                        int32_t* regs,
+                                        bool is_range)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  const Instruction* inst = Instruction::At(dex_pc_ptr);
+  if (kIsDebugBuild) {
+    if (is_range) {
+      DCHECK_EQ(inst->Opcode(), Instruction::FILLED_NEW_ARRAY_RANGE);
+    } else {
+      DCHECK_EQ(inst->Opcode(), Instruction::FILLED_NEW_ARRAY);
+    }
+  }
+  const int32_t length = is_range ? inst->VRegA_3rc() : inst->VRegA_35c();
+  DCHECK_GE(length, 0);
+  if (!is_range) {
+    // Check that FILLED_NEW_ARRAY's argument count does not exceed 5.
+    DCHECK_LE(length, 5);
+  }
+  uint16_t type_idx = is_range ? inst->VRegB_3rc() : inst->VRegB_35c();
+  ObjPtr<mirror::Class> array_class = ResolveVerifyAndClinit(dex::TypeIndex(type_idx),
+                                                             caller,
+                                                             self,
+                                                             /* can_run_clinit= */ true,
+                                                             /* verify_access= */ false);
+  if (UNLIKELY(array_class == nullptr)) {
+    DCHECK(self->IsExceptionPending());
+    return nullptr;
+  }
+  DCHECK(array_class->IsArrayClass());
+  ObjPtr<mirror::Class> component_class = array_class->GetComponentType();
+  const bool is_primitive_int_component = component_class->IsPrimitiveInt();
+  if (UNLIKELY(component_class->IsPrimitive() && !is_primitive_int_component)) {
+    if (component_class->IsPrimitiveLong() || component_class->IsPrimitiveDouble()) {
+      ThrowRuntimeException("Bad filled array request for type %s",
+                            component_class->PrettyDescriptor().c_str());
+    } else {
+      self->ThrowNewExceptionF(
+          "Ljava/lang/InternalError;",
+          "Found type %s; filled-new-array not implemented for anything but 'int'",
+          component_class->PrettyDescriptor().c_str());
+    }
+    return nullptr;
+  }
+  ObjPtr<mirror::Object> new_array = mirror::Array::Alloc(
+      self,
+      array_class,
+      length,
+      array_class->GetComponentSizeShift(),
+      Runtime::Current()->GetHeap()->GetCurrentAllocator());
+  if (UNLIKELY(new_array == nullptr)) {
+    self->AssertPendingOOMException();
+    return nullptr;
+  }
+  uint32_t arg[Instruction::kMaxVarArgRegs];  // only used in filled-new-array.
+  uint32_t vregC = 0;   // only used in filled-new-array-range.
+  if (is_range) {
+    vregC = inst->VRegC_3rc();
+  } else {
+    inst->GetVarArgs(arg);
+  }
+  for (int32_t i = 0; i < length; ++i) {
+    size_t src_reg = is_range ? vregC + i : arg[i];
+    if (is_primitive_int_component) {
+      new_array->AsIntArray()->SetWithoutChecks</* kTransactionActive= */ false>(i, regs[src_reg]);
+    } else {
+      new_array->AsObjectArray<mirror::Object>()->SetWithoutChecks</* kTransactionActive= */ false>(
+          i, reinterpret_cast<mirror::Object*>(regs[src_reg]));
+    }
+  }
+  return new_array.Ptr();
+}
+
+extern "C" mirror::Object* NterpFilledNewArray(Thread* self,
+                                               ArtMethod* caller,
+                                               int32_t* registers,
+                                               uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  return DoFilledNewArray(self, caller, dex_pc_ptr, registers, /* is_range= */ false);
+}
+
+extern "C" mirror::Object* NterpFilledNewArrayRange(Thread* self,
+                                                    ArtMethod* caller,
+                                                    int32_t* registers,
+                                                    uint16_t* dex_pc_ptr)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  return DoFilledNewArray(self, caller, dex_pc_ptr, registers, /* is_range= */ true);
+}
+
+extern "C" jit::OsrData* NterpHotMethod(ArtMethod* method, uint16_t* dex_pc_ptr, uint32_t* vregs)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  ScopedAssertNoThreadSuspension sants("In nterp");
+  jit::Jit* jit = Runtime::Current()->GetJit();
+  if (jit != nullptr) {
+    // Nterp passes null on entry where we don't want to OSR.
+    if (dex_pc_ptr != nullptr) {
+      // This could be a loop back edge, check if we can OSR.
+      CodeItemInstructionAccessor accessor(method->DexInstructions());
+      uint32_t dex_pc = dex_pc_ptr - accessor.Insns();
+      jit::OsrData* osr_data = jit->PrepareForOsr(
+          method->GetInterfaceMethodIfProxy(kRuntimePointerSize), dex_pc, vregs);
+      if (osr_data != nullptr) {
+        return osr_data;
+      }
+    }
+    jit->EnqueueCompilationFromNterp(method, Thread::Current());
+  }
+  return nullptr;
+}
+
+extern "C" ssize_t MterpDoPackedSwitch(const uint16_t* switchData, int32_t testVal);
+extern "C" ssize_t NterpDoPackedSwitch(const uint16_t* switchData, int32_t testVal)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  return MterpDoPackedSwitch(switchData, testVal);
+}
+
+extern "C" ssize_t MterpDoSparseSwitch(const uint16_t* switchData, int32_t testVal);
+extern "C" ssize_t NterpDoSparseSwitch(const uint16_t* switchData, int32_t testVal)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  return MterpDoSparseSwitch(switchData, testVal);
+}
+
+}  // namespace interpreter
+}  // namespace art
diff --git a/runtime/interpreter/mterp/nterp_stub.cc b/runtime/interpreter/mterp/nterp_stub.cc
index e77f0e3..c1b1ec3 100644
--- a/runtime/interpreter/mterp/nterp_stub.cc
+++ b/runtime/interpreter/mterp/nterp_stub.cc
@@ -21,12 +21,27 @@
  */
 
 namespace art {
+
+class ArtMethod;
+
 namespace interpreter {
 
 bool IsNterpSupported() {
   return false;
 }
 
+bool CanRuntimeUseNterp() {
+  return false;
+}
+
+bool CanMethodUseNterp(ArtMethod* method ATTRIBUTE_UNUSED) {
+  return false;
+}
+
+const void* GetNterpEntryPoint() {
+  return nullptr;
+}
+
 void CheckNterpAsmConstants() {
 }
 
@@ -34,10 +49,6 @@
   UNIMPLEMENTED(FATAL);
 }
 
-const void* GetNterpEntryPoint() {
-  return nullptr;
-}
-
 extern "C" void* artNterpAsmInstructionStart[] = { nullptr };
 extern "C" void* artNterpAsmInstructionEnd[] = { nullptr };
 
diff --git a/runtime/interpreter/mterp/x86_64ng/array.S b/runtime/interpreter/mterp/x86_64ng/array.S
new file mode 100644
index 0000000..baf5f30
--- /dev/null
+++ b/runtime/interpreter/mterp/x86_64ng/array.S
@@ -0,0 +1,151 @@
+%def op_aget(load="movl", shift="4", data_offset="MIRROR_INT_ARRAY_DATA_OFFSET", wide="0", is_object="0"):
+/*
+ * Array get.  vAA <- vBB[vCC].
+ *
+ * for: aget, aget-boolean, aget-byte, aget-char, aget-short, aget-wide, aget-object
+ *
+ */
+    /* op vAA, vBB, vCC */
+    movzbq  2(rPC), %rax                    # rax <- BB
+    movzbq  3(rPC), %rcx                    # rcx <- CC
+    GET_VREG %edi, %rax                     # edi <- vBB (array object)
+    GET_VREG %esi, %rcx                     # esi <- vCC (requested index)
+    testl   %edi, %edi                      # null array object?
+    je      common_errNullObject            # bail if so
+    cmpl    MIRROR_ARRAY_LENGTH_OFFSET(%edi), %esi
+    jae     common_errArrayIndex            # index >= length, bail.
+    .if $wide
+    movq    $data_offset(%rdi,%rsi,8), %rax
+    SET_WIDE_VREG %rax, rINSTq
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+    .elseif $is_object
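+    // Check the object's gray bit: if it is set, the loaded reference must
+    // be marked by the read barrier before use.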
+    testb $$READ_BARRIER_TEST_VALUE, GRAY_BYTE_OFFSET(%edi)
+    $load   $data_offset(%rdi,%rsi,$shift), %eax
+    jnz 2f
+1:
+    SET_VREG_OBJECT %eax, rINSTq
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+    // reg00 is eax
+    call art_quick_read_barrier_mark_reg00
+    jmp 1b
+    .else
+    $load   $data_offset(%rdi,%rsi,$shift), %eax
+    SET_VREG %eax, rINSTq
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+    .endif
+
+%def op_aget_boolean():
+%  op_aget(load="movzbl", shift="1", data_offset="MIRROR_BOOLEAN_ARRAY_DATA_OFFSET", is_object="0")
+
+%def op_aget_byte():
+%  op_aget(load="movsbl", shift="1", data_offset="MIRROR_BYTE_ARRAY_DATA_OFFSET", is_object="0")
+
+%def op_aget_char():
+%  op_aget(load="movzwl", shift="2", data_offset="MIRROR_CHAR_ARRAY_DATA_OFFSET", is_object="0")
+
+%def op_aget_object():
+%  op_aget(load="movl", shift="4", data_offset="MIRROR_OBJECT_ARRAY_DATA_OFFSET", is_object="1")
+
+%def op_aget_short():
+%  op_aget(load="movswl", shift="2", data_offset="MIRROR_SHORT_ARRAY_DATA_OFFSET", is_object="0")
+
+%def op_aget_wide():
+%  op_aget(load="movq", shift="8", data_offset="MIRROR_WIDE_ARRAY_DATA_OFFSET", wide="1", is_object="0")
+
+%def op_aput(rINST_reg="rINST", store="movl", shift="4", data_offset="MIRROR_INT_ARRAY_DATA_OFFSET", wide="0"):
+/*
+ * Array put.  vBB[vCC] <- vAA.
+ *
+ * for: aput, aput-boolean, aput-byte, aput-char, aput-short, aput-wide
+ *
+ */
+    /* op vAA, vBB, vCC */
+    movzbq  2(rPC), %rax                    # rax <- BB
+    movzbq  3(rPC), %rcx                    # rcx <- CC
+    GET_VREG %edi, %rax                     # edi <- vBB (array object)
+    GET_VREG %esi, %rcx                     # esi <- vCC (requested index)
+    testl   %edi, %edi                      # null array object?
+    je      common_errNullObject            # bail if so
+    cmpl    MIRROR_ARRAY_LENGTH_OFFSET(%edi), %esi
+    jae     common_errArrayIndex            # index >= length, bail.
+    .if $wide
+    GET_WIDE_VREG rINSTq, rINSTq
+    .else
+    GET_VREG rINST, rINSTq
+    .endif
+    $store    $rINST_reg, $data_offset(%rdi,%rsi,$shift)
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_aput_boolean():
+%  op_aput(rINST_reg="rINSTbl", store="movb", shift="1", data_offset="MIRROR_BOOLEAN_ARRAY_DATA_OFFSET", wide="0")
+
+%def op_aput_byte():
+%  op_aput(rINST_reg="rINSTbl", store="movb", shift="1", data_offset="MIRROR_BYTE_ARRAY_DATA_OFFSET", wide="0")
+
+%def op_aput_char():
+%  op_aput(rINST_reg="rINSTw", store="movw", shift="2", data_offset="MIRROR_CHAR_ARRAY_DATA_OFFSET", wide="0")
+
+%def op_aput_short():
+%  op_aput(rINST_reg="rINSTw", store="movw", shift="2", data_offset="MIRROR_SHORT_ARRAY_DATA_OFFSET", wide="0")
+
+%def op_aput_wide():
+%  op_aput(rINST_reg="rINSTq", store="movq", shift="8", data_offset="MIRROR_WIDE_ARRAY_DATA_OFFSET", wide="1")
+
+%def op_aput_object():
+    movzbq  2(rPC), %rax                    # rax <- BB
+    movzbq  3(rPC), %rcx                    # rcx <- CC
+    GET_VREG %edi, %rax                     # edi <- vBB (array object)
+    GET_VREG %esi, %rcx                     # esi <- vCC (requested index)
+    testl   %edi, %edi                      # null array object?
+    je      common_errNullObject            # bail if so
+    cmpl    MIRROR_ARRAY_LENGTH_OFFSET(%edi), %esi
+    jae     common_errArrayIndex            # index >= length, bail.
+    GET_VREG %edx, rINSTq
+    call art_quick_aput_obj
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_array_length():
+/*
+ * Return the length of an array.
+ */
+    movl    rINST, %eax                     # eax <- BA
+    sarl    $$4, rINST                      # rINST <- B
+    GET_VREG %ecx, rINSTq                   # ecx <- vB (object ref)
+    testl   %ecx, %ecx                      # is null?
+    je      common_errNullObject
+    andb    $$0xf, %al                      # eax <- A
+    movl    MIRROR_ARRAY_LENGTH_OFFSET(%rcx), rINST
+    SET_VREG rINST, %rax
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_fill_array_data():
+    /* fill-array-data vAA, +BBBBBBBB */
+    EXPORT_PC
+    movslq  2(rPC), %rcx                    # rcx <- ssssssssBBBBbbbb
+    leaq    (rPC,%rcx,2), OUT_ARG0          # OUT_ARG0 <- PC + ssssssssBBBBbbbb*2
+    GET_VREG OUT_32_ARG1, rINSTq            # OUT_ARG1 <- vAA (array object)
+    call    art_quick_handle_fill_data
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+
+%def op_filled_new_array(helper="nterp_filled_new_array"):
+/*
+ * Create a new array with elements filled from registers.
+ *
+ * for: filled-new-array, filled-new-array/range
+ */
+    /* op vB, {vD, vE, vF, vG, vA}, class@CCCC */
+    /* op {vCCCC..v(CCCC+AA-1)}, type@BBBB */
+    EXPORT_PC
+    movq    rSELF:THREAD_SELF_OFFSET, OUT_ARG0
+    movq    (%rsp), OUT_ARG1
+    movq    rFP, OUT_ARG2
+    movq    rPC, OUT_ARG3
+    call    SYMBOL($helper)
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+
+%def op_filled_new_array_range():
+%  op_filled_new_array(helper="nterp_filled_new_array_range")
+
+%def op_new_array():
+  jmp NterpNewArray
diff --git a/runtime/interpreter/mterp/x86_64ng/control_flow.S b/runtime/interpreter/mterp/x86_64ng/control_flow.S
new file mode 100644
index 0000000..35276d4
--- /dev/null
+++ b/runtime/interpreter/mterp/x86_64ng/control_flow.S
@@ -0,0 +1,179 @@
+%def bincmp(revcmp=""):
+/*
+ * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
+ * fragment that specifies the *reverse* comparison to perform, e.g.
+ * for "if-le" you would use "gt".
+ *
+ * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
+ */
+    /* if-cmp vA, vB, +CCCC */
+    movl    rINST, %ecx                     # rcx <- A+
+    sarl    $$4, rINST                      # rINST <- B
+    andb    $$0xf, %cl                      # rcx <- A
+    GET_VREG %eax, %rcx                     # eax <- vA
+    cmpl    VREG_ADDRESS(rINSTq), %eax      # compare (vA, vB)
+    j${revcmp}   1f
+    movswq  2(rPC), rINSTq                  # Get signed branch offset
+    BRANCH
+1:
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def zcmp(revcmp=""):
+/*
+ * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
+ * fragment that specifies the *reverse* comparison to perform, e.g.
+ * for "if-le" you would use "gt".
+ *
+ * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
+ */
+    /* if-cmp vAA, +BBBB */
+    cmpl    $$0, VREG_ADDRESS(rINSTq)       # compare (vA, 0)
+    j${revcmp}   1f
+    movswq  2(rPC), rINSTq                  # fetch signed displacement
+    BRANCH
+1:
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_goto():
+/*
+ * Unconditional branch, 8-bit offset.
+ *
+ * The branch distance is a signed code-unit offset, which we need to
+ * double to get a byte offset.
+ */
+    /* goto +AA */
+    movsbq  rINSTbl, rINSTq                 # rINSTq <- ssssssAA
+    BRANCH
+
+%def op_goto_16():
+/*
+ * Unconditional branch, 16-bit offset.
+ *
+ * The branch distance is a signed code-unit offset, which we need to
+ * double to get a byte offset.
+ */
+    /* goto/16 +AAAA */
+    movswq  2(rPC), rINSTq                  # rINSTq <- ssssAAAA
+    BRANCH
+
+%def op_goto_32():
+/*
+ * Unconditional branch, 32-bit offset.
+ *
+ * The branch distance is a signed code-unit offset, which we need to
+ * double to get a byte offset.
+ */
+    /* goto/32 +AAAAAAAA */
+    movslq  2(rPC), rINSTq                  # rINSTq <- AAAAAAAA
+    BRANCH
+
+%def op_if_eq():
+%  bincmp(revcmp="ne")
+
+%def op_if_eqz():
+%  zcmp(revcmp="ne")
+
+%def op_if_ge():
+%  bincmp(revcmp="l")
+
+%def op_if_gez():
+%  zcmp(revcmp="l")
+
+%def op_if_gt():
+%  bincmp(revcmp="le")
+
+%def op_if_gtz():
+%  zcmp(revcmp="le")
+
+%def op_if_le():
+%  bincmp(revcmp="g")
+
+%def op_if_lez():
+%  zcmp(revcmp="g")
+
+%def op_if_lt():
+%  bincmp(revcmp="ge")
+
+%def op_if_ltz():
+%  zcmp(revcmp="ge")
+
+%def op_if_ne():
+%  bincmp(revcmp="e")
+
+%def op_if_nez():
+%  zcmp(revcmp="e")
+
+%def op_packed_switch(func="NterpDoPackedSwitch"):
+/*
+ * Handle a packed-switch or sparse-switch instruction.  In both cases
+ * we decode it and hand it off to a helper function.
+ *
+ * We don't really expect backward branches in a switch statement, but
+ * they're perfectly legal, so we check for them here.
+ *
+ * for: packed-switch, sparse-switch
+ */
+    /* op vAA, +BBBB */
+    movslq  2(rPC), OUT_ARG0                # OUT_ARG0 <- ssssssssBBBBbbbb
+    leaq    (rPC,OUT_ARG0,2), OUT_ARG0      # OUT_ARG0 <- PC + ssssssssBBBBbbbb*2
+    GET_VREG OUT_32_ARG1, rINSTq            # OUT_32_ARG1 <- vAA
+    call    SYMBOL($func)
+    movslq  %eax, rINSTq
+    BRANCH
+
+/*
+ * Return a 32-bit value.
+ */
+%def op_return(is_object="0"):
+    GET_VREG %eax, rINSTq                   # eax <- vAA
+    .if !$is_object
+    // In case we're going back to compiled code, put the
+    // result also in a xmm register.
+    movd %eax, %xmm0
+    .endif
+    CFI_REMEMBER_STATE
+    movq -8(rREFS), %rsp
+    CFI_DEF_CFA(rsp, CALLEE_SAVES_SIZE)
+    RESTORE_ALL_CALLEE_SAVES
+    ret
+    CFI_RESTORE_STATE
+
+%def op_return_object():
+%  op_return(is_object="1")
+
+%def op_return_void():
+    // Thread fence for constructor is a no-op on x86_64.
+    CFI_REMEMBER_STATE
+    movq -8(rREFS), %rsp
+    CFI_DEF_CFA(rsp, CALLEE_SAVES_SIZE)
+    RESTORE_ALL_CALLEE_SAVES
+    ret
+    CFI_RESTORE_STATE
+
+%def op_return_void_no_barrier():
+%  op_return_void()
+
+%def op_return_wide():
+    GET_WIDE_VREG %rax, rINSTq   # rax <- vAA
+    // In case we're going back to compiled code, put the
+    // result also in a xmm register.
+    movq    %rax, %xmm0
+    CFI_REMEMBER_STATE
+    movq    -8(rREFS), %rsp
+    CFI_DEF_CFA(rsp, CALLEE_SAVES_SIZE)
+    RESTORE_ALL_CALLEE_SAVES
+    ret
+    CFI_RESTORE_STATE
+
+%def op_sparse_switch():
+%  op_packed_switch(func="NterpDoSparseSwitch")
+
+%def op_throw():
+  EXPORT_PC
+  GET_VREG %edi, rINSTq                   # edi<- vAA (exception object)
+  movq rSELF:THREAD_SELF_OFFSET, %rsi
+  call SYMBOL(art_quick_deliver_exception)
+  int3
diff --git a/runtime/interpreter/mterp/x86_64ng/invoke.S b/runtime/interpreter/mterp/x86_64ng/invoke.S
new file mode 100644
index 0000000..64d0623
--- /dev/null
+++ b/runtime/interpreter/mterp/x86_64ng/invoke.S
@@ -0,0 +1,173 @@
+%def invoke(helper="NterpUnimplemented"):
+    call    SYMBOL($helper)
+
+%def op_invoke_custom():
+   EXPORT_PC
+   movzwl 2(rPC), %edi // call_site index, first argument of runtime call.
+   jmp NterpCommonInvokeCustom
+
+%def op_invoke_custom_range():
+   EXPORT_PC
+   movzwl 2(rPC), %edi // call_site index, first argument of runtime call.
+   jmp NterpCommonInvokeCustomRange
+
+%def invoke_direct_or_super(helper="", range=""):
+   EXPORT_PC
+   // Fast-path which gets the method from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rdi, 2f
+1:
+   // Load the first argument (the 'this' pointer).
+   movzwl 4(rPC), %r11d // arguments
+   .if !$range
+   andq $$0xf, %r11
+   .endif
+   movl (rFP, %r11, 4), %esi
+   // NullPointerException check.
+   movl (%esi), %eax
+   jmp $helper
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_method
+   movq %rax, %rdi
+   testl MACRO_LITERAL(1), %eax
+   je 1b
+   andq $$-2, %rdi  // Remove the extra bit that marks it as a String.<init> method.
+   .if $range
+   jmp NterpHandleStringInitRange
+   .else
+   jmp NterpHandleStringInit
+   .endif
+
+%def op_invoke_direct():
+%  invoke_direct_or_super(helper="NterpCommonInvokeInstance", range="0")
+
+%def op_invoke_direct_range():
+%  invoke_direct_or_super(helper="NterpCommonInvokeInstanceRange", range="1")
+
+%def op_invoke_polymorphic():
+   EXPORT_PC
+   // No need to fetch the target method.
+   // Load the first argument (the 'this' pointer).
+   movzwl 4(rPC), %r11d // arguments
+   andq $$0xf, %r11
+   movl (rFP, %r11, 4), %esi
+   // NullPointerException check.
+   movl (%esi), %eax
+   jmp NterpCommonInvokePolymorphic
+
+%def op_invoke_polymorphic_range():
+   EXPORT_PC
+   // No need to fetch the target method.
+   // Load the first argument (the 'this' pointer).
+   movzwl 4(rPC), %r11d // arguments
+   movl (rFP, %r11, 4), %esi
+   // NullPointerException check.
+   movl (%esi), %eax
+   jmp NterpCommonInvokePolymorphicRange
+
+%def invoke_interface(helper="", range=""):
+   EXPORT_PC
+   // Fast-path which gets the method from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   // First argument is the 'this' pointer.
+   movzwl 4(rPC), %r11d // arguments
+   .if !$range
+   andq $$0xf, %r11
+   .endif
+   movl (rFP, %r11, 4), %esi
+   movl MIRROR_OBJECT_CLASS_OFFSET(%esi), %edx
+   movq MIRROR_CLASS_IMT_PTR_OFFSET_64(%edx), %rdx
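+   // rax holds the IMT index cached by nterp_get_method; load the ArtMethod
+   // from the IMT.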
+   movq (%rdx, %rax, 8), %rdi
+   jmp $helper
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_method
+   jmp 1b
+
+%def op_invoke_interface():
+%  invoke_interface(helper="NterpCommonInvokeInterface", range="0")
+
+%def op_invoke_interface_range():
+%  invoke_interface(helper="NterpCommonInvokeInterfaceRange", range="1")
+
+%def invoke_static(helper=""):
+   EXPORT_PC
+   // Fast-path which gets the method from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rdi, 1f
+   jmp $helper
+1:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_method
+   movq %rax, %rdi
+   jmp $helper
+
+%def op_invoke_static():
+%  invoke_static(helper="NterpCommonInvokeStatic")
+
+%def op_invoke_static_range():
+%  invoke_static(helper="NterpCommonInvokeStaticRange")
+
+%def op_invoke_super():
+%  invoke_direct_or_super(helper="NterpCommonInvokeInstance", range="0")
+
+%def op_invoke_super_range():
+%  invoke_direct_or_super(helper="NterpCommonInvokeInstanceRange", range="1")
+
+%def invoke_virtual(helper="", range=""):
+   EXPORT_PC
+   // Fast-path which gets the method from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rdi, 2f
+1:
+   // First argument is the 'this' pointer.
+   movzwl 4(rPC), %r11d // arguments
+   .if !$range
+   andq $$0xf, %r11
+   .endif
+   movl (rFP, %r11, 4), %esi
+   // Note: if esi is null, this will be handled by our SIGSEGV handler.
+   movl MIRROR_OBJECT_CLASS_OFFSET(%esi), %edx
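+   // rdi holds the vtable index cached by nterp_get_method; load the
+   // ArtMethod from the class's embedded vtable.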
+   movq MIRROR_CLASS_VTABLE_OFFSET_64(%edx, %edi, 8), %rdi
+   jmp $helper
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_method
+   movl %eax, %edi
+   jmp 1b
+
+%def op_invoke_virtual():
+%  invoke_virtual(helper="NterpCommonInvokeInstance", range="0")
+
+%def op_invoke_virtual_quick():
+   EXPORT_PC
+   movzwl 2(rPC), %eax // offset
+   // First argument is the 'this' pointer.
+   movzwl 4(rPC), %r11d // arguments
+   andq $$0xf, %r11
+   movl (rFP, %r11, 4), %esi
+   // Note: if esi is null, this will be handled by our SIGSEGV handler.
+   movl MIRROR_OBJECT_CLASS_OFFSET(%esi), %edx
+   movq MIRROR_CLASS_VTABLE_OFFSET_64(%edx, %eax, 8), %rdi
+   jmp NterpCommonInvokeInstance
+
+%def op_invoke_virtual_range():
+%  invoke_virtual(helper="NterpCommonInvokeInstanceRange", range="1")
+
+%def op_invoke_virtual_range_quick():
+   EXPORT_PC
+   movzwl 2(rPC), %eax // offset
+   // First argument is the 'this' pointer.
+   movzwl 4(rPC), %r11d // arguments
+   movl (rFP, %r11, 4), %esi
+   // Note: if esi is null, this will be handled by our SIGSEGV handler.
+   movl MIRROR_OBJECT_CLASS_OFFSET(%esi), %edx
+   movq MIRROR_CLASS_VTABLE_OFFSET_64(%edx, %eax, 8), %rdi
+   jmp NterpCommonInvokeInstanceRange
diff --git a/runtime/interpreter/mterp/x86_64ng/main.S b/runtime/interpreter/mterp/x86_64ng/main.S
new file mode 100644
index 0000000..4d97bcd
--- /dev/null
+++ b/runtime/interpreter/mterp/x86_64ng/main.S
@@ -0,0 +1,2030 @@
+%def header():
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This is a #include, not a %include, because we want the C pre-processor
+ * to expand the macros into assembler assignment statements.
+ */
+#include "asm_support.h"
+#include "arch/x86_64/asm_support_x86_64.S"
+#include "interpreter/cfi_asm_support.h"
+
+/**
+ * x86_64 ABI general notes:
+ *
+ * Caller save set:
+ *    rax, rdx, rcx, rsi, rdi, r8-r11, st(0)-st(7)
+ * Callee save set:
+ *    rbx, rbp, r12-r15
+ * Return regs:
+ *    32-bit in eax
+ *    64-bit in rax
+ *    fp on xmm0
+ *
+ * The first 8 fp parameters are passed in xmm0-xmm7.
+ * The first 6 non-fp parameters are passed in rdi, rsi, rdx, rcx, r8, r9.
+ * Other parameters are passed on the stack, pushed right-to-left.  On entry
+ * to the target, the first param is at 8(%rsp).
+ *
+ * Stack must be 16-byte aligned to support SSE in native code.
+ */
+
+#define IN_ARG3        %rcx
+#define IN_ARG2        %rdx
+#define IN_ARG1        %rsi
+#define IN_ARG0        %rdi
+/* Out Args  */
+#define OUT_ARG3       %rcx
+#define OUT_ARG2       %rdx
+#define OUT_ARG1       %rsi
+#define OUT_ARG0       %rdi
+#define OUT_32_ARG3    %ecx
+#define OUT_32_ARG2    %edx
+#define OUT_32_ARG1    %esi
+#define OUT_32_ARG0    %edi
+#define OUT_FP_ARG1    %xmm1
+#define OUT_FP_ARG0    %xmm0
+
+/*
+ * single-purpose registers, given names for clarity
+ */
+#define rSELF    %gs
+#define rPC      %r12
+#define CFI_DEX  12 // DWARF register number of the register holding dex-pc (rPC).
+#define CFI_TMP  5  // DWARF register number of the first argument register (rdi).
+#define rFP      %r13
+#define rINST    %ebx
+#define rINSTq   %rbx
+#define rINSTw   %bx
+#define rINSTbh  %bh
+#define rINSTbl  %bl
+#define rIBASE   %r14
+#define rREFS    %r15
+#define CFI_REFS 15 // DWARF register number of the reference array (r15).
+
+// Temporary registers while setting up a frame.
+#define rNEW_FP   %r8
+#define rNEW_REFS %r9
+#define CFI_NEW_REFS 9
+
+/*
+ * Get/set the 32-bit value from a Dalvik register.
+ */
+#define VREG_ADDRESS(_vreg) (rFP,_vreg,4)
+#define VREG_HIGH_ADDRESS(_vreg) 4(rFP,_vreg,4)
+#define VREG_REF_ADDRESS(_vreg) (rREFS,_vreg,4)
+#define VREG_REF_HIGH_ADDRESS(_vreg) 4(rREFS,_vreg,4)
+
+// Includes the return address implicitly pushed on the stack by 'call'.
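+// (That is: 6 saved GPRs + 4 saved XMM registers + the return address.)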
+#define CALLEE_SAVES_SIZE (6 * 8 + 4 * 8 + 1 * 8)
+
+// +8 for the ArtMethod of the caller.
+#define OFFSET_TO_FIRST_ARGUMENT_IN_STACK (CALLEE_SAVES_SIZE + 8)
+
+/*
+ * Refresh rINST.
+ * At enter to handler rINST does not contain the opcode number.
+ * However some utilities require the full value, so this macro
+ * restores the opcode number.
+ */
+.macro REFRESH_INST _opnum
+    movb    rINSTbl, rINSTbh
+    movb    $$\_opnum, rINSTbl
+.endm
+
+/*
+ * Fetch the next instruction from rPC into rINSTq.  Does not advance rPC.
+ */
+.macro FETCH_INST
+    movzwq  (rPC), rINSTq
+.endm
+
+/*
+ * Remove opcode from rINST, compute the address of handler and jump to it.
+ */
+.macro GOTO_NEXT
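+    // rINST holds the 16-bit code unit: bl is the opcode, bh the first
+    // operand byte.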
+    movzx   rINSTbl,%ecx
+    movzbl  rINSTbh,rINST
+    shll    MACRO_LITERAL(${handler_size_bits}), %ecx
+    addq    rIBASE, %rcx
+    jmp     *%rcx
+.endm
+
+/*
+ * Advance rPC by instruction count.
+ */
+.macro ADVANCE_PC _count
+    leaq    2*\_count(rPC), rPC
+.endm
+
+/*
+ * Advance rPC by instruction count, fetch instruction and jump to handler.
+ */
+.macro ADVANCE_PC_FETCH_AND_GOTO_NEXT _count
+    ADVANCE_PC \_count
+    FETCH_INST
+    GOTO_NEXT
+.endm
+
+.macro GET_VREG _reg _vreg
+    movl    VREG_ADDRESS(\_vreg), \_reg
+.endm
+
+.macro GET_VREG_OBJECT _reg _vreg
+    movl    VREG_REF_ADDRESS(\_vreg), \_reg
+.endm
+
+/* Read wide value. */
+.macro GET_WIDE_VREG _reg _vreg
+    movq    VREG_ADDRESS(\_vreg), \_reg
+.endm
+
+.macro SET_VREG _reg _vreg
+    movl    \_reg, VREG_ADDRESS(\_vreg)
+    movl    MACRO_LITERAL(0), VREG_REF_ADDRESS(\_vreg)
+.endm
+
+/* Write wide value. reg is clobbered. */
+.macro SET_WIDE_VREG _reg _vreg
+    movq    \_reg, VREG_ADDRESS(\_vreg)
+    xorq    \_reg, \_reg
+    movq    \_reg, VREG_REF_ADDRESS(\_vreg)
+.endm
+
+.macro SET_VREG_OBJECT _reg _vreg
+    movl    \_reg, VREG_ADDRESS(\_vreg)
+    movl    \_reg, VREG_REF_ADDRESS(\_vreg)
+.endm
+
+.macro GET_VREG_HIGH _reg _vreg
+    movl    VREG_HIGH_ADDRESS(\_vreg), \_reg
+.endm
+
+.macro SET_VREG_HIGH _reg _vreg
+    movl    \_reg, VREG_HIGH_ADDRESS(\_vreg)
+    movl    MACRO_LITERAL(0), VREG_REF_HIGH_ADDRESS(\_vreg)
+.endm
+
+.macro CLEAR_REF _vreg
+    movl    MACRO_LITERAL(0), VREG_REF_ADDRESS(\_vreg)
+.endm
+
+.macro CLEAR_WIDE_REF _vreg
+    movl    MACRO_LITERAL(0), VREG_REF_ADDRESS(\_vreg)
+    movl    MACRO_LITERAL(0), VREG_REF_HIGH_ADDRESS(\_vreg)
+.endm
+
+.macro GET_VREG_XMMs _xmmreg _vreg
+    movss VREG_ADDRESS(\_vreg), \_xmmreg
+.endm
+.macro GET_VREG_XMMd _xmmreg _vreg
+    movsd VREG_ADDRESS(\_vreg), \_xmmreg
+.endm
+.macro SET_VREG_XMMs _xmmreg _vreg
+    movss \_xmmreg, VREG_ADDRESS(\_vreg)
+.endm
+.macro SET_VREG_XMMd _xmmreg _vreg
+    movsd \_xmmreg, VREG_ADDRESS(\_vreg)
+.endm
+
+// An assembly entry that has an OatQuickMethodHeader prefix.
+.macro OAT_ENTRY name, end
+    FUNCTION_TYPE(\name)
+    ASM_HIDDEN SYMBOL(\name)
+    .global SYMBOL(\name)
+    .balign 16
+    .long 0
+    .long (SYMBOL(\end) - SYMBOL(\name))
+SYMBOL(\name):
+.endm
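+
+// The two .long words above form the method header read by the runtime:
+// presumably the (empty) code-info offset followed by the code size in bytes,
+// so that [name, end) is treated as one compiled-code region.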
+
+.macro ENTRY name
+    .text
+    ASM_HIDDEN SYMBOL(\name)
+    .global SYMBOL(\name)
+    FUNCTION_TYPE(\name)
+SYMBOL(\name):
+.endm
+
+.macro END name
+    SIZE(\name)
+.endm
+
+// Macro for defining entrypoints into the runtime. We don't need to save
+// registers (we're not holding references there), but there is no
+// kDontSave runtime method. So just use the kSaveRefsOnly runtime method.
+.macro NTERP_TRAMPOLINE name, helper
+DEFINE_FUNCTION \name
+  SETUP_SAVE_REFS_ONLY_FRAME
+  call \helper
+  RESTORE_SAVE_REFS_ONLY_FRAME
+  RETURN_OR_DELIVER_PENDING_EXCEPTION
+END_FUNCTION \name
+.endm
+
+.macro CLEAR_VOLATILE_MARKER reg
+  andq MACRO_LITERAL(-2), \reg
+.endm
+
+.macro EXPORT_PC
+    movq    rPC, -16(rREFS)
+.endm
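+
+// EXPORT_PC publishes the current dex pc pointer in the frame slot at
+// -16(rREFS), just below the saved previous frame pointer, where the runtime
+// looks for it when walking the stack. It must run before any call that can
+// throw or suspend.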
+
+
+.macro BRANCH
+    // For a negative (backward) branch, update the method's hotness counter
+    // and do a suspend check.
+    testq rINSTq, rINSTq
+    js 3f
+2:
+    leaq    (rPC, rINSTq, 2), rPC
+    FETCH_INST
+    GOTO_NEXT
+3:
+    movq (%rsp), %rdi
+    addw $$1, ART_METHOD_HOTNESS_COUNT_OFFSET(%rdi)
+    // If the counter overflows, handle this in the runtime.
+    jo NterpHandleHotnessOverflow
+    // Otherwise, do a suspend check.
+    testl   $$(THREAD_SUSPEND_OR_CHECKPOINT_REQUEST), rSELF:THREAD_FLAGS_OFFSET
+    jz      2b
+    EXPORT_PC
+    call    SYMBOL(art_quick_test_suspend)
+    jmp 2b
+.endm
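+
+// BRANCH in pseudocode, roughly:
+//   if (rINST < 0) {  // backward branch
+//     if (++method->hotness_count overflows) goto NterpHandleHotnessOverflow;
+//     if (self->flags & suspend_or_checkpoint_request) art_quick_test_suspend();
+//   }
+//   rPC += 2 * rINST; fetch and dispatch;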
+
+// Puts the next floating point argument into the expected register,
+// fetching values based on a non-range invoke.
+// Uses rax as temporary.
+//
+// TODO: We could simplify a lot of code by loading the G argument into
+// the "inst" register. Given that we enter the handler with "1(rPC)" in
+// the rINST, we can just add rINST<<16 to the args and we don't even
+// need to pass "arg_index" around.
+.macro LOOP_OVER_SHORTY_LOADING_XMMS xmm_reg, inst, shorty, arg_index, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al              // if (al == '\0') goto finished
+    je VAR(finished)
+    cmpb MACRO_LITERAL(68), %al             // if (al == 'D') goto FOUND_DOUBLE
+    je 2f
+    cmpb MACRO_LITERAL(70), %al             // if (al == 'F') goto FOUND_FLOAT
+    je 3f
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    //  Handle extra argument in arg array taken by a long.
+    cmpb MACRO_LITERAL(74), %al   // if (al != 'J') goto LOOP
+    jne 1b
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    jmp 1b                        // goto LOOP
+2:  // FOUND_DOUBLE
+    subq MACRO_LITERAL(8), %rsp
+    movq REG_VAR(inst), %rax
+    andq MACRO_LITERAL(0xf), %rax
+    GET_VREG %eax, %rax
+    movl %eax, (%rsp)
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    cmpq MACRO_LITERAL(4), REG_VAR(arg_index)
+    je 5f
+    movq REG_VAR(inst), %rax
+    andq MACRO_LITERAL(0xf), %rax
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    jmp 6f
+5:
+    movzbl 1(rPC), %eax
+    andq MACRO_LITERAL(0xf), %rax
+6:
+    GET_VREG %eax, %rax
+    movl %eax, 4(%rsp)
+    movsd (%rsp), REG_VAR(xmm_reg)
+    addq MACRO_LITERAL(8), %rsp
+    jmp 4f
+3:  // FOUND_FLOAT
+    cmpq MACRO_LITERAL(4), REG_VAR(arg_index)
+    je 7f
+    movq REG_VAR(inst), %rax
+    andq MACRO_LITERAL(0xf), %rax
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    jmp 8f
+7:
+    movzbl 1(rPC), %eax
+    andq MACRO_LITERAL(0xf), %rax
+8:
+    GET_VREG_XMMs REG_VAR(xmm_reg), %rax
+4:
+.endm
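+
+// The loop above in C-like pseudocode (the cmpb literals are ASCII: 68='D',
+// 70='F', 74='J'), eliding the fifth-argument special case read from 1(rPC):
+//   while ((c = *shorty++) != '\0') {
+//     if (c == 'D') { xmm_reg = double_from_two_vregs(inst); break; }
+//     if (c == 'F') { xmm_reg = float_from_vreg(inst); break; }
+//     inst >>= 4; arg_index++;                    // skip one vreg
+//     if (c == 'J') { inst >>= 4; arg_index++; }  // longs take two vregs
+//   }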
+
+// Puts the next int/long/object argument in the expected register,
+// fetching values based on a non-range invoke.
+// Uses rax as temporary.
+.macro LOOP_OVER_SHORTY_LOADING_GPRS gpr_reg64, gpr_reg32, inst, shorty, arg_index, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al   // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al    // if (al == '\0') goto finished
+    je  VAR(finished)
+    cmpb MACRO_LITERAL(74), %al   // if (al == 'J') goto FOUND_LONG
+    je 2f
+    cmpb MACRO_LITERAL(70), %al   // if (al == 'F') goto SKIP_FLOAT
+    je 3f
+    cmpb MACRO_LITERAL(68), %al   // if (al == 'D') goto SKIP_DOUBLE
+    je 4f
+    cmpq MACRO_LITERAL(4), REG_VAR(arg_index)
+    je 7f
+    movq REG_VAR(inst), %rax
+    andq MACRO_LITERAL(0xf), %rax
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    jmp 8f
+7:
+    movzbl 1(rPC), %eax
+    andq MACRO_LITERAL(0xf), %rax
+8:
+    GET_VREG REG_VAR(gpr_reg32), %rax
+    jmp 5f
+2:  // FOUND_LONG
+    subq MACRO_LITERAL(8), %rsp
+    movq REG_VAR(inst), %rax
+    andq MACRO_LITERAL(0xf), %rax
+    GET_VREG %eax, %rax
+    movl %eax, (%rsp)
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    cmpq MACRO_LITERAL(4), REG_VAR(arg_index)
+    je 9f
+    movq REG_VAR(inst), %rax
+    andq MACRO_LITERAL(0xf), %rax
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    jmp 10f
+9:
+    movzbl 1(rPC), %eax
+    andq MACRO_LITERAL(0xf), %rax
+10:
+    GET_VREG %eax, %rax
+    movl %eax, 4(%rsp)
+    movq (%rsp), REG_VAR(gpr_reg64)
+    addq MACRO_LITERAL(8), %rsp
+    jmp 5f
+3:  // SKIP_FLOAT
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    jmp 1b
+4:  // SKIP_DOUBLE
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    cmpq MACRO_LITERAL(4), REG_VAR(arg_index)
+    je 1b
+    shrq MACRO_LITERAL(4), REG_VAR(inst)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    jmp 1b
+5:
+.endm
+
+// Puts the next floating point argument into the expected register,
+// fetching values based on a range invoke.
+// Uses rax as temporary.
+.macro LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm_reg, shorty, arg_index, stack_index, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al              // if (al == '\0') goto finished
+    je VAR(finished)
+    cmpb MACRO_LITERAL(68), %al             // if (al == 'D') goto FOUND_DOUBLE
+    je 2f
+    cmpb MACRO_LITERAL(70), %al             // if (al == 'F') goto FOUND_FLOAT
+    je 3f
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    //  Handle extra argument in arg array taken by a long.
+    cmpb MACRO_LITERAL(74), %al   // if (al != 'J') goto LOOP
+    jne 1b
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    jmp 1b                        // goto LOOP
+2:  // FOUND_DOUBLE
+    GET_VREG_XMMd REG_VAR(xmm_reg), REG_VAR(arg_index)
+    addq MACRO_LITERAL(2), REG_VAR(arg_index)
+    addq MACRO_LITERAL(2), REG_VAR(stack_index)
+    jmp 4f
+3:  // FOUND_FLOAT
+    GET_VREG_XMMs REG_VAR(xmm_reg), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+4:
+.endm
+
+// Puts the next floating point argument into the expected stack slot,
+// fetching values based on a range invoke.
+// Uses rax as temporary.
+//
+// TODO: We could just copy all the vregs to the stack slots in a simple loop
+// (or REP MOVSD) without looking at the shorty at all. (We could also drop
+// the "stack_index" from the macros for loading registers.) We could also do
+// that conditionally if argument word count > 6; otherwise we know that all
+// args fit into registers.
+.macro LOOP_RANGE_OVER_FPs shorty, arg_index, stack_index, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al              // if (al == '\0') goto finished
+    je VAR(finished)
+    cmpb MACRO_LITERAL(68), %al             // if (al == 'D') goto FOUND_DOUBLE
+    je 2f
+    cmpb MACRO_LITERAL(70), %al             // if (al == 'F') goto FOUND_FLOAT
+    je 3f
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    //  Handle extra argument in arg array taken by a long.
+    cmpb MACRO_LITERAL(74), %al   // if (al != 'J') goto LOOP
+    jne 1b
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    jmp 1b                        // goto LOOP
+2:  // FOUND_DOUBLE
+    movq (rFP, REG_VAR(arg_index), 4), %rax
+    movq %rax, 8(%rsp, REG_VAR(stack_index), 4)
+    addq MACRO_LITERAL(2), REG_VAR(arg_index)
+    addq MACRO_LITERAL(2), REG_VAR(stack_index)
+    jmp 1b
+3:  // FOUND_FLOAT
+    movl (rFP, REG_VAR(arg_index), 4), %eax
+    movl %eax, 8(%rsp, REG_VAR(stack_index), 4)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    jmp 1b
+.endm
+
+// Puts the next int/long/object argument in the expected register,
+// fetching values based on a range invoke.
+// Uses rax as temporary.
+.macro LOOP_RANGE_OVER_SHORTY_LOADING_GPRS gpr_reg64, gpr_reg32, shorty, arg_index, stack_index, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al    // if (al == '\0') goto finished
+    je  VAR(finished)
+    cmpb MACRO_LITERAL(74), %al   // if (al == 'J') goto FOUND_LONG
+    je 2f
+    cmpb MACRO_LITERAL(70), %al   // if (al == 'F') goto SKIP_FLOAT
+    je 3f
+    cmpb MACRO_LITERAL(68), %al   // if (al == 'D') goto SKIP_DOUBLE
+    je 4f
+    movl       (rFP, REG_VAR(arg_index), 4), REG_VAR(gpr_reg32)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    jmp 5f
+2:  // FOUND_LONG
+    movq (rFP, REG_VAR(arg_index), 4), REG_VAR(gpr_reg64)
+    addq MACRO_LITERAL(2), REG_VAR(arg_index)
+    addq MACRO_LITERAL(2), REG_VAR(stack_index)
+    jmp 5f
+3:  // SKIP_FLOAT
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    jmp 1b
+4:  // SKIP_DOUBLE
+    addq MACRO_LITERAL(2), REG_VAR(arg_index)
+    addq MACRO_LITERAL(2), REG_VAR(stack_index)
+    jmp 1b
+5:
+.endm
+
+// Puts the next int/long/object argument in the expected stack slot,
+// fetching values based on a range invoke.
+// Uses rax as temporary.
+.macro LOOP_RANGE_OVER_INTs shorty, arg_index, stack_index, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al    // if (al == '\0') goto finished
+    je  VAR(finished)
+    cmpb MACRO_LITERAL(74), %al   // if (al == 'J') goto FOUND_LONG
+    je 2f
+    cmpb MACRO_LITERAL(70), %al   // if (al == 'F') goto SKIP_FLOAT
+    je 3f
+    cmpb MACRO_LITERAL(68), %al   // if (al == 'D') goto SKIP_DOUBLE
+    je 4f
+    movl (rFP, REG_VAR(arg_index), 4), %eax
+    movl %eax, 8(%rsp, REG_VAR(stack_index), 4)
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    jmp 1b
+2:  // FOUND_LONG
+    movq (rFP, REG_VAR(arg_index), 4), %rax
+    movq %rax, 8(%rsp, REG_VAR(stack_index), 4)
+    addq MACRO_LITERAL(2), REG_VAR(arg_index)
+    addq MACRO_LITERAL(2), REG_VAR(stack_index)
+    jmp 1b
+3:  // SKIP_FLOAT
+    addq MACRO_LITERAL(1), REG_VAR(arg_index)
+    addq MACRO_LITERAL(1), REG_VAR(stack_index)
+    jmp 1b
+4:  // SKIP_DOUBLE
+    addq MACRO_LITERAL(2), REG_VAR(arg_index)
+    addq MACRO_LITERAL(2), REG_VAR(stack_index)
+    jmp 1b
+.endm
+
+// Puts the next floating point parameter passed in physical register
+// in the expected dex register array entry.
+// Uses rax as temporary.
+.macro LOOP_OVER_SHORTY_STORING_XMMS xmm_reg, shorty, arg_index, fp, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al              // if (al == '\0') goto finished
+    je VAR(finished)
+    cmpb MACRO_LITERAL(68), %al             // if (al == 'D') goto FOUND_DOUBLE
+    je 2f
+    cmpb MACRO_LITERAL(70), %al             // if (al == 'F') goto FOUND_FLOAT
+    je 3f
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    //  Handle extra argument in arg array taken by a long.
+    cmpb MACRO_LITERAL(74), %al   // if (al != 'J') goto LOOP
+    jne 1b
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    jmp 1b                        // goto LOOP
+2:  // FOUND_DOUBLE
+    movsd REG_VAR(xmm_reg), (REG_VAR(fp), REG_VAR(arg_index), 1)
+    addq MACRO_LITERAL(8), REG_VAR(arg_index)
+    jmp 4f
+3:  // FOUND_FLOAT
+    movss REG_VAR(xmm_reg), (REG_VAR(fp), REG_VAR(arg_index), 1)
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+4:
+.endm
+
+// Puts the next int/long/object parameter passed in physical register
+// in the expected dex register array entry, and in case of object in the
+// expected reference array entry.
+// Uses rax as temporary.
+.macro LOOP_OVER_SHORTY_STORING_GPRS gpr_reg64, gpr_reg32, shorty, arg_index, regs, refs, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al    // if (al == '\0') goto finished
+    je  VAR(finished)
+    cmpb MACRO_LITERAL(74), %al   // if (al == 'J') goto FOUND_LONG
+    je 2f
+    cmpb MACRO_LITERAL(70), %al   // if (al == 'F') goto SKIP_FLOAT
+    je 3f
+    cmpb MACRO_LITERAL(68), %al   // if (al == 'D') goto SKIP_DOUBLE
+    je 4f
+    movl REG_VAR(gpr_reg32), (REG_VAR(regs), REG_VAR(arg_index), 1)
+    cmpb MACRO_LITERAL(76), %al   // if (al != 'L') goto NOT_REFERENCE
+    jne 6f
+    movl REG_VAR(gpr_reg32), (REG_VAR(refs), REG_VAR(arg_index), 1)
+6:  // NOT_REFERENCE
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    jmp 5f
+2:  // FOUND_LONG
+    movq REG_VAR(gpr_reg64), (REG_VAR(regs), REG_VAR(arg_index), 1)
+    addq MACRO_LITERAL(8), REG_VAR(arg_index)
+    jmp 5f
+3:  // SKIP_FLOAT
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    jmp 1b
+4:  // SKIP_DOUBLE
+    addq MACRO_LITERAL(8), REG_VAR(arg_index)
+    jmp 1b
+5:
+.endm
+
+// Puts the next floating point parameter passed in stack
+// in the expected dex register array entry.
+// Uses rax as temporary.
+//
+// TODO: Or we could just spill regs to the reserved slots in the caller's
+// frame and copy all regs in a simple loop. This time, however, we would
+// need to look at the shorty anyway to look for the references.
+// (The trade-off is different for passing arguments and receiving them.)
+.macro LOOP_OVER_FPs shorty, arg_index, regs, stack_ptr, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al              // if (al == '\0') goto finished
+    je VAR(finished)
+    cmpb MACRO_LITERAL(68), %al             // if (al == 'D') goto FOUND_DOUBLE
+    je 2f
+    cmpb MACRO_LITERAL(70), %al             // if (al == 'F') goto FOUND_FLOAT
+    je 3f
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    //  Handle extra argument in arg array taken by a long.
+    cmpb MACRO_LITERAL(74), %al   // if (al != 'J') goto LOOP
+    jne 1b
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    jmp 1b                        // goto LOOP
+2:  // FOUND_DOUBLE
+    movq OFFSET_TO_FIRST_ARGUMENT_IN_STACK(REG_VAR(stack_ptr), REG_VAR(arg_index), 1), %rax
+    movq %rax, (REG_VAR(regs), REG_VAR(arg_index), 1)
+    addq MACRO_LITERAL(8), REG_VAR(arg_index)
+    jmp 1b
+3:  // FOUND_FLOAT
+    movl OFFSET_TO_FIRST_ARGUMENT_IN_STACK(REG_VAR(stack_ptr), REG_VAR(arg_index), 1), %eax
+    movl %eax, (REG_VAR(regs), REG_VAR(arg_index), 1)
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    jmp 1b
+.endm
+
+// Puts the next int/long/object parameter passed in stack
+// in the expected dex register array entry, and in case of object in the
+// expected reference array entry.
+// Uses rax as temporary.
+.macro LOOP_OVER_INTs shorty, arg_index, regs, refs, stack_ptr, finished
+1: // LOOP
+    movb (REG_VAR(shorty)), %al             // al := *shorty
+    addq MACRO_LITERAL(1), REG_VAR(shorty)  // shorty++
+    cmpb MACRO_LITERAL(0), %al    // if (al == '\0') goto finished
+    je  VAR(finished)
+    cmpb MACRO_LITERAL(74), %al   // if (al == 'J') goto FOUND_LONG
+    je 2f
+    cmpb MACRO_LITERAL(76), %al   // if (al == 'L') goto FOUND_REFERENCE
+    je 6f
+    cmpb MACRO_LITERAL(70), %al   // if (al == 'F') goto SKIP_FLOAT
+    je 3f
+    cmpb MACRO_LITERAL(68), %al   // if (al == 'D') goto SKIP_DOUBLE
+    je 4f
+    movl OFFSET_TO_FIRST_ARGUMENT_IN_STACK(REG_VAR(stack_ptr), REG_VAR(arg_index), 1), %eax
+    movl %eax, (REG_VAR(regs), REG_VAR(arg_index), 1)
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    jmp 1b
+6:  // FOUND_REFERENCE
+    movl OFFSET_TO_FIRST_ARGUMENT_IN_STACK(REG_VAR(stack_ptr), REG_VAR(arg_index), 1), %eax
+    movl %eax, (REG_VAR(regs), REG_VAR(arg_index), 1)
+    movl %eax, (REG_VAR(refs), REG_VAR(arg_index), 1)
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    jmp 1b
+2:  // FOUND_LONG
+    movq OFFSET_TO_FIRST_ARGUMENT_IN_STACK(REG_VAR(stack_ptr), REG_VAR(arg_index), 1), %rax
+    movq %rax, (REG_VAR(regs), REG_VAR(arg_index), 1)
+    addq MACRO_LITERAL(8), REG_VAR(arg_index)
+    jmp 1b
+3:  // SKIP_FLOAT
+    addq MACRO_LITERAL(4), REG_VAR(arg_index)
+    jmp 1b
+4:  // SKIP_DOUBLE
+    addq MACRO_LITERAL(8), REG_VAR(arg_index)
+    jmp 1b
+.endm
+
+// Increase the method's hotness count and do a suspend check before starting
+// to execute the method.
+.macro START_EXECUTING_INSTRUCTIONS
+   movq (%rsp), %rdi
+   addw $$1, ART_METHOD_HOTNESS_COUNT_OFFSET(%rdi)
+   jo 2f
+   testl $$(THREAD_SUSPEND_OR_CHECKPOINT_REQUEST), rSELF:THREAD_FLAGS_OFFSET
+   jz 1f
+   EXPORT_PC
+   call SYMBOL(art_quick_test_suspend)
+1:
+   FETCH_INST
+   GOTO_NEXT
+2:
+   movq $$0, %rsi
+   movq rFP, %rdx
+   call nterp_hot_method
+   jmp 1b
+.endm
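+
+// Roughly: if (++method->hotness_count overflows) nterp_hot_method(method, 0, fp);
+// then, if a suspend or checkpoint is requested, call art_quick_test_suspend;
+// finally fetch and dispatch the first instruction.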
+
+.macro SPILL_ALL_CALLEE_SAVES
+    PUSH r15
+    PUSH r14
+    PUSH r13
+    PUSH r12
+    PUSH rbp
+    PUSH rbx
+    SETUP_FP_CALLEE_SAVE_FRAME
+.endm
+
+.macro RESTORE_ALL_CALLEE_SAVES
+    RESTORE_FP_CALLEE_SAVE_FRAME
+    POP rbx
+    POP rbp
+    POP r12
+    POP r13
+    POP r14
+    POP r15
+.endm
+
+// Helper to set up the stack for a nterp-to-nterp call. This sets up:
+// - rNEW_FP: the new pointer to dex registers
+// - rNEW_REFS: the new pointer to references
+// - rPC: the new PC pointer to execute
+// - edi: number of arguments
+// - ecx: first dex register
+.macro SETUP_STACK_FOR_INVOKE
+   // We do the same stack overflow check as the compiler. See CanMethodUseNterp
+   // for how we limit the maximum nterp frame size.
+   testq %rax, -STACK_OVERFLOW_RESERVED_BYTES(%rsp)
+
+   // Spill all callee saves to have a consistent stack frame whether we
+   // are called by compiled code or nterp.
+   SPILL_ALL_CALLEE_SAVES
+
+   movq %rsp, %r11
+   CFI_DEF_CFA_REGISTER(r11)
+
+   // From this point:
+   // - rax contains code item
+   // - rdi contains method
+   // - r11 contains saved stack pointer.
+
+   // Create space for registers * 2 (dex registers and references). Set rNEW_FP and rNEW_REFS.
+   movzwl CODE_ITEM_REGISTERS_SIZE_OFFSET(%rax), %ecx
+   sall MACRO_LITERAL(2), %ecx
+   subq %rcx, %rsp
+   movq %rsp, rNEW_FP
+   subq %rcx, %rsp
+   movq %rsp, rNEW_REFS
+
+   // Put nulls in reference frame.
+   testl %ecx, %ecx
+   je 2f
+   movq rNEW_REFS, %rcx
+1:
+   movl MACRO_LITERAL(0), (%rcx)
+   addq MACRO_LITERAL(4), %rcx
+   cmpq %rcx, rNEW_FP
+   jne 1b
+2:
+   // Create space for the previous frame, saved dex pc, and method being called
+   subq MACRO_LITERAL(24), %rsp
+
+   // TODO: We could get rid of the two lines below if we preserve r11 until we copy
+   // rNEW_REFS to rREFS. (We currently do because we use it for copying parameters.
+   // We should move the alignment and rewrite the parameter copy so that we do not
+   // need r11 for that and still preserve r11.)
+   //
+   // Save the previous frame.
+   movq %r11, -8(rNEW_REFS)
+   CFI_DEFINE_CFA_DEREF(CFI_NEW_REFS, -8, (6 + 4 + 1) * 8)
+
+   // Take space for outs.
+   movzwl CODE_ITEM_OUTS_SIZE_OFFSET(%rax), %ecx
+   sall MACRO_LITERAL(2), %ecx
+   subq %rcx, %rsp
+
+   // Align stack pointer to 16.
+   andq MACRO_LITERAL(-16), %rsp
+
+   // Save the ArtMethod.
+   movq %rdi, (%rsp)
+
+   // Fetch instruction information before replacing rPC.
+   movzbl 1(rPC), %edi
+   movzwl 4(rPC), %ecx
+
+   // Set the dex pc pointer.
+   leaq CODE_ITEM_INSNS_OFFSET(%rax), rPC
+   CFI_DEFINE_DEX_PC_WITH_OFFSET(CFI_TMP, CFI_DEX, 0)
+.endm
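+
+// Resulting frame layout, high to low addresses (a sketch; the runtime-visible
+// slots match the CFI directives above):
+//   [ callee saves, CALLEE_SAVES_SIZE ]      <- above r11 (caller's rsp)
+//   [ dex registers, 4 * registers_size ]    <- rNEW_FP
+//   [ reference array, 4 * registers_size ]  <- rNEW_REFS
+//   [ previous frame pointer (r11) ]         <- -8(rNEW_REFS)
+//   [ dex pc, written by EXPORT_PC ]         <- -16(rNEW_REFS)
+//   [ out arguments ]                        <- 8(%rsp)
+//   [ ArtMethod* being invoked ]             <- rsp (16-byte aligned)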
+
+// Set up arguments for a non-range nterp-to-nterp call, and start executing
+// the method. We expect:
+// - rNEW_FP: the new pointer to dex registers
+// - rNEW_REFS: the new pointer to references
+// - rPC: the new PC pointer to execute
+// - edi: number of arguments
+// - ecx: first dex register
+// - r11: top of dex register array
+// - esi: receiver if non-static.
+.macro SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=0
+   // Now all temporary registers (except r11 containing top of registers array)
+   // are available, copy the parameters.
+   // /* op vA, vB, {vC...vG} */
+   movl %edi, %eax
+   shrl $$4, %eax # Number of arguments
+   jz 6f  # shr sets the Z flag
+   movq MACRO_LITERAL(-1), %r10
+   cmpl MACRO_LITERAL(2), %eax
+   jl 1f
+   je 2f
+   cmpl MACRO_LITERAL(4), %eax
+   jl 3f
+   je 4f
+
+  // We use a decrementing r10 to store references relative
+  // to rNEW_FP and dex registers relative to r11.
+  //
+  // TODO: We could set up r10 as the number of registers (this can be an additional output from
+  // SETUP_STACK_FOR_INVOKE) and then just decrement it by one before copying each arg to
+  // (rNEW_FP, r10, 4) and (rNEW_REFS, r10, 4).
+  // Maybe even introduce macros NEW_VREG_ADDRESS/NEW_VREG_REF_ADDRESS.
+5:
+   andq        MACRO_LITERAL(15), %rdi
+   GET_VREG_OBJECT %edx, %rdi
+   movl        %edx, (rNEW_FP, %r10, 4)
+   GET_VREG    %edx, %rdi
+   movl        %edx, (%r11, %r10, 4)
+   subq        MACRO_LITERAL(1), %r10
+4:
+   movl        %ecx, %eax
+   shrl        MACRO_LITERAL(12), %eax
+   GET_VREG_OBJECT %edx, %rax
+   movl        %edx, (rNEW_FP, %r10, 4)
+   GET_VREG    %edx, %rax
+   movl        %edx, (%r11, %r10, 4)
+   subq        MACRO_LITERAL(1), %r10
+3:
+   movl        %ecx, %eax
+   shrl        MACRO_LITERAL(8), %eax
+   andl        MACRO_LITERAL(0xf), %eax
+   GET_VREG_OBJECT %edx, %rax
+   movl        %edx, (rNEW_FP, %r10, 4)
+   GET_VREG    %edx, %rax
+   movl        %edx, (%r11, %r10, 4)
+   subq        MACRO_LITERAL(1), %r10
+2:
+   movl        %ecx, %eax
+   shrl        MACRO_LITERAL(4), %eax
+   andl        MACRO_LITERAL(0xf), %eax
+   GET_VREG_OBJECT %edx, %rax
+   movl        %edx, (rNEW_FP, %r10, 4)
+   GET_VREG    %edx, %rax
+   movl        %edx, (%r11, %r10, 4)
+   subq        MACRO_LITERAL(1), %r10
+1:
+   .if \is_string_init
+   // Ignore the first argument
+   .elseif \is_static
+   movl        %ecx, %eax
+   andq        MACRO_LITERAL(0x000f), %rax
+   GET_VREG_OBJECT %edx, %rax
+   movl        %edx, (rNEW_FP, %r10, 4)
+   GET_VREG    %edx, %rax
+   movl        %edx, (%r11, %r10, 4)
+   .else
+   movl        %esi, (rNEW_FP, %r10, 4)
+   movl        %esi, (%r11, %r10, 4)
+   .endif
+
+6:
+   // Start executing the method.
+   movq rNEW_FP, rFP
+   movq rNEW_REFS, rREFS
+   CFI_DEFINE_CFA_DEREF(CFI_REFS, -8, (6 + 4 + 1) * 8)
+   START_EXECUTING_INSTRUCTIONS
+.endm
+
+// Set up arguments for a range nterp-to-nterp call, and start executing
+// the method.
+.macro SETUP_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=0
+   // edi is number of arguments
+   // ecx is first register
+   movq MACRO_LITERAL(-4), %r10
+   .if \is_string_init
+   // Ignore the first argument
+   subl $$1, %edi
+   addl $$1, %ecx
+   .elseif !\is_static
+   subl $$1, %edi
+   addl $$1, %ecx
+   .endif
+
+   testl %edi, %edi
+   je 2f
+   leaq  (rREFS, %rcx, 4), %rax  # pointer to first argument in reference array
+   leaq  (%rax, %rdi, 4), %rax   # pointer to last argument in reference array
+   leaq  (rFP, %rcx, 4), %rcx    # pointer to first argument in register array
+   leaq  (%rcx, %rdi, 4), %rdi   # pointer to last argument in register array
+   // TODO: Same comment for copying arguments as in SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE.
+1:
+   movl  -4(%rax), %edx
+   movl  %edx, (rNEW_FP, %r10, 1)
+   movl  -4(%rdi), %edx
+   movl  %edx, (%r11, %r10, 1)
+   subq  MACRO_LITERAL(4), %r10
+   subq  MACRO_LITERAL(4), %rax
+   subq  MACRO_LITERAL(4), %rdi
+   cmpq  %rcx, %rdi
+   jne 1b
+
+2:
+   .if \is_string_init
+   // Ignore first argument
+   .elseif !\is_static
+   movl        %esi, (rNEW_FP, %r10, 1)
+   movl        %esi, (%r11, %r10, 1)
+   .endif
+   movq rNEW_FP, rFP
+   movq rNEW_REFS, rREFS
+   CFI_DEFINE_CFA_DEREF(CFI_REFS, -8, (6 + 4 + 1) * 8)
+   START_EXECUTING_INSTRUCTIONS
+.endm
+
+.macro GET_SHORTY dest, is_interface, is_polymorphic, is_custom
+   push %rdi
+   push %rsi
+   .if \is_polymorphic
+   movq 16(%rsp), %rdi
+   movq rPC, %rsi
+   call SYMBOL(NterpGetShortyFromInvokePolymorphic)
+   .elseif \is_custom
+   movq 16(%rsp), %rdi
+   movq rPC, %rsi
+   call SYMBOL(NterpGetShortyFromInvokeCustom)
+   .elseif \is_interface
+   movq 16(%rsp), %rdi
+   movzwl 2(rPC), %esi
+   call SYMBOL(NterpGetShortyFromMethodId)
+   .else
+   call SYMBOL(NterpGetShorty)
+   .endif
+   pop %rsi
+   pop %rdi
+   movq %rax, \dest
+.endm
+
+.macro DO_ENTRY_POINT_CHECK call_compiled_code
+   // On entry, the method is %rdi, the instance is %rsi
+   leaq ExecuteNterpImpl(%rip), %rax
+   cmpq %rax, ART_METHOD_QUICK_CODE_OFFSET_64(%rdi)
+   jne  VAR(call_compiled_code)
+
+   // TODO: Get code item in a better way and remove below
+   push %rdi
+   push %rsi
+   call SYMBOL(NterpGetCodeItem)
+   pop %rsi
+   pop %rdi
+   // TODO: Get code item in a better way and remove above
+.endm
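+
+// In other words: stay on the fast nterp-to-nterp path only when the callee's
+// quick entrypoint is nterp itself; anything else (JIT/AOT code, stubs) takes
+// the compiled-code path. On the nterp path, %rax then holds the code item.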
+
+// Uses r9 and r10 as temporaries.
+.macro UPDATE_REGISTERS_FOR_STRING_INIT old_value, new_value
+   movq rREFS, %r9
+   movq rFP, %r10
+1:
+   cmpl (%r9), \old_value
+   jne 2f
+   movl \new_value, (%r9)
+   movl \new_value, (%r10)
+2:
+   addq $$4, %r9
+   addq $$4, %r10
+   cmpq %r9, rFP
+   jne 1b
+.endm
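+
+// In pseudocode: for each vreg slot i in [rREFS, rFP),
+//   if (refs[i] == old_value) { refs[i] = new_value; regs[i] = new_value; }
+// String.<init> produces a new object, so every copy of the uninitialized
+// 'this' reference in the frame must be rewritten to the result.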
+
+.macro COMMON_INVOKE_NON_RANGE is_static=0, is_interface=0, suffix="", is_string_init=0, is_polymorphic=0, is_custom=0
+   .if \is_polymorphic
+   // We always go to compiled code for polymorphic calls.
+   .elseif \is_custom
+   // We always go to compiled code for custom calls.
+   .else
+     DO_ENTRY_POINT_CHECK .Lcall_compiled_code_\suffix
+     .if \is_string_init
+     call nterp_to_nterp_string_init_non_range
+     .elseif \is_static
+     call nterp_to_nterp_static_non_range
+     .else
+     call nterp_to_nterp_instance_non_range
+     .endif
+     jmp .Ldone_return_\suffix
+   .endif
+
+.Lcall_compiled_code_\suffix:
+   GET_SHORTY rINSTq, \is_interface, \is_polymorphic, \is_custom
+   // From this point:
+   // - rINSTq contains shorty (in callee-save to switch over return value after call).
+   // - rdi contains method
+   // - rsi contains 'this' pointer for instance method.
+   leaq 1(rINSTq), %r9  // shorty + 1  ; ie skip return arg character
+   movzwl 4(rPC), %r11d // arguments
+   .if \is_string_init
+   shrq MACRO_LITERAL(4), %r11
+   movq $$1, %r10       // ignore first argument
+   .elseif \is_static
+   movq $$0, %r10       // arg_index
+   .else
+   shrq MACRO_LITERAL(4), %r11
+   movq $$1, %r10       // arg_index
+   .endif
+   LOOP_OVER_SHORTY_LOADING_XMMS xmm0, r11, r9, r10, .Lxmm_setup_finished_\suffix
+   LOOP_OVER_SHORTY_LOADING_XMMS xmm1, r11, r9, r10, .Lxmm_setup_finished_\suffix
+   LOOP_OVER_SHORTY_LOADING_XMMS xmm2, r11, r9, r10, .Lxmm_setup_finished_\suffix
+   LOOP_OVER_SHORTY_LOADING_XMMS xmm3, r11, r9, r10, .Lxmm_setup_finished_\suffix
+   LOOP_OVER_SHORTY_LOADING_XMMS xmm4, r11, r9, r10, .Lxmm_setup_finished_\suffix
+.Lxmm_setup_finished_\suffix:
+   leaq 1(rINSTq), %r9  // shorty + 1  ; ie skip return arg character
+   movzwl 4(rPC), %r11d // arguments
+   .if \is_string_init
+   movq $$1, %r10       // ignore first argument
+   shrq MACRO_LITERAL(4), %r11
+   LOOP_OVER_SHORTY_LOADING_GPRS rsi, esi, r11, r9, r10, .Lgpr_setup_finished_\suffix
+   .elseif \is_static
+   movq $$0, %r10       // arg_index
+   LOOP_OVER_SHORTY_LOADING_GPRS rsi, esi, r11, r9, r10, .Lgpr_setup_finished_\suffix
+   .else
+   shrq MACRO_LITERAL(4), %r11
+   movq $$1, %r10       // arg_index
+   .endif
+   LOOP_OVER_SHORTY_LOADING_GPRS rdx, edx, r11, r9, r10, .Lgpr_setup_finished_\suffix
+   LOOP_OVER_SHORTY_LOADING_GPRS rcx, ecx, r11, r9, r10, .Lgpr_setup_finished_\suffix
+   LOOP_OVER_SHORTY_LOADING_GPRS r8, r8d, r11, r9, r10, .Lgpr_setup_finished_\suffix
+   LOOP_OVER_SHORTY_LOADING_GPRS r9, r9d, r11, r9, r10, .Lgpr_setup_finished_\suffix
+.Lgpr_setup_finished_\suffix:
+   .if \is_polymorphic
+   call SYMBOL(art_quick_invoke_polymorphic)
+   .elseif \is_custom
+   call SYMBOL(art_quick_invoke_custom)
+   .else
+      .if \is_interface
+      movzwl 2(rPC), %eax
+      .endif
+      call *ART_METHOD_QUICK_CODE_OFFSET_64(%rdi) // Call the method.
+   .endif
+   cmpb LITERAL(68), (rINSTq)       // Test if result type char == 'D'.
+   je .Lreturn_double_\suffix
+   cmpb LITERAL(70), (rINSTq)       // Test if result type char == 'F'.
+   jne .Ldone_return_\suffix
+.Lreturn_float_\suffix:
+   movd %xmm0, %eax
+   jmp .Ldone_return_\suffix
+.Lreturn_double_\suffix:
+   movq %xmm0, %rax
+.Ldone_return_\suffix:
+   /* resume execution of caller */
+   .if \is_string_init
+   movzwl 4(rPC), %r11d // arguments
+   andq $$0xf, %r11
+   GET_VREG %esi, %r11
+   UPDATE_REGISTERS_FOR_STRING_INIT %esi, %eax
+   .endif
+
+   .if \is_polymorphic
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 4
+   .else
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+   .endif
+.endm
+
+.macro COMMON_INVOKE_RANGE is_static=0, is_interface=0, suffix="", is_string_init=0, is_polymorphic=0, is_custom=0
+   .if \is_polymorphic
+   // We always go to compiled code for polymorphic calls.
+   .elseif \is_custom
+   // We always go to compiled code for custom calls.
+   .else
+     DO_ENTRY_POINT_CHECK .Lcall_compiled_code_range_\suffix
+     .if \is_string_init
+     call nterp_to_nterp_string_init_range
+     .elseif \is_static
+     call nterp_to_nterp_static_range
+     .else
+     call nterp_to_nterp_instance_range
+     .endif
+     jmp .Ldone_return_range_\suffix
+   .endif
+
+.Lcall_compiled_code_range_\suffix:
+   GET_SHORTY rINSTq, \is_interface, \is_polymorphic, \is_custom
+   // From this point:
+   // - rINSTq contains shorty (in callee-save to switch over return value after call).
+   // - rdi contains method
+   // - rsi contains 'this' pointer for instance method.
+   leaq 1(rINSTq), %r9  // shorty + 1  ; ie skip return arg character
+   movzwl 4(rPC), %r10d // arg start index
+   .if \is_string_init
+   addq $$1, %r10       // arg start index
+   movq $$1, %rbp       // index in stack
+   .elseif \is_static
+   movq $$0, %rbp       // index in stack
+   .else
+   addq $$1, %r10       // arg start index
+   movq $$1, %rbp       // index in stack
+   .endif
+   LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm0, r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm1, r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm2, r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm3, r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm4, r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm5, r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm6, r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_XMMS xmm7, r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_FPs r9, r10, rbp, .Lxmm_setup_finished_range_\suffix
+.Lxmm_setup_finished_range_\suffix:
+   leaq 1(%rbx), %r11  // shorty + 1  ; ie skip return arg character
+   movzwl 4(rPC), %r10d // arg start index
+   .if \is_string_init
+   addq $$1, %r10       // arg start index
+   movq $$1, %rbp       // index in stack
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS rsi, esi, r11, r10, rbp, .Lgpr_setup_finished_range_\suffix
+   .elseif \is_static
+   movq $$0, %rbp // index in stack
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS rsi, esi, r11, r10, rbp, .Lgpr_setup_finished_range_\suffix
+   .else
+   addq $$1, %r10       // arg start index
+   movq $$1, %rbp // index in stack
+   .endif
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS rdx, edx, r11, r10, rbp, .Lgpr_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS rcx, ecx, r11, r10, rbp, .Lgpr_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS r8, r8d, r11, r10, rbp, .Lgpr_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS r9, r9d, r11, r10, rbp, .Lgpr_setup_finished_range_\suffix
+   LOOP_RANGE_OVER_INTs r11, r10, rbp, .Lgpr_setup_finished_range_\suffix
+
+.Lgpr_setup_finished_range_\suffix:
+   .if \is_polymorphic
+   call SYMBOL(art_quick_invoke_polymorphic)
+   .elseif \is_custom
+   call SYMBOL(art_quick_invoke_custom)
+   .else
+     .if \is_interface
+     movzwl 2(rPC), %eax
+     .endif
+     call *ART_METHOD_QUICK_CODE_OFFSET_64(%rdi) // Call the method.
+   .endif
+   cmpb LITERAL(68), (%rbx)       // Test if result type char == 'D'.
+   je .Lreturn_range_double_\suffix
+   cmpb LITERAL(70), (%rbx)       // Test if result type char == 'F'.
+   je .Lreturn_range_float_\suffix
+   /* resume execution of caller */
+.Ldone_return_range_\suffix:
+   .if \is_string_init
+   movzwl 4(rPC), %r11d // arguments
+   GET_VREG %esi, %r11
+   UPDATE_REGISTERS_FOR_STRING_INIT %esi, %eax
+   .endif
+
+   .if \is_polymorphic
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 4
+   .else
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+   .endif
+.Lreturn_range_double_\suffix:
+    movq %xmm0, %rax
+    jmp .Ldone_return_range_\suffix
+.Lreturn_range_float_\suffix:
+    movd %xmm0, %eax
+    jmp .Ldone_return_range_\suffix
+.endm
+
+// Fetch some information from the thread cache.
+// Uses rax and rdx as temporaries.
+.macro FETCH_FROM_THREAD_CACHE dest_reg, slow_path
+   movq rSELF:THREAD_SELF_OFFSET, %rax
+   movq rPC, %rdx
+   salq MACRO_LITERAL(THREAD_INTERPRETER_CACHE_SIZE_SHIFT), %rdx
+   andq MACRO_LITERAL(THREAD_INTERPRETER_CACHE_SIZE_MASK), %rdx
+   cmpq THREAD_INTERPRETER_CACHE_OFFSET(%rax, %rdx, 1), rPC
+   jne \slow_path
+   movq __SIZEOF_POINTER__+THREAD_INTERPRETER_CACHE_OFFSET(%rax, %rdx, 1), \dest_reg
+.endm
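+
+// Roughly, treating the cache as an array of {dex_pc_ptr, value} pairs:
+//   offset = ((uintptr_t)rPC << SIZE_SHIFT) & SIZE_MASK;
+//   entry  = self->interpreter_cache + offset;
+//   if (entry->dex_pc_ptr != rPC) goto slow_path;
+//   dest_reg = entry->value;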
+
+// Helper for static field get.
+.macro OP_SGET load="movl", wide="0"
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   movl ART_FIELD_OFFSET_OFFSET(%rax), %edx
+   movl ART_FIELD_DECLARING_CLASS_OFFSET(%rax), %eax
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 3f
+4:
+   .if \wide
+   movq (%eax,%edx,1), %rax
+   SET_WIDE_VREG %rax, rINSTq              # fp[A] <- value
+   .else
+   \load (%eax, %edx, 1), %eax
+   SET_VREG %eax, rINSTq            # fp[A] <- value
+   .endif
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   EXPORT_PC
+   call nterp_get_static_field
+   // Clear the marker that we put for volatile fields. The x86 memory
+   // model doesn't require a barrier.
+   CLEAR_VOLATILE_MARKER %rax
+   jmp 1b
+3:
+   call art_quick_read_barrier_mark_reg00
+   jmp 4b
+.endm
+
+// Helper for static field put.
+.macro OP_SPUT rINST_reg="rINST", store="movl", wide="0"
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   movl ART_FIELD_OFFSET_OFFSET(%rax), %edx
+   movl ART_FIELD_DECLARING_CLASS_OFFSET(%rax), %eax
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 3f
+4:
+   .if \wide
+   GET_WIDE_VREG rINSTq, rINSTq           # rINST <- v[A]
+   .else
+   GET_VREG rINST, rINSTq                  # rINST <- v[A]
+   .endif
+   \store    \rINST_reg, (%rax,%rdx,1)
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   EXPORT_PC
+   call nterp_get_static_field
+   testq MACRO_LITERAL(1), %rax
+   je 1b
+   // Clear the marker that we put for volatile fields. The x86 memory
+   // model doesn't require a barrier.
+   CLEAR_VOLATILE_MARKER %rax
+   movl ART_FIELD_OFFSET_OFFSET(%rax), %edx
+   movl ART_FIELD_DECLARING_CLASS_OFFSET(%rax), %eax
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 6f
+5:
+   .if \wide
+   GET_WIDE_VREG rINSTq, rINSTq           # rINST <- v[A]
+   .else
+   GET_VREG rINST, rINSTq                  # rINST <- v[A]
+   .endif
+   \store    \rINST_reg, (%rax,%rdx,1)
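+   // 'lock addl $0, (%rsp)' is the usual x86-64 store/load barrier idiom:
+   // volatile stores need a full fence, and this is cheaper than mfence.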
+   lock addl $$0, (%rsp)
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+3:
+   call art_quick_read_barrier_mark_reg00
+   jmp 4b
+6:
+   call art_quick_read_barrier_mark_reg00
+   jmp 5b
+.endm
+
+
+.macro OP_IPUT_INTERNAL rINST_reg="rINST", store="movl", wide="0"
+   movzbq  rINSTbl, %rcx                   # rcx <- BA
+   sarl    $$4, %ecx                       # ecx <- B
+   GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+   testl   %ecx, %ecx                      # is object null?
+   je      common_errNullObject
+   andb    $$0xf, rINSTbl                  # rINST <- A
+   .if \wide
+   GET_WIDE_VREG rINSTq, rINSTq              # rax<- fp[A]/fp[A+1]
+   .else
+   GET_VREG rINST, rINSTq                  # rINST <- v[A]
+   .endif
+   \store \rINST_reg, (%rcx,%rax,1)
+.endm
+
+// Helper for instance field put.
+.macro OP_IPUT rINST_reg="rINST", store="movl", wide="0"
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   OP_IPUT_INTERNAL \rINST_reg, \store, \wide
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   EXPORT_PC
+   call nterp_get_instance_field_offset
+   testl %eax, %eax
+   jns 1b
+   negl %eax
+   OP_IPUT_INTERNAL \rINST_reg, \store, \wide
+   lock addl $$0, (%rsp)
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+.endm
+
+// Helper for instance field get.
+.macro OP_IGET load="movl", wide="0"
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   movl    rINST, %ecx                     # rcx <- BA
+   sarl    $$4, %ecx                       # ecx <- B
+   GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+   testl   %ecx, %ecx                      # is object null?
+   je      common_errNullObject
+   andb    $$0xf,rINSTbl                   # rINST <- A
+   .if \wide
+   movq (%rcx,%rax,1), %rax
+   SET_WIDE_VREG %rax, rINSTq              # fp[A] <- value
+   .else
+   \load (%rcx,%rax,1), %eax
+   SET_VREG %eax, rINSTq                   # fp[A] <- value
+   .endif
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   EXPORT_PC
+   call nterp_get_instance_field_offset
+   testl %eax, %eax
+   jns 1b
+   negl %eax
+   jmp 1b
+.endm
+
+%def entry():
+/*
+ * ArtMethod entry point.
+ *
+ * On entry:
+ *  rdi   ArtMethod* callee
+ *  rest  method parameters
+ */
+
+OAT_ENTRY ExecuteNterpImpl, EndExecuteNterpImpl
+    .cfi_startproc
+    .cfi_def_cfa rsp, 8
+    testq %rax, -STACK_OVERFLOW_RESERVED_BYTES(%rsp)
+    /* Spill callee save regs */
+    SPILL_ALL_CALLEE_SAVES
+
+    // TODO: Get shorty in a better way and remove below
+    PUSH rdi
+    PUSH rsi
+    PUSH rdx
+    PUSH rcx
+    PUSH r8
+    PUSH r9
+
+    // Save xmm registers + alignment.
+    subq MACRO_LITERAL(8 * 8 + 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(8 * 8 + 8)
+    movq %xmm0, 0(%rsp)
+    movq %xmm1, 8(%rsp)
+    movq %xmm2, 16(%rsp)
+    movq %xmm3, 24(%rsp)
+    movq %xmm4, 32(%rsp)
+    movq %xmm5, 40(%rsp)
+    movq %xmm6, 48(%rsp)
+    movq %xmm7, 56(%rsp)
+
+    // Save method in callee-save rbx.
+    movq %rdi, %rbx
+    call SYMBOL(NterpGetShorty)
+    // Save shorty in callee-save rbp.
+    movq %rax, %rbp
+    movq %rbx, %rdi
+    call SYMBOL(NterpGetCodeItem)
+    movq %rax, rPC
+
+    // Restore xmm registers + alignment.
+    movq 0(%rsp), %xmm0
+    movq 8(%rsp), %xmm1
+    movq 16(%rsp), %xmm2
+    movq 24(%rsp), %xmm3
+    movq 32(%rsp), %xmm4
+    movq 40(%rsp), %xmm5
+    movq 48(%rsp), %xmm6
+    movq 56(%rsp), %xmm7
+    addq MACRO_LITERAL(8 * 8 + 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-8 * 8 - 8)
+
+    POP r9
+    POP r8
+    POP rcx
+    POP rdx
+    POP rsi
+    POP rdi
+    // TODO: Get shorty in a better way and remove above
+
+    movq %rsp, %r14  // Save stack pointer
+    CFI_DEF_CFA_REGISTER(r14)
+
+    // Create space for registers * 2. Set rFP and rREFS.
+    movzwl CODE_ITEM_REGISTERS_SIZE_OFFSET(rPC), %ebx
+    sall $$2, %ebx
+    subq %rbx, %rsp
+    movq %rsp, rFP
+    subq %rbx, %rsp
+    movq %rsp, rREFS
+    // Put nulls in reference frame.
+    testl %ebx, %ebx
+    je .Ldone_clearing_references
+    movq rREFS, %r11
+.Lclear_references:
+    movl $$0, (%r11)
+    addq $$4, %r11
+    cmpq %r11, rFP
+    jne .Lclear_references
+.Ldone_clearing_references:
+
+    // Create space for the previous frame, saved pc, and method being called
+    subq $$24, %rsp
+
+    // Save the previous frame.
+    movq %r14, -8(rREFS)
+    CFI_DEFINE_CFA_DEREF(CFI_REFS, -8, (6 + 4 + 1) * 8)
+
+    // Take space for outs.
+    movzwl CODE_ITEM_OUTS_SIZE_OFFSET(rPC), %r11d
+    sall $$2, %r11d
+    subq %r11, %rsp
+
+    // Align stack pointer to 16.
+    andq $$-16, %rsp
+
+    // Save the ArtMethod.
+    movq %rdi, (%rsp)
+
+    // Set up the parameters.
+    movzwl CODE_ITEM_INS_SIZE_OFFSET(rPC), %r11d
+    testl %r11d, %r11d
+    je .Lgpr_setup_finished
+
+    sall $$2, %r11d
+    subq %r11, %rbx // rbx is now the offset for inputs into the registers array.
+
+    // Available r11, rbx, rdi, r10
+    testl $$ART_METHOD_IS_STATIC_FLAG, ART_METHOD_ACCESS_FLAGS_OFFSET(%rdi)
+    // Note: the leaq instructions below don't change the flags.
+    leaq 1(%rbp), %r10  // shorty + 1  ; ie skip return arg character
+    leaq (rFP, %rbx, 1), %rdi
+    leaq (rREFS, %rbx, 1), %rbx
+    jne .Lhandle_static_method
+    movl %esi, (%rdi)
+    movl %esi, (%rbx)
+    addq $$4, %rdi
+    addq $$4, %rbx
+    addq $$4, %r14
+    movq $$0, %r11
+    jmp .Lcontinue_setup_gprs
+.Lhandle_static_method:
+    movq $$0, %r11
+    LOOP_OVER_SHORTY_STORING_GPRS rsi, esi, r10, r11, rdi, rbx, .Lgpr_setup_finished
+.Lcontinue_setup_gprs:
+    LOOP_OVER_SHORTY_STORING_GPRS rdx, edx, r10, r11, rdi, rbx, .Lgpr_setup_finished
+    LOOP_OVER_SHORTY_STORING_GPRS rcx, ecx, r10, r11, rdi, rbx, .Lgpr_setup_finished
+    LOOP_OVER_SHORTY_STORING_GPRS r8, r8d, r10, r11, rdi, rbx, .Lgpr_setup_finished
+    LOOP_OVER_SHORTY_STORING_GPRS r9, r9d, r10, r11, rdi, rbx, .Lgpr_setup_finished
+    LOOP_OVER_INTs r10, r11, rdi, rbx, r14, .Lgpr_setup_finished
+.Lgpr_setup_finished:
+    leaq 1(%rbp), %r10  // shorty + 1  ; ie skip return arg character
+    movq $$0, %r11 // reset counter
+    LOOP_OVER_SHORTY_STORING_XMMS xmm0, r10, r11, rdi, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_STORING_XMMS xmm1, r10, r11, rdi, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_STORING_XMMS xmm2, r10, r11, rdi, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_STORING_XMMS xmm3, r10, r11, rdi, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_STORING_XMMS xmm4, r10, r11, rdi, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_STORING_XMMS xmm5, r10, r11, rdi, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_STORING_XMMS xmm6, r10, r11, rdi, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_STORING_XMMS xmm7, r10, r11, rdi, .Lxmm_setup_finished
+    LOOP_OVER_FPs r10, r11, rdi, r14, .Lxmm_setup_finished
+.Lxmm_setup_finished:
+    // Set the dex pc pointer.
+    addq $$CODE_ITEM_INSNS_OFFSET, rPC
+    CFI_DEFINE_DEX_PC_WITH_OFFSET(CFI_TMP, CFI_DEX, 0)
+
+    // Set rIBASE
+    leaq artNterpAsmInstructionStart(%rip), rIBASE
+    /* start executing the instruction at rPC */
+    FETCH_INST
+    GOTO_NEXT
+    /* NOTE: no fallthrough */
+    // cfi info continues, and covers the whole nterp implementation.
+    END ExecuteNterpImpl
+
+%def opcode_pre():
+
+%def helpers():
+
+%def footer():
+/*
+ * ===========================================================================
+ *  Common subroutines and data
+ * ===========================================================================
+ */
+
+    .text
+    .align  2
+
+// Note: mterp also uses the common_* names below for helpers, but that's OK
+// as each interpreter is assembled separately, so the labels don't clash.
+common_errDivideByZero:
+    EXPORT_PC
+    call art_quick_throw_div_zero
+
+common_errArrayIndex:
+    EXPORT_PC
+    movl MIRROR_ARRAY_LENGTH_OFFSET(%edi), %eax
+    movl %esi, %edi
+    movl %eax, %esi
+    call art_quick_throw_array_bounds
+
+common_errNullObject:
+    EXPORT_PC
+    call art_quick_throw_null_pointer_exception
+
+NterpCommonInvokeStatic:
+    COMMON_INVOKE_NON_RANGE is_static=1, is_interface=0, suffix="invokeStatic"
+
+NterpCommonInvokeStaticRange:
+    COMMON_INVOKE_RANGE is_static=1, is_interface=0, suffix="invokeStatic"
+
+NterpCommonInvokeInstance:
+    COMMON_INVOKE_NON_RANGE is_static=0, is_interface=0, suffix="invokeInstance"
+
+NterpCommonInvokeInstanceRange:
+    COMMON_INVOKE_RANGE is_static=0, is_interface=0, suffix="invokeInstance"
+
+NterpCommonInvokeInterface:
+    COMMON_INVOKE_NON_RANGE is_static=0, is_interface=1, suffix="invokeInterface"
+
+NterpCommonInvokeInterfaceRange:
+    COMMON_INVOKE_RANGE is_static=0, is_interface=1, suffix="invokeInterface"
+
+NterpCommonInvokePolymorphic:
+    COMMON_INVOKE_NON_RANGE is_static=0, is_interface=0, is_string_init=0, is_polymorphic=1, suffix="invokePolymorphic"
+
+NterpCommonInvokePolymorphicRange:
+    COMMON_INVOKE_RANGE is_static=0, is_interface=0, is_polymorphic=1, suffix="invokePolymorphic"
+
+NterpCommonInvokeCustom:
+    COMMON_INVOKE_NON_RANGE is_static=1, is_interface=0, is_string_init=0, is_polymorphic=0, is_custom=1, suffix="invokeCustom"
+
+NterpCommonInvokeCustomRange:
+    COMMON_INVOKE_RANGE is_static=1, is_interface=0, is_polymorphic=0, is_custom=1, suffix="invokeCustom"
+
+NterpHandleStringInit:
+   COMMON_INVOKE_NON_RANGE is_static=0, is_interface=0, is_string_init=1, suffix="stringInit"
+
+NterpHandleStringInitRange:
+   COMMON_INVOKE_RANGE is_static=0, is_interface=0, is_string_init=1, suffix="stringInit"
+
+NterpNewInstance:
+   EXPORT_PC
+   // Fast-path which gets the class from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rdi, 2f
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 3f
+4:
+   callq *rSELF:THREAD_ALLOC_OBJECT_ENTRYPOINT_OFFSET
+1:
+   SET_VREG_OBJECT %eax, rINSTq            # fp[A] <- value
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_class_or_allocate_object
+   jmp 1b
+3:
+   // 07 is %rdi
+   call art_quick_read_barrier_mark_reg07
+   jmp 4b
+
+NterpNewArray:
+   /* new-array vA, vB, class@CCCC */
+   EXPORT_PC
+   // Fast-path which gets the class from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rdi, 2f
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 3f
+1:
+   movzbl  rINSTbl,%esi
+   sarl    $$4,%esi                          # esi<- B
+   GET_VREG %esi, %rsi                       # esi<- vB (array length)
+   andb    $$0xf,rINSTbl                     # rINST<- A
+   callq *rSELF:THREAD_ALLOC_ARRAY_ENTRYPOINT_OFFSET
+   SET_VREG_OBJECT %eax, rINSTq            # fp[A] <- value
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_class_or_allocate_object
+   movq %rax, %rdi
+   jmp 1b
+3:
+   // 07 is %rdi
+   call art_quick_read_barrier_mark_reg07
+   jmp 1b
+
+NterpPutObjectInstanceField:
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   movzbq  rINSTbl, %rcx                   # rcx <- BA
+   sarl    $$4, %ecx                       # ecx <- B
+   GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+   testl   %ecx, %ecx                      # is object null?
+   je      common_errNullObject
+   andb    $$0xf, rINSTbl                  # rINST <- A
+   GET_VREG rINST, rINSTq                  # rINST <- v[A]
+   movl rINST, (%rcx,%rax,1)
+   testl rINST, rINST
+   je 4f
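+   // Non-null reference stored: mark the GC card for the holder object
+   // (card-marking write barrier: card_table_base + (object >> CARD_TABLE_CARD_SHIFT)).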
+   movq rSELF:THREAD_CARD_TABLE_OFFSET, %rax
+   shrq $$CARD_TABLE_CARD_SHIFT, %rcx
+   movb %al, (%rax, %rcx, 1)
+4:
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   EXPORT_PC
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_instance_field_offset
+   testl %eax, %eax
+   jns 1b
+   negl %eax
+   movzbq  rINSTbl, %rcx                   # rcx <- BA
+   sarl    $$4, %ecx                       # ecx <- B
+   GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+   testl   %ecx, %ecx                      # is object null?
+   je      common_errNullObject
+   andb    $$0xf, rINSTbl                  # rINST <- A
+   GET_VREG rINST, rINSTq                  # rINST <- v[A]
+   movl rINST, (%rcx,%rax,1)
+   testl rINST, rINST
+   je 5f
+   movq rSELF:THREAD_CARD_TABLE_OFFSET, %rax
+   shrq $$CARD_TABLE_CARD_SHIFT, %rcx
+   movb %al, (%rcx, %rax, 1)
+5:
+   lock addl $$0, (%rsp)
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+NterpGetObjectInstanceField:
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   movl    rINST, %ecx                     # rcx <- BA
+   sarl    $$4, %ecx                       # ecx <- B
+   GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+   testl   %ecx, %ecx                      # is object null?
+   je      common_errNullObject
+   testb $$READ_BARRIER_TEST_VALUE, GRAY_BYTE_OFFSET(%ecx)
+   movl (%rcx,%rax,1), %eax
+   jnz 3f
+4:
+   andb    $$0xf,rINSTbl                   # rINST <- A
+   SET_VREG_OBJECT %eax, rINSTq            # fp[A] <- value
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   EXPORT_PC
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_instance_field_offset
+   testl %eax, %eax
+   jns 1b
+   // For volatile fields, we return a negative offset. Remove the sign
+   // and no need for any barrier thanks to the memory model.
+   negl %eax
+   jmp 1b
+3:
+   // reg00 is eax
+   call art_quick_read_barrier_mark_reg00
+   jmp 4b
+
+NterpPutObjectStaticField:
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   movl ART_FIELD_OFFSET_OFFSET(%rax), %edx
+   movl ART_FIELD_DECLARING_CLASS_OFFSET(%rax), %eax
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 3f
+5:
+   GET_VREG %ecx, rINSTq
+   movl %ecx, (%eax, %edx, 1)
+   testl %ecx, %ecx
+   je 4f
+   movq rSELF:THREAD_CARD_TABLE_OFFSET, %rcx
+   shrq $$CARD_TABLE_CARD_SHIFT, %rax
+   movb %cl, (%rax, %rcx, 1)
+4:
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   EXPORT_PC
+   call nterp_get_static_field
+   testq MACRO_LITERAL(1), %rax
+   je 1b
+   CLEAR_VOLATILE_MARKER %rax
+   movl ART_FIELD_OFFSET_OFFSET(%rax), %edx
+   movl ART_FIELD_DECLARING_CLASS_OFFSET(%rax), %eax
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 7f
+6:
+   movzbl rINSTbl, %ecx
+   GET_VREG %ecx, %rcx
+   movl %ecx, (%eax, %edx, 1)
+   testl %ecx, %ecx
+   je 8f
+   movq rSELF:THREAD_CARD_TABLE_OFFSET, %rcx
+   shrq $$CARD_TABLE_CARD_SHIFT, %rax
+   movb %cl, (%rax, %rcx, 1)
+8:
+   lock addl $$0, (%rsp)
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+3:
+   call art_quick_read_barrier_mark_reg00
+   jmp 5b
+7:
+   call art_quick_read_barrier_mark_reg00
+   jmp 6b
+
+NterpGetObjectStaticField:
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+1:
+   movl ART_FIELD_OFFSET_OFFSET(%rax), %edx
+   movl ART_FIELD_DECLARING_CLASS_OFFSET(%rax), %eax
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 5f
+6:
+   testb $$READ_BARRIER_TEST_VALUE, GRAY_BYTE_OFFSET(%eax)
+   movl (%eax, %edx, 1), %eax
+   jnz 3f
+4:
+   SET_VREG_OBJECT %eax, rINSTq            # fp[A] <- value
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   EXPORT_PC
+   call nterp_get_static_field
+   CLEAR_VOLATILE_MARKER %rax
+   jmp 1b
+3:
+   call art_quick_read_barrier_mark_reg00
+   jmp 4b
+5:
+   call art_quick_read_barrier_mark_reg00
+   jmp 6b
+
+NterpGetBooleanStaticField:
+  OP_SGET load="movsbl", wide=0
+
+NterpGetByteStaticField:
+  OP_SGET load="movsbl", wide=0
+
+NterpGetCharStaticField:
+  OP_SGET load="movzwl", wide=0
+
+NterpGetShortStaticField:
+  OP_SGET load="movswl", wide=0
+
+NterpGetWideStaticField:
+  OP_SGET load="movq", wide=1
+
+NterpGetIntStaticField:
+  OP_SGET load="movl", wide=0
+
+NterpPutStaticField:
+  OP_SPUT rINST_reg=rINST, store="movl", wide=0
+
+NterpPutBooleanStaticField:
+NterpPutByteStaticField:
+  OP_SPUT rINST_reg=rINSTbl, store="movb", wide=0
+
+NterpPutCharStaticField:
+NterpPutShortStaticField:
+  OP_SPUT rINST_reg=rINSTw, store="movw", wide=0
+
+NterpPutWideStaticField:
+  OP_SPUT rINST_reg=rINSTq, store="movq", wide=1
+
+NterpPutInstanceField:
+  OP_IPUT rINST_reg=rINST, store="movl", wide=0
+
+NterpPutBooleanInstanceField:
+NterpPutByteInstanceField:
+  OP_IPUT rINST_reg=rINSTbl, store="movb", wide=0
+
+NterpPutCharInstanceField:
+NterpPutShortInstanceField:
+  OP_IPUT rINST_reg=rINSTw, store="movw", wide=0
+
+NterpPutWideInstanceField:
+  OP_IPUT rINST_reg=rINSTq, store="movq", wide=1
+
+NterpGetBooleanInstanceField:
+  OP_IGET load="movzbl", wide=0
+
+NterpGetByteInstanceField:
+  OP_IGET load="movsbl", wide=0
+
+NterpGetCharInstanceField:
+  OP_IGET load="movzwl", wide=0
+
+NterpGetShortInstanceField:
+  OP_IGET load="movswl", wide=0
+
+NterpGetWideInstanceField:
+  OP_IGET load="movq", wide=1
+
+NterpGetInstanceField:
+  OP_IGET load="movl", wide=0
+
+NterpInstanceOf:
+    /* instance-of vA, vB, class@CCCC */
+   // Fast-path which gets the class from thread-local cache.
+   EXPORT_PC
+   FETCH_FROM_THREAD_CACHE %rsi, 2f
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 5f
+1:
+   movzbl  rINSTbl,%edi
+   sarl    $$4,%edi                          # edi<- B
+   GET_VREG %edi, %rdi                       # edi<- vB (object)
+   andb    $$0xf,rINSTbl                     # rINST<- A
+   testl %edi, %edi
+   je 3f
+   call art_quick_instance_of
+   SET_VREG %eax, rINSTq            # fp[A] <- value
+4:
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+3:
+   SET_VREG %edi, rINSTq            # fp[A] <- 0
+   jmp 4b
+2:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_class_or_allocate_object
+   movq %rax, %rsi
+   jmp 1b
+5:
+   // 06 is %rsi
+   call art_quick_read_barrier_mark_reg06
+   jmp 1b
+
+NterpCheckCast:
+   // Fast-path which gets the class from thread-local cache.
+   EXPORT_PC
+   FETCH_FROM_THREAD_CACHE %rsi, 3f
+   cmpq $$0, rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 4f
+1:
+   GET_VREG %edi, rINSTq
+   testl %edi, %edi
+   je 2f
+   call art_quick_check_instance_of
+2:
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+3:
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call nterp_get_class_or_allocate_object
+   movq %rax, %rsi
+   jmp 1b
+4:
+   // 06 is %rsi
+   call art_quick_read_barrier_mark_reg06
+   jmp 1b
+
+NterpHandleHotnessOverflow:
+    leaq (rPC, rINSTq, 2), %rsi
+    movq rFP, %rdx
+    call nterp_hot_method
+    testq %rax, %rax
+    jne 1f
+    leaq    (rPC, rINSTq, 2), rPC
+    FETCH_INST
+    GOTO_NEXT
+1:
+    // Drop the current frame.
+    movq -8(rREFS), %rsp
+    CFI_DEF_CFA(rsp, CALLEE_SAVES_SIZE)
+
+    // Setup the new frame
+    movq OSR_DATA_FRAME_SIZE(%rax), %rcx
+    // The given stack size includes the callee-saved registers; remove them.
+    subq $$CALLEE_SAVES_SIZE, %rcx
+
+    // Remember CFA.
+    movq %rsp, %rbp
+    CFI_DEF_CFA_REGISTER(rbp)
+
+    subq %rcx, %rsp
+    movq %rsp, %rdi               // rdi := beginning of stack
+    leaq OSR_DATA_MEMORY(%rax), %rsi  // rsi := memory to copy
+    rep movsb                     // while (rcx--) { *rdi++ = *rsi++ }
+
+    // Fetch the native PC to jump to and save it in a callee-save register.
+    movq OSR_DATA_NATIVE_PC(%rax), %rbx
+
+    // Free the memory holding the OSR data.
+    movq %rax, %rdi
+    call free
+
+    // Jump to the compiled code.
+    jmp *%rbx
+
+// This is the logical end of ExecuteNterpImpl, where the frame info applies.
+// EndExecuteNterpImpl includes the methods below as we want the runtime to
+// see them as part of the Nterp PCs.
+.cfi_endproc
+
+nterp_to_nterp_static_non_range:
+    .cfi_startproc
+    .cfi_def_cfa rsp, 8
+    SETUP_STACK_FOR_INVOKE
+    SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE is_static=1, is_string_init=0
+    .cfi_endproc
+
+nterp_to_nterp_string_init_non_range:
+    .cfi_startproc
+    .cfi_def_cfa rsp, 8
+    SETUP_STACK_FOR_INVOKE
+    SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=1
+    .cfi_endproc
+
+nterp_to_nterp_instance_non_range:
+    .cfi_startproc
+    .cfi_def_cfa rsp, 8
+    SETUP_STACK_FOR_INVOKE
+    SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=0
+    .cfi_endproc
+
+nterp_to_nterp_static_range:
+    .cfi_startproc
+    .cfi_def_cfa rsp, 8
+    SETUP_STACK_FOR_INVOKE
+    SETUP_RANGE_ARGUMENTS_AND_EXECUTE is_static=1
+    .cfi_endproc
+
+nterp_to_nterp_instance_range:
+    .cfi_startproc
+    .cfi_def_cfa rsp, 8
+    SETUP_STACK_FOR_INVOKE
+    SETUP_RANGE_ARGUMENTS_AND_EXECUTE is_static=0
+    .cfi_endproc
+
+nterp_to_nterp_string_init_range:
+    .cfi_startproc
+    .cfi_def_cfa rsp, 8
+    SETUP_STACK_FOR_INVOKE
+    SETUP_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=1
+    .cfi_endproc
+
+// This is the end of PCs contained by the OatQuickMethodHeader created for the interpreter
+// entry point.
+    FUNCTION_TYPE(EndExecuteNterpImpl)
+    ASM_HIDDEN SYMBOL(EndExecuteNterpImpl)
+    .global SYMBOL(EndExecuteNterpImpl)
+SYMBOL(EndExecuteNterpImpl):
+
+// Entrypoints into runtime.
+NTERP_TRAMPOLINE nterp_get_static_field, NterpGetStaticField
+NTERP_TRAMPOLINE nterp_get_instance_field_offset, NterpGetInstanceFieldOffset
+NTERP_TRAMPOLINE nterp_filled_new_array, NterpFilledNewArray
+NTERP_TRAMPOLINE nterp_filled_new_array_range, NterpFilledNewArrayRange
+NTERP_TRAMPOLINE nterp_get_class_or_allocate_object, NterpGetClassOrAllocateObject
+NTERP_TRAMPOLINE nterp_get_method, NterpGetMethod
+NTERP_TRAMPOLINE nterp_hot_method, NterpHotMethod
+NTERP_TRAMPOLINE nterp_load_object, NterpLoadObject
+
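+// Each NTERP_TRAMPOLINE (see the macro definition earlier in this file)
+// expands to a small stub that sets up a runtime frame, calls the
+// corresponding C++ helper in nterp.cc, and is expected to deliver any
+// pending exception before returning to the interpreter fast path.
+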
+// gen_mterp.py will inline the following definitions
+// within [ExecuteNterpImpl, EndExecuteNterpImpl).
+%def instruction_end():
+
+    FUNCTION_TYPE(artNterpAsmInstructionEnd)
+    ASM_HIDDEN SYMBOL(artNterpAsmInstructionEnd)
+    .global SYMBOL(artNterpAsmInstructionEnd)
+SYMBOL(artNterpAsmInstructionEnd):
+    // artNterpAsmInstructionEnd is used as a landing pad for exception handling.
+    FETCH_INST
+    GOTO_NEXT
+
+%def instruction_start():
+
+    FUNCTION_TYPE(artNterpAsmInstructionStart)
+    ASM_HIDDEN SYMBOL(artNterpAsmInstructionStart)
+    .global SYMBOL(artNterpAsmInstructionStart)
+SYMBOL(artNterpAsmInstructionStart) = .L_op_nop
+    .text
+
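+// Wrap each opcode handler and helper in ENTRY/END so it gets a named
+// symbol (nterp_<opcode>) with CFI information, which helps debugging and
+// unwinding through nterp code.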
+%def opcode_start():
+    ENTRY nterp_${opcode}
+%def opcode_end():
+    END nterp_${opcode}
+%def helper_start(name):
+    ENTRY ${name}
+%def helper_end(name):
+    END ${name}
diff --git a/runtime/interpreter/mterp/x86_64ng/object.S b/runtime/interpreter/mterp/x86_64ng/object.S
new file mode 100644
index 0000000..cb231e3
--- /dev/null
+++ b/runtime/interpreter/mterp/x86_64ng/object.S
@@ -0,0 +1,204 @@
+%def op_check_cast():
+  jmp NterpCheckCast
+
+%def op_iget_boolean():
+   jmp NterpGetBooleanInstanceField
+
+%def op_iget_boolean_quick():
+%  op_iget_quick(load="movsbl")
+
+%def op_iget_byte():
+   jmp NterpGetByteInstanceField
+
+%def op_iget_byte_quick():
+%  op_iget_quick(load="movsbl")
+
+%def op_iget_char():
+   jmp NterpGetCharInstanceField
+
+%def op_iget_char_quick():
+%  op_iget_quick(load="movzwl")
+
+%def op_iget_object():
+    jmp NterpGetObjectInstanceField
+
+%def op_iget_object_quick():
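+   // Baker read barrier fast path: test the holder's gray byte before the
+   // field load (movl leaves the flags untouched, so the jnz below still
+   // branches on this test); if the holder is gray, the loaded reference
+   // is marked via the reg00 (%eax) mark stub before being stored.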
+   movzwq  2(rPC), %rax                    # eax <- field byte offset
+   movl    rINST, %ecx                     # rcx <- BA
+   sarl    $$4, %ecx                       # ecx <- B
+   GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+   testl   %ecx, %ecx                      # is object null?
+   je      common_errNullObject
+   testb $$READ_BARRIER_TEST_VALUE, GRAY_BYTE_OFFSET(%ecx)
+   movl (%rcx,%rax,1), %eax
+   jnz 2f
+1:
+   andb    $$0xf,rINSTbl                   # rINST <- A
+   SET_VREG_OBJECT %eax, rINSTq            # fp[A] <- value
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+2:
+   // reg00 is eax
+   call art_quick_read_barrier_mark_reg00
+   jmp 1b
+
+%def op_iget_quick(load="movl", wide="0"):
+    /* For: iget-quick, iget-boolean-quick, iget-byte-quick, iget-char-quick, iget-short-quick, iget-wide-quick */
+    /* op vA, vB, offset@CCCC */
+    movl    rINST, %ecx                     # rcx <- BA
+    sarl    $$4, %ecx                       # ecx <- B
+    GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+    movzwq  2(rPC), %rax                    # eax <- field byte offset
+    testl   %ecx, %ecx                      # is object null?
+    je      common_errNullObject
+    andb    $$0xf,rINSTbl                   # rINST <- A
+    .if $wide
+    movq (%rcx,%rax,1), %rax
+    SET_WIDE_VREG %rax, rINSTq              # fp[A] <- value
+    .else
+    ${load} (%rcx,%rax,1), %eax
+    SET_VREG %eax, rINSTq                   # fp[A] <- value
+    .endif
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_iget_short():
+   jmp NterpGetShortInstanceField
+
+%def op_iget_short_quick():
+%  op_iget_quick(load="movswl")
+
+%def op_iget_wide():
+   jmp NterpGetWideInstanceField
+
+%def op_iget_wide_quick():
+%  op_iget_quick(load="movq", wide="1")
+
+%def op_instance_of():
+   jmp NterpInstanceOf
+
+%def op_iget():
+   jmp NterpGetInstanceField
+
+%def op_iput():
+   jmp NterpPutInstanceField
+
+%def op_iput_boolean():
+   jmp NterpPutBooleanInstanceField
+
+%def op_iput_boolean_quick():
+%  op_iput_quick(reg="rINSTbl", store="movb")
+
+%def op_iput_byte():
+   jmp NterpPutByteInstanceField
+
+%def op_iput_byte_quick():
+%  op_iput_quick(reg="rINSTbl", store="movb")
+
+%def op_iput_char():
+   jmp NterpPutCharInstanceField
+
+%def op_iput_char_quick():
+%  op_iput_quick(reg="rINSTw", store="movw")
+
+%def op_iput_object():
+    jmp NterpPutObjectInstanceField
+
+%def op_iput_object_quick():
+   movzwq  2(rPC), %rax                    # eax <- field byte offset
+   movzbq  rINSTbl, %rcx                   # rcx <- BA
+   sarl    $$4, %ecx                       # ecx <- B
+   GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+   testl   %ecx, %ecx                      # is object null?
+   je      common_errNullObject
+   andb    $$0xf, rINSTbl                  # rINST <- A
+   GET_VREG rINST, rINSTq                  # rINST <- v[A]
+   movl rINST, (%rcx,%rax,1)
+   testl rINST, rINST
+   je 1f
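+   // Write barrier for the reference store: dirty the card covering the
+   // holder object. ART biases the card table so the dirty-card value
+   // equals the low byte of the table base, hence storing %al.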
+   movq rSELF:THREAD_CARD_TABLE_OFFSET, %rax
+   shrq $$CARD_TABLE_CARD_SHIFT, %rcx
+   movb %al, (%rcx, %rax, 1)
+1:
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_iput_quick(reg="rINST", store="movl"):
+    /* For: iput-quick, iput-object-quick */
+    /* op vA, vB, offset@CCCC */
+    movzbq  rINSTbl, %rcx                   # rcx <- BA
+    sarl    $$4, %ecx                       # ecx <- B
+    GET_VREG %ecx, %rcx                     # vB (object we're operating on)
+    testl   %ecx, %ecx                      # is object null?
+    je      common_errNullObject
+    andb    $$0xf, rINSTbl                  # rINST <- A
+    GET_VREG rINST, rINSTq                  # rINST <- v[A]
+    movzwq  2(rPC), %rax                    # rax <- field byte offset
+    ${store}    ${reg}, (%rcx,%rax,1)
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_iput_short():
+   jmp NterpPutShortInstanceField
+
+%def op_iput_short_quick():
+%  op_iput_quick(reg="rINSTw", store="movw")
+
+%def op_iput_wide():
+   jmp NterpPutWideInstanceField
+
+%def op_iput_wide_quick():
+    /* iput-wide-quick vA, vB, offset@CCCC */
+    movzbq    rINSTbl, %rcx                 # rcx<- BA
+    sarl      $$4, %ecx                     # ecx<- B
+    GET_VREG  %ecx, %rcx                    # vB (object we're operating on)
+    testl     %ecx, %ecx                    # is object null?
+    je        common_errNullObject
+    movzwq    2(rPC), %rax                  # rax<- field byte offset
+    leaq      (%rcx,%rax,1), %rcx           # rcx<- address of 64-bit target
+    andb      $$0xf, rINSTbl                # rINST<- A
+    GET_WIDE_VREG %rax, rINSTq              # rax<- fp[A]/fp[A+1]
+    movq      %rax, (%rcx)                  # obj.field<- rax
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_sget(load="movl", wide="0"):
+   jmp NterpGetIntStaticField
+
+%def op_sget_boolean():
+   jmp NterpGetBooleanStaticField
+
+%def op_sget_byte():
+   jmp NterpGetByteStaticField
+
+%def op_sget_char():
+   jmp NterpGetCharStaticField
+
+%def op_sget_object():
+   jmp NterpGetObjectStaticField
+
+%def op_sget_short():
+   jmp NterpGetShortStaticField
+
+%def op_sget_wide():
+   jmp NterpGetWideStaticField
+
+%def op_sput():
+   jmp NterpPutStaticField
+
+%def op_sput_boolean():
+   jmp NterpPutBooleanStaticField
+
+%def op_sput_byte():
+   jmp NterpPutByteStaticField
+
+%def op_sput_char():
+   jmp NterpPutCharStaticField
+
+%def op_sput_object():
+   jmp NterpPutObjectStaticField
+
+%def op_sput_short():
+   jmp NterpPutShortStaticField
+
+%def op_sput_wide():
+   jmp NterpPutWideStaticField
+
+%def op_new_instance():
+   // The routine is too big to fit in a handler, so jump to it.
+   jmp NterpNewInstance
diff --git a/runtime/interpreter/mterp/x86_64ng/other.S b/runtime/interpreter/mterp/x86_64ng/other.S
new file mode 100644
index 0000000..7d82c3b
--- /dev/null
+++ b/runtime/interpreter/mterp/x86_64ng/other.S
@@ -0,0 +1,273 @@
+%def unused():
+    int3
+
+%def op_const():
+    /* const vAA, #+BBBBbbbb */
+    movl    2(rPC), %eax                    # grab all 32 bits at once
+    SET_VREG %eax, rINSTq                   # vAA<- eax
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+
+%def op_const_16():
+    /* const/16 vAA, #+BBBB */
+    movswl  2(rPC), %ecx                    # ecx <- ssssBBBB
+    SET_VREG %ecx, rINSTq                   # vAA <- ssssBBBB
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_const_4():
+    /* const/4 vA, #+B */
+    movsbl  rINSTbl, %eax                   # eax <-ssssssBx
+    andl    MACRO_LITERAL(0xf), rINST       # rINST <- A
+    sarl    MACRO_LITERAL(4), %eax
+    SET_VREG %eax, rINSTq
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_const_high16():
+    /* const/high16 vAA, #+BBBB0000 */
+    movzwl  2(rPC), %eax                    # eax <- 0000BBBB
+    sall    MACRO_LITERAL(16), %eax         # eax <- BBBB0000
+    SET_VREG %eax, rINSTq                   # vAA <- eax
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_const_object(jumbo="0", helper="nterp_load_object"):
+   // Fast path that gets the object from the thread-local cache.
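+   // The interpreter cache is a direct-mapped (dex PC -> resolved object)
+   // table on the Thread. On a miss we branch to 2f, which calls the runtime
+   // helper and retries; if GC marking is active (the mark-reg00 entrypoint
+   // is non-null), the cached pointer must first be marked at 3f.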
+   FETCH_FROM_THREAD_CACHE %rax, 2f
+   cmpq MACRO_LITERAL(0), rSELF:THREAD_READ_BARRIER_MARK_REG00_OFFSET
+   jne 3f
+1:
+   SET_VREG_OBJECT %eax, rINSTq            # vAA <- value
+   .if $jumbo
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+   .else
+   ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+   .endif
+2:
+   EXPORT_PC
+   movq rSELF:THREAD_SELF_OFFSET, %rdi
+   movq 0(%rsp), %rsi
+   movq rPC, %rdx
+   call SYMBOL($helper)
+   jmp 1b
+3:
+   // 00 is %rax
+   call art_quick_read_barrier_mark_reg00
+   jmp 1b
+
+%def op_const_class():
+%  op_const_object(jumbo="0", helper="nterp_get_class_or_allocate_object")
+
+%def op_const_method_handle():
+%  op_const_object(jumbo="0")
+
+%def op_const_method_type():
+%  op_const_object(jumbo="0")
+
+%def op_const_string():
+   /* const/string vAA, String@BBBB */
+%  op_const_object(jumbo="0")
+
+%def op_const_string_jumbo():
+   /* const/string vAA, String@BBBBBBBB */
+%  op_const_object(jumbo="1")
+
+%def op_const_wide():
+    /* const-wide vAA, #+HHHHhhhhBBBBbbbb */
+    movq    2(rPC), %rax                    # rax <- HHHHhhhhBBBBbbbb
+    SET_WIDE_VREG %rax, rINSTq
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 5
+
+%def op_const_wide_16():
+    /* const-wide/16 vAA, #+BBBB */
+    movswq  2(rPC), %rax                    # rax <- ssssssssssssBBBB
+    SET_WIDE_VREG %rax, rINSTq              # store
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_const_wide_32():
+    /* const-wide/32 vAA, #+BBBBbbbb */
+    movslq   2(rPC), %rax                   # eax <- ssssssssBBBBbbbb
+    SET_WIDE_VREG %rax, rINSTq              # store
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+
+%def op_const_wide_high16():
+    /* const-wide/high16 vAA, #+BBBB000000000000 */
+    movzwq  2(rPC), %rax                    # rax <- 000000000000BBBB
+    salq    $$48, %rax                      # rax <- BBBB000000000000
+    SET_WIDE_VREG %rax, rINSTq              # v[AA] <- rax
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_monitor_enter():
+/*
+ * Synchronize on an object.
+ */
+    /* monitor-enter vAA */
+    EXPORT_PC
+    GET_VREG %edi, rINSTq
+    call art_quick_lock_object
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_monitor_exit():
+/*
+ * Unlock an object.
+ *
+ * Exceptions that occur when unlocking a monitor need to appear as
+ * if they happened at the following instruction.  See the Dalvik
+ * instruction spec.
+ */
+    /* monitor-exit vAA */
+    EXPORT_PC
+    GET_VREG %edi, rINSTq
+    call art_quick_unlock_object
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_move(is_object="0"):
+    /* for move, move-object, long-to-int */
+    /* op vA, vB */
+    movl    rINST, %eax                     # eax <- BA
+    andb    $$0xf, %al                      # eax <- A
+    shrl    $$4, rINST                      # rINST <- B
+    GET_VREG %edx, rINSTq
+    .if $is_object
+    SET_VREG_OBJECT %edx, %rax              # fp[A] <- fp[B]
+    .else
+    SET_VREG %edx, %rax                     # fp[A] <- fp[B]
+    .endif
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_move_16(is_object="0"):
+    /* for: move/16, move-object/16 */
+    /* op vAAAA, vBBBB */
+    movzwq  4(rPC), %rcx                    # ecx <- BBBB
+    movzwq  2(rPC), %rax                    # eax <- AAAA
+    GET_VREG %edx, %rcx
+    .if $is_object
+    SET_VREG_OBJECT %edx, %rax              # fp[A] <- fp[B]
+    .else
+    SET_VREG %edx, %rax                     # fp[A] <- fp[B]
+    .endif
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+
+%def op_move_exception():
+    /* move-exception vAA */
+    movl    rSELF:THREAD_EXCEPTION_OFFSET, %eax
+    SET_VREG_OBJECT %eax, rINSTq            # fp[AA] <- exception object
+    movl    $$0, rSELF:THREAD_EXCEPTION_OFFSET
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_move_from16(is_object="0"):
+    /* for: move/from16, move-object/from16 */
+    /* op vAA, vBBBB */
+    movzwq  2(rPC), %rax                    # eax <- BBBB
+    GET_VREG %edx, %rax                     # edx <- fp[BBBB]
+    .if $is_object
+    SET_VREG_OBJECT %edx, rINSTq            # fp[A] <- fp[B]
+    .else
+    SET_VREG %edx, rINSTq                   # fp[A] <- fp[B]
+    .endif
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_move_object():
+%  op_move(is_object="1")
+
+%def op_move_object_16():
+%  op_move_16(is_object="1")
+
+%def op_move_object_from16():
+%  op_move_from16(is_object="1")
+
+%def op_move_result(is_object="0"):
+    /* for: move-result, move-result-object */
+    /* op vAA */
+    .if $is_object
+    SET_VREG_OBJECT %eax, rINSTq            # fp[AA] <- result
+    .else
+    SET_VREG %eax, rINSTq                   # fp[AA] <- result
+    .endif
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_move_result_object():
+%  op_move_result(is_object="1")
+
+%def op_move_result_wide():
+    /* move-result-wide vAA */
+    SET_WIDE_VREG %rax, rINSTq                   # v[AA] <- rax
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_move_wide():
+    /* move-wide vA, vB */
+    /* NOTE: regs can overlap, e.g. "move v6,v7" or "move v7,v6" */
+    movl    rINST, %ecx                     # ecx <- BA
+    sarl    $$4, rINST                      # rINST <- B
+    andb    $$0xf, %cl                      # ecx <- A
+    GET_WIDE_VREG %rdx, rINSTq              # rdx <- v[B]
+    SET_WIDE_VREG %rdx, %rcx                # v[A] <- rdx
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_move_wide_16():
+    /* move-wide/16 vAAAA, vBBBB */
+    /* NOTE: regs can overlap, e.g. "move v6,v7" or "move v7,v6" */
+    movzwq  4(rPC), %rcx                    # ecx<- BBBB
+    movzwq  2(rPC), %rax                    # eax<- AAAA
+    GET_WIDE_VREG %rdx, %rcx                # rdx <- v[B]
+    SET_WIDE_VREG %rdx, %rax                # v[A] <- rdx
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 3
+
+%def op_move_wide_from16():
+    /* move-wide/from16 vAA, vBBBB */
+    /* NOTE: regs can overlap, e.g. "move v6,v7" or "move v7,v6" */
+    movzwl  2(rPC), %ecx                    # ecx <- BBBB
+    GET_WIDE_VREG %rdx, %rcx                # rdx <- v[B]
+    SET_WIDE_VREG %rdx, rINSTq              # v[A] <- rdx
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 2
+
+%def op_nop():
+    ADVANCE_PC_FETCH_AND_GOTO_NEXT 1
+
+%def op_unused_3e():
+%  unused()
+
+%def op_unused_3f():
+%  unused()
+
+%def op_unused_40():
+%  unused()
+
+%def op_unused_41():
+%  unused()
+
+%def op_unused_42():
+%  unused()
+
+%def op_unused_43():
+%  unused()
+
+%def op_unused_79():
+%  unused()
+
+%def op_unused_7a():
+%  unused()
+
+%def op_unused_f3():
+%  unused()
+
+%def op_unused_f4():
+%  unused()
+
+%def op_unused_f5():
+%  unused()
+
+%def op_unused_f6():
+%  unused()
+
+%def op_unused_f7():
+%  unused()
+
+%def op_unused_f8():
+%  unused()
+
+%def op_unused_f9():
+%  unused()
+
+%def op_unused_fc():
+%  unused()
+
+%def op_unused_fd():
+%  unused()
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index 6e89973..48a51f1 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -1498,6 +1498,9 @@
 }
 
 void Jit::EnqueueOptimizedCompilation(ArtMethod* method, Thread* self) {
+  if (thread_pool_ == nullptr) {
+    return;
+  }
   // We arrive here after baseline compiled code has reached its baseline
   // hotness threshold. If tiered compilation is enabled, enqueue a compilation
   // task that will produce an optimized version of the method.
@@ -1744,5 +1747,21 @@
   }
 }
 
+void Jit::EnqueueCompilationFromNterp(ArtMethod* method, Thread* self) {
+  if (thread_pool_ == nullptr) {
+    return;
+  }
+  if (GetCodeCache()->ContainsPc(method->GetEntryPointFromQuickCompiledCode())) {
+    // If we already have compiled code for it, nterp may be stuck in a loop.
+    // Compile OSR.
+    thread_pool_->AddTask(
+        self, new JitCompileTask(method, JitCompileTask::TaskKind::kCompileOsr));
+    return;
+  }
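+  // Baseline compiled code records inline caches in the method's
+  // ProfilingInfo, so try to allocate one now (no retry on failure).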
+  ProfilingInfo::Create(self, method, /* retry_allocation= */ false);
+  thread_pool_->AddTask(
+      self, new JitCompileTask(method, JitCompileTask::TaskKind::kCompileBaseline));
+}
+
 }  // namespace jit
 }  // namespace art
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index 8d5676b..e9fd915 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -26,6 +26,7 @@
 #include "base/timing_logger.h"
 #include "handle.h"
 #include "offsets.h"
+#include "interpreter/mterp/mterp.h"
 #include "jit/debugger_interface.h"
 #include "jit/profile_saver_options.h"
 #include "obj_ptr.h"
@@ -120,7 +121,9 @@
   }
 
   bool CanCompileBaseline() const {
-    return use_tiered_jit_compilation_ || use_baseline_compiler_;
+    return use_tiered_jit_compilation_ ||
+           use_baseline_compiler_ ||
+           interpreter::IsNterpSupported();
   }
 
   void SetUseJitCompilation(bool b) {
@@ -435,6 +438,9 @@
 
   void EnqueueOptimizedCompilation(ArtMethod* method, Thread* self);
 
+  void EnqueueCompilationFromNterp(ArtMethod* method, Thread* self)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
  private:
   Jit(JitCodeCache* code_cache, JitOptions* options);
 
diff --git a/runtime/nterp_helpers.cc b/runtime/nterp_helpers.cc
index df0eb73..9b265c2 100644
--- a/runtime/nterp_helpers.cc
+++ b/runtime/nterp_helpers.cc
@@ -92,7 +92,7 @@
   return (POPCOUNT(core_spills) + POPCOUNT(fp_spills)) * kPointerSize;
 }
 
-static size_t NterpGetFrameSize(ArtMethod* method) REQUIRES_SHARED(Locks::mutator_lock_) {
+size_t NterpGetFrameSize(ArtMethod* method) {
   CodeItemDataAccessor accessor(method->DexInstructionData());
   const uint16_t num_regs = accessor.RegistersSize();
   const uint16_t out_regs = accessor.OutsSize();
diff --git a/runtime/nterp_helpers.h b/runtime/nterp_helpers.h
index 758d1fd..7dbf92e 100644
--- a/runtime/nterp_helpers.h
+++ b/runtime/nterp_helpers.h
@@ -24,6 +24,12 @@
 class ArtMethod;
 
 /**
+ * Returns the frame size nterp will use for the given method.
+ */
+size_t NterpGetFrameSize(ArtMethod* method)
+    REQUIRES_SHARED(Locks::mutator_lock_);
+
+/**
  * Returns the QuickMethodFrameInfo of the given frame corresponding to the
  * given method.
  */
diff --git a/runtime/quick_exception_handler.cc b/runtime/quick_exception_handler.cc
index 0e04b7b..910b389 100644
--- a/runtime/quick_exception_handler.cc
+++ b/runtime/quick_exception_handler.cc
@@ -437,7 +437,11 @@
         updated_vregs = GetThread()->GetUpdatedVRegFlags(frame_id);
         DCHECK(updated_vregs != nullptr);
       }
-      HandleOptimizingDeoptimization(method, new_frame, updated_vregs);
+      if (GetCurrentOatQuickMethodHeader()->IsNterpMethodHeader()) {
+        HandleNterpDeoptimization(method, new_frame, updated_vregs);
+      } else {
+        HandleOptimizingDeoptimization(method, new_frame, updated_vregs);
+      }
       if (updated_vregs != nullptr) {
         // Calling Thread::RemoveDebuggerShadowFrameMapping will also delete the updated_vregs
         // array so this must come after we processed the frame.
@@ -467,6 +471,35 @@
   }
 
  private:
+  void HandleNterpDeoptimization(ArtMethod* m,
+                                 ShadowFrame* new_frame,
+                                 const bool* updated_vregs)
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    ArtMethod** cur_quick_frame = GetCurrentQuickFrame();
+    StackReference<mirror::Object>* vreg_ref_base =
+        reinterpret_cast<StackReference<mirror::Object>*>(NterpGetReferenceArray(cur_quick_frame));
+    int32_t* vreg_int_base =
+        reinterpret_cast<int32_t*>(NterpGetRegistersArray(cur_quick_frame));
+    CodeItemDataAccessor accessor(m->DexInstructionData());
+    const uint16_t num_regs = accessor.RegistersSize();
+    // An nterp frame has two arrays: a dex register array and a reference
+    // array that shadows it but holds only references (entries for
+    // non-reference dex registers are null). See nterp_helpers.cc.
+    for (size_t reg = 0; reg < num_regs; ++reg) {
+      if (updated_vregs != nullptr && updated_vregs[reg]) {
+        // Keep the value set by debugger.
+        continue;
+      }
+      StackReference<mirror::Object>* ref_addr = vreg_ref_base + reg;
+      mirror::Object* ref = ref_addr->AsMirrorPtr();
+      if (ref != nullptr) {
+        new_frame->SetVRegReference(reg, ref);
+      } else {
+        new_frame->SetVReg(reg, vreg_int_base[reg]);
+      }
+    }
+  }
+
   void HandleOptimizingDeoptimization(ArtMethod* m,
                                       ShadowFrame* new_frame,
                                       const bool* updated_vregs)
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 8861a09..99980c5 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -150,6 +150,7 @@
 #include "oat.h"
 #include "oat_file.h"
 #include "oat_file_manager.h"
+#include "oat_quick_method_header.h"
 #include "object_callbacks.h"
 #include "parsed_options.h"
 #include "quick/quick_method_frame_info.h"
@@ -850,13 +851,15 @@
 
   if (!IsImageDex2OatEnabled() || !GetHeap()->HasBootImageSpace()) {
     ScopedObjectAccess soa(self);
-    StackHandleScope<2> hs(soa.Self());
+    StackHandleScope<3> hs(soa.Self());
 
     ObjPtr<mirror::ObjectArray<mirror::Class>> class_roots = GetClassLinker()->GetClassRoots();
     auto class_class(hs.NewHandle<mirror::Class>(GetClassRoot<mirror::Class>(class_roots)));
+    auto string_class(hs.NewHandle<mirror::Class>(GetClassRoot<mirror::String>(class_roots)));
     auto field_class(hs.NewHandle<mirror::Class>(GetClassRoot<mirror::Field>(class_roots)));
 
     class_linker_->EnsureInitialized(soa.Self(), class_class, true, true);
+    class_linker_->EnsureInitialized(soa.Self(), string_class, true, true);
     self->AssertNoPendingException();
     // Field class is needed for register_java_net_InetAddress in libcore, b/28153851.
     class_linker_->EnsureInitialized(soa.Self(), field_class, true, true);
@@ -2727,6 +2730,11 @@
 }
 
 bool Runtime::IsAsyncDeoptimizeable(uintptr_t code) const {
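+  // Nterp frames can always be deoptimized asynchronously: every dex register
+  // lives in the frame (see HandleNterpDeoptimization), so the interpreter
+  // state can be reconstructed at any nterp PC.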
+  if (OatQuickMethodHeader::NterpMethodHeader != nullptr) {
+    if (OatQuickMethodHeader::NterpMethodHeader->Contains(code)) {
+      return true;
+    }
+  }
   // We only support async deopt (ie the compiled code is not explicitly asking for
   // deopt, but something else like the debugger) in debuggable JIT code.
   // We could look at the oat file where `code` is being defined,
diff --git a/runtime/runtime_options.def b/runtime/runtime_options.def
index c91da68..877a5a0 100644
--- a/runtime/runtime_options.def
+++ b/runtime/runtime_options.def
@@ -75,7 +75,7 @@
 RUNTIME_OPTIONS_KEY (bool,                UseTLAB,                        (kUseTlab || kUseReadBarrier))
 RUNTIME_OPTIONS_KEY (bool,                EnableHSpaceCompactForOOM,      true)
 RUNTIME_OPTIONS_KEY (bool,                UseJitCompilation,              true)
-RUNTIME_OPTIONS_KEY (bool,                UseTieredJitCompilation,        false)
+RUNTIME_OPTIONS_KEY (bool,                UseTieredJitCompilation,        interpreter::IsNterpSupported())
 RUNTIME_OPTIONS_KEY (bool,                DumpNativeStackOnSigQuit,       true)
 RUNTIME_OPTIONS_KEY (bool,                MadviseRandomAccess,            false)
 RUNTIME_OPTIONS_KEY (JniIdType,           OpaqueJniIds,                   JniIdType::kDefault)  // -Xopaque-jni-ids:{true, false, swapable}
diff --git a/runtime/stack.cc b/runtime/stack.cc
index 410e0fd..72690da 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -748,14 +748,13 @@
       // Frame sanity.
       size_t frame_size = GetCurrentQuickFrameInfo().FrameSizeInBytes();
       CHECK_NE(frame_size, 0u);
-      // A rough guess at an upper size we expect to see for a frame.
+      // For compiled code, we could use a rough upper bound on the expected
+      // frame size:
       // 256 registers
       // 2 words HandleScope overhead
       // 3+3 register spills
-      // TODO: this seems architecture specific for the case of JNI frames.
-      // TODO: 083-compiler-regressions ManyFloatArgs shows this estimate is wrong.
       // const size_t kMaxExpectedFrameSize = (256 + 2 + 3 + 3) * sizeof(word);
-      const size_t kMaxExpectedFrameSize = 2 * KB;
+      const size_t kMaxExpectedFrameSize = interpreter::kMaxNterpFrame;
       CHECK_LE(frame_size, kMaxExpectedFrameSize) << method->PrettyMethod();
       size_t return_pc_offset = GetCurrentQuickFrameInfo().GetReturnPcOffset();
       CHECK_LT(return_pc_offset, frame_size);
@@ -852,7 +851,6 @@
     cur_quick_frame_ = current_fragment->GetTopQuickFrame();
     cur_quick_frame_pc_ = 0;
     cur_oat_quick_method_header_ = nullptr;
-
     if (cur_quick_frame_ != nullptr) {  // Handle quick stack frames.
       // Can't be both a shadow and a quick fragment.
       DCHECK(current_fragment->GetTopShadowFrame() == nullptr);
diff --git a/test/566-polymorphic-inlining/src/Main.java b/test/566-polymorphic-inlining/src/Main.java
index 793b85f..e34d27a 100644
--- a/test/566-polymorphic-inlining/src/Main.java
+++ b/test/566-polymorphic-inlining/src/Main.java
@@ -47,7 +47,7 @@
     // Make testInvokeVirtual and testInvokeInterface hot to get them jitted.
     // We pass Main and Subclass to get polymorphic inlining based on calling
     // the same method.
-    for (int i = 0; i < 10000; ++i) {
+    for (int i = 0; i < 1000000; ++i) {
       testInvokeVirtual(mains[0]);
       testInvokeVirtual(mains[1]);
       testInvokeInterface(itfs[0]);
@@ -78,7 +78,7 @@
 
     // Run this once to make sure we execute the JITted code.
     $noinline$testInlineToSameTarget(mains[0]);
-    assertEquals(20001, counter);
+    assertEquals(2000001, counter);
   }
 
   public Class<?> sameInvokeVirtual() {
diff --git a/test/570-checker-osr/osr.cc b/test/570-checker-osr/osr.cc
index ee978c2..22423e2 100644
--- a/test/570-checker-osr/osr.cc
+++ b/test/570-checker-osr/osr.cc
@@ -90,7 +90,8 @@
         const OatQuickMethodHeader* header =
             Runtime::Current()->GetJit()->GetCodeCache()->LookupOsrMethodHeader(m);
         if ((header == nullptr || header != stack_visitor->GetCurrentOatQuickMethodHeader()) &&
-            stack_visitor->IsShadowFrame()) {
+            (stack_visitor->IsShadowFrame() ||
+             stack_visitor->GetCurrentOatQuickMethodHeader()->IsNterpMethodHeader())) {
           in_interpreter = true;
         }
       });
diff --git a/test/638-checker-inline-cache-intrinsic/src/Main.java b/test/638-checker-inline-cache-intrinsic/src/Main.java
index 1449f0a..5334487 100644
--- a/test/638-checker-inline-cache-intrinsic/src/Main.java
+++ b/test/638-checker-inline-cache-intrinsic/src/Main.java
@@ -64,10 +64,10 @@
 
   public static void test() {
     // Warm up inline cache.
-    for (int i = 0; i < 450; i++) {
+    for (int i = 0; i < 600000; i++) {
       $noinline$inlineMonomorphic(str);
     }
-    for (int i = 0; i < 600; i++) {
+    for (int i = 0; i < 600000; i++) {
       $noinline$stringEquals(str);
     }
     ensureJitCompiled(Main.class, "$noinline$stringEquals");
diff --git a/test/common/runtime_state.cc b/test/common/runtime_state.cc
index 4ca5fe8..22dbcce 100644
--- a/test/common/runtime_state.cc
+++ b/test/common/runtime_state.cc
@@ -179,7 +179,8 @@
   }
   const void* actual_code = method->GetEntryPointFromQuickCompiledCodePtrSize(kRuntimePointerSize);
   bool interpreter =
-      Runtime::Current()->GetClassLinker()->ShouldUseInterpreterEntrypoint(method, actual_code);
+      Runtime::Current()->GetClassLinker()->ShouldUseInterpreterEntrypoint(method, actual_code) ||
+      (actual_code == interpreter::GetNterpEntryPoint());
   return !interpreter;
 }
 
diff --git a/test/common/stack_inspect.cc b/test/common/stack_inspect.cc
index e8160b4..79c7a36 100644
--- a/test/common/stack_inspect.cc
+++ b/test/common/stack_inspect.cc
@@ -25,6 +25,7 @@
 #include "mirror/class-inl.h"
 #include "nth_caller_visitor.h"
 #include "oat_file.h"
+#include "oat_quick_method_header.h"
 #include "runtime.h"
 #include "scoped_thread_state_change-inl.h"
 #include "stack.h"
@@ -47,7 +48,10 @@
   NthCallerVisitor caller(soa.Self(), level, false);
   caller.WalkStack();
   CHECK(caller.caller != nullptr);
-  return caller.GetCurrentShadowFrame() != nullptr ? JNI_TRUE : JNI_FALSE;
+  bool is_shadow_frame = (caller.GetCurrentShadowFrame() != nullptr);
+  bool is_nterp_frame = (caller.GetCurrentQuickFrame() != nullptr) &&
+      (caller.GetCurrentOatQuickMethodHeader()->IsNterpMethodHeader());
+  return (is_shadow_frame || is_nterp_frame) ? JNI_TRUE : JNI_FALSE;
 }
 
 // public static native boolean isInterpreted();
diff --git a/tools/cpp-define-generator/thread.def b/tools/cpp-define-generator/thread.def
index 72cd2a9..4fee6df 100644
--- a/tools/cpp-define-generator/thread.def
+++ b/tools/cpp-define-generator/thread.def
@@ -39,6 +39,8 @@
            art::Thread::InterpreterCacheSizeLog2())
 ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_MASK,
            (sizeof(art::InterpreterCache::Entry) * (art::InterpreterCache::kSize - 1)))
+ASM_DEFINE(THREAD_INTERPRETER_CACHE_SIZE_SHIFT,
+           2)
 ASM_DEFINE(THREAD_IS_GC_MARKING_OFFSET,
            art::Thread::IsGcMarkingOffset<art::kRuntimePointerSize>().Int32Value())
 ASM_DEFINE(THREAD_LOCAL_ALLOC_STACK_END_OFFSET,