Add an arm32 port of nterp.

Currently using arm32; will try thumb once this CL lands to compare
performance.

Test: test.py, run-libcore-tests, device boots
Bug: 112676029

Change-Id: I6535e2982a3ceed83eba6664fc8ba8609974bc08
diff --git a/runtime/Android.bp b/runtime/Android.bp
index 393c2cb..9792442 100644
--- a/runtime/Android.bp
+++ b/runtime/Android.bp
@@ -282,8 +282,9 @@
         arm: {
             srcs: [
                 "interpreter/mterp/mterp.cc",
-                "interpreter/mterp/nterp_stub.cc",
+                "interpreter/mterp/nterp.cc",
                 ":libart_mterp.arm",
+                ":libart_mterp.armng",
                 "arch/arm/context_arm.cc",
                 "arch/arm/entrypoints_init_arm.cc",
                 "arch/arm/instruction_set_features_assembly_tests.S",
@@ -843,3 +844,16 @@
     ],
     cmd: "$(location interpreter/mterp/gen_mterp.py) $(out) $(in)",
 }
+
+genrule {
+    name: "libart_mterp.armng",
+    out: ["mterp_armng.S"],
+    srcs: [
+        "interpreter/mterp/armng/*.S",
+    ],
+    tool_files: [
+        "interpreter/mterp/gen_mterp.py",
+        "interpreter/mterp/common/gen_setup.py",
+    ],
+    cmd: "$(location interpreter/mterp/gen_mterp.py) $(out) $(in)",
+}
diff --git a/runtime/arch/arm/asm_support_arm.S b/runtime/arch/arm/asm_support_arm.S
index 7ffdf18..dd48d1d 100644
--- a/runtime/arch/arm/asm_support_arm.S
+++ b/runtime/arch/arm/asm_support_arm.S
@@ -36,6 +36,7 @@
 
 .syntax unified
 .arch armv7-a
+.arch_extension idiv
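+// The idiv extension lets the assembler accept the sdiv/udiv integer divide
+// instructions, which the nterp div/rem handlers use when
+// __ARM_ARCH_EXT_IDIV__ is defined.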
 .thumb
 
 .macro CFI_EXPRESSION_BREG n, b, offset
@@ -48,6 +49,22 @@
     .endif
 .endm
 
+.macro CFI_DEF_CFA_BREG_PLUS_UCONST reg, offset, size
+    .if ((\size) < 0)
+        .error "Size should be positive"
+    .endif
+    .if (((\offset) < -0x40) || ((\offset) >= 0x40))
+        .error "Unsupported offset"
+    .endif
+    .if ((\size) < 0x80)
+        CFI_DEF_CFA_BREG_PLUS_UCONST_1_1(\reg, \offset, \size)
+    .elseif ((\size) < 0x4000)
+        CFI_DEF_CFA_BREG_PLUS_UCONST_1_2(\reg, \offset, \size)
+    .else
+        .error "Unsupported size"
+    .endif
+.endm
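+
+// Illustrative use (hypothetical operands): set the CFA to r7 - 4 plus a
+// 64-byte frame size, which fits the one-byte (1_1) size encoding:
+//     CFI_DEF_CFA_BREG_PLUS_UCONST r7, -4, 64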
+
 // Macro to generate the value of Runtime::Current into rDest. As it uses labels
 // then the labels need to be unique. We bind these to the function name in the ENTRY macros.
 .macro RUNTIME_CURRENT name, num, rDest
@@ -290,4 +307,73 @@
     DELIVER_PENDING_EXCEPTION_FRAME_READY
 .endm
 
+.macro  RETURN_OR_DELIVER_PENDING_EXCEPTION_REG reg
+    ldr \reg, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ Get exception field.
+    cbnz \reg, 1f
+    bx lr
+1:
+    DELIVER_PENDING_EXCEPTION
+.endm
+
+.macro  RETURN_OR_DELIVER_PENDING_EXCEPTION_R1
+    RETURN_OR_DELIVER_PENDING_EXCEPTION_REG r1
+.endm
+
+.macro  RETURN_OR_DELIVER_PENDING_EXCEPTION
+    ldr ip, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ Get exception field.
+    cmp ip, #0
+    bne 1f
+    bx lr
+1:
+    DELIVER_PENDING_EXCEPTION
+.endm
+
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveRefsOnly).
+     */
+.macro SETUP_SAVE_REFS_ONLY_FRAME rTemp
+    // Note: We could avoid saving R8 in the case of Baker read
+    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
+    push {r5-r8, r10-r11, lr}                     @ 7 words of callee saves
+    .cfi_adjust_cfa_offset 28
+    .cfi_rel_offset r5, 0
+    .cfi_rel_offset r6, 4
+    .cfi_rel_offset r7, 8
+    .cfi_rel_offset r8, 12
+    .cfi_rel_offset r10, 16
+    .cfi_rel_offset r11, 20
+    .cfi_rel_offset lr, 24
+    sub sp, #4                                    @ bottom word will hold Method*
+    .cfi_adjust_cfa_offset 4
+    RUNTIME_CURRENT2 \rTemp                       @ Load Runtime::Current into rTemp.
+    @ Load kSaveRefsOnly Method* into rTemp.
+    ldr \rTemp, [\rTemp, #RUNTIME_SAVE_REFS_ONLY_METHOD_OFFSET]
+    str \rTemp, [sp, #0]                          @ Place Method* at bottom of stack.
+    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
+
+    // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_SAVE_REFS_ONLY != 28 + 4)
+#error "FRAME_SIZE_SAVE_REFS_ONLY(ARM) size not as expected."
+#endif
+.endm
+
+.macro RESTORE_SAVE_REFS_ONLY_FRAME
+    add sp, #4               @ bottom word holds Method*
+    .cfi_adjust_cfa_offset -4
+    // Note: Likewise, we could avoid restoring R8 in the case of Baker
+    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
+    // later; but it's not worth handling this special case.
+    pop {r5-r8, r10-r11, lr} @ 7 words of callee saves
+    .cfi_restore r5
+    .cfi_restore r6
+    .cfi_restore r7
+    .cfi_restore r8
+    .cfi_restore r10
+    .cfi_restore r11
+    .cfi_restore lr
+    .cfi_adjust_cfa_offset -28
+.endm
+
 #endif  // ART_RUNTIME_ARCH_ARM_ASM_SUPPORT_ARM_S_
diff --git a/runtime/arch/arm/context_arm.h b/runtime/arch/arm/context_arm.h
index 845cdaa..006939c 100644
--- a/runtime/arch/arm/context_arm.h
+++ b/runtime/arch/arm/context_arm.h
@@ -46,6 +46,10 @@
     SetGPR(PC, new_pc);
   }
 
+  void SetNterpDexPC(uintptr_t dex_pc_ptr) override {
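+    // nterp keeps the interpreter's dex PC in R11 on arm, so restoring a
+    // context must write the new dex PC there.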
+    SetGPR(R11, dex_pc_ptr);
+  }
+
   void SetArg0(uintptr_t new_arg0_value) override {
     SetGPR(R0, new_arg0_value);
   }
diff --git a/runtime/arch/arm/fault_handler_arm.cc b/runtime/arch/arm/fault_handler_arm.cc
index 4e7d64c..7bd402f 100644
--- a/runtime/arch/arm/fault_handler_arm.cc
+++ b/runtime/arch/arm/fault_handler_arm.cc
@@ -107,14 +107,17 @@
   struct ucontext *uc = reinterpret_cast<struct ucontext*>(context);
   struct sigcontext *sc = reinterpret_cast<struct sigcontext*>(&uc->uc_mcontext);
   uint8_t* ptr = reinterpret_cast<uint8_t*>(sc->arm_pc);
-  uint32_t instr_size = GetInstructionSize(ptr);
-  uintptr_t gc_map_location = (sc->arm_pc + instr_size) | 1;
+  bool in_thumb_mode = sc->arm_cpsr & (1 << 5);
+  uint32_t instr_size = in_thumb_mode ? GetInstructionSize(ptr) : 4;
+  uintptr_t gc_map_location = (sc->arm_pc + instr_size) | (in_thumb_mode ? 1 : 0);
 
   // Push the gc map location to the stack and pass the fault address in LR.
   sc->arm_sp -= sizeof(uintptr_t);
   *reinterpret_cast<uintptr_t*>(sc->arm_sp) = gc_map_location;
   sc->arm_lr = reinterpret_cast<uintptr_t>(info->si_addr);
   sc->arm_pc = reinterpret_cast<uintptr_t>(art_quick_throw_null_pointer_exception_from_signal);
+  // Make sure the thumb bit is set as the handler is in thumb mode.
+  sc->arm_cpsr = sc->arm_cpsr | (1 << 5);
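+  // Bit 5 of the CPSR is the Thumb (T) execution-state bit; with it set, the
+  // kernel resumes at arm_pc in thumb state on sigreturn.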
   // Pass the faulting address as the first argument of
   // art_quick_throw_null_pointer_exception_from_signal.
   VLOG(signals) << "Generating null pointer exception";
@@ -231,6 +234,9 @@
   // the function to which this handler returns (art_quick_throw_stack_overflow).
   sc->arm_pc = reinterpret_cast<uintptr_t>(art_quick_throw_stack_overflow);
 
+  // Make sure the thumb bit is set as the handler is in thumb mode.
+  sc->arm_cpsr = sc->arm_cpsr | (1 << 5);
+
   // The kernel will now return to the address in sc->arm_pc.
   return true;
 }
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index fae2b97..964f2ae 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -24,54 +24,6 @@
     /* Deliver an exception pending on a thread */
     .extern artDeliverPendingException
 
-    /*
-     * Macro that sets up the callee save frame to conform with
-     * Runtime::CreateCalleeSaveMethod(kSaveRefsOnly).
-     */
-.macro SETUP_SAVE_REFS_ONLY_FRAME rTemp
-    // Note: We could avoid saving R8 in the case of Baker read
-    // barriers, as it is overwritten by REFRESH_MARKING_REGISTER
-    // later; but it's not worth handling this special case.
-    push {r5-r8, r10-r11, lr}                     @ 7 words of callee saves
-    .cfi_adjust_cfa_offset 28
-    .cfi_rel_offset r5, 0
-    .cfi_rel_offset r6, 4
-    .cfi_rel_offset r7, 8
-    .cfi_rel_offset r8, 12
-    .cfi_rel_offset r10, 16
-    .cfi_rel_offset r11, 20
-    .cfi_rel_offset lr, 24
-    sub sp, #4                                    @ bottom word will hold Method*
-    .cfi_adjust_cfa_offset 4
-    RUNTIME_CURRENT2 \rTemp                       @ Load Runtime::Current into rTemp.
-    @ Load kSaveRefsOnly Method* into rTemp.
-    ldr \rTemp, [\rTemp, #RUNTIME_SAVE_REFS_ONLY_METHOD_OFFSET]
-    str \rTemp, [sp, #0]                          @ Place Method* at bottom of stack.
-    str sp, [rSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
-
-    // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_REFS_ONLY != 28 + 4)
-#error "FRAME_SIZE_SAVE_REFS_ONLY(ARM) size not as expected."
-#endif
-.endm
-
-.macro RESTORE_SAVE_REFS_ONLY_FRAME
-    add sp, #4               @ bottom word holds Method*
-    .cfi_adjust_cfa_offset -4
-    // Note: Likewise, we could avoid restoring R8 in the case of Baker
-    // read barriers, as it is overwritten by REFRESH_MARKING_REGISTER
-    // later; but it's not worth handling this special case.
-    pop {r5-r8, r10-r11, lr} @ 7 words of callee saves
-    .cfi_restore r5
-    .cfi_restore r6
-    .cfi_restore r7
-    .cfi_restore r8
-    .cfi_restore r10
-    .cfi_restore r11
-    .cfi_restore lr
-    .cfi_adjust_cfa_offset -28
-.endm
-
 .macro SETUP_SAVE_REFS_AND_ARGS_FRAME rTemp
     SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
     RUNTIME_CURRENT3 \rTemp                       @ Load Runtime::Current into rTemp.
@@ -230,18 +182,6 @@
 END \c_name
 .endm
 
-.macro  RETURN_OR_DELIVER_PENDING_EXCEPTION_REG reg
-    ldr \reg, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ Get exception field.
-    cbnz \reg, 1f
-    bx lr
-1:
-    DELIVER_PENDING_EXCEPTION
-.endm
-
-.macro  RETURN_OR_DELIVER_PENDING_EXCEPTION_R1
-    RETURN_OR_DELIVER_PENDING_EXCEPTION_REG r1
-.endm
-
 .macro RETURN_IF_RESULT_IS_ZERO_OR_DELIVER
     RETURN_IF_RESULT_IS_ZERO
     DELIVER_PENDING_EXCEPTION
@@ -1316,10 +1256,6 @@
 .endm
 
 .macro COMPUTE_ARRAY_SIZE_UNKNOWN slow_path
-    bkpt                                                    // We should never enter here.
-                                                            // Code below is for reference.
-                                                            // Possibly a large object, go slow.
-                                                            // Also does negative array size check.
     movw r2, #((MIN_LARGE_OBJECT_THRESHOLD - MIRROR_WIDE_ARRAY_DATA_OFFSET) / 8)
     cmp r1, r2
     bhi \slow_path
@@ -1387,8 +1323,6 @@
     add    r2, r2, #(MIRROR_WIDE_ARRAY_DATA_OFFSET + OBJECT_ALIGNMENT_MASK)
 .endm
 
-// TODO(ngeoffray): art_quick_alloc_array_resolved_region_tlab is not used for arm, remove
-// the entrypoint once all backends have been updated to use the size variants.
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_UNKNOWN
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved8_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_8
 GENERATE_ALLOC_ARRAY_TLAB art_quick_alloc_array_resolved16_region_tlab, artAllocArrayFromCodeResolvedRegionTLAB, COMPUTE_ARRAY_SIZE_16
diff --git a/runtime/interpreter/mterp/armng/arithmetic.S b/runtime/interpreter/mterp/armng/arithmetic.S
new file mode 100644
index 0000000..1cec598
--- /dev/null
+++ b/runtime/interpreter/mterp/armng/arithmetic.S
@@ -0,0 +1,986 @@
+%def binop(preinstr="", result="r0", chkzero="0", instr=""):
+    /*
+     * Generic 32-bit binary operation.  Provide an "instr" line that
+     * specifies an instruction that performs "result = r0 op r1".
+     * This could be an ARM instruction or a function call.  (If the result
+     * comes back in a register other than r0, you can override "result".)
+     *
+     * If "chkzero" is set to 1, we perform a divide-by-zero check on
+     * vCC (r1).  Useful for integer division and modulus.  Note that we
+     * *don't* check for (INT_MIN / -1) here, because the ARM math lib
+     * handles it correctly.
+     *
+     * For: add-int, sub-int, mul-int, div-int, rem-int, and-int, or-int,
+     *      xor-int, shl-int, shr-int, ushr-int, add-float, sub-float,
+     *      mul-float, div-float, rem-float
+     */
+    /* binop vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    mov     r3, r0, lsr #8              @ r3<- CC
+    and     r2, r0, #255                @ r2<- BB
+    GET_VREG r1, r3                     @ r1<- vCC
+    GET_VREG r0, r2                     @ r0<- vBB
+    .if $chkzero
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+    .endif
+
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    $preinstr                           @ optional op; may set condition codes
+    $instr                              @ $result<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG $result, r4                @ vAA<- $result
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 11-14 instructions */
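+
+    /*
+     * Illustrative expansion (a sketch, not actual generator output): for
+     * add-int, instr is "add r0, r0, r1" and the template reduces to
+     *     GET_VREG r1, r3             @ r1<- vCC
+     *     GET_VREG r0, r2             @ r0<- vBB
+     *     add     r0, r0, r1
+     *     SET_VREG r0, r4             @ vAA<- r0
+     * around the standard fetch/advance/dispatch sequence.
+     */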
+
+%def binop2addr(preinstr="", result="r0", chkzero="0", instr=""):
+    /*
+     * Generic 32-bit "/2addr" binary operation.  Provide an "instr" line
+     * that specifies an instruction that performs "result = r0 op r1".
+     * This could be an ARM instruction or a function call.  (If the result
+     * comes back in a register other than r0, you can override "result".)
+     *
+     * If "chkzero" is set to 1, we perform a divide-by-zero check on
+     * vCC (r1).  Useful for integer division and modulus.
+     *
+     * For: add-int/2addr, sub-int/2addr, mul-int/2addr, div-int/2addr,
+     *      rem-int/2addr, and-int/2addr, or-int/2addr, xor-int/2addr,
+     *      shl-int/2addr, shr-int/2addr, ushr-int/2addr, add-float/2addr,
+     *      sub-float/2addr, mul-float/2addr, div-float/2addr, rem-float/2addr
+     */
+    /* binop/2addr vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r1, r3                     @ r1<- vB
+    GET_VREG r0, r4                     @ r0<- vA
+    .if $chkzero
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+    .endif
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+
+    $preinstr                           @ optional op; may set condition codes
+    $instr                              @ $result<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG $result, r4                @ vAA<- $result
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-13 instructions */
+
+%def binopLit16(result="r0", chkzero="0", instr=""):
+    /*
+     * Generic 32-bit "lit16" binary operation.  Provide an "instr" line
+     * that specifies an instruction that performs "result = r0 op r1".
+     * This could be an ARM instruction or a function call.  (If the result
+     * comes back in a register other than r0, you can override "result".)
+     *
+     * If "chkzero" is set to 1, we perform a divide-by-zero check on
+     * vCC (r1).  Useful for integer division and modulus.
+     *
+     * For: add-int/lit16, rsub-int, mul-int/lit16, div-int/lit16,
+     *      rem-int/lit16, and-int/lit16, or-int/lit16, xor-int/lit16
+     */
+    /* binop/lit16 vA, vB, #+CCCC */
+    FETCH_S r1, 1                       @ r1<- ssssCCCC (sign-extended)
+    mov     r2, rINST, lsr #12          @ r2<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r0, r2                     @ r0<- vB
+    .if $chkzero
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+    .endif
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+
+    $instr                              @ $result<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG $result, r4                @ vAA<- $result
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-13 instructions */
+
+%def binopLit8(extract="asr     r1, r3, #8", result="r0", chkzero="0", instr=""):
+    /*
+     * Generic 32-bit "lit8" binary operation.  Provide an "instr" line
+     * that specifies an instruction that performs "result = r0 op r1".
+     * This could be an ARM instruction or a function call.  (If the result
+     * comes back in a register other than r0, you can override "result".)
+     *
+     * You can override "extract" if the extraction of the literal value
+     * from r3 to r1 is not the default "asr r1, r3, #8". The extraction
+     * can be omitted completely if the shift is embedded in "instr".
+     *
+     * If "chkzero" is set to 1, we perform a divide-by-zero check on
+     * vCC (r1).  Useful for integer division and modulus.
+     *
+     * For: add-int/lit8, rsub-int/lit8, mul-int/lit8, div-int/lit8,
+     *      rem-int/lit8, and-int/lit8, or-int/lit8, xor-int/lit8,
+     *      shl-int/lit8, shr-int/lit8, ushr-int/lit8
+     */
+    /* binop/lit8 vAA, vBB, #+CC */
+    FETCH_S r3, 1                       @ r3<- ssssCCBB (sign-extended for CC)
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r2, r3, #255                @ r2<- BB
+    GET_VREG r0, r2                     @ r0<- vBB
+    $extract                            @ optional; typically r1<- ssssssCC (sign extended)
+    .if $chkzero
+    @cmp     r1, #0                     @ is second operand zero?
+    beq     common_errDivideByZero
+    .endif
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+
+    $instr                              @ $result<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG $result, r4                @ vAA<- $result
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-12 instructions */
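+
+    /*
+     * Example (illustrative): op_add_int_lit8 below passes extract="" and
+     * folds the literal shift into the ALU op itself:
+     *     add     r0, r0, r3, asr #8  @ r0<- vBB + ssssssCC
+     */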
+
+%def binopWide(preinstr="", result0="r0", result1="r1", chkzero="0", instr=""):
+    /*
+     * Generic 64-bit binary operation.  Provide an "instr" line that
+     * specifies an instruction that performs "result = r0-r1 op r2-r3".
+     * This could be an ARM instruction or a function call.  (If the result
+     * comes back in a register other than r0, you can override "result".)
+     *
+     * If "chkzero" is set to 1, we perform a divide-by-zero check on
+     * vCC (r1).  Useful for integer division and modulus.
+     *
+     * for: add-long, sub-long, div-long, rem-long, and-long, or-long,
+     *      xor-long, add-double, sub-double, mul-double, div-double,
+     *      rem-double
+     *
+     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
+     */
+    /* binop vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     rINST, rINST, lsr #8        @ rINST<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR r4, rINST        @ r4<- &fp[AA]
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &fp[BB]
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r2    @ r0/r1<- vBB/vBB+1
+    GET_VREG_WIDE_BY_ADDR r2, r3, r3    @ r2/r3<- vCC/vCC+1
+    .if $chkzero
+    orrs    ip, r2, r3                  @ second arg (r2-r3) is zero?
+    beq     common_errDivideByZero
+    .endif
+    CLEAR_SHADOW_PAIR rINST, lr, ip     @ Zero out the shadow regs
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    $preinstr                           @ optional op; may set condition codes
+    $instr                              @ result<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR $result0,$result1,r4  @ vAA/vAA+1<- $result0/$result1
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 14-17 instructions */
+
+%def binopWide2addr(preinstr="", result0="r0", result1="r1", chkzero="0", instr=""):
+    /*
+     * Generic 64-bit "/2addr" binary operation.  Provide an "instr" line
+     * that specifies an instruction that performs "result = r0-r1 op r2-r3".
+     * This could be an ARM instruction or a function call.  (If the result
+     * comes back in a register other than r0, you can override "result".)
+     *
+     * If "chkzero" is set to 1, we perform a divide-by-zero check on
+     * vCC (r1).  Useful for integer division and modulus.
+     *
+     * For: add-long/2addr, sub-long/2addr, div-long/2addr, rem-long/2addr,
+     *      and-long/2addr, or-long/2addr, xor-long/2addr, add-double/2addr,
+     *      sub-double/2addr, mul-double/2addr, div-double/2addr,
+     *      rem-double/2addr
+     */
+    /* binop/2addr vA, vB */
+    mov     r1, rINST, lsr #12          @ r1<- B
+    ubfx    rINST, rINST, #8, #4        @ rINST<- A
+    VREG_INDEX_TO_ADDR r1, r1           @ r1<- &fp[B]
+    VREG_INDEX_TO_ADDR r4, rINST        @ r4<- &fp[A]
+    GET_VREG_WIDE_BY_ADDR r2, r3, r1    @ r2/r3<- vBB/vBB+1
+    GET_VREG_WIDE_BY_ADDR r0, r1, r4    @ r0/r1<- vAA/vAA+1
+    .if $chkzero
+    orrs    ip, r2, r3                  @ second arg (r2-r3) is zero?
+    beq     common_errDivideByZero
+    .endif
+    CLEAR_SHADOW_PAIR rINST, ip, lr     @ Zero shadow regs
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    $preinstr                           @ optional op; may set condition codes
+    $instr                              @ result<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR $result0,$result1,r4  @ vAA/vAA+1<- $result0/$result1
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 12-15 instructions */
+
+%def unop(preinstr="", instr=""):
+    /*
+     * Generic 32-bit unary operation.  Provide an "instr" line that
+     * specifies an instruction that performs "result = op r0".
+     * This could be an ARM instruction or a function call.
+     *
+     * for: neg-int, not-int, neg-float, int-to-float, float-to-int,
+     *      int-to-byte, int-to-char, int-to-short
+     */
+    /* unop vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r0, r3                     @ r0<- vB
+    $preinstr                           @ optional op; may set condition codes
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    $instr                              @ r0<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 8-9 instructions */
+
+%def unopNarrower(preinstr="", instr=""):
+    /*
+     * Generic 64bit-to-32bit unary operation.  Provide an "instr" line
+     * that specifies an instruction that performs "result = op r0/r1", where
+     * "result" is a 32-bit quantity in r0.
+     *
+     * For: long-to-float
+     *
+     * (This would work for long-to-int, but that instruction is actually
+     * an exact match for op_move.)
+     */
+    /* unop vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[B]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r3    @ r0/r1<- vB/vB+1
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    $preinstr                           @ optional op; may set condition codes
+    $instr                              @ r0<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r0, r4                     @ vA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 9-10 instructions */
+
+%def unopWide(preinstr="", instr=""):
+    /*
+     * Generic 64-bit unary operation.  Provide an "instr" line that
+     * specifies an instruction that performs "result = op r0/r1".
+     * This could be an ARM instruction or a function call.
+     *
+     * For: neg-long, not-long, neg-double, long-to-double, double-to-long
+     */
+    /* unop vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    rINST, rINST, #8, #4        @ rINST<- A
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[B]
+    VREG_INDEX_TO_ADDR r4, rINST        @ r4<- &fp[A]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r3    @ r0/r1<- vB/vB+1
+    CLEAR_SHADOW_PAIR rINST, ip, lr     @ Zero shadow regs
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    $preinstr                           @ optional op; may set condition codes
+    $instr                              @ r0/r1<- op, r2-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-11 instructions */
+
+%def unopWider(preinstr="", instr=""):
+    /*
+     * Generic 32bit-to-64bit unary operation.  Provide an "instr" line
+     * that specifies an instruction that performs "result = op r0", where
+     * "result" is a 64-bit quantity in r0/r1.
+     *
+     * For: int-to-long, int-to-double, float-to-long, float-to-double
+     */
+    /* unop vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    rINST, rINST, #8, #4        @ rINST<- A
+    GET_VREG r0, r3                     @ r0<- vB
+    VREG_INDEX_TO_ADDR r4, rINST        @ r4<- &fp[A]
+    $preinstr                           @ optional op; may set condition codes
+    CLEAR_SHADOW_PAIR rINST, ip, lr     @ Zero shadow regs
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    $instr                              @ r0<- op, r0-r3 changed
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vA/vA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 9-10 instructions */
+
+%def op_add_int():
+%  binop(instr="add     r0, r0, r1")
+
+%def op_add_int_2addr():
+%  binop2addr(instr="add     r0, r0, r1")
+
+%def op_add_int_lit16():
+%  binopLit16(instr="add     r0, r0, r1")
+
+%def op_add_int_lit8():
+%  binopLit8(extract="", instr="add     r0, r0, r3, asr #8")
+
+%def op_add_long():
+%  binopWide(preinstr="adds    r0, r0, r2", instr="adc     r1, r1, r3")
+
+%def op_add_long_2addr():
+%  binopWide2addr(preinstr="adds    r0, r0, r2", instr="adc     r1, r1, r3")
+
+%def op_and_int():
+%  binop(instr="and     r0, r0, r1")
+
+%def op_and_int_2addr():
+%  binop2addr(instr="and     r0, r0, r1")
+
+%def op_and_int_lit16():
+%  binopLit16(instr="and     r0, r0, r1")
+
+%def op_and_int_lit8():
+%  binopLit8(extract="", instr="and     r0, r0, r3, asr #8")
+
+%def op_and_long():
+%  binopWide(preinstr="and     r0, r0, r2", instr="and     r1, r1, r3")
+
+%def op_and_long_2addr():
+%  binopWide2addr(preinstr="and     r0, r0, r2", instr="and     r1, r1, r3")
+
+%def op_cmp_long():
+    /*
+     * Compare two 64-bit values.  Puts 0, 1, or -1 into the destination
+     * register based on the results of the comparison.
+     */
+    /* cmp-long vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &fp[BB]
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r2    @ r0/r1<- vBB/vBB+1
+    GET_VREG_WIDE_BY_ADDR r2, r3, r3    @ r2/r3<- vCC/vCC+1
+    cmp     r0, r2
+    sbcs    ip, r1, r3                  @ Sets correct CCs for checking LT (but not EQ/NE)
+    mov     r3, #-1
+    it      ge
+    movge   r3, #1
+    it      eq
+    cmpeq   r0, r2
+    it      eq
+    moveq   r3, #0
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    SET_VREG r3, r4                     @ vAA<- r3
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
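+
+    /*
+     * Worked example (illustrative): vBB = 0x0000000100000000 and
+     * vCC = 0x00000000ffffffff. "cmp r0, r2" borrows, "sbcs ip, r1, r3"
+     * yields 1 - 0 - 1 = 0 with no borrow, so GE holds and r3 becomes 1,
+     * i.e. vBB > vCC. The cmpeq re-check then leaves r3 alone because the
+     * low words differ.
+     */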
+
+%def op_div_int():
+    /*
+     * Specialized 32-bit binary operation
+     *
+     * Performs "r0 = r0 div r1". The selection between sdiv or the gcc helper
+     * depends on the compile time value of __ARM_ARCH_EXT_IDIV__ (defined for
+     * ARMv7 CPUs that have hardware division support).
+     *
+     * div-int
+     *
+     */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    mov     r3, r0, lsr #8              @ r3<- CC
+    and     r2, r0, #255                @ r2<- BB
+    GET_VREG r1, r3                     @ r1<- vCC
+    GET_VREG r0, r2                     @ r0<- vBB
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+#ifdef __ARM_ARCH_EXT_IDIV__
+    sdiv    r0, r0, r1                  @ r0<- op
+#else
+    bl    __aeabi_idiv                  @ r0<- op, r0-r3 changed
+#endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 11-14 instructions */
+
+%def op_div_int_2addr():
+    /*
+     * Specialized 32-bit binary operation
+     *
+     * Performs "r0 = r0 div r1". The selection between sdiv or the gcc helper
+     * depends on the compile time value of __ARM_ARCH_EXT_IDIV__ (defined for
+     * ARMv7 CPUs that have hardware division support).
+     *
+     * div-int/2addr
+     *
+     */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r1, r3                     @ r1<- vB
+    GET_VREG r0, r4                     @ r0<- vA
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+
+#ifdef __ARM_ARCH_EXT_IDIV__
+    sdiv    r0, r0, r1                  @ r0<- op
+#else
+    bl       __aeabi_idiv               @ r0<- op, r0-r3 changed
+#endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-13 instructions */
+
+
+%def op_div_int_lit16():
+    /*
+     * Specialized 32-bit binary operation
+     *
+     * Performs "r0 = r0 div r1". The selection between sdiv or the gcc helper
+     * depends on the compile time value of __ARM_ARCH_EXT_IDIV__ (defined for
+     * ARMv7 CPUs that have hardware division support).
+     *
+     * div-int/lit16
+     *
+     */
+    FETCH_S r1, 1                       @ r1<- ssssCCCC (sign-extended)
+    mov     r2, rINST, lsr #12          @ r2<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r0, r2                     @ r0<- vB
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+
+#ifdef __ARM_ARCH_EXT_IDIV__
+    sdiv    r0, r0, r1                  @ r0<- op
+#else
+    bl       __aeabi_idiv               @ r0<- op, r0-r3 changed
+#endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-13 instructions */
+
+%def op_div_int_lit8():
+    /*
+     * Specialized 32-bit binary operation
+     *
+     * Performs "r0 = r0 div r1". The selection between sdiv or the gcc helper
+     * depends on the compile time value of __ARM_ARCH_EXT_IDIV__ (defined for
+     * ARMv7 CPUs that have hardware division support).
+     *
+     * div-int/lit8
+     *
+     */
+    FETCH_S r3, 1                       @ r3<- ssssCCBB (sign-extended for CC)
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r2, r3, #255                @ r2<- BB
+    GET_VREG r0, r2                     @ r0<- vBB
+    movs    r1, r3, asr #8              @ r1<- ssssssCC (sign extended)
+    @cmp     r1, #0                     @ is second operand zero?
+    beq     common_errDivideByZero
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+
+#ifdef __ARM_ARCH_EXT_IDIV__
+    sdiv    r0, r0, r1                  @ r0<- op
+#else
+    bl   __aeabi_idiv                   @ r0<- op, r0-r3 changed
+#endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-12 instructions */
+
+%def op_div_long():
+%  binopWide(instr="bl      __aeabi_ldivmod", chkzero="1")
+
+%def op_div_long_2addr():
+%  binopWide2addr(instr="bl      __aeabi_ldivmod", chkzero="1")
+
+%def op_int_to_byte():
+%  unop(instr="sxtb    r0, r0")
+
+%def op_int_to_char():
+%  unop(instr="uxth    r0, r0")
+
+%def op_int_to_long():
+%  unopWider(instr="mov     r1, r0, asr #31")
+
+%def op_int_to_short():
+%  unop(instr="sxth    r0, r0")
+
+%def op_long_to_int():
+/* we ignore the high word, making this equivalent to a 32-bit reg move */
+%  op_move()
+
+/*
+ * We use "mul r0, r1, r0" instead of "mul r0, r0, r1". The latter was illegal
+ * in old architecture versions. Also, for T32, this operand order allows using
+ * a 16-bit instruction (encoding T1) while the other order would require a
+ * 32-bit instruction (encoding T2).
+ */
+
+%def op_mul_int():
+%  binop(instr="mul     r0, r1, r0")
+
+%def op_mul_int_2addr():
+%  binop2addr(instr="mul     r0, r1, r0")
+
+%def op_mul_int_lit16():
+%  binopLit16(instr="mul     r0, r1, r0")
+
+%def op_mul_int_lit8():
+%  binopLit8(instr="mul     r0, r1, r0")
+
+%def op_mul_long():
+    /*
+     * Signed 64-bit integer multiply.
+     *
+     * Consider WXxYZ (r1r0 x r3r2) with a long multiply:
+     *        WX
+     *      x YZ
+     *  --------
+     *     ZW ZX
+     *  YW YX
+     *
+     * The low word of the result holds ZX, the high word holds
+     * (ZW+YX) + (the high overflow from ZX).  YW doesn't matter because
+     * it doesn't fit in the low 64 bits.
+     *
+     * Unlike most ARM math operations, multiply instructions have
+     * restrictions on using the same register more than once (Rd and Rn
+     * cannot be the same).
+     */
+    /* mul-long vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &fp[BB]
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r2    @ r0/r1<- vBB/vBB+1
+    GET_VREG_WIDE_BY_ADDR r2, r3, r3    @ r2/r3<- vCC/vCC+1
+    mul     ip, r0, r3                  @ ip<- YxX
+    umull   r0, lr, r2, r0              @ r0/lr <- ZxX RdLo == Rn - this is OK.
+    mla     r3, r1, r2, ip              @ r3<- YxX + (ZxW)
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    add     r1, r3, lr                  @ r1<- lr + low(ZxW + (YxX))
+    CLEAR_SHADOW_PAIR r4, lr, ip        @ Zero out the shadow regs
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[AA]
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA/vAA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_mul_long_2addr():
+    /*
+     * Signed 64-bit integer multiply, "/2addr" version.
+     *
+     * See op_mul_long for an explanation.
+     *
+     * We get a little tight on registers, so to avoid looking up &fp[A]
+     * again we stuff it into rINST.
+     */
+    /* mul-long/2addr vA, vB */
+    mov     r1, rINST, lsr #12          @ r1<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    VREG_INDEX_TO_ADDR r1, r1           @ r1<- &fp[B]
+    VREG_INDEX_TO_ADDR rINST, r4        @ rINST<- &fp[A]
+    GET_VREG_WIDE_BY_ADDR r2, r3, r1    @ r2/r3<- vBB/vBB+1
+    GET_VREG_WIDE_BY_ADDR r0, r1, rINST @ r0/r1<- vAA/vAA+1
+    mul     ip, r0, r3                  @ ip<- YxX
+    umull   r0, lr, r2, r0              @ r0/lr <- ZxX RdLo == Rn - this is OK.
+    mla     r3, r1, r2, ip              @ r3<- YxX + (ZxW)
+    mov     r4, rINST                   @ Save vAA before FETCH_ADVANCE_INST
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    add     r1, r3, lr                  @ r1<- lr + low(ZxW + (YxX))
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA/vAA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_neg_int():
+%  unop(instr="rsb     r0, r0, #0")
+
+%def op_neg_long():
+%  unopWide(preinstr="rsbs    r0, r0, #0", instr="rsc     r1, r1, #0")
+
+%def op_not_int():
+%  unop(instr="mvn     r0, r0")
+
+%def op_not_long():
+%  unopWide(preinstr="mvn     r0, r0", instr="mvn     r1, r1")
+
+%def op_or_int():
+%  binop(instr="orr     r0, r0, r1")
+
+%def op_or_int_2addr():
+%  binop2addr(instr="orr     r0, r0, r1")
+
+%def op_or_int_lit16():
+%  binopLit16(instr="orr     r0, r0, r1")
+
+%def op_or_int_lit8():
+%  binopLit8(extract="", instr="orr     r0, r0, r3, asr #8")
+
+%def op_or_long():
+%  binopWide(preinstr="orr     r0, r0, r2", instr="orr     r1, r1, r3")
+
+%def op_or_long_2addr():
+%  binopWide2addr(preinstr="orr     r0, r0, r2", instr="orr     r1, r1, r3")
+
+%def op_rem_int():
+    /*
+     * Specialized 32-bit binary operation
+     *
+     * Performs "r1 = r0 rem r1". The selection between sdiv block or the gcc helper
+     * depends on the compile time value of __ARM_ARCH_EXT_IDIV__ (defined for
+     * ARMv7 CPUs that have hardware division support).
+     *
+     * NOTE: idivmod returns quotient in r0 and remainder in r1
+     *
+     * rem-int
+     *
+     */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    mov     r3, r0, lsr #8              @ r3<- CC
+    and     r2, r0, #255                @ r2<- BB
+    GET_VREG r1, r3                     @ r1<- vCC
+    GET_VREG r0, r2                     @ r0<- vBB
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+#ifdef __ARM_ARCH_EXT_IDIV__
+    sdiv    r2, r0, r1
+    mls     r1, r1, r2, r0                 @ r1<- op, r0-r2 changed
+#else
+    bl      __aeabi_idivmod                @ r1<- op, r0-r3 changed
+#endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r1, r4                     @ vAA<- r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 11-14 instructions */
+
+%def op_rem_int_2addr():
+    /*
+     * Specialized 32-bit binary operation
+     *
+     * Performs "r1 = r0 rem r1". The selection between sdiv block or the gcc helper
+     * depends on the compile time value of __ARM_ARCH_EXT_IDIV__ (defined for
+     * ARMv7 CPUs that have hardware division support).
+     *
+     * NOTE: idivmod returns quotient in r0 and remainder in r1
+     *
+     * rem-int/2addr
+     *
+     */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r1, r3                     @ r1<- vB
+    GET_VREG r0, r4                     @ r0<- vA
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+
+#ifdef __ARM_ARCH_EXT_IDIV__
+    sdiv    r2, r0, r1
+    mls     r1, r1, r2, r0              @ r1<- op
+#else
+    bl      __aeabi_idivmod             @ r1<- op, r0-r3 changed
+#endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r1, r4                     @ vAA<- r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-13 instructions */
+
+
+%def op_rem_int_lit16():
+    /*
+     * Specialized 32-bit binary operation
+     *
+     * Performs "r1 = r0 rem r1". The selection between sdiv block or the gcc helper
+     * depends on the compile time value of __ARM_ARCH_EXT_IDIV__ (defined for
+     * ARMv7 CPUs that have hardware division support).
+     *
+     * NOTE: idivmod returns quotient in r0 and remainder in r1
+     *
+     * rem-int/lit16
+     *
+     */
+    FETCH_S r1, 1                       @ r1<- ssssCCCC (sign-extended)
+    mov     r2, rINST, lsr #12          @ r2<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r0, r2                     @ r0<- vB
+    cmp     r1, #0                      @ is second operand zero?
+    beq     common_errDivideByZero
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+
+#ifdef __ARM_ARCH_EXT_IDIV__
+    sdiv    r2, r0, r1
+    mls     r1, r1, r2, r0              @ r1<- op
+#else
+    bl     __aeabi_idivmod              @ r1<- op, r0-r3 changed
+#endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r1, r4                     @ vAA<- r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-13 instructions */
+
+%def op_rem_int_lit8():
+    /*
+     * Specialized 32-bit binary operation
+     *
+     * Performs "r1 = r0 rem r1". The selection between sdiv block or the gcc helper
+     * depends on the compile time value of __ARM_ARCH_EXT_IDIV__ (defined for
+     * ARMv7 CPUs that have hardware division support).
+     *
+     * NOTE: idivmod returns quotient in r0 and remainder in r1
+     *
+     * rem-int/lit8
+     *
+     */
+    FETCH_S r3, 1                       @ r3<- ssssCCBB (sign-extended for CC)
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r2, r3, #255                @ r2<- BB
+    GET_VREG r0, r2                     @ r0<- vBB
+    movs    r1, r3, asr #8              @ r1<- ssssssCC (sign extended)
+    @cmp     r1, #0                     @ is second operand zero?
+    beq     common_errDivideByZero
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+
+#ifdef __ARM_ARCH_EXT_IDIV__
+    sdiv    r2, r0, r1
+    mls     r1, r1, r2, r0              @ r1<- op
+#else
+    bl       __aeabi_idivmod            @ r1<- op, r0-r3 changed
+#endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r1, r4                     @ vAA<- r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+    /* 10-12 instructions */
+
+%def op_rem_long():
+/* ldivmod returns quotient in r0/r1 and remainder in r2/r3 */
+%  binopWide(instr="bl      __aeabi_ldivmod", result0="r2", result1="r3", chkzero="1")
+
+%def op_rem_long_2addr():
+/* ldivmod returns quotient in r0/r1 and remainder in r2/r3 */
+%  binopWide2addr(instr="bl      __aeabi_ldivmod", result0="r2", result1="r3", chkzero="1")
+
+%def op_rsub_int():
+/* this op is "rsub-int", but can be thought of as "rsub-int/lit16" */
+%  binopLit16(instr="rsb     r0, r0, r1")
+
+%def op_rsub_int_lit8():
+%  binopLit8(extract="", instr="rsb     r0, r0, r3, asr #8")
+
+%def op_shl_int():
+%  binop(preinstr="and     r1, r1, #31", instr="mov     r0, r0, lsl r1")
+
+%def op_shl_int_2addr():
+%  binop2addr(preinstr="and     r1, r1, #31", instr="mov     r0, r0, lsl r1")
+
+%def op_shl_int_lit8():
+%  binopLit8(extract="ubfx    r1, r3, #8, #5", instr="mov     r0, r0, lsl r1")
+
+%def op_shl_long():
+    /*
+     * Long integer shift.  This is different from the generic 32/64-bit
+     * binary operations because vAA/vBB are 64-bit but vCC (the shift
+     * distance) is 32-bit.  Also, Dalvik requires us to mask off the low
+     * 6 bits of the shift distance.
+     */
+    /* shl-long vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r3, r0, #255                @ r3<- BB
+    mov     r0, r0, lsr #8              @ r0<- CC
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[BB]
+    GET_VREG r2, r0                     @ r2<- vCC
+    GET_VREG_WIDE_BY_ADDR r0, r1, r3    @ r0/r1<- vBB/vBB+1
+    CLEAR_SHADOW_PAIR r4, lr, ip        @ Zero out the shadow regs
+    and     r2, r2, #63                 @ r2<- r2 & 0x3f
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[AA]
+    mov     r1, r1, asl r2              @ r1<- r1 << r2
+    rsb     r3, r2, #32                 @ r3<- 32 - r2
+    orr     r1, r1, r0, lsr r3          @ r1<- r1 | (r0 >> (32-r2))
+    subs    ip, r2, #32                 @ ip<- r2 - 32
+    it      pl
+    movpl   r1, r0, asl ip              @ if r2 >= 32, r1<- r0 << (r2-32)
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    mov     r0, r0, asl r2              @ r0<- r0 << r2
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA/vAA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
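+
+    /*
+     * Example (illustrative): for a shift distance of 40, the register-
+     * specified shifts by r2 (40) and by r3 (32 - 40, i.e. 248 as a shift
+     * byte) both produce 0, while the "subs/movpl" pair sets r1 = r0 << 8
+     * and r0 ends up 0: exactly the value shifted left by 40.
+     */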
+
+%def op_shl_long_2addr():
+    /*
+     * Long integer shift, 2addr version.  vA is 64-bit value/result, vB is
+     * 32-bit shift distance.
+     */
+    /* shl-long/2addr vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r2, r3                     @ r2<- vB
+    CLEAR_SHADOW_PAIR r4, lr, ip        @ Zero out the shadow regs
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[A]
+    and     r2, r2, #63                 @ r2<- r2 & 0x3f
+    GET_VREG_WIDE_BY_ADDR r0, r1, r4    @ r0/r1<- vAA/vAA+1
+    mov     r1, r1, asl r2              @ r1<- r1 << r2
+    rsb     r3, r2, #32                 @ r3<- 32 - r2
+    orr     r1, r1, r0, lsr r3          @ r1<- r1 | (r0 >> (32-r2))
+    subs    ip, r2, #32                 @ ip<- r2 - 32
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    it      pl
+    movpl   r1, r0, asl ip              @ if r2 >= 32, r1<- r0 << (r2-32)
+    mov     r0, r0, asl r2              @ r0<- r0 << r2
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA/vAA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_shr_int():
+%  binop(preinstr="and     r1, r1, #31", instr="mov     r0, r0, asr r1")
+
+%def op_shr_int_2addr():
+%  binop2addr(preinstr="and     r1, r1, #31", instr="mov     r0, r0, asr r1")
+
+%def op_shr_int_lit8():
+%  binopLit8(extract="ubfx    r1, r3, #8, #5", instr="mov     r0, r0, asr r1")
+
+%def op_shr_long():
+    /*
+     * Long integer shift.  This is different from the generic 32/64-bit
+     * binary operations because vAA/vBB are 64-bit but vCC (the shift
+     * distance) is 32-bit.  Also, Dalvik requires us to mask off the low
+     * 6 bits of the shift distance.
+     */
+    /* shr-long vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r3, r0, #255                @ r3<- BB
+    mov     r0, r0, lsr #8              @ r0<- CC
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[BB]
+    GET_VREG r2, r0                     @ r2<- vCC
+    GET_VREG_WIDE_BY_ADDR r0, r1, r3    @ r0/r1<- vBB/vBB+1
+    CLEAR_SHADOW_PAIR r4, lr, ip        @ Zero out the shadow regs
+    and     r2, r2, #63                 @ r2<- r2 & 0x3f
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[AA]
+    mov     r0, r0, lsr r2              @ r0<- r0 >> r2
+    rsb     r3, r2, #32                 @ r3<- 32 - r2
+    orr     r0, r0, r1, lsl r3          @ r0<- r0 | (r1 << (32-r2))
+    subs    ip, r2, #32                 @ ip<- r2 - 32
+    it      pl
+    movpl   r0, r1, asr ip              @ if r2 >= 32, r0<-r1 >> (r2-32)
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    mov     r1, r1, asr r2              @ r1<- r1 >> r2
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA/vAA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_shr_long_2addr():
+    /*
+     * Long integer shift, 2addr version.  vA is 64-bit value/result, vB is
+     * 32-bit shift distance.
+     */
+    /* shr-long/2addr vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r2, r3                     @ r2<- vB
+    CLEAR_SHADOW_PAIR r4, lr, ip        @ Zero out the shadow regs
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[A]
+    and     r2, r2, #63                 @ r2<- r2 & 0x3f
+    GET_VREG_WIDE_BY_ADDR r0, r1, r4    @ r0/r1<- vAA/vAA+1
+    mov     r0, r0, lsr r2              @ r0<- r0 >> r2
+    rsb     r3, r2, #32                 @ r3<- 32 - r2
+    orr     r0, r0, r1, lsl r3          @ r0<- r0 | (r1 << (32-r2))
+    subs    ip, r2, #32                 @ ip<- r2 - 32
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    it      pl
+    movpl   r0, r1, asr ip              @ if r2 >= 32, r0<-r1 >> (r2-32)
+    mov     r1, r1, asr r2              @ r1<- r1 >> r2
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA/vAA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_sub_int():
+%  binop(instr="sub     r0, r0, r1")
+
+%def op_sub_int_2addr():
+%  binop2addr(instr="sub     r0, r0, r1")
+
+%def op_sub_long():
+%  binopWide(preinstr="subs    r0, r0, r2", instr="sbc     r1, r1, r3")
+
+%def op_sub_long_2addr():
+%  binopWide2addr(preinstr="subs    r0, r0, r2", instr="sbc     r1, r1, r3")
+
+%def op_ushr_int():
+%  binop(preinstr="and     r1, r1, #31", instr="mov     r0, r0, lsr r1")
+
+%def op_ushr_int_2addr():
+%  binop2addr(preinstr="and     r1, r1, #31", instr="mov     r0, r0, lsr r1")
+
+%def op_ushr_int_lit8():
+%  binopLit8(extract="ubfx    r1, r3, #8, #5", instr="mov     r0, r0, lsr r1")
+
+%def op_ushr_long():
+    /*
+     * Long integer shift.  This is different from the generic 32/64-bit
+     * binary operations because vAA/vBB are 64-bit but vCC (the shift
+     * distance) is 32-bit.  Also, Dalvik requires us to mask off the low
+     * 6 bits of the shift distance.
+     */
+    /* ushr-long vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r3, r0, #255                @ r3<- BB
+    mov     r0, r0, lsr #8              @ r0<- CC
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[BB]
+    GET_VREG r2, r0                     @ r2<- vCC
+    GET_VREG_WIDE_BY_ADDR r0, r1, r3    @ r0/r1<- vBB/vBB+1
+    CLEAR_SHADOW_PAIR r4, lr, ip        @ Zero out the shadow regs
+    and     r2, r2, #63                 @ r2<- r2 & 0x3f
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[AA]
+    mov     r0, r0, lsr r2              @ r0<- r0 >> r2
+    rsb     r3, r2, #32                 @ r3<- 32 - r2
+    orr     r0, r0, r1, lsl r3          @ r0<- r0 | (r1 << (32-r2))
+    subs    ip, r2, #32                 @ ip<- r2 - 32
+    it      pl
+    movpl   r0, r1, lsr ip              @ if r2 >= 32, r0<-r1 >>> (r2-32)
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    mov     r1, r1, lsr r2              @ r1<- r1 >>> r2
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA/vAA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_ushr_long_2addr():
+    /*
+     * Long integer shift, 2addr version.  vA is 64-bit value/result, vB is
+     * 32-bit shift distance.
+     */
+    /* ushr-long/2addr vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    GET_VREG r2, r3                     @ r2<- vB
+    CLEAR_SHADOW_PAIR r4, lr, ip        @ Zero out the shadow regs
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[A]
+    and     r2, r2, #63                 @ r2<- r2 & 0x3f
+    GET_VREG_WIDE_BY_ADDR r0, r1, r4    @ r0/r1<- vAA/vAA+1
+    mov     r0, r0, lsr r2              @ r0<- r0 >> r2
+    rsb     r3, r2, #32                 @ r3<- 32 - r2
+    orr     r0, r0, r1, lsl r3          @ r0<- r0 | (r1 << (32-r2))
+    subs    ip, r2, #32                 @ ip<- r2 - 32
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    it      pl
+    movpl   r0, r1, lsr ip              @ if r2 >= 32, r0<-r1 >>> (r2-32)
+    mov     r1, r1, lsr r2              @ r1<- r1 >>> r2
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA/vAA+1<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_xor_int():
+%  binop(instr="eor     r0, r0, r1")
+
+%def op_xor_int_2addr():
+%  binop2addr(instr="eor     r0, r0, r1")
+
+%def op_xor_int_lit16():
+%  binopLit16(instr="eor     r0, r0, r1")
+
+%def op_xor_int_lit8():
+%  binopLit8(extract="", instr="eor     r0, r0, r3, asr #8")
+
+%def op_xor_long():
+%  binopWide(preinstr="eor     r0, r0, r2", instr="eor     r1, r1, r3")
+
+%def op_xor_long_2addr():
+%  binopWide2addr(preinstr="eor     r0, r0, r2", instr="eor     r1, r1, r3")
diff --git a/runtime/interpreter/mterp/armng/array.S b/runtime/interpreter/mterp/armng/array.S
new file mode 100644
index 0000000..93f11c6
--- /dev/null
+++ b/runtime/interpreter/mterp/armng/array.S
@@ -0,0 +1,171 @@
+%def op_aget(load="ldr", shift="2", data_offset="MIRROR_INT_ARRAY_DATA_OFFSET", wide="0", is_object="0"):
+/*
+ * Array get.  vAA <- vBB[vCC].
+ *
+ * for: aget, aget-boolean, aget-byte, aget-char, aget-short, aget-wide, aget-object
+ *
+ */
+    FETCH_B r2, 1, 0                    @ r2<- BB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    FETCH_B r3, 1, 1                    @ r3<- CC
+    GET_VREG r0, r2                     @ r0<- vBB (array object)
+    GET_VREG r1, r3                     @ r1<- vCC (requested index)
+    cmp     r0, #0                      @ null array object?
+    beq     common_errNullObject        @ yes, bail
+    ldr     r3, [r0, #MIRROR_ARRAY_LENGTH_OFFSET]    @ r3<- arrayObj->length
+    add     r0, r0, r1, lsl #$shift     @ r0<- arrayObj + index*width
+    cmp     r1, r3                      @ compare unsigned index, length
+    bcs     common_errArrayIndex        @ index >= length, bail
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    .if $wide
+    CLEAR_SHADOW_PAIR r4, lr, ip        @ Zero out the shadow regs
+    ldrd    r2, [r0, #$data_offset]     @ r2/r3<- vBB[vCC]
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[AA]
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r2, r3, r4    @ vAA/vAA+1<- r2/r3
+    GOTO_OPCODE ip                      @ jump to next instruction
+    .elseif $is_object
+    $load   r2, [r0, #$data_offset]     @ r2<- vBB[vCC]
+    cmp rMR, #0
+    bne 2f
+1:
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_OBJECT r2, r4              @ vAA<- r2
+    GOTO_OPCODE ip                      @ jump to next instruction
+2:
+    bl art_quick_read_barrier_mark_reg02
+    b 1b
+    .else
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    $load   r2, [r0, #$data_offset]     @ r2<- vBB[vCC]
+    SET_VREG r2, r4                     @ vAA<- r2
+    GOTO_OPCODE ip                      @ jump to next instruction
+    .endif
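+
+    /*
+     * Note: the is_object path tests rMR (the marking register); when
+     * concurrent marking is active it routes the loaded reference in r2
+     * through the Baker read barrier stub before storing it to vAA.
+     */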
+
+%def op_aget_boolean():
+%  op_aget(load="ldrb", shift="0", data_offset="MIRROR_BOOLEAN_ARRAY_DATA_OFFSET", wide="0", is_object="0")
+
+%def op_aget_byte():
+%  op_aget(load="ldrsb", shift="0", data_offset="MIRROR_BYTE_ARRAY_DATA_OFFSET", wide="0", is_object="0")
+
+%def op_aget_char():
+%  op_aget(load="ldrh", shift="1", data_offset="MIRROR_CHAR_ARRAY_DATA_OFFSET", wide="0", is_object="0")
+
+%def op_aget_object():
+%  op_aget(load="ldr", shift="2", data_offset="MIRROR_OBJECT_ARRAY_DATA_OFFSET", wide="0", is_object="1")
+
+%def op_aget_short():
+%  op_aget(load="ldrsh", shift="1", data_offset="MIRROR_SHORT_ARRAY_DATA_OFFSET", wide="0", is_object="0")
+
+%def op_aget_wide():
+%  op_aget(load="ldrd", shift="3", data_offset="MIRROR_WIDE_ARRAY_DATA_OFFSET", wide="1", is_object="0")
+
+%def op_aput(store="str", shift="2", data_offset="MIRROR_INT_ARRAY_DATA_OFFSET", wide="0", is_object="0"):
+/*
+ * Array put.  vBB[vCC] <- vAA.
+ *
+ * for: aput, aput-boolean, aput-byte, aput-char, aput-short, aput-wide, aput-object
+ *
+ */
+    FETCH_B r2, 1, 0                    @ r2<- BB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    FETCH_B r3, 1, 1                    @ r3<- CC
+    GET_VREG r0, r2                     @ r0<- vBB (array object)
+    GET_VREG r1, r3                     @ r1<- vCC (requested index)
+    cmp     r0, #0                      @ null array object?
+    beq     common_errNullObject        @ yes, bail
+    ldr     r3, [r0, #MIRROR_ARRAY_LENGTH_OFFSET]     @ r3<- arrayObj->length
+    cmp     r1, r3                      @ compare unsigned index, length
+    bcs     common_errArrayIndex        @ index >= length, bail
+    .if $is_object
+    EXPORT_PC                           // Export the PC; art_quick_aput_obj can throw.
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    GET_VREG r2, r4                     @ r2<- vAA
+    bl art_quick_aput_obj
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    .elseif $wide
+    add     r0, r0, r1, lsl #$shift     @ r0<- arrayObj + index*width
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[AA]
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    GET_VREG_WIDE_BY_ADDR r2, r3, r4    @ r2/r3<- vAA/vAA+1
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    strd    r2, [r0, #$data_offset]     @ vBB[vCC]<- r2/r3
+    .else
+    add     r0, r0, r1, lsl #$shift     @ r0<- arrayObj + index*width
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    GET_VREG r2, r4                     @ r2<- vAA
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    $store  r2, [r0, #$data_offset]     @ vBB[vCC]<- r2
+    .endif
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_aput_boolean():
+%  op_aput(store="strb", shift="0", data_offset="MIRROR_BOOLEAN_ARRAY_DATA_OFFSET", wide="0", is_object="0")
+
+%def op_aput_byte():
+%  op_aput(store="strb", shift="0", data_offset="MIRROR_BYTE_ARRAY_DATA_OFFSET", wide="0", is_object="0")
+
+%def op_aput_char():
+%  op_aput(store="strh", shift="1", data_offset="MIRROR_CHAR_ARRAY_DATA_OFFSET", wide="0", is_object="0")
+
+%def op_aput_short():
+%  op_aput(store="strh", shift="1", data_offset="MIRROR_SHORT_ARRAY_DATA_OFFSET", wide="0", is_object="0")
+
+%def op_aput_wide():
+%  op_aput(store="str", shift="3", data_offset="MIRROR_WIDE_ARRAY_DATA_OFFSET", wide="1", is_object="0")
+
+%def op_aput_object():
+%  op_aput(store="str", shift="2", data_offset="MIRROR_OBJECT_ARRAY_DATA_OFFSET", wide="0", is_object="1")
+
+%def op_array_length():
+    /*
+     * Return the length of an array.
+     */
+    mov     r1, rINST, lsr #12          @ r1<- B
+    ubfx    r2, rINST, #8, #4           @ r2<- A
+    GET_VREG r0, r1                     @ r0<- vB (object ref)
+    cmp     r0, #0                      @ is object null?
+    beq     common_errNullObject        @ yup, fail
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    ldr     r3, [r0, #MIRROR_ARRAY_LENGTH_OFFSET]    @ r3<- array length
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r3, r2                     @ vA<- length
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_fill_array_data():
+    /* fill-array-data vAA, +BBBBBBBB */
+    EXPORT_PC
+    FETCH r0, 1                         @ r0<- bbbb (lo)
+    FETCH r1, 2                         @ r1<- BBBB (hi)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
+    GET_VREG r1, r3                     @ r1<- vAA (array object)
+    add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2 (array data off.)
+    bl      art_quick_handle_fill_data  @ (payload, object)
+    FETCH_ADVANCE_INST 3                @ advance rPC, load rINST
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_filled_new_array(helper="nterp_filled_new_array"):
+/*
+ * Create a new array with elements filled from registers.
+ *
+ * for: filled-new-array, filled-new-array/range
+ */
+    /* op vB, {vD, vE, vF, vG, vA}, class@CCCC */
+    /* op {vCCCC..v(CCCC+AA-1)}, type@BBBB */
+    EXPORT_PC
+    mov     r0, rSELF
+    ldr     r1, [sp]
+    mov     r2, rFP
+    mov     r3, rPC
+    bl      $helper
+    FETCH_ADVANCE_INST 3                // advance rPC, load rINST
+    GET_INST_OPCODE ip                  // extract opcode from rINST
+    GOTO_OPCODE ip                      // jump to next instruction
+
+%def op_filled_new_array_range():
+%  op_filled_new_array(helper="nterp_filled_new_array_range")
+
+%def op_new_array():
+  b NterpNewArray
diff --git a/runtime/interpreter/mterp/armng/control_flow.S b/runtime/interpreter/mterp/armng/control_flow.S
new file mode 100644
index 0000000..3d564e7
--- /dev/null
+++ b/runtime/interpreter/mterp/armng/control_flow.S
@@ -0,0 +1,192 @@
+%def bincmp(condition=""):
+    /*
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
+     *
+     * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
+     */
+    /* if-cmp vA, vB, +CCCC */
+    mov     r1, rINST, lsr #12          @ r1<- B
+    ubfx    r0, rINST, #8, #4           @ r0<- A
+    GET_VREG r3, r1                     @ r3<- vB
+    GET_VREG r0, r0                     @ r0<- vA
+    FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
+    cmp     r0, r3                      @ compare (vA, vB)
+    b${condition} 1f
+    FETCH_ADVANCE_INST 2
+    GET_INST_OPCODE ip                  // extract opcode from rINST
+    GOTO_OPCODE ip                      // jump to next instruction
+1:
+    FETCH_S rINST, 1                    // rINST<- branch offset, in code units
+    BRANCH
+
+%def zcmp(condition=""):
+    /*
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform.
+     *
+     * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
+     */
+    /* if-cmp vAA, +BBBB */
+    mov     r0, rINST, lsr #8           @ r0<- AA
+    GET_VREG r0, r0                     @ r0<- vAA
+    FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
+    cmp     r0, #0                      // compare (vAA, 0)
+    b${condition} 1f
+    FETCH_ADVANCE_INST 2
+    GET_INST_OPCODE ip                  // extract opcode from rINST
+    GOTO_OPCODE ip                      // jump to next instruction
+1:
+    FETCH_S rINST, 1                    // rINST<- branch offset, in code units
+    BRANCH
+
+%def op_goto():
+/*
+ * Unconditional branch, 8-bit offset.
+ *
+ * The branch distance is a signed code-unit offset, which we need to
+ * double to get a byte offset.
+ */
+    /* goto +AA */
+    sbfx    rINST, rINST, #8, #8           // rINST<- ssssssAA (sign-extended)
+    BRANCH
+
+%def op_goto_16():
+/*
+ * Unconditional branch, 16-bit offset.
+ *
+ * The branch distance is a signed code-unit offset, which we need to
+ * double to get a byte offset.
+ */
+    /* goto/16 +AAAA */
+    FETCH_S rINST, 1                    // rINST<- ssssAAAA (sign-extended)
+    BRANCH
+
+%def op_goto_32():
+/*
+ * Unconditional branch, 32-bit offset.
+ *
+ * The branch distance is a signed code-unit offset, which we need to
+ * double to get a byte offset.
+ *
+ * The offset is reassembled from two code units with orrs; BRANCH then
+ * doubles it to get the byte offset.  For example, offset 0x00012345
+ * arrives as aaaa=0x2345 and AAAA=0x0001.
+ */
+    /* goto/32 +AAAAAAAA */
+    FETCH r0, 1                         // r0<- aaaa (lo)
+    FETCH r1, 2                         // r1<- AAAA (hi)
+    orrs     rINST, r0, r1, lsl #16      // rINST<- AAAAaaaa
+    BRANCH
+
+%def op_if_eq():
+%  bincmp(condition="eq")
+
+%def op_if_eqz():
+%  zcmp(condition="eq")
+
+%def op_if_ge():
+%  bincmp(condition="ge")
+
+%def op_if_gez():
+%  zcmp(condition="ge")
+
+%def op_if_gt():
+%  bincmp(condition="gt")
+
+%def op_if_gtz():
+%  zcmp(condition="gt")
+
+%def op_if_le():
+%  bincmp(condition="le")
+
+%def op_if_lez():
+%  zcmp(condition="le")
+
+%def op_if_lt():
+%  bincmp(condition="lt")
+
+%def op_if_ltz():
+%  zcmp(condition="lt")
+
+%def op_if_ne():
+%  bincmp(condition="ne")
+
+%def op_if_nez():
+%  zcmp(condition="ne")
+
+%def op_packed_switch(func="NterpDoPackedSwitch"):
+/*
+ * Handle a packed-switch or sparse-switch instruction.  In both cases
+ * we decode it and hand it off to a helper function.
+ *
+ * We don't really expect backward branches in a switch statement, but
+ * they're perfectly legal, so we check for them here.
+ *
+ * for: packed-switch, sparse-switch
+ */
+    /* op vAA, +BBBB */
+    FETCH r0, 1                         @ r0<- bbbb (lo)
+    FETCH r1, 2                         @ r1<- BBBB (hi)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
+    GET_VREG r1, r3                     @ r1<- vAA
+    add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
+    bl      $func                       @ r0<- code-unit branch offset
+    mov     rINST, r0
+    BRANCH
+
+%def op_sparse_switch():
+%  op_packed_switch(func="NterpDoSparseSwitch")
+
+/*
+ * Return from method: for return, return-object, return-wide,
+ * return-void and return-void-no-barrier.
+ */
+%def op_return(is_object="0", is_void="0", is_wide="0", is_no_barrier="0"):
+    .if $is_void
+      .if !$is_no_barrier
+      // Thread fence for constructor
+      dmb ishst
+      .endif
+    .else
+      mov     r2, rINST, lsr #8           @ r2<- AA
+      .if $is_wide
+        VREG_INDEX_TO_ADDR r2, r2
+        GET_VREG_WIDE_BY_ADDR r0, r1, r2 // r0,r1 <- vAA
+        // In case we're going back to compiled code, put the
+        // result also in d0.
+        vmov d0, r0, r1
+      .else
+        GET_VREG r0, r2                     // r0<- vAA
+        .if !$is_object
+        // In case we're going back to compiled code, put the
+        // result also in s0.
+        vmov s0, r0
+        .endif
+      .endif
+    .endif
+    .cfi_remember_state
+    ldr ip, [rREFS, #-4]
+    mov sp, ip
+    .cfi_def_cfa sp, CALLEE_SAVES_SIZE
+    RESTORE_ALL_CALLEE_SAVES lr_to_pc=1
+    .cfi_restore_state
+
+%def op_return_object():
+%  op_return(is_object="1", is_void="0", is_wide="0", is_no_barrier="0")
+
+%def op_return_void():
+%  op_return(is_object="0", is_void="1", is_wide="0", is_no_barrier="0")
+
+%def op_return_void_no_barrier():
+%  op_return(is_object="0", is_void="1", is_wide="0", is_no_barrier="1")
+
+%def op_return_wide():
+%  op_return(is_object="0", is_void="0", is_wide="1", is_no_barrier="0")
+
+%def op_throw():
+  EXPORT_PC
+  mov      r2, rINST, lsr #8           @ r2<- AA
+  GET_VREG r0, r2                      @ r0<- vAA (exception object)
+  mov r1, rSELF
+  bl art_quick_deliver_exception
+  bkpt 0
diff --git a/runtime/interpreter/mterp/armng/floating_point.S b/runtime/interpreter/mterp/armng/floating_point.S
new file mode 100644
index 0000000..5052f13
--- /dev/null
+++ b/runtime/interpreter/mterp/armng/floating_point.S
@@ -0,0 +1,424 @@
+%def fbinop(instr=""):
+    /*
+     * Generic 32-bit floating-point operation.  Provide an "instr" line that
+     * specifies an instruction that performs "s2 = s0 op s1".  Because we
+     * use the "softfp" ABI, this must be an instruction, not a function call.
+     *
+     * For: add-float, sub-float, mul-float, div-float
+     */
+    /* floatop vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    mov     r3, r0, lsr #8              @ r3<- CC
+    and     r2, r0, #255                @ r2<- BB
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vCC
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &vBB
+    GET_VREG_FLOAT_BY_ADDR s1, r3       @ s1<- vCC
+    GET_VREG_FLOAT_BY_ADDR s0, r2       @ s0<- vBB
+
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    $instr                              @ s2<- op
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_FLOAT s2, r4, lr           @ vAA<- s2
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def fbinop2addr(instr=""):
+    /*
+     * Generic 32-bit floating point "/2addr" binary operation.  Provide
+     * an "instr" line that specifies an instruction that performs
+     * "s2 = s0 op s1".
+     *
+     * For: add-float/2addr, sub-float/2addr, mul-float/2addr, div-float/2addr
+     */
+    /* binop/2addr vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vB
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &vA
+    GET_VREG_FLOAT_BY_ADDR s1, r3       @ s1<- vB
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    GET_VREG_FLOAT_BY_ADDR s0, r4       @ s0<- vA
+    $instr                              @ s2<- op
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_FLOAT_BY_ADDR s2, r4       @ vA<- s2 (no need to clear the shadow; it's 2addr)
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def fbinopWide(instr=""):
+    /*
+     * Generic 64-bit double-precision floating point binary operation.
+     * Provide an "instr" line that specifies an instruction that performs
+     * "d2 = d0 op d1".
+     *
+     * for: add-double, sub-double, mul-double, div-double
+     */
+    /* doubleop vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    mov     r3, r0, lsr #8              @ r3<- CC
+    and     r2, r0, #255                @ r2<- BB
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vCC
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &vBB
+    GET_VREG_DOUBLE_BY_ADDR d1, r3      @ d1<- vCC
+    GET_VREG_DOUBLE_BY_ADDR d0, r2      @ d0<- vBB
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    $instr                              @ d2<- op
+    CLEAR_SHADOW_PAIR r4, ip, lr        @ Zero shadow regs
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &vAA
+    SET_VREG_DOUBLE_BY_ADDR d2, r4      @ vAA<- d2
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def fbinopWide2addr(instr=""):
+    /*
+     * Generic 64-bit floating point "/2addr" binary operation.  Provide
+     * an "instr" line that specifies an instruction that performs
+     * "d2 = d0 op d1".
+     *
+     * For: add-double/2addr, sub-double/2addr, mul-double/2addr,
+     *      div-double/2addr
+     */
+    /* binop/2addr vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vB
+    CLEAR_SHADOW_PAIR r4, ip, r0        @ Zero out shadow regs
+    GET_VREG_DOUBLE_BY_ADDR d1, r3      @ d1<- vB
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &vA
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    GET_VREG_DOUBLE_BY_ADDR d0, r4      @ d0<- vA
+    $instr                              @ d2<- op
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_DOUBLE_BY_ADDR d2, r4      @ vAA<- d2
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def funop(instr=""):
+    /*
+     * Generic 32-bit unary floating-point operation.  Provide an "instr"
+     * line that specifies an instruction that performs "s1 = op s0".
+     *
+     * for: int-to-float, float-to-int
+     */
+    /* unop vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vB
+    GET_VREG_FLOAT_BY_ADDR s0, r3       @ s0<- vB
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    $instr                              @ s1<- op
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_FLOAT s1, r4, lr           @ vA<- s1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def funopNarrower(instr=""):
+    /*
+     * Generic 64bit-to-32bit unary floating point operation.  Provide an
+     * "instr" line that specifies an instruction that performs "s0 = op d0".
+     *
+     * For: double-to-int, double-to-float
+     */
+    /* unop vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vB
+    GET_VREG_DOUBLE_BY_ADDR d0, r3      @ d0<- vB
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    $instr                              @ s0<- op
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_FLOAT s0, r4, lr           @ vA<- s0
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def funopWider(instr=""):
+    /*
+     * Generic 32bit-to-64bit floating point unary operation.  Provide an
+     * "instr" line that specifies an instruction that performs "d0 = op s0".
+     *
+     * For: int-to-double, float-to-double
+     */
+    /* unop vA, vB */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vB
+    GET_VREG_FLOAT_BY_ADDR s0, r3       @ s0<- vB
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    $instr                              @ d0<- op
+    CLEAR_SHADOW_PAIR r4, ip, lr        @ Zero shadow regs
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &vA
+    SET_VREG_DOUBLE_BY_ADDR d0, r4      @ vA<- d0
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_add_double():
+%  fbinopWide(instr="faddd   d2, d0, d1")
+
+%def op_add_double_2addr():
+%  fbinopWide2addr(instr="faddd   d2, d0, d1")
+
+%def op_add_float():
+%  fbinop(instr="fadds   s2, s0, s1")
+
+%def op_add_float_2addr():
+%  fbinop2addr(instr="fadds   s2, s0, s1")
+
+%def op_cmpg_double():
+    /*
+     * Compare two floating-point values.  Puts 0, 1, or -1 into the
+     * destination register based on the results of the comparison.
+     *
+     * int compare(x, y) {
+     *     if (x == y) {
+     *         return 0;
+     *     } else if (x < y) {
+     *         return -1;
+     *     } else if (x > y) {
+     *         return 1;
+     *     } else {
+     *         return 1;
+     *     }
+     * }
+     */
+    /* op vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &vBB
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vCC
+    GET_VREG_DOUBLE_BY_ADDR d0, r2      @ d0<- vBB
+    GET_VREG_DOUBLE_BY_ADDR d1, r3      @ d1<- vCC
+    vcmpe.f64 d0, d1                    @ compare (vBB, vCC)
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    mvn     r0, #0                      @ r0<- -1 (default)
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    fmstat                              @ export status flags
+    it      hi
+    movhi   r0, #1                      @ (greater than, or unordered) r0<- 1
+    moveq   r0, #0                      @ (equal) r0<- 0
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_cmpg_float():
+    /*
+     * Compare two floating-point values.  Puts 0, 1, or -1 into the
+     * destination register based on the results of the comparison.
+     *
+     * int compare(x, y) {
+     *     if (x == y) {
+     *         return 0;
+     *     } else if (x < y) {
+     *         return -1;
+     *     } else if (x > y) {
+     *         return 1;
+     *     } else {
+     *         return 1;
+     *     }
+     * }
+     */
+    /* op vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &vBB
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vCC
+    GET_VREG_FLOAT_BY_ADDR s0, r2       @ s0<- vBB
+    GET_VREG_FLOAT_BY_ADDR s1, r3       @ s1<- vCC
+    vcmpe.f32 s0, s1                    @ compare (vBB, vCC)
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    mvn     r0, #0                      @ r0<- -1 (default)
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    fmstat                              @ export status flags
+    it      hi
+    movhi   r0, #1                      @ (greater than, or unordered) r0<- 1
+    moveq   r0, #0                      @ (equal) r0<- 0
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_cmpl_double():
+    /*
+     * Compare two floating-point values.  Puts 0, 1, or -1 into the
+     * destination register based on the results of the comparison.
+     *
+     * int compare(x, y) {
+     *     if (x == y) {
+     *         return 0;
+     *     } else if (x > y) {
+     *         return 1;
+     *     } else if (x < y) {
+     *         return -1;
+     *     } else {
+     *         return -1;
+     *     }
+     * }
+     */
+    /* op vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &vBB
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vCC
+    GET_VREG_DOUBLE_BY_ADDR d0, r2      @ d0<- vBB
+    GET_VREG_DOUBLE_BY_ADDR d1, r3      @ d1<- vCC
+    vcmpe.f64 d0, d1                    @ compare (vBB, vCC)
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    mvn     r0, #0                      @ r0<- -1 (default)
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    fmstat                              @ export status flags
+    it gt
+    movgt   r0, #1                      @ (greater than) r0<- 1
+    it eq
+    moveq   r0, #0                      @ (equal) r0<- 0
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_cmpl_float():
+    /*
+     * Compare two floating-point values.  Puts 0, 1, or -1 into the
+     * destination register based on the results of the comparison.
+     *
+     * int compare(x, y) {
+     *     if (x == y) {
+     *         return 0;
+     *     } else if (x > y) {
+     *         return 1;
+     *     } else if (x < y) {
+     *         return -1;
+     *     } else {
+     *         return -1;
+     *     }
+     * }
+     */
+    /* op vAA, vBB, vCC */
+    FETCH r0, 1                         @ r0<- CCBB
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    and     r2, r0, #255                @ r2<- BB
+    mov     r3, r0, lsr #8              @ r3<- CC
+    VREG_INDEX_TO_ADDR r2, r2           @ r2<- &vBB
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &vCC
+    GET_VREG_FLOAT_BY_ADDR s0, r2       @ s0<- vBB
+    GET_VREG_FLOAT_BY_ADDR s1, r3       @ s1<- vCC
+    vcmpe.f32  s0, s1                   @ compare (vBB, vCC)
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    mvn     r0, #0                      @ r0<- -1 (default)
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    fmstat                              @ export status flags
+    it gt
+    movgt   r0, #1                      @ (greater than) r0<- 1
+    it eq
+    moveq   r0, #0                      @ (equal) r0<- 0
+    SET_VREG r0, r4                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_div_double():
+%  fbinopWide(instr="fdivd   d2, d0, d1")
+
+%def op_div_double_2addr():
+%  fbinopWide2addr(instr="fdivd   d2, d0, d1")
+
+%def op_div_float():
+%  fbinop(instr="fdivs   s2, s0, s1")
+
+%def op_div_float_2addr():
+%  fbinop2addr(instr="fdivs   s2, s0, s1")
+
+%def op_double_to_float():
+%  funopNarrower(instr="vcvt.f32.f64  s0, d0")
+
+%def op_double_to_int():
+%  funopNarrower(instr="vcvt.s32.f64  s0, d0")
+
+%def op_double_to_long():
+%  unopWide(instr="bl      nterp_d2l_doconv")
+
+%def op_float_to_double():
+%  funopWider(instr="vcvt.f64.f32  d0, s0")
+
+%def op_float_to_int():
+%  funop(instr="vcvt.s32.f32 s1, s0")
+
+%def op_float_to_long():
+%  unopWider(instr="bl      nterp_f2l_doconv")
+
+%def op_int_to_double():
+%  funopWider(instr="vcvt.f64.s32  d0, s0")
+
+%def op_int_to_float():
+%  funop(instr="vcvt.f32.s32  s1, s0")
+
+%def op_long_to_double():
+    /*
+     * Specialised 64-bit floating point operation.
+     *
+     * Note: The result will be returned in d2.
+     *
+     * For: long-to-double
+     */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    r4, rINST, #8, #4           @ r4<- A
+    CLEAR_SHADOW_PAIR r4, ip, lr        @ Zero shadow regs
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[B]
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[A]
+    GET_VREG_DOUBLE_BY_ADDR d0, r3      @ d0<- vB/vB+1
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+
+    vcvt.f64.s32    d1, s1              @ d1<- (double)(vBh)
+    vcvt.f64.u32    d2, s0              @ d2<- (double)(vBl)
+    vldr            d3, constval$opcode @ d3<- 2^32
+    vmla.f64        d2, d1, d3          @ d2<- vBh*2^32 + vBl
+
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_DOUBLE_BY_ADDR d2, r4      @ vAA<- d2
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+    /* literal pool helper */
+constval${opcode}:
+    .8byte          0x41f0000000000000
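+    @ The constant above is 2^32 as an IEEE-754 double: sign 0, biased
+    @ exponent 0x41f = 1023 + 32, zero mantissa, i.e. 1.0 * 2^32.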
+
+%def op_long_to_float():
+%  unopNarrower(instr="bl      __aeabi_l2f")
+
+%def op_mul_double():
+%  fbinopWide(instr="fmuld   d2, d0, d1")
+
+%def op_mul_double_2addr():
+%  fbinopWide2addr(instr="fmuld   d2, d0, d1")
+
+%def op_mul_float():
+%  fbinop(instr="fmuls   s2, s0, s1")
+
+%def op_mul_float_2addr():
+%  fbinop2addr(instr="fmuls   s2, s0, s1")
+
+%def op_neg_double():
+%  unopWide(instr="add     r1, r1, #0x80000000")
+
+%def op_neg_float():
+%  unop(instr="add     r0, r0, #0x80000000")
+
+%def op_rem_double():
+/* EABI doesn't define a double remainder function, but libm does */
+%  binopWide(instr="bl      fmod")
+
+%def op_rem_double_2addr():
+/* EABI doesn't define a double remainder function, but libm does */
+%  binopWide2addr(instr="bl      fmod")
+
+%def op_rem_float():
+/* EABI doesn't define a float remainder function, but libm does */
+%  binop(instr="bl      fmodf")
+
+%def op_rem_float_2addr():
+/* EABI doesn't define a float remainder function, but libm does */
+%  binop2addr(instr="bl      fmodf")
+
+%def op_sub_double():
+%  fbinopWide(instr="fsubd   d2, d0, d1")
+
+%def op_sub_double_2addr():
+%  fbinopWide2addr(instr="fsubd   d2, d0, d1")
+
+%def op_sub_float():
+%  fbinop(instr="fsubs   s2, s0, s1")
+
+%def op_sub_float_2addr():
+%  fbinop2addr(instr="fsubs   s2, s0, s1")
diff --git a/runtime/interpreter/mterp/armng/invoke.S b/runtime/interpreter/mterp/armng/invoke.S
new file mode 100644
index 0000000..47678dc
--- /dev/null
+++ b/runtime/interpreter/mterp/armng/invoke.S
@@ -0,0 +1,183 @@
+%def op_invoke_custom():
+   EXPORT_PC
+   FETCH r0, 1 // call_site index, first argument of runtime call.
+   b NterpCommonInvokeCustom
+
+%def op_invoke_custom_range():
+   EXPORT_PC
+   FETCH r0, 1 // call_site index, first argument of runtime call.
+   b NterpCommonInvokeCustomRange
+
+%def invoke_direct_or_super(helper="", range="", is_super=""):
+   EXPORT_PC
+   // Fast-path which gets the method from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 2f
+1:
+   // Load the first argument (the 'this' pointer).
+   FETCH r1, 2
+   .if !$range
+   and r1, r1, #0xf
+   .endif
+   GET_VREG r1, r1
+   cmp r1, #0
+   beq common_errNullObject    // bail if null
+   b $helper
+2:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl nterp_get_method
+   .if $is_super
+   b 1b
+   .else
+   tst r0, #1
+   beq 1b
+   and r0, r0, #-2 // Remove the extra bit that marks it as a String.<init> method.
+   .if $range
+   b NterpHandleStringInitRange
+   .else
+   b NterpHandleStringInit
+   .endif
+   .endif
+
+%def op_invoke_direct():
+%  invoke_direct_or_super(helper="NterpCommonInvokeInstance", range="0", is_super="0")
+
+%def op_invoke_direct_range():
+%  invoke_direct_or_super(helper="NterpCommonInvokeInstanceRange", range="1", is_super="0")
+
+%def op_invoke_super():
+%  invoke_direct_or_super(helper="NterpCommonInvokeInstance", range="0", is_super="1")
+
+%def op_invoke_super_range():
+%  invoke_direct_or_super(helper="NterpCommonInvokeInstanceRange", range="1", is_super="1")
+
+%def op_invoke_polymorphic():
+   EXPORT_PC
+   // No need to fetch the target method.
+   // Load the first argument (the 'this' pointer).
+   FETCH r1, 2
+   and r1, r1, #0xf
+   GET_VREG r1, r1
+   cmp r1, #0
+   beq common_errNullObject    // bail if null
+   b NterpCommonInvokePolymorphic
+
+%def op_invoke_polymorphic_range():
+   EXPORT_PC
+   // No need to fetch the target method.
+   // Load the first argument (the 'this' pointer).
+   FETCH r1, 2
+   GET_VREG r1, r1
+   cmp r1, #0
+   beq common_errNullObject    // bail if null
+   b NterpCommonInvokePolymorphicRange
+
+%def invoke_interface(range=""):
+   EXPORT_PC
+   // Fast-path which gets the method from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 2f
+1:
+   // First argument is the 'this' pointer.
+   FETCH r1, 2
+   .if !$range
+   and r1, r1, #0xf
+   .endif
+   GET_VREG r1, r1
+   // Note: if r1 is null, this will be handled by our SIGSEGV handler.
+   ldr r2, [r1, #MIRROR_OBJECT_CLASS_OFFSET]
+   ldr r2, [r2, #MIRROR_CLASS_IMT_PTR_OFFSET_32]
+   ldr r0, [r2, r0, lsl #2]
+   .if $range
+   b NterpCommonInvokeInterfaceRange
+   .else
+   b NterpCommonInvokeInterface
+   .endif
+2:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl nterp_get_method
+   // For j.l.Object interface calls, the high bit is set and the method index is in the low 16 bits.
+   cmp r0, #0
+   bge 1b
+   ubfx r0, r0, #0, #16
+   .if $range
+   b NterpHandleInvokeInterfaceOnObjectMethodRange
+   .else
+   b NterpHandleInvokeInterfaceOnObjectMethod
+   .endif
+
+%def op_invoke_interface():
+%  invoke_interface(range="0")
+
+%def op_invoke_interface_range():
+%  invoke_interface(range="1")
+
+%def invoke_static(helper=""):
+   EXPORT_PC
+   // Fast-path which gets the method from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 1f
+   b $helper
+1:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl nterp_get_method
+   b $helper
+
+%def op_invoke_static():
+%  invoke_static(helper="NterpCommonInvokeStatic")
+
+%def op_invoke_static_range():
+%  invoke_static(helper="NterpCommonInvokeStaticRange")
+
+%def invoke_virtual(helper="", range=""):
+   EXPORT_PC
+   // Fast-path which gets the vtable offset from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r2, 2f
+1:
+   FETCH r1, 2
+   .if !$range
+   and r1, r1, #0xf
+   .endif
+   GET_VREG r1, r1
+   // Note: if r1 is null, this will be handled by our SIGSEGV handler.
+   ldr r0, [r1, #MIRROR_OBJECT_CLASS_OFFSET]
+   add r0, r0, #MIRROR_CLASS_VTABLE_OFFSET_32
+   ldr r0, [r0, r2, lsl #2]
+   b $helper
+2:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl nterp_get_method
+   mov r2, r0
+   b 1b
+
+%def op_invoke_virtual():
+%  invoke_virtual(helper="NterpCommonInvokeInstance", range="0")
+
+%def op_invoke_virtual_range():
+%  invoke_virtual(helper="NterpCommonInvokeInstanceRange", range="1")
+
+%def invoke_virtual_quick(helper="", range=""):
+   EXPORT_PC
+   FETCH r2, 1  // offset
+   // First argument is the 'this' pointer.
+   FETCH r1, 2 // arguments
+   .if !$range
+   and r1, r1, #0xf
+   .endif
+   GET_VREG r1, r1
+   // Note: if r1 is null, this will be handled by our SIGSEGV handler.
+   ldr r0, [r1, #MIRROR_OBJECT_CLASS_OFFSET]
+   add r0, r0, #MIRROR_CLASS_VTABLE_OFFSET_32
+   ldr r0, [r0, r2, lsl #2]
+   b $helper
+
+%def op_invoke_virtual_quick():
+%  invoke_virtual_quick(helper="NterpCommonInvokeInstance", range="0")
+
+%def op_invoke_virtual_range_quick():
+%  invoke_virtual_quick(helper="NterpCommonInvokeInstanceRange", range="1")
diff --git a/runtime/interpreter/mterp/armng/main.S b/runtime/interpreter/mterp/armng/main.S
new file mode 100644
index 0000000..0b14006
--- /dev/null
+++ b/runtime/interpreter/mterp/armng/main.S
@@ -0,0 +1,2333 @@
+%def header():
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This is a #include, not a %include, because we want the C pre-processor
+ * to expand the macros into assembler assignment statements.
+ */
+#include "asm_support.h"
+#include "arch/arm/asm_support_arm.S"
+
+/**
+ * ARM EABI general notes:
+ *
+ * r0-r3 hold first 4 args to a method; they are not preserved across method calls
+ * r4-r8 are available for general use
+ * r9 is given special treatment in some situations, but not for us
+ * r10 (sl) seems to be generally available
+ * r11 (fp) is used by gcc (unless -fomit-frame-pointer is set)
+ * r12 (ip) is scratch -- not preserved across method calls
+ * r13 (sp) should be managed carefully in case a signal arrives
+ * r14 (lr) must be preserved
+ * r15 (pc) can be tinkered with directly
+ *
+ * r0 holds returns of <= 4 bytes
+ * r0-r1 hold returns of 8 bytes, low word in r0
+ *
+ * Callee must save/restore r4+ (except r12) if it modifies them.  If VFP
+ * is present, registers s16-s31 (a/k/a d8-d15, a/k/a q4-q7) must be preserved,
+ * s0-s15 (d0-d7, q0-q3) do not need to be.
+ *
+ * Stack is "full descending".  Only the arguments that don't fit in the first 4
+ * registers are placed on the stack.  "sp" points at the first stacked argument
+ * (i.e. the 5th arg).
+ *
+ * The native ABI uses soft-float: single-precision results are in r0,
+ * double-precision results in r0-r1.
+ *
+ * In the EABI, "sp" must be 64-bit aligned on entry to a function, and any
+ * 64-bit quantities (long long, double) must be 64-bit aligned.
+ *
+ * Nterp notes:
+ *
+ * The following registers have fixed assignments:
+ *
+ *   reg nick      purpose
+ *   r5  rFP       interpreted frame pointer, used for accessing locals and args
+ *   r6  rREFS     base of object references of dex registers
+ *   r7  rINST     first 16-bit code unit of current instruction
+ *   r8  rMR       marking register
+ *   r9  rSELF     self (Thread) pointer
+ *   r10 rIBASE    interpreted instruction base pointer, used for computed goto
+ *   r11 rPC       interpreted program counter, used for fetching instructions
+ *
+ *   r4, ip, and lr can be used as temporaries
+ *
+ * Note that r4 is a callee-save register in ARM EABI, but not in managed code.
+ *
+ */
+
+/* single-purpose registers, given names for clarity */
+#define CFI_DEX  11 // DWARF register number of the register holding dex-pc (rPC).
+#define CFI_TMP  0  // DWARF register number of the first argument register (r0).
+#define CFI_REFS 6
+#define rFP      r5
+#define rREFS    r6
+#define rINST    r7
+#define rSELF    r9
+#define rIBASE   r10
+#define rPC      r11
+
+// To avoid putting ifdefs around the use of rMR, make sure it's defined.
+// IsNterpSupported returns false for configurations that don't have rMR (typically CMS).
+#ifndef rMR
+#define rMR r8
+#endif
+
+// Temporary registers while setting up a frame.
+#define rNEW_FP   r8
+#define rNEW_REFS r10
+#define CFI_NEW_REFS 10
+
+#define CALLEE_SAVES_SIZE (9 * 4 + 16 * 4)
+
+// +4 for the ArtMethod of the caller.
+#define OFFSET_TO_FIRST_ARGUMENT_IN_STACK (CALLEE_SAVES_SIZE + 4)
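+// For reference: CALLEE_SAVES_SIZE is 9 core registers (36 bytes) plus
+// s16-s31 (64 bytes) = 100 bytes, so with the ArtMethod slot the first
+// stacked argument sits at sp + 104.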
+
+/*
+ * Fetch the next instruction from rPC into rINST.  Does not advance rPC.
+ */
+.macro FETCH_INST
+    ldrh    rINST, [rPC]
+.endm
+
+/*
+ * Fetch the next instruction from the specified offset.  Advances rPC
+ * to point to the next instruction.  "count" is in 16-bit code units.
+ *
+ * Because of the limited size of immediate constants on ARM, this is only
+ * suitable for small forward movements (i.e. don't try to implement "goto"
+ * with this).
+ *
+ * This must come AFTER anything that can throw an exception, or the
+ * exception catch may miss.  (This also implies that it must come after
+ * EXPORT_PC.)
+ */
+.macro FETCH_ADVANCE_INST count
+    ldrh    rINST, [rPC, #((\count)*2)]!
+.endm
+
+/*
+ * Similar to FETCH_ADVANCE_INST, but does not update rPC.  Used to load
+ * rINST ahead of a possible exception point.  Be sure to manually advance rPC
+ * later.
+ */
+.macro PREFETCH_INST count
+    ldrh    rINST, [rPC, #((\count)*2)]
+.endm
+
+/* Advance rPC by some number of code units. */
+.macro ADVANCE count
+  add  rPC, #((\count)*2)
+.endm
+
+/*
+ * Fetch the next instruction from an offset specified by "reg", advancing
+ * rPC to point to the next instruction.  "reg" must specify the distance
+ * in bytes, *not* 16-bit code units, and may be a signed value.
+ */
+.macro FETCH_ADVANCE_INST_RB reg
+    ldrh    rINST, [rPC, \reg]!
+.endm
+
+/*
+ * Fetch a half-word code unit from an offset past the current PC.  The
+ * "count" value is in 16-bit code units.  Does not advance xPC.
+ *
+ * The "_S" variant works the same but treats the value as signed.
+ */
+.macro FETCH reg, count
+    ldrh    \reg, [rPC, #((\count)*2)]
+.endm
+
+.macro FETCH_S reg, count
+    ldrsh   \reg, [rPC, #((\count)*2)]
+.endm
+
+/*
+ * Fetch one byte from an offset past the current PC.  Pass in the same
+ * "count" as you would for FETCH, and an additional 0/1 indicating which
+ * byte of the halfword you want (lo/hi).
+ */
+.macro FETCH_B reg, count, byte
+    ldrb     \reg, [rPC, #((\count)*2+(\byte))]
+.endm
+
+/*
+ * Put the instruction's opcode field into the specified register.
+ */
+.macro GET_INST_OPCODE reg
+    and     \reg, rINST, #255
+.endm
+
+/*
+ * Begin executing the opcode in "reg".
+ */
+
+.macro GOTO_OPCODE reg
+    add     pc, rIBASE, \reg, lsl #${handler_size_bits}
+.endm
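+
+// Each opcode handler occupies 2^handler_size_bits bytes, so the computed
+// goto above lands at rIBASE + (opcode << handler_size_bits).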
+
+/*
+ * Get/set value from a Dalvik register.
+ */
+.macro GET_VREG reg, vreg
+    ldr     \reg, [rFP, \vreg, lsl #2]
+.endm
+.macro GET_VREG_OBJECT reg, vreg
+    ldr     \reg, [rREFS, \vreg, lsl #2]
+.endm
+.macro SET_VREG reg, vreg
+    str     \reg, [rFP, \vreg, lsl #2]
+    mov     \reg, #0
+    str     \reg, [rREFS, \vreg, lsl #2]
+.endm
+.macro SET_VREG_OBJECT reg, vreg
+    str     \reg, [rFP, \vreg, lsl #2]
+    str     \reg, [rREFS, \vreg, lsl #2]
+.endm
+.macro SET_VREG_FLOAT reg, vreg, tmpreg
+    add     \tmpreg, rFP, \vreg, lsl #2
+    vstr    \reg, [\tmpreg]
+    mov     \tmpreg, #0
+    str     \tmpreg, [rREFS, \vreg, lsl #2]
+.endm
+.macro GET_VREG_WIDE_BY_ADDR reg0, reg1, addr
+    ldmia \addr, {\reg0, \reg1}
+.endm
+.macro SET_VREG_WIDE_BY_ADDR reg0, reg1, addr
+    stmia \addr, {\reg0, \reg1}
+.endm
+.macro GET_VREG_FLOAT sreg, vreg
+    ldr  \vreg, [rFP, \vreg, lsl #2]
+    vmov \sreg, \vreg
+.endm
+.macro GET_VREG_FLOAT_BY_ADDR reg, addr
+    vldr \reg, [\addr]
+.endm
+.macro SET_VREG_FLOAT_BY_ADDR reg, addr
+    vstr \reg, [\addr]
+.endm
+.macro GET_VREG_DOUBLE_BY_ADDR reg, addr
+    vldr \reg, [\addr]
+.endm
+.macro SET_VREG_DOUBLE_BY_ADDR reg, addr
+    vstr \reg, [\addr]
+.endm
+.macro SET_VREG_SHADOW reg, vreg
+    str     \reg, [rREFS, \vreg, lsl #2]
+.endm
+.macro CLEAR_SHADOW_PAIR vreg, tmp1, tmp2
+    mov     \tmp1, #0
+    add     \tmp2, \vreg, #1
+    SET_VREG_SHADOW \tmp1, \vreg
+    SET_VREG_SHADOW \tmp1, \tmp2
+.endm
+.macro VREG_INDEX_TO_ADDR reg, vreg
+    add     \reg, rFP, \vreg, lsl #2
+.endm
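+
+// Note on the shadow frame: rREFS mirrors rFP slot for slot. SET_VREG zeroes
+// the shadow slot so the GC does not see a stale reference, while
+// SET_VREG_OBJECT writes the value to both arrays.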
+
+// An assembly entry that has an OatQuickMethodHeader prefix.
+.macro OAT_ENTRY name, end
+    .arm
+    .type \name, #function
+    .hidden \name
+    .global \name
+    .balign 16
+    // Padding of 8 bytes to get 16-byte alignment of the code entry.
+    .long 0
+    .long 0
+    // OatQuickMethodHeader.
+    .long 0
+    .long (\end - \name)
+\name:
+.endm
+
+.macro SIZE name
+    .size \name, .-\name
+.endm
+
+.macro NAME_START name
+    .arm
+    .type \name, #function
+    .hidden \name  // Hide this as a global symbol, so we do not incur PLT calls.
+    .global \name
+    /* Cache alignment for function entry */
+    .balign 16
+\name:
+.endm
+
+.macro NAME_END name
+  SIZE \name
+.endm
+
+// Macro for defining entrypoints into the runtime. We don't need to save
+// registers (we're not holding references there), but there is no kDontSave
+// runtime method, so we just use the kSaveRefsOnly runtime method.
+.macro NTERP_TRAMPOLINE name, helper
+ENTRY \name
+  SETUP_SAVE_REFS_ONLY_FRAME ip
+  bl \helper
+  RESTORE_SAVE_REFS_ONLY_FRAME
+  REFRESH_MARKING_REGISTER
+  RETURN_OR_DELIVER_PENDING_EXCEPTION
+END \name
+.endm
+
+.macro CLEAR_STATIC_VOLATILE_MARKER reg
+  and \reg, \reg, #-2
+.endm
+
+.macro CLEAR_INSTANCE_VOLATILE_MARKER reg
+  rsb \reg, \reg, #0
+.endm
+
+.macro EXPORT_PC
+    str    rPC, [rREFS, #-8]
+.endm
+
+.macro BRANCH
+    // Update method counter and do a suspend check if the branch is negative.
+    cmp rINST, #0
+    blt 2f
+1:
+    add r2, rINST, rINST                // r2<- byte offset
+    FETCH_ADVANCE_INST_RB r2            // update rPC, load rINST
+    GET_INST_OPCODE ip                  // extract opcode from rINST
+    GOTO_OPCODE ip                      // jump to next instruction
+2:
+    ldr r0, [sp]
+    ldrh r2, [r0, #ART_METHOD_HOTNESS_COUNT_OFFSET]
+    add r2, r2, #1
+    ubfx r2, r2, #0, #NTERP_HOTNESS_BITS
+    strh r2, [r0, #ART_METHOD_HOTNESS_COUNT_OFFSET]
+    // If the counter overflows, handle this in the runtime.
+    cmp r2, #0
+    beq NterpHandleHotnessOverflow
+    // Otherwise, do a suspend check.
+    ldr r0, [rSELF, #THREAD_FLAGS_OFFSET]
+    ands r0, r0, #THREAD_SUSPEND_OR_CHECKPOINT_REQUEST
+    beq 1b
+    EXPORT_PC
+    bl    art_quick_test_suspend
+    b 1b
+.endm
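+
+// Note: ubfx truncates the incremented count to NTERP_HOTNESS_BITS, so r2
+// reads as zero exactly when the counter wraps, triggering the overflow path.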
+
+// Expects:
+// - ip and lr to be available.
+// Outputs:
+// - \registers contains the dex registers size
+// - \outs contains the outs size
+// - if load_ins is 1, \ins contains the ins
+// - \code_item is replaced with a pointer to the instructions
+.macro FETCH_CODE_ITEM_INFO code_item, registers, outs, ins, load_ins
+    tst \code_item, #1
+    beq 5f
+    bic \code_item, \code_item, #1 // Remove the extra bit that marks it as a compact dex file.
+    ldrh lr, [\code_item, #COMPACT_CODE_ITEM_FIELDS_OFFSET]
+    ubfx \registers, lr, #COMPACT_CODE_ITEM_REGISTERS_SIZE_SHIFT, #4
+    ubfx \outs, lr, #COMPACT_CODE_ITEM_OUTS_SIZE_SHIFT, #4
+    .if \load_ins
+    ubfx \ins, lr, #COMPACT_CODE_ITEM_INS_SIZE_SHIFT, #4
+    .else
+    ubfx ip, lr, #COMPACT_CODE_ITEM_INS_SIZE_SHIFT, #4
+    add \registers, \registers, ip
+    .endif
+
+    ldrh lr, [\code_item, #COMPACT_CODE_ITEM_FLAGS_OFFSET]
+    tst lr, #COMPACT_CODE_ITEM_REGISTERS_INS_OUTS_FLAGS
+    beq 4f
+    mov ip, \code_item
+    tst lr, #COMPACT_CODE_ITEM_INSNS_FLAG
+    beq 1f
+    sub ip, ip, #4
+1:
+    tst lr, #COMPACT_CODE_ITEM_REGISTERS_FLAG
+    beq 2f
+    ldrh lr, [ip, #-2]!
+    add \registers, \registers, lr
+    ldrh lr, [\code_item, #COMPACT_CODE_ITEM_FLAGS_OFFSET]
+2:
+    tst lr, #COMPACT_CODE_ITEM_INS_FLAG
+    beq 3f
+    ldrh lr, [ip, #-2]!
+    .if \load_ins
+    add \ins, \ins, lr
+    .else
+    add \registers, \registers, lr
+    .endif
+    ldrh lr, [\code_item, #COMPACT_CODE_ITEM_FLAGS_OFFSET]
+3:
+    tst lr, #COMPACT_CODE_ITEM_OUTS_FLAG
+    beq 4f
+    ldrh lr, [ip, #-2]!
+    add \outs, \outs, lr
+4:
+    .if \load_ins
+    add \registers, \registers, \ins
+    .endif
+    add \code_item, \code_item, #COMPACT_CODE_ITEM_INSNS_OFFSET
+    b 6f
+5:
+    // Fetch dex register size.
+    ldrh \registers, [\code_item, #CODE_ITEM_REGISTERS_SIZE_OFFSET]
+    // Fetch outs size.
+    ldrh \outs, [\code_item, #CODE_ITEM_OUTS_SIZE_OFFSET]
+    .if \load_ins
+    ldrh \ins, [\code_item, #CODE_ITEM_INS_SIZE_OFFSET]
+    .endif
+    add \code_item, \code_item, #CODE_ITEM_INSNS_OFFSET
+6:
+.endm
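+
+// For reference: a standard dex code item begins with registers_size, ins_size,
+// outs_size and tries_size (one u2 each), which is why the non-compact path
+// above can load the three sizes at fixed offsets.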
+
+// Set up the stack to start executing the method. Expects:
+// - r0 to contain the ArtMethod
+// - \code_item to already contain the code item
+// - rINST, ip, lr to be available
+//
+// Outputs
+// - rINST contains the dex registers size
+// - ip contains the old stack pointer.
+// - \code_item is replaced with a pointer to the instructions
+// - if load_ins is 1, r4 contains the ins
+//
+.macro SETUP_STACK_FRAME code_item, refs, fp, cfi_refs, load_ins
+    FETCH_CODE_ITEM_INFO \code_item, rINST, \refs, r4, \load_ins
+
+    // Compute required frame size: ((2 * rINST) + \refs) * 4 + 12
+    // 12 is for saving the previous frame, pc, and method being executed.
+    add ip, \refs, rINST, lsl #1
+
+    // Compute new stack pointer in lr
+    sub lr, sp, #12
+    sub lr, lr, ip, lsl #2
+    // Alignment
+    and lr, lr, #-16
+
+    // Set reference and dex registers.
+    add \refs, lr, \refs, lsl #2
+    add \refs, \refs, #12
+    add \fp, \refs, rINST, lsl #2
+
+    // Now setup the stack pointer.
+    mov ip, sp
+    .cfi_def_cfa_register ip
+    mov sp, lr
+    str ip, [\refs, #-4]
+    CFI_DEF_CFA_BREG_PLUS_UCONST \cfi_refs, -4, CALLEE_SAVES_SIZE
+
+    // Save the ArtMethod, and use r0 as a temporary.
+    str r0, [sp]
+
+    // Put nulls in reference frame.
+    cmp rINST, #0
+    beq 2f
+    mov lr, \refs
+    mov r0, #0
+1:
+    str r0, [lr], #4
+    str r0, [lr], #4  // May clear vreg[0].
+    cmp lr, \fp
+    blo 1b
+2:
+    ldr r0, [sp]  // Reload the ArtMethod, expected by the callers.
+.endm
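+
+// Worked example (illustrative): with registers_size = 5 and outs_size = 2, the
+// frame needs ((2 * 5) + 2) * 4 + 12 = 60 bytes, and the new sp is then rounded
+// down to a 16-byte boundary.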
+
+// Increase method hotness and do a suspend check before starting to execute the method.
+.macro START_EXECUTING_INSTRUCTIONS
+    ldr r0, [sp]
+    ldrh r2, [r0, #ART_METHOD_HOTNESS_COUNT_OFFSET]
+    add r2, r2, #1
+    ubfx r2, r2, #0, #NTERP_HOTNESS_BITS
+    strh r2, [r0, #ART_METHOD_HOTNESS_COUNT_OFFSET]
+    // If the counter overflows, handle this in the runtime.
+    cmp r2, #0
+    beq 2f
+    ldr r0, [rSELF, #THREAD_FLAGS_OFFSET]
+    tst r0, #THREAD_SUSPEND_OR_CHECKPOINT_REQUEST
+    bne 3f
+1:
+    FETCH_INST
+    GET_INST_OPCODE ip
+    GOTO_OPCODE ip
+2:
+    mov r1, #0
+    mov r2, rFP
+    bl nterp_hot_method
+    b 1b
+3:
+    EXPORT_PC
+    bl art_quick_test_suspend
+    b 1b
+.endm
+
+.macro SPILL_ALL_CALLEE_SAVES
+    SPILL_ALL_CALLEE_SAVE_GPRS                    @ 9 words (36 bytes) of callee saves.
+    vpush {s16-s31}                               @ 16 words (64 bytes) of floats.
+    .cfi_adjust_cfa_offset 64
+.endm
+
+.macro RESTORE_ALL_CALLEE_SAVES lr_to_pc=0
+    vpop {s16-s31}
+    .cfi_adjust_cfa_offset -64
+    pop {r4-r7}
+    .cfi_adjust_cfa_offset -16
+    .cfi_restore r4
+    .cfi_restore r5
+    .cfi_restore r6
+    .cfi_restore r7
+    // Don't restore r8; the marking register gets updated when coming back from the runtime.
+    add sp, sp, #4
+    .cfi_adjust_cfa_offset -4
+    .if \lr_to_pc
+    pop {r9-r11, pc}  @ Restore r9-r11 and return (lr goes to pc).
+    .else
+    pop {r9-r11, lr}  @ Restore r9-r11 and lr.
+    .cfi_adjust_cfa_offset -16
+    .cfi_restore r9
+    .cfi_restore r10
+    .cfi_restore r11
+    .cfi_restore lr
+    .endif
+.endm
+
+.macro SPILL_ALL_ARGUMENTS
+    // We spill r4 for stack alignment.
+    push {r0-r4}
+    .cfi_adjust_cfa_offset 20
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset r4, 16
+    vpush {s0-s15}
+    .cfi_adjust_cfa_offset 64
+.endm
+
+.macro RESTORE_ALL_ARGUMENTS
+    vpop {s0-s15}
+    .cfi_adjust_cfa_offset -64
+    pop {r0-r4}
+    .cfi_restore r0
+    .cfi_restore r1
+    .cfi_restore r2
+    .cfi_restore r3
+    .cfi_restore r4
+    .cfi_adjust_cfa_offset -20
+.endm
+
+// Helper to set up the stack after doing a nterp to nterp call. This sets up:
+// - rNEW_FP: the new pointer to dex registers
+// - rNEW_REFS: the new pointer to references
+// - rPC: the new PC pointer to execute
+// - r2: the instruction bits used to decode the number of arguments.
+// - r3: first dex register for range invokes, up to 4 arguments for non-range invokes.
+// - r4: top of dex register array
+//
+// The macro expects:
+// - r0 to contain the ArtMethod
+// - r4 to contain the code item
+.macro SETUP_STACK_FOR_INVOKE
+   // We do the same stack overflow check as the compiler. See CanMethodUseNterp
+   // for how we limit the maximum nterp frame size.
+   sub ip, sp, #STACK_OVERFLOW_RESERVED_BYTES
+   ldr ip, [ip]
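+   // The load above is an implicit stack probe: if the frame would overflow,
+   // it faults and the fault handler raises StackOverflowError.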
+
+   // Spill all callee saves to have a consistent stack frame whether we
+   // are called by compiled code or nterp.
+   SPILL_ALL_CALLEE_SAVES
+
+   // Setup the frame.
+   SETUP_STACK_FRAME r4, rNEW_REFS, rNEW_FP, CFI_NEW_REFS, load_ins=0
+
+   // Fetch instruction information before replacing rPC.
+   FETCH_B r2, 0, 1
+   FETCH r3, 2
+
+   // Set the dex pc pointer.
+   mov rPC, r4
+
+   // Make r4 point to the top of the dex register array.
+   add r4, rNEW_FP, rINST, lsl #2
+
+   CFI_DEFINE_DEX_PC_WITH_OFFSET(CFI_TMP, CFI_DEX, 0)
+.endm
+
+// Set up arguments based on a non-range nterp to nterp call, and start executing
+// the method. We expect:
+// - rNEW_FP: the new pointer to dex registers
+// - rPC: the new PC pointer to execute
+// - r2: number of arguments (bits 4-7), 5th argument if any (bits 0-3)
+// - r3: up to four dex register arguments
+// - r4: top of dex register array
+// - r1: receiver if non-static.
+//
+// Uses r0 and rINST as temporaries.
+.macro SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=0
+   // /* op vA, vB, {vC...vG} */
+   .if \is_static
+   asrs   r0, r2, #4
+   beq    6f
+   .else
+   asr    r0, r2, #4
+   .endif
+   mov rINST, #-4
+   cmp r0, #2
+   blt 1f
+   beq 2f
+   cmp r0, #4
+   blt 3f
+   beq 4f
+
+  // We use a decrementing rINST to store references relative
+  // to rNEW_FP and dex registers relative to r4
+  //
+  // TODO: We could set up rINST as the number of registers (this can be an additional output from
+  // SETUP_STACK_FOR_INVOKE) and then just decrement it by one before copying each arg.
+  // Maybe even introduce macros NEW_VREG_ADDRESS/NEW_VREG_REF_ADDRESS.
+5:
+   and         r2, r2, #15
+   GET_VREG_OBJECT r0, r2
+   str         r0, [rNEW_FP, rINST]
+   GET_VREG    r0, r2
+   str         r0, [r4, rINST]
+   sub         rINST, rINST, #4
+4:
+   asr         r2, r3, #12
+   GET_VREG_OBJECT r0, r2
+   str         r0, [rNEW_FP, rINST]
+   GET_VREG    r0, r2
+   str         r0, [r4, rINST]
+   sub         rINST, rINST, #4
+3:
+   ubfx        r2, r3, #8, #4
+   GET_VREG_OBJECT r0, r2
+   str         r0, [rNEW_FP, rINST]
+   GET_VREG    r0, r2
+   str         r0, [r4, rINST]
+   sub         rINST, rINST, #4
+2:
+   ubfx        r2, r3, #4, #4
+   GET_VREG_OBJECT r0, r2
+   str         r0, [rNEW_FP, rINST]
+   GET_VREG    r0, r2
+   str         r0, [r4, rINST]
+   .if !\is_string_init
+   sub         rINST, rINST, #4
+   .endif
+1:
+   .if \is_string_init
+   // Ignore the first argument
+   .elseif \is_static
+   and         r2, r3, #0xf
+   GET_VREG_OBJECT r0, r2
+   str         r0, [rNEW_FP, rINST]
+   GET_VREG    r0, r2
+   str         r0, [r4, rINST]
+   .else
+   str         r1, [rNEW_FP, rINST]
+   str         r1, [r4, rINST]
+   .endif
+
+6:
+   // Start executing the method.
+   mov rFP, rNEW_FP
+   mov rREFS, rNEW_REFS
+   CFI_DEF_CFA_BREG_PLUS_UCONST CFI_REFS, -4, CALLEE_SAVES_SIZE
+   // r8 was used for setting up the frame; restore it now.
+   REFRESH_MARKING_REGISTER
+   // Branch to the main handler, which will reload rIBASE
+   // (it was used for setting up the frame).
+   b .Lexecute_instructions
+.endm
+
+// Set up arguments based on a range nterp to nterp call, and start executing
+// the method.
+// - rNEW_FP: the new pointer to dex registers
+// - rNEW_REFS: the new pointer to references
+// - rPC: the new PC pointer to execute
+// - r2: number of arguments
+// - r3: first dex register
+// - r4: top of dex register array
+// - r1: receiver if non-static.
+//
+// Expects r0 to be available.
+.macro SETUP_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=0
+   mov r0, #-4
+   .if \is_string_init
+   // Ignore the first argument
+   sub r2, r2, #1
+   add r3, r3, #1
+   .elseif !\is_static
+   sub r2, r2, #1
+   add r3, r3, #1
+   .endif
+
+   cmp r2, #0
+   beq 2f
+   add rREFS, rREFS, r3, lsl #2  // pointer to the first argument in the reference array
+   add rREFS, rREFS, r2, lsl #2  // pointer one past the last argument in the reference array
+   add rFP, rFP, r3, lsl #2      // pointer to the first argument in the register array
+   add rFP, rFP, r2, lsl #2      // pointer one past the last argument in the register array
+1:
+   ldr  r3, [rREFS, #-4]!
+   str  r3, [rNEW_FP, r0]
+   subs r2, r2, 1
+   ldr  r3, [rFP, #-4]!
+   str  r3, [r4, r0]
+   sub r0, r0, 4
+   bne 1b
+2:
+   .if \is_string_init
+   // Ignore first argument
+   .elseif !\is_static
+   str r1, [rNEW_FP, r0]
+   str r1, [r4, r0]
+   .endif
+   mov rFP, rNEW_FP
+   mov rREFS, rNEW_REFS
+   CFI_DEF_CFA_BREG_PLUS_UCONST CFI_REFS, -4, CALLEE_SAVES_SIZE
+   // r8 was used for setting up the frame; restore it now.
+   REFRESH_MARKING_REGISTER
+   // Branch to the main handler, which will reload rIBASE
+   // (it was used for setting up the frame).
+   b .Lexecute_instructions
+.endm
+
+.macro GET_SHORTY dest, is_interface, is_polymorphic, is_custom
+   push {r0-r3}
+   .if \is_polymorphic
+   ldr r0, [sp, #16]
+   mov r1, rPC
+   bl NterpGetShortyFromInvokePolymorphic
+   .elseif \is_custom
+   ldr r0, [sp, #16]
+   mov r1, rPC
+   bl NterpGetShortyFromInvokeCustom
+   .elseif \is_interface
+   ldr r0, [sp, #16]
+   FETCH r1, 1
+   bl NterpGetShortyFromMethodId
+   .else
+   bl NterpGetShorty
+   .endif
+   mov \dest, r0
+   pop {r0-r3}
+.endm
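+
+// For reference: a shorty lists the return type first, then one character per
+// argument, e.g. "DJF" for a method taking (long, float) and returning double;
+// 'L' stands for any reference type.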
+
+// Input:  r0 contains the ArtMethod
+// Output: r4 contains the code item
+.macro GET_CODE_ITEM
+   ldr r4, [r0, #ART_METHOD_DATA_OFFSET_32]
+.endm
+
+.macro DO_ENTRY_POINT_CHECK call_compiled_code, name
+   // On entry, the method is r0, the instance is r1
+   ldr r2, .Lfetch_nterp_\name
+.Lfetch_location_\name:
+   // Note that this won't work for thumb.
+   sub r2, pc, r2
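+   // (In ARM state pc reads as the current instruction's address plus 8; in
+   // Thumb it reads as '.' + 4 and function addresses have bit 0 set, which
+   // is why this computation is ARM-only.)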
+   ldr r3, [r0, #ART_METHOD_QUICK_CODE_OFFSET_32]
+   cmp r2, r3
+   bne  \call_compiled_code
+.endm
+
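+// After a String.<init> call, the new string replaces the uninitialized
+// receiver, so every dex register still holding the old reference is rewritten.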
+// Expects ip and lr to be available.
+.macro UPDATE_REGISTERS_FOR_STRING_INIT old_value, new_value
+   mov ip, #0
+1:
+   GET_VREG_OBJECT lr, ip
+   cmp lr, \old_value
+   bne 2f
+   SET_VREG_OBJECT \new_value, ip
+2:
+   add ip, ip, #1
+   add lr, rREFS, ip, lsl #2
+   cmp lr, rFP
+   bne 1b
+.endm
+
+// Puts the next floating point argument into the expected register,
+// fetching values based on a non-range invoke.
+// Uses ip and lr.
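+// The cmp immediates below are ASCII codes: 'D' = 68, 'F' = 70, 'J' = 74.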
+.macro LOOP_OVER_SHORTY_LOADING_FPS dreg, sreg, inst, shorty, arg_index, finished, if_double
+1: // LOOP
+    ldrb ip, [\shorty], #1          // Load next character in shorty, and increment.
+    cmp ip, #0
+    beq \finished                   // if (ip == '\0') goto finished
+    cmp ip, #68                    // if (ip == 'D') goto FOUND_DOUBLE
+    beq 2f
+    cmp ip, #70                    // if (ip == 'F') goto FOUND_FLOAT
+    beq 3f
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    // Skip the extra argument slot taken by a long ('J' uses two vregs).
+    cmp ip, #74                   // if (ip != 'J') goto LOOP
+    bne 1b
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    b 1b                        // goto LOOP
+2:  // FOUND_DOUBLE
+    and ip, \inst, #0xf
+    GET_VREG ip, ip
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    cmp \arg_index, #4
+    beq 5f
+    and lr, \inst, #0xf
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    b 6f
+5:
+    FETCH_B lr, 0, 1
+    and lr, lr, #0xf
+6:
+    GET_VREG lr, lr
+    vmov \dreg, ip, lr
+    b \if_double
+3:  // FOUND_FLOAT
+    cmp \arg_index, #4
+    beq 7f
+    and ip, \inst, #0xf
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    b 8f
+7:
+    FETCH_B ip, 0, 1
+    and ip, ip, #0xf
+8:
+    GET_VREG_FLOAT \sreg, ip
+.endm
+
+// Puts the next int/long/object argument in the expected register,
+// fetching values based on a non-range invoke.
+// Uses ip.
+.macro LOOP_OVER_SHORTY_LOADING_GPRS gpr_reg, inst, shorty, arg_index, finished, if_long, is_r3
+1: // LOOP
+    ldrb ip, [\shorty], #1         // Load next character in shorty, and increment.
+    cmp ip, #0
+    beq \finished                   // if (ip == '\0') goto finished
+    cmp ip, #74                    // if (ip == 'J') goto FOUND_LONG
+    beq 2f
+    cmp ip, #70                    // if (ip == 'F') goto SKIP_FLOAT
+    beq 3f
+    cmp ip, #68                    // if (ip == 'D') goto SKIP_DOUBLE
+    beq 4f
+    cmp \arg_index, #4
+    beq 7f
+    and ip, \inst, #0xf
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    b 8f
+7:
+    FETCH_B ip, 0, 1
+    and ip, ip, #0xf
+8:
+    GET_VREG \gpr_reg, ip
+    b 5f
+2:  // FOUND_LONG
+    .if \is_r3
+    // Put back shorty and exit
+    sub \shorty, \shorty, #1
+    b 5f
+    .endif
+    and ip, \inst, #0xf
+    GET_VREG ip, ip
+    // The only possible register pair for a non-range long is r2-r3.
+    mov r2, ip
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    cmp \arg_index, #4
+    beq 9f
+    and ip, \inst, #0xf
+    lsr \inst, \inst, #4
+    b 10f
+9:
+    FETCH_B ip, 0, 1
+    and ip, ip, #0xf
+10:
+    GET_VREG ip, ip
+    // The only possible register pair for a non-range long is r2-r3.
+    mov r3, ip
+    add \arg_index, \arg_index, #1
+    b \if_long
+3:  // SKIP_FLOAT
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    b 1b
+4:  // SKIP_DOUBLE
+    lsr \inst, \inst, #8
+    add \arg_index, \arg_index, #2
+    b 1b
+5:
+.endm
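+
+// All LOOP_OVER_SHORTY_* macros walk the shorty the same way; in rough C
+// terms (a sketch, not actual runtime code):
+//   for (char c = *shorty++; c != '\0'; c = *shorty++) {
+//     switch (c) {
+//       case 'D': /* consume two vreg nibbles, fill the next D register */
+//       case 'F': /* consume one vreg nibble, fill the next S register */
+//       case 'J': /* consume two vreg nibbles, fill the next GPR pair */
+//       default:  /* int/object: consume one nibble, fill the next GPR */
+//     }
+//   }
+// The FPR/GPR variants fill one target register and exit, so callers chain
+// one invocation per candidate register; the *_INTs variants below loop
+// until the shorty is exhausted, spilling remaining arguments to the stack.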
+
+// Stores the remaining int/long/object arguments in their expected stack
+// slots, fetching values based on a non-range invoke.
+// Uses ip as temporary.
+.macro LOOP_OVER_SHORTY_LOADING_INTs shorty, inst, arg_index, finished, is_string_init
+1: // LOOP
+    ldrb ip, [\shorty], #1         // Load next character in shorty, and increment.
+    cmp ip, #0
+    beq \finished                  // if (ip == '\0') goto finished
+    cmp ip, #74                    // if (ip == 'J') goto FOUND_LONG
+    beq 2f
+    cmp ip, #70                    // if (ip == 'F') goto SKIP_FLOAT
+    beq 3f
+    cmp ip, #68                    // if (ip == 'D') goto SKIP_DOUBLE
+    beq 4f
+    .if \is_string_init
+    cmp \arg_index, #4
+    .else
+    cmp \arg_index, #(4+1)         // +1 for ArtMethod
+    .endif
+    beq 7f
+    and ip, \inst, #0xf
+    lsr \inst, \inst, #4
+    b 8f
+7:
+    FETCH_B ip, 0, 1
+    and ip, ip, #0xf
+8:
+    GET_VREG ip, ip
+    str ip, [sp, \arg_index, lsl #2]
+    add \arg_index, \arg_index, #1
+    b 1b
+2:  // FOUND_LONG
+    and ip, \inst, #0xf
+    GET_VREG ip, ip
+    str ip, [sp, \arg_index, lsl #2]
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    .if \is_string_init
+    cmp \arg_index, #4
+    .else
+    cmp \arg_index, #(4+1)         // +1 for ArtMethod
+    .endif
+    beq 9f
+    and ip, \inst, #0xf
+    lsr \inst, \inst, #4
+    b 10f
+9:
+    FETCH_B ip, 0, 1
+    and ip, ip, #0xf
+10:
+    GET_VREG ip, ip
+    str ip, [sp, \arg_index, lsl #2]
+    add \arg_index, \arg_index, #1
+    b 1b
+3:  // SKIP_FLOAT
+    lsr \inst, \inst, #4
+    add \arg_index, \arg_index, #1
+    b 1b
+4:  // SKIP_DOUBLE
+    lsr \inst, \inst, #8
+    add \arg_index, \arg_index, #2
+    b 1b
+.endm
+
+.macro COMMON_INVOKE_NON_RANGE is_static=0, is_interface=0, suffix="", is_string_init=0, is_polymorphic=0, is_custom=0
+   .if \is_polymorphic
+   // We always go to compiled code for polymorphic calls.
+   .elseif \is_custom
+   // We always go to compiled code for custom calls.
+   .else
+     DO_ENTRY_POINT_CHECK .Lcall_compiled_code_\suffix, \suffix
+     GET_CODE_ITEM
+     .if \is_string_init
+     bl nterp_to_nterp_string_init_non_range
+     .elseif \is_static
+     bl nterp_to_nterp_static_non_range
+     .else
+     bl nterp_to_nterp_instance_non_range
+     .endif
+     b .Ldone_return_\suffix
+.Lfetch_nterp_\suffix:
+    .word   (.Lfetch_location_\suffix+8) - ExecuteNterpImpl
+   .endif
+
+.Lcall_compiled_code_\suffix:
+   GET_SHORTY rINST, \is_interface, \is_polymorphic, \is_custom
+   // From this point:
+   // - rINST contains shorty (in callee-save to switch over return value after call).
+   // - r0 contains method
+   // - r1 contains 'this' pointer for instance method.
+   // We need three registers: the shorty cursor (r3), the argument nibbles
+   // (r2), and the argument index (r4).
+   add r3, rINST, #1  // shorty + 1; i.e. skip the return type character
+   FETCH r2, 2 // arguments
+   .if \is_string_init
+   lsr r2, r2, #4
+   mov r4, #1       // ignore first argument
+   .elseif \is_static
+   mov r4, #0      // arg_index
+   .else
+   lsr r2, r2, #4
+   mov r4, #1       // ignore first argument
+   .endif
+   LOOP_OVER_SHORTY_LOADING_FPS d0, s0, r2, r3, r4, .Lxmm_setup_finished_\suffix, .Ld1_s2_\suffix
+.Ld1_s1_\suffix:
+   LOOP_OVER_SHORTY_LOADING_FPS d1, s1, r2, r3, r4, .Lxmm_setup_finished_\suffix, .Ld2_s1_\suffix
+.Ld1_s2_\suffix:
+   LOOP_OVER_SHORTY_LOADING_FPS d1, s2, r2, r3, r4, .Lxmm_setup_finished_\suffix, .Ls4_\suffix
+.Ld2_s3_\suffix:
+   LOOP_OVER_SHORTY_LOADING_FPS d2, s3, r2, r3, r4, .Lxmm_setup_finished_\suffix, .Lxmm_setup_finished_\suffix
+   b .Ls4_\suffix
+.Ld2_s1_\suffix:
+   LOOP_OVER_SHORTY_LOADING_FPS d2, s1, r2, r3, r4, .Lxmm_setup_finished_\suffix, .Lxmm_setup_finished_\suffix
+.Ls4_\suffix:
+   // If we arrive here, we can only have a float.
+   LOOP_OVER_SHORTY_LOADING_FPS d2, s4, r2, r3, r4, .Lxmm_setup_finished_\suffix, .Lxmm_setup_finished_\suffix
+.Lxmm_setup_finished_\suffix:
+   add r4, rINST, #1  // shorty + 1; i.e. skip the return type character
+   FETCH r8, 2 // arguments
+   .if \is_string_init
+   lsr r8, r8, #4
+   mov lr, #1       // ignore first argument
+   LOOP_OVER_SHORTY_LOADING_GPRS r1, r8, r4, lr, .Lgpr_setup_finished_\suffix, .Lif_long_\suffix, is_r3=0
+   .elseif \is_static
+   mov lr, #0      // arg_index
+   LOOP_OVER_SHORTY_LOADING_GPRS r1, r8, r4, lr, .Lgpr_setup_finished_\suffix, .Lif_long_\suffix, is_r3=0
+   .else
+   lsr r8, r8, #4
+   mov lr, #1       // ignore first argument
+   .endif
+   LOOP_OVER_SHORTY_LOADING_GPRS r2, r8, r4, lr, .Lgpr_setup_finished_\suffix, .Lif_long_\suffix, is_r3=0
+   LOOP_OVER_SHORTY_LOADING_GPRS r3, r8, r4, lr, .Lgpr_setup_finished_\suffix, .Lif_long_\suffix, is_r3=1
+.Lif_long_\suffix:
+   // Store the remaining arguments in the outs array, which lives just above
+   // the ArtMethod in the stack. Skip one word for the ArtMethod slot, except
+   // for string-init calls, where the index was already adjusted above.
+   .if !\is_string_init
+   add lr, lr, #1
+   .endif
+   LOOP_OVER_SHORTY_LOADING_INTs r4, r8, lr, .Lgpr_setup_finished_\suffix, \is_string_init
+.Lgpr_setup_finished_\suffix:
+   REFRESH_MARKING_REGISTER // r8 was used when setting parameters, restore it.
+   .if \is_polymorphic
+   bl art_quick_invoke_polymorphic
+   .elseif \is_custom
+   bl art_quick_invoke_custom
+   .else
+      .if \is_interface
+      // Setup hidden argument. As we don't have access to the interface method,
+      // just pass the method from the IMT. If the method is the conflict trampoline,
+      // this will make the stub go to runtime, otherwise the hidden argument is unused.
+      mov ip, r0
+      .endif
+      ldr lr, [r0, #ART_METHOD_QUICK_CODE_OFFSET_32]
+      blx lr
+   .endif
+   ldrb ip, [rINST]
+   cmp ip, #68       // Test if result type char == 'D'.
+   beq .Lreturn_double_\suffix
+   cmp ip, #70       // Test if result type char == 'F'.
+   bne .Ldone_return_\suffix
+.Lreturn_float_\suffix:
+   vmov r0, s0
+   b .Ldone_return_\suffix
+.Lreturn_double_\suffix:
+   vmov r0, r1, d0
+.Ldone_return_\suffix:
+   /* resume execution of caller */
+   .if \is_string_init
+   FETCH ip, 2 // arguments
+   and ip, ip, #0xf
+   GET_VREG r1, ip
+   UPDATE_REGISTERS_FOR_STRING_INIT r1, r0
+   .endif
+
+   .if \is_polymorphic
+   FETCH_ADVANCE_INST 4
+   .else
+   FETCH_ADVANCE_INST 3
+   .endif
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+.endm
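+
+// At a high level, COMMON_INVOKE_NON_RANGE does the following, in rough C++
+// terms (a sketch, not actual runtime code; helper names are illustrative):
+//   if (EntryPointIsNterp(method)) {
+//     nterp_to_nterp(method, code_item);  // stay within the interpreter
+//   } else {
+//     const char* shorty = GetShorty(method);
+//     SetupFpArgumentRegisters(shorty);   // s0-s4 / d0-d2
+//     SetupCoreArgumentsAndOuts(shorty);  // r1-r3, then stack outs
+//     InvokeQuickCode(method);
+//     StoreResultAccordingTo(shorty[0]);  // 'F'/'D' come back in s0/d0
+//   }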
+
+// Puts the next int/long/object argument in the expected register,
+// fetching values based on a range invoke.
+// Uses ip as temporary.
+.macro LOOP_RANGE_OVER_SHORTY_LOADING_GPRS reg32, shorty, arg_index, stack_index, finished, if_long, is_r3
+1: // LOOP
+    ldrb ip, [\shorty], #1         // Load next character in shorty, and increment.
+    cmp ip, #0
+    beq \finished                  // if (ip == '\0') goto finished
+    cmp ip, #74                    // if (ip == 'J') goto FOUND_LONG
+    beq 2f
+    cmp ip, #70                    // if (ip == 'F') goto SKIP_FLOAT
+    beq 3f
+    cmp ip, #68                    // if (ip == 'D') goto SKIP_DOUBLE
+    beq 4f
+    GET_VREG \reg32, \arg_index
+    add \arg_index, \arg_index, #1
+    add \stack_index, \stack_index, #1
+    b 5f
+2:  // FOUND_LONG
+    .if \is_r3
+    // Put back shorty and jump to \if_long
+    sub \shorty, \shorty, #1
+    .else
+    GET_VREG r2, \arg_index
+    add \arg_index, \arg_index, #1
+    add \stack_index, \stack_index, #1
+    GET_VREG r3, \arg_index
+    add \arg_index, \arg_index, #1
+    add \stack_index, \stack_index, #1
+    .endif
+    b \if_long
+3:  // SKIP_FLOAT
+    add \arg_index, \arg_index, #1
+    add \stack_index, \stack_index, #1
+    b 1b
+4:  // SKIP_DOUBLE
+    add \arg_index, \arg_index, #2
+    add \stack_index, \stack_index, #2
+    b 1b
+5:
+.endm
+
+// Stores the remaining int/long/object arguments in their expected stack
+// slots, fetching values based on a range invoke.
+// Uses ip as temporary.
+.macro LOOP_RANGE_OVER_INTs shorty, arg_index, stack_index, finished
+1: // LOOP
+    ldrb ip, [\shorty], #1         // Load next character in shorty, and increment.
+    cmp ip, #0
+    beq \finished                     // if (ip == '\0') goto finished
+    cmp ip, #74                    // if (ip == 'J') goto FOUND_LONG
+    beq 2f
+    cmp ip, #70                    // if (ip == 'F') goto SKIP_FLOAT
+    beq 3f
+    cmp ip, #68                    // if (ip == 'D') goto SKIP_DOUBLE
+    beq 4f
+    GET_VREG ip, \arg_index
+    str ip, [sp, \stack_index, lsl #2]
+    add \arg_index, \arg_index, #1
+    add \stack_index, \stack_index, #1
+    b 1b
+2:  // FOUND_LONG
+    GET_VREG ip, \arg_index
+    str ip, [sp, \stack_index, lsl #2]
+    add \arg_index, \arg_index, #1
+    add \stack_index, \stack_index, #1
+    GET_VREG ip, \arg_index
+    str ip, [sp, \stack_index, lsl #2]
+    add \arg_index, \arg_index, #1
+    add \stack_index, \stack_index, #1
+    b 1b
+3:  // SKIP_FLOAT
+    add \arg_index, \arg_index, #1
+    add \stack_index, \stack_index, #1
+    b 1b
+4:  // SKIP_DOUBLE
+    add \arg_index, \arg_index, #2
+    add \stack_index, \stack_index, #2
+    b 1b
+.endm
+
+.macro COMMON_INVOKE_RANGE is_static=0, is_interface=0, suffix="", is_string_init=0, is_polymorphic=0, is_custom=0
+   .if \is_polymorphic
+   // We always go to compiled code for polymorphic calls.
+   .elseif \is_custom
+   // We always go to compiled code for custom calls.
+   .else
+     DO_ENTRY_POINT_CHECK .Lcall_compiled_code_range_\suffix, range_\suffix
+     GET_CODE_ITEM
+     .if \is_string_init
+     bl nterp_to_nterp_string_init_range
+     .elseif \is_static
+     bl nterp_to_nterp_static_range
+     .else
+     bl nterp_to_nterp_instance_range
+     .endif
+     b .Ldone_return_range_\suffix
+.Lfetch_nterp_range_\suffix:
+    .word   (.Lfetch_location_range_\suffix+8) - ExecuteNterpImpl
+   .endif
+
+.Lcall_compiled_code_range_\suffix:
+   GET_SHORTY rINST, \is_interface, \is_polymorphic, \is_custom
+   // From this point:
+   // - rINST contains shorty (in callee-save to switch over return value after call).
+   // - r0 contains method
+   // - r1 contains 'this' pointer for instance method.
+   //
+   // Save r0 and r1 before calling NterpSetupArm32Fprs.
+   push {r0, r1}
+   add r0, rINST, #1  // shorty + 1; i.e. skip the return type character
+   FETCH r1, 2 // arguments
+   .if \is_string_init
+   add r1, r1, #1  // arg start index
+   mov r2, #1       // index in stack
+   .elseif \is_static
+   mov r2, #0       // index in stack
+   .else
+   add r1, r1, #1  // arg start index
+   mov r2, #1       // index in stack
+   .endif
+   vpush {s0-s15}
+   mov r3, sp
+   // Compute the address of the stacked arguments: skip 16 words of FPRs,
+   // 2 words for the saved r0/r1, and 1 word for the ArtMethod.
+   add lr, sp, #((16 + 2 + 1) * 4)
+   push {rFP, lr}
+   bl NterpSetupArm32Fprs
+   add sp, sp, #8
+   vpop {s0-s15}
+   pop {r0, r1}
+.Lxmm_setup_finished_range_\suffix:
+   add r8, rINST, #1  // shorty + 1; i.e. skip the return type character
+   FETCH lr, 2 // arguments
+   .if \is_string_init
+   add lr, lr, #1  // arg start index
+   mov r4, #1       // index in stack
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS r1, r8, lr, r4, .Lgpr_setup_finished_range_\suffix, .Lif_long_range_\suffix, is_r3=0
+   .elseif \is_static
+   mov r4, #0      // index in stack
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS r1, r8, lr, r4, .Lgpr_setup_finished_range_\suffix, .Lif_long_range_\suffix, is_r3=0
+   .else
+   add lr, lr, #1  // arg start index
+   mov r4, #1       // index in stack
+   .endif
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS r2, r8, lr, r4, .Lgpr_setup_finished_range_\suffix, .Lif_long_range_\suffix, is_r3=0
+   LOOP_RANGE_OVER_SHORTY_LOADING_GPRS r3, r8, lr, r4, .Lgpr_setup_finished_range_\suffix, .Lif_long_range_\suffix, is_r3=1
+.Lif_long_range_\suffix:
+   // Add 1 word for the ArtMethod stored before the outs.
+   add r4, r4, #1
+   LOOP_RANGE_OVER_INTs r8, lr, r4, .Lgpr_setup_finished_range_\suffix
+.Lgpr_setup_finished_range_\suffix:
+   REFRESH_MARKING_REGISTER // r8 was used when setting parameters, restore it.
+   .if \is_polymorphic
+   bl art_quick_invoke_polymorphic
+   .elseif \is_custom
+   bl art_quick_invoke_custom
+   .else
+      .if \is_interface
+      // Setup hidden argument. As we don't have access to the interface method,
+      // just pass the method from the IMT. If the method is the conflict trampoline,
+      // this will make the stub go to runtime, otherwise the hidden argument is unused.
+      mov ip, r0
+      .endif
+      ldr lr, [r0, #ART_METHOD_QUICK_CODE_OFFSET_32]
+      blx lr
+   .endif
+   ldrb ip, [rINST]
+   cmp ip, #68       // Test if result type char == 'D'.
+   beq .Lreturn_double_range_\suffix
+   cmp ip, #70       // Test if result type char == 'F'.
+   bne .Ldone_return_range_\suffix
+.Lreturn_float_range_\suffix:
+   vmov r0, s0
+   b .Ldone_return_range_\suffix
+.Lreturn_double_range_\suffix:
+   vmov r0, r1, d0
+.Ldone_return_range_\suffix:
+   /* resume execution of caller */
+   .if \is_string_init
+   FETCH ip, 2 // arguments
+   GET_VREG r1, ip
+   UPDATE_REGISTERS_FOR_STRING_INIT r1, r0
+   .endif
+
+   .if \is_polymorphic
+    FETCH_ADVANCE_INST 4
+   .else
+   FETCH_ADVANCE_INST 3
+   .endif
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+.endm
+
+// Fetch some information from the thread cache.
+// Uses ip and lr as temporaries.
+.macro FETCH_FROM_THREAD_CACHE dest_reg, slow_path
+   add      ip, rSELF, #THREAD_INTERPRETER_CACHE_OFFSET       // cache address
+   ubfx     lr, rPC, #2, #THREAD_INTERPRETER_CACHE_SIZE_LOG2  // entry index
+   add      ip, ip, lr, lsl #3             // entry address within the cache
+   ldr      \dest_reg, [ip, #4]            // entry value
+   ldr      ip, [ip]                       // entry key (pc)
+   cmp      ip, rPC
+   bne \slow_path
+.endm
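+
+// In C++ terms, the fast path above is roughly (a sketch; names are
+// illustrative, not the actual runtime API):
+//   auto& entry = self->GetInterpreterCache()->entries()[
+//       (rPC >> 2) & (kInterpreterCacheSize - 1)];
+//   if (entry.first == rPC) { dest = entry.second; } else { goto slow_path; }
+// Each entry is two words: the dex PC as key, and a value whose meaning
+// depends on the instruction (e.g. a field offset, a class, or a method).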
+
+// Helper for static field get.
+.macro OP_SGET load="ldr", wide="0"
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 4f
+1:
+   ldr r1, [r0, #ART_FIELD_OFFSET_OFFSET]
+   ldr r0, [r0, #ART_FIELD_DECLARING_CLASS_OFFSET]
+   cmp rMR, #0
+   bne 3f
+2:
+   lsr r2, rINST, #8              // r2 <- A
+   .if \wide
+   add r0, r0, r1
+   ldrd r0, r1, [r0]
+   CLEAR_SHADOW_PAIR r2, ip, lr
+   VREG_INDEX_TO_ADDR r2, r2
+   SET_VREG_WIDE_BY_ADDR r0, r1, r2      // fp[A] <- value
+   .else
+   \load r0, [r0, r1]
+   SET_VREG r0, r2               // fp[A] <- value
+   .endif
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+3:
+   bl art_quick_read_barrier_mark_reg00
+   b 2b
+4:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   EXPORT_PC
+   bl nterp_get_static_field
+   tst r0, #1
+   beq 1b
+   CLEAR_STATIC_VOLATILE_MARKER r0
+   ldr r1, [r0, #ART_FIELD_OFFSET_OFFSET]
+   ldr r0, [r0, #ART_FIELD_DECLARING_CLASS_OFFSET]
+   cmp rMR, #0
+   bne 7f
+5:
+   lsr r2, rINST, #8              // r2 <- A
+   .if \wide
+   dmb ish
+   add ip, r0, r1
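+   // Atomic 64-bit load: the ldrexd/strexd pair stores back the value just
+   // read and retries until the store-exclusive succeeds, guaranteeing the
+   // two words were read atomically. The volatile paths of OP_SPUT, OP_IPUT,
+   // and OP_IGET below use the same idiom.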
+6:
+   ldrexd   r0, r1, [ip]
+   strexd   r3, r0, r1, [ip]
+   cmp      r3, #0
+   bne      6b
+   dmb ish
+   CLEAR_SHADOW_PAIR r2, ip, lr
+   VREG_INDEX_TO_ADDR r2, r2
+   SET_VREG_WIDE_BY_ADDR r0, r1, r2      // fp[A] <- value
+   .else
+   dmb ish
+   \load r3, [r0, r1]
+   dmb ish
+   SET_VREG r3, r2               // fp[A] <- value
+   .endif
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+7:
+   bl art_quick_read_barrier_mark_reg00
+   b 5b
+.endm
+
+// Helper for static field put.
+.macro OP_SPUT store="str", wide="0"
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 4f
+1:
+   ldr r1, [r0, #ART_FIELD_OFFSET_OFFSET]
+   ldr r0, [r0, #ART_FIELD_DECLARING_CLASS_OFFSET]
+   cmp rMR, #0
+   bne 3f
+2:
+   lsr r2, rINST, #8              // r2 <- A
+   .if \wide
+   VREG_INDEX_TO_ADDR r2, r2
+   GET_VREG_WIDE_BY_ADDR r2, r3, r2      // r2/r3 <- fp[A] (value to store)
+   add r0, r0, r1
+   strd r2, r3, [r0]
+   .else
+   GET_VREG r2, r2                // w2 <- v[A]
+   \store    r2, [r0, r1]
+   .endif
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+3:
+   bl art_quick_read_barrier_mark_reg00
+   b 2b
+4:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   EXPORT_PC
+   bl nterp_get_static_field
+   tst r0, #1
+   beq 1b
+   CLEAR_STATIC_VOLATILE_MARKER r0
+   ldr r1, [r0, #ART_FIELD_OFFSET_OFFSET]
+   ldr r0, [r0, #ART_FIELD_DECLARING_CLASS_OFFSET]
+   cmp rMR, #0
+   bne 6f
+5:
+   lsr r2, rINST, #8              // r2 <- A
+   .if \wide
+   VREG_INDEX_TO_ADDR r2, r2
+   GET_VREG_WIDE_BY_ADDR r2, r3, r2
+   add ip, r0, r1
+   dmb ish
+7:
+   ldrexd r0, r1, [ip]
+   strexd r0, r2, r3, [ip]
+   cmp r0, #0
+   bne 7b
+   dmb ish
+   .else
+   GET_VREG r2, r2                // r2 <- v[A]
+   dmb ish
+   \store r2, [r0, r1]
+   dmb ish
+   .endif
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+6:
+   bl art_quick_read_barrier_mark_reg00
+   b 5b
+.endm
+
+
+// Helper for instance field put.
+.macro OP_IPUT store="str", wide="0"
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 2f
+1:
+   ubfx    r1, rINST, #8, #4           // r1<- A
+   lsr     r4, rINST, #12              // r2<- B
+   GET_VREG r4, r4                     // vB (object we're operating on)
+   cmp r4, #0
+   beq common_errNullObject
+   .if \wide
+   VREG_INDEX_TO_ADDR r1, r1
+   GET_VREG_WIDE_BY_ADDR r2, r3, r1      // r2/r3 <- fp[A] (value to store)
+   add r4, r4, r0
+   strd r2, r3, [r4]
+   .else
+   GET_VREG r1, r1                     // r1 <- v[A]
+   \store r1, [r4, r0]
+   .endif
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+2:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   EXPORT_PC
+   bl nterp_get_instance_field_offset
+   cmp r0, #0
+   bge 1b
+   CLEAR_INSTANCE_VOLATILE_MARKER r0
+   ubfx    r1, rINST, #8, #4           // r1<- A
+   lsr     r4, rINST, #12              // r2<- B
+   GET_VREG r4, r4                     // vB (object we're operating on)
+   cmp r4, #0
+   beq common_errNullObject
+   .if \wide
+   VREG_INDEX_TO_ADDR r1, r1
+   GET_VREG_WIDE_BY_ADDR r2, r3, r1
+   add ip, r4, r0
+   dmb ish
+3:
+   ldrexd r0, r1, [ip]
+   strexd r0, r2, r3, [ip]
+   cmp r0, #0
+   bne 3b
+   dmb ish
+   .else
+   GET_VREG r1, r1                     // r1 <- v[A]
+   dmb ish
+   \store r1, [r4, r0]
+   dmb ish
+   .endif
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+.endm
+
+// Helper for instance field get.
+.macro OP_IGET load="ldr", wide="0"
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 2f
+1:
+   lsr     r2, rINST, #12              // r2<- B
+   GET_VREG r3, r2                     // r3<- object we're operating on
+   ubfx    r2, rINST, #8, #4           // r2<- A
+   cmp     r3, #0
+   beq common_errNullObject    // object was null
+   .if \wide
+   add r3, r3, r0
+   ldrd r0, r1, [r3]
+   CLEAR_SHADOW_PAIR r2, ip, lr
+   VREG_INDEX_TO_ADDR r2, r2
+   SET_VREG_WIDE_BY_ADDR r0, r1, r2      // fp[A] <- value
+   .else
+   \load r0, [r3, r0]
+   SET_VREG r0, r2                     // fp[A] <- value
+   .endif
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+2:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   EXPORT_PC
+   bl nterp_get_instance_field_offset
+   cmp r0, #0
+   bge 1b
+   CLEAR_INSTANCE_VOLATILE_MARKER r0
+   lsr     r2, rINST, #12              // r2<- B
+   GET_VREG r3, r2                     // r3<- object we're operating on
+   ubfx    r2, rINST, #8, #4           // r2<- A
+   cmp     r3, #0
+   beq common_errNullObject    // object was null
+   .if \wide
+   dmb ish
+   add ip, r3, r0
+3:
+   ldrexd   r0, r1, [ip]
+   strexd   r3, r0, r1, [ip]
+   cmp      r3, #0
+   bne      3b
+   dmb ish
+   CLEAR_SHADOW_PAIR r2, ip, lr
+   VREG_INDEX_TO_ADDR r2, r2
+   SET_VREG_WIDE_BY_ADDR r0, r1, r2      // fp[A] <- value
+   dmb ish
+   .else
+   dmb ish
+   \load r0, [r3, r0]
+   dmb ish
+   SET_VREG r0, r2                     // fp[A] <- value
+   .endif
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+.endm
+
+// Puts the next int/long/object parameter passed in a physical register
+// into the expected dex register array entry and, for objects, into the
+// expected reference array entry.
+// Uses ip as temporary.
+.macro LOOP_OVER_SHORTY_STORING_GPRS gpr_32, shorty, arg_offset, regs, refs, finished, if_long, is_r3
+1: // LOOP
+    ldrb ip, [\shorty], #1       // Load next character in shorty, and increment.
+    cmp ip, #0
+    beq \finished            // if (ip == '\0') goto finished
+    cmp ip, #74                  // if (ip == 'J') goto FOUND_LONG
+    beq 2f
+    cmp ip, #70                  // if (ip == 'F') goto SKIP_FLOAT
+    beq 3f
+    cmp ip, #68                  // if (ip == 'D') goto SKIP_DOUBLE
+    beq 4f
+    str \gpr_32, [\regs, \arg_offset]
+    cmp ip, #76                  // if (ip != 'L') goto NOT_REFERENCE
+    bne 6f
+    str \gpr_32, [\refs, \arg_offset]
+6:  // NOT_REFERENCE
+    add \arg_offset, \arg_offset, #4
+    b 5f
+2:  // FOUND_LONG
+    .if \is_r3
+    // Put back shorty and jump to \if_long
+    sub \shorty, \shorty, #1
+    .else
+    // A long can only be in r2, r3
+    str r2, [\regs, \arg_offset]
+    add \arg_offset, \arg_offset, #4
+    str r3, [\regs, \arg_offset]
+    add \arg_offset, \arg_offset, #4
+    .endif
+    b \if_long
+3:  // SKIP_FLOAT
+    add \arg_offset, \arg_offset, #4
+    b 1b
+4:  // SKIP_DOUBLE
+    add \arg_offset, \arg_offset, #8
+    b 1b
+5:
+.endm
+
+// Stores the remaining int/long/object parameters passed on the stack
+// into the expected dex register array entries and, for objects, into the
+// expected reference array entries.
+.macro LOOP_OVER_INTs shorty, arg_offset, regs, refs, stack_ptr, tmp1, tmp2, finished
+1: // LOOP
+    ldrb \tmp1, [\shorty], #1       // Load next character in shorty, and increment.
+    cmp \tmp1, #0
+    beq \finished                   // if (\tmp1 == '\0') goto finished
+    cmp \tmp1, #74                  // if (\tmp1 == 'J') goto FOUND_LONG
+    beq 2f
+    cmp \tmp1, #70                  // if (\tmp1 == 'F') goto SKIP_FLOAT
+    beq 3f
+    cmp \tmp1, #68                  // if (\tmp1 == 'D') goto SKIP_DOUBLE
+    beq 4f
+    add \tmp2, \stack_ptr, \arg_offset
+    ldr \tmp2, [\tmp2,  #OFFSET_TO_FIRST_ARGUMENT_IN_STACK]
+    str \tmp2, [\regs, \arg_offset]
+    cmp \tmp1, #76                  // if (\tmp1 != 'L') advance and loop
+    bne 3f                          // (reuses SKIP_FLOAT's advance-and-loop)
+    str \tmp2, [\refs, \arg_offset]
+    add \arg_offset, \arg_offset, #4
+    b 1b
+2:  // FOUND_LONG
+    add \tmp1, \stack_ptr, \arg_offset
+    ldr \tmp1, [\tmp1,  #OFFSET_TO_FIRST_ARGUMENT_IN_STACK]
+    str \tmp1, [\regs, \arg_offset]
+    add \arg_offset, \arg_offset, #4
+    add \tmp1, \stack_ptr, \arg_offset
+    ldr \tmp1, [\tmp1,  #OFFSET_TO_FIRST_ARGUMENT_IN_STACK]
+    str \tmp1, [\regs, \arg_offset]
+    add \arg_offset, \arg_offset, #4
+    b 1b
+3:  // SKIP_FLOAT
+    add \arg_offset, \arg_offset, #4
+    b 1b
+4:  // SKIP_DOUBLE
+    add \arg_offset, \arg_offset, #8
+    b 1b
+.endm
+
+%def entry():
+/*
+ * ArtMethod entry point.
+ *
+ * On entry:
+ *  r0   ArtMethod* callee
+ *  rest  method parameters
+ */
+
+OAT_ENTRY ExecuteNterpImpl, EndExecuteNterpImpl
+    .cfi_startproc
+    sub ip, sp, #STACK_OVERFLOW_RESERVED_BYTES
+    ldr ip, [ip]
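+    // The load above probes STACK_OVERFLOW_RESERVED_BYTES below SP: if the
+    // stack is about to overflow, it touches the protected region and the
+    // runtime's implicit stack-overflow fault handler throws the error.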
+    /* Spill callee save regs */
+    SPILL_ALL_CALLEE_SAVES
+
+    // TODO: Get the shorty in a better way and remove the spilling below.
+    SPILL_ALL_ARGUMENTS
+
+    bl NterpGetShorty
+    // Save shorty in callee-save rIBASE.
+    mov rIBASE, r0
+
+    RESTORE_ALL_ARGUMENTS
+
+    ldr rPC, [r0, #ART_METHOD_DATA_OFFSET_32]
+
+    // Setup the stack for executing the method.
+    SETUP_STACK_FRAME rPC, rREFS, rFP, CFI_REFS, load_ins=1
+
+    // Setup the parameters
+    cmp r4, #0
+    beq .Lxmm_setup_finished
+
+    sub r4, rINST, r4
+    lsl r4, r4, #2 // r4 is now the offset for inputs into the registers array.
+
+    mov lr, ip // lr contains the old stack pointer
+
+    ldr ip, [r0, #ART_METHOD_ACCESS_FLAGS_OFFSET]
+    // r0 is now available.
+    // Setup shorty, pointer to inputs in FP and pointer to inputs in REFS
+    add r0, rIBASE, #1  // shorty + 1; i.e. skip the return type character
+    add r7, rFP, r4
+    add r8, rREFS, r4
+    tst ip, #ART_METHOD_IS_STATIC_FLAG
+    bne .Lhandle_static_method
+    str r1, [r7], #4
+    str r1, [r8], #4
+    add lr, lr, #4
+    mov r4, #0
+    b .Lcontinue_setup_gprs
+.Lhandle_static_method:
+    mov r4, #0
+    LOOP_OVER_SHORTY_STORING_GPRS r1, r0, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=0
+.Lcontinue_setup_gprs:
+    LOOP_OVER_SHORTY_STORING_GPRS r2, r0, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=0
+    LOOP_OVER_SHORTY_STORING_GPRS r3, r0, r4, r7, r8, .Lgpr_setup_finished, .Lif_long, is_r3=1
+.Lif_long:
+    LOOP_OVER_INTs r0, r4, r7, r8, lr, ip, r1, .Lgpr_setup_finished
+.Lgpr_setup_finished:
+    add r0, rIBASE, #1  // shorty + 1; i.e. skip the return type character
+    mov r1, r7
+    add r2, lr, #OFFSET_TO_FIRST_ARGUMENT_IN_STACK
+    vpush {s0-s15}
+    mov r3, sp
+    bl NterpStoreArm32Fprs
+    add sp, sp, #(16 * 4)
+.Lxmm_setup_finished:
+    CFI_DEFINE_DEX_PC_WITH_OFFSET(CFI_TMP, CFI_DEX, 0)
+    // r8 was used for setting up the frame, restore it now.
+    REFRESH_MARKING_REGISTER
+.Lexecute_instructions:
+    // Set rIBASE
+    adr rIBASE, artNterpAsmInstructionStart
+    /* start executing the instruction at rPC */
+    START_EXECUTING_INSTRUCTIONS
+    /* NOTE: no fallthrough */
+    // cfi info continues, and covers the whole nterp implementation.
+    SIZE ExecuteNterpImpl
+
+%def opcode_pre():
+
+%def helpers():
+
+%def footer():
+/*
+ * ===========================================================================
+ *  Common subroutines and data
+ * ===========================================================================
+ */
+
+    .text
+    .align  2
+
+// Note: mterp also uses the common_* names below for helpers, but that's OK
+// as each interpreter is assembled separately.
+common_errDivideByZero:
+    EXPORT_PC
+    bl art_quick_throw_div_zero
+
+// Expects the index in r1 and the length in r3.
+common_errArrayIndex:
+    EXPORT_PC
+    mov r0, r1
+    mov r1, r3
+    bl art_quick_throw_array_bounds
+
+common_errNullObject:
+    EXPORT_PC
+    bl art_quick_throw_null_pointer_exception
+
+NterpCommonInvokeStatic:
+    COMMON_INVOKE_NON_RANGE is_static=1, suffix="invokeStatic"
+
+NterpCommonInvokeStaticRange:
+    COMMON_INVOKE_RANGE is_static=1, suffix="invokeStatic"
+
+NterpCommonInvokeInstance:
+    COMMON_INVOKE_NON_RANGE suffix="invokeInstance"
+
+NterpCommonInvokeInstanceRange:
+    COMMON_INVOKE_RANGE suffix="invokeInstance"
+
+NterpCommonInvokeInterface:
+    COMMON_INVOKE_NON_RANGE is_interface=1, suffix="invokeInterface"
+
+NterpCommonInvokeInterfaceRange:
+    COMMON_INVOKE_RANGE is_interface=1, suffix="invokeInterface"
+
+NterpCommonInvokePolymorphic:
+    COMMON_INVOKE_NON_RANGE is_polymorphic=1, suffix="invokePolymorphic"
+
+NterpCommonInvokePolymorphicRange:
+    COMMON_INVOKE_RANGE is_polymorphic=1, suffix="invokePolymorphic"
+
+NterpCommonInvokeCustom:
+    COMMON_INVOKE_NON_RANGE is_static=1, is_custom=1, suffix="invokeCustom"
+
+NterpCommonInvokeCustomRange:
+    COMMON_INVOKE_RANGE is_static=1, is_custom=1, suffix="invokeCustom"
+
+NterpHandleStringInit:
+   COMMON_INVOKE_NON_RANGE is_string_init=1, suffix="stringInit"
+
+NterpHandleStringInitRange:
+   COMMON_INVOKE_RANGE is_string_init=1, suffix="stringInit"
+
+NterpNewInstance:
+   EXPORT_PC
+   // Fast-path which gets the class from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 2f
+   cmp rMR, #0
+   bne 3f
+4:
+   ldr lr, [rSELF, #THREAD_ALLOC_OBJECT_ENTRYPOINT_OFFSET]
+   blx lr
+1:
+   lsr r1, rINST, #8                    // r1 <- A
+   SET_VREG_OBJECT r0, r1               // fp[A] <- value
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+2:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl nterp_get_class_or_allocate_object
+   b 1b
+3:
+   bl art_quick_read_barrier_mark_reg00
+   b 4b
+
+NterpNewArray:
+   /* new-array vA, vB, class@CCCC */
+   EXPORT_PC
+   // Fast-path which gets the class from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 2f
+   cmp rMR, #0
+   bne 3f
+1:
+   lsr     r1, rINST, #12              // r1<- B
+   GET_VREG r1, r1                     // r1<- vB (array length)
+   ldr lr, [rSELF, #THREAD_ALLOC_ARRAY_ENTRYPOINT_OFFSET]
+   blx lr
+   ubfx    r1, rINST, #8, #4           // r1<- A
+   SET_VREG_OBJECT r0, r1
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+2:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl nterp_get_class_or_allocate_object
+   b 1b
+3:
+   bl art_quick_read_barrier_mark_reg00
+   b 1b
+
+NterpPutObjectInstanceField:
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 3f
+1:
+   ubfx    r1, rINST, #8, #4           // r1<- A
+   lsr     r2, rINST, #12              // r2<- B
+   GET_VREG r2, r2                     // vB (object we're operating on)
+   cmp r2, #0
+   beq common_errNullObject            // is object null?
+   GET_VREG r1, r1                     // r1 <- v[A]
+   str r1, [r2, r0]
+4:
+   cmp r1, #0
+   beq 2f
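+   // Write barrier: mark the GC card of the holding object. The card table
+   // base is biased so that its own low byte is the dirty-card value, which
+   // is why the base register doubles as the value being stored below.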
+   ldr r1, [rSELF, #THREAD_CARD_TABLE_OFFSET]
+   lsr r3, r2, #CARD_TABLE_CARD_SHIFT
+   strb r1, [r1, r3]
+2:
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+3:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   EXPORT_PC
+   bl nterp_get_instance_field_offset
+   cmp r0, #0
+   bge 1b
+   CLEAR_INSTANCE_VOLATILE_MARKER r0
+   ubfx    r1, rINST, #8, #4           // r1<- A
+   lsr     r2, rINST, #12              // r2<- B
+   GET_VREG r2, r2                     // vB (object we're operating on)
+   cmp r2, #0
+   beq common_errNullObject            // is object null?
+   GET_VREG r1, r1                     // r1 <- v[A]
+   dmb ish
+   str r1, [r2, r0]
+   dmb ish
+   b 4b
+
+NterpGetObjectInstanceField:
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 4f
+1:
+   ubfx    r1, rINST, #8, #4           // r1<- A
+   lsr     r2, rINST, #12              // r2<- B
+   GET_VREG r2, r2                     // vB (object we're operating on)
+   cmp r2, #0
+   beq common_errNullObject
+   ldr r0, [r2, r0]
+7:
+   cmp rMR, #0
+   bne 3f
+2:
+   SET_VREG_OBJECT r0, r1              // fp[A] <- value
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+3:
+   bl art_quick_read_barrier_mark_reg00
+   b 2b
+4:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   EXPORT_PC
+   bl nterp_get_instance_field_offset
+   cmp r0, #0
+   bge 1b
+   CLEAR_INSTANCE_VOLATILE_MARKER r0
+   ubfx    r1, rINST, #8, #4           // r1<- A
+   lsr     r2, rINST, #12              // r2<- B
+   GET_VREG r2, r2                     // vB (object we're operating on)
+   cmp r2, #0
+   beq common_errNullObject
+   dmb ish
+   ldr r0, [r2, r0]
+   dmb ish
+   b 7b
+
+NterpPutObjectStaticField:
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 5f
+1:
+   ldr r1, [r0, #ART_FIELD_OFFSET_OFFSET]
+   ldr r0, [r0, #ART_FIELD_DECLARING_CLASS_OFFSET]
+   cmp rMR, #0
+   bne 4f
+2:
+   lsr r2, rINST, #8                    // r2 <- A
+   GET_VREG r2, r2
+   str r2, [r0, r1]
+8:
+   cmp r2, #0
+   beq 3f
+   ldr r1, [rSELF, #THREAD_CARD_TABLE_OFFSET]
+   lsr r3, r0, #CARD_TABLE_CARD_SHIFT
+   strb r1, [r1, r3]
+3:
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+4:
+   bl art_quick_read_barrier_mark_reg00
+   b 2b
+5:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   EXPORT_PC
+   bl nterp_get_static_field
+   tst r0, #1
+   beq 1b
+   CLEAR_STATIC_VOLATILE_MARKER r0
+   ldr r1, [r0, #ART_FIELD_OFFSET_OFFSET]
+   ldr r0, [r0, #ART_FIELD_DECLARING_CLASS_OFFSET]
+   cmp rMR, #0
+   bne 7f
+6:
+   lsr r2, rINST, #8                    // r2 <- A
+   GET_VREG r2, r2
+   dmb ish
+   str r2, [r0, r1]
+   dmb ish
+   b 8b
+7:
+   bl art_quick_read_barrier_mark_reg00
+   b 6b
+
+NterpGetObjectStaticField:
+   // Fast-path which gets the field from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 4f
+1:
+   ldr r1, [r0, #ART_FIELD_OFFSET_OFFSET]
+   ldr r0, [r0, #ART_FIELD_DECLARING_CLASS_OFFSET]
+   cmp rMR, #0
+   bne 3f
+   ldr r0, [r0, r1]
+   // No need to check the marking register, we know it's not set here.
+2:
+   lsr r1, rINST, #8                    // r1 <- A
+   SET_VREG_OBJECT r0, r1               // fp[A] <- value
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+3:
+   bl art_quick_read_barrier_mark_reg00
+   ldr r0, [r0, r1]
+   // Here, we know the marking register is set.
+   bl art_quick_read_barrier_mark_reg00
+   b 2b
+4:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   EXPORT_PC
+   bl nterp_get_static_field
+   tst r0, #1
+   beq 1b
+   CLEAR_STATIC_VOLATILE_MARKER r0
+   ldr r1, [r0, #ART_FIELD_OFFSET_OFFSET]
+   ldr r0, [r0, #ART_FIELD_DECLARING_CLASS_OFFSET]
+   cmp rMR, #0
+   bne 7f
+5:
+   dmb ish
+   ldr r0, [r0, r1]
+   dmb ish
+   cmp rMR, #0
+   bne 8f
+   b 2b
+7:
+   bl art_quick_read_barrier_mark_reg00
+   b 5b
+8:
+   bl art_quick_read_barrier_mark_reg00
+   b 2b
+
+NterpGetBooleanStaticField:
+  OP_SGET load="ldrb", wide=0
+
+NterpGetByteStaticField:
+  OP_SGET load="ldrsb", wide=0
+
+NterpGetCharStaticField:
+  OP_SGET load="ldrh", wide=0
+
+NterpGetShortStaticField:
+  OP_SGET load="ldrsh", wide=0
+
+NterpGetWideStaticField:
+  OP_SGET load="ldr", wide=1
+
+NterpGetIntStaticField:
+  OP_SGET load="ldr", wide=0
+
+NterpPutStaticField:
+  OP_SPUT store="str", wide=0
+
+NterpPutBooleanStaticField:
+NterpPutByteStaticField:
+  OP_SPUT store="strb", wide=0
+
+NterpPutCharStaticField:
+NterpPutShortStaticField:
+  OP_SPUT store="strh", wide=0
+
+NterpPutWideStaticField:
+  OP_SPUT store="str", wide=1
+
+NterpPutInstanceField:
+  OP_IPUT store="str", wide=0
+
+NterpPutBooleanInstanceField:
+NterpPutByteInstanceField:
+  OP_IPUT store="strb", wide=0
+
+NterpPutCharInstanceField:
+NterpPutShortInstanceField:
+  OP_IPUT store="strh", wide=0
+
+NterpPutWideInstanceField:
+  OP_IPUT store="str", wide=1
+
+NterpGetBooleanInstanceField:
+  OP_IGET load="ldrb", wide=0
+
+NterpGetByteInstanceField:
+  OP_IGET load="ldrsb", wide=0
+
+NterpGetCharInstanceField:
+  OP_IGET load="ldrh", wide=0
+
+NterpGetShortInstanceField:
+  OP_IGET load="ldrsh", wide=0
+
+NterpGetWideInstanceField:
+  OP_IGET load="ldr", wide=1
+
+NterpGetInstanceField:
+  OP_IGET load="ldr", wide=0
+
+NterpInstanceOf:
+   /* instance-of vA, vB, class@CCCC */
+   // Fast-path which gets the class from thread-local cache.
+   EXPORT_PC
+   FETCH_FROM_THREAD_CACHE r1, 3f
+   cmp rMR, #0
+   bne 4f
+1:
+   lsr     r2, rINST, #12              // r2<- B
+   GET_VREG r0, r2                     // r0<- vB (object)
+   cmp r0, #0
+   beq 2f
+   bl artInstanceOfFromCode
+2:
+   ubfx    r1, rINST, #8, #4           // r1<- A
+   SET_VREG r0, r1
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+3:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl nterp_get_class_or_allocate_object
+   mov r1, r0
+   b 1b
+4:
+   bl art_quick_read_barrier_mark_reg01
+   b 1b
+
+NterpCheckCast:
+   // Fast-path which gets the class from thread-local cache.
+   EXPORT_PC
+   FETCH_FROM_THREAD_CACHE r1, 3f
+   cmp rMR, #0
+   bne 4f
+1:
+   lsr     r2, rINST, #8               // r2<- A
+   GET_VREG r0, r2                     // r0<- vA (object)
+   cmp r0, #0
+   beq 2f
+   bl art_quick_check_instance_of
+2:
+   FETCH_ADVANCE_INST 2
+   GET_INST_OPCODE ip
+   GOTO_OPCODE ip
+3:
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl nterp_get_class_or_allocate_object
+   mov r1, r0
+   b 1b
+4:
+   bl art_quick_read_barrier_mark_reg01
+   b 1b
+
+NterpHandleInvokeInterfaceOnObjectMethodRange:
+   // First argument is the 'this' pointer.
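+   // r0 is expected to hold the vtable index of the java.lang.Object method
+   // being invoked; the receiver's vtable entry is loaded below.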
+   FETCH r1, 2
+   GET_VREG r1, r1
+   // Note: if r1 is null, this will be handled by our SIGSEGV handler.
+   ldr r2, [r1, #MIRROR_OBJECT_CLASS_OFFSET]
+   add r2, r2, #MIRROR_CLASS_VTABLE_OFFSET_32
+   ldr r0, [r2, r0, lsl #2]
+   b NterpCommonInvokeInstanceRange
+
+NterpHandleInvokeInterfaceOnObjectMethod:
+   // First argument is the 'this' pointer.
+   FETCH r1, 2
+   and r1, r1, #0xf
+   GET_VREG r1, r1
+   // Note: if r1 is null, this will be handled by our SIGSEGV handler.
+   ldr r2, [r1, #MIRROR_OBJECT_CLASS_OFFSET]
+   add r2, r2, #MIRROR_CLASS_VTABLE_OFFSET_32
+   ldr r0, [r2, r0, lsl #2]
+   b NterpCommonInvokeInstance
+
+NterpHandleHotnessOverflow:
+    add r1, rPC, rINST, lsl #1
+    mov r2, rFP
+    bl nterp_hot_method
+    cmp r0, #0
+    bne 1f
+    add r2, rINST, rINST                // r2<- byte offset
+    FETCH_ADVANCE_INST_RB r2            // update rPC, load rINST
+    GET_INST_OPCODE ip                  // extract opcode from rINST
+    GOTO_OPCODE ip                      // jump to next instruction
+1:
+    // Drop the current frame.
+    ldr ip, [rREFS, #-4]
+    mov sp, ip
+    .cfi_def_cfa sp, CALLEE_SAVES_SIZE
+
+    // The transition frame of type SaveAllCalleeSaves saves r4, r8, and r9,
+    // but not the managed-ABI callee saves. So we need to restore the callee
+    // saves of the nterp frame, and save the managed-ABI callee saves, which
+    // will be restored by the callee upon return.
+
+    RESTORE_ALL_CALLEE_SAVES
+    push {r5-r7, r10-r11, lr}
+    .cfi_adjust_cfa_offset 24
+    .cfi_rel_offset r5, 0
+    .cfi_rel_offset r6, 4
+    .cfi_rel_offset r7, 8
+    .cfi_rel_offset r10, 12
+    .cfi_rel_offset r11, 16
+    .cfi_rel_offset lr, 20
+    vpush {s16-s31}
+    .cfi_adjust_cfa_offset 64
+
+    // Setup the new frame
+    ldr r1, [r0, #OSR_DATA_FRAME_SIZE]
+    // The given stack size includes all callee-saved registers; remove them.
+    sub r1, r1, #(CALLEE_SAVES_SIZE - 12)
+
+    // We know r1 cannot be 0, as the frame at least contains the ArtMethod.
+
+    // Remember CFA in a callee-save register.
+    mov rINST, sp
+    .cfi_def_cfa_register rINST
+
+    sub sp, sp, r1
+
+    add r2, r0, #OSR_DATA_MEMORY
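+    // Copy the OSR frame contents, word by word, from the heap-allocated
+    // OSR data into the stack space just reserved; r1 counts down from the
+    // frame size to zero.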
+2:
+    sub r1, r1, #4
+    ldr ip, [r2, r1]
+    str ip, [sp, r1]
+    cmp r1, #0
+    bne 2b
+
+    // Fetch the native PC to jump to and save it in a callee-save register.
+    ldr rFP, [r0, #OSR_DATA_NATIVE_PC]
+
+    // Free the memory holding OSR Data.
+    bl free
+
+    // Jump to the compiled code.
+    bx rFP
+// This is the logical end of ExecuteNterpImpl, where the frame info applies.
+// EndExecuteNterpImpl includes the methods below as we want the runtime to
+// see them as part of the Nterp PCs.
+.cfi_endproc
+
+nterp_to_nterp_static_non_range:
+    .cfi_startproc
+    SETUP_STACK_FOR_INVOKE
+    SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE is_static=1, is_string_init=0
+    .cfi_endproc
+
+nterp_to_nterp_string_init_non_range:
+    .cfi_startproc
+    SETUP_STACK_FOR_INVOKE
+    SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=1
+    .cfi_endproc
+
+nterp_to_nterp_instance_non_range:
+    .cfi_startproc
+    SETUP_STACK_FOR_INVOKE
+    SETUP_NON_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=0
+    .cfi_endproc
+
+nterp_to_nterp_static_range:
+    .cfi_startproc
+    SETUP_STACK_FOR_INVOKE
+    SETUP_RANGE_ARGUMENTS_AND_EXECUTE is_static=1, is_string_init=0
+    .cfi_endproc
+
+nterp_to_nterp_string_init_range:
+    .cfi_startproc
+    SETUP_STACK_FOR_INVOKE
+    SETUP_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=1
+    .cfi_endproc
+
+nterp_to_nterp_instance_range:
+    .cfi_startproc
+    SETUP_STACK_FOR_INVOKE
+    SETUP_RANGE_ARGUMENTS_AND_EXECUTE is_static=0, is_string_init=0
+    .cfi_endproc
+
+// This is the end of PCs contained by the OatQuickMethodHeader created for the interpreter
+// entry point.
+    .type EndExecuteNterpImpl, #function
+    .hidden EndExecuteNterpImpl
+    .global EndExecuteNterpImpl
+EndExecuteNterpImpl:
+
+/*
+ * Convert the double in r0/r1 to a long in r0/r1.
+ *
+ * We have to clip values to long min/max per the specification.  The
+ * expected common case is a "reasonable" value that converts directly
+ * to modest integer.  The EABI convert function isn't doing this for us.
+ */
+nterp_d2l_doconv:
+    ubfx    r2, r1, #20, #11            @ grab the exponent
+    movw    r3, #0x43e
+    cmp     r2, r3                      @ out of long range, NaN, or infinity?
+    bhs     d2l_special_cases
+    b       __aeabi_d2lz                @ tail call to convert double to long
+d2l_special_cases:
+    movw    r3, #0x7ff
+    cmp     r2, r3
+    beq     d2l_maybeNaN                @ NaN?
+d2l_notNaN:
+    adds    r1, r1, r1                  @ sign bit to carry
+    mov     r0, #0xffffffff             @ assume maxlong for lsw
+    mov     r1, #0x7fffffff             @ assume maxlong for msw
+    adc     r0, r0, #0
+    adc     r1, r1, #0                  @ convert maxlong to minlong if sign bit set
+    bx      lr                          @ return
+d2l_maybeNaN:
+    orrs    r3, r0, r1, lsl #12         @ gather the fraction bits
+    beq     d2l_notNaN                  @ zero fraction means infinity, not NaN
+    mov     r0, #0
+    mov     r1, #0
+    bx      lr                          @ return 0 for NaN
+
+/*
+ * Convert the float in r0 to a long in r0/r1.
+ *
+ * We have to clip values to long min/max per the specification.  The
+ * expected common case is a "reasonable" value that converts directly
+ * to modest integer.  The EABI convert function isn't doing this for us.
+ */
+nterp_f2l_doconv:
+    ubfx    r2, r0, #23, #8             @ grab the exponent
+    cmp     r2, #0xbe                   @ out of long range, NaN, or infinity?
+    bhs     f2l_special_cases
+    b       __aeabi_f2lz                @ tail call to convert float to long
+f2l_special_cases:
+    cmp     r2, #0xff                   @ NaN or infinity?
+    beq     f2l_maybeNaN
+f2l_notNaN:
+    adds    r0, r0, r0                  @ sign bit to carry
+    mov     r0, #0xffffffff             @ assume maxlong for lsw
+    mov     r1, #0x7fffffff             @ assume maxlong for msw
+    adc     r0, r0, #0
+    adc     r1, r1, #0                  @ convert maxlong to minlong if sign bit set
+    bx      lr                          @ return
+f2l_maybeNaN:
+    lsls    r3, r0, #9                  @ gather the fraction bits
+    beq     f2l_notNaN                  @ zero fraction means infinity, not NaN
+    mov     r0, #0
+    mov     r1, #0
+    bx      lr                          @ return 0 for NaN
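+
+// Both helpers implement, in rough C terms (a sketch):
+//   int64_t Convert(double x) {   // same logic for the float variant
+//     if (x != x) return 0;                               // NaN
+//     if (x >= 9223372036854775808.0) return INT64_MAX;   // x >= 2^63
+//     if (x <= -9223372036854775808.0) return INT64_MIN;  // x <= -2^63
+//     return (int64_t)x;          // in range: __aeabi_d2lz / __aeabi_f2lz
+//   }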
+
+// Entrypoints into runtime.
+NTERP_TRAMPOLINE nterp_get_static_field, NterpGetStaticField
+NTERP_TRAMPOLINE nterp_get_instance_field_offset, NterpGetInstanceFieldOffset
+NTERP_TRAMPOLINE nterp_filled_new_array, NterpFilledNewArray
+NTERP_TRAMPOLINE nterp_filled_new_array_range, NterpFilledNewArrayRange
+NTERP_TRAMPOLINE nterp_get_class_or_allocate_object, NterpGetClassOrAllocateObject
+NTERP_TRAMPOLINE nterp_get_method, NterpGetMethod
+NTERP_TRAMPOLINE nterp_hot_method, NterpHotMethod
+NTERP_TRAMPOLINE nterp_load_object, NterpLoadObject
+
+// gen_mterp.py will inline the following definitions
+// within [ExecuteNterpImpl, EndExecuteNterpImpl).
+%def instruction_end():
+
+    .type artNterpAsmInstructionEnd, #object
+    .hidden artNterpAsmInstructionEnd
+    .global artNterpAsmInstructionEnd
+artNterpAsmInstructionEnd:
+    // artNterpAsmInstructionEnd is used as the landing pad for exception
+    // handling.
+    FETCH_INST
+    GET_INST_OPCODE ip
+    GOTO_OPCODE ip
+
+%def instruction_start():
+
+    .type artNterpAsmInstructionStart, #object
+    .hidden artNterpAsmInstructionStart
+    .global artNterpAsmInstructionStart
+artNterpAsmInstructionStart = .L_op_nop
+    .text
+
+%def opcode_start():
+    NAME_START nterp_${opcode}
+%def opcode_end():
+    NAME_END nterp_${opcode}
+%def helper_start(name):
+    NAME_START ${name}
+%def helper_end(name):
+    NAME_END ${name}
diff --git a/runtime/interpreter/mterp/armng/object.S b/runtime/interpreter/mterp/armng/object.S
new file mode 100644
index 0000000..0b1589f
--- /dev/null
+++ b/runtime/interpreter/mterp/armng/object.S
@@ -0,0 +1,201 @@
+%def op_check_cast():
+   b NterpCheckCast
+
+%def op_instance_of():
+   b NterpInstanceOf
+
+%def op_iget_boolean():
+   b NterpGetBooleanInstanceField
+
+%def op_iget_boolean_quick():
+%  op_iget_quick(load="ldrb")
+
+%def op_iget_byte():
+   b NterpGetByteInstanceField
+
+%def op_iget_byte_quick():
+%  op_iget_quick(load="ldrsb")
+
+%def op_iget_char():
+   b NterpGetCharInstanceField
+
+%def op_iget_char_quick():
+%  op_iget_quick(load="ldrh")
+
+%def op_iget_object():
+   b NterpGetObjectInstanceField
+
+%def op_iget_object_quick():
+    /* For: iget-object-quick */
+    /* op vA, vB, offset@CCCC */
+    mov     r2, rINST, lsr #12          @ r2<- B
+    FETCH r1, 1                         @ r1<- field byte offset
+    EXPORT_PC
+    GET_VREG r0, r2                     @ r0<- object we're operating on
+    cmp r0, #0
+    beq common_errNullObject
+    ldr r0, [r0, r1]
+    cmp rMR, #0
+    bne 2f
+1:
+    ubfx    r2, rINST, #8, #4           @ r2<- A
+    FETCH_ADVANCE_INST 2
+    SET_VREG_OBJECT r0, r2              @ fp[A]<- r0
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+2:
+    bl art_quick_read_barrier_mark_reg00
+    b 1b
+
+%def op_iget():
+   b NterpGetInstanceField
+
+%def op_iget_quick(load="ldr", wide="0"):
+    /* For: iget-quick, iget-boolean-quick, iget-byte-quick, iget-char-quick, iget-short-quick, iget-wide-quick */
+    /* op vA, vB, offset@CCCC */
+    mov     r2, rINST, lsr #12          @ r2<- B
+    FETCH r1, 1                         @ r1<- field byte offset
+    GET_VREG r3, r2                     @ r3<- object we're operating on
+    ubfx    r2, rINST, #8, #4           @ r2<- A
+    cmp     r3, #0                      @ check object for null
+    beq     common_errNullObject        @ object was null
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    .if $wide
+    ldrd    r0, [r3, r1]                @ r0<- obj.field (64 bits, aligned)
+    VREG_INDEX_TO_ADDR r3, r2           @ r3<- &fp[A]
+    CLEAR_SHADOW_PAIR r2, ip, lr        @ Zero out the shadow regs
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r3    @ fp[A]<- r0/r1
+    .else
+    $load   r0, [r3, r1]                @ r0<- obj.field
+    SET_VREG r0, r2                     @ fp[A]<- r0
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    .endif
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_iget_short():
+   b NterpGetShortInstanceField
+
+%def op_iget_short_quick():
+%  op_iget_quick(load="ldrsh")
+
+%def op_iget_wide():
+   b NterpGetWideInstanceField
+
+%def op_iget_wide_quick():
+%  op_iget_quick(load="ldr", wide="1")
+
+%def op_iput_boolean():
+   b NterpPutBooleanInstanceField
+
+%def op_iput_boolean_quick():
+%  op_iput_quick(store="strb")
+
+%def op_iput_byte():
+   b NterpPutByteInstanceField
+
+%def op_iput_byte_quick():
+%  op_iput_quick(store="strb")
+
+%def op_iput_char():
+   b NterpPutCharInstanceField
+
+%def op_iput_char_quick():
+%  op_iput_quick(store="strh")
+
+%def op_iput_object():
+   b NterpPutObjectInstanceField
+
+%def op_iput_object_quick():
+%  op_iput_quick(store="str", wide="0", is_object="1")
+
+%def op_iput():
+   b NterpPutInstanceField
+
+%def op_iput_quick(store="str", wide="0", is_object="0"):
+    /* For: iput-quick, iput-object-quick */
+    /* op vA, vB, offset@CCCC */
+    mov     r2, rINST, lsr #12          @ r2<- B
+    FETCH ip, 1                         @ ip<- field byte offset
+    GET_VREG r3, r2                     @ r3<- fp[B], the object pointer
+    ubfx    r2, rINST, #8, #4           @ r2<- A
+    cmp     r3, #0                      @ check object for null
+    beq     common_errNullObject        @ object was null
+    .if $wide
+    VREG_INDEX_TO_ADDR r0, r2           @ r0<- &fp[A]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r0    @ r0/r1<- fp[A]/fp[A+1]
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    strd    r0, [r3, ip]                @ obj.field<- r0/r1
+    .else
+    GET_VREG r0, r2                     @ r0<- fp[A]
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    $store     r0, [r3, ip]             @ obj.field<- r0
+    .endif
+    .if $is_object
+    cmp r0, #0
+    beq 1f
+    ldr r1, [rSELF, #THREAD_CARD_TABLE_OFFSET]
+    lsr r0, r3, #CARD_TABLE_CARD_SHIFT
+    strb r1, [r1, r0]
+1:
+   .endif
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_iput_short():
+   b NterpPutShortInstanceField
+
+%def op_iput_short_quick():
+%  op_iput_quick(store="strh")
+
+%def op_iput_wide():
+   b NterpPutWideInstanceField
+
+%def op_iput_wide_quick():
+%  op_iput_quick(store="str", wide="1", is_object="0")
+
+%def op_sget_boolean():
+   b NterpGetBooleanStaticField
+
+%def op_sget_byte():
+   b NterpGetByteStaticField
+
+%def op_sget_char():
+   b NterpGetCharStaticField
+
+%def op_sget_object():
+   b NterpGetObjectStaticField
+
+%def op_sget():
+   b NterpGetIntStaticField
+
+%def op_sget_short():
+   b NterpGetShortStaticField
+
+%def op_sget_wide():
+   b NterpGetWideStaticField
+
+%def op_sput_boolean():
+   b NterpPutBooleanStaticField
+
+%def op_sput_byte():
+   b NterpPutByteStaticField
+
+%def op_sput_char():
+   b NterpPutCharStaticField
+
+%def op_sput_object():
+   b NterpPutObjectStaticField
+
+%def op_sput():
+   b NterpPutStaticField
+
+%def op_sput_short():
+   b NterpPutShortStaticField
+
+%def op_sput_wide():
+   b NterpPutWideStaticField
+
+%def op_new_instance():
+   // The routine is too big to fit in a handler, so jump to it.
+   b NterpNewInstance
diff --git a/runtime/interpreter/mterp/armng/other.S b/runtime/interpreter/mterp/armng/other.S
new file mode 100644
index 0000000..9100ed7
--- /dev/null
+++ b/runtime/interpreter/mterp/armng/other.S
@@ -0,0 +1,361 @@
+%def unused():
+    bkpt
+
+%def op_const():
+    /* const vAA, #+BBBBbbbb */
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    FETCH r0, 1                         @ r0<- bbbb (low)
+    FETCH r1, 2                         @ r1<- BBBB (high)
+    FETCH_ADVANCE_INST 3                @ advance rPC, load rINST
+    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG r0, r3                     @ vAA<- r0
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_const_16():
+    /* const/16 vAA, #+BBBB */
+    FETCH_S r0, 1                       @ r0<- ssssBBBB (sign-extended)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    SET_VREG r0, r3                     @ vAA<- r0
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_const_4():
+    /* const/4 vA, #+B */
+    sbfx    r1, rINST, #12, #4          @ r1<- sssssssB (sign-extended)
+    ubfx    r0, rINST, #8, #4           @ r0<- A
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    GET_INST_OPCODE ip                  @ ip<- opcode from rINST
+    SET_VREG r1, r0                     @ fp[A]<- r1
+    GOTO_OPCODE ip                      @ execute next instruction
+
+%def op_const_high16():
+    /* const/high16 vAA, #+BBBB0000 */
+    FETCH r0, 1                         @ r0<- 0000BBBB (zero-extended)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    mov     r0, r0, lsl #16             @ r0<- BBBB0000
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    SET_VREG r0, r3                     @ vAA<- r0
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_const_object(jumbo="0", helper="nterp_load_object"):
+   // Fast-path which gets the object from thread-local cache.
+   FETCH_FROM_THREAD_CACHE r0, 2f
+   cmp rMR, #0
+   bne 3f
+1:
+   mov     r1, rINST, lsr #8           @ r1<- AA
+   .if $jumbo
+   FETCH_ADVANCE_INST 3                // advance rPC, load rINST
+   .else
+   FETCH_ADVANCE_INST 2                // advance rPC, load rINST
+   .endif
+   GET_INST_OPCODE ip                  // extract opcode from rINST
+   SET_VREG_OBJECT r0, r1              // vAA <- value
+   GOTO_OPCODE ip                      // jump to next instruction
+2:
+   EXPORT_PC
+   mov r0, rSELF
+   ldr r1, [sp]
+   mov r2, rPC
+   bl $helper
+   b 1b
+3:
+   bl art_quick_read_barrier_mark_reg00
+   b 1b
+
+%def op_const_class():
+%  op_const_object(jumbo="0", helper="nterp_get_class_or_allocate_object")
+
+%def op_const_method_handle():
+%  op_const_object(jumbo="0")
+
+%def op_const_method_type():
+%  op_const_object(jumbo="0")
+
+%def op_const_string():
+   /* const/string vAA, String@BBBB */
+%  op_const_object(jumbo="0")
+
+%def op_const_string_jumbo():
+   /* const/string vAA, String@BBBBBBBB */
+%  op_const_object(jumbo="1")
+
+%def op_const_wide():
+    /* const-wide vAA, #+HHHHhhhhBBBBbbbb */
+    FETCH r0, 1                         @ r0<- bbbb (low)
+    FETCH r1, 2                         @ r1<- BBBB (low middle)
+    FETCH r2, 3                         @ r2<- hhhh (high middle)
+    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb (low word)
+    FETCH r3, 4                         @ r3<- HHHH (high)
+    mov     r4, rINST, lsr #8           @ r4<- AA
+    orr     r1, r2, r3, lsl #16         @ r1<- HHHHhhhh (high word)
+    CLEAR_SHADOW_PAIR r4, r2, r3        @ Zero out the shadow regs
+    FETCH_ADVANCE_INST 5                @ advance rPC, load rINST
+    VREG_INDEX_TO_ADDR r4, r4           @ r4<- &fp[AA]
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r4    @ vAA<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_const_wide_16():
+    /* const-wide/16 vAA, #+BBBB */
+    FETCH_S r0, 1                       @ r0<- ssssBBBB (sign-extended)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    mov     r1, r0, asr #31             @ r1<- ssssssss
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    CLEAR_SHADOW_PAIR r3, r2, lr        @ Zero out the shadow regs
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[AA]
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r3    @ vAA<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_const_wide_32():
+    /* const-wide/32 vAA, #+BBBBbbbb */
+    FETCH r0, 1                         @ r0<- 0000bbbb (low)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    FETCH_S r2, 2                       @ r2<- ssssBBBB (high)
+    FETCH_ADVANCE_INST 3                @ advance rPC, load rINST
+    orr     r0, r0, r2, lsl #16         @ r0<- BBBBbbbb
+    CLEAR_SHADOW_PAIR r3, r2, lr        @ Zero out the shadow regs
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[AA]
+    mov     r1, r0, asr #31             @ r1<- ssssssss
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r3    @ vAA<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_const_wide_high16():
+    /* const-wide/high16 vAA, #+BBBB000000000000 */
+    FETCH r1, 1                         @ r1<- 0000BBBB (zero-extended)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    mov     r0, #0                      @ r0<- 00000000
+    mov     r1, r1, lsl #16             @ r1<- BBBB0000
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    CLEAR_SHADOW_PAIR r3, r0, r2        @ Zero shadow regs
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[AA]
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r3    @ vAA<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_monitor_enter():
+    /*
+     * Synchronize on an object.
+     */
+    /* monitor-enter vAA */
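+    @ Export the PC first: art_quick_lock_object can suspend or throw.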
+    EXPORT_PC
+    mov      r2, rINST, lsr #8           @ r2<- AA
+    GET_VREG r0, r2                      @ r0<- vAA (object)
+    bl       art_quick_lock_object
+    FETCH_ADVANCE_INST 1
+    GET_INST_OPCODE ip                   @ extract opcode from rINST
+    GOTO_OPCODE ip                       @ jump to next instruction
+
+%def op_monitor_exit():
+    /*
+     * Unlock an object.
+     *
+     * Exceptions that occur when unlocking a monitor need to appear as
+     * if they happened at the following instruction.  See the Dalvik
+     * instruction spec.
+     */
+    /* monitor-exit vAA */
+    EXPORT_PC
+    mov      r2, rINST, lsr #8          @ r2<- AA
+    GET_VREG r0, r2                     @ r0<- vAA (object)
+    bl       art_quick_unlock_object
+    FETCH_ADVANCE_INST 1                @ advance rPC so a pending exception appears at the next instruction
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_move(is_object="0"):
+    /* for move, move-object, long-to-int */
+    /* op vA, vB */
+    mov     r1, rINST, lsr #12          @ r1<- B from 15:12
+    ubfx    r0, rINST, #8, #4           @ r0<- A from 11:8
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    GET_VREG r2, r1                     @ r2<- fp[B]
+    GET_INST_OPCODE ip                  @ ip<- opcode from rINST
+    .if $is_object
+    SET_VREG_OBJECT r2, r0              @ fp[A]<- r2
+    .else
+    SET_VREG r2, r0                     @ fp[A]<- r2
+    .endif
+    GOTO_OPCODE ip                      @ execute next instruction
+
+%def op_move_16(is_object="0"):
+    /* for: move/16, move-object/16 */
+    /* op vAAAA, vBBBB */
+    FETCH r1, 2                         @ r1<- BBBB
+    FETCH r0, 1                         @ r0<- AAAA
+    FETCH_ADVANCE_INST 3                @ advance rPC, load rINST
+    GET_VREG r2, r1                     @ r2<- fp[BBBB]
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    .if $is_object
+    SET_VREG_OBJECT r2, r0              @ fp[AAAA]<- r2
+    .else
+    SET_VREG r2, r0                     @ fp[AAAA]<- r2
+    .endif
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_move_exception():
+    /* move-exception vAA */
+    mov     r2, rINST, lsr #8           @ r2<- AA
+    ldr     r3, [rSELF, #THREAD_EXCEPTION_OFFSET]
+    mov     r1, #0                      @ r1<- 0
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    SET_VREG_OBJECT r3, r2              @ fp[AA]<- exception obj
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    str     r1, [rSELF, #THREAD_EXCEPTION_OFFSET]  @ clear exception
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_move_from16(is_object="0"):
+    /* for: move/from16, move-object/from16 */
+    /* op vAA, vBBBB */
+    FETCH r1, 1                         @ r1<- BBBB
+    mov     r0, rINST, lsr #8           @ r0<- AA
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    GET_VREG r2, r1                     @ r2<- fp[BBBB]
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    .if $is_object
+    SET_VREG_OBJECT r2, r0              @ fp[AA]<- r2
+    .else
+    SET_VREG r2, r0                     @ fp[AA]<- r2
+    .endif
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_move_object():
+%  op_move(is_object="1")
+
+%def op_move_object_16():
+%  op_move_16(is_object="1")
+
+%def op_move_object_from16():
+%  op_move_from16(is_object="1")
+
+%def op_move_result(is_object="0"):
+    /* for: move-result, move-result-object */
+    /* op vAA */
+    mov     r2, rINST, lsr #8           @ r2<- AA
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    .if $is_object
+    SET_VREG_OBJECT r0, r2              @ fp[AA]<- r0
+    .else
+    SET_VREG r0, r2                     @ fp[AA]<- r0
+    .endif
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_move_result_object():
+%  op_move_result(is_object="1")
+
+%def op_move_result_wide():
+    /* move-result-wide vAA */
+    mov     rINST, rINST, lsr #8        @ rINST<- AA
+    VREG_INDEX_TO_ADDR r2, rINST        @ r2<- &fp[AA]
+    CLEAR_SHADOW_PAIR rINST, ip, lr     @ Zero out the shadow regs
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r2    @ fp[AA]<- r0/r1
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_move_wide():
+    /* move-wide vA, vB */
+    /* NOTE: regs can overlap, e.g. "move v6,v7" or "move v7,v6" */
+    mov     r3, rINST, lsr #12          @ r3<- B
+    ubfx    rINST, rINST, #8, #4        @ rINST<- A
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[B]
+    VREG_INDEX_TO_ADDR r2, rINST        @ r2<- &fp[A]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r3    @ r0/r1<- fp[B]
+    CLEAR_SHADOW_PAIR rINST, ip, lr     @ Zero out the shadow regs
+    FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r2    @ fp[A]<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_move_wide_16():
+    /* move-wide/16 vAAAA, vBBBB */
+    /* NOTE: regs can overlap, e.g. "move v6,v7" or "move v7,v6" */
+    FETCH r3, 2                         @ r3<- BBBB
+    FETCH r2, 1                         @ r2<- AAAA
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[BBBB]
+    VREG_INDEX_TO_ADDR lr, r2           @ lr<- &fp[AAAA]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r3    @ r0/r1<- fp[BBBB]
+    FETCH_ADVANCE_INST 3                @ advance rPC, load rINST
+    CLEAR_SHADOW_PAIR r2, r3, ip        @ Zero out the shadow regs
+    SET_VREG_WIDE_BY_ADDR r0, r1, lr    @ fp[AAAA]<- r0/r1
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_move_wide_from16():
+    /* move-wide/from16 vAA, vBBBB */
+    /* NOTE: regs can overlap, e.g. "move v6,v7" or "move v7,v6" */
+    FETCH r3, 1                         @ r3<- BBBB
+    mov     rINST, rINST, lsr #8        @ rINST<- AA
+    VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[BBBB]
+    VREG_INDEX_TO_ADDR r2, rINST        @ r2<- &fp[AA]
+    GET_VREG_WIDE_BY_ADDR r0, r1, r3    @ r0/r1<- fp[BBBB]
+    CLEAR_SHADOW_PAIR rINST, ip, lr     @ Zero out the shadow regs
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    SET_VREG_WIDE_BY_ADDR r0, r1, r2    @ fp[AA]<- r0/r1
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+%def op_nop():
+    FETCH_ADVANCE_INST 1                @ advance to next instr, load rINST
+    GET_INST_OPCODE ip                  @ ip<- opcode from rINST
+    GOTO_OPCODE ip                      @ execute it
+
+%def op_unused_3e():
+%  unused()
+
+%def op_unused_3f():
+%  unused()
+
+%def op_unused_40():
+%  unused()
+
+%def op_unused_41():
+%  unused()
+
+%def op_unused_42():
+%  unused()
+
+%def op_unused_43():
+%  unused()
+
+%def op_unused_73():
+%  unused()
+
+%def op_unused_79():
+%  unused()
+
+%def op_unused_7a():
+%  unused()
+
+%def op_unused_f3():
+%  unused()
+
+%def op_unused_f4():
+%  unused()
+
+%def op_unused_f5():
+%  unused()
+
+%def op_unused_f6():
+%  unused()
+
+%def op_unused_f7():
+%  unused()
+
+%def op_unused_f8():
+%  unused()
+
+%def op_unused_f9():
+%  unused()
+
+%def op_unused_fc():
+%  unused()
+
+%def op_unused_fd():
+%  unused()
diff --git a/runtime/interpreter/mterp/nterp.cc b/runtime/interpreter/mterp/nterp.cc
index 74e49e7..3d92473 100644
--- a/runtime/interpreter/mterp/nterp.cc
+++ b/runtime/interpreter/mterp/nterp.cc
@@ -118,6 +118,111 @@
   UpdateCache(self, dex_pc_ptr, reinterpret_cast<size_t>(value));
 }
 
+#ifdef __arm__
+
+extern "C" void NterpStoreArm32Fprs(const char* shorty,
+                                    uint32_t* registers,
+                                    uint32_t* stack_args,
+                                    const uint32_t* fprs) {
+  // Note: the return type has already been stripped from `shorty`.
+  ScopedAssertNoThreadSuspension sants("In nterp");
+  uint32_t arg_index = 0;
+  uint32_t fpr_double_index = 0;
+  uint32_t fpr_index = 0;
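+  // The allocation mirrors the AAPCS VFP (hard-float) rules: doubles take an
+  // even/odd pair (s0/s1, s2/s3, ...) while floats fill single slots and may
+  // back-fill an odd register that a preceding double skipped. For a shorty
+  // of "FDF", for example, the arguments land in s0, d1 (s2/s3) and the
+  // back-filled s1.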
+  for (uint32_t shorty_index = 0; shorty[shorty_index] != '\0'; ++shorty_index) {
+    char arg_type = shorty[shorty_index];
+    switch (arg_type) {
+      case 'D': {
+        // Double should not overlap with float.
+        fpr_double_index = std::max(fpr_double_index, RoundUp(fpr_index, 2));
+        if (fpr_double_index < 16) {
+          registers[arg_index] = fprs[fpr_double_index++];
+          registers[arg_index + 1] = fprs[fpr_double_index++];
+        } else {
+          registers[arg_index] = stack_args[arg_index];
+          registers[arg_index + 1] = stack_args[arg_index + 1];
+        }
+        arg_index += 2;
+        break;
+      }
+      case 'F': {
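+        // An odd fpr_index points at a single free slot that a double may
+        // have skipped; when it is even, allocate above any doubles.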
+        if (fpr_index % 2 == 0) {
+          fpr_index = std::max(fpr_double_index, fpr_index);
+        }
+        if (fpr_index < 16) {
+          registers[arg_index] = fprs[fpr_index++];
+        } else {
+          registers[arg_index] = stack_args[arg_index];
+        }
+        arg_index++;
+        break;
+      }
+      case 'J': {
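+        // Longs are not passed in FPRs; just skip their two argument slots.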
+        arg_index += 2;
+        break;
+      }
+      default: {
+        arg_index++;
+        break;
+      }
+    }
+  }
+}
+
+extern "C" void NterpSetupArm32Fprs(const char* shorty,
+                                    uint32_t dex_register,
+                                    uint32_t stack_index,
+                                    uint32_t* fprs,
+                                    uint32_t* registers,
+                                    uint32_t* stack_args) {
+  // Note: the return type has already been stripped from `shorty`.
+  ScopedAssertNoThreadSuspension sants("In nterp");
+  uint32_t fpr_double_index = 0;
+  uint32_t fpr_index = 0;
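+  // Inverse of NterpStoreArm32Fprs above: scatter dex registers into FPRs and
+  // stack slots using the same VFP allocation scheme.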
+  for (uint32_t shorty_index = 0; shorty[shorty_index] != '\0'; ++shorty_index) {
+    char arg_type = shorty[shorty_index];
+    switch (arg_type) {
+      case 'D': {
+        // Double should not overlap with float.
+        fpr_double_index = std::max(fpr_double_index, RoundUp(fpr_index, 2));
+        if (fpr_double_index < 16) {
+          fprs[fpr_double_index++] = registers[dex_register++];
+          fprs[fpr_double_index++] = registers[dex_register++];
+          stack_index += 2;
+        } else {
+          stack_args[stack_index++] = registers[dex_register++];
+          stack_args[stack_index++] = registers[dex_register++];
+        }
+        break;
+      }
+      case 'F': {
+        if (fpr_index % 2 == 0) {
+          fpr_index = std::max(fpr_double_index, fpr_index);
+        }
+        if (fpr_index < 16) {
+          fprs[fpr_index++] = registers[dex_register++];
+          stack_index++;
+        } else {
+          stack_args[stack_index++] = registers[dex_register++];
+        }
+        break;
+      }
+      case 'J': {
+        stack_index += 2;
+        dex_register += 2;
+        break;
+      }
+      default: {
+        stack_index++;
+        dex_register++;
+        break;
+      }
+    }
+  }
+}
+
+#endif
+
 extern "C" const dex::CodeItem* NterpGetCodeItem(ArtMethod* method)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   ScopedAssertNoThreadSuspension sants("In nterp");
@@ -294,6 +399,7 @@
   } else if (resolved_method->GetDeclaringClass()->IsStringClass()
              && !resolved_method->IsStatic()
              && resolved_method->IsConstructor()) {
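+    // Constructor calls are always invoke-direct in dex bytecode, so the
+    // String-init rewrite cannot apply to an invoke-super.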
+    CHECK_NE(invoke_type, kSuper);
     resolved_method = WellKnownClasses::StringInitToStringFactory(resolved_method);
     // Or the result with 1 to notify to nterp this is a string init method. We
     // also don't cache the result as we don't want nterp to have its fast path always
diff --git a/test/813-fp-args/expected-stderr.txt b/test/813-fp-args/expected-stderr.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/813-fp-args/expected-stderr.txt
diff --git a/test/813-fp-args/expected-stdout.txt b/test/813-fp-args/expected-stdout.txt
new file mode 100644
index 0000000..6a5618e
--- /dev/null
+++ b/test/813-fp-args/expected-stdout.txt
@@ -0,0 +1 @@
+JNI_OnLoad called
diff --git a/test/813-fp-args/info.txt b/test/813-fp-args/info.txt
new file mode 100644
index 0000000..5c204cb
--- /dev/null
+++ b/test/813-fp-args/info.txt
@@ -0,0 +1 @@
+Regression test for the floating-point native calling convention, targeting ARM.
diff --git a/test/813-fp-args/src/Main.java b/test/813-fp-args/src/Main.java
new file mode 100644
index 0000000..fa8ac07
--- /dev/null
+++ b/test/813-fp-args/src/Main.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  public static void main(String[] args) {
+    System.loadLibrary(args[0]);
+    // Force JIT compilation of myMethod so the call below goes through its compiled code.
+    ensureJitCompiled(Main.class, "myMethod");
+    myMethod(1, 2, 3, 4);
+  }
+
+  public static void assertEquals(float expected, float actual) {
+    if (expected != actual) {
+      throw new Error("Expected " + expected + " got " + actual);
+    }
+  }
+
+  public static void assertEquals(double expected, double actual) {
+    if (expected != actual) {
+      throw new Error("Expected " + expected + " got " + actual);
+    }
+  }
+
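+  // Under the AAPCS hard-float convention, 'a' goes in s0, 'b' in d1 (s2/s3),
+  // 'c' back-fills s1, and 'd' lands in s4; calling from the interpreter thus
+  // exercises the FPR marshalling, including the back-fill case.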
+  public static void myMethod(float a, double b, float c, float d) {
+    assertEquals(1, a);
+    assertEquals(2, b);
+    assertEquals(3, c);
+    assertEquals(4, d);
+  }
+
+  public static native void ensureJitCompiled(Class<?> cls, String name);
+}
diff --git a/tools/cpp-define-generator/globals.def b/tools/cpp-define-generator/globals.def
index 50ca3d6..50f9b33 100644
--- a/tools/cpp-define-generator/globals.def
+++ b/tools/cpp-define-generator/globals.def
@@ -83,3 +83,5 @@
            GetStackOverflowReservedBytes(art::kRuntimeISA))
 ASM_DEFINE(NTERP_HOTNESS_MASK,
            art::interpreter::kNterpHotnessMask)
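+// Number of bits in the nterp hotness counter, i.e. the popcount of its mask.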
+ASM_DEFINE(NTERP_HOTNESS_BITS,
+           art::POPCOUNT(art::interpreter::kNterpHotnessMask))