arm64 nterp: Use fewer instructions for wide ops.

Test: testrunner.py --target --64 --jit
Change-Id: I84b042575c7521db72820375861c4317888fe508
diff --git a/runtime/interpreter/mterp/arm64ng/arithmetic.S b/runtime/interpreter/mterp/arm64ng/arithmetic.S
index cf9dd86..f751f99 100644
--- a/runtime/interpreter/mterp/arm64ng/arithmetic.S
+++ b/runtime/interpreter/mterp/arm64ng/arithmetic.S
@@ -141,21 +141,22 @@
      *      xor-long, add-double, sub-double, mul-double, div-double, rem-double
      */
     /* binop vAA, vBB, vCC */
-    FETCH w0, 1                         // w0<- CCBB
-    lsr     w4, wINST, #8               // w4<- AA
-    lsr     w2, w0, #8                  // w2<- CC
-    and     w1, w0, #255                // w1<- BB
-    GET_VREG_WIDE $r2, w2               // w2<- vCC
-    GET_VREG_WIDE $r1, w1               // w1<- vBB
+    FETCH w0, 1                           // w0<- CCBB
+    LOAD_SCALED_VREG_MASK w5, 0xff        // w5<- ff * sizeof(vreg)
+    EXTRACT_SCALED_VREG w4, w5, wINST, 8  // w4<- AA * sizeof(vreg)
+    EXTRACT_SCALED_VREG w2, w5, w0, 8     // w2<- CC * sizeof(vreg)
+    EXTRACT_SCALED_VREG w1, w5, w0, 0     // w1<- BB * sizeof(vreg)
+    GET_VREG_WIDE_PRESCALED $r2, w2       // w2<- vCC
+    GET_VREG_WIDE_PRESCALED $r1, w1       // w1<- vBB
     .if $chkzero
-    cbz     $r2, common_errDivideByZero  // is second operand zero?
+    cbz     $r2, common_errDivideByZero   // is second operand zero?
     .endif
-    FETCH_ADVANCE_INST 2                // advance rPC, load rINST
+    FETCH_ADVANCE_INST 2                  // advance rPC, load rINST
     $preinstr
-    $instr                              // $result<- op, w0-w4 changed
-    GET_INST_OPCODE ip                  // extract opcode from rINST
-    SET_VREG_WIDE $result, w4           // vAA<- $result
-    GOTO_OPCODE ip                      // jump to next instruction
+    $instr                                // $result<- op, w0-w4 changed
+    GET_INST_OPCODE ip                    // extract opcode from rINST
+    SET_VREG_WIDE_PRESCALED $result, w4   // vAA<- $result
+    GOTO_OPCODE ip                        // jump to next instruction
     /* 11-14 instructions */
 
 %def binopWide2addr(preinstr="", instr="add x0, x0, x1", r0="x0", r1="x1", chkzero="0"):
@@ -300,11 +301,12 @@
 
 %def op_cmp_long():
     FETCH w0, 1                         // w0<- CCBB
+    LOAD_SCALED_VREG_MASK w5, 0xff      // w5<- ff * sizeof(vreg)
     lsr     w4, wINST, #8               // w4<- AA
-    and     w2, w0, #255                // w2<- BB
-    lsr     w3, w0, #8                  // w3<- CC
-    GET_VREG_WIDE x1, w2
-    GET_VREG_WIDE x2, w3
+    EXTRACT_SCALED_VREG w2, w5, w0, 0   // w2<- BB * sizeof(vreg)
+    EXTRACT_SCALED_VREG w3, w5, w0, 8   // w3<- CC * sizeof(vreg)
+    GET_VREG_WIDE_PRESCALED x1, w2
+    GET_VREG_WIDE_PRESCALED x2, w3
     cmp     x1, x2
     cset    w0, ne
     cneg    w0, w0, lt
diff --git a/runtime/interpreter/mterp/arm64ng/floating_point.S b/runtime/interpreter/mterp/arm64ng/floating_point.S
index ad42db3..a91fcf7 100644
--- a/runtime/interpreter/mterp/arm64ng/floating_point.S
+++ b/runtime/interpreter/mterp/arm64ng/floating_point.S
@@ -69,20 +69,24 @@
     SET_VREG_DOUBLE $r0, w2             // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
 
-%def fcmp(wide="", r1="s1", r2="s2", cond="lt"):
+%def fcmp(r1="s1", r2="s2", cond="lt"):
     /*
      * Compare two floating-point values.  Puts 0, 1, or -1 into the
      * destination register based on the results of the comparison.
      */
     /* op vAA, vBB, vCC */
     FETCH w0, 1                         // w0<- CCBB
+%  if r1.startswith("d"):
+    LOAD_SCALED_VREG_MASK w5, 0xff      // w5<- ff * sizeof(vreg)
+    lsr     w4, wINST, #8               // w4<- AA
+    EXTRACT_SCALED_VREG w2, w5, w0, 0   // w2<- BB * sizeof(vreg)
+    EXTRACT_SCALED_VREG w3, w5, w0, 8   // w3<- CC * sizeof(vreg)
+    GET_VREG_DOUBLE_PRESCALED $r1, w2
+    GET_VREG_DOUBLE_PRESCALED $r2, w3
+%  else:
     lsr     w4, wINST, #8               // w4<- AA
     and     w2, w0, #255                // w2<- BB
     lsr     w3, w0, #8                  // w3<- CC
-%  if r1.startswith("d"):
-    GET_VREG_DOUBLE $r1, w2
-    GET_VREG_DOUBLE $r2, w3
-%  else:
     GET_VREG $r1, w2
     GET_VREG $r2, w3
 %  #endif
@@ -188,16 +192,16 @@
 %  fbinop2addr(instr="fadd   s2, s0, s1")
 
 %def op_cmpg_double():
-%  fcmp(wide="_WIDE", r1="d1", r2="d2", cond="cc")
+%  fcmp(r1="d1", r2="d2", cond="cc")
 
 %def op_cmpg_float():
-%  fcmp(wide="", r1="s1", r2="s2", cond="cc")
+%  fcmp(r1="s1", r2="s2", cond="cc")
 
 %def op_cmpl_double():
-%  fcmp(wide="_WIDE", r1="d1", r2="d2", cond="lt")
+%  fcmp(r1="d1", r2="d2", cond="lt")
 
 %def op_cmpl_float():
-%  fcmp(wide="", r1="s1", r2="s2", cond="lt")
+%  fcmp(r1="s1", r2="s2", cond="lt")
 
 %def op_div_double():
 %  fbinopWide(instr="fdiv d0, d1, d2", result="d0", r1="d1", r2="d2")
diff --git a/runtime/interpreter/mterp/arm64ng/main.S b/runtime/interpreter/mterp/arm64ng/main.S
index 0ed237c..ae08f3c 100644
--- a/runtime/interpreter/mterp/arm64ng/main.S
+++ b/runtime/interpreter/mterp/arm64ng/main.S
@@ -203,25 +203,39 @@
 /*
  * Get/set the 64-bit value from a Dalvik register.
  */
+.macro LOAD_SCALED_VREG_MASK scaled_mask_reg, unscaled_mask
+    mov     \scaled_mask_reg, \unscaled_mask << 2
+.endm
+.macro EXTRACT_SCALED_VREG scaled_vreg, scaled_mask_reg, src_reg, lsb
+    .if \lsb < 2
+    and     \scaled_vreg, \scaled_mask_reg, \src_reg, lsl #(2 - \lsb)
+    .else
+    and     \scaled_vreg, \scaled_mask_reg, \src_reg, lsr #(\lsb - 2)
+    .endif
+.endm
+.macro GET_VREG_WIDE_PRESCALED reg, scaled_vreg
+    ldr     \reg, [xFP, \scaled_vreg, uxtw]
+.endm
 .macro GET_VREG_WIDE reg, vreg
-    add     ip2, xFP, \vreg, uxtw #2
-    ldr     \reg, [ip2]
+    lsl     wip2, \vreg, #2
+    GET_VREG_WIDE_PRESCALED \reg, wip2
+.endm
+.macro SET_VREG_WIDE_PRESCALED reg, scaled_vreg
+    str     \reg, [xFP, \scaled_vreg, uxtw]
+    str     xzr, [xREFS, \scaled_vreg, uxtw]
 .endm
 .macro SET_VREG_WIDE reg, vreg
-    add     ip2, xFP, \vreg, uxtw #2
-    str     \reg, [ip2]
-    add     ip2, xREFS, \vreg, uxtw #2
-    str     xzr, [ip2]
+    lsl     wip2, \vreg, #2
+    SET_VREG_WIDE_PRESCALED \reg, wip2
+.endm
+.macro GET_VREG_DOUBLE_PRESCALED reg, scaled_vreg
+    GET_VREG_WIDE_PRESCALED \reg, \scaled_vreg
 .endm
 .macro GET_VREG_DOUBLE reg, vreg
-    add     ip2, xFP, \vreg, uxtw #2
-    ldr     \reg, [ip2]
+    GET_VREG_WIDE \reg, \vreg
 .endm
 .macro SET_VREG_DOUBLE reg, vreg
-    add     ip2, xFP, \vreg, uxtw #2
-    str     \reg, [ip2]
-    add     ip2, xREFS, \vreg, uxtw #2
-    str     xzr, [ip2]
+    SET_VREG_WIDE \reg, \vreg
 .endm
 
 /*