riscv64: invoke opcodes, range variants

Also hotfix slow-path 'L' ref loads: clear the upper 32 bits for
correctness.
Also disable the 0/1-arg fastpath to rethink the approach for 'L' ref
loads versus the cost of fetching the shorty.
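
For context, the 'L' hazard in a minimal illustrative sketch (not the
exact patched sequence): a managed reference is a 32-bit vreg value,
but LW sign-extends bit 31, so a ref with bit 31 set reads back as a
garbage 64-bit address unless the upper bits are cleared:

  lw     a2, (t1)       // bits 63..32 of a2 become copies of bit 31
  zext.w a2, a2         // clear upper 32 bits: valid zero-extended ref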

See invoke.S for notes on code structure.

Test: Run these opcodes against all interpreter
tests on a Linux RISC-V VM.

(1) setup
  lunch aosp_riscv64-trunk-userdebug

  export ART_TEST_SSH_USER=ubuntu
  export ART_TEST_SSH_HOST=localhost
  export ART_TEST_SSH_PORT=10001
  export ART_TEST_ON_VM=true

  . art/tools/buildbot-utils.sh
  art/tools/buildbot-build.sh --target

  # Create, boot and configure the VM.
  art/tools/buildbot-vm.sh create
  art/tools/buildbot-vm.sh boot
  art/tools/buildbot-vm.sh setup-ssh  # password: 'ubuntu'

  art/tools/buildbot-cleanup-device.sh
  art/tools/buildbot-setup-device.sh
  art/tools/buildbot-sync.sh

(2) test
  art/test.py --target -r --no-prebuild --ndebug --64 -j 12 --cdex-none --interpreter
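
  # Optionally rerun a single test while iterating; the test name here
  # is illustrative, see `art/test.py --help` for selection options.
  art/test.py --target -r --ndebug --64 --interpreter -t 001-HelloWorld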

CFI data verified clean with `m check_cfi`.

Bug: 283082047

Change-Id: I20426fd1e7b397f7fce51a1f43661056d5b8e844
diff --git a/runtime/interpreter/mterp/riscv64/invoke.S b/runtime/interpreter/mterp/riscv64/invoke.S
index e93cc9f..569b750 100644
--- a/runtime/interpreter/mterp/riscv64/invoke.S
+++ b/runtime/interpreter/mterp/riscv64/invoke.S
@@ -126,7 +126,7 @@
    and t0, a0, 0x1                 // t0 := string-init bit
    beqz t0, 1b                     // not string init
    and a0, a0, ~0x1                // clear string-init bit
-   tail NterpInvokeStringInit      // args a0, a1, s7
+   tail NterpInvokeStringInit${range}  // args a0, s7
 3:
    tail common_errNullObject
 
@@ -379,25 +379,28 @@
 // NterpInvokeDirect
 // a0: ArtMethod*
 // a1: this
-// s7: vreg ids F|E|D|C
-%def nterp_invoke_direct(uniq="invoke_direct"):
+// s7: (regular) vreg ids F|E|D|C, (range) vreg id CCCC
+%def nterp_invoke_direct(uniq="invoke_direct", range=""):
    ld s8, ART_METHOD_QUICK_CODE_OFFSET_64(a0)
                                  // s8 := quick code
 %  try_nterp(quick="s8", z0="t0", skip=f".L{uniq}_simple")
-   call NterpToNterpInstance     // args a0, a1
+   call NterpToNterpInstance${range}  // args a0, a1
    j .L${uniq}_next_op
 
 .L${uniq}_simple:
-   srliw t0, xINST, 12           // t0 := A
-%  try_simple_args(ins="t0", v_fedc="s7", z0="t1", skip=f".L{uniq}_01", uniq=uniq)
-                                 // a2, a3, a4, a5 := fp[D], fp[E], fp[F], fp[G]
-   jalr s8                       // args a0 - a5
+%  if range == 'Range':
+%    try_simple_args_range(vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", skip=f".L{uniq}_01", uniq=uniq)
+%  else:
+%    try_simple_args(v_fedc="s7", z0="t0", z1="t1", skip=f".L{uniq}_01", uniq=uniq)
+%#:
+   jalr s8                       // (regular) args a0 - a5, (range) args a0 - a7 and stack
    j .L${uniq}_next_op
 
 .L${uniq}_01:
+   j .L${uniq}_slow  // TODO: fix this fastpath
    mv s9, zero                   // initialize shorty reg
-%  try_01_args(ins="t0", v_fedc="s7", z0="t1", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq)
-                                 // a2, fa0 := fp[D], maybe
+%  try_01_args(vreg="s7", z0="t1", z1="t2", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq, range=range)
+                                 // a2, fa0 := (regular) fp[D] or (range) fp[CCCC + 1], maybe
    // Return value expected. Get shorty, stash in callee-save to be available on return.
    // When getting shorty, stash this fast path's arg registers then restore.
    // Unconditionally restores a2/fa0, even if extra arg not found.
@@ -414,7 +417,11 @@
 
 .L${uniq}_slow:
 %  get_shorty_save_a0_a1(shorty="s9", y0="s10", y1="s11")
-%  slow_setup_args(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", uniq=uniq)
+%  if range == 'Range':
+%    slow_setup_args_range(shorty="s9", vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", z7="s10", uniq=uniq)
+%  else:
+%    slow_setup_args(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", uniq=uniq)
+%#:
    jalr s8                       // args in a0-a5, fa0-fa4
 %  maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_1")
                                  // a0 := fa0 if float return
@@ -426,23 +433,27 @@
 
 // NterpInvokeStringInit
 // a0: ArtMethod*
-// a1: this
-// s7: vreg ids F|E|D|C
-%def nterp_invoke_string_init(uniq="invoke_string_init"):
+// s7: (regular) vreg ids F|E|D|C, (range) vreg id CCCC
+%def nterp_invoke_string_init(uniq="invoke_string_init", range=""):
    ld s8, ART_METHOD_QUICK_CODE_OFFSET_64(a0)
-                                 // s8 := quick code
+                        // s8 := quick code
 %  try_nterp(quick="s8", z0="t0", skip=f".L{uniq}_slow")
-   call NterpToNterpStringInit   // args a0, a1
+   call NterpToNterpStringInit${range}  // arg a0
    j .L${uniq}_next_op
 
 .L${uniq}_slow:
 %  get_shorty_save_a0_a1(shorty="s9", y0="s10", y1="s11")
-%  slow_setup_args_string_init(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", uniq=uniq)
-   mv s9, a1                     // save "this" in callee-save for return-time fixup
-   jalr s8                       // args in a0-a5, fa0-fa4
+%  if range == 'Range':
+%    slow_setup_args_string_init_range(shorty="s9", vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", uniq=uniq)
+%  else:
+%    slow_setup_args_string_init(shorty="s9", v_fedc="s7", z0="t0", z1="t1", z2="t2", uniq=uniq)
+%#:
+   jalr s8              // args a0 - a5 (regular and range)
 
 .L${uniq}_next_op:
-%  subst_vreg_references(old="s9", new="a0", z0="t0", z1="t1", z2="t2", uniq=uniq)
+%  fetch_receiver(reg="t0", vreg="s7", range=range)
+                        // t0 := fp[C] (this)
+%  subst_vreg_references(old="t0", new="a0", z0="t1", z1="t2", z2="t3", uniq=uniq)
    FETCH_ADVANCE_INST 3
    GET_INST_OPCODE t0
    GOTO_OPCODE t0
@@ -450,43 +461,50 @@
 
 // NterpInvokeStatic
 // a0: ArtMethod*
-// s7: vreg ids F|E|D|C
-%def nterp_invoke_static(uniq="invoke_static"):
+// s7: (regular) vreg ids F|E|D|C, (range) vreg id CCCC
+%def nterp_invoke_static(uniq="invoke_static", range=""):
    ld s8, ART_METHOD_QUICK_CODE_OFFSET_64(a0)
-                                 // s8 := quick code
+                               // s8 := quick code
 %  try_nterp(quick="s8", z0="t0", skip=f".L{uniq}_simple")
-   call NterpToNterpStatic       // arg a0
+   call NterpToNterpStatic${range}  // arg a0
    j .L${uniq}_next_op
 
 .L${uniq}_simple:
-   srliw t0, xINST, 12           // t0 := A
-%  try_simple_args_static(ins="t0", v_fedc="s7", z0="t1", skip=f".L{uniq}_01", uniq=uniq)
-                                 // a1, a2, a3, a4, a5 := fp[C], fp[D], fp[E], fp[F], fp[G]
-   jalr s8                       // args a0 - a5
+%  if range == 'Range':
+%    try_simple_args_range(vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", arg_start="0", skip=f".L{uniq}_01", uniq=uniq)
+%  else:
+%    try_simple_args(v_fedc="s7", z0="t0", z1="t1", arg_start="0", skip=f".L{uniq}_01", uniq=uniq)
+%#:
+   jalr s8                     // args (regular) a0 - a5, (range) a0 - a7 and maybe stack
    j .L${uniq}_next_op
 
 .L${uniq}_01:
-   mv s9, zero                   // initialize shorty reg
-%  try_01_args_static(ins="t0", v_fedc="s7", z0="t1", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq)
-                                 // a1, fa0 := fp[C], maybe
+   j .L${uniq}_slow  // TODO: fix this fastpath
+   mv s9, zero                 // initialize shorty reg
+%  try_01_args_static(vreg="s7", z0="t0", z1="t1", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq, range=range)
+                               // a1, fa0 := (regular) fp[C] or (range) fp[CCCC], maybe
    // Return value expected. Get shorty, stash in callee-save to be available on return.
    // When getting shorty, stash this fast path's arg registers then restore.
    // Unconditionally restores a1/fa0, even if extra arg not found.
 %  get_shorty_save_a0_a1(shorty="s9", y0="s10", y1="s11")
    fmv.w.x fa0, s11
 .L${uniq}_01_call:
-   jalr s8                       // args a0, and maybe a1, fa0
-   beqz s9, .L${uniq}_next_op    // no shorty, no return value
+   jalr s8                     // args a0, and maybe a1, fa0
+   beqz s9, .L${uniq}_next_op  // no shorty, no return value
 %  maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_0")
-                                 // a0 := fa0 if float return
+                               // a0 := fa0 if float return
    j .L${uniq}_next_op
 
 .L${uniq}_slow:
 %  get_shorty_save_a0(shorty="s9", y0="s10")
-%  slow_setup_args(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", arg_start="0", uniq=uniq)
-   jalr s8                       // args in a0-a5, fa0-fa4
+%  if range == 'Range':
+%    slow_setup_args_range(shorty="s9", vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", z7="s10", arg_start="0", uniq=uniq)
+%  else:
+%    slow_setup_args(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", arg_start="0", uniq=uniq)
+%#:
+   jalr s8                     // args (regular) a0 - a5 and fa0 - fa4, (range) a0 - a7 and fa0 - fa7 and maybe stack
 %  maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_1")
-                                 // a0 := fa0 if float return
+                               // a0 := fa0 if float return
 .L${uniq}_next_op:
    FETCH_ADVANCE_INST 3
    GET_INST_OPCODE t0
@@ -499,44 +517,57 @@
 // s7: vreg ids F|E|D|C
 // t0: the target interface method
 //     - ignored in nterp-to-nterp transfer
-//     - side-loaded into T0 as a "hidden argument" in managed ABI transfer
-%def nterp_invoke_interface(uniq="invoke_interface"):
+//     - preserved through shorty calls
+//     - side-loaded as a "hidden argument" in managed ABI transfer
+%def nterp_invoke_interface(uniq="invoke_interface", range=""):
    ld s8, ART_METHOD_QUICK_CODE_OFFSET_64(a0)
-                                 // s8 := quick code
+                               // s8 := quick code
 %  try_nterp(quick="s8", z0="t1", skip=f".L{uniq}_simple")
-   call NterpToNterpInstance     // args a0, a1
+   call NterpToNterpInstance${range}  // args a0, a1
    j .L${uniq}_next_op
 
 .L${uniq}_simple:
-   srliw t1, xINST, 12           // t1 := A
-%  try_simple_args(ins="t1", v_fedc="s7", z0="t2", skip=f".L{uniq}_01", uniq=uniq)
-                                 // a2, a3, a4, a5 := fp[D], fp[E], fp[F], fp[G]
-   jalr s8                       // args a0 - a5, and t0
+%  if range == 'Range':
+%    try_simple_args_range(vC="s7", z0="t1", z1="t2", z2="t3", z3="t4", z4="t5", skip=f".L{uniq}_01", uniq=uniq)
+%  else:
+%    try_simple_args(v_fedc="s7", z0="t1", z1="t2", skip=f".L{uniq}_01", uniq=uniq)
+%#:
+   jalr s8                     // args (regular) a0 - a5 and t0, (range) a0 - a7 and t0 and maybe stack
    j .L${uniq}_next_op
 
 .L${uniq}_01:
-   mv s9, zero                   // initialize shorty reg
-%  try_01_args(ins="t1", v_fedc="s7", z0="t2", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq)
+   j .L${uniq}_slow  // TODO: fix this fastpath
+   mv s9, zero                 // initialize shorty reg
+%  try_01_args(vreg="s7", z0="t1", z1="t2", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq, range=range)
+                               // a2, fa0 := (regular) fp[D] or (range) fp[CCCC + 1], maybe
    // Return value expected. Get shorty, stash in callee-save to be available on return.
    // When getting shorty, stash this fast path's arg registers then restore.
    // Unconditionally stores a2/fa0, even if extra arg not found.
-   mv s0, a2                    // skip fa0, bitwise equiv to a2
+   mv s7, a2                   // stash a2 (fa0 is bitwise identical); vreg ids in s7 no longer needed
+   mv s0, t0
 %  get_shorty_for_interface_save_a0_a1(shorty="s9", y0="s10", y1="s11")
-   mv a2, s0
-   fmv.w.x fa0, s0
+   mv t0, s0
+   mv a2, s7
+   fmv.w.x fa0, s7
 .L${uniq}_01_call:
-   jalr s8                       // args a0, a1, and t0, and maybe a2, fa0
-   beqz s9, .L${uniq}_next_op    // no shorty, no return value
+   jalr s8                     // args a0, a1, and t0, and maybe a2, fa0
+   beqz s9, .L${uniq}_next_op  // no shorty, no return value
 %  maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_0")
-                                 // a0 := fa0 if float return
+                               // a0 := fa0 if float return
    j .L${uniq}_next_op
 
 .L${uniq}_slow:
+   mv s0, t0
 %  get_shorty_for_interface_save_a0_a1(shorty="s9", y0="s10", y1="s11")
-%  slow_setup_args(shorty="s9", vregs="s7", z0="t1", z1="t2", z2="t3", z3="t4", z4="t5", z5="t6", uniq=uniq)
-   jalr s8                       // args a0-a5, fa0-fa4, and t0
+   mv t0, s0
+%  if range == 'Range':
+%    slow_setup_args_range(shorty="s9", vC="s7", z0="s10", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", z7="s11", uniq=uniq)
+%  else:
+%    slow_setup_args(shorty="s9", vregs="s7", z0="s10", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", uniq=uniq)
+%#:
+   jalr s8                     // args (regular) a0 - a5, fa0 - fa4, t0, (range) a0 - a7, fa0 - fa7, t0
 %  maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_1")
-                                 // a0 := fa0 if float return
+                               // a0 := fa0 if float return
 .L${uniq}_next_op:
    FETCH_ADVANCE_INST 3
    GET_INST_OPCODE t0
@@ -554,38 +585,54 @@
 
 
 // NterpInvokeVirtualRange
+// a0: ArtMethod*
+// a1: this
+// s7: vreg id CCCC
 %def nterp_invoke_virtual_range():
-%  nterp_invoke_direct_range(uniq="invoke_virtual_range")
+%  nterp_invoke_direct(uniq="invoke_virtual_range", range="Range")
 
 
 // NterpInvokeSuperRange
+// a0: ArtMethod*
+// a1: this
+// s7: vreg id CCCC
 %def nterp_invoke_super_range():
-%  nterp_invoke_direct_range(uniq="invoke_super_range")
+%  nterp_invoke_direct(uniq="invoke_super_range", range="Range")
 
 
 // NterpInvokeDirectRange
-%def nterp_invoke_direct_range(uniq="invoke_direct_range"):
-   unimp
+// Hardcoded:
+// a0: ArtMethod*
+// a1: this
+// s7: vreg id CCCC
+%def nterp_invoke_direct_range():
+%  nterp_invoke_direct(uniq="invoke_direct_range", range="Range")
 
 
 // NterpInvokeStringInitRange
-%def nterp_invoke_string_init_range(uniq="invoke_string_init_range"):
-   unimp
+// a0: ArtMethod*
+// s7: vreg id CCCC
+%def nterp_invoke_string_init_range():
+%  nterp_invoke_string_init(uniq="invoke_string_init_range", range="Range")
 
 
 // NterpInvokeStaticRange
-%def nterp_invoke_static_range(uniq="invoke_static_range"):
-   unimp
+// a0: ArtMethod*
+// s7: vreg id CCCC
+%def nterp_invoke_static_range():
+%  nterp_invoke_static(uniq="invoke_static_range", range="Range")
 
 
 // NterpInvokeInterfaceRange
 // a0: ArtMethod*
 // a1: this
-// a2: the target interface method
+// s7: vreg id CCCC
+// t0: the target interface method
 //     - ignored in nterp-to-nterp transfer
-//     - side-loaded into T0 as a "hidden argument" in managed ABI transfer
-%def nterp_invoke_interface_range(uniq="invoke_interface_range"):
-   unimp
+//     - preserved through shorty calls
+//     - side-loaded as a "hidden argument" in managed ABI transfer
+%def nterp_invoke_interface_range():
+%  nterp_invoke_interface(uniq="invoke_interface_range", range="Range")
 
 
 // NterpInvokePolymorphicRange
@@ -611,82 +658,137 @@
 
 // Hardcoded
 // - a0: ArtMethod*
+// - xINST
 // Input
-// - ins: arg count
 // - v_fedc: vreg ids F|E|D|C
-// Temporaries: z0
-%def try_simple_args(ins="", v_fedc="", z0="", skip="", uniq=""):
+// Temporaries: z0, z1
+%def try_simple_args(v_fedc="", z0="", z1="", arg_start="1", skip="", uniq=""):
    lwu $z0, ART_METHOD_ACCESS_FLAGS_OFFSET(a0)
    BRANCH_IF_BIT_CLEAR $z0, $z0, ART_METHOD_NTERP_INVOKE_FAST_PATH_FLAG_BIT, $skip
-   li $z0, 2
-   blt $ins, $z0, .L${uniq}_simple_done  // A = 1: no further args.
-   beq $ins, $z0, .L${uniq}_simple_2  // A = 2
-   li $z0, 4
-   blt $ins, $z0, .L${uniq}_simple_3  // A = 3
-   beq $ins, $z0, .L${uniq}_simple_4  // A = 4
-   // A = 5
-   srliw $z0, xINST, 8                // z0 := A|G
-   andi $z0, $z0, 0xF                 // z0 := G
-   GET_VREG a5, $z0
-.L${uniq}_simple_4:
-   srliw $z0, $v_fedc, 12             // z0 := F
-   GET_VREG a4, $z0
-.L${uniq}_simple_3:
-   srliw $z0, $v_fedc, 8              // z0 := F|E
-   andi $z0, $z0, 0xF                 // z0 := E
-   GET_VREG a3, $z0
-.L${uniq}_simple_2:
-   srliw $z0, $v_fedc, 4              // z0 := F|E|D
-   andi $z0, $z0, 0xF                 // z0 := D
-   GET_VREG a2, $z0
-.L${uniq}_simple_done:
-   // a1 already set to "this"
 
-
-// Static variant.
-%def try_simple_args_static(ins="", v_fedc="", z0="", skip="", uniq=""):
-   lwu $z0, ART_METHOD_ACCESS_FLAGS_OFFSET(a0)
-   BRANCH_IF_BIT_CLEAR $z0, $z0, ART_METHOD_NTERP_INVOKE_FAST_PATH_FLAG_BIT, $skip
-   beqz $ins, .L${uniq}_simple_done   // A = 0: no further args.
-   li $z0, 2
-   blt $ins, $z0, .L${uniq}_simple_1  // A = 1
-   beq $ins, $z0, .L${uniq}_simple_2  // A = 2
-   li $z0, 4
-   blt $ins, $z0, .L${uniq}_simple_3  // A = 3
-   beq $ins, $z0, .L${uniq}_simple_4  // A = 4
+   srliw $z0, xINST, 12              // z0 := A
+%  if arg_start == "0":
+     beqz $z0, .L${uniq}_simple_done // A = 0: no further args.
+%#:
+   li $z1, 2
+   blt $z0, $z1, .L${uniq}_simple_1  // A = 1
+   beq $z0, $z1, .L${uniq}_simple_2  // A = 2
+   li $z1, 4
+   blt $z0, $z1, .L${uniq}_simple_3  // A = 3
+   beq $z0, $z1, .L${uniq}_simple_4  // A = 4
    // A = 5
-   srliw $z0, xINST, 8                // z0 := A|G
-   andi $z0, $z0, 0xF                 // z0 := G
-   GET_VREG a5, $z0
+   srliw $z1, xINST, 8               // z1 := A|G
+   andi $z1, $z1, 0xF                // z1 := G
+   GET_VREG a5, $z1
 .L${uniq}_simple_4:
-   srliw $z0, $v_fedc, 12             // z0 := F
-   GET_VREG a4, $z0
+   srliw $z1, $v_fedc, 12            // z1 := F
+   GET_VREG a4, $z1
 .L${uniq}_simple_3:
-   srliw $z0, $v_fedc , 8             // z0 := F|E
-   andi $z0, $z0, 0xF                 // z0 := E
-   GET_VREG a3, $z0
+   srliw $z1, $v_fedc, 8             // z1 := F|E
+   andi $z1, $z1, 0xF                // z1 := E
+   GET_VREG a3, $z1
 .L${uniq}_simple_2:
-   srliw $z0, $v_fedc, 4              // z0 := F|E|D
-   andi $z0, $z0, 0xF                 // z0 := D
-   GET_VREG a2, $z0
+   srliw $z1, $v_fedc, 4             // z1 := F|E|D
+   andi $z1, $z1, 0xF                // z1 := D
+   GET_VREG a2, $z1
 .L${uniq}_simple_1:
-   andi $z0, $v_fedc, 0xF             // z0 := C
-   GET_VREG a1, $z0
+%  if arg_start == "0":
+     andi $z1, $v_fedc, 0xF          // z1 := C
+     GET_VREG a1, $z1
+%#:
+   // instance: a1 already set to "this"
+.L${uniq}_simple_done:
+
+
+// Range variant.
+%def try_simple_args_range(vC="", z0="", z1="", z2="", z3="", z4="", skip="", arg_start="1", uniq=""):
+   lwu $z0, ART_METHOD_ACCESS_FLAGS_OFFSET(a0)
+   BRANCH_IF_BIT_CLEAR $z0, $z0, ART_METHOD_NTERP_INVOKE_FAST_PATH_FLAG_BIT, $skip
+
+   srliw $z0, xINST, 8                 // z0 := AA
+%  if arg_start == "0":  # static:
+     beqz $z0, .L${uniq}_simple_done   // AA = 0: no further args.
+     sh2add $z1, $vC, xFP              // z1 := &FP[CCCC]
+     li $z2, 2
+     blt $z0, $z2, .L${uniq}_simple_1  // AA = 1
+%  else:  # instance:
+     li $z2, 2
+     blt $z0, $z2, .L${uniq}_simple_done  // AA = 1, and a1 already loaded.
+     sh2add $z1, $vC, xFP               // z1 := &FP[CCCC]
+%#:
+   // From here on, z0, z1, z2 hold the same values for static and instance.
+   beq $z0, $z2, .L${uniq}_simple_2  // AA = 2
+   li $z2, 4
+   blt $z0, $z2, .L${uniq}_simple_3  // AA = 3
+   beq $z0, $z2, .L${uniq}_simple_4  // AA = 4
+   li $z2, 6
+   blt $z0, $z2, .L${uniq}_simple_5  // AA = 5
+   beq $z0, $z2, .L${uniq}_simple_6  // AA = 6
+   li $z2, 7
+   beq $z0, $z2, .L${uniq}_simple_7  // AA = 7
+
+   // AA >= 8: pass the remainder on the stack; load/store from FP[CCCC + 7] upwards.
+   slli $z2, $z0, 63                 // z2 := negative if z0 bit #0 is set (odd)
+   sh2add $z0, $z0, $z1              // z0 := loop guard at top of stack
+   addi $z3, $z1, 7*4                // z3 := &FP[CCCC + 7]
+   addi $z4, sp, __SIZEOF_POINTER__ + 7*4
+                                     // z4 := &OUT[CCCC + 7]
+   bltz $z2, .L${uniq}_simple_loop_wide
+                                     // if AA odd, branch to wide-copy
+   lw $z2, ($z3)
+   sw $z2, ($z4)
+   addi $z3, $z3, 4
+   addi $z4, $z4, 4
+
+.L${uniq}_simple_loop_wide:
+   // TODO: Consider ensuring 64-bit stores are aligned.
+   beq $z3, $z0, .L${uniq}_simple_7
+   ld $z2, ($z3)
+   sd $z2, ($z4)
+   addi $z3, $z3, 8
+   addi $z4, $z4, 8
+   j .L${uniq}_simple_loop_wide
+
+   // Bottom 7 slots of OUT array never written; first args are passed with a1-a7.
+.L${uniq}_simple_7:
+   lw a7, 6*4($z1)
+.L${uniq}_simple_6:
+   lw a6, 5*4($z1)
+.L${uniq}_simple_5:
+   lw a5, 4*4($z1)
+.L${uniq}_simple_4:
+   lw a4, 3*4($z1)
+.L${uniq}_simple_3:
+   lw a3, 2*4($z1)
+.L${uniq}_simple_2:
+   lw a2, 1*4($z1)
+.L${uniq}_simple_1:
+%  if arg_start == "0":  # static:
+     lw a1, 0*4($z1)
+%#:
 .L${uniq}_simple_done:
 
 
 // Check if a 0/1 arg invoke form is possible, set up a2 and fa0 if needed.
 // If a return value expected, move possible float return to a0.
 // zN are temporaries
-// yN are callee-saved
-%def try_01_args(ins="", v_fedc="", z0="", skip="", call="", uniq=""):
-   li $z0, 2                    // z0 := 2
-   blt $ins, $z0, .L${uniq}_01_peek_next  // A = 1
-   bgt $ins, $z0, $skip         // A >= 3
-   // A = 2: this, plus one arg
-   srliw $z0, $v_fedc, 4        // z0 := F|E|D
-   andi $z0, $z0, 0xF           // z0 := D
-   GET_VREG a2, $z0
+%def try_01_args(vreg="", z0="", z1="", skip="", call="", uniq="", range=""):
+%  if range == 'Range':
+     srliw $z0, xINST, 8   // z0 := AA
+%  else:
+     srliw $z0, xINST, 12  // z0 := A
+%#:
+   li $z1, 2               // z1 := 2
+   blt $z0, $z1, .L${uniq}_01_peek_next  // ins = 1
+   bgt $z0, $z1, $skip     // ins >= 3
+   // ins = 2: this, plus one arg
+%  if range == 'Range':
+     addi $z1, $vreg, 1    // z1 := CCCC + 1
+     GET_VREG a2, $z1      // a2 := fp[CCCC + 1]
+%  else:
+     srliw $z1, $vreg, 4   // z1 := F|E|D
+     andi $z1, $z1, 0xF    // z1 := D
+     GET_VREG a2, $z1      // a2 := fp[D]
+%#:
    fmv.w.x fa0, a2
 .L${uniq}_01_peek_next:
 %  try_01_args_peek_next(z0=z0)  # z0 is zero if invoke has return value
@@ -694,13 +796,22 @@
 
 
 // Static variant.
-%def try_01_args_static(ins="", v_fedc="", z0="", skip="", call="", uniq=""):
-   beqz $ins, .L${uniq}_01_peek_next  // A = 0
-   li $z0, 1                    // z0 := imm 1
-   bgt $ins, $z0, $skip         // A >= 2
-   // A = 1
-   andi $z0, $v_fedc, 0xF       // z0 := C
-   GET_VREG a1, $z0
+%def try_01_args_static(vreg="", z0="", z1="", skip="", call="", uniq="", range=""):
+%  if range == 'Range':
+     srliw $z0, xINST, 8     // z0 := AA
+%  else:
+     srliw $z0, xINST, 12    // z0 := A
+%#:
+   beqz $z0, .L${uniq}_01_peek_next  // ins = 0
+   li $z1, 1                 // z1 := 1
+   bgt $z0, $z1, $skip       // ins >= 2
+   // ins = 1: one arg
+%  if range == 'Range':
+     GET_VREG a1, $vreg      // a1 := fp[CCCC]
+%  else:
+     andi $z1, $vreg, 0xF    // z1 := C
+     GET_VREG a1, $z1        // a1 := fp[C]
+%#:
    fmv.w.x fa0, a1
 .L${uniq}_01_peek_next:
 %  try_01_args_peek_next(z0=z0)  # z0 is zero if invoke has return value
@@ -792,7 +903,7 @@
 // - a1: this
 // Input
 // - vregs: F|E|D|C from dex
-%def slow_setup_args(shorty="", vregs="", z0="", z1="", z2="", z3="", z4="", z5="", arg_start="1", uniq=""):
+%def slow_setup_args(shorty="", vregs="", z0="", z1="", z2="", z3="", z4="", z5="", z6="", arg_start="1", uniq=""):
    srliw $z0, xINST, 12     // z0 := A
    li $z1, 5
    blt $z0, $z1, .L${uniq}_slow_gpr
@@ -809,13 +920,14 @@
    li $z2, 'D'              // double
    li $z3, 'F'              // float
    li $z4, 'J'              // long
+   li $z5, 'L'              // ref
    // linear scan through shorty: extract non-float vregs
 %  if arg_start == "0":  # static can place vC into a1; instance already loaded "this" into a1.
-%    load_vreg_in_gpr(gpr="a1", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_0")
-%  load_vreg_in_gpr(gpr="a2", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_1")
-%  load_vreg_in_gpr(gpr="a3", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_2")
-%  load_vreg_in_gpr(gpr="a4", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_3")
-%  load_vreg_in_gpr(gpr="a5", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_4")
+%    load_vreg_in_gpr(gpr="a1", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_0")
+%  load_vreg_in_gpr(gpr="a2", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_1")
+%  load_vreg_in_gpr(gpr="a3", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_2")
+%  load_vreg_in_gpr(gpr="a4", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_3")
+%  load_vreg_in_gpr(gpr="a5", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_4")
 
 .L${uniq}_slow_fpr:
    addi $z0, $shorty, 1     // z0 := first arg of shorty
@@ -832,44 +944,138 @@
 .L${uniq}_slow_done:
 
 
-// string-init variant
-%def slow_setup_args_string_init(shorty="", vregs="", z0="", z1="", z2="", z3="", z4="", z5="", uniq=""):
-   srliw $z0, xINST, 12     // z0 := A
-   li $z1, 5
-   blt $z0, $z1, .L${uniq}_slow_gpr
-   // A = 5: need vreg G
-   srliw $z1, xINST, 8      // z1 := A|G
-   andi $z1, $z1, 0xF       // z1 := G
-   slliw $z1, $z1, 16       // z1 := G0000
-   add $vregs, $z1, $vregs  // vregs := G|F|E|D|C
+// String-init variant: up to 4 args, no long/double/float args.
+// Ref args ('L') loaded with LW *must* apply ZEXT.W to avoid subtle address bugs.
+%def slow_setup_args_string_init(shorty="", v_fedc="", z0="", z1="", z2="", uniq=""):
+   srliw $z0, xINST, 12            // z0 := A; possible values 1-5
+   li $z1, 2
+   blt $z0, $z1, .L${uniq}_slow_1  // A = 1
+   li $z2, 'L'                     // z2 := ref type
+   beq $z0, $z1, .L${uniq}_slow_2  // A = 2
+   li $z1, 4
+   blt $z0, $z1, .L${uniq}_slow_3  // A = 3
+   beq $z0, $z1, .L${uniq}_slow_4  // A = 4
 
-.L${uniq}_slow_gpr:
-   addi $z0, $shorty, 1     // z0 := first arg of shorty
-   srliw $z1, $vregs, 4     // z1 := (instance) F|E|D or G|F|E|D
-   li $z2, 'D'              // double
-   li $z3, 'F'              // float
-   li $z4, 'J'              // long
+   // A = 5
+   srliw $z0, xINST, 8             // z0 := A|G
+   andi $z0, $z0, 0xF              // z0 := G
+   GET_VREG a4, $z0
+   lb $z1, 4($shorty)              // shorty RDEFG
+   bne $z1, $z2, .L${uniq}_slow_4
+   zext.w a4, a4
+.L${uniq}_slow_4:
+   srliw $z1, $v_fedc, 12          // z1 := F
+   GET_VREG a3, $z1
+   lb $z1, 3($shorty)              // shorty RDEF
+   bne $z1, $z2, .L${uniq}_slow_3
+   zext.w a3, a3
+.L${uniq}_slow_3:
+   srliw $z1, $v_fedc, 8           // z1 := F|E
+   andi $z1, $z1, 0xF              // z1 := E
+   GET_VREG a2, $z1
+   lb $z1, 2($shorty)              // shorty RDE
+   bne $z1, $z2, .L${uniq}_slow_2
+   zext.w a2, a2
+.L${uniq}_slow_2:
+   srliw $z1, $v_fedc, 4           // z1 := F|E|D
+   andi $z1, $z1, 0xF              // z1 := D
+   GET_VREG a1, $z1
+   lb $z1, 1($shorty)              // shorty RD
+   bne $z1, $z2, .L${uniq}_slow_1
+   zext.w a1, a1
+.L${uniq}_slow_1:
+   // "this" never read in string-init
+
+
+// Range and static-range variant.
+// Hardcoded
+// - (caller) xPC, xINST, xFP
+// - (callee) sp
+// Input
+// - vC: CCCC from dex
+%def slow_setup_args_range(shorty="", vC="", z0="", z1="", z2="", z3="", z4="", z5="", z6="", z7="", arg_start="1", uniq=""):
+   addi $z0, $shorty, 1       // z0 := first arg of shorty
+   addi $z1, $vC, $arg_start  // z1 := (instance) CCCC+1, (static) CCCC
+   mv $z2, zero               // z2 := is_out_stack_needed false
+   li $z3, 'D'                // double
+   li $z4, 'F'                // float
+   li $z5, 'J'                // long
+   li $z6, 'L'                // ref
+
    // linear scan through shorty: extract non-float vregs
-%  load_vreg_in_gpr(gpr="a1", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_0")
-%  load_vreg_in_gpr(gpr="a2", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_1")
-%  load_vreg_in_gpr(gpr="a3", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_2")
-%  load_vreg_in_gpr(gpr="a4", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_3")
+%  if arg_start == "0":  # static can place vCCCC into a1; instance already loaded "this" into a1.
+%    load_vreg_in_gpr_range(gpr="a1", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_1")
+%  load_vreg_in_gpr_range(gpr="a2", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_2")
+%  load_vreg_in_gpr_range(gpr="a3", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_3")
+%  load_vreg_in_gpr_range(gpr="a4", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_4")
+%  load_vreg_in_gpr_range(gpr="a5", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_5")
+%  load_vreg_in_gpr_range(gpr="a6", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_6")
+%  load_vreg_in_gpr_range(gpr="a7", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_7")
+%  is_out_stack_needed(needed=z2, shorty=z0, D=z3, F=z4, z0=z1, uniq=uniq)
 
-   // TODO: java.lang.StringFactory methods don't have floating point args; skip FPR loads.
 .L${uniq}_slow_fpr:
-   addi $z0, $shorty, 1     // z0 := first arg of shorty
-   srliw $z1, $vregs, 4     // z1 := (instance) F|E|D or G|F|E|D
+   addi $z0, $shorty, 1       // z0 := first arg of shorty
+   addi $z1, $vC, $arg_start  // z1 := (instance) CCCC+1, (static) CCCC
    // linear scan through shorty: extract float/double vregs
-%  load_vreg_in_fpr(fpr="fa0", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_done", uniq=f"{uniq}_0")
-%  load_vreg_in_fpr(fpr="fa1", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_done", uniq=f"{uniq}_1")
-%  load_vreg_in_fpr(fpr="fa2", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_done", uniq=f"{uniq}_2")
-%  load_vreg_in_fpr(fpr="fa3", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_done", uniq=f"{uniq}_3")
+%  load_vreg_in_fpr_range(fpr="fa0", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_0")
+%  load_vreg_in_fpr_range(fpr="fa1", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_1")
+%  load_vreg_in_fpr_range(fpr="fa2", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_2")
+%  load_vreg_in_fpr_range(fpr="fa3", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_3")
+%  load_vreg_in_fpr_range(fpr="fa4", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_4")
+%  load_vreg_in_fpr_range(fpr="fa5", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_5")
+%  load_vreg_in_fpr_range(fpr="fa6", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_6")
+%  load_vreg_in_fpr_range(fpr="fa7", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_7")
+%  is_out_stack_needed_float(needed=z2, shorty=z0, D=z3, F=z4, z0=z1, uniq=uniq)
+
+.L${uniq}_slow_stack:
+   beqz $z2, .L${uniq}_slow_done  // No stack outs needed, skip. Otherwise copy the remaining vregs to the out array.
+   addi $z0, sp, 8            // z0 := base addr of out array
+   sh2add $z1, $vC, xFP       // z1 := base addr of FP[CCCC]
+   srliw $z2, xINST, 8        // z2 := AA, vreg count
+   sh2add $z2, $z2, $z1       // z2 := loop guard, addr of one slot past top of xFP array
+%  copy_vregs_to_out(out=z0, fp=z1, fp_top=z2, z0=z3, uniq=uniq)
 .L${uniq}_slow_done:
 
 
+// String-init variant: up to 4 args, no long/float/double args.
+// Ref args ('L') loaded with LW *must* apply ZEXT.W to avoid subtle address bugs.
+%def slow_setup_args_string_init_range(shorty="", vC="", z0="", z1="", z2="", z3="", uniq=""):
+   srliw $z0, xINST, 8             // z0 := AA; possible values 1-5
+   li $z1, 2
+   blt $z0, $z1, .L${uniq}_slow_1  // AA = 1
+   sh2add $z2, $vC, xFP            // z2 := &fp[CCCC]
+   li $z3, 'L'                     // z3 := ref type
+   beq $z0, $z1, .L${uniq}_slow_2  // AA = 2
+   li $z1, 4
+   blt $z0, $z1, .L${uniq}_slow_3  // AA = 3
+   beq $z0, $z1, .L${uniq}_slow_4  // AA = 4
+   // AA = 5
+   lw a4, 4*4($z2)
+   lb $z1, 4($shorty)
+   bne $z1, $z3, .L${uniq}_slow_4
+   zext.w a4, a4
+.L${uniq}_slow_4:
+   lw a3, 3*4($z2)
+   lb $z1, 3($shorty)
+   bne $z1, $z3, .L${uniq}_slow_3
+   zext.w a3, a3
+.L${uniq}_slow_3:
+   lw a2, 2*4($z2)
+   lb $z1, 2($shorty)
+   bne $z1, $z3, .L${uniq}_slow_2
+   zext.w a2, a2
+.L${uniq}_slow_2:
+   lw a1, 1*4($z2)
+   lb $z1, 1($shorty)
+   bne $z1, $z3, .L${uniq}_slow_1
+   zext.w a1, a1
+.L${uniq}_slow_1:
+   // "this" never read in string-init
+
+
 // Iterate through 4-bit vreg ids in the "vregs" register, load a non-FP value
 // into one argument register.
-%def load_vreg_in_gpr(gpr="", shorty="", vregs="", z0="", D="", F="", J="", done="", uniq=""):
+%def load_vreg_in_gpr(gpr="", shorty="", vregs="", D="", F="", J="", L="", z0="", done="", uniq=""):
 .L${uniq}_gpr_find:
     lb $z0, ($shorty)         // z0 := next shorty arg spec
     addi $shorty, $shorty, 1  // increment char ptr
@@ -880,6 +1086,9 @@
     andi $gpr, $vregs, 0xF    // gpr := vreg id
     beq $z0, $J, .L${uniq}_gpr_load_8_bytes
     GET_VREG $gpr, $gpr       // gpr := 32-bit load
+    bne $z0, $L, .L${uniq}_gpr_load_common
+    zext.w $gpr, $gpr
+.L${uniq}_gpr_load_common:
     srliw $vregs, $vregs, 4   // shift out the processed arg, one vreg
     j .L${uniq}_gpr_set       // and exit
 .L${uniq}_gpr_load_8_bytes:
@@ -897,7 +1106,7 @@
 
 // Iterate through 4-bit vreg ids in the "vregs" register, load a float or double
 // value into one floating point argument register.
-%def load_vreg_in_fpr(fpr="", shorty="", vregs="", z0="", D="", F="", J="", done="", uniq=""):
+%def load_vreg_in_fpr(fpr="", shorty="", vregs="", D="", F="", J="", z0="", done="", uniq=""):
 .L${uniq}_fpr_find:
     lb $z0, ($shorty)         // z0 := next shorty arg spec
     addi $shorty, $shorty, 1  // increment char ptr
@@ -922,6 +1131,104 @@
 .L${uniq}_fpr_set:
 
 
+// Range variant
+%def load_vreg_in_gpr_range(gpr="", shorty="", idx="", D="", F="", J="", L="", z0="", done="", uniq=""):
+.L${uniq}_gpr_range_find:
+    lb $z0, ($shorty)           // z0 := next shorty arg
+    addi $shorty, $shorty, 1    // increment char ptr
+    beqz $z0, $done             // z0 == \0
+    beq $z0, $F, .L${uniq}_gpr_range_skip_1_vreg
+    beq $z0, $D, .L${uniq}_gpr_range_skip_2_vreg
+
+    beq $z0, $J, .L${uniq}_gpr_range_load_2_vreg
+    GET_VREG $gpr, $idx
+    bne $z0, $L, .L${uniq}_gpr_range_load_common
+    zext.w $gpr, $gpr
+.L${uniq}_gpr_range_load_common:
+    addi $idx, $idx, 1
+    j .L${uniq}_gpr_range_done
+.L${uniq}_gpr_range_load_2_vreg:
+    GET_VREG_WIDE $gpr, $idx
+    addi $idx, $idx, 2
+    j .L${uniq}_gpr_range_done
+
+.L${uniq}_gpr_range_skip_2_vreg:
+    addi $idx, $idx, 1
+.L${uniq}_gpr_range_skip_1_vreg:
+    addi $idx, $idx, 1
+    j .L${uniq}_gpr_range_find
+.L${uniq}_gpr_range_done:
+
+
+// Range variant.
+%def load_vreg_in_fpr_range(fpr="", shorty="", idx="", D="", F="", J="", z0="", done="", uniq=""):
+.L${uniq}_fpr_range_find:
+    lb $z0, ($shorty)         // z0 := next shorty arg
+    addi $shorty, $shorty, 1  // increment char ptr
+    beqz $z0, $done           // z0 == \0
+    beq $z0, $F, .L${uniq}_fpr_range_load_4_bytes
+    beq $z0, $D, .L${uniq}_fpr_range_load_8_bytes
+
+    addi $idx, $idx, 1        // increment idx
+    bne $z0, $J, .L${uniq}_fpr_range_find
+    addi $idx, $idx, 1        // increment once more for J
+    j .L${uniq}_fpr_range_find
+
+.L${uniq}_fpr_range_load_4_bytes:
+    mv $z0, $idx
+    GET_VREG_FLOAT $fpr, $z0
+    addi $idx, $idx, 1
+    j .L${uniq}_fpr_range_set
+.L${uniq}_fpr_range_load_8_bytes:
+    mv $z0, $idx
+    GET_VREG_DOUBLE $fpr, $z0
+    addi $idx, $idx, 2
+.L${uniq}_fpr_range_set:
+
+
+%def is_out_stack_needed(needed="", shorty="", D="", F="", z0="", uniq=""):
+.L${uniq}_scan_arg:
+   lb $z0, ($shorty)
+   addi $shorty, $shorty, 1
+   beqz $z0, .L${uniq}_scan_done
+   beq $z0, $F, .L${uniq}_scan_arg
+   beq $z0, $D, .L${uniq}_scan_arg
+   li $needed, 1
+.L${uniq}_scan_done:
+
+
+%def is_out_stack_needed_float(needed="", shorty="", D="", F="", z0="", uniq=""):
+   bnez $needed, .L${uniq}_scan_float_done
+.L${uniq}_scan_float_arg:
+   lb $z0, ($shorty)
+   addi $shorty, $shorty, 1
+   beqz $z0, .L${uniq}_scan_float_done
+   beq $z0, $F, .L${uniq}_scan_float_found
+   beq $z0, $D, .L${uniq}_scan_float_found
+   j .L${uniq}_scan_float_arg
+.L${uniq}_scan_float_found:
+   li $needed, 1
+.L${uniq}_scan_float_done:
+
+
+%def copy_vregs_to_out(out="", fp="", fp_top="", z0="", uniq=""):
+   sub $z0, $fp_top, $fp  // z0 := byte range
+   BRANCH_IF_BIT_CLEAR $z0, $z0, 2, .L${uniq}_copy_wide
+                          // branch if even count of slots
+   lwu $z0, ($fp)
+   sw $z0, ($out)
+   addi $fp, $fp, 4
+   addi $out, $out, 4
+.L${uniq}_copy_wide:
+   beq $fp, $fp_top, .L${uniq}_copy_done
+   ld $z0, ($fp)
+   sd $z0, ($out)
+   addi $fp, $fp, 8
+   addi $out, $out, 8
+   j .L${uniq}_copy_wide
+.L${uniq}_copy_done:
+
+
 // NterpToNterpInstance
 // a0: ArtMethod*
 // a1: this
@@ -931,9 +1238,8 @@
 
 // NterpToNterpStringInit
 // a0: ArtMethod*
-// a1: this
 %def nterp_to_nterp_string_init():
-%  nterp_to_nterp(how_vC="in_a1", uniq="n2n_string_init")
+%  nterp_to_nterp(how_vC="skip", uniq="n2n_string_init")
 
 
 // NterpToNterpStatic
@@ -944,22 +1250,23 @@
 
 // NterpToNterpInstanceRange
 %def nterp_to_nterp_instance_range():
-%  nterp_to_nterp_range()
+%  nterp_to_nterp(how_vC="in_a1", uniq="n2n_instance_range", range="Range")
 
 
 // NterpToNterpStringInitRange
 %def nterp_to_nterp_string_init_range():
-%  nterp_to_nterp_range()
+%  nterp_to_nterp(how_vC="skip", uniq="n2n_string_init_range", range="Range")
 
 
 // NterpToNterpStaticRange
 %def nterp_to_nterp_static_range():
-%  nterp_to_nterp_range()
+%  nterp_to_nterp(a1_instance=False, how_vC="load", uniq="n2n_static_range", range="Range")
 
 
 // helpers
 
-%def nterp_to_nterp(a1_instance=True, how_vC="", uniq=""):
+
+%def nterp_to_nterp(a1_instance=True, how_vC="", uniq="", range=""):
    .cfi_startproc
 %  setup_nterp_frame(cfi_refs="23", refs="s8", fp="s9", pc="s10", regs="s11", spills_sp="t0", z0="t1", z1="t2", z2="t3", z3="t4", uniq=uniq)
        // s8  := callee xREFS
@@ -968,7 +1275,11 @@
        // s11 := fp/refs vreg count
        // t0  := post-spills pre-frame sp (unused here)
        // sp  := post-frame callee sp
-%  n2n_arg_move(refs="s8", fp="s9", pc="s10", regs="s11", v_fedc="s7", z0="t0", z1="t1", z2="t2", z3="t3", a1_instance=a1_instance, how_vC=how_vC, uniq=uniq)
+%  if range == 'Range':
+%    n2n_arg_move_range(refs="s8", fp="s9", regs="s11", vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", a1_instance=a1_instance, how_vC=how_vC, uniq=uniq)
+%  else:
+%    n2n_arg_move(refs="s8", fp="s9", pc="s10", regs="s11", v_fedc="s7", z0="t0", z1="t1", z2="t2", z3="t3", a1_instance=a1_instance, how_vC=how_vC, uniq=uniq)
+%#:
    mv xREFS, s8
    mv xFP, s9
    mv xPC, s10
@@ -978,12 +1289,6 @@
    .cfi_endproc
 
 
-%def nterp_to_nterp_range(a1_instance=True, how_vC="", uniq=""):
-   .cfi_startproc
-   unimp
-   .cfi_endproc
-
-
 // See runtime/nterp_helpers.cc for a diagram of the setup.
 // Hardcoded
 // - a0 - ArtMethod*
@@ -1130,6 +1435,52 @@
 .L${uniq}_arg_done:
 
 
+%def n2n_arg_move_range(refs="", fp="", regs="", vC="", z0="", z1="", z2="", z3="", z4="", z5="", a1_instance=True, how_vC="", uniq=""):
+   srliw $z0, xINST, 8     // z0 := AA (arg count)
+
+%  if not a1_instance:
+     beqz $z0, .L${uniq}_arg_range_done
+%#:
+   // AA >= 1, iterator setup
+   sub $z4, $regs, $z0     // z4 := regs - AA; starting idx in fp and refs
+   sh2add $z1, $vC, xREFS  // z1 := addr of xREFS[CCCC]
+   sh2add $z2, $vC, xFP    // z2 := addr of xFP[CCCC]
+   sh2add $z3, $z4, $refs  // z3 := addr of refs[z4]
+   sh2add $z4, $z4, $fp    // z4 := addr of fp[z4]
+
+   BRANCH_IF_BIT_CLEAR $z0, $z0, 0, .L${uniq}_arg_range_copy_wide
+                           // branch if AA is even
+   // AA is odd: transfer one narrow slot first, with per-case shortcuts.
+%  if how_vC == "in_a1":
+     sw a1, ($z3)
+     sw a1, ($z4)
+%  elif how_vC == "skip":
+     // string init doesn't read "this"
+%  elif how_vC == "load":
+     lw $z0, ($z1)
+     lw $z5, ($z2)
+     sw $z0, ($z3)
+     sw $z5, ($z4)
+%#:
+   addi $z1, $z1, 4
+   addi $z2, $z2, 4
+   addi $z3, $z3, 4
+   addi $z4, $z4, 4
+.L${uniq}_arg_range_copy_wide:
+   // Even count of vreg slots, apply LD/SD.
+   beq $z3, $fp, .L${uniq}_arg_range_done  // terminate loop if refs[regs] == fp[0]
+   ld $z0, ($z1)
+   ld $z5, ($z2)
+   sd $z0, ($z3)
+   sd $z5, ($z4)
+   addi $z1, $z1, 8
+   addi $z2, $z2, 8
+   addi $z3, $z3, 8
+   addi $z4, $z4, 8
+   j .L${uniq}_arg_range_copy_wide
+.L${uniq}_arg_range_done:
+
+
 //
 // Nterp entry point helpers
 //
diff --git a/runtime/interpreter/mterp/riscv64/main.S b/runtime/interpreter/mterp/riscv64/main.S
index cc556ad..35f53a9 100644
--- a/runtime/interpreter/mterp/riscv64/main.S
+++ b/runtime/interpreter/mterp/riscv64/main.S
@@ -306,6 +306,7 @@
 .endm
 
 // Typed read, defaults to 32-bit read
+// Note: Incorrect for an object ref, which requires LWU, or LW followed by ZEXT.W.
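+// Illustrative correct forms for a ref read (sketch):
+//     LWU  t0, OFF(t1)                  // zero-extending load, or
+//     LW   t0, OFF(t1); ZEXT.W t0, t0   // load, then clear upper bits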
 // Clobbers: \reg
 // Safe if \reg == \vreg.
 .macro GET_VREG reg, vreg, is_wide=0
@@ -318,6 +319,7 @@
 .endm
 
 // Typed write, defaults to 32-bit write.
+// Note: Incorrect for an object ref, which requires a second SW into xREFS.
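+// Illustrative sketch: a ref write must SW the value into its xFP slot
+// and also SW it into the matching xREFS slot, keeping the GC's
+// reference array in sync.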
 // Clobbers: z0
 .macro SET_VREG reg, vreg, z0, is_wide=0
     .if \is_wide
@@ -514,28 +516,29 @@
    // We drained arg registers, so continue from caller stack's out array. Unlike the reference-only
    // fast-path, the continuation offset in the out array can vary, depending on the presence of
    // 64-bit values in the arg registers. \offset tracks this value as a byte offset.
-   addi t4, s9, (NTERP_SIZE_SAVE_CALLEE_SAVES + 8)
-                            // t4 := (caller) outs array base address, for here and fargs
-   add t0, t3, t0           // t0 := (callee) &FP[next]
+   addi t5, s9, (NTERP_SIZE_SAVE_CALLEE_SAVES + 8)
+                            // t5 := (caller) outs array base address
+   add t4, t3, t0           // t4 := (callee) &FP[next]
    add t1, t3, t1           // t1 := (callee) &REFS[next]
-   add t3, t3, t4           // t3 := (caller) &OUTS[next]
-%  store_outs_to_vregs(outs="t3", shorty="t2", fp="t0", refs="t1", z0="t5", z1="t6", D="s0", F="s4", J="s5", L="s8", next=".Lentry_fargs")
-
+   add t3, t3, t5           // t3 := (caller) &OUTS[next]
+%  store_outs_to_vregs(outs="t3", shorty="t2", fp="t4", refs="t1", z0="t5", z1="t6", D="s0", F="s4", J="s5", L="s8", next=".Lentry_fargs")
+                            // t0 = &xFP[a1], unclobbered
 .Lentry_fargs:
-   sh2add t0, s7, xFP       // t0 := &FP[a1]
    addi t1, s11, 1          // t1 := shorty arg (skip return type)
    slliw t2, s10, 2         // t2 := starting byte offset for fp/outs, static and instance
    // linear scan through shorty: extract float args
-%  store_fpr_to_vreg(fpr="fa0", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-%  store_fpr_to_vreg(fpr="fa1", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-%  store_fpr_to_vreg(fpr="fa2", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-%  store_fpr_to_vreg(fpr="fa3", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-%  store_fpr_to_vreg(fpr="fa4", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-%  store_fpr_to_vreg(fpr="fa5", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-%  store_fpr_to_vreg(fpr="fa6", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-%  store_fpr_to_vreg(fpr="fa7", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
+%  store_fpr_to_vreg(fpr="fa0", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+%  store_fpr_to_vreg(fpr="fa1", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+%  store_fpr_to_vreg(fpr="fa2", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+%  store_fpr_to_vreg(fpr="fa3", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+%  store_fpr_to_vreg(fpr="fa4", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+%  store_fpr_to_vreg(fpr="fa5", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+%  store_fpr_to_vreg(fpr="fa6", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+%  store_fpr_to_vreg(fpr="fa7", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+   addi t3, s9, (NTERP_SIZE_SAVE_CALLEE_SAVES + 8)
+                            // t3 := (caller) outs array base address
    add t0, t2, t0           // t0 := (callee) &FP[next]
-   add t2, t2, t4           // t2 := (caller) &OUTS[next]
+   add t2, t2, t3           // t2 := (caller) &OUTS[next]
 %  store_float_outs_to_vregs(outs="t2", shorty="t1", fp="t0", z0="t3", D="s0", F="s4", J="s5", next=".Lentry_go")
 
 .Lentry_go:
@@ -613,10 +616,10 @@
 %  nterp_to_nterp_static()
 NterpToNterpInstanceRange:
 %  nterp_to_nterp_instance_range()
-NterpToNterpStaticRange:
-%  nterp_to_nterp_static_range()
 NterpToNterpStringInitRange:
 %  nterp_to_nterp_string_init_range()
+NterpToNterpStaticRange:
+%  nterp_to_nterp_static_range()
 
 NAME_END nterp_helper
 
diff --git a/runtime/nterp_helpers.cc b/runtime/nterp_helpers.cc
index f411e73..83057f8 100644
--- a/runtime/nterp_helpers.cc
+++ b/runtime/nterp_helpers.cc
@@ -279,6 +279,11 @@
         case Instruction::INVOKE_DIRECT:
         case Instruction::INVOKE_STATIC:
         case Instruction::INVOKE_INTERFACE:
+        case Instruction::INVOKE_VIRTUAL_RANGE:
+        case Instruction::INVOKE_SUPER_RANGE:
+        case Instruction::INVOKE_DIRECT_RANGE:
+        case Instruction::INVOKE_STATIC_RANGE:
+        case Instruction::INVOKE_INTERFACE_RANGE:
         case Instruction::NEG_INT:
         case Instruction::NOT_INT:
         case Instruction::NEG_LONG: