riscv64: invoke opcodes, range variants
Also hotfix for slow path 'L' ref loads: clear upper bits for correctness.
Also disable the 0/1-arg fastpath while rethinking the approach for 'L' ref loads versus the cost of fetching the shorty.
See invoke.S for notes on code structure.
Test: Run these opcodes against all interpreter
tests on a Linux RISC-V VM.
(1) setup
lunch aosp_riscv64-trunk-userdebug
export ART_TEST_SSH_USER=ubuntu
export ART_TEST_SSH_HOST=localhost
export ART_TEST_SSH_PORT=10001
export ART_TEST_ON_VM=true
. art/tools/buildbot-utils.sh
art/tools/buildbot-build.sh --target
# Create, boot and configure the VM.
art/tools/buildbot-vm.sh create
art/tools/buildbot-vm.sh boot
art/tools/buildbot-vm.sh setup-ssh # password: 'ubuntu'
art/tools/buildbot-cleanup-device.sh
art/tools/buildbot-setup-device.sh
art/tools/buildbot-sync.sh
(2) test
art/test.py --target -r --no-prebuild --ndebug --64 -j 12 --cdex-none --interpreter
Clean with `m check_cfi`.
Bug: 283082047
Change-Id: I20426fd1e7b397f7fce51a1f43661056d5b8e844
diff --git a/runtime/interpreter/mterp/riscv64/invoke.S b/runtime/interpreter/mterp/riscv64/invoke.S
index e93cc9f..569b750 100644
--- a/runtime/interpreter/mterp/riscv64/invoke.S
+++ b/runtime/interpreter/mterp/riscv64/invoke.S
@@ -126,7 +126,7 @@
and t0, a0, 0x1 // t0 := string-init bit
beqz t0, 1b // not string init
and a0, a0, ~0x1 // clear string-init bit
- tail NterpInvokeStringInit // args a0, a1, s7
+ tail NterpInvokeStringInit${range} // args a0, s7
3:
tail common_errNullObject
@@ -379,25 +379,28 @@
// NterpInvokeDirect
// a0: ArtMethod*
// a1: this
-// s7: vreg ids F|E|D|C
-%def nterp_invoke_direct(uniq="invoke_direct"):
+// s7: (regular) vreg ids F|E|D|C, (range) vreg id CCCC
+%def nterp_invoke_direct(uniq="invoke_direct", range=""):
ld s8, ART_METHOD_QUICK_CODE_OFFSET_64(a0)
// s8 := quick code
% try_nterp(quick="s8", z0="t0", skip=f".L{uniq}_simple")
- call NterpToNterpInstance // args a0, a1
+ call NterpToNterpInstance${range} // args a0, a1
j .L${uniq}_next_op
.L${uniq}_simple:
- srliw t0, xINST, 12 // t0 := A
-% try_simple_args(ins="t0", v_fedc="s7", z0="t1", skip=f".L{uniq}_01", uniq=uniq)
- // a2, a3, a4, a5 := fp[D], fp[E], fp[F], fp[G]
- jalr s8 // args a0 - a5
+% if range == 'Range':
+% try_simple_args_range(vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", skip=f".L{uniq}_01", uniq=uniq)
+% else:
+% try_simple_args(v_fedc="s7", z0="t0", z1="t1", skip=f".L{uniq}_01", uniq=uniq)
+%#:
+ jalr s8 // (regular) args a0 - a5, (range) args a0 - a7 and stack
j .L${uniq}_next_op
.L${uniq}_01:
+ j .L${uniq}_slow // TODO fix this fastpath
mv s9, zero // initialize shorty reg
-% try_01_args(ins="t0", v_fedc="s7", z0="t1", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq)
- // a2, fa0 := fp[D], maybe
+% try_01_args(vreg="s7", z0="t1", z1="t2", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq, range=range)
+ // a2, fa0 := (regular) fp[D] or (range) fp[CCCC + 1], maybe
// Return value expected. Get shorty, stash in callee-save to be available on return.
// When getting shorty, stash this fast path's arg registers then restore.
// Unconditionally restores a2/fa0, even if extra arg not found.
@@ -414,7 +417,11 @@
.L${uniq}_slow:
% get_shorty_save_a0_a1(shorty="s9", y0="s10", y1="s11")
-% slow_setup_args(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", uniq=uniq)
+% if range == 'Range':
+% slow_setup_args_range(shorty="s9", vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", z7="s10", uniq=uniq)
+% else:
+% slow_setup_args(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", uniq=uniq)
+%#:
jalr s8 // args in a0-a5, fa0-fa4
% maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_1")
// a0 := fa0 if float return
@@ -426,23 +433,27 @@
// NterpInvokeStringInit
// a0: ArtMethod*
-// a1: this
-// s7: vreg ids F|E|D|C
-%def nterp_invoke_string_init(uniq="invoke_string_init"):
+// s7: (regular) vreg ids F|E|D|C, (range) vreg id CCCC
+%def nterp_invoke_string_init(uniq="invoke_string_init", range=""):
ld s8, ART_METHOD_QUICK_CODE_OFFSET_64(a0)
- // s8 := quick code
+ // s8 := quick code
% try_nterp(quick="s8", z0="t0", skip=f".L{uniq}_slow")
- call NterpToNterpStringInit // args a0, a1
+ call NterpToNterpStringInit${range} // arg a0
j .L${uniq}_next_op
.L${uniq}_slow:
% get_shorty_save_a0_a1(shorty="s9", y0="s10", y1="s11")
-% slow_setup_args_string_init(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", uniq=uniq)
- mv s9, a1 // save "this" in callee-save for return-time fixup
- jalr s8 // args in a0-a5, fa0-fa4
+% if range == 'Range':
+% slow_setup_args_string_init_range(shorty="s9", vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", uniq=uniq)
+% else:
+% slow_setup_args_string_init(shorty="s9", v_fedc="s7", z0="t0", z1="t1", z2="t2", uniq=uniq)
+%#:
+ jalr s8 // args (regular) a0 - a4, (range) a0 - a4
.L${uniq}_next_op:
-% subst_vreg_references(old="s9", new="a0", z0="t0", z1="t1", z2="t2", uniq=uniq)
+% fetch_receiver(reg="t0", vreg="s7", range=range)
+ // t0 := fp[C] (this)
+% subst_vreg_references(old="t0", new="a0", z0="t1", z1="t2", z2="t3", uniq=uniq)
FETCH_ADVANCE_INST 3
GET_INST_OPCODE t0
GOTO_OPCODE t0
@@ -450,43 +461,50 @@
// NterpInvokeStatic
// a0: ArtMethod*
-// s7: vreg ids F|E|D|C
-%def nterp_invoke_static(uniq="invoke_static"):
+// s7: (regular) vreg ids F|E|D|C, (range) vreg id CCCC
+%def nterp_invoke_static(uniq="invoke_static", range=""):
ld s8, ART_METHOD_QUICK_CODE_OFFSET_64(a0)
- // s8 := quick code
+ // s8 := quick code
% try_nterp(quick="s8", z0="t0", skip=f".L{uniq}_simple")
- call NterpToNterpStatic // arg a0
+ call NterpToNterpStatic${range} // arg a0
j .L${uniq}_next_op
.L${uniq}_simple:
- srliw t0, xINST, 12 // t0 := A
-% try_simple_args_static(ins="t0", v_fedc="s7", z0="t1", skip=f".L{uniq}_01", uniq=uniq)
- // a1, a2, a3, a4, a5 := fp[C], fp[D], fp[E], fp[F], fp[G]
- jalr s8 // args a0 - a5
+% if range == 'Range':
+% try_simple_args_range(vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", arg_start="0", skip=f".L{uniq}_01", uniq=uniq)
+% else:
+% try_simple_args(v_fedc="s7", z0="t0", z1="t1", arg_start="0", skip=f".L{uniq}_01", uniq=uniq)
+%#:
+ jalr s8 // args (regular) a0 - a5, (range) a0 - a7 and maybe stack
j .L${uniq}_next_op
.L${uniq}_01:
- mv s9, zero // initialize shorty reg
-% try_01_args_static(ins="t0", v_fedc="s7", z0="t1", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq)
- // a1, fa0 := fp[C], maybe
+ j .L${uniq}_slow // TODO fix this fastpath
+ mv s9, zero // initialize shorty reg
+% try_01_args_static(vreg="s7", z0="t0", z1="t1", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq, range=range)
+ // a1, fa0 := (regular) fp[C] or (range) fp[CCCC], maybe
// Return value expected. Get shorty, stash in callee-save to be available on return.
// When getting shorty, stash this fast path's arg registers then restore.
// Unconditionally restores a1/fa0, even if extra arg not found.
% get_shorty_save_a0_a1(shorty="s9", y0="s10", y1="s11")
fmv.w.x fa0, s11
.L${uniq}_01_call:
- jalr s8 // args a0, and maybe a1, fa0
- beqz s9, .L${uniq}_next_op // no shorty, no return value
+ jalr s8 // args a0, and maybe a1, fa0
+ beqz s9, .L${uniq}_next_op // no shorty, no return value
% maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_0")
- // a0 := fa0 if float return
+ // a0 := fa0 if float return
j .L${uniq}_next_op
.L${uniq}_slow:
% get_shorty_save_a0(shorty="s9", y0="s10")
-% slow_setup_args(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", arg_start="0", uniq=uniq)
- jalr s8 // args in a0-a5, fa0-fa4
+% if range == 'Range':
+% slow_setup_args_range(shorty="s9", vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", z7="s10", arg_start="0", uniq=uniq)
+% else:
+% slow_setup_args(shorty="s9", vregs="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", arg_start="0", uniq=uniq)
+%#:
+ jalr s8 // args (regular) a0 - a5 and fa0 - fa4, (range) a0 - a7 and fa0 - fa7 and maybe stack
% maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_1")
- // a0 := fa0 if float return
+ // a0 := fa0 if float return
.L${uniq}_next_op:
FETCH_ADVANCE_INST 3
GET_INST_OPCODE t0
@@ -499,44 +517,57 @@
// s7: vreg ids F|E|D|C
// t0: the target interface method
// - ignored in nterp-to-nterp transfer
-// - side-loaded into T0 as a "hidden argument" in managed ABI transfer
-%def nterp_invoke_interface(uniq="invoke_interface"):
+// - preserved through shorty calls
+// - side-loaded as a "hidden argument" in managed ABI transfer
+%def nterp_invoke_interface(uniq="invoke_interface", range=""):
ld s8, ART_METHOD_QUICK_CODE_OFFSET_64(a0)
- // s8 := quick code
+ // s8 := quick code
% try_nterp(quick="s8", z0="t1", skip=f".L{uniq}_simple")
- call NterpToNterpInstance // args a0, a1
+ call NterpToNterpInstance${range} // args a0, a1
j .L${uniq}_next_op
.L${uniq}_simple:
- srliw t1, xINST, 12 // t1 := A
-% try_simple_args(ins="t1", v_fedc="s7", z0="t2", skip=f".L{uniq}_01", uniq=uniq)
- // a2, a3, a4, a5 := fp[D], fp[E], fp[F], fp[G]
- jalr s8 // args a0 - a5, and t0
+% if range == 'Range':
+% try_simple_args_range(vC="s7", z0="t1", z1="t2", z2="t3", z3="t4", z4="t5", skip=f".L{uniq}_01", uniq=uniq)
+% else:
+% try_simple_args(v_fedc="s7", z0="t1", z1="t2", skip=f".L{uniq}_01", uniq=uniq)
+%#:
+ jalr s8 // args (regular) a0 - a5 and t0, (range) a0 - a7 and t0 and maybe stack
j .L${uniq}_next_op
.L${uniq}_01:
- mv s9, zero // initialize shorty reg
-% try_01_args(ins="t1", v_fedc="s7", z0="t2", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq)
+ j .L${uniq}_slow // TODO fix this fastpath
+ mv s9, zero // initialize shorty reg
+% try_01_args(vreg="s7", z0="t1", z1="t2", skip=f".L{uniq}_slow", call=f".L{uniq}_01_call", uniq=uniq, range=range)
+ // a2, fa0 := (regular) fp[D] or (range) fp[CCCC + 1], maybe
// Return value expected. Get shorty, stash in callee-save to be available on return.
// When getting shorty, stash this fast path's arg registers then restore.
// Unconditionally stores a2/fa0, even if extra arg not found.
- mv s0, a2 // skip fa0, bitwise equiv to a2
+ mv s7, a2 // skip fa0, bitwise equiv to a2. vreg in s7 no longer needed.
+ mv s0, t0
% get_shorty_for_interface_save_a0_a1(shorty="s9", y0="s10", y1="s11")
- mv a2, s0
- fmv.w.x fa0, s0
+ mv t0, s0
+ mv a2, s7
+ fmv.w.x fa0, s7
.L${uniq}_01_call:
- jalr s8 // args a0, a1, and t0, and maybe a2, fa0
- beqz s9, .L${uniq}_next_op // no shorty, no return value
+ jalr s8 // args a0, a1, and t0, and maybe a2, fa0
+ beqz s9, .L${uniq}_next_op // no shorty, no return value
% maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_0")
- // a0 := fa0 if float return
+ // a0 := fa0 if float return
j .L${uniq}_next_op
.L${uniq}_slow:
+ mv s0, t0
% get_shorty_for_interface_save_a0_a1(shorty="s9", y0="s10", y1="s11")
-% slow_setup_args(shorty="s9", vregs="s7", z0="t1", z1="t2", z2="t3", z3="t4", z4="t5", z5="t6", uniq=uniq)
- jalr s8 // args a0-a5, fa0-fa4, and t0
+ mv t0, s0
+% if range == 'Range':
+% slow_setup_args_range(shorty="s9", vC="s7", z0="s10", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", z7="s11", uniq=uniq)
+% else:
+% slow_setup_args(shorty="s9", vregs="s7", z0="s10", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", z6="t6", uniq=uniq)
+%#:
+ jalr s8 // args (regular) a0 - a5, fa0 - fa4, t0, (range) a0 - a7, fa0 - fa7, t0
% maybe_float_returned(shorty="s9", z0="t0", z1="t1", uniq=f"{uniq}_1")
- // a0 := fa0 if float return
+ // a0 := fa0 if float return
.L${uniq}_next_op:
FETCH_ADVANCE_INST 3
GET_INST_OPCODE t0
@@ -554,38 +585,54 @@
// NterpInvokeVirtualRange
+// a0: ArtMethod*
+// a1: this
+// s7: vreg id CCCC
%def nterp_invoke_virtual_range():
-% nterp_invoke_direct_range(uniq="invoke_virtual_range")
+% nterp_invoke_direct(uniq="invoke_virtual_range", range="Range")
// NterpInvokeSuperRange
+// a0: ArtMethod*
+// a1: this
+// s7: vreg id CCCC
%def nterp_invoke_super_range():
-% nterp_invoke_direct_range(uniq="invoke_super_range")
+% nterp_invoke_direct(uniq="invoke_super_range", range="Range")
// NterpInvokeDirectRange
-%def nterp_invoke_direct_range(uniq="invoke_direct_range"):
- unimp
+// Hardcoded:
+// a0: ArtMethod*
+// a1: this
+// s7: vreg id CCCC
+%def nterp_invoke_direct_range():
+% nterp_invoke_direct(uniq="invoke_direct_range", range="Range")
// NterpInvokeStringInitRange
-%def nterp_invoke_string_init_range(uniq="invoke_string_init_range"):
- unimp
+// a0: ArtMethod*
+// s7: vreg id CCCC
+%def nterp_invoke_string_init_range():
+% nterp_invoke_string_init(uniq="invoke_string_init_range", range="Range")
// NterpInvokeStaticRange
-%def nterp_invoke_static_range(uniq="invoke_static_range"):
- unimp
+// a0: ArtMethod*
+// s7: vreg id CCCC
+%def nterp_invoke_static_range():
+% nterp_invoke_static(uniq="invoke_static_range", range="Range")
// NterpInvokeInterfaceRange
// a0: ArtMethod*
// a1: this
-// a2: the target interface method
+// s7: vreg id CCCC
+// t0: the target interface method
// - ignored in nterp-to-nterp transfer
-// - side-loaded into T0 as a "hidden argument" in managed ABI transfer
-%def nterp_invoke_interface_range(uniq="invoke_interface_range"):
- unimp
+// - preserved through shorty calls
+// - side-loaded as a "hidden argument" in managed ABI transfer
+%def nterp_invoke_interface_range():
+% nterp_invoke_interface(uniq="invoke_interface_range", range="Range")
// NterpInvokePolymorphicRange
@@ -611,82 +658,137 @@
// Hardcoded
// - a0: ArtMethod*
+// - xINST
// Input
-// - ins: arg count
// - v_fedc: vreg ids F|E|D|C
-// Temporaries: z0
-%def try_simple_args(ins="", v_fedc="", z0="", skip="", uniq=""):
+// Temporaries: z0, z1
+%def try_simple_args(v_fedc="", z0="", z1="", arg_start="1", skip="", uniq=""):
lwu $z0, ART_METHOD_ACCESS_FLAGS_OFFSET(a0)
BRANCH_IF_BIT_CLEAR $z0, $z0, ART_METHOD_NTERP_INVOKE_FAST_PATH_FLAG_BIT, $skip
- li $z0, 2
- blt $ins, $z0, .L${uniq}_simple_done // A = 1: no further args.
- beq $ins, $z0, .L${uniq}_simple_2 // A = 2
- li $z0, 4
- blt $ins, $z0, .L${uniq}_simple_3 // A = 3
- beq $ins, $z0, .L${uniq}_simple_4 // A = 4
- // A = 5
- srliw $z0, xINST, 8 // z0 := A|G
- andi $z0, $z0, 0xF // z0 := G
- GET_VREG a5, $z0
-.L${uniq}_simple_4:
- srliw $z0, $v_fedc, 12 // z0 := F
- GET_VREG a4, $z0
-.L${uniq}_simple_3:
- srliw $z0, $v_fedc, 8 // z0 := F|E
- andi $z0, $z0, 0xF // z0 := E
- GET_VREG a3, $z0
-.L${uniq}_simple_2:
- srliw $z0, $v_fedc, 4 // z0 := F|E|D
- andi $z0, $z0, 0xF // z0 := D
- GET_VREG a2, $z0
-.L${uniq}_simple_done:
- // a1 already set to "this"
-
-// Static variant.
-%def try_simple_args_static(ins="", v_fedc="", z0="", skip="", uniq=""):
- lwu $z0, ART_METHOD_ACCESS_FLAGS_OFFSET(a0)
- BRANCH_IF_BIT_CLEAR $z0, $z0, ART_METHOD_NTERP_INVOKE_FAST_PATH_FLAG_BIT, $skip
- beqz $ins, .L${uniq}_simple_done // A = 0: no further args.
- li $z0, 2
- blt $ins, $z0, .L${uniq}_simple_1 // A = 1
- beq $ins, $z0, .L${uniq}_simple_2 // A = 2
- li $z0, 4
- blt $ins, $z0, .L${uniq}_simple_3 // A = 3
- beq $ins, $z0, .L${uniq}_simple_4 // A = 4
+ srliw $z0, xINST, 12 // z0 := A
+% if arg_start == "0":
+ beqz $z0, .L${uniq}_simple_done // A = 0: no further args.
+%#:
+ li $z1, 2
+ blt $z0, $z1, .L${uniq}_simple_1 // A = 1
+ beq $z0, $z1, .L${uniq}_simple_2 // A = 2
+ li $z1, 4
+ blt $z0, $z1, .L${uniq}_simple_3 // A = 3
+ beq $z0, $z1, .L${uniq}_simple_4 // A = 4
// A = 5
- srliw $z0, xINST, 8 // z0 := A|G
- andi $z0, $z0, 0xF // z0 := G
- GET_VREG a5, $z0
+ srliw $z1, xINST, 8 // z1 := A|G
+ andi $z1, $z1, 0xF // z1 := G
+ GET_VREG a5, $z1
.L${uniq}_simple_4:
- srliw $z0, $v_fedc, 12 // z0 := F
- GET_VREG a4, $z0
+ srliw $z1, $v_fedc, 12 // z1 := F
+ GET_VREG a4, $z1
.L${uniq}_simple_3:
- srliw $z0, $v_fedc , 8 // z0 := F|E
- andi $z0, $z0, 0xF // z0 := E
- GET_VREG a3, $z0
+ srliw $z1, $v_fedc, 8 // z1 := F|E
+ andi $z1, $z1, 0xF // z1 := E
+ GET_VREG a3, $z1
.L${uniq}_simple_2:
- srliw $z0, $v_fedc, 4 // z0 := F|E|D
- andi $z0, $z0, 0xF // z0 := D
- GET_VREG a2, $z0
+ srliw $z1, $v_fedc, 4 // z1 := F|E|D
+ andi $z1, $z1, 0xF // z1 := D
+ GET_VREG a2, $z1
.L${uniq}_simple_1:
- andi $z0, $v_fedc, 0xF // z0 := C
- GET_VREG a1, $z0
+% if arg_start == "0":
+ andi $z1, $v_fedc, 0xF // z1 := C
+ GET_VREG a1, $z1
+ // instance: a1 already set to "this"
+.L${uniq}_simple_done:
+
+
+// Range variant.
+%def try_simple_args_range(vC="", z0="", z1="", z2="", z3="", z4="", skip="", arg_start="1", uniq=""):
+ lwu $z0, ART_METHOD_ACCESS_FLAGS_OFFSET(a0)
+ BRANCH_IF_BIT_CLEAR $z0, $z0, ART_METHOD_NTERP_INVOKE_FAST_PATH_FLAG_BIT, $skip
+
+ srliw $z0, xINST, 8 // z0 := AA
+% if arg_start == "0": # static:
+ beqz $z0, .L${uniq}_simple_done // AA = 0: no further args.
+ sh2add $z1, $vC, xFP // z1 := &FP[CCCC]
+ li $z2, 2
+ blt $z0, $z2, .L${uniq}_simple_1 // AA = 1
+% else: # instance:
+ li $z2, 2
+ blt $z0, $z2, .L${uniq}_simple_done // AA = 1, and a1 already loaded.
+ sh2add $z1, $vC, xFP // z1 := &FP[CCCC]
+%#:
+ // Here: z0, z1, z2 same values for static vs instance.
+ beq $z0, $z2, .L${uniq}_simple_2 // AA = 2
+ li $z2, 4
+ blt $z0, $z2, .L${uniq}_simple_3 // AA = 3
+ beq $z0, $z2, .L${uniq}_simple_4 // AA = 4
+ li $z2, 6
+ blt $z0, $z2, .L${uniq}_simple_5 // AA = 5
+ beq $z0, $z2, .L${uniq}_simple_6 // AA = 6
+ li $z2, 7
+ beq $z0, $z2, .L${uniq}_simple_7 // AA = 7
+
+ // AA >= 8: store in stack. Load/store from FP[CCCC + 7] upwards.
+ slli $z2, $z0, 63 // z2 := negative if z0 bit #0 is set (odd)
+ sh2add $z0, $z0, $z1 // z0 := loop guard at top of stack
+ addi $z3, $z1, 7*4 // z3 := &FP[CCCC + 7]
+ addi $z4, sp, __SIZEOF_POINTER__ + 7*4
+ // z4 := &OUT[CCCC + 7]
+ bltz $z2, .L${uniq}_simple_loop_wide
+ // if AA odd, branch to wide-copy
+ lw $z2, ($z3)
+ sw $z2, ($z4)
+ addi $z3, $z3, 4
+ addi $z4, $z4, 4
+
+.L${uniq}_simple_loop_wide:
+ // TODO: Consider ensuring 64-bit stores are aligned.
+ beq $z3, $z0, .L${uniq}_simple_7
+ ld $z2, ($z3)
+ sd $z2, ($z4)
+ addi $z3, $z3, 8
+ addi $z4, $z4, 8
+ j .L${uniq}_simple_loop_wide
+
+ // Bottom 7 slots of OUT array never written; first args are passed with a1-a7.
+.L${uniq}_simple_7:
+ lw a7, 6*4($z1)
+.L${uniq}_simple_6:
+ lw a6, 5*4($z1)
+.L${uniq}_simple_5:
+ lw a5, 4*4($z1)
+.L${uniq}_simple_4:
+ lw a4, 3*4($z1)
+.L${uniq}_simple_3:
+ lw a3, 2*4($z1)
+.L${uniq}_simple_2:
+ lw a2, 1*4($z1)
+.L${uniq}_simple_1:
+% if arg_start == "0": # static:
+ lw a1, 0*4($z1)
+%#:
.L${uniq}_simple_done:
// Check if a 0/1 arg invoke form is possible, set up a2 and fa0 if needed.
// If a return value expected, move possible float return to a0.
// zN are temporaries
-// yN are callee-saved
-%def try_01_args(ins="", v_fedc="", z0="", skip="", call="", uniq=""):
- li $z0, 2 // z0 := 2
- blt $ins, $z0, .L${uniq}_01_peek_next // A = 1
- bgt $ins, $z0, $skip // A >= 3
- // A = 2: this, plus one arg
- srliw $z0, $v_fedc, 4 // z0 := F|E|D
- andi $z0, $z0, 0xF // z0 := D
- GET_VREG a2, $z0
+%def try_01_args(vreg="", z0="", z1="", skip="", call="", uniq="", range=""):
+% if range == 'Range':
+ srliw $z0, xINST, 8 // z0 := AA
+% else:
+ srliw $z0, xINST, 12 // z0 := A
+%#:
+ li $z1, 2 // z1 := 2
+ blt $z0, $z1, .L${uniq}_01_peek_next // ins = 1
+ bgt $z0, $z1, $skip // ins >= 3
+ // ins = 2: this, plus one arg
+% if range == 'Range':
+ addi $z1, $vreg, 1 // z1 := CCCC + 1
+ GET_VREG a2, $z1 // a2 := fp[CCCC + 1]
+% else:
+ srliw $z1, $vreg, 4 // z1 := F|E|D
+ andi $z1, $z1, 0xF // z1 := D
+ GET_VREG a2, $z1 // a2 := fp[D]
+%#:
fmv.w.x fa0, a2
.L${uniq}_01_peek_next:
% try_01_args_peek_next(z0=z0) # z0 is zero if invoke has return value
@@ -694,13 +796,22 @@
// Static variant.
-%def try_01_args_static(ins="", v_fedc="", z0="", skip="", call="", uniq=""):
- beqz $ins, .L${uniq}_01_peek_next // A = 0
- li $z0, 1 // z0 := imm 1
- bgt $ins, $z0, $skip // A >= 2
- // A = 1
- andi $z0, $v_fedc, 0xF // z0 := C
- GET_VREG a1, $z0
+%def try_01_args_static(vreg="", z0="", z1="", skip="", call="", uniq="", range=""):
+% if range == 'Range':
+ srliw $z0, xINST, 8 // z0 := AA
+% else:
+ srliw $z0, xINST, 12 // z0 := A
+%#:
+ beqz $z0, .L${uniq}_01_peek_next // ins = 0
+ li $z1, 1 // z1 := 1
+ bgt $z0, $z1, $skip // ins >= 2
+ // ins = 1: one arg
+% if range == 'Range':
+ GET_VREG a1, $vreg // a1 := fp[CCCC]
+% else:
+ andi $z1, $vreg, 0xF // z1 := C
+ GET_VREG a1, $z1 // a1 := fp[C]
+%#:
fmv.w.x fa0, a1
.L${uniq}_01_peek_next:
% try_01_args_peek_next(z0=z0) # z0 is zero if invoke has return value
@@ -792,7 +903,7 @@
// - a1: this
// Input
// - vregs: F|E|D|C from dex
-%def slow_setup_args(shorty="", vregs="", z0="", z1="", z2="", z3="", z4="", z5="", arg_start="1", uniq=""):
+%def slow_setup_args(shorty="", vregs="", z0="", z1="", z2="", z3="", z4="", z5="", z6="", arg_start="1", uniq=""):
srliw $z0, xINST, 12 // z0 := A
li $z1, 5
blt $z0, $z1, .L${uniq}_slow_gpr
@@ -809,13 +920,14 @@
li $z2, 'D' // double
li $z3, 'F' // float
li $z4, 'J' // long
+ li $z5, 'L' // ref
// linear scan through shorty: extract non-float vregs
% if arg_start == "0": # static can place vC into a1; instance already loaded "this" into a1.
-% load_vreg_in_gpr(gpr="a1", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_0")
-% load_vreg_in_gpr(gpr="a2", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_1")
-% load_vreg_in_gpr(gpr="a3", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_2")
-% load_vreg_in_gpr(gpr="a4", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_3")
-% load_vreg_in_gpr(gpr="a5", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_4")
+% load_vreg_in_gpr(gpr="a1", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_0")
+% load_vreg_in_gpr(gpr="a2", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_1")
+% load_vreg_in_gpr(gpr="a3", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_2")
+% load_vreg_in_gpr(gpr="a4", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_3")
+% load_vreg_in_gpr(gpr="a5", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, L=z5, z0=z6, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_4")
.L${uniq}_slow_fpr:
addi $z0, $shorty, 1 // z0 := first arg of shorty
@@ -832,44 +944,138 @@
.L${uniq}_slow_done:
-// string-init variant
-%def slow_setup_args_string_init(shorty="", vregs="", z0="", z1="", z2="", z3="", z4="", z5="", uniq=""):
- srliw $z0, xINST, 12 // z0 := A
- li $z1, 5
- blt $z0, $z1, .L${uniq}_slow_gpr
- // A = 5: need vreg G
- srliw $z1, xINST, 8 // z1 := A|G
- andi $z1, $z1, 0xF // z1 := G
- slliw $z1, $z1, 16 // z1 := G0000
- add $vregs, $z1, $vregs // vregs := G|F|E|D|C
+// String-init variant: up to 4 args, no long/double/float args.
+// Ref args ('L') loaded with LW *must* apply ZEXT.W to avoid subtle address bugs.
+%def slow_setup_args_string_init(shorty="", v_fedc="", z0="", z1="", z2="", uniq=""):
+ srliw $z0, xINST, 12 // z0 := A; possible values 1-5
+ li $z1, 2
+ blt $z0, $z1, .L${uniq}_slow_1 // A = 1
+ li $z2, 'L' // z2 := ref type
+ beq $z0, $z1, .L${uniq}_slow_2 // A = 2
+ li $z1, 4
+ blt $z0, $z1, .L${uniq}_slow_3 // A = 3
+ beq $z0, $z1, .L${uniq}_slow_4 // A = 4
-.L${uniq}_slow_gpr:
- addi $z0, $shorty, 1 // z0 := first arg of shorty
- srliw $z1, $vregs, 4 // z1 := (instance) F|E|D or G|F|E|D
- li $z2, 'D' // double
- li $z3, 'F' // float
- li $z4, 'J' // long
+ // A = 5
+ srliw $z0, xINST, 8 // z0 := A|G
+ andi $z0, $z0, 0xF // z0 := G
+ GET_VREG a4, $z0
+ lb $z1, 4($shorty) // shorty RDEFG
+ bne $z1, $z2, .L${uniq}_slow_4
+ zext.w a4, a4
+.L${uniq}_slow_4:
+ srliw $z1, $v_fedc, 12 // z1 := F
+ GET_VREG a3, $z1
+ lb $z1, 3($shorty) // shorty RDEF
+ bne $z1, $z2, .L${uniq}_slow_3
+ zext.w a3, a3
+.L${uniq}_slow_3:
+ srliw $z1, $v_fedc, 8 // z1 := F|E
+ andi $z1, $z1, 0xF // z1 := E
+ GET_VREG a2, $z1
+ lb $z1, 2($shorty) // shorty RDE
+ bne $z1, $z2, .L${uniq}_slow_2
+ zext.w a2, a2
+.L${uniq}_slow_2:
+ srliw $z1, $v_fedc, 4 // z1 := F|E|D
+ andi $z1, $z1, 0xF // z1 := D
+ GET_VREG a1, $z1
+ lb $z1, 1($shorty) // shorty RD
+ bne $z1, $z2, .L${uniq}_slow_1
+ zext.w a1, a1
+.L${uniq}_slow_1:
+ // "this" never read in string-init
+
+
+// Range and static-range variant.
+// Hardcoded
+// - (caller) xPC, xINST, xFP
+// - (callee) sp
+// Input
+// - vC: CCCC from dex
+%def slow_setup_args_range(shorty="", vC="", z0="", z1="", z2="", z3="", z4="", z5="", z6="", z7="", arg_start="1", uniq=""):
+ addi $z0, $shorty, 1 // z0 := first arg of shorty
+ addi $z1, $vC, $arg_start // z1 := (instance) CCCC+1, (static) CCCC
+ mv $z2, zero // z2 := is_out_stack_needed false
+ li $z3, 'D' // double
+ li $z4, 'F' // float
+ li $z5, 'J' // long
+ li $z6, 'L' // ref
+
// linear scan through shorty: extract non-float vregs
-% load_vreg_in_gpr(gpr="a1", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_0")
-% load_vreg_in_gpr(gpr="a2", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_1")
-% load_vreg_in_gpr(gpr="a3", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_2")
-% load_vreg_in_gpr(gpr="a4", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_3")
+% if arg_start == "0": # static can place vCCCC into a1; instance already loaded "this" into a1.
+% load_vreg_in_gpr_range(gpr="a1", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_1")
+% load_vreg_in_gpr_range(gpr="a2", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_2")
+% load_vreg_in_gpr_range(gpr="a3", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_3")
+% load_vreg_in_gpr_range(gpr="a4", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_4")
+% load_vreg_in_gpr_range(gpr="a5", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_5")
+% load_vreg_in_gpr_range(gpr="a6", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_6")
+% load_vreg_in_gpr_range(gpr="a7", shorty=z0, idx=z1, D=z3, F=z4, J=z5, L=z6, z0=z7, done=f".L{uniq}_slow_fpr", uniq=f"{uniq}_7")
+% is_out_stack_needed(needed=z2, shorty=z0, D=z3, F=z4, z0=z1, uniq=uniq)
- // TODO: java.lang.StringFactory methods don't have floating point args; skip FPR loads.
.L${uniq}_slow_fpr:
- addi $z0, $shorty, 1 // z0 := first arg of shorty
- srliw $z1, $vregs, 4 // z1 := (instance) F|E|D or G|F|E|D
+ addi $z0, $shorty, 1 // z0 := first arg of shorty
+ addi $z1, $vC, $arg_start // z1 := (instance) CCCC+1, (static) CCCC
// linear scan through shorty: extract float/double vregs
-% load_vreg_in_fpr(fpr="fa0", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_done", uniq=f"{uniq}_0")
-% load_vreg_in_fpr(fpr="fa1", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_done", uniq=f"{uniq}_1")
-% load_vreg_in_fpr(fpr="fa2", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_done", uniq=f"{uniq}_2")
-% load_vreg_in_fpr(fpr="fa3", shorty=z0, vregs=z1, D=z2, F=z3, J=z4, z0=z5, done=f".L{uniq}_slow_done", uniq=f"{uniq}_3")
+% load_vreg_in_fpr_range(fpr="fa0", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_0")
+% load_vreg_in_fpr_range(fpr="fa1", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_1")
+% load_vreg_in_fpr_range(fpr="fa2", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_2")
+% load_vreg_in_fpr_range(fpr="fa3", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_3")
+% load_vreg_in_fpr_range(fpr="fa4", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_4")
+% load_vreg_in_fpr_range(fpr="fa5", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_5")
+% load_vreg_in_fpr_range(fpr="fa6", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_6")
+% load_vreg_in_fpr_range(fpr="fa7", shorty=z0, idx=z1, D=z3, F=z4, J=z5, z0=z6, done=f".L{uniq}_slow_stack", uniq=f"{uniq}_7")
+% is_out_stack_needed_float(needed=z2, shorty=z0, D=z3, F=z4, z0=z1, uniq=uniq)
+
+.L${uniq}_slow_stack:
+ beqz $z2, .L${uniq}_slow_done // No stack needed, skip it. Otherwise copy-paste it all with LD/SD.
+ addi $z0, sp, 8 // z0 := base addr of out array
+ sh2add $z1, $vC, xFP // z1 := base addr of FP[CCCC]
+ srliw $z2, xINST, 8 // z2 := AA, vreg count
+ sh2add $z2, $z2, $z1 // z2 := loop guard, addr of one slot past top of xFP array
+% copy_vregs_to_out(out=z0, fp=z1, fp_top=z2, z0=z3, uniq=uniq)
.L${uniq}_slow_done:
+// String-init variant: up to 4 args, no long/float/double args.
+// Ref args ('L') loaded with LW *must* apply ZEXT.W to avoid subtle address bugs.
+%def slow_setup_args_string_init_range(shorty="", vC="", z0="", z1="", z2="", z3="", uniq=""):
+ srliw $z0, xINST, 8 // z0 := AA; possible values 1-5
+ li $z1, 2
+ blt $z0, $z1, .L${uniq}_slow_1 // AA = 1
+ sh2add $z2, $vC, xFP // z2 := &fp[CCCC]
+ li $z3, 'L' // z3 := ref type
+ beq $z0, $z1, .L${uniq}_slow_2 // AA = 2
+ li $z1, 4
+ blt $z0, $z1, .L${uniq}_slow_3 // AA = 3
+ beq $z0, $z1, .L${uniq}_slow_4 // AA = 4
+ // AA = 5
+ lw a4, 4*4($z2)
+ lb $z1, 4($shorty)
+ bne $z1, $z3, .L${uniq}_slow_4
+ zext.w a4, a4
+.L${uniq}_slow_4:
+ lw a3, 3*4($z2)
+ lb $z1, 3($shorty)
+ bne $z1, $z3, .L${uniq}_slow_3
+ zext.w a3, a3
+.L${uniq}_slow_3:
+ lw a2, 2*4($z2)
+ lb $z1, 2($shorty)
+ bne $z1, $z3, .L${uniq}_slow_2
+ zext.w a2, a2
+.L${uniq}_slow_2:
+ lw a1, 1*4($z2)
+ lb $z1, 1($shorty)
+ bne $z1, $z3, .L${uniq}_slow_1
+ zext.w a1, a1
+.L${uniq}_slow_1:
+ // "this" never read in string-init
+
+
// Iterate through 4-bit vreg ids in the "vregs" register, load a non-FP value
// into one argument register.
-%def load_vreg_in_gpr(gpr="", shorty="", vregs="", z0="", D="", F="", J="", done="", uniq=""):
+%def load_vreg_in_gpr(gpr="", shorty="", vregs="", D="", F="", J="", L="", z0="", done="", uniq=""):
.L${uniq}_gpr_find:
lb $z0, ($shorty) // z0 := next shorty arg spec
addi $shorty, $shorty, 1 // increment char ptr
@@ -880,6 +1086,9 @@
andi $gpr, $vregs, 0xF // gpr := vreg id
beq $z0, $J, .L${uniq}_gpr_load_8_bytes
GET_VREG $gpr, $gpr // gpr := 32-bit load
+ bne $z0, $L, .L${uniq}_gpr_load_common
+ zext.w $gpr, $gpr
+.L${uniq}_gpr_load_common:
srliw $vregs, $vregs, 4 // shift out the processed arg, one vreg
j .L${uniq}_gpr_set // and exit
.L${uniq}_gpr_load_8_bytes:
@@ -897,7 +1106,7 @@
// Iterate through 4-bit vreg ids in the "vregs" register, load a float or double
// value into one floating point argument register.
-%def load_vreg_in_fpr(fpr="", shorty="", vregs="", z0="", D="", F="", J="", done="", uniq=""):
+%def load_vreg_in_fpr(fpr="", shorty="", vregs="", D="", F="", J="", z0="", done="", uniq=""):
.L${uniq}_fpr_find:
lb $z0, ($shorty) // z0 := next shorty arg spec
addi $shorty, $shorty, 1 // increment char ptr
@@ -922,6 +1131,104 @@
.L${uniq}_fpr_set:
+// Range variant
+%def load_vreg_in_gpr_range(gpr="", shorty="", idx="", D="", F="", J="", L="", z0="", done="", uniq=""):
+.L${uniq}_gpr_range_find:
+ lb $z0, ($shorty) // z0 := next shorty arg
+ addi $shorty, $shorty, 1 // increment char ptr
+ beqz $z0, $done // z0 == \0
+ beq $z0, $F, .L${uniq}_gpr_range_skip_1_vreg
+ beq $z0, $D, .L${uniq}_gpr_range_skip_2_vreg
+
+ beq $z0, $J, .L${uniq}_gpr_range_load_2_vreg
+ GET_VREG $gpr, $idx
+ bne $z0, $L, .L${uniq}_gpr_range_load_common
+ zext.w $gpr, $gpr
+.L${uniq}_gpr_range_load_common:
+ addi $idx, $idx, 1
+ j .L${uniq}_gpr_range_done
+.L${uniq}_gpr_range_load_2_vreg:
+ GET_VREG_WIDE $gpr, $idx
+ addi $idx, $idx, 2
+ j .L${uniq}_gpr_range_done
+
+.L${uniq}_gpr_range_skip_2_vreg:
+ addi $idx, $idx, 1
+.L${uniq}_gpr_range_skip_1_vreg:
+ addi $idx, $idx, 1
+ j .L${uniq}_gpr_range_find
+.L${uniq}_gpr_range_done:
+
+
+// Range variant.
+%def load_vreg_in_fpr_range(fpr="", shorty="", idx="", D="", F="", J="", z0="", done="", uniq=""):
+.L${uniq}_fpr_range_find:
+ lb $z0, ($shorty) // z0 := next shorty arg
+ addi $shorty, $shorty, 1 // increment char ptr
+ beqz $z0, $done // z0 == \0
+ beq $z0, $F, .L${uniq}_fpr_range_load_4_bytes
+ beq $z0, $D, .L${uniq}_fpr_range_load_8_bytes
+
+ addi $idx, $idx, 1 // increment idx
+ bne $z0, $J, .L${uniq}_fpr_range_find
+ addi $idx, $idx, 1 // increment once more for J
+ j .L${uniq}_fpr_range_find
+
+.L${uniq}_fpr_range_load_4_bytes:
+ mv $z0, $idx
+ GET_VREG_FLOAT $fpr, $z0
+ addi $idx, $idx, 1
+ j .L${uniq}_fpr_range_set
+.L${uniq}_fpr_range_load_8_bytes:
+ mv $z0, $idx
+ GET_VREG_DOUBLE $fpr, $z0
+ addi $idx, $idx, 2
+.L${uniq}_fpr_range_set:
+
+
+%def is_out_stack_needed(needed="", shorty="", D="", F="", z0="", uniq=""):
+.L${uniq}_scan_arg:
+ lb $z0, ($shorty)
+ addi $shorty, $shorty, 1
+ beqz $z0, .L${uniq}_scan_done
+ beq $z0, $F, .L${uniq}_scan_arg
+ beq $z0, $D, .L${uniq}_scan_arg
+ li $needed, 1
+.L${uniq}_scan_done:
+
+
+%def is_out_stack_needed_float(needed="", shorty="", D="", F="", z0="", uniq=""):
+ bnez $needed, .L${uniq}_scan_float_done
+.L${uniq}_scan_float_arg:
+ lb $z0, ($shorty)
+ addi $shorty, $shorty, 1
+ beqz $z0, .L${uniq}_scan_float_done
+ beq $z0, $F, .L${uniq}_scan_float_found
+ beq $z0, $D, .L${uniq}_scan_float_found
+ j .L${uniq}_scan_float_arg
+.L${uniq}_scan_float_found:
+ li $needed, 1
+.L${uniq}_scan_float_done:
+
+
+%def copy_vregs_to_out(out="", fp="", fp_top="", z0="", uniq=""):
+ sub $z0, $fp_top, $fp // z0 := byte count to copy
+ BRANCH_IF_BIT_CLEAR $z0, $z0, 2, .L${uniq}_copy_wide
+ // branch if even count of slots
+ lwu $z0, ($fp)
+ sw $z0, ($out)
+ addi $fp, $fp, 4
+ addi $out, $out, 4
+.L${uniq}_copy_wide:
+ beq $fp, $fp_top, .L${uniq}_copy_done
+ ld $z0, ($fp)
+ sd $z0, ($out)
+ addi $fp, $fp, 8
+ addi $out, $out, 8
+ j .L${uniq}_copy_wide
+.L${uniq}_copy_done:
+
+
// NterpToNterpInstance
// a0: ArtMethod*
// a1: this
@@ -931,9 +1238,8 @@
// NterpToNterpStringInit
// a0: ArtMethod*
-// a1: this
%def nterp_to_nterp_string_init():
-% nterp_to_nterp(how_vC="in_a1", uniq="n2n_string_init")
+% nterp_to_nterp(how_vC="skip", uniq="n2n_string_init")
// NterpToNterpStatic
@@ -944,22 +1250,23 @@
// NterpToNterpInstanceRange
%def nterp_to_nterp_instance_range():
-% nterp_to_nterp_range()
+% nterp_to_nterp(how_vC="in_a1", uniq="n2n_instance_range", range="Range")
// NterpToNterpStringInitRange
%def nterp_to_nterp_string_init_range():
-% nterp_to_nterp_range()
+% nterp_to_nterp(how_vC="skip", uniq="n2n_string_init_range", range="Range")
// NterpToNterpStaticRange
%def nterp_to_nterp_static_range():
-% nterp_to_nterp_range()
+% nterp_to_nterp(a1_instance=False, how_vC="load", uniq="n2n_static_range", range="Range")
// helpers
-%def nterp_to_nterp(a1_instance=True, how_vC="", uniq=""):
+
+%def nterp_to_nterp(a1_instance=True, how_vC="", uniq="", range=""):
.cfi_startproc
% setup_nterp_frame(cfi_refs="23", refs="s8", fp="s9", pc="s10", regs="s11", spills_sp="t0", z0="t1", z1="t2", z2="t3", z3="t4", uniq=uniq)
// s8 := callee xREFS
@@ -968,7 +1275,11 @@
// s11 := fp/refs vreg count
// t0 := post-spills pre-frame sp (unused here)
// sp := post-frame callee sp
-% n2n_arg_move(refs="s8", fp="s9", pc="s10", regs="s11", v_fedc="s7", z0="t0", z1="t1", z2="t2", z3="t3", a1_instance=a1_instance, how_vC=how_vC, uniq=uniq)
+% if range == 'Range':
+% n2n_arg_move_range(refs="s8", fp="s9", regs="s11", vC="s7", z0="t0", z1="t1", z2="t2", z3="t3", z4="t4", z5="t5", a1_instance=a1_instance, how_vC=how_vC, uniq=uniq)
+% else:
+% n2n_arg_move(refs="s8", fp="s9", pc="s10", regs="s11", v_fedc="s7", z0="t0", z1="t1", z2="t2", z3="t3", a1_instance=a1_instance, how_vC=how_vC, uniq=uniq)
+%#:
mv xREFS, s8
mv xFP, s9
mv xPC, s10
@@ -978,12 +1289,6 @@
.cfi_endproc
-%def nterp_to_nterp_range(a1_instance=True, how_vC="", uniq=""):
- .cfi_startproc
- unimp
- .cfi_endproc
-
-
// See runtime/nterp_helpers.cc for a diagram of the setup.
// Hardcoded
// - a0 - ArtMethod*
@@ -1130,6 +1435,52 @@
.L${uniq}_arg_done:
+%def n2n_arg_move_range(refs="", fp="", regs="", vC="", z0="", z1="", z2="", z3="", z4="", z5="", a1_instance=True, how_vC="", uniq=""):
+ srliw $z0, xINST, 8 // z0 := AA (arg count)
+
+% if not a1_instance:
+ beqz $z0, .L${uniq}_arg_range_done
+%#:
+ // AA >= 1, iterator setup
+ sub $z4, $regs, $z0 // z4 := regs - AA; starting idx in fp and refs
+ sh2add $z1, $vC, xREFS // z1 := addr of xREFS[CCCC]
+ sh2add $z2, $vC, xFP // z2 := addr of xFP[CCCC]
+ sh2add $z3, $z4, $refs // z3 := addr of refs[z4]
+ sh2add $z4, $z4, $fp // z4 := addr of fp[z4]
+
+ BRANCH_IF_BIT_CLEAR $z0, $z0, 0, .L${uniq}_arg_range_copy_wide
+ // branch if AA is even
+ // AA is odd, transfer one slot. Apply some optimizations.
+% if how_vC == "in_a1":
+ sw a1, ($z3)
+ sw a1, ($z4)
+% elif how_vC == "skip":
+ // string init doesn't read "this"
+% elif how_vC == "load":
+ lw $z0, ($z1)
+ lw $z5, ($z2)
+ sw $z0, ($z3)
+ sw $z5, ($z4)
+%#:
+ addi $z1, $z1, 4
+ addi $z2, $z2, 4
+ addi $z3, $z3, 4
+ addi $z4, $z4, 4
+.L${uniq}_arg_range_copy_wide:
+ // Even count of vreg slots, apply LD/SD.
+ beq $z3, $fp, .L${uniq}_arg_range_done // terminate loop if refs[regs] == fp[0]
+ ld $z0, ($z1)
+ ld $z5, ($z2)
+ sd $z0, ($z3)
+ sd $z5, ($z4)
+ addi $z1, $z1, 8
+ addi $z2, $z2, 8
+ addi $z3, $z3, 8
+ addi $z4, $z4, 8
+ j .L${uniq}_arg_range_copy_wide
+.L${uniq}_arg_range_done:
+
+
//
// Nterp entry point helpers
//
diff --git a/runtime/interpreter/mterp/riscv64/main.S b/runtime/interpreter/mterp/riscv64/main.S
index cc556ad..35f53a9 100644
--- a/runtime/interpreter/mterp/riscv64/main.S
+++ b/runtime/interpreter/mterp/riscv64/main.S
@@ -306,6 +306,7 @@
.endm
// Typed read, defaults to 32-bit read
+// Note: Incorrect for an object ref; it requires LWU, or LW;ZEXT.W.
// Clobbers: \reg
// Safe if \reg == \vreg.
.macro GET_VREG reg, vreg, is_wide=0
@@ -318,6 +319,7 @@
.endm
// Typed write, defaults to 32-bit write.
+// Note: Incorrect for an object ref; it requires 2nd SW into xREFS.
// Clobbers: z0
.macro SET_VREG reg, vreg, z0, is_wide=0
.if \is_wide
@@ -514,28 +516,29 @@
// We drained arg registers, so continue from caller stack's out array. Unlike the reference-only
// fast-path, the continuation offset in the out array can vary, depending on the presence of
// 64-bit values in the arg registers. \offset tracks this value as a byte offset.
- addi t4, s9, (NTERP_SIZE_SAVE_CALLEE_SAVES + 8)
- // t4 := (caller) outs array base address, for here and fargs
- add t0, t3, t0 // t0 := (callee) &FP[next]
+ addi t5, s9, (NTERP_SIZE_SAVE_CALLEE_SAVES + 8)
+ // t5 := (caller) outs array base address
+ add t4, t3, t0 // t4 := (callee) &FP[next]
add t1, t3, t1 // t1 := (callee) &REFS[next]
- add t3, t3, t4 // t3 := (caller) &OUTS[next]
-% store_outs_to_vregs(outs="t3", shorty="t2", fp="t0", refs="t1", z0="t5", z1="t6", D="s0", F="s4", J="s5", L="s8", next=".Lentry_fargs")
-
+ add t3, t3, t5 // t3 := (caller) &OUTS[next]
+% store_outs_to_vregs(outs="t3", shorty="t2", fp="t4", refs="t1", z0="t5", z1="t6", D="s0", F="s4", J="s5", L="s8", next=".Lentry_fargs")
+ // t0 = &xFP[a1], unclobbered
.Lentry_fargs:
- sh2add t0, s7, xFP // t0 := &FP[a1]
addi t1, s11, 1 // t1 := shorty arg (skip return type)
slliw t2, s10, 2 // t2 := starting byte offset for fp/outs, static and instance
// linear scan through shorty: extract float args
-% store_fpr_to_vreg(fpr="fa0", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-% store_fpr_to_vreg(fpr="fa1", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-% store_fpr_to_vreg(fpr="fa2", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-% store_fpr_to_vreg(fpr="fa3", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-% store_fpr_to_vreg(fpr="fa4", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-% store_fpr_to_vreg(fpr="fa5", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-% store_fpr_to_vreg(fpr="fa6", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
-% store_fpr_to_vreg(fpr="fa7", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t5", D="s0", F="s4", J="s5", next=".Lentry_go")
+% store_fpr_to_vreg(fpr="fa0", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+% store_fpr_to_vreg(fpr="fa1", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+% store_fpr_to_vreg(fpr="fa2", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+% store_fpr_to_vreg(fpr="fa3", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+% store_fpr_to_vreg(fpr="fa4", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+% store_fpr_to_vreg(fpr="fa5", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+% store_fpr_to_vreg(fpr="fa6", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+% store_fpr_to_vreg(fpr="fa7", offset="t2", shorty="t1", fp="t0", z0="t3", z1="t4", D="s0", F="s4", J="s5", next=".Lentry_go")
+ addi t3, s9, (NTERP_SIZE_SAVE_CALLEE_SAVES + 8)
+ // t3 := (caller) outs array base address
add t0, t2, t0 // t0 := (callee) &FP[next]
- add t2, t2, t4 // t2 := (caller) &OUTS[next]
+ add t2, t2, t3 // t2 := (caller) &OUTS[next]
% store_float_outs_to_vregs(outs="t2", shorty="t1", fp="t0", z0="t3", D="s0", F="s4", J="s5", next=".Lentry_go")
.Lentry_go:
@@ -613,10 +616,10 @@
% nterp_to_nterp_static()
NterpToNterpInstanceRange:
% nterp_to_nterp_instance_range()
-NterpToNterpStaticRange:
-% nterp_to_nterp_static_range()
NterpToNterpStringInitRange:
% nterp_to_nterp_string_init_range()
+NterpToNterpStaticRange:
+% nterp_to_nterp_static_range()
NAME_END nterp_helper
diff --git a/runtime/nterp_helpers.cc b/runtime/nterp_helpers.cc
index f411e73..83057f8 100644
--- a/runtime/nterp_helpers.cc
+++ b/runtime/nterp_helpers.cc
@@ -279,6 +279,11 @@
case Instruction::INVOKE_DIRECT:
case Instruction::INVOKE_STATIC:
case Instruction::INVOKE_INTERFACE:
+ case Instruction::INVOKE_VIRTUAL_RANGE:
+ case Instruction::INVOKE_SUPER_RANGE:
+ case Instruction::INVOKE_DIRECT_RANGE:
+ case Instruction::INVOKE_STATIC_RANGE:
+ case Instruction::INVOKE_INTERFACE_RANGE:
case Instruction::NEG_INT:
case Instruction::NOT_INT:
case Instruction::NEG_LONG: