summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Richard Neill <richard.neill@arm.com> 2023-01-25 15:08:18 +0000
committer VladimĂ­r Marko <vmarko@google.com> 2023-03-02 15:46:45 +0000
commit6149a22d1a02d5a2905f1917fbaf14ddbb2c0bd2 (patch)
treece0f77c9b2c400e83d51911f913a497cec8c2623
parent20f783080b772843a1c1186811e73ec74bab3ae8 (diff)
Optimize the System.arraycopy(char) arm64 intrinsic
Introduce loop inversion, loop unrolling optimization, and larger memory copies to the System.arraycopy(char) intrinsic. Instead of a single loop with a pair of strh/ldrh instructions, we now have two loops: one with ldr/str instructions and the other one with strh/ldrh. The goal is to use increased step size (8 bytes) in the first loop and then after it's over, finish processing of the rest of the array using the smaller step. If the copy length is constant we further optimize by always unrolling the tail loop, and also unrolling the head loop when the copy length is small. Also increase the threshold array size when intrinsic implementation is chosen. Also add tests to copy character arrays of different sizes. Original author: Artem Kotsiuba <artem.kotsiuba@linaro.org> Test: 011-array-copy Test: test-art-target Change-Id: I1dee1dab7931c3de4b91b2bbd5d0180a9820771a
-rw-r--r--compiler/optimizing/intrinsics_arm64.cc124
-rw-r--r--test/011-array-copy/expected-stdout.txt25
-rw-r--r--test/011-array-copy/src/Main.java187
3 files changed, 316 insertions, 20 deletions
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 40df479f5a..946d4348af 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -2576,9 +2576,9 @@ void IntrinsicCodeGeneratorARM64::VisitStringGetCharsNoCheck(HInvoke* invoke) {
__ Bind(&done);
}
-// Mirrors ARRAYCOPY_SHORT_CHAR_ARRAY_THRESHOLD in libcore, so we can choose to use the native
-// implementation there for longer copy lengths.
-static constexpr int32_t kSystemArrayCopyCharThreshold = 32;
+// This value is greater than ARRAYCOPY_SHORT_CHAR_ARRAY_THRESHOLD in libcore,
+// so if we choose to jump to the slow path we will end up in the native implementation.
+static constexpr int32_t kSystemArrayCopyCharThreshold = 192;
static void SetSystemArrayCopyLocationRequires(LocationSummary* locations,
uint32_t at,
@@ -2710,11 +2710,13 @@ static void GenSystemArrayCopyAddresses(MacroAssembler* masm,
__ Add(dst_base, dst_base, Operand(XRegisterFrom(dst_pos), LSL, element_size_shift));
}
- if (copy_length.IsConstant()) {
- int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
- __ Add(src_end, src_base, element_size * constant);
- } else {
- __ Add(src_end, src_base, Operand(XRegisterFrom(copy_length), LSL, element_size_shift));
+ if (src_end.IsValid()) {
+ if (copy_length.IsConstant()) {
+ int32_t constant = copy_length.GetConstant()->AsIntConstant()->GetValue();
+ __ Add(src_end, src_base, element_size * constant);
+ } else {
+ __ Add(src_end, src_base, Operand(XRegisterFrom(copy_length), LSL, element_size_shift));
+ }
}
}
@@ -2745,13 +2747,14 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) {
if (!length.IsConstant()) {
// Merge the following two comparisons into one:
// If the length is negative, bail out (delegate to libcore's native implementation).
- // If the length > 32 then (currently) prefer libcore's native implementation.
+ // If the length > kSystemArrayCopyCharThreshold then (currently) prefer libcore's
+ // native implementation.
__ Cmp(WRegisterFrom(length), kSystemArrayCopyCharThreshold);
__ B(slow_path->GetEntryLabel(), hi);
} else {
// We have already checked in the LocationsBuilder for the constant case.
DCHECK_GE(length.GetConstant()->AsIntConstant()->GetValue(), 0);
- DCHECK_LE(length.GetConstant()->AsIntConstant()->GetValue(), 32);
+ DCHECK_LE(length.GetConstant()->AsIntConstant()->GetValue(), kSystemArrayCopyCharThreshold);
}
Register src_curr_addr = WRegisterFrom(locations->GetTemp(0));
@@ -2787,21 +2790,102 @@ void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopyChar(HInvoke* invoke) {
length,
src_curr_addr,
dst_curr_addr,
- src_stop_addr);
+ Register());
// Iterate over the arrays and do a raw copy of the chars.
const int32_t char_size = DataType::Size(DataType::Type::kUint16);
UseScratchRegisterScope temps(masm);
- Register tmp = temps.AcquireW();
- vixl::aarch64::Label loop, done;
- __ Bind(&loop);
- __ Cmp(src_curr_addr, src_stop_addr);
- __ B(&done, eq);
- __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, PostIndex));
- __ Strh(tmp, MemOperand(dst_curr_addr, char_size, PostIndex));
- __ B(&loop);
- __ Bind(&done);
+ // We split processing of the array in two parts: head and tail.
+ // A first loop handles the head by copying a block of characters per
+ // iteration (see: chars_per_block).
+ // A second loop handles the tail by copying the remaining characters.
+ // If the copy length is not constant, we copy them one-by-one.
+ // If the copy length is constant, we optimize by always unrolling the tail
+ // loop, and also unrolling the head loop when the copy length is small (see:
+ // unroll_threshold).
+ //
+ // Both loops are inverted for better performance, meaning they are
+ // implemented as conditional do-while loops.
+ // Here, the loop condition is first checked to determine if there are
+ // sufficient chars to run an iteration, then we enter the do-while: an
+ // iteration is performed followed by a conditional branch only if another
+ // iteration is necessary. As opposed to a standard while-loop, this inversion
+ // can save some branching (e.g. we don't branch back to the initial condition
+ // at the end of every iteration only to potentially immediately branch
+ // again).
+ //
+ // A full block of chars is subtracted and added before and after the head
+ // loop, respectively. This ensures that any remaining length after each
+ // head loop iteration means there is a full block remaining, reducing the
+ // number of conditional checks required on every iteration.
+ constexpr int32_t chars_per_block = 4;
+ constexpr int32_t unroll_threshold = 2 * chars_per_block;
+ vixl::aarch64::Label loop1, loop2, pre_loop2, done;
+
+ Register length_tmp = src_stop_addr.W();
+ Register tmp = temps.AcquireRegisterOfSize(char_size * chars_per_block * kBitsPerByte);
+
+ auto emitHeadLoop = [&]() {
+ __ Bind(&loop1);
+ __ Ldr(tmp, MemOperand(src_curr_addr, char_size * chars_per_block, PostIndex));
+ __ Subs(length_tmp, length_tmp, chars_per_block);
+ __ Str(tmp, MemOperand(dst_curr_addr, char_size * chars_per_block, PostIndex));
+ __ B(&loop1, ge);
+ };
+
+ auto emitTailLoop = [&]() {
+ __ Bind(&loop2);
+ __ Ldrh(tmp, MemOperand(src_curr_addr, char_size, PostIndex));
+ __ Subs(length_tmp, length_tmp, 1);
+ __ Strh(tmp, MemOperand(dst_curr_addr, char_size, PostIndex));
+ __ B(&loop2, gt);
+ };
+
+ auto emitUnrolledTailLoop = [&](const int32_t tail_length) {
+ DCHECK_LT(tail_length, 4);
+
+ // Don't use post-index addressing, and instead add a constant offset later.
+ if ((tail_length & 2) != 0) {
+ __ Ldr(tmp.W(), MemOperand(src_curr_addr));
+ __ Str(tmp.W(), MemOperand(dst_curr_addr));
+ }
+ if ((tail_length & 1) != 0) {
+ const int32_t offset = (tail_length & ~1) * char_size;
+ __ Ldrh(tmp, MemOperand(src_curr_addr, offset));
+ __ Strh(tmp, MemOperand(dst_curr_addr, offset));
+ }
+ };
+
+ if (length.IsConstant()) {
+ const int32_t constant_length = length.GetConstant()->AsIntConstant()->GetValue();
+ if (constant_length >= unroll_threshold) {
+ __ Mov(length_tmp, constant_length - chars_per_block);
+ emitHeadLoop();
+ } else {
+ static_assert(unroll_threshold == 8, "The unroll_threshold must be 8.");
+ // Fully unroll both the head and tail loops.
+ if ((constant_length & 4) != 0) {
+ __ Ldr(tmp, MemOperand(src_curr_addr, 4 * char_size, PostIndex));
+ __ Str(tmp, MemOperand(dst_curr_addr, 4 * char_size, PostIndex));
+ }
+ }
+ emitUnrolledTailLoop(constant_length % chars_per_block);
+ } else {
+ Register length_reg = WRegisterFrom(length);
+ __ Subs(length_tmp, length_reg, chars_per_block);
+ __ B(&pre_loop2, lt);
+
+ emitHeadLoop();
+
+ __ Bind(&pre_loop2);
+ __ Adds(length_tmp, length_tmp, chars_per_block);
+ __ B(&done, eq);
+
+ emitTailLoop();
+ }
+
+ __ Bind(&done);
__ Bind(slow_path->GetExitLabel());
}
diff --git a/test/011-array-copy/expected-stdout.txt b/test/011-array-copy/expected-stdout.txt
index 724786ee36..9c53fb7c97 100644
--- a/test/011-array-copy/expected-stdout.txt
+++ b/test/011-array-copy/expected-stdout.txt
@@ -13,3 +13,28 @@ copy: 1,3,5: [0, 1, 2, 1, 2, 3, 4, 5]
copy: 0,3,5: [0, 1, 2, 0, 1, 2, 3, 4]
copy: 3,0,5: [3, 4, 5, 6, 7, 5, 6, 7]
copy: 0,5,1: [0, 1, 2, 3, 4, 0, 6, 7]
+arraycopy(char) const case 2 passed
+arraycopy(char) const case 3 passed
+arraycopy(char) const case 5 passed
+arraycopy(char) const case 7 passed
+arraycopy(char) const case 8 passed
+arraycopy(char) const case 9 passed
+arraycopy(char) const case 11 passed
+arraycopy(char) 0 passed
+arraycopy(char) 1 passed
+arraycopy(char) 3 passed
+arraycopy(char) 4 passed
+arraycopy(char) 5 passed
+arraycopy(char) 7 passed
+arraycopy(char) 15 passed
+arraycopy(char) 16 passed
+arraycopy(char) 17 passed
+arraycopy(char) 31 passed
+arraycopy(char) 32 passed
+arraycopy(char) 33 passed
+arraycopy(char) 63 passed
+arraycopy(char) 64 passed
+arraycopy(char) 65 passed
+arraycopy(char) 255 passed
+arraycopy(char) 513 passed
+arraycopy(char) 1025 passed
diff --git a/test/011-array-copy/src/Main.java b/test/011-array-copy/src/Main.java
index d9b61e7acf..d4d1f67435 100644
--- a/test/011-array-copy/src/Main.java
+++ b/test/011-array-copy/src/Main.java
@@ -24,6 +24,7 @@ public class Main {
testObjectCopy();
testOverlappingMoves();
testFloatAndDouble();
+ testArrayCopyChar();
}
public static void testObjectCopy() {
@@ -165,4 +166,190 @@ public class Main {
System.arraycopy(new float[len], 0, new float[len], 0, len);
System.arraycopy(new double[len], 0, new double[len], 0, len);
}
+
+ static final char SRC_INIT_CHAR = '1';
+ static final char DST_CHAR = '0';
+
+ /* Return a char array of the specified length.
+ * If do_increment is true, populate the array with (numerically) ascending
+ * characters starting from initChar (note: char wraps-around on overflow).
+ * If do_increment is false, populate all array elements with initChar.
+ */
+ public static char[] createCharArray(int length, char initChar, boolean do_increment) {
+ char[] charArr = new char[length];
+ char nextChar = initChar;
+
+ for (int i = 0; i < length; ++i) {
+ charArr[i] = nextChar;
+ if (do_increment) {
+ nextChar++;
+ }
+ }
+ return charArr;
+ }
+
+ public static boolean verifyCorrectness(char[] src, char[] dst, int copiedPrefixLength) {
+ for (int i = 0; i < dst.length; ++i) {
+ if (i < copiedPrefixLength) {
+ // Check that we copied source array.
+ if (dst[i] != src[i]) {
+ return false;
+ }
+ } else {
+ // Check that we didn't write more chars than necessary.
+ if (dst[i] != DST_CHAR) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ public static void testArrayCopyCharConstCase2() {
+ final int copy_length = 2;
+ char[] src = createCharArray(2 * copy_length, SRC_INIT_CHAR, true);
+ char[] dst = createCharArray(4 * copy_length, DST_CHAR, false);
+
+ System.arraycopy(src, 0, dst, 0, copy_length);
+
+ boolean passed = verifyCorrectness(src, dst, copy_length);
+ if (!passed) {
+ System.out.println("arraycopy(char) const case 2 failed");
+ } else {
+ System.out.println("arraycopy(char) const case 2 passed");
+ }
+ }
+
+ public static void testArrayCopyCharConstCase3() {
+ final int copy_length = 3;
+ char[] src = createCharArray(2 * copy_length, SRC_INIT_CHAR, true);
+ char[] dst = createCharArray(4 * copy_length, DST_CHAR, false);
+
+ System.arraycopy(src, 0, dst, 0, copy_length);
+
+ boolean passed = verifyCorrectness(src, dst, copy_length);
+ if (!passed) {
+ System.out.println("arraycopy(char) const case 3 failed");
+ } else {
+ System.out.println("arraycopy(char) const case 3 passed");
+ }
+ }
+
+ public static void testArrayCopyCharConstCase5() {
+ final int copy_length = 5;
+ char[] src = createCharArray(2 * copy_length, SRC_INIT_CHAR, true);
+ char[] dst = createCharArray(4 * copy_length, DST_CHAR, false);
+
+ System.arraycopy(src, 0, dst, 0, copy_length);
+
+ boolean passed = verifyCorrectness(src, dst, copy_length);
+ if (!passed) {
+ System.out.println("arraycopy(char) const case 5 failed");
+ } else {
+ System.out.println("arraycopy(char) const case 5 passed");
+ }
+ }
+
+ public static void testArrayCopyCharConstCase7() {
+ final int copy_length = 7;
+ char[] src = createCharArray(2 * copy_length, SRC_INIT_CHAR, true);
+ char[] dst = createCharArray(4 * copy_length, DST_CHAR, false);
+
+ System.arraycopy(src, 0, dst, 0, copy_length);
+
+ boolean passed = verifyCorrectness(src, dst, copy_length);
+ if (!passed) {
+ System.out.println("arraycopy(char) const case 7 failed");
+ } else {
+ System.out.println("arraycopy(char) const case 7 passed");
+ }
+ }
+
+ public static void testArrayCopyCharConstCase8() {
+ final int copy_length = 8;
+ char[] src = createCharArray(2 * copy_length, SRC_INIT_CHAR, true);
+ char[] dst = createCharArray(4 * copy_length, DST_CHAR, false);
+
+ System.arraycopy(src, 0, dst, 0, copy_length);
+
+ boolean passed = verifyCorrectness(src, dst, copy_length);
+ if (!passed) {
+ System.out.println("arraycopy(char) const case 8 failed");
+ } else {
+ System.out.println("arraycopy(char) const case 8 passed");
+ }
+ }
+
+ public static void testArrayCopyCharConstCase9() {
+ final int copy_length = 9;
+ char[] src = createCharArray(2 * copy_length, SRC_INIT_CHAR, true);
+ char[] dst = createCharArray(4 * copy_length, DST_CHAR, false);
+
+ System.arraycopy(src, 0, dst, 0, copy_length);
+
+ boolean passed = verifyCorrectness(src, dst, copy_length);
+ if (!passed) {
+ System.out.println("arraycopy(char) const case 9 failed");
+ } else {
+ System.out.println("arraycopy(char) const case 9 passed");
+ }
+ }
+
+ public static void testArrayCopyCharConstCase11() {
+ final int copy_length = 11;
+ char[] src = createCharArray(2 * copy_length, SRC_INIT_CHAR, true);
+ char[] dst = createCharArray(4 * copy_length, DST_CHAR, false);
+
+ System.arraycopy(src, 0, dst, 0, copy_length);
+
+ boolean passed = verifyCorrectness(src, dst, copy_length);
+ if (!passed) {
+ System.out.println("arraycopy(char) const case 11 failed");
+ } else {
+ System.out.println("arraycopy(char) const case 11 passed");
+ }
+ }
+
+ public static void testArrayCopyCharCase(int copy_length) {
+ char[] src = createCharArray(2 * copy_length, SRC_INIT_CHAR, true);
+ char[] dst = createCharArray(4 * copy_length, DST_CHAR, false);
+
+ System.arraycopy(src, 0, dst, 0, copy_length);
+
+ boolean passed = verifyCorrectness(src, dst, copy_length);
+ if (!passed) {
+ System.out.println("arraycopy(char) " + copy_length + " failed");
+ } else {
+ System.out.println("arraycopy(char) " + copy_length + " passed");
+ }
+ }
+
+ public static void testArrayCopyChar() {
+ testArrayCopyCharConstCase2();
+ testArrayCopyCharConstCase3();
+ testArrayCopyCharConstCase5();
+ testArrayCopyCharConstCase7();
+ testArrayCopyCharConstCase8();
+ testArrayCopyCharConstCase9();
+ testArrayCopyCharConstCase11();
+ testArrayCopyCharCase(0);
+ testArrayCopyCharCase(1);
+ testArrayCopyCharCase(3);
+ testArrayCopyCharCase(4);
+ testArrayCopyCharCase(5);
+ testArrayCopyCharCase(7);
+ testArrayCopyCharCase(15);
+ testArrayCopyCharCase(16);
+ testArrayCopyCharCase(17);
+ testArrayCopyCharCase(31);
+ testArrayCopyCharCase(32);
+ testArrayCopyCharCase(33);
+ testArrayCopyCharCase(63);
+ testArrayCopyCharCase(64);
+ testArrayCopyCharCase(65);
+ testArrayCopyCharCase(255);
+ testArrayCopyCharCase(513);
+ testArrayCopyCharCase(1025);
+ }
+
}