ARM64: Add new String.compareTo intrinsic.
Benchmarked on Nexus 6P big cores, little cores, and all cores. The new
intrinsic is faster than the pStringCompareTo runtime entrypoint for
comparison lengths in [1, 512], so the runtime call is no longer needed.
Change-Id: If94bfe24d9bf4dddcca648cc0b563709fc407b34
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 04ae3a6..bf79767 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -47,6 +47,7 @@
using helpers::WRegisterFrom;
using helpers::XRegisterFrom;
using helpers::InputRegisterAt;
+using helpers::OutputRegister;
namespace {
@@ -1173,31 +1174,118 @@
void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) {
LocationSummary* locations = new (arena_) LocationSummary(invoke,
- LocationSummary::kCall,
+ invoke->InputAt(1)->CanBeNull()  // The code generator only adds a slow path in this case.
+ ? LocationSummary::kCallOnSlowPath
+ : LocationSummary::kNoCall,  // Non-null argument: the comparison is emitted fully inline.
kIntrinsified);
- InvokeRuntimeCallingConvention calling_convention;
- locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
- locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
- locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt));
+ locations->SetInAt(0, Location::RequiresRegister());  // Receiver string, any register.
+ locations->SetInAt(1, Location::RequiresRegister());  // Argument string, any register.
+ locations->AddTemp(Location::RequiresRegister());  // temp0: receiver length, then argument data.
+ locations->AddTemp(Location::RequiresRegister());  // temp1: argument length, then offset/bit index.
+ locations->AddTemp(Location::RequiresRegister());  // temp2: number of chars left to compare.
+ locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);  // Written before inputs die.
}
void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) {
vixl::MacroAssembler* masm = GetVIXLAssembler();
LocationSummary* locations = invoke->GetLocations();
+ Register str = XRegisterFrom(locations->InAt(0));  // Receiver ("this") string.
+ Register arg = XRegisterFrom(locations->InAt(1));  // String to compare against.
+ Register out = OutputRegister(invoke);  // Result; combined with W temps below, so used as W.
+
+ Register temp0 = WRegisterFrom(locations->GetTemp(0));
+ Register temp1 = WRegisterFrom(locations->GetTemp(1));
+ Register temp2 = WRegisterFrom(locations->GetTemp(2));
+
+ vixl::Label loop;
+ vixl::Label find_char_diff;
+ vixl::Label end;
+
+ // Get offsets of count and value fields within a string object.
+ const int32_t count_offset = mirror::String::CountOffset().Int32Value();
+ const int32_t value_offset = mirror::String::ValueOffset().Int32Value();
+
// Note that the null check must have been done earlier.
DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0)));
- Register argument = WRegisterFrom(locations->InAt(1));
- __ Cmp(argument, 0);
- SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
- codegen_->AddSlowPath(slow_path);
- __ B(eq, slow_path->GetEntryLabel());
+ // Take slow path and throw if input can be and is null.
+ SlowPathCodeARM64* slow_path = nullptr;
+ const bool can_slow_path = invoke->InputAt(1)->CanBeNull();
+ if (can_slow_path) {
+ slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
+ codegen_->AddSlowPath(slow_path);
+ __ Cbz(arg, slow_path->GetEntryLabel());
+ }
- __ Ldr(
- lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pStringCompareTo).Int32Value()));
- __ Blr(lr);
- __ Bind(slow_path->GetExitLabel());
+ // Reference equality check, return 0 if same reference (W views match the size of out).
+ __ Subs(out, str.W(), arg.W());
+ __ B(&end, eq);
+ // Load lengths of this and argument strings.
+ __ Ldr(temp0, MemOperand(str.X(), count_offset));
+ __ Ldr(temp1, MemOperand(arg.X(), count_offset));
+ // Return zero if both strings are empty.
+ __ Orr(out, temp0, temp1);
+ __ Cbz(out, &end);
+ // out = length diff.
+ __ Subs(out, temp0, temp1);
+ // temp2 = min(len(str), len(arg)).
+ __ Csel(temp2, temp1, temp0, ge);
+ // Shorter string is empty?
+ __ Cbz(temp2, &end);
+
+ // Store offset of string value in preparation for comparison loop.
+ __ Mov(temp1, value_offset);
+
+ UseScratchRegisterScope scratch_scope(masm);
+ Register temp4 = scratch_scope.AcquireX();
+
+ // Assertions that must hold in order to compare strings 4 characters at a time.
+ DCHECK_ALIGNED(value_offset, 8);
+ static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded");
+
+ const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar);
+ DCHECK_EQ(char_size, 2u);
+
+ // Promote temp0 to an X reg, ready for LDR.
+ temp0 = temp0.X();
+
+ // Loop to compare 4x16-bit characters at a time (ok because of string data alignment).
+ __ Bind(&loop);
+ __ Ldr(temp4, MemOperand(str.X(), temp1.X()));  // Un-extended register offset must be an X reg.
+ __ Ldr(temp0, MemOperand(arg.X(), temp1.X()));  // temp1 was written as W, so its top half is 0.
+ __ Cmp(temp4, temp0);
+ __ B(ne, &find_char_diff);
+ __ Add(temp1, temp1, char_size * 4);
+ __ Subs(temp2, temp2, 4);
+ __ B(gt, &loop);
+ __ B(&end);
+
+ // Promote temp1 to an X reg, ready for EOR.
+ temp1 = temp1.X();
+
+ // Find the single 16-bit character difference.
+ __ Bind(&find_char_diff);
+ // Get the bit position of the first character that differs.
+ __ Eor(temp1, temp0, temp4);
+ __ Rbit(temp1, temp1);
+ __ Clz(temp1, temp1);
+ __ Bic(temp1, temp1, 0xf);
+ // If the number of 16-bit chars remaining <= the index where the difference occurs (0-3), then
+ // the difference occurs outside the remaining string data, so just return length diff (out).
+ __ Cmp(temp2, Operand(temp1.W(), LSR, 4));  // temp2 is W; the bit index (< 64) fits in W.
+ __ B(le, &end);
+ // Extract the characters and calculate the difference.
+ __ Lsr(temp0, temp0, temp1);
+ __ Lsr(temp4, temp4, temp1);
+ __ And(temp4, temp4, 0xffff);
+ __ Sub(out, temp4.W(), Operand(temp0.W(), UXTH));  // Both chars are 16-bit; W views match out.
+
+ __ Bind(&end);
+
+ if (can_slow_path) {
+ __ Bind(slow_path->GetExitLabel());
+ }
}
void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) {