ARM64: FP16.compare() intrinsic for ARMv8

This CL implements an intrinsic for the FP16.compare() method using
ARMv8.2 FP16 instructions.
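
For reference, the ordering the generated code has to reproduce is:
ordinary values compare by numeric value, -0 is ordered below +0 (via an
encoding comparison), and NaN is greater than everything and equal to
another NaN. The following is a rough Java sketch of that ordering, not
the libcore source; the class and helper names are illustrative, and it
assumes FP16 values are raw IEEE 754 binary16 bit patterns held in a
short:

  // Sketch only (not the libcore implementation).
  final class Fp16CompareSketch {
    static boolean isNaN(short h) {
      // Exponent all ones with a non-zero fraction.
      return (h & 0x7c00) == 0x7c00 && (h & 0x03ff) != 0;
    }

    static int compare(short x, short y) {
      boolean xIsNaN = isNaN(x);
      boolean yIsNaN = isNaN(y);
      // NaN is greater than any non-NaN value; two NaNs compare as equal.
      if (xIsNaN || yIsNaN) {
        return xIsNaN ? (yIsNaN ? 0 : 1) : -1;
      }
      // Map the sign-magnitude encoding onto a totally ordered int so that
      // negative values sort below positive ones and -0 sorts below +0.
      int xBits = x & 0xffff;
      int yBits = y & 0xffff;
      int xKey = (xBits & 0x8000) != 0 ? 0x7fff - (xBits & 0x7fff) : 0x8000 + xBits;
      int yKey = (yBits & 0x8000) != 0 ? 0x7fff - (yBits & 0x7fff) : 0x8000 + yBits;
      return Integer.compare(xKey, yKey);
    }
  }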

Performance improvements measured with the timeCompareFP16
FP16Intrinsic micro-benchmark on a Pixel 4 (lower is better):
- Java implementation libcore.util.FP16.compare:
    - big cluster only: 742
    - little cluster only: 2286
- arm64 compare() intrinsic implementation:
    - big cluster only: 492 (~34% faster)
    - little cluster only: 1535 (~33% faster)
The benchmark can be found in the following patch:
https://android-review.linaro.org/c/linaro/art-testing/+/21039

Authors: Usama Arif, Edward Pickup, Joel Goddard

Test: 580-checker-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac

Change-Id: Idbe9f56f964f044e6d725bd696459fb04d2ac76c
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 800e5bd..14b16bb 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3819,6 +3819,69 @@
   GenerateFP16Compare(invoke, codegen_, masm, ls);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16Compare(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntIntToIntLocations(allocator_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Compare(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  auto compareOp = [masm](const Register out,
+                          const VRegister& in0,
+                          const VRegister& in1) {
+    vixl::aarch64::Label end;
+    vixl::aarch64::Label equal;
+    vixl::aarch64::Label normal;
+
+    // The normal cases for this method are:
+    // - in0 > in1 => out = 1
+    // - in0 < in1 => out = -1
+    // - in0 == in1 => out = 0
+    // +/-Infinity are ordered by default so are handled by the normal case.
+    // There are two special cases that Fcmp alone cannot distinguish:
+    // - in0 and in1 are +0 and -0 => +0 > -0, so compare encodings instead of values
+    // - in0 or in1 is NaN => manually compare with in0 and in1 separately
+    __ Fcmp(in0, in1);
+    __ B(eq, &equal);  // in0==in1 or +0 -0 case.
+    __ B(vc, &normal);  // in0 and in1 are ordered (not NaN).
+
+    // Either of the inputs is NaN.
+    // NaN is equal to itself and greater than any other number so:
+    // - if only in0 is NaN => return 1
+    // - if only in1 is NaN => return -1
+    // - if both in0 and in1 are NaN => return 0
+    __ Fcmp(in0, 0.0);
+    __ Mov(out, -1);
+    __ B(vc, &end);  // in0 is not NaN, so in1 must be => out = -1.
+    __ Fcmp(in1, 0.0);
+    __ Cset(out, vc);  // in0 is NaN: out = 1 if in1 is not NaN, otherwise both are NaN => out = 0.
+    __ B(&end);
+
+    // in0 == in1 or if one of the inputs is +0 and the other is -0.
+    __ Bind(&equal);
+    // Compare the encodings of in0 and in1, viewed as (denormal) single precision values.
+    // The operand order is reversed because the encoding of -0 compares greater than that of +0.
+    // The instruction Fmov(Hregister, Wregister) zero extends the Hregister.
+    // Therefore the value of bits[127:16] will not matter when doing the
+    // below Fcmp as they are set to 0.
+    __ Fcmp(in1.S(), in0.S());
+
+    __ Bind(&normal);
+    __ Cset(out, gt);  // out = 1 if in0 > in1, otherwise out = 0. The flags come either
+                       // from the original Fcmp(in0, in1) or from the encoding Fcmp above.
+    __ Csinv(out, out, wzr, pl);  // Keep out if in0 >= in1, otherwise out = -1.
+
+    __ Bind(&end);
+  };
+
+  GenerateFP16Compare(invoke, codegen_, masm, compareOp);
+}
+
 static void GenerateDivideUnsigned(HInvoke* invoke, CodeGeneratorARM64* codegen) {
   LocationSummary* locations = invoke->GetLocations();
   MacroAssembler* masm = codegen->GetVIXLAssembler();