ARM64: FP16 greater/less/greaterEquals/lessEquals intrinsics for ARMv8

This CL implements intrinsics for the greater, greaterEquals, less, and
lessEquals methods using ARMv8.2 FP16 instructions. It requires the
ARMv8.2 AArch64 ASIMD half-precision extension.
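
For reference, the comparison semantics these intrinsics must preserve
are those of float comparison after widening: a NaN operand is
unordered, so all four comparisons return false. A minimal sketch of
these semantics (halfToFloat and lessRef are illustrative names, not
the libcore implementation):

public static float halfToFloat(short h) {
    int bits = h & 0xffff;
    int sign = bits >>> 15;
    int exp = (bits >>> 10) & 0x1f;
    int mant = bits & 0x3ff;
    if (exp == 0x1f) {  // Inf/NaN: widen to the float Inf/NaN encodings
        return Float.intBitsToFloat((sign << 31) | 0x7f800000 | (mant << 13));
    }
    if (exp == 0) {  // +/-0 and subnormals: value is mant * 2^-24
        return (sign == 1 ? -1f : 1f) * mant * 0x1p-24f;
    }
    // Normals: rebias the exponent (15 -> 127), widen the mantissa to 23 bits
    return Float.intBitsToFloat((sign << 31) | ((exp + 112) << 23) | (mant << 13));
}

public static boolean lessRef(short x, short y) {
    // float '<' already treats NaN as unordered (false), matching the
    // Fcmp/Cset sequence the intrinsic emits.
    return halfToFloat(x) < halfToFloat(y);
}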

The time in milliseconds required to execute the code below for each
of the four intrinsics on a Pixel 3 is as follows (the code shown is
for FP16.less; the benchmarks for the other intrinsics are analogous):

- Java implementation libcore.util.FP16.less():
    - big cluster only: 19876
    - little cluster only: 47525
- arm64 intrinsic implementation for less:
    - big cluster only: 14526 (~27% faster)
    - little cluster only: 45815 (~4% faster)

- Java implementation libcore.util.FP16.lessEquals():
    - big cluster only: 19856
    - little cluster only: 47419
- arm64 intrinsic implementation for lessEquals:
    - big cluster only: 14469 (~27% faster)
    - little cluster only: 45762 (~4% faster)

- Java implementation libcore.util.FP16.greater():
    - big cluster only: 19854
    - little cluster only: 47623
- arm64 intrinsic implementation for greater:
    - big cluster only: 14519 (~27% faster)
    - little cluster only: 45722 (~4% faster)

- Java implementation libcore.util.FP16.greaterEquals():
    - big cluster only: 19865
    - little cluster only: 47216
- arm64 intrinsic implementation for greaterEquals:
    - big cluster only: 14485 (~27% faster)
    - little cluster only: 45729 (~4% faster)

public static boolean benchmarkComparison() {
    boolean ret = false;
    long before = System.currentTimeMillis();
    for (long i = 0; i < 1e9; i++) {
        // FP16.toHalf(12.3f) = 0x4a26, FP16.toHalf(12.4f) = 0x4a33
        // FP16.toHalf(-12.3f) = 0xca26, FP16.toHalf(-12.4f) = 0xca33
        ret |= FP16.less((short) 0x4a26, (short) 0x4a33);
        ret |= FP16.less((short) 0x4a33, (short) 0x4a26);
        ret |= FP16.less((short) 0xca26, (short) 0xca33);
        ret |= FP16.less((short) 0xca33, (short) 0xca26);
    }
    long after = System.currentTimeMillis();
    System.out.println("Time of FP16.less (ms): " + (after - before));
    System.out.println(ret);
    return ret;
}
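
A note on the condition codes used in the generated code below: after
an Fcmp with an unordered (NaN) operand, AArch64 sets NZCV to 0011, so
the gt, ge, mi, and ls conditions all read false and Cset writes 0.
This matches the Java semantics, where any FP16 comparison involving a
NaN returns false. A small illustrative check (not part of this CL):

public static void nanCheck() {
    short nan = (short) 0x7e00;  // a half-precision NaN
    short one = (short) 0x3c00;  // FP16.toHalf(1.0f)
    // Every comparison involving NaN must return false.
    assert !FP16.less(nan, one);
    assert !FP16.lessEquals(nan, one);
    assert !FP16.greater(nan, one);
    assert !FP16.greaterEquals(nan, one);
}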

Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Change-Id: Id1a2c3e7328c82c798fcaf1fa74f5908a822cd0b
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index 9c80f32..36e68e4 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -107,6 +107,11 @@
   return vixl::aarch64::VRegister::GetSRegFromCode(location.reg());
 }
 
+inline vixl::aarch64::FPRegister HRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister()) << location;
+  return vixl::aarch64::FPRegister::GetHRegFromCode(location.reg());
+}
+
 inline vixl::aarch64::VRegister FPRegisterFrom(Location location, DataType::Type type) {
   DCHECK(DataType::IsFloatingPointType(type)) << type;
   return type == DataType::Type::kFloat64 ? DRegisterFrom(location) : SRegisterFrom(location);
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 542bd17..bdeb6a4 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -54,6 +54,7 @@
 using helpers::SRegisterFrom;
 using helpers::WRegisterFrom;
 using helpers::XRegisterFrom;
+using helpers::HRegisterFrom;
 using helpers::InputRegisterAt;
 using helpers::OutputRegister;
 
@@ -299,6 +300,14 @@
   locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
 }
 
+static void CreateIntIntToIntLocations(ArenaAllocator* allocator, HInvoke* invoke) {
+  LocationSummary* locations =
+      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
 static void GenReverseBytes(LocationSummary* locations,
                             DataType::Type type,
                             MacroAssembler* masm) {
@@ -3304,6 +3313,92 @@
   GenerateFP16Round(invoke, codegen_, masm, roundOp);
 }
 
+template<typename OP>
+void GenerateFP16Compare(HInvoke* invoke,
+                         CodeGeneratorARM64* codegen,
+                         MacroAssembler* masm,
+                         const OP compareOp) {
+  DCHECK(codegen->GetInstructionSetFeatures().HasFP16());
+  LocationSummary* locations = invoke->GetLocations();
+  Register out = WRegisterFrom(locations->Out());
+  VRegister half0 = HRegisterFrom(locations->GetTemp(0));
+  VRegister half1 = HRegisterFrom(locations->GetTemp(1));
+  __ Fmov(half0, WRegisterFrom(locations->InAt(0)));
+  __ Fmov(half1, WRegisterFrom(locations->InAt(1)));
+  compareOp(out, half0, half1);
+}
+
+static inline void GenerateFP16Compare(HInvoke* invoke,
+                                       CodeGeneratorARM64* codegen,
+                                       MacroAssembler* masm,
+                                       vixl::aarch64::Condition cond) {
+  auto compareOp = [masm, cond](const Register out, const VRegister& in0, const VRegister& in1) {
+    __ Fcmp(in0, in1);
+    __ Cset(out, cond);
+  };
+  GenerateFP16Compare(invoke, codegen, masm, compareOp);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitFP16Greater(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntIntToIntLocations(allocator_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Greater(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  GenerateFP16Compare(invoke, codegen_, masm, gt);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitFP16GreaterEquals(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntIntToIntLocations(allocator_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16GreaterEquals(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  GenerateFP16Compare(invoke, codegen_, masm, ge);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitFP16Less(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntIntToIntLocations(allocator_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Less(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  GenerateFP16Compare(invoke, codegen_, masm, mi);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitFP16LessEquals(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntIntToIntLocations(allocator_, invoke);
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+  invoke->GetLocations()->AddTemp(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16LessEquals(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  GenerateFP16Compare(invoke, codegen_, masm, ls);
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 1dfebdd..89e5203 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3075,6 +3075,10 @@
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Ceil)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Rint)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Greater)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16GreaterEquals)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Less)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16LessEquals)
 
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index ea9c591..537255f 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2712,6 +2712,10 @@
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16Ceil)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16Rint)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16Greater)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16GreaterEquals)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16Less)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16LessEquals)
 
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index fd93902..5920394 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2362,6 +2362,10 @@
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Ceil)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Rint)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Greater)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16GreaterEquals)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Less)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16LessEquals)
 
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 9d3cecb..6d7462e 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3086,6 +3086,10 @@
 UNIMPLEMENTED_INTRINSIC(X86, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(X86, FP16Ceil)
 UNIMPLEMENTED_INTRINSIC(X86, FP16Rint)
+UNIMPLEMENTED_INTRINSIC(X86, FP16Greater)
+UNIMPLEMENTED_INTRINSIC(X86, FP16GreaterEquals)
+UNIMPLEMENTED_INTRINSIC(X86, FP16Less)
+UNIMPLEMENTED_INTRINSIC(X86, FP16LessEquals)
 
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 1111a59..0f6b006 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2753,6 +2753,10 @@
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Ceil)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16Rint)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16Greater)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16GreaterEquals)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16Less)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16LessEquals)
 
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);