From 2cc0c0f4b6a76dfb1ad205cfd79efe7efe2904d6 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Tue, 15 Oct 2019 15:36:51 +0100 Subject: ARM64: toHalf() intrinsic for ARMv8 This CL implements an intrinsic for the toHalf() method with ARMv8.2 FP16 instructions. This intrinsic implementation achieves bit-level compatibility with the original Java implementation android.util.Half.toHalf(). The time required to execute the code below on Pixel 3: - Java implementation android.util.Half.toHalf(): - big cluster only: 2136ms - little cluster only: 6442ms - arm64 intrinsic implementation: - big cluster only: 1347ms (~37% faster) - little cluster only: 4937ms (~23% faster) int benchmarkToHalf() { int result = 0; // 5.9605E-8 is the smallest positive subnormal number that can be // represented by FP16. This is 0x33800032 in float bits. int raw_input = 0x33800032; long before = 0; long after = 0; before = System.currentTimeMillis(); do { float input = Float.intBitsToFloat(raw_input); short output = FP16.toHalf(input); result += output; } while (++raw_input != 0x477fff00); // 65535 is the max possible integer that can be represented by FP16. // This is 0x477fff00 in float bits. 
after = System.currentTimeMillis(); System.out.println("Time of FP16.toHalf (ms): " + (after - before)); return result; } Test: 580-fp16 Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac Test: test-art-host, test-art-target Change-Id: I69b152682390e5ffa5b3fdca60b496261191655d --- compiler/optimizing/intrinsics_arm64.cc | 24 ++++++++++++++++++++++++ compiler/optimizing/intrinsics_arm_vixl.cc | 1 + compiler/optimizing/intrinsics_mips.cc | 1 + compiler/optimizing/intrinsics_mips64.cc | 1 + compiler/optimizing/intrinsics_x86.cc | 1 + compiler/optimizing/intrinsics_x86_64.cc | 1 + 6 files changed, 29 insertions(+) (limited to 'compiler/optimizing') diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 185d487dff..6a666c9eef 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -3216,6 +3216,30 @@ void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) { __ Fcvt(out, half); } +void IntrinsicLocationsBuilderARM64::VisitFP16ToHalf(HInvoke* invoke) { + if (!codegen_->GetInstructionSetFeatures().HasFP16()) { + return; + } + + LocationSummary* locations = new (allocator_) LocationSummary(invoke, + LocationSummary::kNoCall, + kIntrinsified); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); +} + +void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) { + DCHECK(codegen_->GetInstructionSetFeatures().HasFP16()); + MacroAssembler* masm = GetVIXLAssembler(); + UseScratchRegisterScope scratch_scope(masm); + FPRegister in = SRegisterFrom(invoke->GetLocations()->InAt(0)); + FPRegister half = scratch_scope.AcquireH(); + Register out = WRegisterFrom(invoke->GetLocations()->Out()); + __ Fcvt(half, in); + __ Fmov(out, half); + __ Sxth(out, out); // sign extend due to returning a short type. 
+} + UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent) UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf); diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc index 65f388837d..74e861fa8e 100644 --- a/compiler/optimizing/intrinsics_arm_vixl.cc +++ b/compiler/optimizing/intrinsics_arm_vixl.cc @@ -3071,6 +3071,7 @@ UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32Update) UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateBytes) UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateByteBuffer) UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat) +UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter); diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc index f71d281d5a..b18bbdde2d 100644 --- a/compiler/optimizing/intrinsics_mips.cc +++ b/compiler/optimizing/intrinsics_mips.cc @@ -2708,6 +2708,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS, CRC32Update) UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateBytes) UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateByteBuffer) UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat) +UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter); diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc index 7b87b03b50..e4627db33f 100644 --- a/compiler/optimizing/intrinsics_mips64.cc +++ b/compiler/optimizing/intrinsics_mips64.cc @@ -2358,6 +2358,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32Update) UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateBytes) UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateByteBuffer) UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat) +UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter); diff --git a/compiler/optimizing/intrinsics_x86.cc 
b/compiler/optimizing/intrinsics_x86.cc index 5a622ca6d1..95aa4c0eaa 100644 --- a/compiler/optimizing/intrinsics_x86.cc +++ b/compiler/optimizing/intrinsics_x86.cc @@ -3082,6 +3082,7 @@ UNIMPLEMENTED_INTRINSIC(X86, CRC32Update) UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateBytes) UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer) UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat) +UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter); diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc index cbf66069fe..8dbc0d3062 100644 --- a/compiler/optimizing/intrinsics_x86_64.cc +++ b/compiler/optimizing/intrinsics_x86_64.cc @@ -2749,6 +2749,7 @@ UNIMPLEMENTED_INTRINSIC(X86_64, CRC32Update) UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateBytes) UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer) UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat) +UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf) UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf); UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter); -- cgit v1.2.3-59-g8ed1b