author    2019-10-25 17:37:33 +0100
committer 2019-11-12 20:58:17 +0000
commit    b9f02c2f8624bbf0746939e3b2735a1537a567b6
tree      ac6ad1f4125bd459a3b424cb5ff8b8029a3d5c7e
parent    f1b18facd1edd6c8652c42085c5432c878507c8e
ARM64: FP16.floor() intrinsic for ARMv8
This CL implements an intrinsic for the floor() method using ARMv8.2 FP16
instructions. The intrinsic calls a GenerateFP16Round template function,
which will also be used to implement other intrinsics such as ceil() and
rint().
This intrinsic implementation achieves bit-level compatibility with the
original Java implementation android.util.Half.floor().
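Since binary16 has only 65536 bit patterns, this compatibility can be
checked exhaustively. A minimal sketch of such a check (hypothetical
helper, not part of this CL; it skips NaN inputs, whose result bits the
new tests pin separately):

    public static void checkFloorCompatibility() {
        for (int i = 0; i <= 0xffff; i++) {
            short h = (short) i;
            float input = FP16.toFloat(h);
            if (Float.isNaN(input)) {
                continue;  // NaN payloads are asserted bit-for-bit in testFloor()
            }
            // floor of any finite binary16 value is exactly representable in
            // binary16, so a float comparison against Math.floor is exact.
            float expected = (float) Math.floor(input);
            float actual = FP16.toFloat(FP16.floor(h));
            if (expected != actual) {
                throw new AssertionError("floor mismatch for bits 0x" + Integer.toHexString(i));
            }
        }
    }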
Time in milliseconds to execute the code below on a Pixel 3:
- Java implementation android.util.Half.floor():
  - big cluster only: 18623
  - little cluster only: 60424
- arm64 intrinsic implementation:
  - big cluster only: 14213 (~24% faster)
  - little cluster only: 54398 (~10% faster)
Analysis of this function with simpleperf showed that only approximately
60-65% of the time is spent in libcore.util.FP16.floor, so the percentage
improvement from the intrinsic itself is likely higher than the numbers
stated above. Scaling the ~24% big-cluster improvement by that fraction
would put the speedup of floor() itself at roughly 37-40%.
Another reason the performance improvement from the intrinsic is lower
than expected is that, for values between -1 and 1 (abs < 0x3c00), the
Java implementation needs only a few instructions and should perform
almost as well as the intrinsic. In the benchmark function below, 46.8%
of the values tested fall between -1 and 1.
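For illustration, here is a minimal sketch of that fast path (hypothetical
code, not the verbatim libcore implementation; the constants are the
standard IEEE 754 binary16 encodings):

    static final int SIGN_MASK = 0x8000;  // binary16 sign bit
    static final int FP16_ONE = 0x3c00;   // binary16 encoding of 1.0

    // Assumes (h & 0x7fff) < FP16_ONE, i.e. |h| < 1 (so h is finite, not NaN).
    static short floorSmall(short h) {
        int bits = h & 0xffff;
        int result = bits & SIGN_MASK;    // floor(+0) == +0, floor(-0) == -0
        if (bits > SIGN_MASK) {           // negative and non-zero
            result |= FP16_ONE;           // floor is -1.0, encoded as 0xbc00
        }
        return (short) result;
    }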
public static short benchmarkFloor() {
    short ret = 0;
    long before = System.currentTimeMillis();
    for (int i = 0; i < 50000; i++) {
        for (short h = Short.MIN_VALUE; h < Short.MAX_VALUE; h++) {
            ret += FP16.floor(h);
        }
    }
    long after = System.currentTimeMillis();
    System.out.println("Time of FP16.floor (ms): " + (after - before));
    System.out.println(ret);
    return ret;
}
Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Change-Id: Iad1dd032d456af54932f13c5cf27228f8652a0b5
 compiler/optimizing/intrinsics_arm64.cc       | 32
 compiler/optimizing/intrinsics_arm_vixl.cc    |  1
 compiler/optimizing/intrinsics_mips.cc        |  1
 compiler/optimizing/intrinsics_mips64.cc      |  1
 compiler/optimizing/intrinsics_x86.cc         |  1
 compiler/optimizing/intrinsics_x86_64.cc      |  1
 runtime/hidden_api.h                          |  1
 runtime/image.cc                              |  2
 runtime/interpreter/interpreter_intrinsics.cc |  1
 runtime/intrinsics_list.h                     |  1
 test/580-fp16/src-art/Main.java               | 26
 11 files changed, 67 insertions(+), 1 deletion(-)
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index c48aaf5904..228255aca6 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3240,6 +3240,38 @@ void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) {
   __ Sxth(out, out);  // sign extend due to returning a short type.
 }
 
+template<typename OP>
+void GenerateFP16Round(HInvoke* invoke,
+                       CodeGeneratorARM64* const codegen_,
+                       MacroAssembler* masm,
+                       const OP roundOp) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
+  LocationSummary* locations = invoke->GetLocations();
+  UseScratchRegisterScope scratch_scope(masm);
+  Register out = WRegisterFrom(locations->Out());
+  VRegister half = scratch_scope.AcquireH();
+  __ Fmov(half, WRegisterFrom(locations->InAt(0)));
+  roundOp(half, half);
+  __ Fmov(out, half);
+  __ Sxth(out, out);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitFP16Floor(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  CreateIntToIntLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16Floor(HInvoke* invoke) {
+  MacroAssembler* masm = GetVIXLAssembler();
+  auto roundOp = [masm](const VRegister& out, const VRegister& in) {
+    __ Frintm(out, in);  // Round towards Minus infinity
+  };
+  GenerateFP16Round(invoke, codegen_, masm, roundOp);
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 74e861fa8e..8217980ad0 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3072,6 +3072,7 @@
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index b18bbdde2d..0bab2a0b17 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2709,6 +2709,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index e4627db33f..6ed1133c90 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2359,6 +2359,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 95aa4c0eaa..7bc9b63d0f 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3083,6 +3083,7 @@
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(X86, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 8dbc0d3062..e11208c818 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2750,6 +2750,7 @@
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16Floor)
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h
index 2ef3522eee..ec24c42135 100644
--- a/runtime/hidden_api.h
+++ b/runtime/hidden_api.h
@@ -357,6 +357,7 @@ ALWAYS_INLINE inline uint32_t GetRuntimeFlags(ArtMethod* method)
     case Intrinsics::kVarHandleWeakCompareAndSetRelease:
       return 0u;
     case Intrinsics::kUnsafeGetLong:
+    case Intrinsics::kFP16Floor:
     case Intrinsics::kFP16ToFloat:
     case Intrinsics::kFP16ToHalf:
       return kAccCorePlatformApi;
diff --git a/runtime/image.cc b/runtime/image.cc
index 06ba946549..2b4099f258 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,7 +29,7 @@ namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '0', '\0' };  // Chained checksums.
+const uint8_t ImageHeader::kImageVersion[] = { '0', '8', '1', '\0' };  // FP16Floor intrinsic
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 3759225b91..4ddf9bb465 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -575,6 +575,7 @@ bool MterpHandleIntrinsic(ShadowFrame* shadow_frame,
     UNIMPLEMENTED_CASE(CRC32UpdateByteBuffer /* (IJII)I */)
     UNIMPLEMENTED_CASE(FP16ToFloat /* (S)F */)
     UNIMPLEMENTED_CASE(FP16ToHalf /* (F)S */)
+    UNIMPLEMENTED_CASE(FP16Floor /* (S)S */)
     INTRINSIC_CASE(VarHandleFullFence)
     INTRINSIC_CASE(VarHandleAcquireFence)
     INTRINSIC_CASE(VarHandleReleaseFence)
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index bb41ca732d..045c808356 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -165,6 +165,7 @@
   V(MemoryPokeIntNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeIntNative", "(JI)V") \
   V(MemoryPokeLongNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeLongNative", "(JJ)V") \
   V(MemoryPokeShortNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeShortNative", "(JS)V") \
+  V(FP16Floor, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "floor", "(S)S") \
   V(FP16ToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toFloat", "(S)F") \
   V(FP16ToHalf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toHalf", "(F)S") \
   V(StringCharAt, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "charAt", "(I)C") \
diff --git a/test/580-fp16/src-art/Main.java b/test/580-fp16/src-art/Main.java
index 4aa8d55faa..815c9f52e8 100644
--- a/test/580-fp16/src-art/Main.java
+++ b/test/580-fp16/src-art/Main.java
@@ -112,9 +112,35 @@ public class Main {
         assertEquals(0xffffe000, TestFP16ToFloatRawIntBits((short)(0xffff))); // QNaN->QNaN
     }
 
+    public static void testFloor() {
+        // These tests have been taken from the cts HalfTest
+        assertEquals(FP16.POSITIVE_INFINITY, FP16.floor(FP16.POSITIVE_INFINITY));
+        assertEquals(FP16.NEGATIVE_INFINITY, FP16.floor(FP16.NEGATIVE_INFINITY));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor(FP16.POSITIVE_ZERO));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.floor(FP16.NEGATIVE_ZERO));
+        assertEquals(FP16.NaN, FP16.floor(FP16.NaN));
+        assertEquals(FP16.LOWEST_VALUE, FP16.floor(FP16.LOWEST_VALUE));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor(FP16.MIN_NORMAL));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor((short) 0x3ff));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor(FP16.toHalf(0.2f)));
+        assertEquals(-1.0f, FP16.toFloat(FP16.floor(FP16.toHalf(-0.2f))));
+        assertEquals(-1.0f, FP16.toFloat(FP16.floor(FP16.toHalf(-0.7f))));
+        assertEquals(FP16.POSITIVE_ZERO, FP16.floor(FP16.toHalf(0.7f)));
+        assertEquals(124.0f, FP16.toFloat(FP16.floor(FP16.toHalf(124.7f))));
+        assertEquals(-125.0f, FP16.toFloat(FP16.floor(FP16.toHalf(-124.7f))));
+        assertEquals(124.0f, FP16.toFloat(FP16.floor(FP16.toHalf(124.2f))));
+        assertEquals(-125.0f, FP16.toFloat(FP16.floor(FP16.toHalf(-124.2f))));
+        // floor for NaN values
+        assertEquals((short) 0x7e01, FP16.floor((short) 0x7c01));
+        assertEquals((short) 0x7f00, FP16.floor((short) 0x7d00));
+        assertEquals((short) 0xfe01, FP16.floor((short) 0xfc01));
+        assertEquals((short) 0xff00, FP16.floor((short) 0xfd00));
+    }
+
     public static void main(String args[]) {
         testHalfToFloatToHalfConversions();
         testToHalf();
         testToFloat();
+        testFloor();
     }
 }