ARM64: toHalf() intrinsic for ARMv8

This CL implements an intrinsic for the toHalf() method with ARMv8.2 FP16 instructions. This intrinsic implementation achieves bit-level compatibility with the original Java implementation android.util.Half.toHalf(). The time required to execute the code below on Pixel3: - Java implementation android.util.Half.toHalf(): - big cluster only: 2136ms - little cluster only: 6442ms - arm64 Intrinsic implementation: - big cluster only: 1347ms (~37% faster) - little cluster only: 4937ms (~23% faster) int benchmarkToHalf() { int result = 0; // 5.9605E-8 is the smallest positive subnormal number that can be // represented by FP16. This is 0x33800032 in float bits. int raw_input = 0x33800032; long before = 0; long after = 0; before = System.currentTimeMillis(); do { float input = Float.intBitsToFloat(raw_input); short output = FP16.toHalf(input); result += output; } while (++raw_input != 0x477fff00); // 65535 is the max possible integer that can be represented by FP16. // This is 0x477fff00 in float bits. after = System.currentTimeMillis(); System.out.println("Time of FP16.toHalf (ms): " + (after - before)); return result; } Test: 580-fp16 Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac Test: test-art-host, test-art-target Change-Id: I69b152682390e5ffa5b3fdca60b496261191655d
author: Usama Arif <usama.arif@linaro.org> 2019-10-15 15:36:51 +0100
committer: Vladimir Marko <vmarko@google.com> 2019-10-23 12:04:00 +0000
commit: 2cc0c0f4b6a76dfb1ad205cfd79efe7efe2904d6 (patch)
tree: 5ff5f5f7c5c25f441a36506a84988fa95d2dbd46
parent: b8c884e5f22390386b202459ab55ef3046631e42 (diff)
11 files changed, 80 insertions, 9 deletions
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 185d487dff..6a666c9eef 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3216,6 +3216,30 @@ void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
   __ Fcvt(out, half);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16ToHalf(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {  // Locations are built only with FP16.
+    return;  // No LocationSummary is created for this invoke.
+  }
+
+  LocationSummary* locations = new (allocator_) LocationSummary(invoke,
+                                                                LocationSummary::kNoCall,
+                                                                kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());  // Float argument in an FP register.
+  locations->SetOut(Location::RequiresRegister());  // Result in a general-purpose register.
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16ToHalf(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());  // Emits the float -> half conversion.
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope scratch_scope(masm);  // Supplies the temporary H register below.
+  FPRegister in = SRegisterFrom(invoke->GetLocations()->InAt(0));  // Float input as an S register.
+  FPRegister half = scratch_scope.AcquireH();  // Scratch half-precision register.
+  Register out = WRegisterFrom(invoke->GetLocations()->Out());  // Result as a W register.
+  __ Fcvt(half, in);  // Convert the single-precision input to half precision.
+  __ Fmov(out, half);  // Move the half-precision bits into the output register.
+  __ Sxth(out, out);  // sign extend due to returning a short type.
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 65f388837d..74e861fa8e 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3071,6 +3071,7 @@ UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index f71d281d5a..b18bbdde2d 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2708,6 +2708,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index 7b87b03b50..e4627db33f 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2358,6 +2358,7 @@ UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 5a622ca6d1..95aa4c0eaa 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3082,6 +3082,7 @@ UNIMPLEMENTED_INTRINSIC(X86, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(X86, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index cbf66069fe..8dbc0d3062 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2749,6 +2749,7 @@ UNIMPLEMENTED_INTRINSIC(X86_64, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer)
 UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToHalf)
 
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h
index a21225b376..2ef3522eee 100644
--- a/runtime/hidden_api.h
+++ b/runtime/hidden_api.h
@@ -358,6 +358,7 @@ ALWAYS_INLINE inline uint32_t GetRuntimeFlags(ArtMethod* method)
         return 0u;
       case Intrinsics::kUnsafeGetLong:
       case Intrinsics::kFP16ToFloat:
+      case Intrinsics::kFP16ToHalf:
         return kAccCorePlatformApi;
       default:
         // Remaining intrinsics are public API. We DCHECK that in SetIntrinsic().
diff --git a/runtime/image.cc b/runtime/image.cc
index 11fac590b0..256b957c60 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,7 +29,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '7', '8', '\0' };  // FP16ToFloat intrinsic
+const uint8_t ImageHeader::kImageVersion[] = { '0', '7', '9', '\0' };  // FP16ToHalf intrinsic
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 6b2d989cd3..3759225b91 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -574,6 +574,7 @@ bool MterpHandleIntrinsic(ShadowFrame* shadow_frame,
     UNIMPLEMENTED_CASE(CRC32UpdateBytes /* (I[BII)I */)
     UNIMPLEMENTED_CASE(CRC32UpdateByteBuffer /* (IJII)I */)
     UNIMPLEMENTED_CASE(FP16ToFloat /* (S)F */)
+    UNIMPLEMENTED_CASE(FP16ToHalf /* (F)S */)
     INTRINSIC_CASE(VarHandleFullFence)
     INTRINSIC_CASE(VarHandleAcquireFence)
     INTRINSIC_CASE(VarHandleReleaseFence)
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index 15ae309624..bb41ca732d 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -166,6 +166,7 @@
   V(MemoryPokeLongNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeLongNative", "(JJ)V") \
   V(MemoryPokeShortNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeShortNative", "(JS)V") \
   V(FP16ToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toFloat", "(S)F") \
+  V(FP16ToHalf, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toHalf", "(F)S") \
   V(StringCharAt, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "charAt", "(I)C") \
   V(StringCompareTo, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "compareTo", "(Ljava/lang/String;)I") \
   V(StringEquals, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "equals", "(Ljava/lang/Object;)Z") \
diff --git a/test/580-fp16/src-art/Main.java b/test/580-fp16/src-art/Main.java
index 798b52dd34..a89e1000d0 100644
--- a/test/580-fp16/src-art/Main.java
+++ b/test/580-fp16/src-art/Main.java
@@ -28,15 +28,14 @@ public class Main {
         return Float.floatToRawIntBits(f);
     }
 
-    public static void assertEquals(int expected, int actual) {
-        if (expected != actual) {
-            throw new Error("Expected: " + expected + ", found: " + actual);
+    public static void assertEquals(short expected, short calculated) {
+        if (expected != calculated) {
+            throw new Error("Expected: " + expected + ", Calculated: " + calculated);
         }
     }
-
-    public static void assertEquals(float expected, float actual) {
-        if (expected != actual) {
-            throw new Error("Expected: " + expected + ", found: " + actual);
+    public static void assertEquals(float expected, float calculated) {
+        if (expected != calculated) {
+            throw new Error("Expected: " + expected + ", Calculated: " + calculated);
         }
     }
 
@@ -47,8 +46,48 @@ public class Main {
                 // NaN inputs are tested below.
                 continue;
             }
-            assertEquals(FP16.toHalf(FP16.toFloat(h)), h);
+            assertEquals(h, FP16.toHalf(FP16.toFloat(h)));
         }
+
+        // These asserts check some known values and edge cases for FP16.toHalf
+        // and have been inspired by the cts HalfTest.
+        // Zeroes, NaN and infinities
+        assertEquals(FP16.POSITIVE_ZERO, FP16.toHalf(0.0f));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.toHalf(-0.0f));
+        assertEquals(FP16.NaN, FP16.toHalf(Float.NaN));
+        assertEquals(FP16.POSITIVE_INFINITY, FP16.toHalf(Float.POSITIVE_INFINITY));
+        assertEquals(FP16.NEGATIVE_INFINITY, FP16.toHalf(Float.NEGATIVE_INFINITY));
+        // Known values
+        assertEquals((short) 0x3c01, FP16.toHalf(1.0009765625f));
+        assertEquals((short) 0xc000, FP16.toHalf(-2.0f));
+        assertEquals((short) 0x0400, FP16.toHalf(6.10352e-5f));
+        assertEquals((short) 0x7bff, FP16.toHalf(65504.0f));
+        assertEquals((short) 0x3555, FP16.toHalf(1.0f / 3.0f));
+        // Subnormals
+        assertEquals((short) 0x03ff, FP16.toHalf(6.09756e-5f));
+        assertEquals(FP16.MIN_VALUE, FP16.toHalf(5.96046e-8f));
+        assertEquals((short) 0x83ff, FP16.toHalf(-6.09756e-5f));
+        assertEquals((short) 0x8001, FP16.toHalf(-5.96046e-8f));
+        // Subnormals (flushed to +/-0)
+        assertEquals(FP16.POSITIVE_ZERO, FP16.toHalf(5.96046e-9f));
+        assertEquals(FP16.NEGATIVE_ZERO, FP16.toHalf(-5.96046e-9f));
+        // Test for values that overflow the mantissa bits into exp bits
+        assertEquals(0x1000, FP16.toHalf(Float.intBitsToFloat(0x39fff000)));
+        assertEquals(0x0400, FP16.toHalf(Float.intBitsToFloat(0x387fe000)));
+        // Floats with absolute value above +/-65519 are rounded to +/-inf
+        // when using round-to-even
+        assertEquals(0x7bff, FP16.toHalf(65519.0f));
+        assertEquals(0x7bff, FP16.toHalf(65519.9f));
+        assertEquals(FP16.POSITIVE_INFINITY, FP16.toHalf(65520.0f));
+        assertEquals(FP16.NEGATIVE_INFINITY, FP16.toHalf(-65520.0f));
+        // Check if numbers are rounded to nearest even when they
+        // cannot be accurately represented by Half
+        assertEquals(0x6800, FP16.toHalf(2049.0f));
+        assertEquals(0x6c00, FP16.toHalf(4098.0f));
+        assertEquals(0x7000, FP16.toHalf(8196.0f));
+        assertEquals(0x7400, FP16.toHalf(16392.0f));
+        assertEquals(0x7800, FP16.toHalf(32784.0f));
+
         // FP16 SNaN/QNaN inputs to float
         // The most significant bit of mantissa:
         //                 V
author	Usama Arif <usama.arif@linaro.org>	2019-10-15 15:36:51 +0100
committer	Vladimir Marko <vmarko@google.com>	2019-10-23 12:04:00 +0000
commit	2cc0c0f4b6a76dfb1ad205cfd79efe7efe2904d6 (patch)
tree	5ff5f5f7c5c25f441a36506a84988fa95d2dbd46
parent	b8c884e5f22390386b202459ab55ef3046631e42 (diff)