ARM64: toFloat() intrinsics with ARMv8 FP16.

This CL intrinsifies the libcore.util.FP16.toFloat() method with
ARMv8.2 FP16 instructions.

This CL depends on the Android framework and libcore changes that move
the FP16 implementation into libcore.

Tested with a local micro benchmark on Pixel 3: compared to the original
android.util.Half.toFloat() Java implementation, this intrinsic is
50% faster.

In real-life use, the FP16 toFloat() intrinsic can help accelerate
ColorLong ARGB decoding in the Android framework.

This intrinsic implementation achieves bit-level compatibility with the
original Java implementation android.util.Half.toFloat().

Test: 580-fp16
Test: art/test/testrunner/run_build_test_target.py -j80 art-test-javac
Test: test-art-host, test-art-target

Change-Id: I059c69747067b84f2c532465e32a1dcd3c25269f
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index da1874e..1fab712 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -3169,6 +3169,29 @@
   GenerateCodeForCalculationCRC32ValueOfBytes(masm, crc, ptr, length, out);
 }
 
+void IntrinsicLocationsBuilderARM64::VisitFP16ToFloat(HInvoke* invoke) {
+  if (!codegen_->GetInstructionSetFeatures().HasFP16()) {
+    return;
+  }
+
+  LocationSummary* locations = new (allocator_) LocationSummary(invoke,
+                                                                LocationSummary::kNoCall,
+                                                                kIntrinsified);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetOut(Location::RequiresFpuRegister());
+}
+
+void IntrinsicCodeGeneratorARM64::VisitFP16ToFloat(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasFP16());
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope scratch_scope(masm);
+  Register bits = InputRegisterAt(invoke, 0);
+  FPRegister out = SRegisterFrom(invoke->GetLocations()->Out());
+  FPRegister half = scratch_scope.AcquireH();
+  __ Fmov(half, bits);  // ARMv8.2
+  __ Fcvt(out, half);
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, ReferenceGetReferent)
 
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index a497cca..65f3888 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3070,6 +3070,7 @@
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, FP16ToFloat)
 
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 9e5316d..f71d281 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2707,6 +2707,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(MIPS, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(MIPS, FP16ToFloat)
 
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index d8ec892..7b87b03 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -2357,6 +2357,7 @@
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(MIPS64, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(MIPS64, FP16ToFloat)
 
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(MIPS64, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index f8d3bf5..5a622ca 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -3081,6 +3081,7 @@
 UNIMPLEMENTED_INTRINSIC(X86, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(X86, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(X86, FP16ToFloat)
 
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86, StringStringIndexOfAfter);
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index f2a6c53..cbf6606 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -2748,6 +2748,7 @@
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32Update)
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateBytes)
 UNIMPLEMENTED_INTRINSIC(X86_64, CRC32UpdateByteBuffer)
+UNIMPLEMENTED_INTRINSIC(X86_64, FP16ToFloat)
 
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(X86_64, StringStringIndexOfAfter);
diff --git a/runtime/hidden_api.h b/runtime/hidden_api.h
index 9d7c646..a21225b 100644
--- a/runtime/hidden_api.h
+++ b/runtime/hidden_api.h
@@ -357,6 +357,7 @@
       case Intrinsics::kVarHandleWeakCompareAndSetRelease:
         return 0u;
       case Intrinsics::kUnsafeGetLong:
+      case Intrinsics::kFP16ToFloat:
         return kAccCorePlatformApi;
       default:
         // Remaining intrinsics are public API. We DCHECK that in SetIntrinsic().
diff --git a/runtime/image.cc b/runtime/image.cc
index 76efd5f..11fac59 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,7 +29,7 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-const uint8_t ImageHeader::kImageVersion[] = { '0', '7', '7', '\0' };  // Use boot image sentinel.
+const uint8_t ImageHeader::kImageVersion[] = { '0', '7', '8', '\0' };  // FP16ToFloat intrinsic
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 81c06bc..6b2d989 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -573,6 +573,7 @@
     UNIMPLEMENTED_CASE(CRC32Update /* (II)I */)
     UNIMPLEMENTED_CASE(CRC32UpdateBytes /* (I[BII)I */)
     UNIMPLEMENTED_CASE(CRC32UpdateByteBuffer /* (IJII)I */)
+    UNIMPLEMENTED_CASE(FP16ToFloat /* (S)F */)
     INTRINSIC_CASE(VarHandleFullFence)
     INTRINSIC_CASE(VarHandleAcquireFence)
     INTRINSIC_CASE(VarHandleReleaseFence)
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index 7cd8b0b..15ae309 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -165,6 +165,7 @@
   V(MemoryPokeIntNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeIntNative", "(JI)V") \
   V(MemoryPokeLongNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeLongNative", "(JJ)V") \
   V(MemoryPokeShortNative, kStatic, kNeedsEnvironmentOrCache, kWriteSideEffects, kCanThrow, "Llibcore/io/Memory;", "pokeShortNative", "(JS)V") \
+  V(FP16ToFloat, kStatic, kNeedsEnvironmentOrCache, kNoSideEffects, kNoThrow, "Llibcore/util/FP16;", "toFloat", "(S)F") \
   V(StringCharAt, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "charAt", "(I)C") \
   V(StringCompareTo, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "compareTo", "(Ljava/lang/String;)I") \
   V(StringEquals, kVirtual, kNeedsEnvironmentOrCache, kReadSideEffects, kCanThrow, "Ljava/lang/String;", "equals", "(Ljava/lang/Object;)Z") \
diff --git a/test/580-fp16/expected.txt b/test/580-fp16/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/580-fp16/expected.txt
diff --git a/test/580-fp16/info.txt b/test/580-fp16/info.txt
new file mode 100644
index 0000000..547ae22
--- /dev/null
+++ b/test/580-fp16/info.txt
@@ -0,0 +1 @@
+This test case is used to test libcore.util.FP16.
diff --git a/test/580-fp16/src-art/Main.java b/test/580-fp16/src-art/Main.java
new file mode 100644
index 0000000..798b52d
--- /dev/null
+++ b/test/580-fp16/src-art/Main.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import libcore.util.FP16;
+
+public class Main {
+    public Main() {
+    }
+
+    public static int TestFP16ToFloatRawIntBits(short half) {
+        float f = FP16.toFloat(half);
+        // This test needs to check the integer representation of the actual
+        // float NaN values, so floatToRawIntBits() is used instead of
+        // floatToIntBits(), which collapses all NaNs to one canonical value.
+        return Float.floatToRawIntBits(f);
+    }
+
+    public static void assertEquals(int expected, int actual) {
+        if (expected != actual) {
+            throw new Error("Expected: " + expected + ", found: " + actual);
+        }
+    }
+
+    public static void assertEquals(float expected, float actual) {
+        if (expected != actual) {
+            throw new Error("Expected: " + expected + ", found: " + actual);
+        }
+    }
+
+    public static void main(String args[]) {
+        // Test FP16 to float
+        for (short h = Short.MIN_VALUE; h < Short.MAX_VALUE; h++) {
+            if (FP16.isNaN(h)) {
+                // NaN inputs are tested below.
+                continue;
+            }
+            assertEquals(FP16.toHalf(FP16.toFloat(h)), h);
+        }
+        // FP16 SNaN/QNaN inputs to float
+        // The most significant bit of mantissa:
+        //                 V
+        // 0xfc01: 1 11111 0000000001 (signaling NaN)
+        // 0xfdff: 1 11111 0111111111 (signaling NaN)
+        // 0xfe00: 1 11111 1000000000 (quiet NaN)
+        // 0xffff: 1 11111 1111111111 (quiet NaN)
+        // This test is inspired by Java implementation of android.util.Half.toFloat(),
+        // where the implementation performs SNaN->QNaN conversion.
+        assert(Float.isNaN(FP16.toFloat((short)0xfc01)));
+        assert(Float.isNaN(FP16.toFloat((short)0xfdff)));
+        assert(Float.isNaN(FP16.toFloat((short)0xfe00)));
+        assert(Float.isNaN(FP16.toFloat((short)0xffff)));
+        assertEquals(0xffc02000, TestFP16ToFloatRawIntBits((short)(0xfc01)));  // SNaN->QNaN
+        assertEquals(0xffffe000, TestFP16ToFloatRawIntBits((short)(0xfdff)));  // SNaN->QNaN
+        assertEquals(0xffc00000, TestFP16ToFloatRawIntBits((short)(0xfe00)));  // QNaN->QNaN
+        assertEquals(0xffffe000, TestFP16ToFloatRawIntBits((short)(0xffff)));  // QNaN->QNaN
+    }
+}
diff --git a/test/knownfailures.json b/test/knownfailures.json
index 1d6e36d..4f119a8 100644
--- a/test/knownfailures.json
+++ b/test/knownfailures.json
@@ -950,6 +950,7 @@
           "574-irreducible-and-constant-area",
           "575-checker-string-init-alias",
           "580-checker-string-fact-intrinsics",
+          "580-fp16",
           "585-inline-unresolved",
           "586-checker-null-array-get",
           "587-inline-class-error",