Add Math.fma (double & float versions) intrinsics for arm64

The intrinsic implementation is ~500 times faster than the pure-Java
fallback, which uses BigDecimal arithmetic.
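
For context, a minimal sketch (not part of this change) of the semantics
the intrinsic implements: Math.fma(a, b, c) computes a * b + c with a
single rounding, which a separate multiply and add cannot reproduce:

  double x = 0.1;
  double y = 10.0;
  double p = x * y;                   // rounded product, exactly 1.0
  double naive = (x * y) - p;         // 0.0: rounding error of the product is lost
  double fused = Math.fma(x, y, -p);  // ~5.55e-17: exact residual, rounded once

On arm64 this maps onto a single FMADD instruction, which is what the new
GenerateMathFma helper emits through the macro assembler.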

Bug: 199373643
Test: ./art/test/testrunner/testrunner.py --target \
      --optimizing --64 -t 082-inline-execute
Change-Id: I50eae88b332ba9338b0a59fecad7d2158a97ffbb
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index a1ba873..92776f1 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -2259,6 +2259,22 @@
   locations->SetOut(calling_convention.GetReturnLocation(invoke->GetType()));
 }
 
+static void CreateFPFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
+  DCHECK_EQ(invoke->GetNumberOfArguments(), 3U);
+  DCHECK(DataType::IsFloatingPointType(invoke->InputAt(0)->GetType()));
+  DCHECK(DataType::IsFloatingPointType(invoke->InputAt(1)->GetType()));
+  DCHECK(DataType::IsFloatingPointType(invoke->InputAt(2)->GetType()));
+  DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
+
+  LocationSummary* const locations =
+      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
+
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetInAt(1, Location::RequiresFpuRegister());
+  locations->SetInAt(2, Location::RequiresFpuRegister());
+  locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+}
+
 static void GenFPToFPCall(HInvoke* invoke,
                           CodeGeneratorARM64* codegen,
                           QuickEntrypointEnum entry) {
@@ -4179,6 +4195,33 @@
   __ Smulh(out, x, y);
 }
 
+static void GenerateMathFma(HInvoke* invoke, CodeGeneratorARM64* codegen) {
+  MacroAssembler* masm = codegen->GetVIXLAssembler();
+
+  VRegister n = helpers::InputFPRegisterAt(invoke, 0);
+  VRegister m = helpers::InputFPRegisterAt(invoke, 1);
+  VRegister a = helpers::InputFPRegisterAt(invoke, 2);
+  VRegister out = helpers::OutputFPRegister(invoke);
+
+  __ Fmadd(out, n, m, a);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathFmaDouble(HInvoke* invoke) {
+  CreateFPFPFPToFPCallLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathFmaDouble(HInvoke* invoke) {
+  GenerateMathFma(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderARM64::VisitMathFmaFloat(HInvoke* invoke) {
+  CreateFPFPFPToFPCallLocations(allocator_, invoke);
+}
+
+void IntrinsicCodeGeneratorARM64::VisitMathFmaFloat(HInvoke* invoke) {
+  GenerateMathFma(invoke, codegen_);
+}
+
 class VarHandleSlowPathARM64 : public IntrinsicSlowPathARM64 {
  public:
   VarHandleSlowPathARM64(HInvoke* invoke, std::memory_order order)
@@ -5597,9 +5640,6 @@
 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderToString);
 
 // 1.8.
-UNIMPLEMENTED_INTRINSIC(ARM64, MathFmaDouble)
-UNIMPLEMENTED_INTRINSIC(ARM64, MathFmaFloat)
-
 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddInt)
 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddLong)
 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndSetInt)
diff --git a/runtime/image.cc b/runtime/image.cc
index e62380a..b39ffe5 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -30,7 +30,7 @@
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
 // Last change: Math.fma(double, double, double) intrinsic.
-const uint8_t ImageHeader::kImageVersion[] = { '1', '0', '5', '\0' };
+const uint8_t ImageHeader::kImageVersion[] = { '1', '0', '6', '\0' };
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,