Support Math.fma Intrinsic for x86 and x86_64

The intrinsic implementation is about 2287x faster than the baseline:
Time for base:  188.25s
Time for patch: 0.0822s
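
For context, a minimal sketch of the kind of loop such numbers come from;
the class name, iteration count and constants are illustrative assumptions,
not the actual benchmark:

    // Hypothetical microbenchmark; not the benchmark used above.
    public class FmaBench {
      public static void main(String[] args) {
        double acc = 0.5;
        long start = System.nanoTime();
        for (long i = 0; i < 1_000_000_000L; i++) {
          // Should compile to a single vfmadd213sd once the intrinsic applies.
          acc = Math.fma(0.999999999, acc, 1.0e-9);
        }
        long elapsed = System.nanoTime() - start;
        System.out.println("acc=" + acc + " time=" + elapsed / 1e9 + "s");
      }
    }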

Test: ./run-test --host --64 --optimizing 082-inline-execute
Test: ./run-test --host --64 --optimizing --instruction-set-features sse4.1 082-inline-execute
Test: ./run-test --host --64 --optimizing --instruction-set-features sse4.1,avx2 082-inline-execute
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
Change-Id: I68b96a35f41f3bb23d7e26d87fee1dd2a5ebf6a6
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 7924e56..b5677e5 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1087,7 +1087,8 @@
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetAllocator(), this),
-      assembler_(graph->GetAllocator()),
+      assembler_(graph->GetAllocator(),
+                 compiler_options.GetInstructionSetFeatures()->AsX86InstructionSetFeatures()),
       boot_image_method_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
       method_bss_entry_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
       boot_image_type_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index f737d06..a2faa43 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1481,7 +1481,8 @@
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetAllocator(), this),
-      assembler_(graph->GetAllocator()),
+      assembler_(graph->GetAllocator(),
+                 compiler_options.GetInstructionSetFeatures()->AsX86_64InstructionSetFeatures()),
       constant_area_start_(0),
       boot_image_method_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
       method_bss_entry_patches_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)),
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index a7c5639..a1ba873 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -5597,6 +5597,9 @@
 UNIMPLEMENTED_INTRINSIC(ARM64, StringBuilderToString);
 
 // 1.8.
+UNIMPLEMENTED_INTRINSIC(ARM64, MathFmaDouble)
+UNIMPLEMENTED_INTRINSIC(ARM64, MathFmaFloat)
+
 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddInt)
 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndAddLong)
 UNIMPLEMENTED_INTRINSIC(ARM64, UnsafeGetAndSetInt)
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 2b3d48b..fe4c27e 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -5526,6 +5526,9 @@
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, StringBuilderToString);
 
 // 1.8.
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathFmaDouble)
+UNIMPLEMENTED_INTRINSIC(ARMVIXL, MathFmaFloat)
+
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeGetAndAddInt)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeGetAndAddLong)
 UNIMPLEMENTED_INTRINSIC(ARMVIXL, UnsafeGetAndSetInt)
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 5e55a1b..6015a6d 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -728,6 +728,16 @@
   locations->SetOut(Location::FpuRegisterLocation(XMM0));
 }
 
+static void CreateFPFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
+  DCHECK_EQ(invoke->GetNumberOfArguments(), 3U);
+  LocationSummary* locations =
+      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetInAt(1, Location::RequiresFpuRegister());
+  locations->SetInAt(2, Location::RequiresFpuRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+}
+
 void IntrinsicLocationsBuilderX86::VisitMathAtan2(HInvoke* invoke) {
   CreateFPFPToFPCallLocations(allocator_, invoke);
 }
@@ -4733,6 +4744,44 @@
   GenerateVarHandleGetAndBitwiseOp(invoke, codegen_);
 }
 
+static void GenerateMathFma(HInvoke* invoke, CodeGeneratorX86* codegen) {
+  DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
+  LocationSummary* locations = invoke->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  X86Assembler* assembler = codegen->GetAssembler();
+  XmmRegister left = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister right = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister accumulator = locations->InAt(2).AsFpuRegister<XmmRegister>();
+  if (invoke->GetType() == DataType::Type::kFloat32) {
+    __ vfmadd213ss(left, right, accumulator);
+  } else {
+    DCHECK_EQ(invoke->GetType(), DataType::Type::kFloat64);
+    __ vfmadd213sd(left, right, accumulator);
+  }
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathFmaDouble(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasAVX2());
+  GenerateMathFma(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathFmaDouble(HInvoke* invoke) {
+  if (codegen_->GetInstructionSetFeatures().HasAVX2()) {
+    CreateFPFPFPToFPCallLocations(allocator_, invoke);
+  }
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathFmaFloat(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasAVX2());
+  GenerateMathFma(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathFmaFloat(HInvoke* invoke) {
+  if (codegen_->GetInstructionSetFeatures().HasAVX2()) {
+    CreateFPFPFPToFPCallLocations(allocator_, invoke);
+  }
+}
+
 UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble)
 UNIMPLEMENTED_INTRINSIC(X86, FloatIsInfinite)
 UNIMPLEMENTED_INTRINSIC(X86, DoubleIsInfinite)
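
A note on operand order: the "213" form computes dst = src2 * dst + src3,
and GenerateMathFma above passes (in0, in1, in2) as (dst, src2, src3), with
the output constrained to in0 by SameAsFirstInput. A standalone sketch of
that dataflow (plain Java; the class and method shape are illustrative, and
unlike the hardware instruction this model rounds the multiply and add
separately):

    // Illustrative only: operand routing of vfmadd213sd as emitted above.
    class Fma213Order {
      // "213" order: result = op2 * op1 + op3 (single-rounded in hardware;
      // this plain-Java model is only about which operand goes where).
      static double vfmadd213(double dst, double src2, double src3) {
        return src2 * dst + src3;
      }
      public static void main(String[] args) {
        double a = 3.0, b = 4.0, c = 5.0;
        // GenerateMathFma places a in in0/out, b in in1, c in in2:
        System.out.println(vfmadd213(a, b, c));  // 17.0 == Math.fma(a, b, c)
      }
    }
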
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 4d617f7..c536492 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -551,6 +551,16 @@
   CodeGeneratorX86_64::BlockNonVolatileXmmRegisters(locations);
 }
 
+static void CreateFPFPFPToFPCallLocations(ArenaAllocator* allocator, HInvoke* invoke) {
+  DCHECK_EQ(invoke->GetNumberOfArguments(), 3U);
+  LocationSummary* locations =
+      new (allocator) LocationSummary(invoke, LocationSummary::kNoCall, kIntrinsified);
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetInAt(1, Location::RequiresFpuRegister());
+  locations->SetInAt(2, Location::RequiresFpuRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+}
+
 void IntrinsicLocationsBuilderX86_64::VisitMathAtan2(HInvoke* invoke) {
   CreateFPFPToFPCallLocations(allocator_, invoke);
 }
@@ -3262,6 +3273,44 @@
   bool is_atomic_;
 };
 
+static void GenerateMathFma(HInvoke* invoke, CodeGeneratorX86_64* codegen) {
+  DCHECK(DataType::IsFloatingPointType(invoke->GetType()));
+  X86_64Assembler* assembler = codegen->GetAssembler();
+  LocationSummary* locations = invoke->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister left = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  XmmRegister right = locations->InAt(1).AsFpuRegister<XmmRegister>();
+  XmmRegister accumulator = locations->InAt(2).AsFpuRegister<XmmRegister>();
+  if (invoke->GetType() == DataType::Type::kFloat32) {
+    __ vfmadd213ss(left, right, accumulator);
+  } else {
+    DCHECK_EQ(invoke->GetType(), DataType::Type::kFloat64);
+    __ vfmadd213sd(left, right, accumulator);
+  }
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathFmaDouble(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasAVX2());
+  GenerateMathFma(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathFmaDouble(HInvoke* invoke) {
+  if (codegen_->GetInstructionSetFeatures().HasAVX2()) {
+    CreateFPFPFPToFPCallLocations(allocator_, invoke);
+  }
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathFmaFloat(HInvoke* invoke) {
+  DCHECK(codegen_->GetInstructionSetFeatures().HasAVX2());
+  GenerateMathFma(invoke, codegen_);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathFmaFloat(HInvoke* invoke) {
+  if (codegen_->GetInstructionSetFeatures().HasAVX2()) {
+    CreateFPFPFPToFPCallLocations(allocator_, invoke);
+  }
+}
+
 // Generate subtype check without read barriers.
 static void GenerateSubTypeObjectCheckNoReadBarrier(CodeGeneratorX86_64* codegen,
                                                     VarHandleSlowPathX86_64* slow_path,
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index b6708de..4b64e92 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -783,6 +783,43 @@
   EmitXmmRegisterOperand(dst, src2);
 }
 
+void X86Assembler::vfmadd213ss(XmmRegister acc, XmmRegister left, XmmRegister right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ false);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(left);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+                                 /*X=*/ false,
+                                 /*B=*/ false,
+                                 SET_VEX_M_0F_38);
+  ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(ByteTwo);
+  EmitUint8(0xA9);
+  EmitXmmRegisterOperand(acc, right);
+}
+
+void X86Assembler::vfmadd213sd(XmmRegister acc, XmmRegister left, XmmRegister right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ false);
+  X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(left);
+  ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+                                 /*X=*/ false,
+                                 /*B=*/ false,
+                                 SET_VEX_M_0F_38);
+  ByteTwo = EmitVexPrefixByteTwo(/*W=*/ true, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(ByteTwo);
+  EmitUint8(0xA9);
+  EmitXmmRegisterOperand(acc, right);
+}
 
 void X86Assembler::movapd(XmmRegister dst, XmmRegister src) {
   if (CpuHasAVXorAVX2FeatureFlag()) {
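
For reference, these helpers emit the encodings VEX.66.0F38.W0 A9 /r
(vfmadd213ss) and VEX.66.0F38.W1 A9 /r (vfmadd213sd). A standalone sketch
(not ART code) that reproduces the byte sequence for
"vfmadd213ss %xmm3, %xmm2, %xmm1" (dst=xmm1, vvvv=xmm2, src=xmm3):

    // Standalone sketch of the 3-byte VEX prefix built above.
    public class VexSketch {
      public static void main(String[] args) {
        int dst = 1, vvvv = 2, src = 3;      // xmm1, xmm2, xmm3
        int byteZero = 0xC4;                 // three-byte VEX escape
        // R/X/B are stored inverted (all clear here); mmmmm=0b00010 -> 0F 38 map.
        int byteOne = 0b111_00000 | 0b00010;
        // W=0 for the ss form, ~vvvv in bits 6..3, L=0 (128-bit), pp=01 (0x66).
        int byteTwo = ((~vvvv & 0xF) << 3) | 0b01;
        int opcode = 0xA9;                   // FMA213 scalar opcode
        int modrm = 0b11_000_000 | (dst << 3) | src;
        System.out.printf("%02X %02X %02X %02X %02X%n",
                          byteZero, byteOne, byteTwo, opcode, modrm);
        // Prints: C4 E2 69 A9 CB
      }
    }
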
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index f6e7fbc..dc2427d 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -492,6 +492,9 @@
   void vsubpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
   void vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
 
+  void vfmadd213ss(XmmRegister acc, XmmRegister left, XmmRegister right);
+  void vfmadd213sd(XmmRegister acc, XmmRegister left, XmmRegister right);
+
   void movapd(XmmRegister dst, XmmRegister src);     // move
   void movapd(XmmRegister dst, const Address& src);  // load aligned
   void movupd(XmmRegister dst, const Address& src);  // load unaligned
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 92ce788..5db2b65 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -1073,6 +1073,18 @@
       RepeatFFF(&x86::X86Assembler::vpmaddwd, "vpmaddwd %{reg3}, %{reg2}, %{reg1}"), "vpmaddwd");
 }
 
+TEST_F(AssemblerX86AVXTest, VFMadd213SS) {
+  DriverStr(
+      RepeatFFF(&x86::X86Assembler::vfmadd213ss,
+                "vfmadd213ss %{reg3}, %{reg2}, %{reg1}"), "vfmadd213ss");
+}
+
+TEST_F(AssemblerX86AVXTest, VFMadd213SD) {
+  DriverStr(
+      RepeatFFF(&x86::X86Assembler::vfmadd213sd,
+                "vfmadd213sd %{reg3}, %{reg2}, %{reg1}"), "vfmadd213sd");
+}
+
 TEST_F(AssemblerX86Test, PHAddW) {
   DriverStr(RepeatFF(&x86::X86Assembler::phaddw, "phaddw %{reg2}, %{reg1}"), "phaddw");
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 0f90ec8..5edc309 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -984,6 +984,44 @@
   EmitXmmRegisterOperand(dst.LowBits(), src2);
 }
 
+void X86_64Assembler::vfmadd213ss(XmmRegister acc, XmmRegister left, XmmRegister right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ false);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(left.AsFloatRegister());
+  ByteOne = EmitVexPrefixByteOne(acc.NeedsRex(),
+                                 /*X=*/ false,
+                                 right.NeedsRex(),
+                                 SET_VEX_M_0F_38);
+  ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(ByteTwo);
+  EmitUint8(0xA9);
+  EmitXmmRegisterOperand(acc.LowBits(), right);
+}
+
+void X86_64Assembler::vfmadd213sd(XmmRegister acc, XmmRegister left, XmmRegister right) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+  ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ false);
+  X86_64ManagedRegister vvvv_reg =
+      X86_64ManagedRegister::FromXmmRegister(left.AsFloatRegister());
+  ByteOne = EmitVexPrefixByteOne(acc.NeedsRex(),
+                                 /*X=*/ false,
+                                 right.NeedsRex(),
+                                 SET_VEX_M_0F_38);
+  ByteTwo = EmitVexPrefixByteTwo(/*W=*/ true, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+  EmitUint8(ByteZero);
+  EmitUint8(ByteOne);
+  EmitUint8(ByteTwo);
+  EmitUint8(0xA9);
+  EmitXmmRegisterOperand(acc.LowBits(), right);
+}
+
 void X86_64Assembler::flds(const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xD9);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 2eb018d..2cbc9d9 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -528,6 +528,9 @@
   void vsubpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
   void vaddpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
 
+  void vfmadd213ss(XmmRegister accumulator, XmmRegister left, XmmRegister right);
+  void vfmadd213sd(XmmRegister accumulator, XmmRegister left, XmmRegister right);
+
   void movapd(XmmRegister dst, XmmRegister src);     // move
   void movapd(XmmRegister dst, const Address& src);  // load aligned
   void movupd(XmmRegister dst, const Address& src);  // load unaligned
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 38fc568..4c07354 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1866,6 +1866,16 @@
                       "vpmaddwd %{reg3}, %{reg2}, %{reg1}"), "vpmaddwd");
 }
 
+TEST_F(AssemblerX86_64AVXTest, VFmadd213ss) {
+  DriverStr(RepeatFFF(&x86_64::X86_64Assembler::vfmadd213ss,
+                      "vfmadd213ss %{reg3}, %{reg2}, %{reg1}"), "vfmadd213ss");
+}
+
+TEST_F(AssemblerX86_64AVXTest, VFmadd213sd) {
+  DriverStr(RepeatFFF(&x86_64::X86_64Assembler::vfmadd213sd,
+                      "vfmadd213sd %{reg3}, %{reg2}, %{reg1}"), "vfmadd213sd");
+}
+
 TEST_F(AssemblerX86_64Test, Phaddw) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::phaddw, "phaddw %{reg2}, %{reg1}"), "phaddw");
 }
diff --git a/runtime/image.cc b/runtime/image.cc
index 7e258d1..e62380a 100644
--- a/runtime/image.cc
+++ b/runtime/image.cc
@@ -29,8 +29,8 @@
 namespace art {
 
 const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
-// Last change: added multiple kJdkUnsafe intrinsics
-const uint8_t ImageHeader::kImageVersion[] = { '1', '0', '4', '\0' };
+// Last change: added Math.fma (double and float) intrinsics.
+const uint8_t ImageHeader::kImageVersion[] = { '1', '0', '5', '\0' };
 
 ImageHeader::ImageHeader(uint32_t image_reservation_size,
                          uint32_t component_count,
diff --git a/runtime/interpreter/interpreter_intrinsics.cc b/runtime/interpreter/interpreter_intrinsics.cc
index 2e18d5f..9c6c614 100644
--- a/runtime/interpreter/interpreter_intrinsics.cc
+++ b/runtime/interpreter/interpreter_intrinsics.cc
@@ -468,6 +468,8 @@
     INTRINSIC_CASE(MathAbsFloat)
     INTRINSIC_CASE(MathAbsLong)
     INTRINSIC_CASE(MathAbsInt)
+    UNIMPLEMENTED_CASE(MathFmaDouble /* (DDD)D */)
+    UNIMPLEMENTED_CASE(MathFmaFloat /* (FFF)F */)
     UNIMPLEMENTED_CASE(MathMinDoubleDouble /* (DD)D */)
     UNIMPLEMENTED_CASE(MathMinFloatFloat /* (FF)F */)
     INTRINSIC_CASE(MathMinLongLong)
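
UNIMPLEMENTED_CASE here means the switch interpreter does not special-case
these methods and simply performs the regular invoke of libcore's Math.fma,
whose general path computes a*b + c exactly and rounds once. A simplified
sketch of that semantics (not the libcore source; NaN, infinities and
signed zeros need separate handling, which is omitted here):

    import java.math.BigDecimal;

    // Simplified model of fma semantics: exact a*b + c, rounded once.
    // BigDecimal cannot represent NaN or infinity, so real implementations
    // filter special values (and fix up zero signs) before this path.
    class FmaRef {
      static double fma(double a, double b, double c) {
        return new BigDecimal(a).multiply(new BigDecimal(b))
                                .add(new BigDecimal(c))
                                .doubleValue();
      }
    }
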
diff --git a/runtime/intrinsics_list.h b/runtime/intrinsics_list.h
index 67cb124..568daff 100644
--- a/runtime/intrinsics_list.h
+++ b/runtime/intrinsics_list.h
@@ -124,6 +124,8 @@
   V(MathAbsFloat, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "abs", "(F)F") \
   V(MathAbsLong, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "abs", "(J)J") \
   V(MathAbsInt, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "abs", "(I)I") \
+  V(MathFmaDouble, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "fma", "(DDD)D") \
+  V(MathFmaFloat, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "fma", "(FFF)F") \
   V(MathMinDoubleDouble, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "min", "(DD)D") \
   V(MathMinFloatFloat, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "min", "(FF)F") \
   V(MathMinLongLong, kStatic, kNeedsEnvironment, kNoSideEffects, kNoThrow, "Ljava/lang/Math;", "min", "(JJ)J") \
diff --git a/test/082-inline-execute/src/Main.java b/test/082-inline-execute/src/Main.java
index f4a234b..97ecf47 100644
--- a/test/082-inline-execute/src/Main.java
+++ b/test/082-inline-execute/src/Main.java
@@ -45,6 +45,8 @@
     test_Math_isInfinite_D();
     test_Math_isInfinite_F();
     test_Math_multiplyHigh();
+    test_Math_fma_D();
+    test_Math_fma_F();
     test_Short_reverseBytes();
     test_Integer_reverseBytes();
     test_Long_reverseBytes();
@@ -985,6 +987,24 @@
     Assert.assertEquals(Math.multiplyHigh(Long.MAX_VALUE, Long.MAX_VALUE), 4611686018427387903L);
   }
 
+  public static void test_Math_fma_D() {
+    final double MAX_D = Double.MAX_VALUE;
+    final double MIN_D = Double.MIN_VALUE;
+    Math.fma(3.0, 4.0, 5.0);
+    Assert.assertEquals(Double.compare(Math.fma(3.0, 4.0, 5.0), 17.0), 0);
+    Assert.assertEquals(Double.compare(Math.fma(MAX_D, MIN_D, 1.0), 1.0000000000000009), 0);
+    Assert.assertEquals(Double.compare(Math.fma(MAX_D, MIN_D, 0.0), 8.881784197001251E-16), 0);
+  }
+
+  public static void test_Math_fma_F() {
+    final float MAX_F = Float.MAX_VALUE;
+    final float MIN_F = Float.MIN_VALUE;
+    Math.fma(3.0f, 4.0f, 5.0f);
+    Assert.assertEquals(Float.compare(Math.fma(3.0f, 4.0f, 5.0f), 17.0f), 0);
+    Assert.assertEquals(Float.compare(Math.fma(MAX_F, MIN_F, 1.0f), 1.0000005f), 0);
+    Assert.assertEquals(Float.compare(Math.fma(MAX_F, MIN_F, 0.0f), 4.7683712978141557E-7f), 0);
+  }
+
   public static void test_StrictMath_abs_I() {
     StrictMath.abs(-1);
     Assert.assertEquals(StrictMath.abs(0), 0);
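
The expected constants above fall out of fma's single rounding: MAX_D * MIN_D
is exactly (2 - 2^-52) * 2^-51, which prints as 8.881784197001251E-16, and
adding 1.0 inside the fused operation gives 1.0000000000000009 after the one
rounding step. A short demo of the observable difference from a plain
multiply-add:

    public class FmaDemo {
      public static void main(String[] args) {
        double third = 1.0 / 3.0;  // inexact: rounds to a nearby double
        // Separate rounding: 3.0 * third rounds to exactly 1.0.
        System.out.println(3.0 * third - 1.0);           // 0.0
        // Fused: the residual -2^-54 of the exact product survives.
        System.out.println(Math.fma(3.0, third, -1.0));  // -5.551115123125783E-17
      }
    }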