Support Math.fma Intrinsic for x86 and x86_64

The intrinsic implementation is 2287x faster than the baseline:
Time for base: 188.25s
Time for patch: 0.0822s

Test: ./run-test --host --64 --optimizing 082-inline-execute
Test: ./run-test --host --64 --optimizing --instruction-set-features sse4.1 082-inline-execute
Test: ./run-test --host --64 --optimizing --instruction-set-features sse4.1,avx2 082-inline-execute
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
Change-Id: I68b96a35f41f3bb23d7e26d87fee1dd2a5ebf6a6
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 92ce788..5db2b65 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -1073,6 +1073,18 @@
       RepeatFFF(&x86::X86Assembler::vpmaddwd, "vpmaddwd %{reg3}, %{reg2}, %{reg1}"), "vpmaddwd");
 }
 
+TEST_F(AssemblerX86AVXTest, VFMadd213SS) {
+  // Expected assembly text names the three registers in reverse order: reg3, reg2, reg1.
+  constexpr char kFormat[] = "vfmadd213ss %{reg3}, %{reg2}, %{reg1}";
+  DriverStr(RepeatFFF(&x86::X86Assembler::vfmadd213ss, kFormat), "vfmadd213ss");
+}
+
+TEST_F(AssemblerX86AVXTest, VFMadd213SD) {
+  // Expected assembly text names the three registers in reverse order: reg3, reg2, reg1.
+  constexpr char kFormat[] = "vfmadd213sd %{reg3}, %{reg2}, %{reg1}";
+  DriverStr(RepeatFFF(&x86::X86Assembler::vfmadd213sd, kFormat), "vfmadd213sd");
+}
+
 TEST_F(AssemblerX86Test, PHAddW) {
   DriverStr(RepeatFF(&x86::X86Assembler::phaddw, "phaddw %{reg2}, %{reg1}"), "phaddw");
 }