Introduce a number of SIMD extensions for x86/x86_64 (SSE).

Rationale:
As a first step exploring how useful an ART vectorizer may be,
introducing a number of floating-point SIMD instructions.

Test: assembler_x86[_64]_test
Bug: 34083438
Change-Id: I0285dd9fca51f31875a6bbe728f873c48089940d
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index cd30872..d3b15ac 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -350,6 +350,38 @@
 }
 
 
+void X86Assembler::movaps(XmmRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x28);
+  EmitOperand(dst, src);
+}
+
+
+void X86Assembler::movups(XmmRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x10);
+  EmitOperand(dst, src);
+}
+
+
+void X86Assembler::movaps(const Address& dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x29);
+  EmitOperand(src, dst);
+}
+
+
+void X86Assembler::movups(const Address& dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x11);
+  EmitOperand(src, dst);
+}
+
+
 void X86Assembler::movss(XmmRegister dst, const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
@@ -467,6 +499,83 @@
 }
 
 
+void X86Assembler::addps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x58);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::subps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5C);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::mulps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x59);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::divps(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x0F);
+  EmitUint8(0x5E);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::movapd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x28);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::movapd(XmmRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x28);
+  EmitOperand(dst, src);
+}
+
+
+void X86Assembler::movupd(XmmRegister dst, const Address& src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x10);
+  EmitOperand(dst, src);
+}
+
+
+void X86Assembler::movapd(const Address& dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x29);
+  EmitOperand(src, dst);
+}
+
+
+void X86Assembler::movupd(const Address& dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x11);
+  EmitOperand(src, dst);
+}
+
+
 void X86Assembler::flds(const Address& src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xD9);
@@ -638,6 +747,42 @@
 }
 
 
+void X86Assembler::addpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x58);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::subpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x5C);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::mulpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x59);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
+void X86Assembler::divpd(XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);
+  EmitUint8(0x0F);
+  EmitUint8(0x5E);
+  EmitXmmRegisterOperand(dst, src);
+}
+
+
 void X86Assembler::cvtsi2ss(XmmRegister dst, Register src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF3);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 114986b..a93616c 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -371,7 +371,12 @@
 
   void setb(Condition condition, Register dst);
 
-  void movaps(XmmRegister dst, XmmRegister src);
+  void movaps(XmmRegister dst, XmmRegister src);     // move
+  void movaps(XmmRegister dst, const Address& src);  // load aligned
+  void movups(XmmRegister dst, const Address& src);  // load unaligned
+  void movaps(const Address& dst, XmmRegister src);  // store aligned
+  void movups(const Address& dst, XmmRegister src);  // store unaligned
+
   void movss(XmmRegister dst, const Address& src);
   void movss(const Address& dst, XmmRegister src);
   void movss(XmmRegister dst, XmmRegister src);
@@ -388,6 +393,17 @@
   void divss(XmmRegister dst, XmmRegister src);
   void divss(XmmRegister dst, const Address& src);
 
+  void addps(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
+  void subps(XmmRegister dst, XmmRegister src);
+  void mulps(XmmRegister dst, XmmRegister src);
+  void divps(XmmRegister dst, XmmRegister src);
+
+  void movapd(XmmRegister dst, XmmRegister src);     // move
+  void movapd(XmmRegister dst, const Address& src);  // load aligned
+  void movupd(XmmRegister dst, const Address& src);  // load unaligned
+  void movapd(const Address& dst, XmmRegister src);  // store aligned
+  void movupd(const Address& dst, XmmRegister src);  // store unaligned
+
   void movsd(XmmRegister dst, const Address& src);
   void movsd(const Address& dst, XmmRegister src);
   void movsd(XmmRegister dst, XmmRegister src);
@@ -409,6 +425,11 @@
   void divsd(XmmRegister dst, XmmRegister src);
   void divsd(XmmRegister dst, const Address& src);
 
+  void addpd(XmmRegister dst, XmmRegister src);  // no addr variant (for now)
+  void subpd(XmmRegister dst, XmmRegister src);
+  void mulpd(XmmRegister dst, XmmRegister src);
+  void divpd(XmmRegister dst, XmmRegister src);
+
   void cvtsi2ss(XmmRegister dst, Register src);
   void cvtsi2sd(XmmRegister dst, Register src);
 
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 9bae6c2..4d60a12 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -423,6 +423,98 @@
   DriverStr(expected, "TestlAddressImmediate");
 }
 
+TEST_F(AssemblerX86Test, Movaps) {
+  DriverStr(RepeatFF(&x86::X86Assembler::movaps, "movaps %{reg2}, %{reg1}"), "movaps");
+}
+
+TEST_F(AssemblerX86Test, MovapsAddr) {
+  GetAssembler()->movaps(x86::XmmRegister(x86::XMM0), x86::Address(x86::Register(x86::ESP), 4));
+  GetAssembler()->movaps(x86::Address(x86::Register(x86::ESP), 2), x86::XmmRegister(x86::XMM1));
+  const char* expected =
+    "movaps 0x4(%ESP), %xmm0\n"
+    "movaps %xmm1, 0x2(%ESP)\n";
+  DriverStr(expected, "movaps_address");
+}
+
+TEST_F(AssemblerX86Test, MovupsAddr) {
+  GetAssembler()->movups(x86::XmmRegister(x86::XMM0), x86::Address(x86::Register(x86::ESP), 4));
+  GetAssembler()->movups(x86::Address(x86::Register(x86::ESP), 2), x86::XmmRegister(x86::XMM1));
+  const char* expected =
+    "movups 0x4(%ESP), %xmm0\n"
+    "movups %xmm1, 0x2(%ESP)\n";
+  DriverStr(expected, "movups_address");
+}
+
+TEST_F(AssemblerX86Test, Movapd) {
+  DriverStr(RepeatFF(&x86::X86Assembler::movapd, "movapd %{reg2}, %{reg1}"), "movapd");
+}
+
+TEST_F(AssemblerX86Test, MovapdAddr) {
+  GetAssembler()->movapd(x86::XmmRegister(x86::XMM0), x86::Address(x86::Register(x86::ESP), 4));
+  GetAssembler()->movapd(x86::Address(x86::Register(x86::ESP), 2), x86::XmmRegister(x86::XMM1));
+  const char* expected =
+    "movapd 0x4(%ESP), %xmm0\n"
+    "movapd %xmm1, 0x2(%ESP)\n";
+  DriverStr(expected, "movapd_address");
+}
+
+TEST_F(AssemblerX86Test, MovupdAddr) {
+  GetAssembler()->movupd(x86::XmmRegister(x86::XMM0), x86::Address(x86::Register(x86::ESP), 4));
+  GetAssembler()->movupd(x86::Address(x86::Register(x86::ESP), 2), x86::XmmRegister(x86::XMM1));
+  const char* expected =
+    "movupd 0x4(%ESP), %xmm0\n"
+    "movupd %xmm1, 0x2(%ESP)\n";
+  DriverStr(expected, "movupd_address");
+}
+
+TEST_F(AssemblerX86Test, AddPS) {
+  GetAssembler()->addps(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
+  const char* expected = "addps %xmm1, %xmm0\n";
+  DriverStr(expected, "addps");
+}
+
+TEST_F(AssemblerX86Test, AddPD) {
+  GetAssembler()->addpd(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
+  const char* expected = "addpd %xmm1, %xmm0\n";
+  DriverStr(expected, "addpd");
+}
+
+TEST_F(AssemblerX86Test, SubPS) {
+  GetAssembler()->subps(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
+  const char* expected = "subps %xmm1, %xmm0\n";
+  DriverStr(expected, "subps");
+}
+
+TEST_F(AssemblerX86Test, SubPD) {
+  GetAssembler()->subpd(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
+  const char* expected = "subpd %xmm1, %xmm0\n";
+  DriverStr(expected, "subpd");
+}
+
+TEST_F(AssemblerX86Test, MulPS) {
+  GetAssembler()->mulps(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
+  const char* expected = "mulps %xmm1, %xmm0\n";
+  DriverStr(expected, "mulps");
+}
+
+TEST_F(AssemblerX86Test, MulPD) {
+  GetAssembler()->mulpd(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
+  const char* expected = "mulpd %xmm1, %xmm0\n";
+  DriverStr(expected, "mulpd");
+}
+
+TEST_F(AssemblerX86Test, DivPS) {
+  GetAssembler()->divps(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
+  const char* expected = "divps %xmm1, %xmm0\n";
+  DriverStr(expected, "divps");
+}
+
+TEST_F(AssemblerX86Test, DivPD) {
+  GetAssembler()->divpd(x86::XmmRegister(x86::XMM0), x86::XmmRegister(x86::XMM1));
+  const char* expected = "divpd %xmm1, %xmm0\n";
+  DriverStr(expected, "divpd");
+}
+
 /////////////////
 // Near labels //
 /////////////////