Add AVX support for packed mul/div instructions.
This is a follow-up to the patch below:
https://android-review.googlesource.com/c/platform/build/+/830841
Test: ./test.py --host --64, test-art-host-gtest
Change-Id: Id2aa473035556ee230e66addeb69707df8530e75
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 3eaf93a..84a8564 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -745,6 +745,20 @@
EmitXmmRegisterOperand(dst, src);
}
+// Emits AVX vmulps (packed single-precision multiply) on 128-bit XMM registers.
+// Byte layout: two-byte VEX prefix, opcode 0x59, then the register operand byte for dst/src2.
+void X86Assembler::vmulps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const uint8_t vex_byte_zero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  // src1 travels in the VEX prefix; only dst and src2 go into the operand byte.
+  const uint8_t vex_byte_one =
+      EmitVexPrefixByteOne(/*R=*/ false,
+                           X86ManagedRegister::FromXmmRegister(src1),
+                           SET_VEX_L_128,
+                           SET_VEX_PP_NONE);
+  EmitUint8(vex_byte_zero);
+  EmitUint8(vex_byte_one);
+  EmitUint8(0x59);
+  EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::divps(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -754,6 +768,22 @@
}
+// Emits AVX vdivps (packed single-precision divide) on 128-bit XMM registers.
+// Byte layout: two-byte VEX prefix, opcode 0x5E, then the register operand byte for dst/src2.
+void X86Assembler::vdivps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const uint8_t vex_byte_zero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  // src1 travels in the VEX prefix; only dst and src2 go into the operand byte.
+  const uint8_t vex_byte_one =
+      EmitVexPrefixByteOne(/*R=*/ false,
+                           X86ManagedRegister::FromXmmRegister(src1),
+                           SET_VEX_L_128,
+                           SET_VEX_PP_NONE);
+  EmitUint8(vex_byte_zero);
+  EmitUint8(vex_byte_one);
+  EmitUint8(0x5E);
+  EmitXmmRegisterOperand(dst, src2);
+}
+
+
void X86Assembler::movapd(XmmRegister dst, XmmRegister src) {
if (CpuHasAVXorAVX2FeatureFlag()) {
vmovapd(dst, src);
@@ -1113,6 +1143,20 @@
EmitXmmRegisterOperand(dst, src);
}
+// Emits AVX vmulpd (packed double-precision multiply) on 128-bit XMM registers.
+// Byte layout: two-byte VEX prefix (PP = 66), opcode 0x59, then the register operand byte.
+void X86Assembler::vmulpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const uint8_t vex_byte_zero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  // src1 travels in the VEX prefix; only dst and src2 go into the operand byte.
+  const uint8_t vex_byte_one =
+      EmitVexPrefixByteOne(/*R=*/ false,
+                           X86ManagedRegister::FromXmmRegister(src1),
+                           SET_VEX_L_128,
+                           SET_VEX_PP_66);
+  EmitUint8(vex_byte_zero);
+  EmitUint8(vex_byte_one);
+  EmitUint8(0x59);
+  EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::divpd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1122,6 +1166,20 @@
EmitXmmRegisterOperand(dst, src);
}
+// Emits AVX vdivpd (packed double-precision divide) on 128-bit XMM registers.
+// Byte layout: two-byte VEX prefix (PP = 66), opcode 0x5E, then the register operand byte.
+void X86Assembler::vdivpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const uint8_t vex_byte_zero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  // src1 travels in the VEX prefix; only dst and src2 go into the operand byte.
+  const uint8_t vex_byte_one =
+      EmitVexPrefixByteOne(/*R=*/ false,
+                           X86ManagedRegister::FromXmmRegister(src1),
+                           SET_VEX_L_128,
+                           SET_VEX_PP_66);
+  EmitUint8(vex_byte_zero);
+  EmitUint8(vex_byte_one);
+  EmitUint8(0x5E);
+  EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::movdqa(XmmRegister dst, XmmRegister src) {
if (CpuHasAVXorAVX2FeatureFlag()) {
@@ -1425,6 +1483,40 @@
EmitXmmRegisterOperand(dst, src);
}
+// Emits AVX vpmulld (packed 32-bit integer multiply) on 128-bit XMM registers.
+// Byte layout: three-byte VEX prefix (opcode map 0F 38, PP = 66), opcode 0x40, then the
+// register operand byte for dst/src2.
+void X86Assembler::vpmulld(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const uint8_t vex_byte_zero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ false);
+  const uint8_t vex_byte_one =
+      EmitVexPrefixByteOne(/*R=*/ false, /*X=*/ false, /*B=*/ false, SET_VEX_M_0F_38);
+  // src1 travels in the VEX prefix; only dst and src2 go into the operand byte.
+  const uint8_t vex_byte_two =
+      EmitVexPrefixByteTwo(/*W=*/ false,
+                           X86ManagedRegister::FromXmmRegister(src1),
+                           SET_VEX_L_128,
+                           SET_VEX_PP_66);
+  EmitUint8(vex_byte_zero);
+  EmitUint8(vex_byte_one);
+  EmitUint8(vex_byte_two);
+  EmitUint8(0x40);
+  // Both operands are XMM registers, so use the XMM operand emitter like the other AVX emitters.
+  EmitXmmRegisterOperand(dst, src2);
+}
+
+// Emits AVX vpmullw (packed 16-bit integer multiply) on 128-bit XMM registers.
+// Byte layout: two-byte VEX prefix (PP = 66), opcode 0xD5, then the register operand byte.
+void X86Assembler::vpmullw(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+  DCHECK(CpuHasAVXorAVX2FeatureFlag());
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  const uint8_t vex_byte_zero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+  // src1 travels in the VEX prefix; only dst and src2 go into the operand byte.
+  const uint8_t vex_byte_one =
+      EmitVexPrefixByteOne(/*R=*/ false,
+                           X86ManagedRegister::FromXmmRegister(src1),
+                           SET_VEX_L_128,
+                           SET_VEX_PP_66);
+  EmitUint8(vex_byte_zero);
+  EmitUint8(vex_byte_one);
+  EmitUint8(0xD5);
+  // Both operands are XMM registers, so use the XMM operand emitter like the other AVX emitters.
+  EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::paddq(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 17039f0..dce546e 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -417,6 +417,11 @@
void mulps(XmmRegister dst, XmmRegister src);
void divps(XmmRegister dst, XmmRegister src);
+ void vmulps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vmulpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vdivps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vdivpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
void vsubps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
void vsubpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
@@ -476,6 +481,7 @@
void paddw(XmmRegister dst, XmmRegister src);
void psubw(XmmRegister dst, XmmRegister src);
void pmullw(XmmRegister dst, XmmRegister src);
+ void vpmullw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
@@ -485,6 +491,8 @@
void psubd(XmmRegister dst, XmmRegister src);
void pmulld(XmmRegister dst, XmmRegister src);
+ void vpmulld(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void vpaddd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void paddq(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 42ee383..bce0346 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -677,7 +677,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::addps, "addps %{reg2}, %{reg1}"), "addps");
}
-TEST_F(AssemblerX86AVXTest, VAddps) {
+TEST_F(AssemblerX86AVXTest, VAddPS) {
DriverStr(RepeatFFF(&x86::X86Assembler::vaddps, "vaddps %{reg3}, %{reg2}, %{reg1}"), "vaddps");
}
@@ -693,41 +693,55 @@
DriverStr(RepeatFF(&x86::X86Assembler::subps, "subps %{reg2}, %{reg1}"), "subps");
}
-TEST_F(AssemblerX86AVXTest, VSubps) {
+TEST_F(AssemblerX86AVXTest, VSubPS) {
DriverStr(RepeatFFF(&x86::X86Assembler::vsubps, "vsubps %{reg3},%{reg2}, %{reg1}"), "vsubps");
}
-
TEST_F(AssemblerX86Test, SubPD) {
DriverStr(RepeatFF(&x86::X86Assembler::subpd, "subpd %{reg2}, %{reg1}"), "subpd");
}
-TEST_F(AssemblerX86AVXTest, VSubpd) {
+TEST_F(AssemblerX86AVXTest, VSubPD) {
DriverStr(RepeatFFF(&x86::X86Assembler::vsubpd, "vsubpd %{reg3}, %{reg2}, %{reg1}"), "vsubpd");
}
-
TEST_F(AssemblerX86Test, MulPS) {
DriverStr(RepeatFF(&x86::X86Assembler::mulps, "mulps %{reg2}, %{reg1}"), "mulps");
}
+// Runs X86Assembler::vmulps through RepeatFFF with the operand template
+// "%{reg3}, %{reg2}, %{reg1}" and passes the result to DriverStr as test "vmulps".
+TEST_F(AssemblerX86AVXTest, VMulPS) {
+  const auto expected_asm =
+      RepeatFFF(&x86::X86Assembler::vmulps, "vmulps %{reg3}, %{reg2}, %{reg1}");
+  DriverStr(expected_asm, "vmulps");
+}
+
TEST_F(AssemblerX86Test, MulPD) {
DriverStr(RepeatFF(&x86::X86Assembler::mulpd, "mulpd %{reg2}, %{reg1}"), "mulpd");
}
+// Runs X86Assembler::vmulpd through RepeatFFF with the operand template
+// "%{reg3}, %{reg2}, %{reg1}" and passes the result to DriverStr as test "vmulpd".
+TEST_F(AssemblerX86AVXTest, VMulPD) {
+  const auto expected_asm =
+      RepeatFFF(&x86::X86Assembler::vmulpd, "vmulpd %{reg3}, %{reg2}, %{reg1}");
+  DriverStr(expected_asm, "vmulpd");
+}
+
TEST_F(AssemblerX86Test, DivPS) {
DriverStr(RepeatFF(&x86::X86Assembler::divps, "divps %{reg2}, %{reg1}"), "divps");
}
+// Runs X86Assembler::vdivps through RepeatFFF with the operand template
+// "%{reg3}, %{reg2}, %{reg1}" and passes the result to DriverStr as test "vdivps".
+TEST_F(AssemblerX86AVXTest, VDivPS) {
+  const auto expected_asm =
+      RepeatFFF(&x86::X86Assembler::vdivps, "vdivps %{reg3}, %{reg2}, %{reg1}");
+  DriverStr(expected_asm, "vdivps");
+}
+
TEST_F(AssemblerX86Test, DivPD) {
DriverStr(RepeatFF(&x86::X86Assembler::divpd, "divpd %{reg2}, %{reg1}"), "divpd");
}
+// Runs X86Assembler::vdivpd through RepeatFFF with the operand template
+// "%{reg3}, %{reg2}, %{reg1}" and passes the result to DriverStr as test "vdivpd".
+TEST_F(AssemblerX86AVXTest, VDivPD) {
+  const auto expected_asm =
+      RepeatFFF(&x86::X86Assembler::vdivpd, "vdivpd %{reg3}, %{reg2}, %{reg1}");
+  DriverStr(expected_asm, "vdivpd");
+}
+
TEST_F(AssemblerX86Test, PAddB) {
DriverStr(RepeatFF(&x86::X86Assembler::paddb, "paddb %{reg2}, %{reg1}"), "paddb");
}
-TEST_F(AssemblerX86AVXTest, VPaddb) {
+TEST_F(AssemblerX86AVXTest, VPaddB) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpaddb, "vpaddb %{reg3}, %{reg2}, %{reg1}"), "vpaddb");
}
@@ -735,7 +749,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::psubb, "psubb %{reg2}, %{reg1}"), "psubb");
}
-TEST_F(AssemblerX86AVXTest, VPsubb) {
+TEST_F(AssemblerX86AVXTest, VPsubB) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpsubb, "vpsubb %{reg3},%{reg2}, %{reg1}"), "vpsubb");
}
@@ -743,7 +757,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::paddw, "paddw %{reg2}, %{reg1}"), "paddw");
}
-TEST_F(AssemblerX86AVXTest, VPaddw) {
+TEST_F(AssemblerX86AVXTest, VPaddW) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpaddw, "vpaddw %{reg3}, %{reg2}, %{reg1}"), "vpaddw");
}
@@ -751,7 +765,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::psubw, "psubw %{reg2}, %{reg1}"), "psubw");
}
-TEST_F(AssemblerX86AVXTest, VPsubw) {
+TEST_F(AssemblerX86AVXTest, VPsubW) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpsubw, "vpsubw %{reg3}, %{reg2}, %{reg1}"), "vpsubw");
}
@@ -759,11 +773,15 @@
DriverStr(RepeatFF(&x86::X86Assembler::pmullw, "pmullw %{reg2}, %{reg1}"), "pmullw");
}
+// Runs X86Assembler::vpmullw through RepeatFFF with the operand template
+// "%{reg3}, %{reg2}, %{reg1}" and passes the result to DriverStr as test "vpmullw".
+TEST_F(AssemblerX86AVXTest, VPMullW) {
+  const auto expected_asm =
+      RepeatFFF(&x86::X86Assembler::vpmullw, "vpmullw %{reg3}, %{reg2}, %{reg1}");
+  DriverStr(expected_asm, "vpmullw");
+}
+
TEST_F(AssemblerX86Test, PAddD) {
DriverStr(RepeatFF(&x86::X86Assembler::paddd, "paddd %{reg2}, %{reg1}"), "paddd");
}
-TEST_F(AssemblerX86AVXTest, VPaddd) {
+TEST_F(AssemblerX86AVXTest, VPaddD) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpaddd, "vpaddd %{reg3}, %{reg2}, %{reg1}"), "vpaddd");
}
@@ -771,7 +789,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::psubd, "psubd %{reg2}, %{reg1}"), "psubd");
}
-TEST_F(AssemblerX86AVXTest, VPsubd) {
+TEST_F(AssemblerX86AVXTest, VPsubD) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpsubd, "vpsubd %{reg3}, %{reg2}, %{reg1}"), "vpsubd");
}
@@ -779,11 +797,15 @@
DriverStr(RepeatFF(&x86::X86Assembler::pmulld, "pmulld %{reg2}, %{reg1}"), "pmulld");
}
+// Runs X86Assembler::vpmulld through RepeatFFF with the operand template
+// "%{reg3}, %{reg2}, %{reg1}" and passes the result to DriverStr as test "vpmulld".
+TEST_F(AssemblerX86AVXTest, VPMullD) {
+  const auto expected_asm =
+      RepeatFFF(&x86::X86Assembler::vpmulld, "vpmulld %{reg3}, %{reg2}, %{reg1}");
+  DriverStr(expected_asm, "vpmulld");
+}
+
TEST_F(AssemblerX86Test, PAddQ) {
DriverStr(RepeatFF(&x86::X86Assembler::paddq, "paddq %{reg2}, %{reg1}"), "paddq");
}
-TEST_F(AssemblerX86AVXTest, VPaddq) {
+TEST_F(AssemblerX86AVXTest, VPaddQ) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpaddq, "vpaddq %{reg3}, %{reg2}, %{reg1}"), "vpaddq");
}
@@ -791,7 +813,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
}
-TEST_F(AssemblerX86AVXTest, VPsubq) {
+TEST_F(AssemblerX86AVXTest, VPsubQ) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpsubq, "vpsubq %{reg3}, %{reg2}, %{reg1}"), "vpsubq");
}