Add AVX support for packed mul/div instructions.
This is a follow-up to the patch below:
https://android-review.googlesource.com/c/platform/build/+/830841
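
With AVX available, HVecMul and HVecDiv (like HVecAdd and HVecSub from
the earlier patch) now emit the three-operand VEX forms and fall back
to the two-operand SSE forms otherwise. This makes the dedicated
HVecAvxAdd/HVecAvxSub nodes redundant, so they are removed along with
nodes_vector_x86.h.

For reference, a worked encoding of the two-byte VEX form emitted by
the new X86Assembler::vmulps (the register numbers are illustrative):

    vmulps xmm1, xmm2, xmm3  =>  C5 E8 59 CB
      C5: two-byte VEX prefix
      E8: 1|1101|0|00: ~R=1 (dst below xmm8), vvvv=~2 (src1=xmm2), L=0 (128-bit), pp=00 (none)
      59: MULPS opcode (the 0F map is implied by the two-byte prefix)
      CB: ModRM 11|001|011: reg = dst (xmm1), rm = src2 (xmm3)
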
Test: ./test.py --host --64, test-art-host-gtest
Change-Id: Id2aa473035556ee230e66addeb69707df8530e75
Signed-off-by: Shalini Salomi Bodapati <shalini.salomi.bodapati@intel.com>
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index c8964dd..29a1354 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -431,48 +431,6 @@
}
}
-void LocationsBuilderX86::VisitVecAdd(HVecAdd* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86::VisitVecAdd(HVecAdd* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- DCHECK(locations->InAt(0).Equals(locations->Out()));
- XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
- XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ paddb(dst, src);
- break;
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ paddw(dst, src);
- break;
- case DataType::Type::kInt32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ paddd(dst, src);
- break;
- case DataType::Type::kInt64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ paddq(dst, src);
- break;
- case DataType::Type::kFloat32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ addps(dst, src);
- break;
- case DataType::Type::kFloat64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ addpd(dst, src);
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
-}
-
static void CreateVecTerOpLocations(ArenaAllocator* allocator, HVecOperation* instruction) {
LocationSummary* locations = new (allocator) LocationSummary(instruction);
switch (instruction->GetPackedType()) {
@@ -495,44 +453,50 @@
}
}
-void LocationsBuilderX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+void LocationsBuilderX86::VisitVecAdd(HVecAdd* instruction) {
+ if (CpuHasAvxFeatureFlag()) {
CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+ } else {
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ }
}
-void InstructionCodeGeneratorX86::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+void InstructionCodeGeneratorX86::VisitVecAdd(HVecAdd* instruction) {
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
LocationSummary* locations = instruction->GetLocations();
- XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
- XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ DCHECK(cpu_has_avx || other_src == dst);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ vpaddb(dst, src1, src2);
+ cpu_has_avx ? __ vpaddb(dst, other_src, src) : __ paddb(dst, src);
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ vpaddw(dst, src1, src2);
+ cpu_has_avx ? __ vpaddw(dst, other_src, src) : __ paddw(dst, src);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ vpaddd(dst, src1, src2);
+ cpu_has_avx ? __ vpaddd(dst, other_src, src) : __ paddd(dst, src);
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ vpaddq(dst, src1, src2);
+ cpu_has_avx ? __ vpaddq(dst, other_src, src) : __ paddq(dst, src);
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ vaddps(dst, src1, src2);
+ cpu_has_avx ? __ vaddps(dst, other_src, src) : __ addps(dst, src);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ vaddpd(dst, src1, src2);
+ cpu_has_avx ? __ vaddpd(dst, other_src, src) : __ addpd(dst, src);
break;
default:
- LOG(FATAL) << "Unsupported SIMD type";
+ LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
UNREACHABLE();
}
}
@@ -597,85 +561,49 @@
}
void LocationsBuilderX86::VisitVecSub(HVecSub* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86::VisitVecSub(HVecSub* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- DCHECK(locations->InAt(0).Equals(locations->Out()));
- XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
- XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ psubb(dst, src);
- break;
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ psubw(dst, src);
- break;
- case DataType::Type::kInt32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ psubd(dst, src);
- break;
- case DataType::Type::kInt64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ psubq(dst, src);
- break;
- case DataType::Type::kFloat32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ subps(dst, src);
- break;
- case DataType::Type::kFloat64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ subpd(dst, src);
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
+ if (CpuHasAvxFeatureFlag()) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+ } else {
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
}
-void LocationsBuilderX86::VisitVecAvxSub(HVecAvxSub* instruction) {
- CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86::VisitVecAvxSub(HVecAvxSub* instruction) {
+void InstructionCodeGeneratorX86::VisitVecSub(HVecSub* instruction) {
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
LocationSummary* locations = instruction->GetLocations();
- XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
- XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ DCHECK(cpu_has_avx || other_src == dst);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ vpsubb(dst, src1, src2);
+ cpu_has_avx ? __ vpsubb(dst, other_src, src) : __ psubb(dst, src);
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ vpsubw(dst, src1, src2);
+ cpu_has_avx ? __ vpsubw(dst, other_src, src) : __ psubw(dst, src);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ vpsubd(dst, src1, src2);
+ cpu_has_avx ? __ vpsubd(dst, other_src, src) : __ psubd(dst, src);
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ vpsubq(dst, src1, src2);
+ cpu_has_avx ? __ vpsubq(dst, other_src, src) : __ psubq(dst, src);
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ vsubps(dst, src1, src2);
+ cpu_has_avx ? __ vsubps(dst, other_src, src) : __ subps(dst, src);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ vsubpd(dst, src1, src2);
+ cpu_has_avx ? __ vsubpd(dst, other_src, src) : __ subpd(dst, src);
break;
default:
- LOG(FATAL) << "Unsupported SIMD type";
+ LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
UNREACHABLE();
}
}
@@ -713,31 +641,37 @@
}
void LocationsBuilderX86::VisitVecMul(HVecMul* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ if (CpuHasAvxFeatureFlag()) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+ } else {
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ }
}
void InstructionCodeGeneratorX86::VisitVecMul(HVecMul* instruction) {
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
LocationSummary* locations = instruction->GetLocations();
- DCHECK(locations->InAt(0).Equals(locations->Out()));
XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ DCHECK(cpu_has_avx || other_src == dst);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ pmullw(dst, src);
+ cpu_has_avx ? __ vpmullw(dst, other_src, src) : __ pmullw(dst, src);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ pmulld(dst, src);
+ cpu_has_avx ? __ vpmulld(dst, other_src, src) : __ pmulld(dst, src);
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ mulps(dst, src);
+ cpu_has_avx ? __ vmulps(dst, other_src, src) : __ mulps(dst, src);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ mulpd(dst, src);
+ cpu_has_avx ? __ vmulpd(dst, other_src, src) : __ mulpd(dst, src);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -746,22 +680,28 @@
}
void LocationsBuilderX86::VisitVecDiv(HVecDiv* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ if (CpuHasAvxFeatureFlag()) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+ } else {
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ }
}
void InstructionCodeGeneratorX86::VisitVecDiv(HVecDiv* instruction) {
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
LocationSummary* locations = instruction->GetLocations();
- DCHECK(locations->InAt(0).Equals(locations->Out()));
XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ DCHECK(cpu_has_avx || other_src == dst);
switch (instruction->GetPackedType()) {
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ divps(dst, src);
+ cpu_has_avx ? __ vdivps(dst, other_src, src) : __ divps(dst, src);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ divpd(dst, src);
+ cpu_has_avx ? __ vdivpd(dst, other_src, src) : __ divpd(dst, src);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index c147659..f28268b 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -437,85 +437,49 @@
}
void LocationsBuilderX86_64::VisitVecAdd(HVecAdd* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecAdd(HVecAdd* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- DCHECK(locations->InAt(0).Equals(locations->Out()));
- XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
- XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ paddb(dst, src);
- break;
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ paddw(dst, src);
- break;
- case DataType::Type::kInt32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ paddd(dst, src);
- break;
- case DataType::Type::kInt64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ paddq(dst, src);
- break;
- case DataType::Type::kFloat32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ addps(dst, src);
- break;
- case DataType::Type::kFloat64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ addpd(dst, src);
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
+ if (CpuHasAvxFeatureFlag()) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+ } else {
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
}
-void LocationsBuilderX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
- CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecAvxAdd(HVecAvxAdd* instruction) {
+void InstructionCodeGeneratorX86_64::VisitVecAdd(HVecAdd* instruction) {
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
LocationSummary* locations = instruction->GetLocations();
- XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
- XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ DCHECK(cpu_has_avx || other_src == dst);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ vpaddb(dst, src1, src2);
+ cpu_has_avx ? __ vpaddb(dst, other_src, src) : __ paddb(dst, src);
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ vpaddw(dst, src1, src2);
+ cpu_has_avx ? __ vpaddw(dst, other_src, src) : __ paddw(dst, src);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ vpaddd(dst, src1, src2);
+ cpu_has_avx ? __ vpaddd(dst, other_src, src) : __ paddd(dst, src);
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ vpaddq(dst, src1, src2);
+ cpu_has_avx ? __ vpaddq(dst, other_src, src) : __ paddq(dst, src);
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ vaddps(dst, src1, src2);
+ cpu_has_avx ? __ vaddps(dst, other_src, src) : __ addps(dst, src);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ vaddpd(dst, src1, src2);
+ cpu_has_avx ? __ vaddpd(dst, other_src, src) : __ addpd(dst, src);
break;
default:
- LOG(FATAL) << "Unsupported SIMD type";
+ LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
UNREACHABLE();
}
}
@@ -580,85 +544,49 @@
}
void LocationsBuilderX86_64::VisitVecSub(HVecSub* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecSub(HVecSub* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- DCHECK(locations->InAt(0).Equals(locations->Out()));
- XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
- XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ psubb(dst, src);
- break;
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ psubw(dst, src);
- break;
- case DataType::Type::kInt32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ psubd(dst, src);
- break;
- case DataType::Type::kInt64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ psubq(dst, src);
- break;
- case DataType::Type::kFloat32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ subps(dst, src);
- break;
- case DataType::Type::kFloat64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ subpd(dst, src);
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
+ if (CpuHasAvxFeatureFlag()) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+ } else {
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
}
}
-void LocationsBuilderX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
- CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
-}
-
-void InstructionCodeGeneratorX86_64::VisitVecAvxSub(HVecAvxSub* instruction) {
+void InstructionCodeGeneratorX86_64::VisitVecSub(HVecSub* instruction) {
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
LocationSummary* locations = instruction->GetLocations();
- XmmRegister src1 = locations->InAt(0).AsFpuRegister<XmmRegister>();
- XmmRegister src2 = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ DCHECK(cpu_has_avx || other_src == dst);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ vpsubb(dst, src1, src2);
+ cpu_has_avx ? __ vpsubb(dst, other_src, src) : __ psubb(dst, src);
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ vpsubw(dst, src1, src2);
+ cpu_has_avx ? __ vpsubw(dst, other_src, src) : __ psubw(dst, src);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ vpsubd(dst, src1, src2);
+ cpu_has_avx ? __ vpsubd(dst, other_src, src) : __ psubd(dst, src);
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ vpsubq(dst, src1, src2);
+ cpu_has_avx ? __ vpsubq(dst, other_src, src) : __ psubq(dst, src);
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ vsubps(dst, src1, src2);
+ cpu_has_avx ? __ vsubps(dst, other_src, src) : __ subps(dst, src);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ vsubpd(dst, src1, src2);
+ cpu_has_avx ? __ vsubpd(dst, other_src, src) : __ subpd(dst, src);
break;
default:
- LOG(FATAL) << "Unsupported SIMD type";
+ LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
UNREACHABLE();
}
}
@@ -696,31 +624,37 @@
}
void LocationsBuilderX86_64::VisitVecMul(HVecMul* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ if (CpuHasAvxFeatureFlag()) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+ } else {
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ }
}
void InstructionCodeGeneratorX86_64::VisitVecMul(HVecMul* instruction) {
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
LocationSummary* locations = instruction->GetLocations();
- DCHECK(locations->InAt(0).Equals(locations->Out()));
XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ DCHECK(cpu_has_avx || other_src == dst);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ pmullw(dst, src);
+ cpu_has_avx ? __ vpmullw(dst, other_src, src) : __ pmullw(dst, src);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ pmulld(dst, src);
+ cpu_has_avx ? __ vpmulld(dst, other_src, src) : __ pmulld(dst, src);
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ mulps(dst, src);
+ cpu_has_avx ? __ vmulps(dst, other_src, src) : __ mulps(dst, src);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ mulpd(dst, src);
+ cpu_has_avx ? __ vmulpd(dst, other_src, src) : __ mulpd(dst, src);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -729,22 +663,28 @@
}
void LocationsBuilderX86_64::VisitVecDiv(HVecDiv* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ if (CpuHasAvxFeatureFlag()) {
+ CreateVecTerOpLocations(GetGraph()->GetAllocator(), instruction);
+ } else {
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ }
}
void InstructionCodeGeneratorX86_64::VisitVecDiv(HVecDiv* instruction) {
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
LocationSummary* locations = instruction->GetLocations();
- DCHECK(locations->InAt(0).Equals(locations->Out()));
XmmRegister src = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister other_src = locations->InAt(0).AsFpuRegister<XmmRegister>();
XmmRegister dst = locations->Out().AsFpuRegister<XmmRegister>();
+ DCHECK(cpu_has_avx || other_src == dst);
switch (instruction->GetPackedType()) {
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ divps(dst, src);
+ cpu_has_avx ? __ vdivps(dst, other_src, src) : __ divps(dst, src);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ divpd(dst, src);
+ cpu_has_avx ? __ vdivpd(dst, other_src, src) : __ divpd(dst, src);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index d8a54e5..7f7e3a5 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -8425,6 +8425,19 @@
LOG(FATAL) << "Unreachable";
}
+bool LocationsBuilderX86::CpuHasAvxFeatureFlag() {
+ return codegen_->GetInstructionSetFeatures().HasAVX();
+}
+
+bool LocationsBuilderX86::CpuHasAvx2FeatureFlag() {
+ return codegen_->GetInstructionSetFeatures().HasAVX2();
+}
+
+bool InstructionCodeGeneratorX86::CpuHasAvxFeatureFlag() {
+ return codegen_->GetInstructionSetFeatures().HasAVX();
+}
+
+bool InstructionCodeGeneratorX86::CpuHasAvx2FeatureFlag() {
+ return codegen_->GetInstructionSetFeatures().HasAVX2();
+}
+
#undef __
} // namespace x86
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index bbca764..368c584 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -175,6 +175,8 @@
void HandleShift(HBinaryOperation* instruction);
void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
+ bool CpuHasAvxFeatureFlag();
+ bool CpuHasAvx2FeatureFlag();
CodeGeneratorX86* const codegen_;
InvokeDexCallingConventionVisitorX86 parameter_visitor_;
@@ -307,6 +309,8 @@
HBasicBlock* default_block);
void GenerateFPCompare(Location lhs, Location rhs, HInstruction* insn, bool is_double);
+ bool CpuHasAvxFeatureFlag();
+ bool CpuHasAvx2FeatureFlag();
X86Assembler* const assembler_;
CodeGeneratorX86* const codegen_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index b3d76a3..8067b9c 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -7667,6 +7667,22 @@
}
}
+bool LocationsBuilderX86_64::CpuHasAvxFeatureFlag() {
+ return codegen_->GetInstructionSetFeatures().HasAVX();
+}
+
+bool LocationsBuilderX86_64::CpuHasAvx2FeatureFlag() {
+ return codegen_->GetInstructionSetFeatures().HasAVX2();
+}
+
+bool InstructionCodeGeneratorX86_64::CpuHasAvxFeatureFlag() {
+ return codegen_->GetInstructionSetFeatures().HasAVX();
+}
+
+bool InstructionCodeGeneratorX86_64::CpuHasAvx2FeatureFlag() {
+ return codegen_->GetInstructionSetFeatures().HasAVX2();
+}
+
#undef __
} // namespace x86_64
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index a25c29f..d3b49ea 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -177,6 +177,8 @@
void HandleShift(HBinaryOperation* operation);
void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
void HandleFieldGet(HInstruction* instruction);
+ bool CpuHasAvxFeatureFlag();
+ bool CpuHasAvx2FeatureFlag();
CodeGeneratorX86_64* const codegen_;
InvokeDexCallingConventionVisitorX86_64 parameter_visitor_;
@@ -287,6 +289,9 @@
void HandleGoto(HInstruction* got, HBasicBlock* successor);
+ bool CpuHasAvxFeatureFlag();
+ bool CpuHasAvx2FeatureFlag();
+
X86_64Assembler* const assembler_;
CodeGeneratorX86_64* const codegen_;
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index c6e7560..9914127 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -353,9 +353,6 @@
static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
if (reduction->IsVecAdd() ||
reduction->IsVecSub() ||
- #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
- reduction->IsVecAvxSub() || reduction->IsVecAvxAdd() ||
- #endif
reduction->IsVecSADAccumulate() ||
reduction->IsVecDotProd()) {
return HVecReduce::kSum;
@@ -1943,34 +1940,10 @@
new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_, dex_pc),
new (global_allocator_) HTypeConversion(org_type, opa, dex_pc));
case HInstruction::kAdd:
- #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
- if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
- compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
- compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
- ->HasAVX2()) {
- GENERATE_VEC(
- new (global_allocator_) HVecAvxAdd(
- global_allocator_, opa, opb, type, vector_length_, dex_pc),
- new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
- UNREACHABLE(); // GENERATE_VEC ends with a "break".
- }
- #endif
GENERATE_VEC(
new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_, dex_pc),
new (global_allocator_) HAdd(org_type, opa, opb, dex_pc));
case HInstruction::kSub:
- #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
- if ((compiler_options_->GetInstructionSet() == InstructionSet::kX86 ||
- compiler_options_->GetInstructionSet() == InstructionSet::kX86_64) &&
- compiler_options_->GetInstructionSetFeatures()->AsX86InstructionSetFeatures()
- ->HasAVX2()) {
- GENERATE_VEC(
- new (global_allocator_) HVecAvxSub(
- global_allocator_, opa, opb, type, vector_length_, dex_pc),
- new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
- UNREACHABLE(); // GENERATE_VEC ends with a "break".
- }
- #endif
GENERATE_VEC(
new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_, dex_pc),
new (global_allocator_) HSub(org_type, opa, opb, dex_pc));
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 3e6e211..25f9e3c 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1540,9 +1540,7 @@
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
#define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M) \
M(X86AndNot, Instruction) \
- M(X86MaskOrResetLeastSetBit, Instruction) \
- M(VecAvxSub, VecOperation) \
- M(VecAvxAdd, VecOperation)
+ M(X86MaskOrResetLeastSetBit, Instruction)
#else
#define FOR_EACH_CONCRETE_INSTRUCTION_X86_COMMON(M)
#endif
@@ -7874,7 +7872,6 @@
#endif
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
#include "nodes_x86.h"
-#include "nodes_vector_x86.h"
#endif
namespace art {
diff --git a/compiler/optimizing/nodes_vector_x86.h b/compiler/optimizing/nodes_vector_x86.h
deleted file mode 100644
index a8f576f..0000000
--- a/compiler/optimizing/nodes_vector_x86.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2018 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
-#define ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
-
-#include "nodes_vector.h"
-
-namespace art {
-
-class HVecAvxAdd final : public HVecOperation {
- public:
- HVecAvxAdd(ArenaAllocator* allocator,
- HInstruction* src1,
- HInstruction* src2,
- DataType::Type packed_type,
- size_t vector_length,
- uint32_t dex_pc)
- : HVecOperation(kVecAvxAdd,
- allocator,
- packed_type,
- SideEffects::None(),
- /* number_of_inputs */ 2,
- vector_length,
- dex_pc) {
- DCHECK(HasConsistentPackedTypes(src1, packed_type));
- DCHECK(HasConsistentPackedTypes(src2, packed_type));
- SetRawInputAt(0, src1);
- SetRawInputAt(1, src2);
- }
-
- bool CanBeMoved() const override { return true; }
-
- DECLARE_INSTRUCTION(VecAvxAdd);
-
- protected:
- DEFAULT_COPY_CONSTRUCTOR(VecAvxAdd);
-};
-
-class HVecAvxSub final : public HVecOperation {
- public:
- HVecAvxSub(ArenaAllocator* allocator,
- HInstruction* src1,
- HInstruction* src2,
- DataType::Type packed_type,
- size_t vector_length,
- uint32_t dex_pc)
- : HVecOperation(kVecAvxSub,
- allocator,
- packed_type,
- SideEffects::None(),
- /* number_of_inputs */ 2,
- vector_length,
- dex_pc) {
- DCHECK(HasConsistentPackedTypes(src1, packed_type));
- DCHECK(HasConsistentPackedTypes(src2, packed_type));
- SetRawInputAt(0, src1);
- SetRawInputAt(1, src2);
- }
-
- bool CanBeMoved() const override { return true; }
-
- DECLARE_INSTRUCTION(VecAvxSub);
-
- protected:
- DEFAULT_COPY_CONSTRUCTOR(VecAvxSub);
-};
-
-} // namespace art
-
-#endif // ART_COMPILER_OPTIMIZING_NODES_VECTOR_X86_H_
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 3eaf93a..84a8564 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -745,6 +745,20 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vmulps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
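+ // All four new packed-FP ops (opcodes 0x59/0x5E) live in the 0F map, so the
+ // compact two-byte VEX prefix (0xC5) suffices on 32-bit x86, where no
+ // REX-class bits are needed.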
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ X86ManagedRegister::FromXmmRegister(src1),
+ SET_VEX_L_128,
+ SET_VEX_PP_NONE);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0x59);
+ EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::divps(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -754,6 +768,22 @@
}
+void X86Assembler::vdivps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ X86ManagedRegister::FromXmmRegister(src1),
+ SET_VEX_L_128,
+ SET_VEX_PP_NONE);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0x5E);
+ EmitXmmRegisterOperand(dst, src2);
+}
+
void X86Assembler::movapd(XmmRegister dst, XmmRegister src) {
if (CpuHasAVXorAVX2FeatureFlag()) {
vmovapd(dst, src);
@@ -1113,6 +1143,20 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vmulpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ X86ManagedRegister::FromXmmRegister(src1),
+ SET_VEX_L_128,
+ SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0x59);
+ EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::divpd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1122,6 +1166,20 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vdivpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ X86ManagedRegister::FromXmmRegister(src1),
+ SET_VEX_L_128,
+ SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0x5E);
+ EmitXmmRegisterOperand(dst, src2);
+}
void X86Assembler::movdqa(XmmRegister dst, XmmRegister src) {
if (CpuHasAVXorAVX2FeatureFlag()) {
@@ -1425,6 +1483,40 @@
EmitXmmRegisterOperand(dst, src);
}
+void X86Assembler::vpmulld(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
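+ // pmulld sits in the 0F 38 opcode map, which is only reachable through the
+ // three-byte VEX prefix (0xC4).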
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ false);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ /*X=*/ false,
+ /*B=*/ false,
+ SET_VEX_M_0F_38);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false,
+ X86ManagedRegister::FromXmmRegister(src1),
+ SET_VEX_L_128,
+ SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(ByteTwo);
+ EmitUint8(0x40);
+ EmitRegisterOperand(dst, src2);
+}
+
+void X86Assembler::vpmullw(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ true);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false,
+ X86ManagedRegister::FromXmmRegister(src1),
+ SET_VEX_L_128,
+ SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xD5);
+ EmitRegisterOperand(dst, src2);
+}
void X86Assembler::paddq(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 17039f0..dce546e 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -417,6 +417,11 @@
void mulps(XmmRegister dst, XmmRegister src);
void divps(XmmRegister dst, XmmRegister src);
+ void vmulps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vmulpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vdivps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vdivpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
void vsubps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
void vsubpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
@@ -476,6 +481,7 @@
void paddw(XmmRegister dst, XmmRegister src);
void psubw(XmmRegister dst, XmmRegister src);
void pmullw(XmmRegister dst, XmmRegister src);
+ void vpmullw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
@@ -485,6 +491,8 @@
void psubd(XmmRegister dst, XmmRegister src);
void pmulld(XmmRegister dst, XmmRegister src);
+ void vpmulld(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void vpaddd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void paddq(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 42ee383..bce0346 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -677,7 +677,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::addps, "addps %{reg2}, %{reg1}"), "addps");
}
-TEST_F(AssemblerX86AVXTest, VAddps) {
+TEST_F(AssemblerX86AVXTest, VAddPS) {
DriverStr(RepeatFFF(&x86::X86Assembler::vaddps, "vaddps %{reg3}, %{reg2}, %{reg1}"), "vaddps");
}
@@ -693,41 +693,55 @@
DriverStr(RepeatFF(&x86::X86Assembler::subps, "subps %{reg2}, %{reg1}"), "subps");
}
-TEST_F(AssemblerX86AVXTest, VSubps) {
+TEST_F(AssemblerX86AVXTest, VSubPS) {
DriverStr(RepeatFFF(&x86::X86Assembler::vsubps, "vsubps %{reg3},%{reg2}, %{reg1}"), "vsubps");
}
-
TEST_F(AssemblerX86Test, SubPD) {
DriverStr(RepeatFF(&x86::X86Assembler::subpd, "subpd %{reg2}, %{reg1}"), "subpd");
}
-TEST_F(AssemblerX86AVXTest, VSubpd) {
+TEST_F(AssemblerX86AVXTest, VSubPD) {
DriverStr(RepeatFFF(&x86::X86Assembler::vsubpd, "vsubpd %{reg3}, %{reg2}, %{reg1}"), "vsubpd");
}
-
TEST_F(AssemblerX86Test, MulPS) {
DriverStr(RepeatFF(&x86::X86Assembler::mulps, "mulps %{reg2}, %{reg1}"), "mulps");
}
+TEST_F(AssemblerX86AVXTest, VMulPS) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vmulps, "vmulps %{reg3}, %{reg2}, %{reg1}"), "vmulps");
+}
+
TEST_F(AssemblerX86Test, MulPD) {
DriverStr(RepeatFF(&x86::X86Assembler::mulpd, "mulpd %{reg2}, %{reg1}"), "mulpd");
}
+TEST_F(AssemblerX86AVXTest, VMulPD) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vmulpd, "vmulpd %{reg3}, %{reg2}, %{reg1}"), "vmulpd");
+}
+
TEST_F(AssemblerX86Test, DivPS) {
DriverStr(RepeatFF(&x86::X86Assembler::divps, "divps %{reg2}, %{reg1}"), "divps");
}
+TEST_F(AssemblerX86AVXTest, VDivPS) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vdivps, "vdivps %{reg3}, %{reg2}, %{reg1}"), "vdivps");
+}
+
TEST_F(AssemblerX86Test, DivPD) {
DriverStr(RepeatFF(&x86::X86Assembler::divpd, "divpd %{reg2}, %{reg1}"), "divpd");
}
+TEST_F(AssemblerX86AVXTest, VDivPD) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vdivpd, "vdivpd %{reg3}, %{reg2}, %{reg1}"), "vdivpd");
+}
+
TEST_F(AssemblerX86Test, PAddB) {
DriverStr(RepeatFF(&x86::X86Assembler::paddb, "paddb %{reg2}, %{reg1}"), "paddb");
}
-TEST_F(AssemblerX86AVXTest, VPaddb) {
+TEST_F(AssemblerX86AVXTest, VPaddB) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpaddb, "vpaddb %{reg3}, %{reg2}, %{reg1}"), "vpaddb");
}
@@ -735,7 +749,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::psubb, "psubb %{reg2}, %{reg1}"), "psubb");
}
-TEST_F(AssemblerX86AVXTest, VPsubb) {
+TEST_F(AssemblerX86AVXTest, VPsubB) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpsubb, "vpsubb %{reg3},%{reg2}, %{reg1}"), "vpsubb");
}
@@ -743,7 +757,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::paddw, "paddw %{reg2}, %{reg1}"), "paddw");
}
-TEST_F(AssemblerX86AVXTest, VPaddw) {
+TEST_F(AssemblerX86AVXTest, VPaddW) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpaddw, "vpaddw %{reg3}, %{reg2}, %{reg1}"), "vpaddw");
}
@@ -751,7 +765,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::psubw, "psubw %{reg2}, %{reg1}"), "psubw");
}
-TEST_F(AssemblerX86AVXTest, VPsubw) {
+TEST_F(AssemblerX86AVXTest, VPsubW) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpsubw, "vpsubw %{reg3}, %{reg2}, %{reg1}"), "vpsubw");
}
@@ -759,11 +773,15 @@
DriverStr(RepeatFF(&x86::X86Assembler::pmullw, "pmullw %{reg2}, %{reg1}"), "pmullw");
}
+TEST_F(AssemblerX86AVXTest, VPMullW) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpmullw, "vpmullw %{reg3}, %{reg2}, %{reg1}"), "vpmullw");
+}
+
TEST_F(AssemblerX86Test, PAddD) {
DriverStr(RepeatFF(&x86::X86Assembler::paddd, "paddd %{reg2}, %{reg1}"), "paddd");
}
-TEST_F(AssemblerX86AVXTest, VPaddd) {
+TEST_F(AssemblerX86AVXTest, VPaddD) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpaddd, "vpaddd %{reg3}, %{reg2}, %{reg1}"), "vpaddd");
}
@@ -771,7 +789,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::psubd, "psubd %{reg2}, %{reg1}"), "psubd");
}
-TEST_F(AssemblerX86AVXTest, VPsubd) {
+TEST_F(AssemblerX86AVXTest, VPsubD) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpsubd, "vpsubd %{reg3}, %{reg2}, %{reg1}"), "vpsubd");
}
@@ -779,11 +797,15 @@
DriverStr(RepeatFF(&x86::X86Assembler::pmulld, "pmulld %{reg2}, %{reg1}"), "pmulld");
}
+TEST_F(AssemblerX86AVXTest, VPMullD) {
+ DriverStr(RepeatFFF(&x86::X86Assembler::vpmulld, "vpmulld %{reg3}, %{reg2}, %{reg1}"), "vpmulld");
+}
+
TEST_F(AssemblerX86Test, PAddQ) {
DriverStr(RepeatFF(&x86::X86Assembler::paddq, "paddq %{reg2}, %{reg1}"), "paddq");
}
-TEST_F(AssemblerX86AVXTest, VPaddq) {
+TEST_F(AssemblerX86AVXTest, VPaddQ) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpaddq, "vpaddq %{reg3}, %{reg2}, %{reg1}"), "vpaddq");
}
@@ -791,7 +813,7 @@
DriverStr(RepeatFF(&x86::X86Assembler::psubq, "psubq %{reg2}, %{reg1}"), "psubq");
}
-TEST_F(AssemblerX86AVXTest, VPsubq) {
+TEST_F(AssemblerX86AVXTest, VPsubQ) {
DriverStr(RepeatFFF(&x86::X86Assembler::vpsubq, "vpsubq %{reg3}, %{reg2}, %{reg1}"), "vpsubq");
}
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 72b7ae0..be8fe59 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -919,6 +919,34 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vmulps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
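+ // The two-byte VEX prefix cannot encode VEX.B, so it is only usable when the
+ // rm operand (src2) does not need REX.B, i.e. is one of xmm0-xmm7.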
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0x59);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
void X86_64Assembler::divps(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -928,6 +956,34 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vdivps(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_NONE);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0x5E);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
void X86_64Assembler::flds(const Address& src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1423,6 +1479,34 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vmulpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0x59);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
void X86_64Assembler::divpd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1434,6 +1518,36 @@
}
+void X86_64Assembler::vdivpd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0x5E);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
void X86_64Assembler::movdqa(XmmRegister dst, XmmRegister src) {
if (CpuHasAVXorAVX2FeatureFlag()) {
vmovdqa(dst, src);
@@ -1878,6 +1992,34 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vpmullw(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xD5);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
void X86_64Assembler::paddd(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -1937,6 +2079,24 @@
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vpmulld(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
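+ // The 0F 38 opcode map is only selectable via the three-byte VEX prefix.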
+ ByteZero = EmitVexPrefixByteZero(/*is_twobyte_form=*/ false);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F_38);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(ByteTwo);
+ EmitUint8(0x40);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
void X86_64Assembler::paddq(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 8fc69f6..100707a 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -452,6 +452,11 @@
void mulps(XmmRegister dst, XmmRegister src);
void divps(XmmRegister dst, XmmRegister src);
+ void vmulps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vmulpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vdivps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vdivpd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void vaddps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
void vsubps(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
void vsubpd(XmmRegister dst, XmmRegister add_left, XmmRegister add_right);
@@ -508,6 +513,7 @@
void paddw(XmmRegister dst, XmmRegister src);
void psubw(XmmRegister dst, XmmRegister src);
void pmullw(XmmRegister dst, XmmRegister src);
+ void vpmullw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void vpsubb(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void vpsubw(XmmRegister dst, XmmRegister src1, XmmRegister src2);
@@ -516,6 +522,7 @@
void paddd(XmmRegister dst, XmmRegister src);
void psubd(XmmRegister dst, XmmRegister src);
void pmulld(XmmRegister dst, XmmRegister src);
+ void vpmulld(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void vpaddd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 24a6c3c..3d58a6d 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1375,10 +1375,20 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::mulps, "mulps %{reg2}, %{reg1}"), "mulps");
}
+TEST_F(AssemblerX86_64AVXTest, VMulps) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vmulps, "vmulps %{reg3}, %{reg2}, %{reg1}"), "vmulps");
+}
+
TEST_F(AssemblerX86_64Test, Mulpd) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::mulpd, "mulpd %{reg2}, %{reg1}"), "mulpd");
}
+TEST_F(AssemblerX86_64AVXTest, VMulpd) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vmulpd, "vmulpd %{reg3}, %{reg2}, %{reg1}"), "vmulpd");
+}
+
TEST_F(AssemblerX86_64Test, Divss) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::divss, "divss %{reg2}, %{reg1}"), "divss");
}
@@ -1391,10 +1401,20 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::divps, "divps %{reg2}, %{reg1}"), "divps");
}
+TEST_F(AssemblerX86_64AVXTest, VDivps) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vdivps, "vdivps %{reg3}, %{reg2}, %{reg1}"), "vdivps");
+}
+
TEST_F(AssemblerX86_64Test, Divpd) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::divpd, "divpd %{reg2}, %{reg1}"), "divpd");
}
+TEST_F(AssemblerX86_64AVXTest, VDivpd) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vdivpd, "vdivpd %{reg3}, %{reg2}, %{reg1}"), "vdivpd");
+}
+
TEST_F(AssemblerX86_64Test, Paddb) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddb, "paddb %{reg2}, %{reg1}"), "paddb");
}
@@ -1427,7 +1447,6 @@
RepeatFFF(&x86_64::X86_64Assembler::vpaddw, "vpaddw %{reg3}, %{reg2}, %{reg1}"), "vpaddw");
}
-
TEST_F(AssemblerX86_64Test, Psubw) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::psubw, "psubw %{reg2}, %{reg1}"), "psubw");
}
@@ -1436,6 +1455,11 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmullw, "pmullw %{reg2}, %{reg1}"), "pmullw");
}
+TEST_F(AssemblerX86_64AVXTest, VPmullw) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpmullw, "vpmullw %{reg3}, %{reg2}, %{reg1}"), "vpmullw");
+}
+
TEST_F(AssemblerX86_64Test, Paddd) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddd, "paddd %{reg2}, %{reg1}"), "paddd");
}
@@ -1458,6 +1482,11 @@
DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmulld, "pmulld %{reg2}, %{reg1}"), "pmulld");
}
+TEST_F(AssemblerX86_64AVXTest, VPmulld) {
+ DriverStr(
+ RepeatFFF(&x86_64::X86_64Assembler::vpmulld, "vpmulld %{reg3}, %{reg2}, %{reg1}"), "vpmulld");
+}
+
TEST_F(AssemblerX86_64Test, Paddq) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::paddq, "paddq %{reg2}, %{reg1}"), "paddq");
}
diff --git a/compiler/utils/x86_64/constants_x86_64.h b/compiler/utils/x86_64/constants_x86_64.h
index b02e246..5335398 100644
--- a/compiler/utils/x86_64/constants_x86_64.h
+++ b/compiler/utils/x86_64/constants_x86_64.h
@@ -59,6 +59,9 @@
constexpr bool NeedsRex() const {
return reg_ > 7;
}
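+ // Needed by the vector code generators' DCHECK(cpu_has_avx || other_src == dst).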
+ bool operator==(const XmmRegister& other) const {
+ return reg_ == other.reg_;
+ }
private:
const FloatRegister reg_;
};