Diffstat (limited to 'compiler')
-rw-r--r--  compiler/optimizing/code_generator_vector_x86.cc     | 31
-rw-r--r--  compiler/optimizing/code_generator_vector_x86_64.cc  | 31
-rw-r--r--  compiler/optimizing/loop_optimization.cc             | 12
-rw-r--r--  compiler/utils/x86/assembler_x86.cc                  | 14
-rw-r--r--  compiler/utils/x86/assembler_x86.h                   |  1
-rw-r--r--  compiler/utils/x86/assembler_x86_test.cc             |  5
-rw-r--r--  compiler/utils/x86_64/assembler_x86_64.cc            | 33
-rw-r--r--  compiler/utils/x86_64/assembler_x86_64.h             |  1
-rw-r--r--  compiler/utils/x86_64/assembler_x86_64_test.cc       |  5
9 files changed, 123 insertions, 10 deletions
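For orientation (illustration only, not part of the patch): this change teaches the x86 and x86-64 backends to generate code for the HVecDotProd node, i.e. a 32-bit reduction fed by products of signed 16-bit elements, using pmaddwd (SSE2) or its AVX form vpmaddwd, and lifts the corresponding loop-optimizer restriction for kInt16. A scalar sketch of the idiom being vectorized, with hypothetical names:

#include <cstddef>
#include <cstdint>

// Scalar shape of the dot-product idiom: 16-bit inputs, 32-bit accumulator.
int32_t DotProductInt16(const int16_t* a, const int16_t* b, size_t n) {
  int32_t sum = 0;
  for (size_t i = 0; i < n; ++i) {
    sum += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
  }
  return sum;
}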
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 68aef779f2..1390af2435 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1201,11 +1201,38 @@ void InstructionCodeGeneratorX86::VisitVecSADAccumulate(HVecSADAccumulate* instr
}
void LocationsBuilderX86::VisitVecDotProd(HVecDotProd* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+ locations->AddTemp(Location::RequiresFpuRegister());
}
void InstructionCodeGeneratorX86::VisitVecDotProd(HVecDotProd* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister acc = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister left = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister right = locations->InAt(2).AsFpuRegister<XmmRegister>();
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kInt32: {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+ if (!cpu_has_avx) {
+ __ movaps(tmp, right);
+ __ pmaddwd(tmp, left);
+ __ paddd(acc, tmp);
+ } else {
+ __ vpmaddwd(tmp, left, right);
+ __ vpaddd(acc, acc, tmp);
+ }
+ break;
+ }
+ default:
+ LOG(FATAL) << "Unsupported SIMD Type" << instruction->GetPackedType();
+ UNREACHABLE();
+ }
}
// Helper to set up locations for vector memory operations.
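A note on the instruction sequence above (not part of the diff): pmaddwd multiplies the eight pairs of adjacent signed 16-bit lanes of its operands and sums each pair into one of four 32-bit lanes; the following paddd folds that result into the accumulator. A scalar sketch of one 128-bit step, ignoring the wrap-around corner cases of the real instructions:

#include <cstdint>

// Models: tmp = pmaddwd(left, right); acc = paddd(acc, tmp).
void DotProdStep128(int32_t acc[4], const int16_t left[8], const int16_t right[8]) {
  for (int lane = 0; lane < 4; ++lane) {
    int32_t pair = static_cast<int32_t>(left[2 * lane]) * right[2 * lane] +
                   static_cast<int32_t>(left[2 * lane + 1]) * right[2 * lane + 1];
    acc[lane] += pair;  // paddd: lane-wise 32-bit add
  }
}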
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 19dfd1d2a8..7fac44dea8 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1174,11 +1174,38 @@ void InstructionCodeGeneratorX86_64::VisitVecSADAccumulate(HVecSADAccumulate* in
}
void LocationsBuilderX86_64::VisitVecDotProd(HVecDotProd* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+ locations->AddTemp(Location::RequiresFpuRegister());
}
void InstructionCodeGeneratorX86_64::VisitVecDotProd(HVecDotProd* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ bool cpu_has_avx = CpuHasAvxFeatureFlag();
+ LocationSummary* locations = instruction->GetLocations();
+ XmmRegister acc = locations->InAt(0).AsFpuRegister<XmmRegister>();
+ XmmRegister left = locations->InAt(1).AsFpuRegister<XmmRegister>();
+ XmmRegister right = locations->InAt(2).AsFpuRegister<XmmRegister>();
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kInt32: {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ XmmRegister tmp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+ if (!cpu_has_avx) {
+ __ movaps(tmp, right);
+ __ pmaddwd(tmp, left);
+ __ paddd(acc, tmp);
+ } else {
+ __ vpmaddwd(tmp, left, right);
+ __ vpaddd(acc, acc, tmp);
+ }
+ break;
+ }
+ default:
+ LOG(FATAL) << "Unsupported SIMD Type" << instruction->GetPackedType();
+ UNREACHABLE();
+ }
}
// Helper to set up locations for vector memory operations.
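The x86-64 code path is identical to the x86 one above. For reference only (not part of the diff), the non-AVX sequence expressed with SSE2 intrinsics; under an AVX target a compiler would emit the VEX-encoded vpmaddwd/vpaddd forms for the same intrinsics:

#include <emmintrin.h>  // SSE2

// Equivalent of: movaps tmp, right; pmaddwd tmp, left; paddd acc, tmp.
__m128i DotProdAccumulate(__m128i acc, __m128i left, __m128i right) {
  __m128i products = _mm_madd_epi16(left, right);  // pmaddwd
  return _mm_add_epi32(acc, products);             // paddd
}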
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 9c4e9d25f7..567a41e2fd 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -1623,14 +1623,20 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
kNoDotProd;
return TrySetVectorLength(16);
case DataType::Type::kUint16:
- case DataType::Type::kInt16:
*restrictions |= kNoDiv |
kNoAbs |
kNoSignedHAdd |
kNoUnroundedHAdd |
- kNoSAD|
+ kNoSAD |
kNoDotProd;
return TrySetVectorLength(8);
+ case DataType::Type::kInt16:
+ *restrictions |= kNoDiv |
+ kNoAbs |
+ kNoSignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD;
+ return TrySetVectorLength(8);
case DataType::Type::kInt32:
*restrictions |= kNoDiv | kNoSAD;
return TrySetVectorLength(4);
@@ -2166,7 +2172,7 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
bool generate_code,
DataType::Type reduction_type,
uint64_t restrictions) {
- if (!instruction->IsAdd() || (reduction_type != DataType::Type::kInt32)) {
+ if (!instruction->IsAdd() || reduction_type != DataType::Type::kInt32) {
return false;
}
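A note on the restriction split above (not part of the diff): only kInt16 drops kNoDotProd; kUint16 keeps it because pmaddwd multiplies each 16-bit lane as a signed value, so zero-extended data above 0x7FFF would change meaning. Minimal illustration:

#include <cstdint>

// pmaddwd reads lanes as signed int16, so an unsigned 16-bit value in the
// upper half of the range is reinterpreted as negative.
int32_t AsPmaddwdSeesIt(uint16_t lane) {
  return static_cast<int32_t>(static_cast<int16_t>(lane));  // 65535 -> -1
}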
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 166aec81a7..55f7691514 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -2268,6 +2268,20 @@ void X86Assembler::pmaddwd(XmmRegister dst, XmmRegister src) {
}
+void X86Assembler::vpmaddwd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t ByteZero = 0x00, ByteOne = 0x00;
+ ByteZero = EmitVexPrefixByteZero(/* is_twobyte_form=*/ true);
+ X86ManagedRegister vvvv_reg = X86ManagedRegister::FromXmmRegister(src1);
+ ByteOne = EmitVexPrefixByteOne(/*R=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ EmitUint8(0xF5);
+ EmitXmmRegisterOperand(dst, src2);
+}
+
+
void X86Assembler::phaddw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
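Encoding note (illustration only, not part of the diff): in 32-bit mode no XMM register needs the extension bits, so the emitter above can always use the two-byte VEX form. A hypothetical re-derivation of the bytes it produces:

#include <cstdint>
#include <vector>

// Sketch of the two-byte VEX encoding of vpmaddwd dst, src1, src2 (all xmm0-xmm7).
std::vector<uint8_t> EncodeVpmaddwdVex2(uint8_t dst, uint8_t src1, uint8_t src2) {
  uint8_t byte_one = static_cast<uint8_t>(
      (1u << 7) |               // ~R: destination stays in xmm0-xmm7 on x86
      ((~src1 & 0xFu) << 3) |   // ~vvvv: first source operand
      (0u << 2) |               // L = 0: 128-bit vector length
      0x1u);                    // pp = 01: implied 0x66 prefix
  uint8_t modrm = static_cast<uint8_t>(0xC0 | (dst << 3) | src2);
  return {0xC5, byte_one, 0xF5, modrm};  // VEX2 escape, VEX payload, opcode, ModRM
}
// EncodeVpmaddwdVex2(1, 2, 3) yields C5 E9 F5 CB, i.e. vpmaddwd %xmm3, %xmm2, %xmm1.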
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 1b6941c2e6..27fde26c80 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -577,6 +577,7 @@ class X86Assembler final : public Assembler {
void pavgw(XmmRegister dst, XmmRegister src);
void psadbw(XmmRegister dst, XmmRegister src);
void pmaddwd(XmmRegister dst, XmmRegister src);
+ void vpmaddwd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void phaddw(XmmRegister dst, XmmRegister src);
void phaddd(XmmRegister dst, XmmRegister src);
void haddps(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 12d9646ace..9253730797 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -965,6 +965,11 @@ TEST_F(AssemblerX86Test, PMAddWD) {
DriverStr(RepeatFF(&x86::X86Assembler::pmaddwd, "pmaddwd %{reg2}, %{reg1}"), "pmaddwd");
}
+TEST_F(AssemblerX86AVXTest, VPMAddWD) {
+ DriverStr(
+ RepeatFFF(&x86::X86Assembler::vpmaddwd, "vpmaddwd %{reg3}, %{reg2}, %{reg1}"), "vpmaddwd");
+}
+
TEST_F(AssemblerX86Test, PHAddW) {
DriverStr(RepeatFF(&x86::X86Assembler::phaddw, "phaddw %{reg2}, %{reg1}"), "phaddw");
}
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 64246aae0e..2c5dd9e949 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -71,6 +71,7 @@ bool X86_64Assembler::CpuHasAVXorAVX2FeatureFlag() {
return false;
}
+
void X86_64Assembler::call(CpuRegister reg) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitOptionalRex32(reg);
@@ -758,7 +759,6 @@ void X86_64Assembler::movd(CpuRegister dst, XmmRegister src, bool is64bit) {
EmitOperand(src.LowBits(), Operand(dst));
}
-
void X86_64Assembler::addss(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0xF3);
@@ -768,7 +768,6 @@ void X86_64Assembler::addss(XmmRegister dst, XmmRegister src) {
EmitXmmRegisterOperand(dst.LowBits(), src);
}
-
void X86_64Assembler::addss(XmmRegister dst, const Address& src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0xF3);
@@ -2633,7 +2632,6 @@ void X86_64Assembler::xorps(XmmRegister dst, XmmRegister src) {
EmitXmmRegisterOperand(dst.LowBits(), src);
}
-
void X86_64Assembler::pxor(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
@@ -3145,6 +3143,35 @@ void X86_64Assembler::pmaddwd(XmmRegister dst, XmmRegister src) {
EmitXmmRegisterOperand(dst.LowBits(), src);
}
+void X86_64Assembler::vpmaddwd(XmmRegister dst, XmmRegister src1, XmmRegister src2) {
+ DCHECK(CpuHasAVXorAVX2FeatureFlag());
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ bool is_twobyte_form = false;
+ uint8_t ByteZero = 0x00, ByteOne = 0x00, ByteTwo = 0x00;
+ if (!src2.NeedsRex()) {
+ is_twobyte_form = true;
+ }
+ ByteZero = EmitVexPrefixByteZero(is_twobyte_form);
+ X86_64ManagedRegister vvvv_reg =
+ X86_64ManagedRegister::FromXmmRegister(src1.AsFloatRegister());
+ if (is_twobyte_form) {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(), vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ } else {
+ ByteOne = EmitVexPrefixByteOne(dst.NeedsRex(),
+ /*X=*/ false,
+ src2.NeedsRex(),
+ SET_VEX_M_0F);
+ ByteTwo = EmitVexPrefixByteTwo(/*W=*/ false, vvvv_reg, SET_VEX_L_128, SET_VEX_PP_66);
+ }
+ EmitUint8(ByteZero);
+ EmitUint8(ByteOne);
+ if (!is_twobyte_form) {
+ EmitUint8(ByteTwo);
+ }
+ EmitUint8(0xF5);
+ EmitXmmRegisterOperand(dst.LowBits(), src2);
+}
+
void X86_64Assembler::phaddw(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitUint8(0x66);
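A note on the form selection above (not part of the diff): the two-byte VEX prefix has no B bit to extend ModRM.rm, so the three-byte form is needed exactly when the second source is xmm8-xmm15; the destination's R bit and the four-bit vvvv field for the first source are available in both forms. For example, vpmaddwd %xmm8, %xmm2, %xmm1 encodes as C4 C1 69 F5 C8, while vpmaddwd %xmm3, %xmm2, %xmm1 keeps the short form C5 E9 F5 CB.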
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 15f3ab9f25..70072d9224 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -615,6 +615,7 @@ class X86_64Assembler final : public Assembler {
void pavgw(XmmRegister dst, XmmRegister src);
void psadbw(XmmRegister dst, XmmRegister src);
void pmaddwd(XmmRegister dst, XmmRegister src);
+ void vpmaddwd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
void phaddw(XmmRegister dst, XmmRegister src);
void phaddd(XmmRegister dst, XmmRegister src);
void haddps(XmmRegister dst, XmmRegister src);
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index e3b8390468..3921c4afb0 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1740,6 +1740,11 @@ TEST_F(AssemblerX86_64Test, Pmaddwd) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::pmaddwd, "pmaddwd %{reg2}, %{reg1}"), "pmadwd");
}
+TEST_F(AssemblerX86_64AVXTest, VPmaddwd) {
+ DriverStr(RepeatFFF(&x86_64::X86_64Assembler::vpmaddwd,
+ "vpmaddwd %{reg3}, %{reg2}, %{reg1}"), "vpmaddwd");
+}
+
TEST_F(AssemblerX86_64Test, Phaddw) {
DriverStr(RepeatFF(&x86_64::X86_64Assembler::phaddw, "phaddw %{reg2}, %{reg1}"), "phaddw");
}