Implement Sum-of-Abs-Differences idiom recognition.
Rationale:
Currently implemented just on ARM64 (x86 lacks proper support),
the SAD idiom yields a great speedup on loops
that compute the sum-of-absolute-differences operation.
Also includes some refinements around type conversions.
Speedup ExoPlayerAudio (golem run):
1.3x on ARM64
1.1x on x86
Test: test-art-host test-art-target
Bug: 64091002
Change-Id: Ia2b711d2bc23609a2ed50493dfe6719eedfe0130
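For reference, the recognized idiom is, in Java terms (a minimal sketch; the
array names are illustrative):

  // Sum of absolute differences over two int arrays. The loop optimizer now
  // maps the q += Math.abs(..) reduction onto a single HVecSADAccumulate node.
  static int sad(int[] a, int[] b, int n) {
    int q = 0;
    for (int i = 0; i < n; i++) {
      q += Math.abs(a[i] - b[i]);
    }
    return q;
  }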
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 18a55c8..3f576c8 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -949,20 +949,18 @@
}
}
-void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
- switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
case Primitive::kPrimChar:
case Primitive::kPrimShort:
case Primitive::kPrimInt:
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
- DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ case Primitive::kPrimLong:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
locations->SetOut(Location::SameAsFirstInput());
break;
default:
@@ -971,18 +969,25 @@
}
}
+void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
// Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
// 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
// However, the vector MultiplyAccumulate instruction is not affected.
-void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = instr->GetLocations();
- VRegister acc = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
- VRegister left = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
- VRegister right = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
- switch (instr->GetPackedType()) {
+void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VRegister acc = VRegisterFrom(locations->InAt(0));
+ VRegister left = VRegisterFrom(locations->InAt(1));
+ VRegister right = VRegisterFrom(locations->InAt(2));
+
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
- DCHECK_EQ(16u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ Mla(acc.V16B(), left.V16B(), right.V16B());
} else {
__ Mls(acc.V16B(), left.V16B(), right.V16B());
@@ -990,16 +995,16 @@
break;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- DCHECK_EQ(8u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ Mla(acc.V8H(), left.V8H(), right.V8H());
} else {
__ Mls(acc.V8H(), left.V8H(), right.V8H());
}
break;
case Primitive::kPrimInt:
- DCHECK_EQ(4u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ Mla(acc.V4S(), left.V4S(), right.V4S());
} else {
__ Mls(acc.V4S(), left.V4S(), right.V4S());
@@ -1007,6 +1012,186 @@
break;
default:
LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
+void LocationsBuilderARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+ // Some conversions require temporary registers.
+ LocationSummary* locations = instruction->GetLocations();
+ HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+ HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+ DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+ switch (a->GetPackedType()) {
+ case Primitive::kPrimByte:
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimLong:
+ locations->AddTemp(Location::RequiresFpuRegister());
+ locations->AddTemp(Location::RequiresFpuRegister());
+ FALLTHROUGH_INTENDED;
+ case Primitive::kPrimInt:
+ locations->AddTemp(Location::RequiresFpuRegister());
+ locations->AddTemp(Location::RequiresFpuRegister());
+ break;
+ default:
+ break;
+ }
+ break;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ if (instruction->GetPackedType() == Primitive::kPrimLong) {
+ locations->AddTemp(Location::RequiresFpuRegister());
+ locations->AddTemp(Location::RequiresFpuRegister());
+ }
+ break;
+ case Primitive::kPrimInt:
+ case Primitive::kPrimLong:
+ if (instruction->GetPackedType() == a->GetPackedType()) {
+ locations->AddTemp(Location::RequiresFpuRegister());
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+void InstructionCodeGeneratorARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VRegister acc = VRegisterFrom(locations->InAt(0));
+ VRegister left = VRegisterFrom(locations->InAt(1));
+ VRegister right = VRegisterFrom(locations->InAt(2));
+
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+ // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S).
+ HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+ HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+ DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+ switch (a->GetPackedType()) {
+ case Primitive::kPrimByte:
+ DCHECK_EQ(16u, a->GetVectorLength());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ Sabal(acc.V8H(), left.V8B(), right.V8B());
+ __ Sabal2(acc.V8H(), left.V16B(), right.V16B());
+ break;
+ case Primitive::kPrimInt: {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+ VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+ __ Sxtl(tmp1.V8H(), left.V8B());
+ __ Sxtl(tmp2.V8H(), right.V8B());
+ __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
+ __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
+ __ Sxtl2(tmp1.V8H(), left.V16B());
+ __ Sxtl2(tmp2.V8H(), right.V16B());
+ __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
+ __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
+ break;
+ }
+ case Primitive::kPrimLong: {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+ VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+ VRegister tmp3 = VRegisterFrom(locations->GetTemp(2));
+ VRegister tmp4 = VRegisterFrom(locations->GetTemp(3));
+ __ Sxtl(tmp1.V8H(), left.V8B());
+ __ Sxtl(tmp2.V8H(), right.V8B());
+ __ Sxtl(tmp3.V4S(), tmp1.V4H());
+ __ Sxtl(tmp4.V4S(), tmp2.V4H());
+ __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+ __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+ __ Sxtl2(tmp3.V4S(), tmp1.V8H());
+ __ Sxtl2(tmp4.V4S(), tmp2.V8H());
+ __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+ __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+ __ Sxtl2(tmp1.V8H(), left.V16B());
+ __ Sxtl2(tmp2.V8H(), right.V16B());
+ __ Sxtl(tmp3.V4S(), tmp1.V4H());
+ __ Sxtl(tmp4.V4S(), tmp2.V4H());
+ __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+ __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+ __ Sxtl2(tmp3.V4S(), tmp1.V8H());
+ __ Sxtl2(tmp4.V4S(), tmp2.V8H());
+ __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+ __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+ break;
+ }
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+ break;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ DCHECK_EQ(8u, a->GetVectorLength());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimInt:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ Sabal(acc.V4S(), left.V4H(), right.V4H());
+ __ Sabal2(acc.V4S(), left.V8H(), right.V8H());
+ break;
+ case Primitive::kPrimLong: {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+ VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+ __ Sxtl(tmp1.V4S(), left.V4H());
+ __ Sxtl(tmp2.V4S(), right.V4H());
+ __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
+ __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
+ __ Sxtl2(tmp1.V4S(), left.V8H());
+ __ Sxtl2(tmp2.V4S(), right.V8H());
+ __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
+ __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
+ break;
+ }
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+ break;
+ case Primitive::kPrimInt:
+ DCHECK_EQ(4u, a->GetVectorLength());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimInt: {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+ __ Sub(tmp.V4S(), left.V4S(), right.V4S());
+ __ Abs(tmp.V4S(), tmp.V4S());
+ __ Add(acc.V4S(), acc.V4S(), tmp.V4S());
+ break;
+ }
+ case Primitive::kPrimLong:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ Sabal(acc.V2D(), left.V2S(), right.V2S());
+ __ Sabal2(acc.V2D(), left.V4S(), right.V4S());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+ break;
+ case Primitive::kPrimLong:
+ DCHECK_EQ(2u, a->GetVectorLength());
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimLong: {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+ __ Sub(tmp.V2D(), left.V2D(), right.V2D());
+ __ Abs(tmp.V2D(), tmp.V2D());
+ __ Add(acc.V2D(), acc.V2D(), tmp.V2D());
+ break;
+ }
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
}
}
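In the widening cases above, each SABAL/SABAL2 pair absorbs half of the
sign-extended lanes, so the byte-to-long combination needs two SXTL/SXTL2
widening rounds and four temporaries. A scalar model of what that sequence
computes (hypothetical helper, for illustration only):

  // Widen signed bytes to long before the absolute difference, then
  // accumulate; the vector code performs the same arithmetic lane-wise.
  static long sadByteToLong(byte[] a, byte[] b, long acc) {
    for (int i = 0; i < a.length; i++) {
      acc += Math.abs((long) a[i] - (long) b[i]);
    }
    return acc;
  }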
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 7a11dff..069054c 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -629,12 +629,40 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
-void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LOG(FATAL) << "No SIMD for " << instr->GetId();
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ case Primitive::kPrimInt:
+ case Primitive::kPrimLong:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
}
-void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LOG(FATAL) << "No SIMD for " << instr->GetId();
+void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
// Return whether the vector memory access operation is guaranteed to be word-aligned (ARM word
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index c2fbf7f..0bedafc 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -826,21 +826,18 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
-void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
- switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
case Primitive::kPrimChar:
case Primitive::kPrimShort:
case Primitive::kPrimInt:
case Primitive::kPrimLong:
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
- DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
locations->SetOut(Location::SameAsFirstInput());
break;
default:
@@ -849,18 +846,19 @@
}
}
-void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = instr->GetLocations();
- VectorRegister acc =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
- VectorRegister left =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
- VectorRegister right =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
- switch (instr->GetPackedType()) {
+void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VectorRegister acc = VectorRegisterFrom(locations->InAt(0));
+ VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+ VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
- DCHECK_EQ(16u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvB(acc, left, right);
} else {
__ MsubvB(acc, left, right);
@@ -868,24 +866,24 @@
break;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- DCHECK_EQ(8u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvH(acc, left, right);
} else {
__ MsubvH(acc, left, right);
}
break;
case Primitive::kPrimInt:
- DCHECK_EQ(4u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvW(acc, left, right);
} else {
__ MsubvW(acc, left, right);
}
break;
case Primitive::kPrimLong:
- DCHECK_EQ(2u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvD(acc, left, right);
} else {
__ MsubvD(acc, left, right);
@@ -897,6 +895,15 @@
}
}
+void LocationsBuilderMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ // TODO: implement this, location helper already filled out (shared with MulAcc).
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* arena,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index 9d3a777..db31bdc 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -830,21 +830,18 @@
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
-void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
- switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
case Primitive::kPrimChar:
case Primitive::kPrimShort:
case Primitive::kPrimInt:
case Primitive::kPrimLong:
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
- locations->SetInAt(
- HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
- DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
locations->SetOut(Location::SameAsFirstInput());
break;
default:
@@ -853,18 +850,19 @@
}
}
-void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LocationSummary* locations = instr->GetLocations();
- VectorRegister acc =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
- VectorRegister left =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
- VectorRegister right =
- VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
- switch (instr->GetPackedType()) {
+void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ LocationSummary* locations = instruction->GetLocations();
+ VectorRegister acc = VectorRegisterFrom(locations->InAt(0));
+ VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+ VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+ switch (instruction->GetPackedType()) {
case Primitive::kPrimByte:
- DCHECK_EQ(16u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvB(acc, left, right);
} else {
__ MsubvB(acc, left, right);
@@ -872,24 +870,24 @@
break;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- DCHECK_EQ(8u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvH(acc, left, right);
} else {
__ MsubvH(acc, left, right);
}
break;
case Primitive::kPrimInt:
- DCHECK_EQ(4u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvW(acc, left, right);
} else {
__ MsubvW(acc, left, right);
}
break;
case Primitive::kPrimLong:
- DCHECK_EQ(2u, instr->GetVectorLength());
- if (instr->GetOpKind() == HInstruction::kAdd) {
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ if (instruction->GetOpKind() == HInstruction::kAdd) {
__ MaddvD(acc, left, right);
} else {
__ MsubvD(acc, left, right);
@@ -901,6 +899,15 @@
}
}
+void LocationsBuilderMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ // TODO: implement this, location helper already filled out (shared with MulAcc).
+}
+
// Helper to set up locations for vector memory operations.
static void CreateVecMemLocations(ArenaAllocator* arena,
HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 37190f8..5a012e7 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -51,7 +51,6 @@
: Location::RequiresFpuRegister());
locations->SetOut(is_zero ? Location::RequiresFpuRegister()
: Location::SameAsFirstInput());
-
break;
default:
LOG(FATAL) << "Unsupported SIMD type";
@@ -1033,12 +1032,42 @@
}
}
-void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LOG(FATAL) << "No SIMD for " << instr->GetId();
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ case Primitive::kPrimInt:
+ case Primitive::kPrimLong:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
}
-void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
- LOG(FATAL) << "No SIMD for " << instr->GetId();
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ // TODO: pmaddwd?
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ // TODO: psadbw for unsigned?
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
// Helper to set up locations for vector memory operations.
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index edd0209..3698b7f 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1005,11 +1005,41 @@
}
}
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+ LocationSummary* locations = new (arena) LocationSummary(instruction);
+ switch (instruction->GetPackedType()) {
+ case Primitive::kPrimByte:
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ case Primitive::kPrimInt:
+ case Primitive::kPrimLong:
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetInAt(2, Location::RequiresFpuRegister());
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type";
+ UNREACHABLE();
+ }
+}
+
void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
}
void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+ // TODO: pmaddwd?
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+ // TODO: psadbw for unsigned?
LOG(FATAL) << "No SIMD for " << instruction->GetId();
}
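The psadbw TODO refers to x86's packed SAD instruction, which operates on
unsigned bytes only, while the recognition below is signed. In Java, where
bytes are signed, the unsigned flavor would look like this (sketch; not
matched by the current idiom):

  // The & 0xff masks model the zero extension that psadbw assumes.
  static int sadUnsigned(byte[] a, byte[] b) {
    int q = 0;
    for (int i = 0; i < a.length; i++) {
      q += Math.abs((a[i] & 0xff) - (b[i] & 0xff));
    }
    return q;
  }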
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index baa0453..6f8743b 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -71,10 +71,13 @@
return false;
}
-// Detect a sign extension from the given type. Returns the promoted operand on success.
+// Detect a sign extension in instruction from the given type. The to64 parameter
+// denotes whether the result is long, in which case sign extension from int is also accepted.
+// Returns the promoted operand on success.
static bool IsSignExtensionAndGet(HInstruction* instruction,
Primitive::Type type,
- /*out*/ HInstruction** operand) {
+ /*out*/ HInstruction** operand,
+ bool to64 = false) {
// Accept any already wider constant that would be handled properly by sign
// extension when represented in the *width* of the given narrower data type
// (the fact that char normally zero extends does not matter here).
@@ -82,20 +85,24 @@
if (IsInt64AndGet(instruction, /*out*/ &value)) {
switch (type) {
case Primitive::kPrimByte:
- if (std::numeric_limits<int8_t>::min() <= value &&
- std::numeric_limits<int8_t>::max() >= value) {
+ if (IsInt<8>(value)) {
*operand = instruction;
return true;
}
return false;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- if (std::numeric_limits<int16_t>::min() <= value &&
- std::numeric_limits<int16_t>::max() >= value) {
+ if (IsInt<16>(value)) {
*operand = instruction;
return true;
}
return false;
+ case Primitive::kPrimInt:
+ if (IsInt<32>(value)) {
+ *operand = instruction;
+ return to64;
+ }
+ return false;
default:
return false;
}
@@ -110,40 +117,52 @@
case Primitive::kPrimShort:
*operand = instruction;
return true;
+ case Primitive::kPrimInt:
+ *operand = instruction;
+ return to64;
default:
return false;
}
}
- // TODO: perhaps explicit conversions later too?
- // (this may return something different from instruction)
+ // Explicit type conversion to long.
+ if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+ return IsSignExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+ }
return false;
}
-// Detect a zero extension from the given type. Returns the promoted operand on success.
+// Detect a zero extension in instruction from the given type. The to64 parameter
+// denotes whether the result is long, in which case zero extension from int is also accepted.
+// Returns the promoted operand on success.
static bool IsZeroExtensionAndGet(HInstruction* instruction,
Primitive::Type type,
- /*out*/ HInstruction** operand) {
+ /*out*/ HInstruction** operand,
+ bool to64 = false) {
// Accept any already wider constant that would be handled properly by zero
// extension when represented in the *width* of the given narrower data type
- // (the fact that byte/short normally sign extend does not matter here).
+ // (the fact that byte/short/int normally sign extend does not matter here).
int64_t value = 0;
if (IsInt64AndGet(instruction, /*out*/ &value)) {
switch (type) {
case Primitive::kPrimByte:
- if (std::numeric_limits<uint8_t>::min() <= value &&
- std::numeric_limits<uint8_t>::max() >= value) {
+ if (IsUint<8>(value)) {
*operand = instruction;
return true;
}
return false;
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- if (std::numeric_limits<uint16_t>::min() <= value &&
- std::numeric_limits<uint16_t>::max() >= value) {
+ if (IsUint<16>(value)) {
*operand = instruction;
return true;
}
return false;
+ case Primitive::kPrimInt:
+ if (IsUint<32>(value)) {
+ *operand = instruction;
+ return to64;
+ }
+ return false;
default:
return false;
}
@@ -170,14 +189,21 @@
(IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
switch ((*operand)->GetType()) {
- case Primitive::kPrimByte: return mask == std::numeric_limits<uint8_t>::max();
+ case Primitive::kPrimByte:
+ return mask == std::numeric_limits<uint8_t>::max();
case Primitive::kPrimChar:
- case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
+ case Primitive::kPrimShort:
+ return mask == std::numeric_limits<uint16_t>::max();
+ case Primitive::kPrimInt:
+ return mask == std::numeric_limits<uint32_t>::max() && to64;
default: return false;
}
}
}
- // TODO: perhaps explicit conversions later too?
+ // Explicit type conversion to long.
+ if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+ return IsZeroExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+ }
return false;
}
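The new explicit-conversion clause lets both matchers look through a cast to
long, so the int operands below are still recognized as sign extensions
(illustrative Java; names are made up):

  // Each (long) cast is an HTypeConversion node; IsSignExtensionAndGet now
  // recurses into its input with to64 set.
  long q = 0;
  for (int i = 0; i < n; i++) {
    q += Math.abs((long) a[i] - (long) b[i]);  // a, b: int[]
  }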
@@ -214,6 +240,55 @@
return false;
}
+// Compute relative vector length based on type difference.
+static size_t GetOtherVL(Primitive::Type other_type, Primitive::Type vector_type, size_t vl) {
+ switch (other_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl;
+ default: break;
+ }
+ break;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 1;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl;
+ default: break;
+ }
+ break;
+ case Primitive::kPrimInt:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 2;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl >> 1;
+ case Primitive::kPrimInt: return vl;
+ default: break;
+ }
+ break;
+ case Primitive::kPrimLong:
+ switch (vector_type) {
+ case Primitive::kPrimBoolean:
+ case Primitive::kPrimByte: return vl >> 3;
+ case Primitive::kPrimChar:
+ case Primitive::kPrimShort: return vl >> 2;
+ case Primitive::kPrimInt: return vl >> 1;
+ case Primitive::kPrimLong: return vl;
+ default: break;
+ }
+ break;
+ default:
+ break;
+ }
+ LOG(FATAL) << "Unsupported idiom conversion";
+ UNREACHABLE();
+}
+
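GetOtherVL simply scales the operand vector length down by the width ratio of
the two types. A compact restatement of its table (hypothetical helper,
assuming integral types with the accumulator at least as wide as the vector
type):

  // vl >> log2(accumulator width / operand width); e.g. 16 byte lanes fold
  // into 4 int lanes (16 >> 2) or 2 long lanes (16 >> 3).
  static int otherVL(int accBytes, int vecBytes, int vl) {
    return vl >> Integer.numberOfTrailingZeros(accBytes / vecBytes);
  }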
// Detect up to two instructions a and b, and an accumulated constant c.
static bool IsAddConstHelper(HInstruction* instruction,
/*out*/ HInstruction** a,
@@ -260,16 +335,16 @@
}
// Detect reductions of the following forms,
-// under assumption phi has only *one* use:
// x = x_phi + ..
// x = x_phi - ..
// x = max(x_phi, ..)
// x = min(x_phi, ..)
static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
if (reduction->IsAdd()) {
- return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+ (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
} else if (reduction->IsSub()) {
- return reduction->InputAt(0) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi);
} else if (reduction->IsInvokeStaticOrDirect()) {
switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
case Intrinsics::kMathMinIntInt:
@@ -280,7 +355,8 @@
case Intrinsics::kMathMaxLongLong:
case Intrinsics::kMathMaxFloatFloat:
case Intrinsics::kMathMaxDoubleDouble:
- return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+ return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+ (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
default:
return false;
}
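The tightened checks now require the phi on exactly one side, ruling out
degenerate forms. In Java terms (sketch):

  // Still recognized as a reduction: the phi x feeds exactly one operand.
  for (int i = 0; i < n; i++) x = x + a[i];
  // No longer matched: the phi appears as both operands of the add.
  for (int i = 0; i < n; i++) x = x + x;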
@@ -288,9 +364,9 @@
return false;
}
-// Translates operation to reduction kind.
-static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) {
- if (reduction->IsVecAdd() || reduction->IsVecSub()) {
+// Translates vector operation to reduction kind.
+static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
+ if (reduction->IsVecAdd() || reduction->IsVecSub() || reduction->IsVecSADAccumulate()) {
return HVecReduce::kSum;
} else if (reduction->IsVecMin()) {
return HVecReduce::kMin;
@@ -720,7 +796,6 @@
HBasicBlock* block,
HBasicBlock* exit,
int64_t trip_count) {
- Primitive::Type induc_type = Primitive::kPrimInt;
HBasicBlock* header = node->loop_info->GetHeader();
HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -739,6 +814,10 @@
vector_header_ = header;
vector_body_ = block;
+ // Loop induction type.
+ Primitive::Type induc_type = main_phi->GetType();
+ DCHECK(induc_type == Primitive::kPrimInt || induc_type == Primitive::kPrimLong) << induc_type;
+
// Generate dynamic loop peeling trip count, if needed, under the assumption
// that the Android runtime guarantees at least "component size" alignment:
// ptc = (ALIGN - (&a[initial] % ALIGN)) / type-size
@@ -767,10 +846,10 @@
HInstruction* rem = Insert(
preheader, new (global_allocator_) HAnd(induc_type,
diff,
- graph_->GetIntConstant(chunk - 1)));
+ graph_->GetConstant(induc_type, chunk - 1)));
vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
}
- vector_index_ = graph_->GetIntConstant(0);
+ vector_index_ = graph_->GetConstant(induc_type, 0);
// Generate runtime disambiguation test:
// vtc = a != b ? vtc : 0;
@@ -779,7 +858,8 @@
preheader,
new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_));
vtc = Insert(preheader,
- new (global_allocator_) HSelect(rt, vtc, graph_->GetIntConstant(0), kNoDexPc));
+ new (global_allocator_)
+ HSelect(rt, vtc, graph_->GetConstant(induc_type, 0), kNoDexPc));
needs_cleanup = true;
}
@@ -793,7 +873,7 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
ptc,
- graph_->GetIntConstant(1),
+ graph_->GetConstant(induc_type, 1),
kNoUnrollingFactor);
}
@@ -806,7 +886,7 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
vtc,
- graph_->GetIntConstant(vector_length_), // increment per unroll
+ graph_->GetConstant(induc_type, vector_length_), // increment per unroll
unroll);
HLoopInformation* vloop = vector_header_->GetLoopInformation();
@@ -820,14 +900,20 @@
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
vector_index_,
stc,
- graph_->GetIntConstant(1),
+ graph_->GetConstant(induc_type, 1),
kNoUnrollingFactor);
}
// Link reductions to their final uses.
for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
if (i->first->IsPhi()) {
- i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second));
+ HInstruction* phi = i->first;
+ HInstruction* repl = ReduceAndExtractIfNeeded(i->second);
+ // Deal with regular uses.
+ for (const HUseListNode<HInstruction*>& use : phi->GetUses()) {
+ induction_range_.Replace(use.GetUser(), phi, repl); // update induction use
+ }
+ phi->ReplaceWith(repl);
}
}
@@ -853,7 +939,7 @@
HInstruction* step,
uint32_t unroll) {
DCHECK(unroll == 1 || vector_mode_ == kVector);
- Primitive::Type induc_type = Primitive::kPrimInt;
+ Primitive::Type induc_type = lo->GetType();
// Prepare new loop.
vector_preheader_ = new_preheader,
vector_header_ = vector_preheader_->GetSingleSuccessor();
@@ -942,8 +1028,10 @@
auto redit = reductions_->find(instruction);
if (redit != reductions_->end()) {
Primitive::Type type = instruction->GetType();
- if (TrySetVectorType(type, &restrictions) &&
- VectorizeUse(node, instruction, generate_code, type, restrictions)) {
+ // Recognize SAD idiom or direct reduction.
+ if (VectorizeSADIdiom(node, instruction, generate_code, type, restrictions) ||
+ (TrySetVectorType(type, &restrictions) &&
+ VectorizeUse(node, instruction, generate_code, type, restrictions))) {
if (generate_code) {
HInstruction* new_red = vector_map_->Get(instruction);
vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second));
@@ -1029,14 +1117,20 @@
HInstruction* opa = conversion->InputAt(0);
Primitive::Type from = conversion->GetInputType();
Primitive::Type to = conversion->GetResultType();
- if ((to == Primitive::kPrimByte ||
- to == Primitive::kPrimChar ||
- to == Primitive::kPrimShort) && from == Primitive::kPrimInt) {
- // Accept a "narrowing" type conversion from a "wider" computation for
- // (1) conversion into final required type,
- // (2) vectorizable operand,
- // (3) "wider" operations cannot bring in higher order bits.
- if (to == type && VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) {
+ if (Primitive::IsIntegralType(from) && Primitive::IsIntegralType(to)) {
+ size_t size_vec = Primitive::ComponentSize(type);
+ size_t size_from = Primitive::ComponentSize(from);
+ size_t size_to = Primitive::ComponentSize(to);
+ // Accept an integral conversion
+ // (1a) narrowing into vector type, "wider" operations cannot bring in higher order bits, or
+ // (1b) widening from at least vector type, and
+ // (2) vectorizable operand.
+ if ((size_to < size_from &&
+ size_to == size_vec &&
+ VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) ||
+ (size_to >= size_from &&
+ size_from >= size_vec &&
+ VectorizeUse(node, opa, generate_code, type, restrictions))) {
if (generate_code) {
if (vector_mode_ == kVector) {
vector_map_->Put(instruction, vector_map_->Get(opa)); // operand pass-through
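Concretely, the two accepted conversion shapes look like this in Java
(illustrative only; type is the vector type being built):

  // (1a) narrowing back into the vector type, as before:
  b[i] = (byte) (a[i] + 1);                  // int -> byte with type == byte
  // (1b) new: widening from at least the vector type passes through, e.g.
  // inside a long reduction over int operands:
  q += Math.abs((long) a[i] - (long) b[i]);  // int -> long pass-through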
@@ -1088,7 +1182,7 @@
return true;
}
} else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) {
- // Recognize vectorization idioms.
+ // Recognize halving add idiom.
if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) {
return true;
}
@@ -1181,7 +1275,8 @@
return false; // reject, unless all operands are same-extension narrower
}
// Accept MIN/MAX(x, y) for vectorizable operands.
- DCHECK(r != nullptr && s != nullptr);
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
r = opa;
s = opb;
@@ -1232,11 +1327,11 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
*restrictions |= kNoDiv;
@@ -1261,17 +1356,17 @@
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
*restrictions |=
- kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+ kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+ *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv;
+ *restrictions |= kNoDiv | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
+ *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // minmax: -0.0 vs +0.0
@@ -1289,17 +1384,17 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN)
@@ -1317,17 +1412,17 @@
switch (type) {
case Primitive::kPrimBoolean:
case Primitive::kPrimByte:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(16);
case Primitive::kPrimChar:
case Primitive::kPrimShort:
- *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+ *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
return TrySetVectorLength(8);
case Primitive::kPrimInt:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(4);
case Primitive::kPrimLong:
- *restrictions |= kNoDiv | kNoReduction;
+ *restrictions |= kNoDiv | kNoReduction | kNoSAD;
return TrySetVectorLength(2);
case Primitive::kPrimFloat:
*restrictions |= kNoMinMax | kNoReduction; // min/max(x, NaN)
@@ -1371,8 +1466,16 @@
if (it != vector_permanent_map_->end()) {
vector = it->second; // reuse during unrolling
} else {
- vector = new (global_allocator_) HVecReplicateScalar(
- global_allocator_, org, type, vector_length_);
+ // Generates ReplicateScalar( (optional_type_conv) org ).
+ HInstruction* input = org;
+ Primitive::Type input_type = input->GetType();
+ if (type != input_type && (type == Primitive::kPrimLong ||
+ input_type == Primitive::kPrimLong)) {
+ input = Insert(vector_preheader_,
+ new (global_allocator_) HTypeConversion(type, input, kNoDexPc));
+ }
+ vector = new (global_allocator_)
+ HVecReplicateScalar(global_allocator_, input, type, vector_length_);
vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
}
vector_map_->Put(org, vector);
@@ -1465,10 +1568,15 @@
// Prepare the new initialization.
if (vector_mode_ == kVector) {
// Generate a [initial, 0, .., 0] vector.
- new_init = Insert(
- vector_preheader_,
- new (global_allocator_) HVecSetScalars(
- global_allocator_, &new_init, phi->GetType(), vector_length_, 1));
+ HVecOperation* red_vector = new_red->AsVecOperation();
+ size_t vector_length = red_vector->GetVectorLength();
+ Primitive::Type type = red_vector->GetPackedType();
+ new_init = Insert(vector_preheader_,
+ new (global_allocator_) HVecSetScalars(global_allocator_,
+ &new_init,
+ type,
+ vector_length,
+ 1));
} else {
new_init = ReduceAndExtractIfNeeded(new_init);
}
@@ -1484,18 +1592,20 @@
if (instruction->IsPhi()) {
HInstruction* input = instruction->InputAt(1);
if (input->IsVecOperation()) {
- Primitive::Type type = input->AsVecOperation()->GetPackedType();
+ HVecOperation* input_vector = input->AsVecOperation();
+ size_t vector_length = input_vector->GetVectorLength();
+ Primitive::Type type = input_vector->GetPackedType();
+ HVecReduce::ReductionKind kind = GetReductionKind(input_vector);
HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0];
// Generate a vector reduction and scalar extract
// x = REDUCE( [x_1, .., x_n] )
// y = x_1
// along the exit of the defining loop.
- HVecReduce::ReductionKind kind = GetReductionKind(input);
HInstruction* reduce = new (global_allocator_) HVecReduce(
- global_allocator_, instruction, type, vector_length_, kind);
+ global_allocator_, instruction, type, vector_length, kind);
exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction());
instruction = new (global_allocator_) HVecExtractScalar(
- global_allocator_, reduce, type, vector_length_, 0);
+ global_allocator_, reduce, type, vector_length, 0);
exit->InsertInstructionAfter(instruction, reduce);
}
}
@@ -1516,27 +1626,19 @@
HInstruction* opb,
Primitive::Type type,
bool is_unsigned) {
- if (vector_mode_ == kSequential) {
- // Non-converting scalar code follows implicit integral promotion.
- if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean ||
- type == Primitive::kPrimByte ||
- type == Primitive::kPrimChar ||
- type == Primitive::kPrimShort)) {
- type = Primitive::kPrimInt;
- }
- }
HInstruction* vector = nullptr;
+ Primitive::Type org_type = org->GetType();
switch (org->GetKind()) {
case HInstruction::kNeg:
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecNeg(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HNeg(type, opa));
+ new (global_allocator_) HNeg(org_type, opa));
case HInstruction::kNot:
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HNot(type, opa));
+ new (global_allocator_) HNot(org_type, opa));
case HInstruction::kBooleanNot:
DCHECK(opb == nullptr);
GENERATE_VEC(
@@ -1546,47 +1648,47 @@
DCHECK(opb == nullptr);
GENERATE_VEC(
new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_),
- new (global_allocator_) HTypeConversion(type, opa, kNoDexPc));
+ new (global_allocator_) HTypeConversion(org_type, opa, kNoDexPc));
case HInstruction::kAdd:
GENERATE_VEC(
new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HAdd(type, opa, opb));
+ new (global_allocator_) HAdd(org_type, opa, opb));
case HInstruction::kSub:
GENERATE_VEC(
new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HSub(type, opa, opb));
+ new (global_allocator_) HSub(org_type, opa, opb));
case HInstruction::kMul:
GENERATE_VEC(
new (global_allocator_) HVecMul(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HMul(type, opa, opb));
+ new (global_allocator_) HMul(org_type, opa, opb));
case HInstruction::kDiv:
GENERATE_VEC(
new (global_allocator_) HVecDiv(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HDiv(type, opa, opb, kNoDexPc));
+ new (global_allocator_) HDiv(org_type, opa, opb, kNoDexPc));
case HInstruction::kAnd:
GENERATE_VEC(
new (global_allocator_) HVecAnd(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HAnd(type, opa, opb));
+ new (global_allocator_) HAnd(org_type, opa, opb));
case HInstruction::kOr:
GENERATE_VEC(
new (global_allocator_) HVecOr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HOr(type, opa, opb));
+ new (global_allocator_) HOr(org_type, opa, opb));
case HInstruction::kXor:
GENERATE_VEC(
new (global_allocator_) HVecXor(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HXor(type, opa, opb));
+ new (global_allocator_) HXor(org_type, opa, opb));
case HInstruction::kShl:
GENERATE_VEC(
new (global_allocator_) HVecShl(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HShl(type, opa, opb));
+ new (global_allocator_) HShl(org_type, opa, opb));
case HInstruction::kShr:
GENERATE_VEC(
new (global_allocator_) HVecShr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HShr(type, opa, opb));
+ new (global_allocator_) HShr(org_type, opa, opb));
case HInstruction::kUShr:
GENERATE_VEC(
new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_),
- new (global_allocator_) HUShr(type, opa, opb));
+ new (global_allocator_) HUShr(org_type, opa, opb));
case HInstruction::kInvokeStaticOrDirect: {
HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
if (vector_mode_ == kVector) {
@@ -1667,8 +1769,8 @@
//
// Method recognizes the following idioms:
-// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
-// regular halving add (a + b) >> 1 for unsigned/signed operands a, b
+// rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
+// truncated halving add (a + b) >> 1 for unsigned/signed operands a, b
// Provided that the operands are promoted to a wider form to do the arithmetic and
// then cast back to narrower form, the idioms can be mapped into efficient SIMD
// implementation that operates directly in narrower form (plus one extra bit).
@@ -1712,7 +1814,8 @@
}
// Accept recognized halving add for vectorizable operands. Vectorized code uses the
// shorthand idiomatic operation. Sequential code uses the original scalar expressions.
- DCHECK(r != nullptr && s != nullptr);
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
if (generate_code && vector_mode_ != kVector) { // de-idiom
r = instruction->InputAt(0);
s = instruction->InputAt(1);
@@ -1741,6 +1844,88 @@
return false;
}
+// Method recognizes the following idiom:
+// q += ABS(a - b) for signed operands a, b
+// Provided that the operands have the same type or are promoted to a wider form.
+// Since this may involve a vector length change, the idiom is handled by going directly
+// to a sad-accumulate node (rather than relying on combining finer-grained nodes later).
+// TODO: unsigned SAD too?
+bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ Primitive::Type reduction_type,
+ uint64_t restrictions) {
+ // Filter integral "q += ABS(a - b);" reduction, where ABS and SUB
+ // are done in the same precision (either int or long).
+ if (!instruction->IsAdd() ||
+ (reduction_type != Primitive::kPrimInt && reduction_type != Primitive::kPrimLong)) {
+ return false;
+ }
+ HInstruction* q = instruction->InputAt(0);
+ HInstruction* v = instruction->InputAt(1);
+ HInstruction* a = nullptr;
+ HInstruction* b = nullptr;
+ if (v->IsInvokeStaticOrDirect() &&
+ (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
+ v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
+ HInstruction* x = v->InputAt(0);
+ if (x->IsSub() && x->GetType() == reduction_type) {
+ a = x->InputAt(0);
+ b = x->InputAt(1);
+ }
+ }
+ if (a == nullptr || b == nullptr) {
+ return false;
+ }
+ // Accept same-type operands, or a consistent sign extension from a narrower type, on operands a and b.
+ // The same-type or narrower operands are called r (a or lower) and s (b or lower).
+ HInstruction* r = a;
+ HInstruction* s = b;
+ bool is_unsigned = false;
+ Primitive::Type sub_type = a->GetType();
+ if (a->IsTypeConversion()) {
+ sub_type = a->InputAt(0)->GetType();
+ } else if (b->IsTypeConversion()) {
+ sub_type = b->InputAt(0)->GetType();
+ }
+ if (reduction_type != sub_type &&
+ (!IsNarrowerOperands(a, b, sub_type, &r, &s, &is_unsigned) || is_unsigned)) {
+ return false;
+ }
+ // Try same/narrower type and deal with vector restrictions.
+ if (!TrySetVectorType(sub_type, &restrictions) || HasVectorRestrictions(restrictions, kNoSAD)) {
+ return false;
+ }
+ // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
+ // idiomatic operation. Sequential code uses the original scalar expressions.
+ DCHECK(r != nullptr);
+ DCHECK(s != nullptr);
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ r = s = v->InputAt(0);
+ }
+ if (VectorizeUse(node, q, generate_code, sub_type, restrictions) &&
+ VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
+ VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
+ if (generate_code) {
+ if (vector_mode_ == kVector) {
+ vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
+ global_allocator_,
+ vector_map_->Get(q),
+ vector_map_->Get(r),
+ vector_map_->Get(s),
+ reduction_type,
+ GetOtherVL(reduction_type, sub_type, vector_length_)));
+ MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+ } else {
+ GenerateVecOp(v, vector_map_->Get(r), nullptr, reduction_type);
+ GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
//
// Vectorization heuristics.
//
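Putting the pieces together, a mixed-type example that VectorizeSADIdiom
accepts (illustrative Java):

  // a, b: byte[]. The subtract and abs happen in int after implicit sign
  // extension, so sub_type = byte while reduction_type = int; on ARM64 this
  // gives 16 byte lanes feeding a 4-lane int accumulator, i.e.
  // GetOtherVL(int, byte, 16) == 4.
  static int sad(byte[] a, byte[] b) {
    int q = 0;
    for (int i = 0; i < a.length; i++) {
      q += Math.abs(a[i] - b[i]);
    }
    return q;
  }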
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index f347518..ae2ea76 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -75,6 +75,7 @@
kNoMinMax = 1 << 8, // no min/max
kNoStringCharAt = 1 << 9, // no StringCharAt
kNoReduction = 1 << 10, // no reduction
+ kNoSAD = 1 << 11, // no sum of absolute differences (SAD)
};
/*
@@ -172,6 +173,11 @@
bool generate_code,
Primitive::Type type,
uint64_t restrictions);
+ bool VectorizeSADIdiom(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ Primitive::Type type,
+ uint64_t restrictions);
// Vectorization heuristics.
bool IsVectorizationProfitable(int64_t trip_count);
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index a6d0da1..6bc5111 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1396,6 +1396,7 @@
M(VecUShr, VecBinaryOperation) \
M(VecSetScalars, VecOperation) \
M(VecMultiplyAccumulate, VecOperation) \
+ M(VecSADAccumulate, VecOperation) \
M(VecLoad, VecMemoryOperation) \
M(VecStore, VecMemoryOperation) \
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c5e75a7..1488b70 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -461,8 +461,8 @@
};
// Performs halving add on every component in the two vectors, viz.
-// rounded [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
-// or [ x1, .. , xn ] hadd [ y1, .. , yn ] = [ (x1 + y1) >> 1, .. , (xn + yn ) >> 1 ]
+// rounded [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
+// truncated [ x1, .. , xn ] hadd [ y1, .. , yn ] = [ (x1 + y1) >> 1, .. , (xn + yn ) >> 1 ]
// for signed operands x, y (sign extension) or unsigned operands x, y (zero extension).
class HVecHalvingAdd FINAL : public HVecBinaryOperation {
public:
@@ -810,8 +810,8 @@
//
// Assigns the given scalar elements to a vector,
-// viz. set( array(x1, .., xn) ) = [ x1, .. , xn ] if n == m,
-// set( array(x1, .., xm) ) = [ x1, .. , xm, 0, .., 0 ] if m < n.
+// viz. set( array(x1, .. , xn) ) = [ x1, .. , xn ] if n == m,
+// set( array(x1, .. , xm) ) = [ x1, .. , xm, 0, .. , 0 ] if m < n.
class HVecSetScalars FINAL : public HVecOperation {
public:
HVecSetScalars(ArenaAllocator* arena,
@@ -842,9 +842,8 @@
DISALLOW_COPY_AND_ASSIGN(HVecSetScalars);
};
-// Multiplies every component in the two vectors, adds the result vector to the accumulator vector.
-// viz. [ acc1, .., accn ] + [ x1, .. , xn ] * [ y1, .. , yn ] =
-// [ acc1 + x1 * y1, .. , accn + xn * yn ].
+// Multiplies every component in the two vectors, adds the result vector to the accumulator vector,
+// viz. [ a1, .. , an ] + [ x1, .. , xn ] * [ y1, .. , yn ] = [ a1 + x1 * y1, .. , an + xn * yn ].
class HVecMultiplyAccumulate FINAL : public HVecOperation {
public:
HVecMultiplyAccumulate(ArenaAllocator* arena,
@@ -866,15 +865,11 @@
DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
DCHECK(HasConsistentPackedTypes(mul_left, packed_type));
DCHECK(HasConsistentPackedTypes(mul_right, packed_type));
- SetRawInputAt(kInputAccumulatorIndex, accumulator);
- SetRawInputAt(kInputMulLeftIndex, mul_left);
- SetRawInputAt(kInputMulRightIndex, mul_right);
+ SetRawInputAt(0, accumulator);
+ SetRawInputAt(1, mul_left);
+ SetRawInputAt(2, mul_right);
}
- static constexpr int kInputAccumulatorIndex = 0;
- static constexpr int kInputMulLeftIndex = 1;
- static constexpr int kInputMulRightIndex = 2;
-
bool CanBeMoved() const OVERRIDE { return true; }
bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
@@ -894,6 +889,42 @@
DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate);
};
+// Takes the absolute difference of two vectors, and adds the results to
+// same-precision or wider-precision components in the accumulator,
+// viz. SAD([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ]) =
+// [ a1 + sum abs(xi-yi), .. , am + sum abs(xj-yj) ],
+// for m <= n and non-overlapping sums.
+class HVecSADAccumulate FINAL : public HVecOperation {
+ public:
+ HVecSADAccumulate(ArenaAllocator* arena,
+ HInstruction* accumulator,
+ HInstruction* sad_left,
+ HInstruction* sad_right,
+ Primitive::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc = kNoDexPc)
+ : HVecOperation(arena,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs */ 3,
+ vector_length,
+ dex_pc) {
+ DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+ DCHECK(sad_left->IsVecOperation());
+ DCHECK(sad_right->IsVecOperation());
+ DCHECK_EQ(sad_left->AsVecOperation()->GetPackedType(),
+ sad_right->AsVecOperation()->GetPackedType());
+ SetRawInputAt(0, accumulator);
+ SetRawInputAt(1, sad_left);
+ SetRawInputAt(2, sad_right);
+ }
+
+ DECLARE_INSTRUCTION(VecSADAccumulate);
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(HVecSADAccumulate);
+};
+
// Loads a vector from memory, viz. load(mem, 1)
// yields the vector [ mem(1), .. , mem(n) ].
class HVecLoad FINAL : public HVecMemoryOperation {
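A scalar model of the HVecSADAccumulate semantics defined above (hypothetical
helper; the definition only fixes the non-overlapping partition, not which
source lanes fold into which accumulator lane):

  // acc has m lanes, x and y have n >= m lanes; every |x[i] - y[i]| lands in
  // exactly one accumulator lane. The i % m folding mirrors the SABAL/SABAL2
  // interleaving for a single widening step.
  static void sadAccumulate(long[] acc, int[] x, int[] y) {
    for (int i = 0; i < x.length; i++) {
      acc[i % acc.length] += Math.abs((long) x[i] - y[i]);
    }
  }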