summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--TEST_MAPPING6
-rw-r--r--compiler/optimizing/code_generator_arm64.h29
-rw-r--r--compiler/optimizing/code_generator_riscv64.cc24
-rw-r--r--compiler/optimizing/code_generator_vector_arm64_neon.cc24
-rw-r--r--compiler/optimizing/code_generator_vector_arm64_sve.cc130
-rw-r--r--compiler/optimizing/code_generator_vector_arm_vixl.cc24
-rw-r--r--compiler/optimizing/code_generator_vector_x86.cc24
-rw-r--r--compiler/optimizing/code_generator_vector_x86_64.cc24
-rw-r--r--compiler/optimizing/codegen_test.cc9
-rw-r--r--compiler/optimizing/dead_code_elimination.cc14
-rw-r--r--compiler/optimizing/graph_visualizer.cc1
-rw-r--r--compiler/optimizing/load_store_analysis.cc7
-rw-r--r--compiler/optimizing/load_store_analysis.h2
-rw-r--r--compiler/optimizing/load_store_elimination.cc9
-rw-r--r--compiler/optimizing/load_store_elimination_test.cc42
-rw-r--r--compiler/optimizing/loop_optimization.cc543
-rw-r--r--compiler/optimizing/loop_optimization.h139
-rw-r--r--compiler/optimizing/nodes.cc9
-rw-r--r--compiler/optimizing/nodes.h60
-rw-r--r--compiler/optimizing/nodes_vector.h87
-rw-r--r--test/530-checker-lse-simd/src/Main.java49
-rw-r--r--test/661-checker-simd-cf-loops/Android.bp43
-rw-r--r--test/661-checker-simd-cf-loops/expected-stderr.txt0
-rw-r--r--test/661-checker-simd-cf-loops/expected-stdout.txt1
-rw-r--r--test/661-checker-simd-cf-loops/info.txt1
-rw-r--r--test/661-checker-simd-cf-loops/src/Main.java746
26 files changed, 1831 insertions, 216 deletions
diff --git a/TEST_MAPPING b/TEST_MAPPING
index 38ff5eda55..d4ea0d8428 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -1181,6 +1181,9 @@
"name": "art-run-test-660-checker-simd-sad[com.google.android.art.apex]"
},
{
+ "name": "art-run-test-661-checker-simd-cf-loops[com.google.android.art.apex]"
+ },
+ {
"name": "art-run-test-661-checker-simd-reduc[com.google.android.art.apex]"
},
{
@@ -2600,6 +2603,9 @@
"name": "art-run-test-660-checker-simd-sad"
},
{
+ "name": "art-run-test-661-checker-simd-cf-loops"
+ },
+ {
"name": "art-run-test-661-checker-simd-reduc"
},
{
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 13734d7fb8..bf25418a2f 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -554,12 +554,31 @@ class InstructionCodeGeneratorARM64Sve : public InstructionCodeGeneratorARM64 {
// register size (full SIMD register is used).
void ValidateVectorLength(HVecOperation* instr) const;
- // Returns default predicate register which is used as governing vector predicate
- // to implement predicated loop execution.
+ vixl::aarch64::PRegister GetVecGoverningPReg(HVecOperation* instr) {
+ return GetVecPredSetFixedOutPReg(instr->GetGoverningPredicate());
+ }
+
+ // Returns a fixed p-reg for predicate setting instruction.
//
- // TODO: This is a hack to be addressed when register allocator supports SIMD types.
- static vixl::aarch64::PRegister LoopPReg() {
- return vixl::aarch64::p0;
+ // Currently we only support diamond CF loops for predicated vectorization; also we don't have
+ // register allocator support for vector predicates. Thus we use fixed P-regs for loop main,
+ // True and False predicates as a temporary solution.
+ //
+ // TODO: Support SIMD types and registers in ART.
+ static vixl::aarch64::PRegister GetVecPredSetFixedOutPReg(HVecPredSetOperation* instr) {
+ if (instr->IsVecPredWhile() || instr->IsVecPredSetAll()) {
+    // VecPredWhile and VecPredSetAll live ranges never overlap due to the current vectorization
+    // scheme: the former is only live inside a vectorized loop and the latter is never in a
+    // loop and never spans across loops.
+ return vixl::aarch64::p0;
+ } else if (instr->IsVecPredNot()) {
+ // This relies on the fact that we only use PredNot manually in the autovectorizer,
+ // so there is only one of them in each loop.
+ return vixl::aarch64::p1;
+ } else {
+ DCHECK(instr->IsVecCondition());
+ return vixl::aarch64::p2;
+ }
}
};
diff --git a/compiler/optimizing/code_generator_riscv64.cc b/compiler/optimizing/code_generator_riscv64.cc
index c7cd26c83f..df40337f51 100644
--- a/compiler/optimizing/code_generator_riscv64.cc
+++ b/compiler/optimizing/code_generator_riscv64.cc
@@ -2017,12 +2017,32 @@ void InstructionCodeGeneratorRISCV64::VisitVecPredWhile(HVecPredWhile* instructi
LOG(FATAL) << "Unimplemented";
}
-void LocationsBuilderRISCV64::VisitVecPredCondition(HVecPredCondition* instruction) {
+void LocationsBuilderRISCV64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
UNUSED(instruction);
LOG(FATAL) << "Unimplemented";
}
-void InstructionCodeGeneratorRISCV64::VisitVecPredCondition(HVecPredCondition* instruction) {
+void InstructionCodeGeneratorRISCV64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
+ UNUSED(instruction);
+ LOG(FATAL) << "Unimplemented";
+}
+
+void LocationsBuilderRISCV64::VisitVecCondition(HVecCondition* instruction) {
+ UNUSED(instruction);
+ LOG(FATAL) << "Unimplemented";
+}
+
+void InstructionCodeGeneratorRISCV64::VisitVecCondition(HVecCondition* instruction) {
+ UNUSED(instruction);
+ LOG(FATAL) << "Unimplemented";
+}
+
+void LocationsBuilderRISCV64::VisitVecPredNot(HVecPredNot* instruction) {
+ UNUSED(instruction);
+ LOG(FATAL) << "Unimplemented";
+}
+
+void InstructionCodeGeneratorRISCV64::VisitVecPredNot(HVecPredNot* instruction) {
UNUSED(instruction);
LOG(FATAL) << "Unimplemented";
}
diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc
index ce02bfa21a..848b5e7567 100644
--- a/compiler/optimizing/code_generator_vector_arm64_neon.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc
@@ -1531,12 +1531,32 @@ void InstructionCodeGeneratorARM64Neon::VisitVecPredWhile(HVecPredWhile* instruc
UNREACHABLE();
}
-void LocationsBuilderARM64Neon::VisitVecPredCondition(HVecPredCondition* instruction) {
+void LocationsBuilderARM64Neon::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
UNREACHABLE();
}
-void InstructionCodeGeneratorARM64Neon::VisitVecPredCondition(HVecPredCondition* instruction) {
+void InstructionCodeGeneratorARM64Neon::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void LocationsBuilderARM64Neon::VisitVecCondition(HVecCondition* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void InstructionCodeGeneratorARM64Neon::VisitVecCondition(HVecCondition* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void LocationsBuilderARM64Neon::VisitVecPredNot(HVecPredNot* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void InstructionCodeGeneratorARM64Neon::VisitVecPredNot(HVecPredNot* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
UNREACHABLE();
}
diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc
index 4c16c3eb38..ef79932899 100644
--- a/compiler/optimizing/code_generator_vector_arm64_sve.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc
@@ -245,7 +245,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) {
LocationSummary* locations = instruction->GetLocations();
const ZRegister src = ZRegisterFrom(locations->InAt(0));
const VRegister dst = DRegisterFrom(locations->Out());
- const PRegister p_reg = LoopPReg();
+ const PRegister p_reg = GetVecGoverningPReg(instruction);
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kInt32:
@@ -283,7 +283,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) {
LocationSummary* locations = instruction->GetLocations();
const ZRegister src = ZRegisterFrom(locations->InAt(0));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
DataType::Type from = instruction->GetInputType();
DataType::Type to = instruction->GetResultType();
ValidateVectorLength(instruction);
@@ -303,7 +303,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) {
LocationSummary* locations = instruction->GetLocations();
const ZRegister src = ZRegisterFrom(locations->InAt(0));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
@@ -341,7 +341,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) {
LocationSummary* locations = instruction->GetLocations();
const ZRegister src = ZRegisterFrom(locations->InAt(0));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kInt8:
@@ -377,7 +377,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) {
LocationSummary* locations = instruction->GetLocations();
const ZRegister src = ZRegisterFrom(locations->InAt(0));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kBool: // special case boolean-not
@@ -437,7 +437,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) {
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
@@ -496,7 +496,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) {
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
@@ -545,7 +545,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) {
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
@@ -584,7 +584,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) {
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
// Note: VIXL guarantees StrictNaNPropagation for Fdiv.
@@ -632,7 +632,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) {
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
@@ -677,7 +677,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) {
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
@@ -713,7 +713,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) {
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
@@ -768,7 +768,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* instruction) {
LocationSummary* locations = instruction->GetLocations();
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
@@ -801,7 +801,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) {
LocationSummary* locations = instruction->GetLocations();
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
@@ -834,7 +834,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) {
LocationSummary* locations = instruction->GetLocations();
const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
const ZRegister dst = ZRegisterFrom(locations->Out());
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
@@ -965,7 +965,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate(
const ZRegister acc = ZRegisterFrom(locations->InAt(0));
const ZRegister left = ZRegisterFrom(locations->InAt(1));
const ZRegister right = ZRegisterFrom(locations->InAt(2));
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
DCHECK(locations->InAt(0).Equals(locations->Out()));
ValidateVectorLength(instruction);
@@ -1028,7 +1028,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction)
const ZRegister acc = ZRegisterFrom(locations->InAt(0));
const ZRegister left = ZRegisterFrom(locations->InAt(1));
const ZRegister right = ZRegisterFrom(locations->InAt(2));
- const PRegisterM p_reg = LoopPReg().Merging();
+ const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging();
HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
@@ -1098,7 +1098,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) {
const ZRegister reg = ZRegisterFrom(locations->Out());
UseScratchRegisterScope temps(GetVIXLAssembler());
Register scratch;
- const PRegisterZ p_reg = LoopPReg().Zeroing();
+ const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
@@ -1140,7 +1140,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) {
const ZRegister reg = ZRegisterFrom(locations->InAt(2));
UseScratchRegisterScope temps(GetVIXLAssembler());
Register scratch;
- const PRegisterZ p_reg = LoopPReg().Zeroing();
+ const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing();
ValidateVectorLength(instruction);
switch (instruction->GetPackedType()) {
@@ -1181,25 +1181,25 @@ void LocationsBuilderARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) {
void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) {
// Instruction is not predicated, see nodes_vector.h
DCHECK(!instruction->IsPredicated());
- const PRegister p_reg = LoopPReg();
+ const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction);
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
- __ Ptrue(p_reg.VnB(), vixl::aarch64::SVE_ALL);
+ __ Ptrue(output_p_reg.VnB(), vixl::aarch64::SVE_ALL);
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
- __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL);
+ __ Ptrue(output_p_reg.VnH(), vixl::aarch64::SVE_ALL);
break;
case DataType::Type::kInt32:
case DataType::Type::kFloat32:
- __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL);
+ __ Ptrue(output_p_reg.VnS(), vixl::aarch64::SVE_ALL);
break;
case DataType::Type::kInt64:
case DataType::Type::kFloat64:
- __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL);
+ __ Ptrue(output_p_reg.VnD(), vixl::aarch64::SVE_ALL);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1207,6 +1207,67 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instru
}
}
+void LocationsBuilderARM64Sve::VisitVecCondition(HVecCondition* instruction) {
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ locations->SetInAt(0, Location::RequiresFpuRegister());
+ locations->SetInAt(1, Location::RequiresFpuRegister());
+ locations->SetOut(Location::RequiresRegister());
+}
+
+void InstructionCodeGeneratorARM64Sve::VisitVecCondition(HVecCondition* instruction) {
+ DCHECK(instruction->IsPredicated());
+ LocationSummary* locations = instruction->GetLocations();
+ const ZRegister left = ZRegisterFrom(locations->InAt(0));
+ const ZRegister right = ZRegisterFrom(locations->InAt(1));
+ const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing();
+ const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction);
+
+ HVecOperation* a = instruction->InputAt(0)->AsVecOperation();
+ HVecOperation* b = instruction->InputAt(1)->AsVecOperation();
+ DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
+ HVecOperation::ToSignedType(b->GetPackedType()));
+ ValidateVectorLength(instruction);
+
+ // TODO: Support other condition OPs and types.
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ __ Cmpeq(output_p_reg.VnB(), p_reg, left.VnB(), right.VnB());
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ __ Cmpeq(output_p_reg.VnH(), p_reg, left.VnH(), right.VnH());
+ break;
+ case DataType::Type::kInt32:
+ __ Cmpeq(output_p_reg.VnS(), p_reg, left.VnS(), right.VnS());
+ break;
+ case DataType::Type::kInt64:
+ __ Cmpeq(output_p_reg.VnD(), p_reg, left.VnD(), right.VnD());
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
+ UNREACHABLE();
+ }
+}
+
+void LocationsBuilderARM64Sve::VisitVecPredNot(HVecPredNot* instruction) {
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ DCHECK(instruction->InputAt(0)->IsVecPredSetOperation());
+ locations->SetInAt(0, Location::NoLocation());
+ locations->SetOut(Location::RequiresRegister());
+}
+
+void InstructionCodeGeneratorARM64Sve::VisitVecPredNot(HVecPredNot* instruction) {
+ DCHECK(instruction->IsPredicated());
+
+ const PRegister input_p_reg = GetVecPredSetFixedOutPReg(
+ instruction->InputAt(0)->AsVecPredSetOperation());
+ const PRegister control_p_reg = GetVecGoverningPReg(instruction);
+ const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction);
+
+ __ Not(output_p_reg.VnB(), control_p_reg.Zeroing(), input_p_reg.VnB());
+}
+
void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) {
LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
locations->SetInAt(0, Location::RequiresRegister());
@@ -1217,8 +1278,8 @@ void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) {
// Semantically, the out location of this instruction and predicate inputs locations of
// its users should be a fixed predicate register (similar to
// Location::RegisterLocation(int reg)). But the register allocator (RA) doesn't support
- // SIMD regs (e.g. predicate), so LoopPReg() is used explicitly without exposing it
- // to the RA.
+  // SIMD regs (e.g. predicate), so fixed registers are used explicitly without exposing them
+  // to the RA (through GetVecPredSetFixedOutPReg()).
//
// To make the RA happy Location::NoLocation() was used for all the vector instructions
// predicate inputs; but for the PredSetOperations (e.g. VecPredWhile) Location::NoLocation()
@@ -1240,21 +1301,22 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct
DCHECK(instruction->GetCondKind() == HVecPredWhile::CondKind::kLO);
Register left = InputRegisterAt(instruction, 0);
Register right = InputRegisterAt(instruction, 1);
+ const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction);
DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % instruction->GetVectorLength(), 0u);
switch (codegen_->GetSIMDRegisterWidth() / instruction->GetVectorLength()) {
case 1u:
- __ Whilelo(LoopPReg().VnB(), left, right);
+ __ Whilelo(output_p_reg.VnB(), left, right);
break;
case 2u:
- __ Whilelo(LoopPReg().VnH(), left, right);
+ __ Whilelo(output_p_reg.VnH(), left, right);
break;
case 4u:
- __ Whilelo(LoopPReg().VnS(), left, right);
+ __ Whilelo(output_p_reg.VnS(), left, right);
break;
case 8u:
- __ Whilelo(LoopPReg().VnD(), left, right);
+ __ Whilelo(output_p_reg.VnD(), left, right);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1262,20 +1324,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct
}
}
-void LocationsBuilderARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) {
+void LocationsBuilderARM64Sve::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
locations->SetInAt(0, Location::NoLocation());
// Result of the operation - a boolean value in a core register.
locations->SetOut(Location::RequiresRegister());
}
-void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) {
+void InstructionCodeGeneratorARM64Sve::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
// Instruction is not predicated, see nodes_vector.h
DCHECK(!instruction->IsPredicated());
Register reg = OutputRegister(instruction);
- // Currently VecPredCondition is only used as part of vectorized loop check condition
+ // Currently VecPredToBoolean is only used as part of vectorized loop check condition
// evaluation.
- DCHECK(instruction->GetPCondKind() == HVecPredCondition::PCondKind::kNFirst);
+ DCHECK(instruction->GetPCondKind() == HVecPredToBoolean::PCondKind::kNFirst);
__ Cset(reg, pl);
}
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index e8ecf28386..70f22af17b 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -1069,12 +1069,32 @@ void InstructionCodeGeneratorARMVIXL::VisitVecPredWhile(HVecPredWhile* instructi
UNREACHABLE();
}
-void LocationsBuilderARMVIXL::VisitVecPredCondition(HVecPredCondition* instruction) {
+void LocationsBuilderARMVIXL::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
UNREACHABLE();
}
-void InstructionCodeGeneratorARMVIXL::VisitVecPredCondition(HVecPredCondition* instruction) {
+void InstructionCodeGeneratorARMVIXL::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void LocationsBuilderARMVIXL::VisitVecCondition(HVecCondition* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecCondition(HVecCondition* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void LocationsBuilderARMVIXL::VisitVecPredNot(HVecPredNot* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecPredNot(HVecPredNot* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
UNREACHABLE();
}
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 343a6e1af4..1f9b2578ac 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1401,12 +1401,32 @@ void InstructionCodeGeneratorX86::VisitVecPredWhile(HVecPredWhile* instruction)
UNREACHABLE();
}
-void LocationsBuilderX86::VisitVecPredCondition(HVecPredCondition* instruction) {
+void LocationsBuilderX86::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
UNREACHABLE();
}
-void InstructionCodeGeneratorX86::VisitVecPredCondition(HVecPredCondition* instruction) {
+void InstructionCodeGeneratorX86::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void LocationsBuilderX86::VisitVecCondition(HVecCondition* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void InstructionCodeGeneratorX86::VisitVecCondition(HVecCondition* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void LocationsBuilderX86::VisitVecPredNot(HVecPredNot* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void InstructionCodeGeneratorX86::VisitVecPredNot(HVecPredNot* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
UNREACHABLE();
}
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index fb6e4e753f..47afa3b4a1 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1374,12 +1374,32 @@ void InstructionCodeGeneratorX86_64::VisitVecPredWhile(HVecPredWhile* instructio
UNREACHABLE();
}
-void LocationsBuilderX86_64::VisitVecPredCondition(HVecPredCondition* instruction) {
+void LocationsBuilderX86_64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
UNREACHABLE();
}
-void InstructionCodeGeneratorX86_64::VisitVecPredCondition(HVecPredCondition* instruction) {
+void InstructionCodeGeneratorX86_64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void LocationsBuilderX86_64::VisitVecCondition(HVecCondition* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecCondition(HVecCondition* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void LocationsBuilderX86_64::VisitVecPredNot(HVecPredNot* instruction) {
+ LOG(FATAL) << "No SIMD for " << instruction->GetId();
+ UNREACHABLE();
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecPredNot(HVecPredNot* instruction) {
LOG(FATAL) << "No SIMD for " << instruction->GetId();
UNREACHABLE();
}
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index f890ba9cc0..c72d3ea24a 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -796,7 +796,7 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) {
codegen.Initialize();
- graph->SetHasSIMD(true);
+ graph->SetHasTraditionalSIMD(true);
for (int i = 0; i < 2; i++) {
HParallelMove* move = new (graph->GetAllocator()) HParallelMove(graph->GetAllocator());
move->AddMove(Location::SIMDStackSlot(0),
@@ -816,7 +816,7 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) {
DataType::Type::kFloat64,
nullptr);
codegen.GetMoveResolver()->EmitNativeCode(move);
- graph->SetHasSIMD(false);
+ graph->SetHasTraditionalSIMD(false);
}
codegen.Finalize();
@@ -864,7 +864,7 @@ TEST_F(CodegenTest, ARM64FrameSizeSIMD) {
arm64::CodeGeneratorARM64 codegen(graph, *compiler_options);
codegen.Initialize();
- graph->SetHasSIMD(true);
+ graph->SetHasTraditionalSIMD(true);
DCHECK_EQ(arm64::callee_saved_fp_registers.GetCount(), 8);
vixl::aarch64::CPURegList reg_list = arm64::callee_saved_fp_registers;
@@ -884,7 +884,8 @@ TEST_F(CodegenTest, ARM64FrameSizeNoSIMD) {
arm64::CodeGeneratorARM64 codegen(graph, *compiler_options);
codegen.Initialize();
- graph->SetHasSIMD(false);
+ graph->SetHasTraditionalSIMD(false);
+ graph->SetHasPredicatedSIMD(false);
DCHECK_EQ(arm64::callee_saved_fp_registers.GetCount(), 8);
vixl::aarch64::CPURegList reg_list = arm64::callee_saved_fp_registers;
diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc
index cf49e39849..8e6b6db236 100644
--- a/compiler/optimizing/dead_code_elimination.cc
+++ b/compiler/optimizing/dead_code_elimination.cc
@@ -24,6 +24,7 @@
#include "base/scoped_arena_containers.h"
#include "base/stl_util.h"
#include "optimizing/nodes.h"
+#include "optimizing/nodes_vector.h"
#include "ssa_phi_elimination.h"
namespace art HIDDEN {
@@ -842,7 +843,8 @@ void HDeadCodeElimination::RemoveDeadInstructions() {
void HDeadCodeElimination::UpdateGraphFlags() {
bool has_monitor_operations = false;
- bool has_simd = false;
+ bool has_traditional_simd = false;
+ bool has_predicated_simd = false;
bool has_bounds_checks = false;
bool has_always_throwing_invokes = false;
@@ -852,7 +854,12 @@ void HDeadCodeElimination::UpdateGraphFlags() {
if (instruction->IsMonitorOperation()) {
has_monitor_operations = true;
} else if (instruction->IsVecOperation()) {
- has_simd = true;
+ HVecOperation* vec_instruction = instruction->AsVecOperation();
+ if (vec_instruction->IsPredicated()) {
+ has_predicated_simd = true;
+ } else {
+ has_traditional_simd = true;
+ }
} else if (instruction->IsBoundsCheck()) {
has_bounds_checks = true;
} else if (instruction->IsInvoke() && instruction->AsInvoke()->AlwaysThrows()) {
@@ -862,7 +869,8 @@ void HDeadCodeElimination::UpdateGraphFlags() {
}
graph_->SetHasMonitorOperations(has_monitor_operations);
- graph_->SetHasSIMD(has_simd);
+ graph_->SetHasTraditionalSIMD(has_traditional_simd);
+ graph_->SetHasPredicatedSIMD(has_predicated_simd);
graph_->SetHasBoundsChecks(has_bounds_checks);
graph_->SetHasAlwaysThrowingInvokes(has_always_throwing_invokes);
}
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 73bdd1e223..bd33fde907 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -610,6 +610,7 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
}
void VisitVecMemoryOperation(HVecMemoryOperation* vec_mem_operation) override {
+ VisitVecOperation(vec_mem_operation);
StartAttributeStream("alignment") << vec_mem_operation->GetAlignment().ToString();
}
diff --git a/compiler/optimizing/load_store_analysis.cc b/compiler/optimizing/load_store_analysis.cc
index b46e3e18d9..75000c8b91 100644
--- a/compiler/optimizing/load_store_analysis.cc
+++ b/compiler/optimizing/load_store_analysis.cc
@@ -268,6 +268,13 @@ bool HeapLocationCollector::CanArrayElementsAlias(const HInstruction* idx1,
}
bool LoadStoreAnalysis::Run() {
+ // Currently load_store analysis can't handle predicated load/stores; specifically pairs of
+ // memory operations with different predicates.
+ // TODO: support predicated SIMD.
+ if (graph_->HasPredicatedSIMD()) {
+ return false;
+ }
+
for (HBasicBlock* block : graph_->GetReversePostOrder()) {
heap_location_collector_.VisitBasicBlock(block);
}
diff --git a/compiler/optimizing/load_store_analysis.h b/compiler/optimizing/load_store_analysis.h
index c46a5b9cc1..ee425454a0 100644
--- a/compiler/optimizing/load_store_analysis.h
+++ b/compiler/optimizing/load_store_analysis.h
@@ -610,6 +610,7 @@ class HeapLocationCollector : public HGraphVisitor {
}
void VisitVecLoad(HVecLoad* instruction) override {
+ DCHECK(!instruction->IsPredicated());
HInstruction* array = instruction->InputAt(0);
HInstruction* index = instruction->InputAt(1);
DataType::Type type = instruction->GetPackedType();
@@ -618,6 +619,7 @@ class HeapLocationCollector : public HGraphVisitor {
}
void VisitVecStore(HVecStore* instruction) override {
+ DCHECK(!instruction->IsPredicated());
HInstruction* array = instruction->InputAt(0);
HInstruction* index = instruction->InputAt(1);
DataType::Type type = instruction->GetPackedType();
diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc
index 9cabb12a9f..58fdd1cd05 100644
--- a/compiler/optimizing/load_store_elimination.cc
+++ b/compiler/optimizing/load_store_elimination.cc
@@ -1080,10 +1080,12 @@ class LSEVisitor final : private HGraphDelegateVisitor {
}
void VisitVecLoad(HVecLoad* instruction) override {
+ DCHECK(!instruction->IsPredicated());
VisitGetLocation(instruction, heap_location_collector_.GetArrayHeapLocation(instruction));
}
void VisitVecStore(HVecStore* instruction) override {
+ DCHECK(!instruction->IsPredicated());
size_t idx = heap_location_collector_.GetArrayHeapLocation(instruction);
VisitSetLocation(instruction, idx, instruction->GetValue());
}
@@ -4041,6 +4043,13 @@ bool LoadStoreElimination::Run(bool enable_partial_lse) {
return false;
}
+ // Currently load_store analysis can't handle predicated load/stores; specifically pairs of
+ // memory operations with different predicates.
+ // TODO: support predicated SIMD.
+ if (graph_->HasPredicatedSIMD()) {
+ return false;
+ }
+
std::unique_ptr<LSEVisitorWrapper> lse_visitor(new (&allocator) LSEVisitorWrapper(
graph_, heap_location_collector, enable_partial_lse, stats_));
lse_visitor->Run();
diff --git a/compiler/optimizing/load_store_elimination_test.cc b/compiler/optimizing/load_store_elimination_test.cc
index 9fcc8ddef6..d3cf8bfa2a 100644
--- a/compiler/optimizing/load_store_elimination_test.cc
+++ b/compiler/optimizing/load_store_elimination_test.cc
@@ -573,7 +573,8 @@ TEST_F(LoadStoreEliminationTest, SameHeapValue2) {
AddVecStore(entry_block_, array_, j_);
HInstruction* vstore = AddVecStore(entry_block_, array_, i_);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vstore));
@@ -589,7 +590,8 @@ TEST_F(LoadStoreEliminationTest, SameHeapValue3) {
AddVecStore(entry_block_, array_, i_add1_);
HInstruction* vstore = AddVecStore(entry_block_, array_, i_);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vstore));
@@ -634,7 +636,8 @@ TEST_F(LoadStoreEliminationTest, OverlappingLoadStore) {
AddArraySet(entry_block_, array_, i_, c1);
HInstruction* vload5 = AddVecLoad(entry_block_, array_, i_);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_TRUE(IsRemoved(load1));
@@ -668,7 +671,8 @@ TEST_F(LoadStoreEliminationTest, StoreAfterLoopWithoutSideEffects) {
// a[j] = 1;
HInstruction* array_set = AddArraySet(return_block_, array_, j_, c1);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_TRUE(IsRemoved(array_set));
@@ -706,7 +710,8 @@ TEST_F(LoadStoreEliminationTest, StoreAfterSIMDLoopWithSideEffects) {
// a[j] = 0;
HInstruction* a_set = AddArraySet(return_block_, array_, j_, c0);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_TRUE(IsRemoved(vload));
@@ -745,7 +750,8 @@ TEST_F(LoadStoreEliminationTest, LoadAfterSIMDLoopWithSideEffects) {
// x = a[j];
HInstruction* load = AddArrayGet(return_block_, array_, j_);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_TRUE(IsRemoved(vload));
@@ -786,7 +792,8 @@ TEST_F(LoadStoreEliminationTest, MergePredecessorVecStores) {
// down: a[i,... i + 3] = [1,...1]
HInstruction* vstore4 = AddVecStore(down, array_, i_, vdata);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_TRUE(IsRemoved(vstore2));
@@ -877,7 +884,8 @@ TEST_F(LoadStoreEliminationTest, RedundantVStoreVLoadInLoop) {
HInstruction* vstore2 = AddVecStore(loop_, array_b, phi_, vload);
HInstruction* vstore3 = AddVecStore(loop_, array_a, phi_, vstore1->InputAt(2));
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vstore1));
@@ -965,7 +973,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueInLoopWithoutWriteSideEffects)
HInstruction* vload = AddVecLoad(loop_, array_a, phi_);
HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vload));
@@ -989,7 +998,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValue) {
HInstruction* vload = AddVecLoad(pre_header_, array_a, c0);
HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vload));
@@ -1066,7 +1076,8 @@ TEST_F(LoadStoreEliminationTest, VLoadAndLoadDefaultValueInLoopWithoutWriteSideE
HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload);
HInstruction* store = AddArraySet(return_block_, array_, c0, load);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vload));
@@ -1097,7 +1108,8 @@ TEST_F(LoadStoreEliminationTest, VLoadAndLoadDefaultValue) {
HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload);
HInstruction* store = AddArraySet(return_block_, array_, c0, load);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vload));
@@ -1129,7 +1141,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueAndVLoadInLoopWithoutWriteSide
HInstruction* vstore1 = AddVecStore(return_block_, array_, c0, vload1);
HInstruction* vstore2 = AddVecStore(return_block_, array_, c128, vload2);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vload1));
@@ -1160,7 +1173,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueAndVLoad) {
HInstruction* vstore1 = AddVecStore(return_block_, array_, c0, vload1);
HInstruction* vstore2 = AddVecStore(return_block_, array_, c128, vload2);
- graph_->SetHasSIMD(true);
+ // TODO: enable LSE for graphs with predicated SIMD.
+ graph_->SetHasTraditionalSIMD(true);
PerformLSE();
ASSERT_FALSE(IsRemoved(vload1));
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index aef6f1f5bd..f62a355ae4 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -453,6 +453,54 @@ static DataType::Type GetNarrowerType(HInstruction* a, HInstruction* b) {
return type;
}
+// Returns whether the loop is of a diamond structure:
+//
+// header <----------------+
+// | |
+// diamond_hif |
+// / \ |
+// diamond_true diamond_false |
+// \ / |
+// back_edge |
+// | |
+// +---------------------+
+static bool HasLoopDiamondStructure(HLoopInformation* loop_info) {
+ HBasicBlock* header = loop_info->GetHeader();
+ if (loop_info->NumberOfBackEdges() != 1 || header->GetSuccessors().size() != 2) {
+ return false;
+ }
+ HBasicBlock* header_succ_0 = header->GetSuccessors()[0];
+ HBasicBlock* header_succ_1 = header->GetSuccessors()[1];
+ HBasicBlock* diamond_top = loop_info->Contains(*header_succ_0) ?
+ header_succ_0 :
+ header_succ_1;
+ if (!diamond_top->GetLastInstruction()->IsIf()) {
+ return false;
+ }
+
+ HIf* diamond_hif = diamond_top->GetLastInstruction()->AsIf();
+ HBasicBlock* diamond_true = diamond_hif->IfTrueSuccessor();
+ HBasicBlock* diamond_false = diamond_hif->IfFalseSuccessor();
+
+ if (diamond_true->GetSuccessors().size() != 1 || diamond_false->GetSuccessors().size() != 1) {
+ return false;
+ }
+
+ HBasicBlock* back_edge = diamond_true->GetSingleSuccessor();
+ if (back_edge != diamond_false->GetSingleSuccessor() ||
+ back_edge != loop_info->GetBackEdges()[0]) {
+ return false;
+ }
+
+ DCHECK_EQ(loop_info->GetBlocks().NumSetBits(), 5u);
+ return true;
+}
+
+static bool IsPredicatedLoopControlFlowSupported(HLoopInformation* loop_info) {
+ size_t num_of_blocks = loop_info->GetBlocks().NumSetBits();
+ return num_of_blocks == 2 || HasLoopDiamondStructure(loop_info);
+}
+
//
// Public methods.
//
@@ -483,12 +531,12 @@ HLoopOptimization::HLoopOptimization(HGraph* graph,
vector_map_(nullptr),
vector_permanent_map_(nullptr),
vector_external_set_(nullptr),
+ predicate_info_map_(nullptr),
vector_mode_(kSequential),
vector_preheader_(nullptr),
vector_header_(nullptr),
vector_body_(nullptr),
vector_index_(nullptr),
- loop_main_pred_(nullptr),
arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) {
}
@@ -545,6 +593,8 @@ bool HLoopOptimization::LocalRun() {
ScopedArenaSafeMap<HInstruction*, HInstruction*> perm(
std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
ScopedArenaSet<HInstruction*> ext_set(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+ ScopedArenaSafeMap<HBasicBlock*, BlockPredicateInfo*> pred(
+ std::less<HBasicBlock*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
// Attach.
iset_ = &iset;
reductions_ = &reds;
@@ -552,6 +602,7 @@ bool HLoopOptimization::LocalRun() {
vector_map_ = &map;
vector_permanent_map_ = &perm;
vector_external_set_ = &ext_set;
+ predicate_info_map_ = &pred;
// Traverse.
const bool did_loop_opt = TraverseLoopsInnerToOuter(top_loop_);
// Detach.
@@ -561,6 +612,7 @@ bool HLoopOptimization::LocalRun() {
vector_map_ = nullptr;
vector_permanent_map_ = nullptr;
vector_external_set_ = nullptr;
+ predicate_info_map_ = nullptr;
return did_loop_opt;
}
@@ -793,6 +845,37 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) {
}
}
+// Checks whether the loop has an exit structure suitable for the InnerLoopFinite optimization:
+// - it has a single loop exit.
+// - the exit block has only a single predecessor - a block inside the loop.
+//
+// In that case returns the single exit basic block (outside the loop); otherwise nullptr.
+static HBasicBlock* GetInnerLoopFiniteSingleExit(HLoopInformation* loop_info) {
+ HBasicBlock* exit = nullptr;
+ for (HBlocksInLoopIterator block_it(*loop_info);
+ !block_it.Done();
+ block_it.Advance()) {
+ HBasicBlock* block = block_it.Current();
+
+ // Check whether one of the successors is a loop exit.
+ for (HBasicBlock* successor : block->GetSuccessors()) {
+ if (!loop_info->Contains(*successor)) {
+ if (exit != nullptr) {
+ // The loop has more than one exit.
+ return nullptr;
+ }
+ exit = successor;
+
+ // Ensure exit can only be reached by exiting loop.
+ if (successor->GetPredecessors().size() != 1) {
+ return nullptr;
+ }
+ }
+ }
+ }
+ return exit;
+}
+
bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
HBasicBlock* header = node->loop_info->GetHeader();
HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -801,33 +884,22 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
if (!induction_range_.IsFinite(node->loop_info, &trip_count)) {
return false;
}
- // Ensure there is only a single loop-body (besides the header).
- HBasicBlock* body = nullptr;
- for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) {
- if (it.Current() != header) {
- if (body != nullptr) {
- return false;
- }
- body = it.Current();
- }
- }
- CHECK(body != nullptr);
- // Ensure there is only a single exit point.
- if (header->GetSuccessors().size() != 2) {
- return false;
- }
- HBasicBlock* exit = (header->GetSuccessors()[0] == body)
- ? header->GetSuccessors()[1]
- : header->GetSuccessors()[0];
- // Ensure exit can only be reached by exiting loop.
- if (exit->GetPredecessors().size() != 1) {
+ // Check loop exits.
+ HBasicBlock* exit = GetInnerLoopFiniteSingleExit(node->loop_info);
+ if (exit == nullptr) {
return false;
}
+
+ HBasicBlock* body = (header->GetSuccessors()[0] == exit)
+ ? header->GetSuccessors()[1]
+ : header->GetSuccessors()[0];
// Detect either an empty loop (no side effects other than plain iteration) or
// a trivial loop (just iterating once). Replace subsequent index uses, if any,
// with the last value and remove the loop, possibly after unrolling its body.
HPhi* main_phi = nullptr;
- if (TrySetSimpleLoopHeader(header, &main_phi)) {
+ size_t num_of_blocks = header->GetLoopInformation()->GetBlocks().NumSetBits();
+
+ if (num_of_blocks == 2 && TrySetSimpleLoopHeader(header, &main_phi)) {
bool is_empty = IsEmptyBody(body);
if (reductions_->empty() && // TODO: possible with some effort
(is_empty || trip_count == 1) &&
@@ -850,32 +922,61 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) {
return true;
}
}
-
- bool enable_alignment_strategies = !IsInPredicatedVectorizationMode();
// Vectorize loop, if possible and valid.
if (!kEnableVectorization ||
// Disable vectorization for debuggable graphs: this is a workaround for the bug
// in 'GenerateNewLoop' which caused the SuspendCheck environment to be invalid.
// TODO: b/138601207, investigate other possible cases with wrong environment values and
// possibly switch back vectorization on for debuggable graphs.
- graph_->IsDebuggable() ||
- !TrySetSimpleLoopHeader(header, &main_phi) ||
- !CanVectorizeDataFlow(node, body, enable_alignment_strategies)) {
+ graph_->IsDebuggable()) {
return false;
}
- if (!IsVectorizationProfitable(trip_count) ||
- !TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) {
+ if (IsInPredicatedVectorizationMode()) {
+ return TryVectorizePredicated(node, body, exit, main_phi, trip_count);
+ } else {
+ return TryVectorizedTraditional(node, body, exit, main_phi, trip_count);
+ }
+}
+
+bool HLoopOptimization::TryVectorizePredicated(LoopNode* node,
+ HBasicBlock* body,
+ HBasicBlock* exit,
+ HPhi* main_phi,
+ int64_t trip_count) {
+ if (!IsPredicatedLoopControlFlowSupported(node->loop_info) ||
+ !ShouldVectorizeCommon(node, main_phi, trip_count)) {
return false;
}
- if (IsInPredicatedVectorizationMode()) {
- VectorizePredicated(node, body, exit);
- } else {
- VectorizeTraditional(node, body, exit, trip_count);
+ // Currently we can only generate cleanup loops for loops with 2 basic blocks.
+ //
+ // TODO: Support array disambiguation tests for CF loops.
+ if (NeedsArrayRefsDisambiguationTest() &&
+ node->loop_info->GetBlocks().NumSetBits() != 2) {
+ return false;
+ }
+
+ VectorizePredicated(node, body, exit);
+ MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized);
+ graph_->SetHasPredicatedSIMD(true); // flag SIMD usage
+ return true;
+}
+
+bool HLoopOptimization::TryVectorizedTraditional(LoopNode* node,
+ HBasicBlock* body,
+ HBasicBlock* exit,
+ HPhi* main_phi,
+ int64_t trip_count) {
+ HBasicBlock* header = node->loop_info->GetHeader();
+ size_t num_of_blocks = header->GetLoopInformation()->GetBlocks().NumSetBits();
+
+ if (num_of_blocks != 2 || !ShouldVectorizeCommon(node, main_phi, trip_count)) {
+ return false;
}
- graph_->SetHasSIMD(true); // flag SIMD usage
+ VectorizeTraditional(node, body, exit, trip_count);
MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized);
+ graph_->SetHasTraditionalSIMD(true); // flag SIMD usage
return true;
}
@@ -1023,8 +1124,9 @@ bool HLoopOptimization::TryLoopScalarOpts(LoopNode* node) {
// Intel Press, June, 2004 (http://www.aartbik.com/).
//
+
bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node,
- HBasicBlock* block,
+ HBasicBlock* header,
bool collect_alignment_info) {
// Reset vector bookkeeping.
vector_length_ = 0;
@@ -1034,16 +1136,30 @@ bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node,
vector_runtime_test_a_ =
vector_runtime_test_b_ = nullptr;
- // Phis in the loop-body prevent vectorization.
- if (!block->GetPhis().IsEmpty()) {
- return false;
- }
+ // Traverse the data flow of the loop, in the original program order.
+ for (HBlocksInLoopReversePostOrderIterator block_it(*header->GetLoopInformation());
+ !block_it.Done();
+ block_it.Advance()) {
+ HBasicBlock* block = block_it.Current();
- // Scan the loop-body, starting a right-hand-side tree traversal at each left-hand-side
- // occurrence, which allows passing down attributes down the use tree.
- for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
- if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) {
- return false; // failure to vectorize a left-hand-side
+ if (block == header) {
+ // The header is of a certain structure (TrySetSimpleLoopHeader) and doesn't need to be
+ // processed here.
+ continue;
+ }
+
+ // Phis in the loop-body prevent vectorization.
+ // TODO: Enable vectorization of CF loops with Phis.
+ if (!block->GetPhis().IsEmpty()) {
+ return false;
+ }
+
+ // Scan the loop-body instructions, starting a right-hand-side tree traversal at each
+ // left-hand-side occurrence, which allows passing attributes down the use tree.
+ for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+ if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) {
+ return false; // failure to vectorize a left-hand-side
+ }
}
}
@@ -1139,6 +1255,23 @@ bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node,
return true;
}
+bool HLoopOptimization::ShouldVectorizeCommon(LoopNode* node,
+ HPhi* main_phi,
+ int64_t trip_count) {
+ HBasicBlock* header = node->loop_info->GetHeader();
+ HBasicBlock* preheader = node->loop_info->GetPreHeader();
+
+ bool enable_alignment_strategies = !IsInPredicatedVectorizationMode();
+ if (!TrySetSimpleLoopHeader(header, &main_phi) ||
+ !CanVectorizeDataFlow(node, header, enable_alignment_strategies) ||
+ !IsVectorizationProfitable(trip_count) ||
+ !TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) {
+ return false;
+ }
+
+ return true;
+}
+
void HLoopOptimization::VectorizePredicated(LoopNode* node,
HBasicBlock* block,
HBasicBlock* exit) {
@@ -1185,7 +1318,6 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node,
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit);
vector_mode_ = kVector;
GenerateNewLoopPredicated(node,
- block,
preheader_for_vector_loop,
vector_index_,
vtc,
@@ -1200,7 +1332,6 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node,
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit);
// Use "Traditional" version for the sequential loop.
GenerateNewLoopScalarOrTraditional(node,
- block,
preheader_for_cleanup_loop,
vector_index_,
stc,
@@ -1208,7 +1339,7 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node,
LoopAnalysisInfo::kNoUnrollingFactor);
}
- FinalizeVectorization(node, block);
+ FinalizeVectorization(node);
// Assign governing predicates for the predicated instructions inserted during vectorization
// outside the loop.
@@ -1339,7 +1470,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node,
HBasicBlock* preheader_for_peeling_loop =
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit);
GenerateNewLoopScalarOrTraditional(node,
- block,
preheader_for_peeling_loop,
vector_index_,
ptc,
@@ -1354,7 +1484,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node,
HBasicBlock* preheader_for_vector_loop =
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit);
GenerateNewLoopScalarOrTraditional(node,
- block,
preheader_for_vector_loop,
vector_index_,
vtc,
@@ -1369,7 +1498,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node,
HBasicBlock* preheader_for_cleanup_loop =
graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit);
GenerateNewLoopScalarOrTraditional(node,
- block,
preheader_for_cleanup_loop,
vector_index_,
stc,
@@ -1377,10 +1505,10 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node,
LoopAnalysisInfo::kNoUnrollingFactor);
}
- FinalizeVectorization(node, block);
+ FinalizeVectorization(node);
}
-void HLoopOptimization::FinalizeVectorization(LoopNode* node, HBasicBlock* block) {
+void HLoopOptimization::FinalizeVectorization(LoopNode* node) {
HBasicBlock* header = node->loop_info->GetHeader();
HBasicBlock* preheader = node->loop_info->GetPreHeader();
HLoopInformation* vloop = vector_header_->GetLoopInformation();
@@ -1397,9 +1525,16 @@ void HLoopOptimization::FinalizeVectorization(LoopNode* node, HBasicBlock* block
}
}
- // Remove the original loop by disconnecting the body block
- // and removing all instructions from the header.
- block->DisconnectAndDelete();
+ // Remove the original loop.
+ for (HBlocksInLoopPostOrderIterator it_loop(*node->loop_info);
+ !it_loop.Done();
+ it_loop.Advance()) {
+ HBasicBlock* cur_block = it_loop.Current();
+ if (cur_block == node->loop_info->GetHeader()) {
+ continue;
+ }
+ cur_block->DisconnectAndDelete();
+ }
while (!header->GetFirstInstruction()->IsGoto()) {
header->RemoveInstruction(header->GetFirstInstruction());
@@ -1426,12 +1561,12 @@ HPhi* HLoopOptimization::InitializeForNewLoop(HBasicBlock* new_preheader, HInstr
vector_index_ = phi;
vector_permanent_map_->clear();
vector_external_set_->clear();
+ predicate_info_map_->clear();
return phi;
}
void HLoopOptimization::GenerateNewLoopScalarOrTraditional(LoopNode* node,
- HBasicBlock* body,
HBasicBlock* new_preheader,
HInstruction* lo,
HInstruction* hi,
@@ -1447,14 +1582,13 @@ void HLoopOptimization::GenerateNewLoopScalarOrTraditional(LoopNode* node,
vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
for (uint32_t u = 0; u < unroll; u++) {
- GenerateNewLoopBodyOnce(node, body, induc_type, step);
+ GenerateNewLoopBodyOnce(node, induc_type, step);
}
FinalizePhisForNewLoop(phi, lo);
}
void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node,
- HBasicBlock* body,
HBasicBlock* new_preheader,
HInstruction* lo,
HInstruction* hi,
@@ -1475,44 +1609,50 @@ void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node,
0u);
HInstruction* cond =
- new (global_allocator_) HVecPredCondition(global_allocator_,
+ new (global_allocator_) HVecPredToBoolean(global_allocator_,
pred_while,
- HVecPredCondition::PCondKind::kNFirst,
+ HVecPredToBoolean::PCondKind::kNFirst,
DataType::Type::kInt32,
vector_length_,
0u);
vector_header_->AddInstruction(pred_while);
vector_header_->AddInstruction(cond);
- loop_main_pred_ = pred_while;
-
-
vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
- GenerateNewLoopBodyOnce(node, body, induc_type, step);
+ PreparePredicateInfoMap(node);
+ GenerateNewLoopBodyOnce(node, induc_type, step);
+ InitPredicateInfoMap(node, pred_while);
- // Assign governing predicates for instructions in the loop.
- for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) {
- auto i = vector_map_->find(it.Current());
- if (i != vector_map_->end()) {
- HInstruction* instr = i->second;
+ // Assign governing predicates for instructions in the loop; the traversal order doesn't matter.
+ for (HBlocksInLoopIterator block_it(*node->loop_info);
+ !block_it.Done();
+ block_it.Advance()) {
+ HBasicBlock* cur_block = block_it.Current();
- if (!instr->IsVecOperation()) {
- continue;
- }
- // There are cases when a vector instruction, which corresponds to some instruction in the
- // original scalar loop, is located not in the newly created vector loop but
- // in the vector loop preheader (and hence recorded in vector_external_set_).
- //
- // Governing predicates will be set for such instructions separately.
- bool in_vector_loop = vector_header_->GetLoopInformation()->Contains(*instr->GetBlock());
- DCHECK_IMPLIES(!in_vector_loop,
- vector_external_set_->find(instr) != vector_external_set_->end());
-
- if (in_vector_loop &&
- !instr->AsVecOperation()->IsPredicated()) {
- HVecOperation* op = instr->AsVecOperation();
- op->SetMergingGoverningPredicate(loop_main_pred_);
+ for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) {
+ auto i = vector_map_->find(it.Current());
+ if (i != vector_map_->end()) {
+ HInstruction* instr = i->second;
+
+ if (!instr->IsVecOperation()) {
+ continue;
+ }
+ // There are cases when a vector instruction, which corresponds to some instruction in the
+ // original scalar loop, is located not in the newly created vector loop but
+ // in the vector loop preheader (and hence recorded in vector_external_set_).
+ //
+ // Governing predicates will be set for such instructions separately.
+ bool in_vector_loop = vector_header_->GetLoopInformation()->Contains(*instr->GetBlock());
+ DCHECK_IMPLIES(!in_vector_loop,
+ vector_external_set_->find(instr) != vector_external_set_->end());
+
+ if (in_vector_loop &&
+ !instr->AsVecOperation()->IsPredicated()) {
+ HVecOperation* op = instr->AsVecOperation();
+ HVecPredSetOperation* pred = predicate_info_map_->Get(cur_block)->GetControlPredicate();
+ op->SetMergingGoverningPredicate(pred);
+ }
}
}
}
@@ -1521,24 +1661,47 @@ void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node,
}
void HLoopOptimization::GenerateNewLoopBodyOnce(LoopNode* node,
- HBasicBlock* body,
DataType::Type induc_type,
HInstruction* step) {
// Generate instruction map.
vector_map_->clear();
- for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) {
- bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
- DCHECK(vectorized_def);
+ HLoopInformation* loop_info = node->loop_info;
+
+ // Traverse the data flow of the loop, in the original program order.
+ for (HBlocksInLoopReversePostOrderIterator block_it(*loop_info);
+ !block_it.Done();
+ block_it.Advance()) {
+ HBasicBlock* cur_block = block_it.Current();
+
+ if (cur_block == loop_info->GetHeader()) {
+ continue;
+ }
+
+ for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) {
+ bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
+ DCHECK(vectorized_def);
+ }
}
- // Generate body from the instruction map, but in original program order.
+
+ // Generate body from the instruction map, in the original program order.
HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
- for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) {
- auto i = vector_map_->find(it.Current());
- if (i != vector_map_->end() && !i->second->IsInBlock()) {
- Insert(vector_body_, i->second);
- // Deal with instructions that need an environment, such as the scalar intrinsics.
- if (i->second->NeedsEnvironment()) {
- i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+ for (HBlocksInLoopReversePostOrderIterator block_it(*loop_info);
+ !block_it.Done();
+ block_it.Advance()) {
+ HBasicBlock* cur_block = block_it.Current();
+
+ if (cur_block == loop_info->GetHeader()) {
+ continue;
+ }
+
+ for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) {
+ auto i = vector_map_->find(it.Current());
+ if (i != vector_map_->end() && !i->second->IsInBlock()) {
+ Insert(vector_body_, i->second);
+ // Deal with instructions that need an environment, such as the scalar intrinsics.
+ if (i->second->NeedsEnvironment()) {
+ i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+ }
}
}
}
@@ -1626,6 +1789,10 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node,
if (instruction->IsGoto()) {
return true;
}
+
+ if (instruction->IsIf()) {
+ return VectorizeIfCondition(node, instruction, generate_code, restrictions);
+ }
// Otherwise accept only expressions with no effects outside the immediate loop-body.
// Note that actual uses are inspected during right-hand-side tree traversal.
return !IsUsedOutsideLoop(node->loop_info, instruction)
@@ -1845,6 +2012,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case InstructionSet::kThumb2:
// Allow vectorization for all ARM devices, because Android assumes that
// ARM 32-bit always supports advanced SIMD (64-bit SIMD).
+ *restrictions |= kNoIfCond;
switch (type) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
@@ -1870,6 +2038,13 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u);
switch (type) {
case DataType::Type::kBool:
+ *restrictions |= kNoDiv |
+ kNoSignedHAdd |
+ kNoUnsignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD |
+ kNoIfCond;
+ return TrySetVectorLength(type, vector_length);
case DataType::Type::kUint8:
case DataType::Type::kInt8:
*restrictions |= kNoDiv |
@@ -1892,13 +2067,13 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
*restrictions |= kNoDiv | kNoSAD;
return TrySetVectorLength(type, vector_length);
case DataType::Type::kInt64:
- *restrictions |= kNoDiv | kNoSAD;
+ *restrictions |= kNoDiv | kNoSAD | kNoIfCond;
return TrySetVectorLength(type, vector_length);
case DataType::Type::kFloat32:
- *restrictions |= kNoReduction;
+ *restrictions |= kNoReduction | kNoIfCond;
return TrySetVectorLength(type, vector_length);
case DataType::Type::kFloat64:
- *restrictions |= kNoReduction;
+ *restrictions |= kNoReduction | kNoIfCond;
return TrySetVectorLength(type, vector_length);
default:
break;
@@ -1907,6 +2082,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
} else {
// Allow vectorization for all ARM devices, because Android assumes that
// ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
+ *restrictions |= kNoIfCond;
switch (type) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
@@ -1937,6 +2113,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict
case InstructionSet::kX86:
case InstructionSet::kX86_64:
// Allow vectorization for SSE4.1-enabled X86 devices only (128-bit SIMD).
+ *restrictions |= kNoIfCond;
if (features->AsX86InstructionSetFeatures()->HasSSE4_1()) {
switch (type) {
case DataType::Type::kBool:
@@ -2203,10 +2380,10 @@ HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruct
} \
break;
-void HLoopOptimization::GenerateVecOp(HInstruction* org,
- HInstruction* opa,
- HInstruction* opb,
- DataType::Type type) {
+HInstruction* HLoopOptimization::GenerateVecOp(HInstruction* org,
+ HInstruction* opa,
+ HInstruction* opb,
+ DataType::Type type) {
uint32_t dex_pc = org->GetDexPc();
HInstruction* vector = nullptr;
DataType::Type org_type = org->GetType();
@@ -2276,11 +2453,23 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org,
GENERATE_VEC(
new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc),
new (global_allocator_) HAbs(org_type, opa, dex_pc));
+ case HInstruction::kEqual: {
+ // Special case.
+ if (vector_mode_ == kVector) {
+ vector = new (global_allocator_) HVecCondition(
+ global_allocator_, opa, opb, type, vector_length_, dex_pc);
+ } else {
+ DCHECK(vector_mode_ == kSequential);
+ UNREACHABLE();
+ }
+ }
+ break;
default:
break;
} // switch
CHECK(vector != nullptr) << "Unsupported SIMD operator";
vector_map_->Put(org, vector);
+ return vector;
}
#undef GENERATE_VEC
@@ -2520,6 +2709,89 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node,
return false;
}
+bool HLoopOptimization::VectorizeIfCondition(LoopNode* node,
+ HInstruction* hif,
+ bool generate_code,
+ uint64_t restrictions) {
+ DCHECK(hif->IsIf());
+ HInstruction* if_input = hif->InputAt(0);
+
+ if (!if_input->HasOnlyOneNonEnvironmentUse()) {
+ // Avoid the complications of the condition used as materialized boolean.
+ return false;
+ }
+
+ if (!if_input->IsEqual()) {
+ // TODO: Support other condition types.
+ return false;
+ }
+
+ HCondition* cond = if_input->AsCondition();
+ HInstruction* opa = cond->InputAt(0);
+ HInstruction* opb = cond->InputAt(1);
+ DataType::Type type = GetNarrowerType(opa, opb);
+
+ if (!DataType::IsIntegralType(type)) {
+ return false;
+ }
+
+ bool is_unsigned = false;
+ HInstruction* opa_promoted = opa;
+ HInstruction* opb_promoted = opb;
+ bool is_int_case = DataType::Type::kInt32 == opa->GetType() &&
+                     DataType::Type::kInt32 == opb->GetType();
+
+ // Condition arguments should be either both int32 or consistently extended signed/unsigned
+ // narrower operands.
+ if (!is_int_case &&
+ !IsNarrowerOperands(opa, opb, type, &opa_promoted, &opb_promoted, &is_unsigned)) {
+ return false;
+ }
+ type = HVecOperation::ToProperType(type, is_unsigned);
+
+ // For narrow types, explicit type conversion may have been
+  // optimized away, so set the no hi bits restriction here.
+ if (DataType::Size(type) <= 2) {
+ restrictions |= kNoHiBits;
+ }
+
+ if (!TrySetVectorType(type, &restrictions) ||
+ HasVectorRestrictions(restrictions, kNoIfCond)) {
+ return false;
+ }
+
+ if (generate_code && vector_mode_ != kVector) { // de-idiom
+ opa_promoted = opa;
+ opb_promoted = opb;
+ }
+
+ if (VectorizeUse(node, opa_promoted, generate_code, type, restrictions) &&
+ VectorizeUse(node, opb_promoted, generate_code, type, restrictions)) {
+ if (generate_code) {
+ HInstruction* vec_cond = GenerateVecOp(cond,
+ vector_map_->Get(opa_promoted),
+ vector_map_->Get(opb_promoted),
+ type);
+
+ if (vector_mode_ == kVector) {
+ HInstruction* vec_pred_not = new (global_allocator_) HVecPredNot(
+ global_allocator_, vec_cond, type, vector_length_, hif->GetDexPc());
+
+ vector_map_->Put(hif, vec_pred_not);
+ BlockPredicateInfo* pred_info = predicate_info_map_->Get(hif->GetBlock());
+ pred_info->SetControlFlowInfo(vec_cond->AsVecPredSetOperation(),
+ vec_pred_not->AsVecPredSetOperation());
+ } else {
+ DCHECK(vector_mode_ == kSequential);
+ UNREACHABLE();
+ }
+ }
+ return true;
+ }
+
+ return false;
+}
+
//
// Vectorization heuristics.
//
@@ -2834,4 +3106,67 @@ bool HLoopOptimization::CanRemoveCycle() {
return true;
}
+void HLoopOptimization::PreparePredicateInfoMap(LoopNode* node) {
+ HLoopInformation* loop_info = node->loop_info;
+
+ DCHECK(IsPredicatedLoopControlFlowSupported(loop_info));
+
+ for (HBlocksInLoopIterator block_it(*loop_info);
+ !block_it.Done();
+ block_it.Advance()) {
+ HBasicBlock* cur_block = block_it.Current();
+ BlockPredicateInfo* pred_info = new (loop_allocator_) BlockPredicateInfo();
+
+ predicate_info_map_->Put(cur_block, pred_info);
+ }
+}
+
+void HLoopOptimization::InitPredicateInfoMap(LoopNode* node,
+ HVecPredSetOperation* loop_main_pred) {
+ HLoopInformation* loop_info = node->loop_info;
+ HBasicBlock* header = loop_info->GetHeader();
+ BlockPredicateInfo* header_info = predicate_info_map_->Get(header);
+ // Loop header is a special case; it doesn't have a false predicate because we
+ // would just exit the loop then.
+ header_info->SetControlFlowInfo(loop_main_pred, loop_main_pred);
+
+ size_t blocks_in_loop = header->GetLoopInformation()->GetBlocks().NumSetBits();
+ if (blocks_in_loop == 2) {
+ for (HBasicBlock* successor : header->GetSuccessors()) {
+ if (loop_info->Contains(*successor)) {
+ // This is loop second block - body.
+ BlockPredicateInfo* body_info = predicate_info_map_->Get(successor);
+ body_info->SetControlPredicate(loop_main_pred);
+ return;
+ }
+ }
+ UNREACHABLE();
+ }
+
+ // TODO: support predicated vectorization of CF loop of more complex structure.
+ DCHECK(HasLoopDiamondStructure(loop_info));
+ HBasicBlock* header_succ_0 = header->GetSuccessors()[0];
+ HBasicBlock* header_succ_1 = header->GetSuccessors()[1];
+ HBasicBlock* diamond_top = loop_info->Contains(*header_succ_0) ?
+ header_succ_0 :
+ header_succ_1;
+
+ HIf* diamond_hif = diamond_top->GetLastInstruction()->AsIf();
+ HBasicBlock* diamond_true = diamond_hif->IfTrueSuccessor();
+ HBasicBlock* diamond_false = diamond_hif->IfFalseSuccessor();
+ HBasicBlock* back_edge = diamond_true->GetSingleSuccessor();
+
+ BlockPredicateInfo* diamond_top_info = predicate_info_map_->Get(diamond_top);
+ BlockPredicateInfo* diamond_true_info = predicate_info_map_->Get(diamond_true);
+ BlockPredicateInfo* diamond_false_info = predicate_info_map_->Get(diamond_false);
+ BlockPredicateInfo* back_edge_info = predicate_info_map_->Get(back_edge);
+
+ diamond_top_info->SetControlPredicate(header_info->GetTruePredicate());
+
+ diamond_true_info->SetControlPredicate(diamond_top_info->GetTruePredicate());
+ diamond_false_info->SetControlPredicate(diamond_top_info->GetFalsePredicate());
+
+ back_edge_info->SetControlPredicate(header_info->GetTruePredicate());
+}
+
} // namespace art
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 3da8f8fe39..86a9f0fcb8 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -101,6 +101,7 @@ class HLoopOptimization : public HOptimization {
kNoSAD = 1 << 11, // no sum of absolute differences (SAD)
kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening
kNoDotProd = 1 << 13, // no dot product
+ kNoIfCond = 1 << 14, // no if condition conversion
};
/*
@@ -136,6 +137,95 @@ class HLoopOptimization : public HOptimization {
bool is_string_char_at; // compressed string read
};
+ // This structure describes the control flow (CF) -> data flow (DF) conversion of the loop
+ // with control flow (see below) for the purpose of predicated autovectorization.
+ //
+  // Let's define "loops without control-flow" (or non-CF loops) as loops with two consecutive
+ // blocks and without the branching structure except for the loop exit. And
+ // "loop with control-flow" (or CF-loops) - all other loops.
+ //
+ // In the execution of the original CF-loop on each iteration some basic block Y will be
+ // either executed or not executed, depending on the control flow of the loop. More
+ // specifically, a block will be executed if all the conditional branches of the nodes in
+ // the control dependency graph for that block Y are taken according to the path from the loop
+ // header to that basic block.
+ //
+ // This is the key idea of CF->DF conversion: a boolean value
+ // 'ctrl_pred == cond1 && cond2 && ...' will determine whether the basic block Y will be
+ // executed, where cond_K is whether the branch of the node K in the control dependency
+ // graph upward traversal was taken in the 'right' direction.
+ //
+ // Def.: BB Y is control dependent on BB X iff
+ // (1) there exists a directed path P from X to Y with any basic block Z in P (excluding X
+ // and Y) post-dominated by Y and
+ // (2) X is not post-dominated by Y.
+ // ...
+ // X
+ // false / \ true
+ // / \
+ // ...
+ // |
+ // Y
+ // ...
+ //
+ // When doing predicated autovectorization of a CF loop, we use the CF->DF conversion approach:
+ // 1) do the data analysis and vector operation creation as if it was a non-CF loop.
+ // 2) for each HIf block create two vector predicate setting instructions - for True and False
+ // edges/paths.
+ // 3) assign a governing vector predicate (see comments near HVecPredSetOperation)
+ // to each vector operation Alpha in the loop (including to those vector predicate setting
+ // instructions created in #2); do this by:
+ // - finding the immediate control dependent block of the instruction Alpha's block.
+ // - choosing the True or False predicate setting instruction (created in #2) depending
+ // on the path to the instruction.
+ //
+ // For more information check the papers:
+ //
+ // - Allen, John R and Kennedy, Ken and Porterfield, Carrie and Warren, Joe,
+ // “Conversion of Control Dependence to Data Dependence,” in Proceedings of the 10th ACM
+ // SIGACT-SIGPLAN Symposium on Principles of Programming Languages, 1983, pp. 177–189.
+ // - JEANNE FERRANTE, KARL J. OTTENSTEIN, JOE D. WARREN,
+ // "The Program Dependence Graph and Its Use in Optimization"
+ //
+ class BlockPredicateInfo : public ArenaObject<kArenaAllocLoopOptimization> {
+ public:
+ BlockPredicateInfo() :
+ control_predicate_(nullptr),
+ true_predicate_(nullptr),
+ false_predicate_(nullptr) {}
+
+ void SetControlFlowInfo(HVecPredSetOperation* true_predicate,
+ HVecPredSetOperation* false_predicate) {
+ DCHECK(!HasControlFlowOps());
+ true_predicate_ = true_predicate;
+ false_predicate_ = false_predicate;
+ }
+
+ bool HasControlFlowOps() const {
+ // Note: a block must have both T/F predicates set or none of them.
+ DCHECK_EQ(true_predicate_ == nullptr, false_predicate_ == nullptr);
+ return true_predicate_ != nullptr;
+ }
+
+ HVecPredSetOperation* GetControlPredicate() const { return control_predicate_; }
+ void SetControlPredicate(HVecPredSetOperation* control_predicate) {
+ control_predicate_ = control_predicate;
+ }
+
+ HVecPredSetOperation* GetTruePredicate() const { return true_predicate_; }
+ HVecPredSetOperation* GetFalsePredicate() const { return false_predicate_; }
+
+ private:
+ // Vector control predicate operation, associated with the block which will determine
+ // the active lanes for all vector operations, originated from this block.
+ HVecPredSetOperation* control_predicate_;
+
+  // Vector predicate instruction, associated with the true successor of the block.
+ HVecPredSetOperation* true_predicate_;
+  // Vector predicate instruction, associated with the false successor of the block.
+ HVecPredSetOperation* false_predicate_;
+ };
+
//
// Loop setup and traversal.
//
@@ -208,10 +298,12 @@ class HLoopOptimization : public HOptimization {
// - checks whether instructions are vectorizable for the target.
// - conducts data dependence analysis for array references.
// - additionally, collects info on peeling and aligment strategy.
- bool CanVectorizeDataFlow(LoopNode* node, HBasicBlock* block, bool collect_alignment_info);
+ bool CanVectorizeDataFlow(LoopNode* node, HBasicBlock* header, bool collect_alignment_info);
+ // Does the checks (common for predicated and traditional mode) for the loop.
+ bool ShouldVectorizeCommon(LoopNode* node, HPhi* main_phi, int64_t trip_count);
- // Vectorizes the loop for which all checks have been already done.
+ // Try to vectorize the loop, returns whether it was successful.
//
// There are two versions/algorithms:
// - Predicated: all the vector operations have governing predicates which control
@@ -220,6 +312,19 @@ class HLoopOptimization : public HOptimization {
// - Traditional: a regular mode in which all vector operations lanes are unconditionally
// active.
// Example: vectoriation using AArch64 NEON.
+ bool TryVectorizePredicated(LoopNode* node,
+ HBasicBlock* body,
+ HBasicBlock* exit,
+ HPhi* main_phi,
+ int64_t trip_count);
+
+ bool TryVectorizedTraditional(LoopNode* node,
+ HBasicBlock* body,
+ HBasicBlock* exit,
+ HPhi* main_phi,
+ int64_t trip_count);
+
+ // Vectorizes the loop for which all checks have been already done.
void VectorizePredicated(LoopNode* node,
HBasicBlock* block,
HBasicBlock* exit);
@@ -230,14 +335,13 @@ class HLoopOptimization : public HOptimization {
// Performs final steps for whole vectorization process: links reduction, removes the original
// scalar loop, updates loop info.
- void FinalizeVectorization(LoopNode* node, HBasicBlock* block);
+ void FinalizeVectorization(LoopNode* node);
// Helpers that do the vector instruction synthesis for the previously created loop; create
// and fill the loop body with instructions.
//
// A version to generate a vector loop in predicated mode.
void GenerateNewLoopPredicated(LoopNode* node,
- HBasicBlock* block,
HBasicBlock* new_preheader,
HInstruction* lo,
HInstruction* hi,
@@ -246,7 +350,6 @@ class HLoopOptimization : public HOptimization {
// A version to generate a vector loop in traditional mode or to generate
// a scalar loop for both modes.
void GenerateNewLoopScalarOrTraditional(LoopNode* node,
- HBasicBlock* block,
HBasicBlock* new_preheader,
HInstruction* lo,
HInstruction* hi,
@@ -264,9 +367,15 @@ class HLoopOptimization : public HOptimization {
// Finalizes reduction and induction phis' inputs for the newly created loop.
void FinalizePhisForNewLoop(HPhi* phi, HInstruction* lo);
+ // Creates empty predicate info object for each basic block and puts it into the map.
+ void PreparePredicateInfoMap(LoopNode* node);
+
+ // Set up block true/false predicates using info, collected through data flow and control
+ // dependency analysis.
+ void InitPredicateInfoMap(LoopNode* node, HVecPredSetOperation* loop_main_pred);
+
// Performs instruction synthesis for the loop body.
void GenerateNewLoopBodyOnce(LoopNode* node,
- HBasicBlock* body,
DataType::Type induc_type,
HInstruction* step);
@@ -300,10 +409,10 @@ class HLoopOptimization : public HOptimization {
void GenerateVecReductionPhi(HPhi* phi);
void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
- void GenerateVecOp(HInstruction* org,
- HInstruction* opa,
- HInstruction* opb,
- DataType::Type type);
+ HInstruction* GenerateVecOp(HInstruction* org,
+ HInstruction* opa,
+ HInstruction* opb,
+ DataType::Type type);
// Vectorization idioms.
bool VectorizeSaturationIdiom(LoopNode* node,
@@ -326,6 +435,10 @@ class HLoopOptimization : public HOptimization {
bool generate_code,
DataType::Type type,
uint64_t restrictions);
+ bool VectorizeIfCondition(LoopNode* node,
+ HInstruction* instruction,
+ bool generate_code,
+ uint64_t restrictions);
// Vectorization heuristics.
Alignment ComputeAlignment(HInstruction* offset,
@@ -435,13 +548,17 @@ class HLoopOptimization : public HOptimization {
// for loop reductions).
ScopedArenaSet<HInstruction*>* vector_external_set_;
+ // A mapping between a basic block of the original loop and its associated PredicateInfo.
+ //
+ // Only used in predicated loop vectorization mode.
+ ScopedArenaSafeMap<HBasicBlock*, BlockPredicateInfo*>* predicate_info_map_;
+
// Temporary vectorization bookkeeping.
VectorMode vector_mode_; // synthesis mode
HBasicBlock* vector_preheader_; // preheader of the new loop
HBasicBlock* vector_header_; // header of the new loop
HBasicBlock* vector_body_; // body of the new loop
HInstruction* vector_index_; // normalized index of the new loop
- HInstruction* loop_main_pred_; // Loop main predicate - for predicated mode.
// Helper for target-specific behaviour for loop optimizations.
ArchNoOptsLoopHelper* arch_loop_helper_;
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 83b58763a4..9caa766858 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -2797,8 +2797,11 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) {
if (HasMonitorOperations()) {
outer_graph->SetHasMonitorOperations(true);
}
- if (HasSIMD()) {
- outer_graph->SetHasSIMD(true);
+ if (HasTraditionalSIMD()) {
+ outer_graph->SetHasTraditionalSIMD(true);
+ }
+ if (HasPredicatedSIMD()) {
+ outer_graph->SetHasPredicatedSIMD(true);
}
if (HasAlwaysThrowingInvokes()) {
outer_graph->SetHasAlwaysThrowingInvokes(true);
@@ -3125,6 +3128,8 @@ void HGraph::TransformLoopHeaderForBCE(HBasicBlock* header) {
new_pre_header, old_pre_header, /* replace_if_back_edge= */ false);
}
+// Creates a new two-basic-block loop and inserts it between original loop header and
+// original loop exit; also adjusts dominators, post order and new LoopInformation.
HBasicBlock* HGraph::TransformLoopForVectorization(HBasicBlock* header,
HBasicBlock* body,
HBasicBlock* exit) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 9643e4c789..dec86e30dd 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -403,7 +403,8 @@ class HGraph : public ArenaObject<kArenaAllocGraph> {
has_bounds_checks_(false),
has_try_catch_(false),
has_monitor_operations_(false),
- has_simd_(false),
+ has_traditional_simd_(false),
+ has_predicated_simd_(false),
has_loops_(false),
has_irreducible_loops_(false),
has_direct_critical_native_call_(false),
@@ -708,8 +709,13 @@ class HGraph : public ArenaObject<kArenaAllocGraph> {
bool HasMonitorOperations() const { return has_monitor_operations_; }
void SetHasMonitorOperations(bool value) { has_monitor_operations_ = value; }
- bool HasSIMD() const { return has_simd_; }
- void SetHasSIMD(bool value) { has_simd_ = value; }
+ bool HasTraditionalSIMD() { return has_traditional_simd_; }
+ void SetHasTraditionalSIMD(bool value) { has_traditional_simd_ = value; }
+
+ bool HasPredicatedSIMD() { return has_predicated_simd_; }
+ void SetHasPredicatedSIMD(bool value) { has_predicated_simd_ = value; }
+
+ bool HasSIMD() const { return has_traditional_simd_ || has_predicated_simd_; }
bool HasLoops() const { return has_loops_; }
void SetHasLoops(bool value) { has_loops_ = value; }
@@ -822,10 +828,11 @@ class HGraph : public ArenaObject<kArenaAllocGraph> {
// DexRegisterMap to be present to allow deadlock analysis for non-debuggable code.
bool has_monitor_operations_;
- // Flag whether SIMD instructions appear in the graph. If true, the
- // code generators may have to be more careful spilling the wider
+ // Flags whether SIMD (traditional or predicated) instructions appear in the graph.
+ // If either is true, the code generators may have to be more careful spilling the wider
// contents of SIMD registers.
- bool has_simd_;
+ bool has_traditional_simd_;
+ bool has_predicated_simd_;
// Flag whether there are any loops in the graph. We can skip loop
// optimization if it's false.
@@ -1636,7 +1643,9 @@ class HLoopInformationOutwardIterator : public ValueObject {
M(VecStore, VecMemoryOperation) \
M(VecPredSetAll, VecPredSetOperation) \
M(VecPredWhile, VecPredSetOperation) \
- M(VecPredCondition, VecOperation) \
+ M(VecPredToBoolean, VecOperation) \
+ M(VecCondition, VecPredSetOperation) \
+ M(VecPredNot, VecPredSetOperation) \
#define FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M) \
FOR_EACH_CONCRETE_INSTRUCTION_SCALAR_COMMON(M) \
@@ -8634,7 +8643,7 @@ class CloneAndReplaceInstructionVisitor final : public HGraphDelegateVisitor {
DISALLOW_COPY_AND_ASSIGN(CloneAndReplaceInstructionVisitor);
};
-// Iterator over the blocks that art part of the loop. Includes blocks part
+// Iterator over the blocks that are part of the loop; includes blocks which are part
// of an inner loop. The order in which the blocks are iterated is on their
// block id.
class HBlocksInLoopIterator : public ValueObject {
@@ -8667,7 +8676,7 @@ class HBlocksInLoopIterator : public ValueObject {
DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopIterator);
};
-// Iterator over the blocks that art part of the loop. Includes blocks part
+// Iterator over the blocks that are part of the loop; includes blocks which are part
// of an inner loop. The order in which the blocks are iterated is reverse
// post order.
class HBlocksInLoopReversePostOrderIterator : public ValueObject {
@@ -8700,6 +8709,39 @@ class HBlocksInLoopReversePostOrderIterator : public ValueObject {
DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopReversePostOrderIterator);
};
+// Iterator over the blocks that are part of the loop; includes blocks which are part
+// of an inner loop. The order in which the blocks are iterated is post order.
+class HBlocksInLoopPostOrderIterator : public ValueObject {
+ public:
+ explicit HBlocksInLoopPostOrderIterator(const HLoopInformation& info)
+ : blocks_in_loop_(info.GetBlocks()),
+ blocks_(info.GetHeader()->GetGraph()->GetReversePostOrder()),
+ index_(blocks_.size() - 1) {
+ if (!blocks_in_loop_.IsBitSet(blocks_[index_]->GetBlockId())) {
+ Advance();
+ }
+ }
+
+ bool Done() const { return index_ < 0; }
+ HBasicBlock* Current() const { return blocks_[index_]; }
+ void Advance() {
+ --index_;
+ for (; index_ >= 0; --index_) {
+ if (blocks_in_loop_.IsBitSet(blocks_[index_]->GetBlockId())) {
+ break;
+ }
+ }
+ }
+
+ private:
+ const BitVector& blocks_in_loop_;
+ const ArenaVector<HBasicBlock*>& blocks_;
+
+ int32_t index_;
+
+ DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopPostOrderIterator);
+};
+
// Returns int64_t value of a properly typed constant.
inline int64_t Int64FromConstant(HConstant* constant) {
if (constant->IsIntConstant()) {
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 73f6c40a0d..6a60d6be01 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -1384,8 +1384,8 @@ class HVecPredWhile final : public HVecPredSetOperation {
static constexpr size_t kCondKind = HVecOperation::kNumberOfVectorOpPackedBits;
static constexpr size_t kCondKindSize =
MinimumBitsToStore(static_cast<size_t>(CondKind::kLast));
- static constexpr size_t kNumberOfVecPredConditionPackedBits = kCondKind + kCondKindSize;
- static_assert(kNumberOfVecPredConditionPackedBits <= kMaxNumberOfPackedBits,
+ static constexpr size_t kNumberOfVecPredWhilePackedBits = kCondKind + kCondKindSize;
+ static_assert(kNumberOfVecPredWhilePackedBits <= kMaxNumberOfPackedBits,
"Too many packed fields.");
using CondKindField = BitField<CondKind, kCondKind, kCondKindSize>;
@@ -1395,13 +1395,13 @@ class HVecPredWhile final : public HVecPredSetOperation {
// Evaluates the predicate condition (PCondKind) for a vector predicate; outputs
// a scalar boolean value result.
//
-// Note: as VecPredCondition can be also predicated, only active elements (determined by the
+// Note: as VecPredToBoolean can be also predicated, only active elements (determined by the
// instruction's governing predicate) of the input vector predicate are used for condition
// evaluation.
//
// Note: this instruction is currently used as a workaround for the fact that IR instructions
// can't have more than one output.
-class HVecPredCondition final : public HVecOperation {
+class HVecPredToBoolean final : public HVecOperation {
public:
// To get more info on the condition kinds please see "2.2 Process state, PSTATE" section of
// "ARM Architecture Reference Manual Supplement. The Scalable Vector Extension (SVE),
@@ -1418,13 +1418,13 @@ class HVecPredCondition final : public HVecOperation {
kEnumLast = kPLast
};
- HVecPredCondition(ArenaAllocator* allocator,
+ HVecPredToBoolean(ArenaAllocator* allocator,
HInstruction* input,
PCondKind pred_cond,
DataType::Type packed_type,
size_t vector_length,
uint32_t dex_pc)
- : HVecOperation(kVecPredCondition,
+ : HVecOperation(kVecPredToBoolean,
allocator,
packed_type,
SideEffects::None(),
@@ -1447,19 +1447,86 @@ class HVecPredCondition final : public HVecOperation {
return GetPackedField<CondKindField>();
}
- DECLARE_INSTRUCTION(VecPredCondition);
+ DECLARE_INSTRUCTION(VecPredToBoolean);
protected:
// Additional packed bits.
static constexpr size_t kCondKind = HVecOperation::kNumberOfVectorOpPackedBits;
static constexpr size_t kCondKindSize =
MinimumBitsToStore(static_cast<size_t>(PCondKind::kEnumLast));
- static constexpr size_t kNumberOfVecPredConditionPackedBits = kCondKind + kCondKindSize;
- static_assert(kNumberOfVecPredConditionPackedBits <= kMaxNumberOfPackedBits,
+ static constexpr size_t kNumberOfVecPredToBooleanPackedBits = kCondKind + kCondKindSize;
+ static_assert(kNumberOfVecPredToBooleanPackedBits <= kMaxNumberOfPackedBits,
"Too many packed fields.");
using CondKindField = BitField<PCondKind, kCondKind, kCondKindSize>;
- DEFAULT_COPY_CONSTRUCTOR(VecPredCondition);
+ DEFAULT_COPY_CONSTRUCTOR(VecPredToBoolean);
+};
+
+// Evaluates condition for pairwise elements in two input vectors and sets the result
+// as an output predicate vector.
+//
+// viz. [ p1, .. , pn ] = [ x1 OP y1 , x2 OP y2, .. , xn OP yn] where OP is CondKind
+// condition.
+//
+// Currently only kEqual is supported by this vector instruction - we don't even define
+// the kCondType here.
+// TODO: support other condition ops.
+class HVecCondition final : public HVecPredSetOperation {
+ public:
+ HVecCondition(ArenaAllocator* allocator,
+ HInstruction* left,
+ HInstruction* right,
+ DataType::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc) :
+ HVecPredSetOperation(kVecCondition,
+ allocator,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs= */ 2,
+ vector_length,
+ dex_pc) {
+ DCHECK(left->IsVecOperation());
+ DCHECK(!left->IsVecPredSetOperation());
+ DCHECK(right->IsVecOperation());
+ DCHECK(!right->IsVecPredSetOperation());
+ SetRawInputAt(0, left);
+ SetRawInputAt(1, right);
+ }
+
+ DECLARE_INSTRUCTION(VecCondition);
+
+ protected:
+ DEFAULT_COPY_CONSTRUCTOR(VecCondition);
+};
+
+// Inverts every component in the predicate vector.
+//
+// viz. [ p1, .. , pn ] = [ !px1 , !px2 , .. , !pxn ].
+class HVecPredNot final : public HVecPredSetOperation {
+ public:
+ HVecPredNot(ArenaAllocator* allocator,
+ HInstruction* input,
+ DataType::Type packed_type,
+ size_t vector_length,
+ uint32_t dex_pc) :
+ HVecPredSetOperation(kVecPredNot,
+ allocator,
+ packed_type,
+ SideEffects::None(),
+ /* number_of_inputs= */ 1,
+ vector_length,
+ dex_pc) {
+ DCHECK(input->IsVecOperation());
+ DCHECK(input->IsVecPredSetOperation());
+
+ SetRawInputAt(0, input);
+ }
+
+ DECLARE_INSTRUCTION(VecPredNot);
+
+ protected:
+ DEFAULT_COPY_CONSTRUCTOR(VecPredNot);
};
} // namespace art
diff --git a/test/530-checker-lse-simd/src/Main.java b/test/530-checker-lse-simd/src/Main.java
index 619ac281b3..ec2faf51ec 100644
--- a/test/530-checker-lse-simd/src/Main.java
+++ b/test/530-checker-lse-simd/src/Main.java
@@ -38,10 +38,17 @@ public class Main {
/// CHECK-NEXT: Sub
/// CHECK-NEXT: Mul
/// CHECK-NEXT: ArraySet
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NEXT: ArrayGet
+ //
+ /// CHECK-FI:
/// CHECK-NEXT: LessThanOrEqual
/// CHECK-NEXT: Select
/// CHECK-NEXT: Add
/// CHECK-NEXT: Goto loop:{{B\d+}}
+ //
+ // TODO: reenable LSE for graphs with Predicated SIMD.
static double $noinline$vecgen(double a[], double b[], int n) {
double norma = 0.0;
int init = 1325;
@@ -87,10 +94,17 @@ public class Main {
/// CHECK-NEXT: ArrayGet
/// CHECK-NEXT: Mul
/// CHECK-NEXT: ArraySet
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NEXT: ArrayGet
+ //
+ /// CHECK-FI:
/// CHECK-NEXT: ArrayLength
/// CHECK-NEXT: BelowOrEqual
//
/// CHECK: Return
+ //
+ // TODO: reenable LSE for graphs with Predicated SIMD.
static double $noinline$test02(double a[], int n) {
double b[] = new double[n];
a[0] = a[0] / 2;
@@ -120,7 +134,13 @@ public class Main {
/// CHECK-NEXT: Return
/// CHECK-START: double Main.$noinline$test03(int) load_store_elimination (after)
- /// CHECK-NOT: ArrayGet loop:none
+ /// CHECK-IF: not hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: ArrayGet loop:none
+ //
+ /// CHECK-FI:
+ //
+ // TODO: reenable LSE for graphs with Predicated SIMD.
static double $noinline$test03(int n) {
double a[] = new double[n];
double b[] = new double[n];
@@ -164,7 +184,14 @@ public class Main {
/// CHECK: Add
/// CHECK: Goto loop:{{B\d+}}
//
- /// CHECK-NOT: VecStore
+
+ /// CHECK-IF: not hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecStore
+ //
+ /// CHECK-FI:
+ //
+ // TODO: reenable LSE for graphs with Predicated SIMD.
static double[] $noinline$test04(int n) {
double a[] = new double[n];
double b[] = new double[n];
@@ -194,18 +221,14 @@ public class Main {
/// CHECK: Goto loop:{{B\d+}}
/// CHECK-START-ARM64: double[] Main.$noinline$test05(int) load_store_elimination (after)
- /// CHECK-IF: not hasIsaFeature("sve")
- //
- // In NEON case there is a post-loop which prevents the store to be removed.
- /// CHECK: VecStore
- //
- /// CHECK-FI:
- //
+ /// CHECK: VecStore
/// CHECK: VecStore
/// CHECK: Add
/// CHECK: Goto loop:{{B\d+}}
//
/// CHECK-NOT: VecStore
+ //
+ // TODO: reenable LSE for graphs with Predicated SIMD.
static double[] $noinline$test05(int n) {
double a[] = new double[n];
double b[] = new double[n];
@@ -249,7 +272,13 @@ public class Main {
/// CHECK: VecAdd
/// CHECK: VecStore
//
- /// CHECK-NOT: VecStore
+ /// CHECK-IF: not hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecStore
+ //
+ /// CHECK-FI:
+ //
+ // TODO: reenable LSE for graphs with Predicated SIMD.
static double[] $noinline$test06(int n) {
double a[] = new double[n];
double b[] = new double[n];
diff --git a/test/661-checker-simd-cf-loops/Android.bp b/test/661-checker-simd-cf-loops/Android.bp
new file mode 100644
index 0000000000..8d2c193c02
--- /dev/null
+++ b/test/661-checker-simd-cf-loops/Android.bp
@@ -0,0 +1,43 @@
+// Generated by `regen-test-files`. Do not edit manually.
+
+// Build rules for ART run-test `661-checker-simd-cf-loops`.
+
+package {
+ // See: http://go/android-license-faq
+ // A large-scale-change added 'default_applicable_licenses' to import
+ // all of the 'license_kinds' from "art_license"
+ // to get the below license kinds:
+ // SPDX-license-identifier-Apache-2.0
+ default_applicable_licenses: ["art_license"],
+}
+
+// Test's Dex code.
+java_test {
+ name: "art-run-test-661-checker-simd-cf-loops",
+ defaults: ["art-run-test-defaults"],
+ test_config_template: ":art-run-test-target-template",
+ srcs: ["src/**/*.java"],
+ data: [
+ ":art-run-test-661-checker-simd-cf-loops-expected-stdout",
+ ":art-run-test-661-checker-simd-cf-loops-expected-stderr",
+ ],
+ // Include the Java source files in the test's artifacts, to make Checker assertions
+ // available to the TradeFed test runner.
+ include_srcs: true,
+}
+
+// Test's expected standard output.
+genrule {
+ name: "art-run-test-661-checker-simd-cf-loops-expected-stdout",
+ out: ["art-run-test-661-checker-simd-cf-loops-expected-stdout.txt"],
+ srcs: ["expected-stdout.txt"],
+ cmd: "cp -f $(in) $(out)",
+}
+
+// Test's expected standard error.
+genrule {
+ name: "art-run-test-661-checker-simd-cf-loops-expected-stderr",
+ out: ["art-run-test-661-checker-simd-cf-loops-expected-stderr.txt"],
+ srcs: ["expected-stderr.txt"],
+ cmd: "cp -f $(in) $(out)",
+}
diff --git a/test/661-checker-simd-cf-loops/expected-stderr.txt b/test/661-checker-simd-cf-loops/expected-stderr.txt
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/test/661-checker-simd-cf-loops/expected-stderr.txt
diff --git a/test/661-checker-simd-cf-loops/expected-stdout.txt b/test/661-checker-simd-cf-loops/expected-stdout.txt
new file mode 100644
index 0000000000..b0aad4deb5
--- /dev/null
+++ b/test/661-checker-simd-cf-loops/expected-stdout.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/661-checker-simd-cf-loops/info.txt b/test/661-checker-simd-cf-loops/info.txt
new file mode 100644
index 0000000000..bc7881d16a
--- /dev/null
+++ b/test/661-checker-simd-cf-loops/info.txt
@@ -0,0 +1 @@
+Functional tests on vectorization of loops with control flow.
diff --git a/test/661-checker-simd-cf-loops/src/Main.java b/test/661-checker-simd-cf-loops/src/Main.java
new file mode 100644
index 0000000000..95c09490bb
--- /dev/null
+++ b/test/661-checker-simd-cf-loops/src/Main.java
@@ -0,0 +1,746 @@
+/*
+ * Copyright (C) 2023 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for autovectorization of loops with control flow.
+ */
+public class Main {
+
+ public static final int ARRAY_LENGTH = 128;
+ public static final int USED_ARRAY_LENGTH = ARRAY_LENGTH - 1;
+
+ public static boolean[] booleanArray = new boolean[ARRAY_LENGTH];
+ public static boolean[] booleanArray2 = new boolean[ARRAY_LENGTH];
+ public static byte[] byteArray = new byte[ARRAY_LENGTH];
+ public static short[] shortArray = new short[ARRAY_LENGTH];
+ public static char[] charArray = new char[ARRAY_LENGTH];
+ public static int[] intArray = new int[ARRAY_LENGTH];
+ public static long[] longArray = new long[ARRAY_LENGTH];
+ public static float[] floatArray = new float[ARRAY_LENGTH];
+ public static double[] doubleArray = new double[ARRAY_LENGTH];
+
+ public static final int MAGIC_VALUE_A = 2;
+ public static final int MAGIC_VALUE_B = 10;
+ public static final int MAGIC_VALUE_C = 100;
+
+ public static final int MAGIC_ADD_CONST = 99;
+
+ public static final float MAGIC_FLOAT_VALUE_A = 2.0f;
+ public static final float MAGIC_FLOAT_VALUE_B = 10.0f;
+ public static final float MAGIC_FLOAT_VALUE_C = 100.0f;
+
+ public static final float MAGIC_FLOAT_ADD_CONST = 99.0f;
+
+ /// CHECK-START-ARM64: int Main.$compile$noinline$FullDiamond(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-DAG: <<C0:i\d+>> IntConstant 0 loop:none
+ /// CHECK-DAG: <<C4:i\d+>> IntConstant 4 loop:none
+ /// CHECK-DAG: <<C99:i\d+>> IntConstant 99 loop:none
+ /// CHECK-DAG: <<C100:i\d+>> IntConstant 100 loop:none
+ /// CHECK-DAG: <<Vec4:d\d+>> VecReplicateScalar [<<C4>>,{{j\d+}}] loop:none
+ /// CHECK-DAG: <<Vec99:d\d+>> VecReplicateScalar [<<C99>>,{{j\d+}}] loop:none
+ /// CHECK-DAG: <<Vec100:d\d+>> VecReplicateScalar [<<C100>>,{{j\d+}}] loop:none
+ //
+ /// CHECK-DAG: <<Phi:i\d+>> Phi [<<C0>>,{{i\d+}}] loop:<<Loop:B\d+>> outer_loop:none
+ /// CHECK-DAG: <<LoopP:j\d+>> VecPredWhile [<<Phi>>,{{i\d+}}] loop:<<Loop>> outer_loop:none
+ //
+ /// CHECK-DAG: <<Load1:d\d+>> VecLoad [<<Arr:l\d+>>,<<Phi>>,<<LoopP>>] loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: <<Cond:j\d+>> VecCondition [<<Load1>>,<<Vec100>>,<<LoopP>>] loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: <<CondR:j\d+>> VecPredNot [<<Cond>>,<<LoopP>>] loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: <<AddT:d\d+>> VecAdd [<<Load1>>,<<Vec99>>,<<CondR>>] loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: <<StT:d\d+>> VecStore [<<Arr>>,<<Phi>>,<<AddT>>,<<CondR>>] loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: <<StF:d\d+>> VecStore [<<Arr>>,<<Phi>>,{{d\d+}},<<Cond>>] loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: <<Ld2:d\d+>> VecLoad [<<Arr>>,<<Phi>>,<<LoopP>>] loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: <<Add2:d\d+>> VecAdd [<<Ld2>>,<<Vec4>>,<<LoopP>>] loop:<<Loop>> outer_loop:none
+ /// CHECK-DAG: <<St21:d\d+>> VecStore [<<Arr>>,<<Phi>>,<<Add2>>,<<LoopP>>] loop:<<Loop>> outer_loop:none
+ //
+ /// CHECK-ELSE:
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ public static int $compile$noinline$FullDiamond(int[] x) {
+ int i = 0;
+ for (; i < USED_ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ } else {
+ x[i] += 3;
+ }
+ x[i] += 4;
+ }
+ return i;
+ }
+
+ //
+ // Test various types.
+ //
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleBoolean(boolean[], boolean[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // TODO: Support extra condition types and boolean comparisons.
+ public static void $compile$noinline$SimpleBoolean(boolean[] x, boolean[] y) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ boolean val = x[i];
+ if (val != y[i]) {
+ x[i] |= y[i];
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleByte(byte[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-DAG: VecLoad
+ //
+ /// CHECK-FI:
+ public static void $compile$noinline$SimpleByte(byte[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ byte val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleUByte(byte[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-DAG: VecLoad
+ //
+ /// CHECK-FI:
+ public static void $compile$noinline$SimpleUByte(byte[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ if ((x[i] & 0xFF) != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleShort(short[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-DAG: VecLoad
+ //
+ /// CHECK-FI:
+ public static void $compile$noinline$SimpleShort(short[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ short val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleChar(char[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-DAG: VecLoad
+ //
+ /// CHECK-FI:
+ public static void $compile$noinline$SimpleChar(char[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ char val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleInt(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-DAG: VecLoad
+ //
+ /// CHECK-FI:
+ public static void $compile$noinline$SimpleInt(int[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleLong(long[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // TODO: Support long comparisons.
+ public static void $compile$noinline$SimpleLong(long[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ long val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleFloat(float[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // TODO: Support FP comparisons.
+ public static void $compile$noinline$SimpleFloat(float[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ float val = x[i];
+ if (val > 10.0f) {
+ x[i] += 99.1f;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleDouble(double[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // TODO: Support FP comparisons.
+ public static void $compile$noinline$SimpleDouble(double[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ double val = x[i];
+ if (val != 10.0) {
+ x[i] += 99.1;
+ }
+ }
+ }
+
+ //
+ // Narrowing types.
+ //
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$ByteConv(byte[], byte[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-DAG: VecLoad
+ //
+ /// CHECK-FI:
+ public static void $compile$noinline$ByteConv(byte[] x, byte[] y) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ byte val = (byte)(x[i] + 1);
+ if (val != y[i]) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$UByteAndWrongConst(byte[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+  // The 'NarrowerOperands' requirement is not met: the constant is not a ubyte value.
+ public static void $compile$noinline$UByteAndWrongConst(byte[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ if ((x[i] & 0xFF) != (MAGIC_VALUE_C | 0x100)) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$ByteNoHiBits(byte[], byte[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+  // Check the kNoHiBits case, where "wider" operations cannot bring in higher-order bits.
+ public static void $compile$noinline$ByteNoHiBits(byte[] x, byte[] y) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ byte val = x[i];
+ if ((val >>> 3) != y[i]) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ //
+ // Test condition types.
+ //
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleBelow(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // TODO: Support other conditions.
+ public static void $compile$noinline$SimpleBelow(int[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val < MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+
+ //
+ // Test vectorization idioms.
+ //
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$Select(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // TODO: vectorize loops with select in the body.
+ public static void $compile$noinline$Select(int[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ val += MAGIC_ADD_CONST;
+ }
+ x[i] = val;
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$Phi(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // TODO: vectorize loops with phis in the body.
+ public static void $compile$noinline$Phi(int[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ val += MAGIC_ADD_CONST;
+ x[i] += val;
+ }
+ x[i] += val;
+ }
+ }
+
+ // TODO: when Phis are supported, test dotprod and sad idioms.
+
+ /// CHECK-START-ARM64: int Main.$compile$noinline$Reduction(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // TODO: vectorize loops with phis and reductions in the body.
+ private static int $compile$noinline$Reduction(int[] x) {
+ int sum = 0;
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ sum += val + x[i];
+ }
+ }
+ return sum;
+ }
+
+ /// CHECK-START-ARM64: int Main.$compile$noinline$ReductionBackEdge(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-DAG: VecLoad
+ //
+ /// CHECK-FI:
+ //
+  // Reduction in the back-edge block; the reduction itself is not control-flow-dependent.
+ public static int $compile$noinline$ReductionBackEdge(int[] x) {
+ int sum = 0;
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ sum += x[i];
+ }
+ return sum;
+ }
+
+ //
+ // Negative compile tests.
+ //
+
+ public static final int STENCIL_ARRAY_SIZE = 130;
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$stencilAlike(int[], int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+  // This loop needs a runtime test for array reference disambiguation and a scalar cleanup loop.
+  // Currently we can't generate a scalar cleanup loop with control flow.
+ private static void $compile$noinline$stencilAlike(int[] a, int[] b) {
+ for (int i = 1; i < STENCIL_ARRAY_SIZE - 1; i++) {
+ int val0 = b[i - 1];
+ int val1 = b[i];
+ int val2 = b[i + 1];
+ int un = a[i];
+ if (val1 != MAGIC_VALUE_C) {
+ a[i] = val0 + val1 + val2;
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$NotDiamondCf(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ //
+ // Loops with complex CF are not supported.
+ public static void $compile$noinline$NotDiamondCf(int[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ if (val != 1234) {
+ x[i] += MAGIC_ADD_CONST;
+ }
+ }
+ }
+ }
+
+ /// CHECK-START-ARM64: void Main.$compile$noinline$BrokenInduction(int[]) loop_optimization (after)
+ /// CHECK-IF: hasIsaFeature("sve")
+ //
+ /// CHECK-NOT: VecLoad
+ //
+ /// CHECK-FI:
+ public static void $compile$noinline$BrokenInduction(int[] x) {
+ for (int i = 0; i < USED_ARRAY_LENGTH; i++) {
+ int val = x[i];
+ if (val != MAGIC_VALUE_C) {
+ x[i] += MAGIC_ADD_CONST;
+ i++;
+ }
+ }
+ }
+
+ //
+ // Main driver.
+ //
+
+ public static void main(String[] args) {
+ initIntArray(intArray);
+ int final_ind_value = $compile$noinline$FullDiamond(intArray);
+ expectIntEquals(23755, IntArraySum(intArray));
+ expectIntEquals(USED_ARRAY_LENGTH, final_ind_value);
+
+ // Types.
+ initBooleanArray(booleanArray);
+ booleanArray2[12] = true;
+ $compile$noinline$SimpleBoolean(booleanArray, booleanArray2);
+ expectIntEquals(86, BooleanArraySum(booleanArray));
+
+ initByteArray(byteArray);
+ $compile$noinline$SimpleByte(byteArray);
+ expectIntEquals(-64, ByteArraySum(byteArray));
+
+ initByteArray(byteArray);
+ $compile$noinline$SimpleUByte(byteArray);
+ expectIntEquals(-64, ByteArraySum(byteArray));
+
+ initShortArray(shortArray);
+ $compile$noinline$SimpleShort(shortArray);
+ expectIntEquals(23121, ShortArraySum(shortArray));
+
+ initCharArray(charArray);
+ $compile$noinline$SimpleChar(charArray);
+ expectIntEquals(23121, CharArraySum(charArray));
+
+ initIntArray(intArray);
+ $compile$noinline$SimpleInt(intArray);
+ expectIntEquals(23121, IntArraySum(intArray));
+
+ initLongArray(longArray);
+ $compile$noinline$SimpleLong(longArray);
+ expectLongEquals(23121, LongArraySum(longArray));
+
+ initFloatArray(floatArray);
+ $compile$noinline$SimpleFloat(floatArray);
+ expectFloatEquals(18868.2f, FloatArraySum(floatArray));
+
+ initDoubleArray(doubleArray);
+ $compile$noinline$SimpleDouble(doubleArray);
+ expectDoubleEquals(23129.5, DoubleArraySum(doubleArray));
+
+ // Narrowing types.
+ initByteArray(byteArray);
+ $compile$noinline$ByteConv(byteArray, byteArray);
+ expectIntEquals(-2, ByteArraySum(byteArray));
+
+ initByteArray(byteArray);
+ $compile$noinline$UByteAndWrongConst(byteArray);
+ expectIntEquals(-2, ByteArraySum(byteArray));
+
+ initByteArray(byteArray);
+ $compile$noinline$ByteNoHiBits(byteArray, byteArray);
+ expectIntEquals(-2, ByteArraySum(byteArray));
+
+ // Conditions.
+ initIntArray(intArray);
+ $compile$noinline$SimpleBelow(intArray);
+ expectIntEquals(23121, IntArraySum(intArray));
+
+ // Idioms.
+ initIntArray(intArray);
+ $compile$noinline$Select(intArray);
+ expectIntEquals(23121, IntArraySum(intArray));
+
+ initIntArray(intArray);
+ $compile$noinline$Phi(intArray);
+ expectIntEquals(36748, IntArraySum(intArray));
+
+ int reduction_result = 0;
+
+ initIntArray(intArray);
+ reduction_result = $compile$noinline$Reduction(intArray);
+ expectIntEquals(14706, IntArraySum(intArray));
+ expectIntEquals(21012, reduction_result);
+
+ initIntArray(intArray);
+ reduction_result = $compile$noinline$ReductionBackEdge(intArray);
+ expectIntEquals(23121, IntArraySum(intArray));
+ expectIntEquals(13121, reduction_result);
+
+ int[] stencilArrayA = new int[STENCIL_ARRAY_SIZE];
+ int[] stencilArrayB = new int[STENCIL_ARRAY_SIZE];
+ initIntArray(stencilArrayA);
+ initIntArray(stencilArrayB);
+ $compile$noinline$stencilAlike(stencilArrayA, stencilArrayB);
+ expectIntEquals(43602, IntArraySum(stencilArrayA));
+
+ initIntArray(intArray);
+ $compile$noinline$NotDiamondCf(intArray);
+ expectIntEquals(23121, IntArraySum(intArray));
+
+ initIntArray(intArray);
+ $compile$noinline$BrokenInduction(intArray);
+ expectIntEquals(18963, IntArraySum(intArray));
+
+ System.out.println("passed");
+ }
+
+ public static void initBooleanArray(boolean[] a) {
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ if (i % 3 != 0) {
+ a[i] = true;
+ }
+ }
+ }
+
+ public static void initByteArray(byte[] a) {
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ if (i % 3 == 0) {
+ a[i] = (byte)MAGIC_VALUE_A;
+ } else if (i % 3 == 1) {
+ a[i] = (byte)MAGIC_VALUE_B;
+ } else {
+ a[i] = (byte)MAGIC_VALUE_C;
+ }
+ }
+ a[USED_ARRAY_LENGTH] = 127;
+ }
+
+ public static void initShortArray(short[] a) {
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ if (i % 3 == 0) {
+ a[i] = (short)MAGIC_VALUE_A;
+ } else if (i % 3 == 1) {
+ a[i] = (short)MAGIC_VALUE_B;
+ } else {
+ a[i] = (short)MAGIC_VALUE_C;
+ }
+ }
+ a[USED_ARRAY_LENGTH] = 10000;
+ }
+
+ public static void initCharArray(char[] a) {
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ if (i % 3 == 0) {
+ a[i] = (char)MAGIC_VALUE_A;
+ } else if (i % 3 == 1) {
+ a[i] = (char)MAGIC_VALUE_B;
+ } else {
+ a[i] = (char)MAGIC_VALUE_C;
+ }
+ }
+ a[USED_ARRAY_LENGTH] = 10000;
+ }
+
+ public static void initIntArray(int[] a) {
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ if (i % 3 == 0) {
+ a[i] = MAGIC_VALUE_A;
+ } else if (i % 3 == 1) {
+ a[i] = MAGIC_VALUE_B;
+ } else {
+ a[i] = MAGIC_VALUE_C;
+ }
+ }
+ a[USED_ARRAY_LENGTH] = 10000;
+ }
+
+ public static void initLongArray(long[] a) {
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ if (i % 3 == 0) {
+ a[i] = MAGIC_VALUE_A;
+ } else if (i % 3 == 1) {
+ a[i] = MAGIC_VALUE_B;
+ } else {
+ a[i] = MAGIC_VALUE_C;
+ }
+ }
+ a[USED_ARRAY_LENGTH] = 10000;
+ }
+
+ public static void initFloatArray(float[] a) {
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ if (i % 3 == 0) {
+ a[i] = MAGIC_FLOAT_VALUE_A;
+ } else if (i % 3 == 1) {
+ a[i] = MAGIC_FLOAT_VALUE_B;
+ } else {
+ a[i] = MAGIC_FLOAT_VALUE_C;
+ }
+ }
+ a[USED_ARRAY_LENGTH] = 10000.0f;
+ }
+
+ public static void initDoubleArray(double[] a) {
+ for (int i = 0; i < ARRAY_LENGTH; i++) {
+ if (i % 3 == 0) {
+ a[i] = MAGIC_FLOAT_VALUE_A;
+ } else if (i % 3 == 1) {
+ a[i] = MAGIC_FLOAT_VALUE_B;
+ } else {
+ a[i] = MAGIC_FLOAT_VALUE_C;
+ }
+ }
+ a[USED_ARRAY_LENGTH] = 10000.0f;
+ }
+
+ public static byte BooleanArraySum(boolean[] a) {
+ byte sum = 0;
+ for (int i = 0; i < a.length; i++) {
+ sum += a[i] ? 1 : 0;
+ }
+ return sum;
+ }
+
+ public static byte ByteArraySum(byte[] a) {
+ byte sum = 0;
+ for (int i = 0; i < a.length; i++) {
+ sum += a[i];
+ }
+ return sum;
+ }
+
+ public static short ShortArraySum(short[] a) {
+ short sum = 0;
+ for (int i = 0; i < a.length; i++) {
+ sum += a[i];
+ }
+ return sum;
+ }
+
+ public static char CharArraySum(char[] a) {
+ char sum = 0;
+ for (int i = 0; i < a.length; i++) {
+ sum += a[i];
+ }
+ return sum;
+ }
+
+ public static int IntArraySum(int[] a) {
+ int sum = 0;
+ for (int i = 0; i < a.length; i++) {
+ sum += a[i];
+ }
+ return sum;
+ }
+
+ public static long LongArraySum(long[] a) {
+ long sum = 0;
+ for (int i = 0; i < a.length; i++) {
+ sum += a[i];
+ }
+ return sum;
+ }
+
+ public static float FloatArraySum(float[] a) {
+ float sum = 0.0f;
+ for (int i = 0; i < a.length; i++) {
+ sum += a[i];
+ }
+ return sum;
+ }
+
+ public static double DoubleArraySum(double[] a) {
+ double sum = 0.0;
+ for (int i = 0; i < a.length; i++) {
+ sum += a[i];
+ }
+ return sum;
+ }
+
+ private static void expectIntEquals(int expected, int result) {
+ if (expected != result) {
+ throw new Error("Expected: " + expected + ", found: " + result);
+ }
+ }
+
+ private static void expectLongEquals(long expected, long result) {
+ if (expected != result) {
+ throw new Error("Expected: " + expected + ", found: " + result);
+ }
+ }
+
+ private static void expectFloatEquals(float expected, float result) {
+ final float THRESHOLD = .1f;
+ if (Math.abs(expected - result) >= THRESHOLD) {
+ throw new Error("Expected: " + expected + ", found: " + result);
+ }
+ }
+
+ private static void expectDoubleEquals(double expected, double result) {
+ final double THRESHOLD = .1;
+ if (Math.abs(expected - result) >= THRESHOLD) {
+ throw new Error("Expected: " + expected + ", found: " + result);
+ }
+ }
+}