diff options
26 files changed, 1831 insertions, 216 deletions
diff --git a/TEST_MAPPING b/TEST_MAPPING index 38ff5eda55..d4ea0d8428 100644 --- a/TEST_MAPPING +++ b/TEST_MAPPING @@ -1181,6 +1181,9 @@ "name": "art-run-test-660-checker-simd-sad[com.google.android.art.apex]" }, { + "name": "art-run-test-661-checker-simd-cf-loops[com.google.android.art.apex]" + }, + { "name": "art-run-test-661-checker-simd-reduc[com.google.android.art.apex]" }, { @@ -2600,6 +2603,9 @@ "name": "art-run-test-660-checker-simd-sad" }, { + "name": "art-run-test-661-checker-simd-cf-loops" + }, + { "name": "art-run-test-661-checker-simd-reduc" }, { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 13734d7fb8..bf25418a2f 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -554,12 +554,31 @@ class InstructionCodeGeneratorARM64Sve : public InstructionCodeGeneratorARM64 { // register size (full SIMD register is used). void ValidateVectorLength(HVecOperation* instr) const; - // Returns default predicate register which is used as governing vector predicate - // to implement predicated loop execution. + vixl::aarch64::PRegister GetVecGoverningPReg(HVecOperation* instr) { + return GetVecPredSetFixedOutPReg(instr->GetGoverningPredicate()); + } + + // Returns a fixed p-reg for predicate setting instruction. // - // TODO: This is a hack to be addressed when register allocator supports SIMD types. - static vixl::aarch64::PRegister LoopPReg() { - return vixl::aarch64::p0; + // Currently we only support diamond CF loops for predicated vectorization; also we don't have + // register allocator support for vector predicates. Thus we use fixed P-regs for loop main, + // True and False predicates as a temporary solution. + // + // TODO: Support SIMD types and registers in ART. 
+ static vixl::aarch64::PRegister GetVecPredSetFixedOutPReg(HVecPredSetOperation* instr) { + if (instr->IsVecPredWhile() || instr->IsVecPredSetAll()) { + // VecPredWhile and VecPredSetAll live ranges never overlap due to the current vectorization + // scheme: the former only is live inside a vectorized loop and the later is never in a + // loop and never spans across loops. + return vixl::aarch64::p0; + } else if (instr->IsVecPredNot()) { + // This relies on the fact that we only use PredNot manually in the autovectorizer, + // so there is only one of them in each loop. + return vixl::aarch64::p1; + } else { + DCHECK(instr->IsVecCondition()); + return vixl::aarch64::p2; + } } }; diff --git a/compiler/optimizing/code_generator_riscv64.cc b/compiler/optimizing/code_generator_riscv64.cc index c7cd26c83f..df40337f51 100644 --- a/compiler/optimizing/code_generator_riscv64.cc +++ b/compiler/optimizing/code_generator_riscv64.cc @@ -2017,12 +2017,32 @@ void InstructionCodeGeneratorRISCV64::VisitVecPredWhile(HVecPredWhile* instructi LOG(FATAL) << "Unimplemented"; } -void LocationsBuilderRISCV64::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderRISCV64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { UNUSED(instruction); LOG(FATAL) << "Unimplemented"; } -void InstructionCodeGeneratorRISCV64::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorRISCV64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + UNUSED(instruction); + LOG(FATAL) << "Unimplemented"; +} + +void LocationsBuilderRISCV64::VisitVecCondition(HVecCondition* instruction) { + UNUSED(instruction); + LOG(FATAL) << "Unimplemented"; +} + +void InstructionCodeGeneratorRISCV64::VisitVecCondition(HVecCondition* instruction) { + UNUSED(instruction); + LOG(FATAL) << "Unimplemented"; +} + +void LocationsBuilderRISCV64::VisitVecPredNot(HVecPredNot* instruction) { + UNUSED(instruction); + LOG(FATAL) << "Unimplemented"; +} + +void 
InstructionCodeGeneratorRISCV64::VisitVecPredNot(HVecPredNot* instruction) { UNUSED(instruction); LOG(FATAL) << "Unimplemented"; } diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc index ce02bfa21a..848b5e7567 100644 --- a/compiler/optimizing/code_generator_vector_arm64_neon.cc +++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc @@ -1531,12 +1531,32 @@ void InstructionCodeGeneratorARM64Neon::VisitVecPredWhile(HVecPredWhile* instruc UNREACHABLE(); } -void LocationsBuilderARM64Neon::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderARM64Neon::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } -void InstructionCodeGeneratorARM64Neon::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderARM64Neon::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorARM64Neon::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderARM64Neon::VisitVecPredNot(HVecPredNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorARM64Neon::VisitVecPredNot(HVecPredNot* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc index 4c16c3eb38..ef79932899 100644 --- a/compiler/optimizing/code_generator_vector_arm64_sve.cc +++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc @@ -245,7 +245,7 
@@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const VRegister dst = DRegisterFrom(locations->Out()); - const PRegister p_reg = LoopPReg(); + const PRegister p_reg = GetVecGoverningPReg(instruction); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: @@ -283,7 +283,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); DataType::Type from = instruction->GetInputType(); DataType::Type to = instruction->GetResultType(); ValidateVectorLength(instruction); @@ -303,7 +303,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: @@ -341,7 +341,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt8: @@ -377,7 +377,7 @@ void 
InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: // special case boolean-not @@ -437,7 +437,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: @@ -496,7 +496,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: @@ -545,7 +545,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: @@ -584,7 +584,7 @@ void 
InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); // Note: VIXL guarantees StrictNaNPropagation for Fdiv. @@ -632,7 +632,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: @@ -677,7 +677,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: @@ -713,7 +713,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: @@ -768,7 +768,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* 
instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -801,7 +801,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -834,7 +834,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -965,7 +965,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( const ZRegister acc = ZRegisterFrom(locations->InAt(0)); const ZRegister left = ZRegisterFrom(locations->InAt(1)); const ZRegister right = ZRegisterFrom(locations->InAt(2)); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); DCHECK(locations->InAt(0).Equals(locations->Out())); 
ValidateVectorLength(instruction); @@ -1028,7 +1028,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) const ZRegister acc = ZRegisterFrom(locations->InAt(0)); const ZRegister left = ZRegisterFrom(locations->InAt(1)); const ZRegister right = ZRegisterFrom(locations->InAt(2)); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); HVecOperation* a = instruction->InputAt(1)->AsVecOperation(); HVecOperation* b = instruction->InputAt(2)->AsVecOperation(); DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), @@ -1098,7 +1098,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) { const ZRegister reg = ZRegisterFrom(locations->Out()); UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; - const PRegisterZ p_reg = LoopPReg().Zeroing(); + const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -1140,7 +1140,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) { const ZRegister reg = ZRegisterFrom(locations->InAt(2)); UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; - const PRegisterZ p_reg = LoopPReg().Zeroing(); + const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -1181,25 +1181,25 @@ void LocationsBuilderARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) { void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) { // Instruction is not predicated, see nodes_vector.h DCHECK(!instruction->IsPredicated()); - const PRegister p_reg = LoopPReg(); + const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - __ Ptrue(p_reg.VnB(), 
vixl::aarch64::SVE_ALL); + __ Ptrue(output_p_reg.VnB(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL); + __ Ptrue(output_p_reg.VnH(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL); + __ Ptrue(output_p_reg.VnS(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL); + __ Ptrue(output_p_reg.VnD(), vixl::aarch64::SVE_ALL); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1207,6 +1207,67 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instru } } +void LocationsBuilderARM64Sve::VisitVecCondition(HVecCondition* instruction) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecCondition(HVecCondition* instruction) { + DCHECK(instruction->IsPredicated()); + LocationSummary* locations = instruction->GetLocations(); + const ZRegister left = ZRegisterFrom(locations->InAt(0)); + const ZRegister right = ZRegisterFrom(locations->InAt(1)); + const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing(); + const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction); + + HVecOperation* a = instruction->InputAt(0)->AsVecOperation(); + HVecOperation* b = instruction->InputAt(1)->AsVecOperation(); + DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), + HVecOperation::ToSignedType(b->GetPackedType())); + ValidateVectorLength(instruction); + + // TODO: Support other condition OPs and types. 
+ switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + __ Cmpeq(output_p_reg.VnB(), p_reg, left.VnB(), right.VnB()); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + __ Cmpeq(output_p_reg.VnH(), p_reg, left.VnH(), right.VnH()); + break; + case DataType::Type::kInt32: + __ Cmpeq(output_p_reg.VnS(), p_reg, left.VnS(), right.VnS()); + break; + case DataType::Type::kInt64: + __ Cmpeq(output_p_reg.VnD(), p_reg, left.VnD(), right.VnD()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecPredNot(HVecPredNot* instruction) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + DCHECK(instruction->InputAt(0)->IsVecPredSetOperation()); + locations->SetInAt(0, Location::NoLocation()); + locations->SetOut(Location::RequiresRegister()); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecPredNot(HVecPredNot* instruction) { + DCHECK(instruction->IsPredicated()); + + const PRegister input_p_reg = GetVecPredSetFixedOutPReg( + instruction->InputAt(0)->AsVecPredSetOperation()); + const PRegister control_p_reg = GetVecGoverningPReg(instruction); + const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction); + + __ Not(output_p_reg.VnB(), control_p_reg.Zeroing(), input_p_reg.VnB()); +} + void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); locations->SetInAt(0, Location::RequiresRegister()); @@ -1217,8 +1278,8 @@ void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) { // Semantically, the out location of this instruction and predicate inputs locations of // its users should be a fixed predicate register (similar to // Location::RegisterLocation(int reg)). 
But the register allocator (RA) doesn't support - // SIMD regs (e.g. predicate), so LoopPReg() is used explicitly without exposing it - // to the RA. + // SIMD regs (e.g. predicate), so fixed registers are used explicitly without exposing it + // to the RA (through GetVecPredSetFixedOutPReg()). // // To make the RA happy Location::NoLocation() was used for all the vector instructions // predicate inputs; but for the PredSetOperations (e.g. VecPredWhile) Location::NoLocation() @@ -1240,21 +1301,22 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct DCHECK(instruction->GetCondKind() == HVecPredWhile::CondKind::kLO); Register left = InputRegisterAt(instruction, 0); Register right = InputRegisterAt(instruction, 1); + const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction); DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % instruction->GetVectorLength(), 0u); switch (codegen_->GetSIMDRegisterWidth() / instruction->GetVectorLength()) { case 1u: - __ Whilelo(LoopPReg().VnB(), left, right); + __ Whilelo(output_p_reg.VnB(), left, right); break; case 2u: - __ Whilelo(LoopPReg().VnH(), left, right); + __ Whilelo(output_p_reg.VnH(), left, right); break; case 4u: - __ Whilelo(LoopPReg().VnS(), left, right); + __ Whilelo(output_p_reg.VnS(), left, right); break; case 8u: - __ Whilelo(LoopPReg().VnD(), left, right); + __ Whilelo(output_p_reg.VnD(), left, right); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1262,20 +1324,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct } } -void LocationsBuilderARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderARM64Sve::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); locations->SetInAt(0, Location::NoLocation()); // Result of the operation - a boolean value in a core register. 
locations->SetOut(Location::RequiresRegister()); } -void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorARM64Sve::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { // Instruction is not predicated, see nodes_vector.h DCHECK(!instruction->IsPredicated()); Register reg = OutputRegister(instruction); - // Currently VecPredCondition is only used as part of vectorized loop check condition + // Currently VecPredToBoolean is only used as part of vectorized loop check condition // evaluation. - DCHECK(instruction->GetPCondKind() == HVecPredCondition::PCondKind::kNFirst); + DCHECK(instruction->GetPCondKind() == HVecPredToBoolean::PCondKind::kNFirst); __ Cset(reg, pl); } diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc index e8ecf28386..70f22af17b 100644 --- a/compiler/optimizing/code_generator_vector_arm_vixl.cc +++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc @@ -1069,12 +1069,32 @@ void InstructionCodeGeneratorARMVIXL::VisitVecPredWhile(HVecPredWhile* instructi UNREACHABLE(); } -void LocationsBuilderARMVIXL::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderARMVIXL::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } -void InstructionCodeGeneratorARMVIXL::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorARMVIXL::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderARMVIXL::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void 
LocationsBuilderARMVIXL::VisitVecPredNot(HVecPredNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecPredNot(HVecPredNot* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 343a6e1af4..1f9b2578ac 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -1401,12 +1401,32 @@ void InstructionCodeGeneratorX86::VisitVecPredWhile(HVecPredWhile* instruction) UNREACHABLE(); } -void LocationsBuilderX86::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderX86::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } -void InstructionCodeGeneratorX86::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorX86::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderX86::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorX86::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderX86::VisitVecPredNot(HVecPredNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorX86::VisitVecPredNot(HVecPredNot* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index fb6e4e753f..47afa3b4a1 100644 --- 
a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -1374,12 +1374,32 @@ void InstructionCodeGeneratorX86_64::VisitVecPredWhile(HVecPredWhile* instructio UNREACHABLE(); } -void LocationsBuilderX86_64::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderX86_64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } -void InstructionCodeGeneratorX86_64::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorX86_64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderX86_64::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorX86_64::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderX86_64::VisitVecPredNot(HVecPredNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorX86_64::VisitVecPredNot(HVecPredNot* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index f890ba9cc0..c72d3ea24a 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -796,7 +796,7 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) { codegen.Initialize(); - graph->SetHasSIMD(true); + graph->SetHasTraditionalSIMD(true); for (int i = 0; i < 2; i++) { HParallelMove* move = new (graph->GetAllocator()) HParallelMove(graph->GetAllocator()); move->AddMove(Location::SIMDStackSlot(0), @@ -816,7 +816,7 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) { DataType::Type::kFloat64, 
nullptr); codegen.GetMoveResolver()->EmitNativeCode(move); - graph->SetHasSIMD(false); + graph->SetHasTraditionalSIMD(false); } codegen.Finalize(); @@ -864,7 +864,7 @@ TEST_F(CodegenTest, ARM64FrameSizeSIMD) { arm64::CodeGeneratorARM64 codegen(graph, *compiler_options); codegen.Initialize(); - graph->SetHasSIMD(true); + graph->SetHasTraditionalSIMD(true); DCHECK_EQ(arm64::callee_saved_fp_registers.GetCount(), 8); vixl::aarch64::CPURegList reg_list = arm64::callee_saved_fp_registers; @@ -884,7 +884,8 @@ TEST_F(CodegenTest, ARM64FrameSizeNoSIMD) { arm64::CodeGeneratorARM64 codegen(graph, *compiler_options); codegen.Initialize(); - graph->SetHasSIMD(false); + graph->SetHasTraditionalSIMD(false); + graph->SetHasPredicatedSIMD(false); DCHECK_EQ(arm64::callee_saved_fp_registers.GetCount(), 8); vixl::aarch64::CPURegList reg_list = arm64::callee_saved_fp_registers; diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc index cf49e39849..8e6b6db236 100644 --- a/compiler/optimizing/dead_code_elimination.cc +++ b/compiler/optimizing/dead_code_elimination.cc @@ -24,6 +24,7 @@ #include "base/scoped_arena_containers.h" #include "base/stl_util.h" #include "optimizing/nodes.h" +#include "optimizing/nodes_vector.h" #include "ssa_phi_elimination.h" namespace art HIDDEN { @@ -842,7 +843,8 @@ void HDeadCodeElimination::RemoveDeadInstructions() { void HDeadCodeElimination::UpdateGraphFlags() { bool has_monitor_operations = false; - bool has_simd = false; + bool has_traditional_simd = false; + bool has_predicated_simd = false; bool has_bounds_checks = false; bool has_always_throwing_invokes = false; @@ -852,7 +854,12 @@ void HDeadCodeElimination::UpdateGraphFlags() { if (instruction->IsMonitorOperation()) { has_monitor_operations = true; } else if (instruction->IsVecOperation()) { - has_simd = true; + HVecOperation* vec_instruction = instruction->AsVecOperation(); + if (vec_instruction->IsPredicated()) { + has_predicated_simd = true; + } 
else { + has_traditional_simd = true; + } } else if (instruction->IsBoundsCheck()) { has_bounds_checks = true; } else if (instruction->IsInvoke() && instruction->AsInvoke()->AlwaysThrows()) { @@ -862,7 +869,8 @@ void HDeadCodeElimination::UpdateGraphFlags() { } graph_->SetHasMonitorOperations(has_monitor_operations); - graph_->SetHasSIMD(has_simd); + graph_->SetHasTraditionalSIMD(has_traditional_simd); + graph_->SetHasPredicatedSIMD(has_predicated_simd); graph_->SetHasBoundsChecks(has_bounds_checks); graph_->SetHasAlwaysThrowingInvokes(has_always_throwing_invokes); } diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 73bdd1e223..bd33fde907 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -610,6 +610,7 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { } void VisitVecMemoryOperation(HVecMemoryOperation* vec_mem_operation) override { + VisitVecOperation(vec_mem_operation); StartAttributeStream("alignment") << vec_mem_operation->GetAlignment().ToString(); } diff --git a/compiler/optimizing/load_store_analysis.cc b/compiler/optimizing/load_store_analysis.cc index b46e3e18d9..75000c8b91 100644 --- a/compiler/optimizing/load_store_analysis.cc +++ b/compiler/optimizing/load_store_analysis.cc @@ -268,6 +268,13 @@ bool HeapLocationCollector::CanArrayElementsAlias(const HInstruction* idx1, } bool LoadStoreAnalysis::Run() { + // Currently load_store analysis can't handle predicated load/stores; specifically pairs of + // memory operations with different predicates. + // TODO: support predicated SIMD. 
+ if (graph_->HasPredicatedSIMD()) { + return false; + } + for (HBasicBlock* block : graph_->GetReversePostOrder()) { heap_location_collector_.VisitBasicBlock(block); } diff --git a/compiler/optimizing/load_store_analysis.h b/compiler/optimizing/load_store_analysis.h index c46a5b9cc1..ee425454a0 100644 --- a/compiler/optimizing/load_store_analysis.h +++ b/compiler/optimizing/load_store_analysis.h @@ -610,6 +610,7 @@ class HeapLocationCollector : public HGraphVisitor { } void VisitVecLoad(HVecLoad* instruction) override { + DCHECK(!instruction->IsPredicated()); HInstruction* array = instruction->InputAt(0); HInstruction* index = instruction->InputAt(1); DataType::Type type = instruction->GetPackedType(); @@ -618,6 +619,7 @@ class HeapLocationCollector : public HGraphVisitor { } void VisitVecStore(HVecStore* instruction) override { + DCHECK(!instruction->IsPredicated()); HInstruction* array = instruction->InputAt(0); HInstruction* index = instruction->InputAt(1); DataType::Type type = instruction->GetPackedType(); diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc index 9cabb12a9f..58fdd1cd05 100644 --- a/compiler/optimizing/load_store_elimination.cc +++ b/compiler/optimizing/load_store_elimination.cc @@ -1080,10 +1080,12 @@ class LSEVisitor final : private HGraphDelegateVisitor { } void VisitVecLoad(HVecLoad* instruction) override { + DCHECK(!instruction->IsPredicated()); VisitGetLocation(instruction, heap_location_collector_.GetArrayHeapLocation(instruction)); } void VisitVecStore(HVecStore* instruction) override { + DCHECK(!instruction->IsPredicated()); size_t idx = heap_location_collector_.GetArrayHeapLocation(instruction); VisitSetLocation(instruction, idx, instruction->GetValue()); } @@ -4041,6 +4043,13 @@ bool LoadStoreElimination::Run(bool enable_partial_lse) { return false; } + // Currently load_store analysis can't handle predicated load/stores; specifically pairs of + // memory operations with 
different predicates. + // TODO: support predicated SIMD. + if (graph_->HasPredicatedSIMD()) { + return false; + } + std::unique_ptr<LSEVisitorWrapper> lse_visitor(new (&allocator) LSEVisitorWrapper( graph_, heap_location_collector, enable_partial_lse, stats_)); lse_visitor->Run(); diff --git a/compiler/optimizing/load_store_elimination_test.cc b/compiler/optimizing/load_store_elimination_test.cc index 9fcc8ddef6..d3cf8bfa2a 100644 --- a/compiler/optimizing/load_store_elimination_test.cc +++ b/compiler/optimizing/load_store_elimination_test.cc @@ -573,7 +573,8 @@ TEST_F(LoadStoreEliminationTest, SameHeapValue2) { AddVecStore(entry_block_, array_, j_); HInstruction* vstore = AddVecStore(entry_block_, array_, i_); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vstore)); @@ -589,7 +590,8 @@ TEST_F(LoadStoreEliminationTest, SameHeapValue3) { AddVecStore(entry_block_, array_, i_add1_); HInstruction* vstore = AddVecStore(entry_block_, array_, i_); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vstore)); @@ -634,7 +636,8 @@ TEST_F(LoadStoreEliminationTest, OverlappingLoadStore) { AddArraySet(entry_block_, array_, i_, c1); HInstruction* vload5 = AddVecLoad(entry_block_, array_, i_); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(load1)); @@ -668,7 +671,8 @@ TEST_F(LoadStoreEliminationTest, StoreAfterLoopWithoutSideEffects) { // a[j] = 1; HInstruction* array_set = AddArraySet(return_block_, array_, j_, c1); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. 
+ graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(array_set)); @@ -706,7 +710,8 @@ TEST_F(LoadStoreEliminationTest, StoreAfterSIMDLoopWithSideEffects) { // a[j] = 0; HInstruction* a_set = AddArraySet(return_block_, array_, j_, c0); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(vload)); @@ -745,7 +750,8 @@ TEST_F(LoadStoreEliminationTest, LoadAfterSIMDLoopWithSideEffects) { // x = a[j]; HInstruction* load = AddArrayGet(return_block_, array_, j_); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(vload)); @@ -786,7 +792,8 @@ TEST_F(LoadStoreEliminationTest, MergePredecessorVecStores) { // down: a[i,... i + 3] = [1,...1] HInstruction* vstore4 = AddVecStore(down, array_, i_, vdata); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(vstore2)); @@ -877,7 +884,8 @@ TEST_F(LoadStoreEliminationTest, RedundantVStoreVLoadInLoop) { HInstruction* vstore2 = AddVecStore(loop_, array_b, phi_, vload); HInstruction* vstore3 = AddVecStore(loop_, array_a, phi_, vstore1->InputAt(2)); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vstore1)); @@ -965,7 +973,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueInLoopWithoutWriteSideEffects) HInstruction* vload = AddVecLoad(loop_, array_a, phi_); HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. 
+ graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload)); @@ -989,7 +998,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValue) { HInstruction* vload = AddVecLoad(pre_header_, array_a, c0); HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload)); @@ -1066,7 +1076,8 @@ TEST_F(LoadStoreEliminationTest, VLoadAndLoadDefaultValueInLoopWithoutWriteSideE HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload); HInstruction* store = AddArraySet(return_block_, array_, c0, load); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload)); @@ -1097,7 +1108,8 @@ TEST_F(LoadStoreEliminationTest, VLoadAndLoadDefaultValue) { HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload); HInstruction* store = AddArraySet(return_block_, array_, c0, load); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload)); @@ -1129,7 +1141,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueAndVLoadInLoopWithoutWriteSide HInstruction* vstore1 = AddVecStore(return_block_, array_, c0, vload1); HInstruction* vstore2 = AddVecStore(return_block_, array_, c128, vload2); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload1)); @@ -1160,7 +1173,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueAndVLoad) { HInstruction* vstore1 = AddVecStore(return_block_, array_, c0, vload1); HInstruction* vstore2 = AddVecStore(return_block_, array_, c128, vload2); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. 
+ graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload1)); diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index aef6f1f5bd..f62a355ae4 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -453,6 +453,54 @@ static DataType::Type GetNarrowerType(HInstruction* a, HInstruction* b) { return type; } +// Returns whether the loop is of a diamond structure: +// +// header <----------------+ +// | | +// diamond_hif | +// / \ | +// diamond_true diamond_false | +// \ / | +// back_edge | +// | | +// +---------------------+ +static bool HasLoopDiamondStructure(HLoopInformation* loop_info) { + HBasicBlock* header = loop_info->GetHeader(); + if (loop_info->NumberOfBackEdges() != 1 || header->GetSuccessors().size() != 2) { + return false; + } + HBasicBlock* header_succ_0 = header->GetSuccessors()[0]; + HBasicBlock* header_succ_1 = header->GetSuccessors()[1]; + HBasicBlock* diamond_top = loop_info->Contains(*header_succ_0) ? + header_succ_0 : + header_succ_1; + if (!diamond_top->GetLastInstruction()->IsIf()) { + return false; + } + + HIf* diamond_hif = diamond_top->GetLastInstruction()->AsIf(); + HBasicBlock* diamond_true = diamond_hif->IfTrueSuccessor(); + HBasicBlock* diamond_false = diamond_hif->IfFalseSuccessor(); + + if (diamond_true->GetSuccessors().size() != 1 || diamond_false->GetSuccessors().size() != 1) { + return false; + } + + HBasicBlock* back_edge = diamond_true->GetSingleSuccessor(); + if (back_edge != diamond_false->GetSingleSuccessor() || + back_edge != loop_info->GetBackEdges()[0]) { + return false; + } + + DCHECK_EQ(loop_info->GetBlocks().NumSetBits(), 5u); + return true; +} + +static bool IsPredicatedLoopControlFlowSupported(HLoopInformation* loop_info) { + size_t num_of_blocks = loop_info->GetBlocks().NumSetBits(); + return num_of_blocks == 2 || HasLoopDiamondStructure(loop_info); +} + // // Public methods. 
// @@ -483,12 +531,12 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_map_(nullptr), vector_permanent_map_(nullptr), vector_external_set_(nullptr), + predicate_info_map_(nullptr), vector_mode_(kSequential), vector_preheader_(nullptr), vector_header_(nullptr), vector_body_(nullptr), vector_index_(nullptr), - loop_main_pred_(nullptr), arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) { } @@ -545,6 +593,8 @@ bool HLoopOptimization::LocalRun() { ScopedArenaSafeMap<HInstruction*, HInstruction*> perm( std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); ScopedArenaSet<HInstruction*> ext_set(loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + ScopedArenaSafeMap<HBasicBlock*, BlockPredicateInfo*> pred( + std::less<HBasicBlock*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); // Attach. iset_ = &iset; reductions_ = &reds; @@ -552,6 +602,7 @@ bool HLoopOptimization::LocalRun() { vector_map_ = ↦ vector_permanent_map_ = &perm; vector_external_set_ = &ext_set; + predicate_info_map_ = &pred; // Traverse. const bool did_loop_opt = TraverseLoopsInnerToOuter(top_loop_); // Detach. @@ -561,6 +612,7 @@ bool HLoopOptimization::LocalRun() { vector_map_ = nullptr; vector_permanent_map_ = nullptr; vector_external_set_ = nullptr; + predicate_info_map_ = nullptr; return did_loop_opt; } @@ -793,6 +845,37 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) { } } +// Checks whether the loop has exit structure suitable for InnerLoopFinite optimization: +// - has single loop exit. +// - the exit block has only single predecessor - a block inside the loop. +// +// In that case returns single exit basic block (outside the loop); otherwise nullptr. 
+static HBasicBlock* GetInnerLoopFiniteSingleExit(HLoopInformation* loop_info) { + HBasicBlock* exit = nullptr; + for (HBlocksInLoopIterator block_it(*loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* block = block_it.Current(); + + // Check whether one of the successor is loop exit. + for (HBasicBlock* successor : block->GetSuccessors()) { + if (!loop_info->Contains(*successor)) { + if (exit != nullptr) { + // The loop has more than one exit. + return nullptr; + } + exit = successor; + + // Ensure exit can only be reached by exiting loop. + if (successor->GetPredecessors().size() != 1) { + return nullptr; + } + } + } + } + return exit; +} + bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); @@ -801,33 +884,22 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { if (!induction_range_.IsFinite(node->loop_info, &trip_count)) { return false; } - // Ensure there is only a single loop-body (besides the header). - HBasicBlock* body = nullptr; - for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) { - if (it.Current() != header) { - if (body != nullptr) { - return false; - } - body = it.Current(); - } - } - CHECK(body != nullptr); - // Ensure there is only a single exit point. - if (header->GetSuccessors().size() != 2) { - return false; - } - HBasicBlock* exit = (header->GetSuccessors()[0] == body) - ? header->GetSuccessors()[1] - : header->GetSuccessors()[0]; - // Ensure exit can only be reached by exiting loop. - if (exit->GetPredecessors().size() != 1) { + // Check loop exits. + HBasicBlock* exit = GetInnerLoopFiniteSingleExit(node->loop_info); + if (exit == nullptr) { return false; } + + HBasicBlock* body = (header->GetSuccessors()[0] == exit) + ? 
header->GetSuccessors()[1] + : header->GetSuccessors()[0]; // Detect either an empty loop (no side effects other than plain iteration) or // a trivial loop (just iterating once). Replace subsequent index uses, if any, // with the last value and remove the loop, possibly after unrolling its body. HPhi* main_phi = nullptr; - if (TrySetSimpleLoopHeader(header, &main_phi)) { + size_t num_of_blocks = header->GetLoopInformation()->GetBlocks().NumSetBits(); + + if (num_of_blocks == 2 && TrySetSimpleLoopHeader(header, &main_phi)) { bool is_empty = IsEmptyBody(body); if (reductions_->empty() && // TODO: possible with some effort (is_empty || trip_count == 1) && @@ -850,32 +922,61 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { return true; } } - - bool enable_alignment_strategies = !IsInPredicatedVectorizationMode(); // Vectorize loop, if possible and valid. if (!kEnableVectorization || // Disable vectorization for debuggable graphs: this is a workaround for the bug // in 'GenerateNewLoop' which caused the SuspendCheck environment to be invalid. // TODO: b/138601207, investigate other possible cases with wrong environment values and // possibly switch back vectorization on for debuggable graphs. 
- graph_->IsDebuggable() || - !TrySetSimpleLoopHeader(header, &main_phi) || - !CanVectorizeDataFlow(node, body, enable_alignment_strategies)) { + graph_->IsDebuggable()) { return false; } - if (!IsVectorizationProfitable(trip_count) || - !TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) { + if (IsInPredicatedVectorizationMode()) { + return TryVectorizePredicated(node, body, exit, main_phi, trip_count); + } else { + return TryVectorizedTraditional(node, body, exit, main_phi, trip_count); + } +} + +bool HLoopOptimization::TryVectorizePredicated(LoopNode* node, + HBasicBlock* body, + HBasicBlock* exit, + HPhi* main_phi, + int64_t trip_count) { + if (!IsPredicatedLoopControlFlowSupported(node->loop_info) || + !ShouldVectorizeCommon(node, main_phi, trip_count)) { + return false; + } + + // Currently we can only generate cleanup loops for loops with 2 basic blocks. + // + // TODO: Support array disambiguation tests for CF loops. 
+ if (NeedsArrayRefsDisambiguationTest() && + node->loop_info->GetBlocks().NumSetBits() != 2) { + return false; + } + + VectorizePredicated(node, body, exit); + MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized); + graph_->SetHasPredicatedSIMD(true); // flag SIMD usage + return true; +} + +bool HLoopOptimization::TryVectorizedTraditional(LoopNode* node, + HBasicBlock* body, + HBasicBlock* exit, + HPhi* main_phi, + int64_t trip_count) { + HBasicBlock* header = node->loop_info->GetHeader(); + size_t num_of_blocks = header->GetLoopInformation()->GetBlocks().NumSetBits(); + + if (num_of_blocks != 2 || !ShouldVectorizeCommon(node, main_phi, trip_count)) { + return false; } - graph_->SetHasSIMD(true); // flag SIMD usage + VectorizeTraditional(node, body, exit, trip_count); MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized); + graph_->SetHasTraditionalSIMD(true); // flag SIMD usage return true; } @@ -1023,8 +1124,9 @@ bool HLoopOptimization::TryLoopScalarOpts(LoopNode* node) { // Intel Press, June, 2004 (http://www.aartbik.com/). // + bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node, - HBasicBlock* block, + HBasicBlock* header, bool collect_alignment_info) { // Reset vector bookkeeping. vector_length_ = 0; @@ -1034,16 +1136,30 @@ bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node, vector_runtime_test_a_ = vector_runtime_test_b_ = nullptr; - // Phis in the loop-body prevent vectorization. - if (!block->GetPhis().IsEmpty()) { - return false; - } + // Traverse the data flow of the loop, in the original program order. + for (HBlocksInLoopReversePostOrderIterator block_it(*header->GetLoopInformation()); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* block = block_it.Current(); - // Scan the loop-body, starting a right-hand-side tree traversal at each left-hand-side - // occurrence, which allows passing down attributes down the use tree. 
- for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) { - return false; // failure to vectorize a left-hand-side + if (block == header) { + // The header is of a certain structure (TrySetSimpleLoopHeader) and doesn't need to be + // processed here. + continue; + } + + // Phis in the loop-body prevent vectorization. + // TODO: Enable vectorization of CF loops with Phis. + if (!block->GetPhis().IsEmpty()) { + return false; + } + + // Scan the loop-body instructions, starting a right-hand-side tree traversal at each + // left-hand-side occurrence, which allows passing down attributes down the use tree. + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) { + return false; // failure to vectorize a left-hand-side + } } } @@ -1139,6 +1255,23 @@ bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node, return true; } +bool HLoopOptimization::ShouldVectorizeCommon(LoopNode* node, + HPhi* main_phi, + int64_t trip_count) { + HBasicBlock* header = node->loop_info->GetHeader(); + HBasicBlock* preheader = node->loop_info->GetPreHeader(); + + bool enable_alignment_strategies = !IsInPredicatedVectorizationMode(); + if (!TrySetSimpleLoopHeader(header, &main_phi) || + !CanVectorizeDataFlow(node, header, enable_alignment_strategies) || + !IsVectorizationProfitable(trip_count) || + !TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) { + return false; + } + + return true; +} + void HLoopOptimization::VectorizePredicated(LoopNode* node, HBasicBlock* block, HBasicBlock* exit) { @@ -1185,7 +1318,6 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node, graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); vector_mode_ = kVector; GenerateNewLoopPredicated(node, - block, preheader_for_vector_loop, vector_index_, vtc, @@ -1200,7 
+1332,6 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node, graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); // Use "Traditional" version for the sequential loop. GenerateNewLoopScalarOrTraditional(node, - block, preheader_for_cleanup_loop, vector_index_, stc, @@ -1208,7 +1339,7 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node, LoopAnalysisInfo::kNoUnrollingFactor); } - FinalizeVectorization(node, block); + FinalizeVectorization(node); // Assign governing predicates for the predicated instructions inserted during vectorization // outside the loop. @@ -1339,7 +1470,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node, HBasicBlock* preheader_for_peeling_loop = graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); GenerateNewLoopScalarOrTraditional(node, - block, preheader_for_peeling_loop, vector_index_, ptc, @@ -1354,7 +1484,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node, HBasicBlock* preheader_for_vector_loop = graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); GenerateNewLoopScalarOrTraditional(node, - block, preheader_for_vector_loop, vector_index_, vtc, @@ -1369,7 +1498,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node, HBasicBlock* preheader_for_cleanup_loop = graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); GenerateNewLoopScalarOrTraditional(node, - block, preheader_for_cleanup_loop, vector_index_, stc, @@ -1377,10 +1505,10 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node, LoopAnalysisInfo::kNoUnrollingFactor); } - FinalizeVectorization(node, block); + FinalizeVectorization(node); } -void HLoopOptimization::FinalizeVectorization(LoopNode* node, HBasicBlock* block) { +void HLoopOptimization::FinalizeVectorization(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); HLoopInformation* vloop = 
vector_header_->GetLoopInformation(); @@ -1397,9 +1525,16 @@ void HLoopOptimization::FinalizeVectorization(LoopNode* node, HBasicBlock* block } } - // Remove the original loop by disconnecting the body block - // and removing all instructions from the header. - block->DisconnectAndDelete(); + // Remove the original loop. + for (HBlocksInLoopPostOrderIterator it_loop(*node->loop_info); + !it_loop.Done(); + it_loop.Advance()) { + HBasicBlock* cur_block = it_loop.Current(); + if (cur_block == node->loop_info->GetHeader()) { + continue; + } + cur_block->DisconnectAndDelete(); + } while (!header->GetFirstInstruction()->IsGoto()) { header->RemoveInstruction(header->GetFirstInstruction()); @@ -1426,12 +1561,12 @@ HPhi* HLoopOptimization::InitializeForNewLoop(HBasicBlock* new_preheader, HInstr vector_index_ = phi; vector_permanent_map_->clear(); vector_external_set_->clear(); + predicate_info_map_->clear(); return phi; } void HLoopOptimization::GenerateNewLoopScalarOrTraditional(LoopNode* node, - HBasicBlock* body, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, @@ -1447,14 +1582,13 @@ void HLoopOptimization::GenerateNewLoopScalarOrTraditional(LoopNode* node, vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); for (uint32_t u = 0; u < unroll; u++) { - GenerateNewLoopBodyOnce(node, body, induc_type, step); + GenerateNewLoopBodyOnce(node, induc_type, step); } FinalizePhisForNewLoop(phi, lo); } void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node, - HBasicBlock* body, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, @@ -1475,44 +1609,50 @@ void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node, 0u); HInstruction* cond = - new (global_allocator_) HVecPredCondition(global_allocator_, + new (global_allocator_) HVecPredToBoolean(global_allocator_, pred_while, - HVecPredCondition::PCondKind::kNFirst, + HVecPredToBoolean::PCondKind::kNFirst, DataType::Type::kInt32, vector_length_, 0u); 
vector_header_->AddInstruction(pred_while); vector_header_->AddInstruction(cond); - loop_main_pred_ = pred_while; - - vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); - GenerateNewLoopBodyOnce(node, body, induc_type, step); + PreparePredicateInfoMap(node); + GenerateNewLoopBodyOnce(node, induc_type, step); + InitPredicateInfoMap(node, pred_while); - // Assign governing predicates for instructions in the loop. - for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { - auto i = vector_map_->find(it.Current()); - if (i != vector_map_->end()) { - HInstruction* instr = i->second; + // Assign governing predicates for instructions in the loop; the traversal order doesn't matter. + for (HBlocksInLoopIterator block_it(*node->loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* cur_block = block_it.Current(); - if (!instr->IsVecOperation()) { - continue; - } - // There are cases when a vector instruction, which corresponds to some instruction in the - // original scalar loop, is located not in the newly created vector loop but - // in the vector loop preheader (and hence recorded in vector_external_set_). - // - // Governing predicates will be set for such instructions separately. 
- bool in_vector_loop = vector_header_->GetLoopInformation()->Contains(*instr->GetBlock()); - DCHECK_IMPLIES(!in_vector_loop, - vector_external_set_->find(instr) != vector_external_set_->end()); - - if (in_vector_loop && - !instr->AsVecOperation()->IsPredicated()) { - HVecOperation* op = instr->AsVecOperation(); - op->SetMergingGoverningPredicate(loop_main_pred_); + for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end()) { + HInstruction* instr = i->second; + + if (!instr->IsVecOperation()) { + continue; + } + // There are cases when a vector instruction, which corresponds to some instruction in the + // original scalar loop, is located not in the newly created vector loop but + // in the vector loop preheader (and hence recorded in vector_external_set_). + // + // Governing predicates will be set for such instructions separately. + bool in_vector_loop = vector_header_->GetLoopInformation()->Contains(*instr->GetBlock()); + DCHECK_IMPLIES(!in_vector_loop, + vector_external_set_->find(instr) != vector_external_set_->end()); + + if (in_vector_loop && + !instr->AsVecOperation()->IsPredicated()) { + HVecOperation* op = instr->AsVecOperation(); + HVecPredSetOperation* pred = predicate_info_map_->Get(cur_block)->GetControlPredicate(); + op->SetMergingGoverningPredicate(pred); + } } } } @@ -1521,24 +1661,47 @@ void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node, } void HLoopOptimization::GenerateNewLoopBodyOnce(LoopNode* node, - HBasicBlock* body, DataType::Type induc_type, HInstruction* step) { // Generate instruction map. vector_map_->clear(); - for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { - bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); - DCHECK(vectorized_def); + HLoopInformation* loop_info = node->loop_info; + + // Traverse the data flow of the loop, in the original program order. 
+ for (HBlocksInLoopReversePostOrderIterator block_it(*loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* cur_block = block_it.Current(); + + if (cur_block == loop_info->GetHeader()) { + continue; + } + + for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) { + bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); + DCHECK(vectorized_def); + } } - // Generate body from the instruction map, but in original program order. + + // Generate body from the instruction map, in the original program order. HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); - for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { - auto i = vector_map_->find(it.Current()); - if (i != vector_map_->end() && !i->second->IsInBlock()) { - Insert(vector_body_, i->second); - // Deal with instructions that need an environment, such as the scalar intrinsics. - if (i->second->NeedsEnvironment()) { - i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + for (HBlocksInLoopReversePostOrderIterator block_it(*loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* cur_block = block_it.Current(); + + if (cur_block == loop_info->GetHeader()) { + continue; + } + + for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end() && !i->second->IsInBlock()) { + Insert(vector_body_, i->second); + // Deal with instructions that need an environment, such as the scalar intrinsics. 
+ if (i->second->NeedsEnvironment()) { + i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + } } } } @@ -1626,6 +1789,10 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node, if (instruction->IsGoto()) { return true; } + + if (instruction->IsIf()) { + return VectorizeIfCondition(node, instruction, generate_code, restrictions); + } // Otherwise accept only expressions with no effects outside the immediate loop-body. // Note that actual uses are inspected during right-hand-side tree traversal. return !IsUsedOutsideLoop(node->loop_info, instruction) @@ -1845,6 +2012,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict case InstructionSet::kThumb2: // Allow vectorization for all ARM devices, because Android assumes that // ARM 32-bit always supports advanced SIMD (64-bit SIMD). + *restrictions |= kNoIfCond; switch (type) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -1870,6 +2038,13 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u); switch (type) { case DataType::Type::kBool: + *restrictions |= kNoDiv | + kNoSignedHAdd | + kNoUnsignedHAdd | + kNoUnroundedHAdd | + kNoSAD | + kNoIfCond; + return TrySetVectorLength(type, vector_length); case DataType::Type::kUint8: case DataType::Type::kInt8: *restrictions |= kNoDiv | @@ -1892,13 +2067,13 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict *restrictions |= kNoDiv | kNoSAD; return TrySetVectorLength(type, vector_length); case DataType::Type::kInt64: - *restrictions |= kNoDiv | kNoSAD; + *restrictions |= kNoDiv | kNoSAD | kNoIfCond; return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat32: - *restrictions |= kNoReduction; + *restrictions |= kNoReduction | kNoIfCond; return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat64: - *restrictions |= kNoReduction; + *restrictions |= kNoReduction 
| kNoIfCond; return TrySetVectorLength(type, vector_length); default: break; @@ -1907,6 +2082,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict } else { // Allow vectorization for all ARM devices, because Android assumes that // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD). + *restrictions |= kNoIfCond; switch (type) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -1937,6 +2113,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict case InstructionSet::kX86: case InstructionSet::kX86_64: // Allow vectorization for SSE4.1-enabled X86 devices only (128-bit SIMD). + *restrictions |= kNoIfCond; if (features->AsX86InstructionSetFeatures()->HasSSE4_1()) { switch (type) { case DataType::Type::kBool: @@ -2203,10 +2380,10 @@ HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruct } \ break; -void HLoopOptimization::GenerateVecOp(HInstruction* org, - HInstruction* opa, - HInstruction* opb, - DataType::Type type) { +HInstruction* HLoopOptimization::GenerateVecOp(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + DataType::Type type) { uint32_t dex_pc = org->GetDexPc(); HInstruction* vector = nullptr; DataType::Type org_type = org->GetType(); @@ -2276,11 +2453,23 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, GENERATE_VEC( new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc), new (global_allocator_) HAbs(org_type, opa, dex_pc)); + case HInstruction::kEqual: { + // Special case. 
+ if (vector_mode_ == kVector) { + vector = new (global_allocator_) HVecCondition( + global_allocator_, opa, opb, type, vector_length_, dex_pc); + } else { + DCHECK(vector_mode_ == kSequential); + UNREACHABLE(); + } + } + break; default: break; } // switch CHECK(vector != nullptr) << "Unsupported SIMD operator"; vector_map_->Put(org, vector); + return vector; } #undef GENERATE_VEC @@ -2520,6 +2709,89 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node, return false; } +bool HLoopOptimization::VectorizeIfCondition(LoopNode* node, + HInstruction* hif, + bool generate_code, + uint64_t restrictions) { + DCHECK(hif->IsIf()); + HInstruction* if_input = hif->InputAt(0); + + if (!if_input->HasOnlyOneNonEnvironmentUse()) { + // Avoid the complications of the condition used as materialized boolean. + return false; + } + + if (!if_input->IsEqual()) { + // TODO: Support other condition types. + return false; + } + + HCondition* cond = if_input->AsCondition(); + HInstruction* opa = cond->InputAt(0); + HInstruction* opb = cond->InputAt(1); + DataType::Type type = GetNarrowerType(opa, opb); + + if (!DataType::IsIntegralType(type)) { + return false; + } + + bool is_unsigned = false; + HInstruction* opa_promoted = opa; + HInstruction* opb_promoted = opb; + bool is_int_case = DataType::Type::kInt32 == opa->GetType() && + DataType::Type::kInt32 == opb->GetType(); + + // Condition arguments should be either both int32 or consistently extended signed/unsigned + // narrower operands. + if (!is_int_case && + !IsNarrowerOperands(opa, opb, type, &opa_promoted, &opb_promoted, &is_unsigned)) { + return false; + } + type = HVecOperation::ToProperType(type, is_unsigned); + + // For narrow types, explicit type conversion may have been + // optimized away, so set the no hi bits restriction here. 
+ if (DataType::Size(type) <= 2) { + restrictions |= kNoHiBits; + } + + if (!TrySetVectorType(type, &restrictions) || + HasVectorRestrictions(restrictions, kNoIfCond)) { + return false; + } + + if (generate_code && vector_mode_ != kVector) { // de-idiom + opa_promoted = opa; + opb_promoted = opb; + } + + if (VectorizeUse(node, opa_promoted, generate_code, type, restrictions) && + VectorizeUse(node, opb_promoted, generate_code, type, restrictions)) { + if (generate_code) { + HInstruction* vec_cond = GenerateVecOp(cond, + vector_map_->Get(opa_promoted), + vector_map_->Get(opb_promoted), + type); + + if (vector_mode_ == kVector) { + HInstruction* vec_pred_not = new (global_allocator_) HVecPredNot( + global_allocator_, vec_cond, type, vector_length_, hif->GetDexPc()); + + vector_map_->Put(hif, vec_pred_not); + BlockPredicateInfo* pred_info = predicate_info_map_->Get(hif->GetBlock()); + pred_info->SetControlFlowInfo(vec_cond->AsVecPredSetOperation(), + vec_pred_not->AsVecPredSetOperation()); + } else { + DCHECK(vector_mode_ == kSequential); + UNREACHABLE(); + } + } + return true; + } + + return false; +} + // // Vectorization heuristics. 
// @@ -2834,4 +3106,67 @@ bool HLoopOptimization::CanRemoveCycle() { return true; } +void HLoopOptimization::PreparePredicateInfoMap(LoopNode* node) { + HLoopInformation* loop_info = node->loop_info; + + DCHECK(IsPredicatedLoopControlFlowSupported(loop_info)); + + for (HBlocksInLoopIterator block_it(*loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* cur_block = block_it.Current(); + BlockPredicateInfo* pred_info = new (loop_allocator_) BlockPredicateInfo(); + + predicate_info_map_->Put(cur_block, pred_info); + } +} + +void HLoopOptimization::InitPredicateInfoMap(LoopNode* node, + HVecPredSetOperation* loop_main_pred) { + HLoopInformation* loop_info = node->loop_info; + HBasicBlock* header = loop_info->GetHeader(); + BlockPredicateInfo* header_info = predicate_info_map_->Get(header); + // Loop header is a special case; it doesn't have a false predicate because we + // would just exit the loop then. + header_info->SetControlFlowInfo(loop_main_pred, loop_main_pred); + + size_t blocks_in_loop = header->GetLoopInformation()->GetBlocks().NumSetBits(); + if (blocks_in_loop == 2) { + for (HBasicBlock* successor : header->GetSuccessors()) { + if (loop_info->Contains(*successor)) { + // This is loop second block - body. + BlockPredicateInfo* body_info = predicate_info_map_->Get(successor); + body_info->SetControlPredicate(loop_main_pred); + return; + } + } + UNREACHABLE(); + } + + // TODO: support predicated vectorization of CF loop of more complex structure. + DCHECK(HasLoopDiamondStructure(loop_info)); + HBasicBlock* header_succ_0 = header->GetSuccessors()[0]; + HBasicBlock* header_succ_1 = header->GetSuccessors()[1]; + HBasicBlock* diamond_top = loop_info->Contains(*header_succ_0) ? 
+ header_succ_0 : + header_succ_1; + + HIf* diamond_hif = diamond_top->GetLastInstruction()->AsIf(); + HBasicBlock* diamond_true = diamond_hif->IfTrueSuccessor(); + HBasicBlock* diamond_false = diamond_hif->IfFalseSuccessor(); + HBasicBlock* back_edge = diamond_true->GetSingleSuccessor(); + + BlockPredicateInfo* diamond_top_info = predicate_info_map_->Get(diamond_top); + BlockPredicateInfo* diamond_true_info = predicate_info_map_->Get(diamond_true); + BlockPredicateInfo* diamond_false_info = predicate_info_map_->Get(diamond_false); + BlockPredicateInfo* back_edge_info = predicate_info_map_->Get(back_edge); + + diamond_top_info->SetControlPredicate(header_info->GetTruePredicate()); + + diamond_true_info->SetControlPredicate(diamond_top_info->GetTruePredicate()); + diamond_false_info->SetControlPredicate(diamond_top_info->GetFalsePredicate()); + + back_edge_info->SetControlPredicate(header_info->GetTruePredicate()); +} + } // namespace art diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 3da8f8fe39..86a9f0fcb8 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -101,6 +101,7 @@ class HLoopOptimization : public HOptimization { kNoSAD = 1 << 11, // no sum of absolute differences (SAD) kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening kNoDotProd = 1 << 13, // no dot product + kNoIfCond = 1 << 14, // no if condition conversion }; /* @@ -136,6 +137,95 @@ class HLoopOptimization : public HOptimization { bool is_string_char_at; // compressed string read }; + // This structure describes the control flow (CF) -> data flow (DF) conversion of the loop + // with control flow (see below) for the purpose of predicated autovectorization. + // + // Lets define "loops without control-flow" (or non-CF loops) as loops with two consecutive + // blocks and without the branching structure except for the loop exit. 
And + // "loop with control-flow" (or CF-loops) - all other loops. + // + // In the execution of the original CF-loop on each iteration some basic block Y will be + // either executed or not executed, depending on the control flow of the loop. More + // specifically, a block will be executed if all the conditional branches of the nodes in + // the control dependency graph for that block Y are taken according to the path from the loop + // header to that basic block. + // + // This is the key idea of CF->DF conversion: a boolean value + // 'ctrl_pred == cond1 && cond2 && ...' will determine whether the basic block Y will be + // executed, where cond_K is whether the branch of the node K in the control dependency + // graph upward traversal was taken in the 'right' direction. + // + // Def.: BB Y is control dependent on BB X iff + // (1) there exists a directed path P from X to Y with any basic block Z in P (excluding X + // and Y) post-dominated by Y and + // (2) X is not post-dominated by Y. + // ... + // X + // false / \ true + // / \ + // ... + // | + // Y + // ... + // + // When doing predicated autovectorization of a CF loop, we use the CF->DF conversion approach: + // 1) do the data analysis and vector operation creation as if it was a non-CF loop. + // 2) for each HIf block create two vector predicate setting instructions - for True and False + // edges/paths. + // 3) assign a governing vector predicate (see comments near HVecPredSetOperation) + // to each vector operation Alpha in the loop (including to those vector predicate setting + // instructions created in #2); do this by: + // - finding the immediate control dependent block of the instruction Alpha's block. + // - choosing the True or False predicate setting instruction (created in #2) depending + // on the path to the instruction. 
+  //
+  // For more information check the papers:
+  //
+  // - Allen, John R and Kennedy, Ken and Porterfield, Carrie and Warren, Joe,
+  //   “Conversion of Control Dependence to Data Dependence,” in Proceedings of the 10th ACM
+  //   SIGACT-SIGPLAN Symposium on Principles of Programming Languages, 1983, pp. 177–189.
+  // - JEANNE FERRANTE, KARL J. OTTENSTEIN, JOE D. WARREN,
+  //   "The Program Dependence Graph and Its Use in Optimization"
+  //
+  class BlockPredicateInfo : public ArenaObject<kArenaAllocLoopOptimization> {
+   public:
+    BlockPredicateInfo() :
+        control_predicate_(nullptr),
+        true_predicate_(nullptr),
+        false_predicate_(nullptr) {}
+
+    void SetControlFlowInfo(HVecPredSetOperation* true_predicate,
+                            HVecPredSetOperation* false_predicate) {
+      DCHECK(!HasControlFlowOps());
+      true_predicate_ = true_predicate;
+      false_predicate_ = false_predicate;
+    }
+
+    bool HasControlFlowOps() const {
+      // Note: a block must have both T/F predicates set or none of them.
+      DCHECK_EQ(true_predicate_ == nullptr, false_predicate_ == nullptr);
+      return true_predicate_ != nullptr;
+    }
+
+    HVecPredSetOperation* GetControlPredicate() const { return control_predicate_; }
+    void SetControlPredicate(HVecPredSetOperation* control_predicate) {
+      control_predicate_ = control_predicate;
+    }
+
+    HVecPredSetOperation* GetTruePredicate() const { return true_predicate_; }
+    HVecPredSetOperation* GetFalsePredicate() const { return false_predicate_; }
+
+   private:
+    // Vector control predicate operation, associated with the block which will determine
+    // the active lanes for all vector operations, originated from this block.
+    HVecPredSetOperation* control_predicate_;
+
+    // Vector predicate instruction, associated with the true successor of the block.
+    HVecPredSetOperation* true_predicate_;
+    // Vector predicate instruction, associated with the false successor of the block.
+    HVecPredSetOperation* false_predicate_;
+  };
+
   //
   // Loop setup and traversal.
// @@ -208,10 +298,12 @@ class HLoopOptimization : public HOptimization { // - checks whether instructions are vectorizable for the target. // - conducts data dependence analysis for array references. // - additionally, collects info on peeling and aligment strategy. - bool CanVectorizeDataFlow(LoopNode* node, HBasicBlock* block, bool collect_alignment_info); + bool CanVectorizeDataFlow(LoopNode* node, HBasicBlock* header, bool collect_alignment_info); + // Does the checks (common for predicated and traditional mode) for the loop. + bool ShouldVectorizeCommon(LoopNode* node, HPhi* main_phi, int64_t trip_count); - // Vectorizes the loop for which all checks have been already done. + // Try to vectorize the loop, returns whether it was successful. // // There are two versions/algorithms: // - Predicated: all the vector operations have governing predicates which control @@ -220,6 +312,19 @@ class HLoopOptimization : public HOptimization { // - Traditional: a regular mode in which all vector operations lanes are unconditionally // active. // Example: vectoriation using AArch64 NEON. + bool TryVectorizePredicated(LoopNode* node, + HBasicBlock* body, + HBasicBlock* exit, + HPhi* main_phi, + int64_t trip_count); + + bool TryVectorizedTraditional(LoopNode* node, + HBasicBlock* body, + HBasicBlock* exit, + HPhi* main_phi, + int64_t trip_count); + + // Vectorizes the loop for which all checks have been already done. void VectorizePredicated(LoopNode* node, HBasicBlock* block, HBasicBlock* exit); @@ -230,14 +335,13 @@ class HLoopOptimization : public HOptimization { // Performs final steps for whole vectorization process: links reduction, removes the original // scalar loop, updates loop info. - void FinalizeVectorization(LoopNode* node, HBasicBlock* block); + void FinalizeVectorization(LoopNode* node); // Helpers that do the vector instruction synthesis for the previously created loop; create // and fill the loop body with instructions. 
// // A version to generate a vector loop in predicated mode. void GenerateNewLoopPredicated(LoopNode* node, - HBasicBlock* block, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, @@ -246,7 +350,6 @@ class HLoopOptimization : public HOptimization { // A version to generate a vector loop in traditional mode or to generate // a scalar loop for both modes. void GenerateNewLoopScalarOrTraditional(LoopNode* node, - HBasicBlock* block, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, @@ -264,9 +367,15 @@ class HLoopOptimization : public HOptimization { // Finalizes reduction and induction phis' inputs for the newly created loop. void FinalizePhisForNewLoop(HPhi* phi, HInstruction* lo); + // Creates empty predicate info object for each basic block and puts it into the map. + void PreparePredicateInfoMap(LoopNode* node); + + // Set up block true/false predicates using info, collected through data flow and control + // dependency analysis. + void InitPredicateInfoMap(LoopNode* node, HVecPredSetOperation* loop_main_pred); + // Performs instruction synthesis for the loop body. void GenerateNewLoopBodyOnce(LoopNode* node, - HBasicBlock* body, DataType::Type induc_type, HInstruction* step); @@ -300,10 +409,10 @@ class HLoopOptimization : public HOptimization { void GenerateVecReductionPhi(HPhi* phi); void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); - void GenerateVecOp(HInstruction* org, - HInstruction* opa, - HInstruction* opb, - DataType::Type type); + HInstruction* GenerateVecOp(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + DataType::Type type); // Vectorization idioms. 
bool VectorizeSaturationIdiom(LoopNode* node, @@ -326,6 +435,10 @@ class HLoopOptimization : public HOptimization { bool generate_code, DataType::Type type, uint64_t restrictions); + bool VectorizeIfCondition(LoopNode* node, + HInstruction* instruction, + bool generate_code, + uint64_t restrictions); // Vectorization heuristics. Alignment ComputeAlignment(HInstruction* offset, @@ -435,13 +548,17 @@ class HLoopOptimization : public HOptimization { // for loop reductions). ScopedArenaSet<HInstruction*>* vector_external_set_; + // A mapping between a basic block of the original loop and its associated PredicateInfo. + // + // Only used in predicated loop vectorization mode. + ScopedArenaSafeMap<HBasicBlock*, BlockPredicateInfo*>* predicate_info_map_; + // Temporary vectorization bookkeeping. VectorMode vector_mode_; // synthesis mode HBasicBlock* vector_preheader_; // preheader of the new loop HBasicBlock* vector_header_; // header of the new loop HBasicBlock* vector_body_; // body of the new loop HInstruction* vector_index_; // normalized index of the new loop - HInstruction* loop_main_pred_; // Loop main predicate - for predicated mode. // Helper for target-specific behaviour for loop optimizations. 
ArchNoOptsLoopHelper* arch_loop_helper_; diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index 83b58763a4..9caa766858 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -2797,8 +2797,11 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { if (HasMonitorOperations()) { outer_graph->SetHasMonitorOperations(true); } - if (HasSIMD()) { - outer_graph->SetHasSIMD(true); + if (HasTraditionalSIMD()) { + outer_graph->SetHasTraditionalSIMD(true); + } + if (HasPredicatedSIMD()) { + outer_graph->SetHasPredicatedSIMD(true); } if (HasAlwaysThrowingInvokes()) { outer_graph->SetHasAlwaysThrowingInvokes(true); @@ -3125,6 +3128,8 @@ void HGraph::TransformLoopHeaderForBCE(HBasicBlock* header) { new_pre_header, old_pre_header, /* replace_if_back_edge= */ false); } +// Creates a new two-basic-block loop and inserts it between original loop header and +// original loop exit; also adjusts dominators, post order and new LoopInformation. 
HBasicBlock* HGraph::TransformLoopForVectorization(HBasicBlock* header, HBasicBlock* body, HBasicBlock* exit) { diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 9643e4c789..dec86e30dd 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -403,7 +403,8 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { has_bounds_checks_(false), has_try_catch_(false), has_monitor_operations_(false), - has_simd_(false), + has_traditional_simd_(false), + has_predicated_simd_(false), has_loops_(false), has_irreducible_loops_(false), has_direct_critical_native_call_(false), @@ -708,8 +709,13 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { bool HasMonitorOperations() const { return has_monitor_operations_; } void SetHasMonitorOperations(bool value) { has_monitor_operations_ = value; } - bool HasSIMD() const { return has_simd_; } - void SetHasSIMD(bool value) { has_simd_ = value; } + bool HasTraditionalSIMD() { return has_traditional_simd_; } + void SetHasTraditionalSIMD(bool value) { has_traditional_simd_ = value; } + + bool HasPredicatedSIMD() { return has_predicated_simd_; } + void SetHasPredicatedSIMD(bool value) { has_predicated_simd_ = value; } + + bool HasSIMD() const { return has_traditional_simd_ || has_predicated_simd_; } bool HasLoops() const { return has_loops_; } void SetHasLoops(bool value) { has_loops_ = value; } @@ -822,10 +828,11 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { // DexRegisterMap to be present to allow deadlock analysis for non-debuggable code. bool has_monitor_operations_; - // Flag whether SIMD instructions appear in the graph. If true, the - // code generators may have to be more careful spilling the wider + // Flags whether SIMD (traditional or predicated) instructions appear in the graph. + // If either is true, the code generators may have to be more careful spilling the wider // contents of SIMD registers. 
- bool has_simd_; + bool has_traditional_simd_; + bool has_predicated_simd_; // Flag whether there are any loops in the graph. We can skip loop // optimization if it's false. @@ -1636,7 +1643,9 @@ class HLoopInformationOutwardIterator : public ValueObject { M(VecStore, VecMemoryOperation) \ M(VecPredSetAll, VecPredSetOperation) \ M(VecPredWhile, VecPredSetOperation) \ - M(VecPredCondition, VecOperation) \ + M(VecPredToBoolean, VecOperation) \ + M(VecCondition, VecPredSetOperation) \ + M(VecPredNot, VecPredSetOperation) \ #define FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M) \ FOR_EACH_CONCRETE_INSTRUCTION_SCALAR_COMMON(M) \ @@ -8634,7 +8643,7 @@ class CloneAndReplaceInstructionVisitor final : public HGraphDelegateVisitor { DISALLOW_COPY_AND_ASSIGN(CloneAndReplaceInstructionVisitor); }; -// Iterator over the blocks that art part of the loop. Includes blocks part +// Iterator over the blocks that are part of the loop; includes blocks which are part // of an inner loop. The order in which the blocks are iterated is on their // block id. class HBlocksInLoopIterator : public ValueObject { @@ -8667,7 +8676,7 @@ class HBlocksInLoopIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopIterator); }; -// Iterator over the blocks that art part of the loop. Includes blocks part +// Iterator over the blocks that are part of the loop; includes blocks which are part // of an inner loop. The order in which the blocks are iterated is reverse // post order. class HBlocksInLoopReversePostOrderIterator : public ValueObject { @@ -8700,6 +8709,39 @@ class HBlocksInLoopReversePostOrderIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopReversePostOrderIterator); }; +// Iterator over the blocks that are part of the loop; includes blocks which are part +// of an inner loop. The order in which the blocks are iterated is post order. 
+class HBlocksInLoopPostOrderIterator : public ValueObject { + public: + explicit HBlocksInLoopPostOrderIterator(const HLoopInformation& info) + : blocks_in_loop_(info.GetBlocks()), + blocks_(info.GetHeader()->GetGraph()->GetReversePostOrder()), + index_(blocks_.size() - 1) { + if (!blocks_in_loop_.IsBitSet(blocks_[index_]->GetBlockId())) { + Advance(); + } + } + + bool Done() const { return index_ < 0; } + HBasicBlock* Current() const { return blocks_[index_]; } + void Advance() { + --index_; + for (; index_ >= 0; --index_) { + if (blocks_in_loop_.IsBitSet(blocks_[index_]->GetBlockId())) { + break; + } + } + } + + private: + const BitVector& blocks_in_loop_; + const ArenaVector<HBasicBlock*>& blocks_; + + int32_t index_; + + DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopPostOrderIterator); +}; + // Returns int64_t value of a properly typed constant. inline int64_t Int64FromConstant(HConstant* constant) { if (constant->IsIntConstant()) { diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 73f6c40a0d..6a60d6be01 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -1384,8 +1384,8 @@ class HVecPredWhile final : public HVecPredSetOperation { static constexpr size_t kCondKind = HVecOperation::kNumberOfVectorOpPackedBits; static constexpr size_t kCondKindSize = MinimumBitsToStore(static_cast<size_t>(CondKind::kLast)); - static constexpr size_t kNumberOfVecPredConditionPackedBits = kCondKind + kCondKindSize; - static_assert(kNumberOfVecPredConditionPackedBits <= kMaxNumberOfPackedBits, + static constexpr size_t kNumberOfVecPredWhilePackedBits = kCondKind + kCondKindSize; + static_assert(kNumberOfVecPredWhilePackedBits <= kMaxNumberOfPackedBits, "Too many packed fields."); using CondKindField = BitField<CondKind, kCondKind, kCondKindSize>; @@ -1395,13 +1395,13 @@ class HVecPredWhile final : public HVecPredSetOperation { // Evaluates the predicate condition (PCondKind) for a vector predicate; outputs // 
a scalar boolean value result. // -// Note: as VecPredCondition can be also predicated, only active elements (determined by the +// Note: as VecPredToBoolean can be also predicated, only active elements (determined by the // instruction's governing predicate) of the input vector predicate are used for condition // evaluation. // // Note: this instruction is currently used as a workaround for the fact that IR instructions // can't have more than one output. -class HVecPredCondition final : public HVecOperation { +class HVecPredToBoolean final : public HVecOperation { public: // To get more info on the condition kinds please see "2.2 Process state, PSTATE" section of // "ARM Architecture Reference Manual Supplement. The Scalable Vector Extension (SVE), @@ -1418,13 +1418,13 @@ class HVecPredCondition final : public HVecOperation { kEnumLast = kPLast }; - HVecPredCondition(ArenaAllocator* allocator, + HVecPredToBoolean(ArenaAllocator* allocator, HInstruction* input, PCondKind pred_cond, DataType::Type packed_type, size_t vector_length, uint32_t dex_pc) - : HVecOperation(kVecPredCondition, + : HVecOperation(kVecPredToBoolean, allocator, packed_type, SideEffects::None(), @@ -1447,19 +1447,86 @@ class HVecPredCondition final : public HVecOperation { return GetPackedField<CondKindField>(); } - DECLARE_INSTRUCTION(VecPredCondition); + DECLARE_INSTRUCTION(VecPredToBoolean); protected: // Additional packed bits. 
static constexpr size_t kCondKind = HVecOperation::kNumberOfVectorOpPackedBits; static constexpr size_t kCondKindSize = MinimumBitsToStore(static_cast<size_t>(PCondKind::kEnumLast)); - static constexpr size_t kNumberOfVecPredConditionPackedBits = kCondKind + kCondKindSize; - static_assert(kNumberOfVecPredConditionPackedBits <= kMaxNumberOfPackedBits, + static constexpr size_t kNumberOfVecPredToBooleanPackedBits = kCondKind + kCondKindSize; + static_assert(kNumberOfVecPredToBooleanPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields."); using CondKindField = BitField<PCondKind, kCondKind, kCondKindSize>; - DEFAULT_COPY_CONSTRUCTOR(VecPredCondition); + DEFAULT_COPY_CONSTRUCTOR(VecPredToBoolean); +}; + +// Evaluates condition for pairwise elements in two input vectors and sets the result +// as an output predicate vector. +// +// viz. [ p1, .. , pn ] = [ x1 OP y1 , x2 OP y2, .. , xn OP yn] where OP is CondKind +// condition. +// +// Currently only kEqual is supported by this vector instruction - we don't even define +// the kCondType here. +// TODO: support other condition ops. +class HVecCondition final : public HVecPredSetOperation { + public: + HVecCondition(ArenaAllocator* allocator, + HInstruction* left, + HInstruction* right, + DataType::Type packed_type, + size_t vector_length, + uint32_t dex_pc) : + HVecPredSetOperation(kVecCondition, + allocator, + packed_type, + SideEffects::None(), + /* number_of_inputs= */ 2, + vector_length, + dex_pc) { + DCHECK(left->IsVecOperation()); + DCHECK(!left->IsVecPredSetOperation()); + DCHECK(right->IsVecOperation()); + DCHECK(!right->IsVecPredSetOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + + DECLARE_INSTRUCTION(VecCondition); + + protected: + DEFAULT_COPY_CONSTRUCTOR(VecCondition); +}; + +// Inverts every component in the predicate vector. +// +// viz. [ p1, .. , pn ] = [ !px1 , !px2 , .. , !pxn ]. 
+class HVecPredNot final : public HVecPredSetOperation { + public: + HVecPredNot(ArenaAllocator* allocator, + HInstruction* input, + DataType::Type packed_type, + size_t vector_length, + uint32_t dex_pc) : + HVecPredSetOperation(kVecPredNot, + allocator, + packed_type, + SideEffects::None(), + /* number_of_inputs= */ 1, + vector_length, + dex_pc) { + DCHECK(input->IsVecOperation()); + DCHECK(input->IsVecPredSetOperation()); + + SetRawInputAt(0, input); + } + + DECLARE_INSTRUCTION(VecPredNot); + + protected: + DEFAULT_COPY_CONSTRUCTOR(VecPredNot); }; } // namespace art diff --git a/test/530-checker-lse-simd/src/Main.java b/test/530-checker-lse-simd/src/Main.java index 619ac281b3..ec2faf51ec 100644 --- a/test/530-checker-lse-simd/src/Main.java +++ b/test/530-checker-lse-simd/src/Main.java @@ -38,10 +38,17 @@ public class Main { /// CHECK-NEXT: Sub /// CHECK-NEXT: Mul /// CHECK-NEXT: ArraySet + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NEXT: ArrayGet + // + /// CHECK-FI: /// CHECK-NEXT: LessThanOrEqual /// CHECK-NEXT: Select /// CHECK-NEXT: Add /// CHECK-NEXT: Goto loop:{{B\d+}} + // + // TODO: reenable LSE for graphs with Predicated SIMD. static double $noinline$vecgen(double a[], double b[], int n) { double norma = 0.0; int init = 1325; @@ -87,10 +94,17 @@ public class Main { /// CHECK-NEXT: ArrayGet /// CHECK-NEXT: Mul /// CHECK-NEXT: ArraySet + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NEXT: ArrayGet + // + /// CHECK-FI: /// CHECK-NEXT: ArrayLength /// CHECK-NEXT: BelowOrEqual // /// CHECK: Return + // + // TODO: reenable LSE for graphs with Predicated SIMD. 
static double $noinline$test02(double a[], int n) { double b[] = new double[n]; a[0] = a[0] / 2; @@ -120,7 +134,13 @@ public class Main { /// CHECK-NEXT: Return /// CHECK-START: double Main.$noinline$test03(int) load_store_elimination (after) - /// CHECK-NOT: ArrayGet loop:none + /// CHECK-IF: not hasIsaFeature("sve") + // + /// CHECK-NOT: ArrayGet loop:none + // + /// CHECK-FI: + // + // TODO: reenable LSE for graphs with Predicated SIMD. static double $noinline$test03(int n) { double a[] = new double[n]; double b[] = new double[n]; @@ -164,7 +184,14 @@ public class Main { /// CHECK: Add /// CHECK: Goto loop:{{B\d+}} // - /// CHECK-NOT: VecStore + + /// CHECK-IF: not hasIsaFeature("sve") + // + /// CHECK-NOT: VecStore + // + /// CHECK-FI: + // + // TODO: reenable LSE for graphs with Predicated SIMD. static double[] $noinline$test04(int n) { double a[] = new double[n]; double b[] = new double[n]; @@ -194,18 +221,14 @@ public class Main { /// CHECK: Goto loop:{{B\d+}} /// CHECK-START-ARM64: double[] Main.$noinline$test05(int) load_store_elimination (after) - /// CHECK-IF: not hasIsaFeature("sve") - // - // In NEON case there is a post-loop which prevents the store to be removed. - /// CHECK: VecStore - // - /// CHECK-FI: - // + /// CHECK: VecStore /// CHECK: VecStore /// CHECK: Add /// CHECK: Goto loop:{{B\d+}} // /// CHECK-NOT: VecStore + // + // TODO: reenable LSE for graphs with Predicated SIMD. static double[] $noinline$test05(int n) { double a[] = new double[n]; double b[] = new double[n]; @@ -249,7 +272,13 @@ public class Main { /// CHECK: VecAdd /// CHECK: VecStore // - /// CHECK-NOT: VecStore + /// CHECK-IF: not hasIsaFeature("sve") + // + /// CHECK-NOT: VecStore + // + /// CHECK-FI: + // + // TODO: reenable LSE for graphs with Predicated SIMD. 
static double[] $noinline$test06(int n) { double a[] = new double[n]; double b[] = new double[n]; diff --git a/test/661-checker-simd-cf-loops/Android.bp b/test/661-checker-simd-cf-loops/Android.bp new file mode 100644 index 0000000000..8d2c193c02 --- /dev/null +++ b/test/661-checker-simd-cf-loops/Android.bp @@ -0,0 +1,43 @@ +// Generated by `regen-test-files`. Do not edit manually. + +// Build rules for ART run-test `661-checker-simd-cf-loops`. + +package { + // See: http://go/android-license-faq + // A large-scale-change added 'default_applicable_licenses' to import + // all of the 'license_kinds' from "art_license" + // to get the below license kinds: + // SPDX-license-identifier-Apache-2.0 + default_applicable_licenses: ["art_license"], +} + +// Test's Dex code. +java_test { + name: "art-run-test-661-checker-simd-cf-loops", + defaults: ["art-run-test-defaults"], + test_config_template: ":art-run-test-target-template", + srcs: ["src/**/*.java"], + data: [ + ":art-run-test-661-checker-simd-cf-loops-expected-stdout", + ":art-run-test-661-checker-simd-cf-loops-expected-stderr", + ], + // Include the Java source files in the test's artifacts, to make Checker assertions + // available to the TradeFed test runner. + include_srcs: true, +} + +// Test's expected standard output. +genrule { + name: "art-run-test-661-checker-simd-cf-loops-expected-stdout", + out: ["art-run-test-661-checker-simd-cf-loops-expected-stdout.txt"], + srcs: ["expected-stdout.txt"], + cmd: "cp -f $(in) $(out)", +} + +// Test's expected standard error. 
+genrule { + name: "art-run-test-661-checker-simd-cf-loops-expected-stderr", + out: ["art-run-test-661-checker-simd-cf-loops-expected-stderr.txt"], + srcs: ["expected-stderr.txt"], + cmd: "cp -f $(in) $(out)", +} diff --git a/test/661-checker-simd-cf-loops/expected-stderr.txt b/test/661-checker-simd-cf-loops/expected-stderr.txt new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/test/661-checker-simd-cf-loops/expected-stderr.txt diff --git a/test/661-checker-simd-cf-loops/expected-stdout.txt b/test/661-checker-simd-cf-loops/expected-stdout.txt new file mode 100644 index 0000000000..b0aad4deb5 --- /dev/null +++ b/test/661-checker-simd-cf-loops/expected-stdout.txt @@ -0,0 +1 @@ +passed diff --git a/test/661-checker-simd-cf-loops/info.txt b/test/661-checker-simd-cf-loops/info.txt new file mode 100644 index 0000000000..bc7881d16a --- /dev/null +++ b/test/661-checker-simd-cf-loops/info.txt @@ -0,0 +1 @@ +Functional tests on vectorization of loops with control flow. diff --git a/test/661-checker-simd-cf-loops/src/Main.java b/test/661-checker-simd-cf-loops/src/Main.java new file mode 100644 index 0000000000..95c09490bb --- /dev/null +++ b/test/661-checker-simd-cf-loops/src/Main.java @@ -0,0 +1,746 @@ +/* + * Copyright (C) 2023 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests for autovectorization of loops with control flow. 
+ */ +public class Main { + + public static final int ARRAY_LENGTH = 128; + public static final int USED_ARRAY_LENGTH = ARRAY_LENGTH - 1; + + public static boolean[] booleanArray = new boolean[ARRAY_LENGTH]; + public static boolean[] booleanArray2 = new boolean[ARRAY_LENGTH]; + public static byte[] byteArray = new byte[ARRAY_LENGTH]; + public static short[] shortArray = new short[ARRAY_LENGTH]; + public static char[] charArray = new char[ARRAY_LENGTH]; + public static int[] intArray = new int[ARRAY_LENGTH]; + public static long[] longArray = new long[ARRAY_LENGTH]; + public static float[] floatArray = new float[ARRAY_LENGTH]; + public static double[] doubleArray = new double[ARRAY_LENGTH]; + + public static final int MAGIC_VALUE_A = 2; + public static final int MAGIC_VALUE_B = 10; + public static final int MAGIC_VALUE_C = 100; + + public static final int MAGIC_ADD_CONST = 99; + + public static final float MAGIC_FLOAT_VALUE_A = 2.0f; + public static final float MAGIC_FLOAT_VALUE_B = 10.0f; + public static final float MAGIC_FLOAT_VALUE_C = 100.0f; + + public static final float MAGIC_FLOAT_ADD_CONST = 99.0f; + + /// CHECK-START-ARM64: int Main.$compile$noinline$FullDiamond(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: <<C0:i\d+>> IntConstant 0 loop:none + /// CHECK-DAG: <<C4:i\d+>> IntConstant 4 loop:none + /// CHECK-DAG: <<C99:i\d+>> IntConstant 99 loop:none + /// CHECK-DAG: <<C100:i\d+>> IntConstant 100 loop:none + /// CHECK-DAG: <<Vec4:d\d+>> VecReplicateScalar [<<C4>>,{{j\d+}}] loop:none + /// CHECK-DAG: <<Vec99:d\d+>> VecReplicateScalar [<<C99>>,{{j\d+}}] loop:none + /// CHECK-DAG: <<Vec100:d\d+>> VecReplicateScalar [<<C100>>,{{j\d+}}] loop:none + // + /// CHECK-DAG: <<Phi:i\d+>> Phi [<<C0>>,{{i\d+}}] loop:<<Loop:B\d+>> outer_loop:none + /// CHECK-DAG: <<LoopP:j\d+>> VecPredWhile [<<Phi>>,{{i\d+}}] loop:<<Loop>> outer_loop:none + // + /// CHECK-DAG: <<Load1:d\d+>> VecLoad [<<Arr:l\d+>>,<<Phi>>,<<LoopP>>] loop:<<Loop>> 
outer_loop:none + /// CHECK-DAG: <<Cond:j\d+>> VecCondition [<<Load1>>,<<Vec100>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<CondR:j\d+>> VecPredNot [<<Cond>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<AddT:d\d+>> VecAdd [<<Load1>>,<<Vec99>>,<<CondR>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<StT:d\d+>> VecStore [<<Arr>>,<<Phi>>,<<AddT>>,<<CondR>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<StF:d\d+>> VecStore [<<Arr>>,<<Phi>>,{{d\d+}},<<Cond>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Ld2:d\d+>> VecLoad [<<Arr>>,<<Phi>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Add2:d\d+>> VecAdd [<<Ld2>>,<<Vec4>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<St21:d\d+>> VecStore [<<Arr>>,<<Phi>>,<<Add2>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + // + /// CHECK-ELSE: + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + public static int $compile$noinline$FullDiamond(int[] x) { + int i = 0; + for (; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } else { + x[i] += 3; + } + x[i] += 4; + } + return i; + } + + // + // Test various types. + // + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleBoolean(boolean[], boolean[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support extra condition types and boolean comparisons. 
+ public static void $compile$noinline$SimpleBoolean(boolean[] x, boolean[] y) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + boolean val = x[i]; + if (val != y[i]) { + x[i] |= y[i]; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleByte(byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleByte(byte[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + byte val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleUByte(byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleUByte(byte[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + if ((x[i] & 0xFF) != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleShort(short[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleShort(short[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + short val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleChar(char[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleChar(char[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + char val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleInt(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleInt(int[] x) { + for (int 
i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleLong(long[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support long comparisons. + public static void $compile$noinline$SimpleLong(long[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + long val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleFloat(float[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support FP comparisons. + public static void $compile$noinline$SimpleFloat(float[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + float val = x[i]; + if (val > 10.0f) { + x[i] += 99.1f; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleDouble(double[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support FP comparisons. + public static void $compile$noinline$SimpleDouble(double[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + double val = x[i]; + if (val != 10.0) { + x[i] += 99.1; + } + } + } + + // + // Narrowing types. 
+ // + + /// CHECK-START-ARM64: void Main.$compile$noinline$ByteConv(byte[], byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$ByteConv(byte[] x, byte[] y) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + byte val = (byte)(x[i] + 1); + if (val != y[i]) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$UByteAndWrongConst(byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // 'NarrowerOperands' not met: the constant is not a ubyte one. + public static void $compile$noinline$UByteAndWrongConst(byte[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + if ((x[i] & 0xFF) != (MAGIC_VALUE_C | 0x100)) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$ByteNoHiBits(byte[], byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // Check kNoHiBits case when "wider" operations cannot bring in higher order bits. + public static void $compile$noinline$ByteNoHiBits(byte[] x, byte[] y) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + byte val = x[i]; + if ((val >>> 3) != y[i]) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + // + // Test condition types. + // + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleBelow(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support other conditions. + public static void $compile$noinline$SimpleBelow(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val < MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + // + // Test vectorization idioms. 
+ // + + /// CHECK-START-ARM64: void Main.$compile$noinline$Select(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: vectorize loops with select in the body. + public static void $compile$noinline$Select(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + val += MAGIC_ADD_CONST; + } + x[i] = val; + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$Phi(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: vectorize loops with phis in the body. + public static void $compile$noinline$Phi(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + val += MAGIC_ADD_CONST; + x[i] += val; + } + x[i] += val; + } + } + + // TODO: when Phis are supported, test dotprod and sad idioms. + + /// CHECK-START-ARM64: int Main.$compile$noinline$Reduction(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: vectorize loops with phis and reductions in the body. + private static int $compile$noinline$Reduction(int[] x) { + int sum = 0; + for (int i = 0; i < ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + sum += val + x[i]; + } + } + return sum; + } + + /// CHECK-START-ARM64: int Main.$compile$noinline$ReductionBackEdge(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + // + // Reduction in the back edge block, non-CF-dependent. + public static int $compile$noinline$ReductionBackEdge(int[] x) { + int sum = 0; + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + sum += x[i]; + } + return sum; + } + + // + // Negative compile tests. 
+ // + + public static final int STENCIL_ARRAY_SIZE = 130; + + /// CHECK-START-ARM64: void Main.$compile$noinline$stencilAlike(int[], int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // This loop needs a runtime test for array references disambiguation and a scalar cleanup loop. + // Currently we can't generate a scalar clean up loop with control flow. + private static void $compile$noinline$stencilAlike(int[] a, int[] b) { + for (int i = 1; i < STENCIL_ARRAY_SIZE - 1; i++) { + int val0 = b[i - 1]; + int val1 = b[i]; + int val2 = b[i + 1]; + int un = a[i]; + if (val1 != MAGIC_VALUE_C) { + a[i] = val0 + val1 + val2; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$NotDiamondCf(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // Loops with complex CF are not supported. + public static void $compile$noinline$NotDiamondCf(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + if (val != 1234) { + x[i] += MAGIC_ADD_CONST; + } + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$BrokenInduction(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$BrokenInduction(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + i++; + } + } + } + + // + // Main driver. + // + + public static void main(String[] args) { + initIntArray(intArray); + int final_ind_value = $compile$noinline$FullDiamond(intArray); + expectIntEquals(23755, IntArraySum(intArray)); + expectIntEquals(USED_ARRAY_LENGTH, final_ind_value); + + // Types. 
+ initBooleanArray(booleanArray); + booleanArray2[12] = true; + $compile$noinline$SimpleBoolean(booleanArray, booleanArray2); + expectIntEquals(86, BooleanArraySum(booleanArray)); + + initByteArray(byteArray); + $compile$noinline$SimpleByte(byteArray); + expectIntEquals(-64, ByteArraySum(byteArray)); + + initByteArray(byteArray); + $compile$noinline$SimpleUByte(byteArray); + expectIntEquals(-64, ByteArraySum(byteArray)); + + initShortArray(shortArray); + $compile$noinline$SimpleShort(shortArray); + expectIntEquals(23121, ShortArraySum(shortArray)); + + initCharArray(charArray); + $compile$noinline$SimpleChar(charArray); + expectIntEquals(23121, CharArraySum(charArray)); + + initIntArray(intArray); + $compile$noinline$SimpleInt(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + + initLongArray(longArray); + $compile$noinline$SimpleLong(longArray); + expectLongEquals(23121, LongArraySum(longArray)); + + initFloatArray(floatArray); + $compile$noinline$SimpleFloat(floatArray); + expectFloatEquals(18868.2f, FloatArraySum(floatArray)); + + initDoubleArray(doubleArray); + $compile$noinline$SimpleDouble(doubleArray); + expectDoubleEquals(23129.5, DoubleArraySum(doubleArray)); + + // Narrowing types. + initByteArray(byteArray); + $compile$noinline$ByteConv(byteArray, byteArray); + expectIntEquals(-2, ByteArraySum(byteArray)); + + initByteArray(byteArray); + $compile$noinline$UByteAndWrongConst(byteArray); + expectIntEquals(-2, ByteArraySum(byteArray)); + + initByteArray(byteArray); + $compile$noinline$ByteNoHiBits(byteArray, byteArray); + expectIntEquals(-2, ByteArraySum(byteArray)); + + // Conditions. + initIntArray(intArray); + $compile$noinline$SimpleBelow(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + + // Idioms. 
+ initIntArray(intArray); + $compile$noinline$Select(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + + initIntArray(intArray); + $compile$noinline$Phi(intArray); + expectIntEquals(36748, IntArraySum(intArray)); + + int reduction_result = 0; + + initIntArray(intArray); + reduction_result = $compile$noinline$Reduction(intArray); + expectIntEquals(14706, IntArraySum(intArray)); + expectIntEquals(21012, reduction_result); + + initIntArray(intArray); + reduction_result = $compile$noinline$ReductionBackEdge(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + expectIntEquals(13121, reduction_result); + + int[] stencilArrayA = new int[STENCIL_ARRAY_SIZE]; + int[] stencilArrayB = new int[STENCIL_ARRAY_SIZE]; + initIntArray(stencilArrayA); + initIntArray(stencilArrayB); + $compile$noinline$stencilAlike(stencilArrayA, stencilArrayB); + expectIntEquals(43602, IntArraySum(stencilArrayA)); + + initIntArray(intArray); + $compile$noinline$NotDiamondCf(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + + initIntArray(intArray); + $compile$noinline$BrokenInduction(intArray); + expectIntEquals(18963, IntArraySum(intArray)); + + System.out.println("passed"); + } + + public static void initBooleanArray(boolean[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 != 0) { + a[i] = true; + } + } + } + + public static void initByteArray(byte[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = (byte)MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = (byte)MAGIC_VALUE_B; + } else { + a[i] = (byte)MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 127; + } + + public static void initShortArray(short[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = (short)MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = (short)MAGIC_VALUE_B; + } else { + a[i] = (short)MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000; + } + + public static void initCharArray(char[] a) { + for (int i = 0; i < ARRAY_LENGTH; 
i++) { + if (i % 3 == 0) { + a[i] = (char)MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = (char)MAGIC_VALUE_B; + } else { + a[i] = (char)MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000; + } + + public static void initIntArray(int[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = MAGIC_VALUE_B; + } else { + a[i] = MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000; + } + + public static void initLongArray(long[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = MAGIC_VALUE_B; + } else { + a[i] = MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000; + } + + public static void initFloatArray(float[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = MAGIC_FLOAT_VALUE_A; + } else if (i % 3 == 1) { + a[i] = MAGIC_FLOAT_VALUE_B; + } else { + a[i] = MAGIC_FLOAT_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000.0f; + } + + public static void initDoubleArray(double[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = MAGIC_FLOAT_VALUE_A; + } else if (i % 3 == 1) { + a[i] = MAGIC_FLOAT_VALUE_B; + } else { + a[i] = MAGIC_FLOAT_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000.0f; + } + + public static byte BooleanArraySum(boolean[] a) { + byte sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i] ? 
1 : 0; + } + return sum; + } + + public static byte ByteArraySum(byte[] a) { + byte sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static short ShortArraySum(short[] a) { + short sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static char CharArraySum(char[] a) { + char sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static int IntArraySum(int[] a) { + int sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static long LongArraySum(long[] a) { + long sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static float FloatArraySum(float[] a) { + float sum = 0.0f; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static double DoubleArraySum(double[] a) { + double sum = 0.0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + private static void expectIntEquals(int expected, int result) { + if (expected != result) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } + + private static void expectLongEquals(long expected, long result) { + if (expected != result) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } + + private static void expectFloatEquals(float expected, float result) { + final float THRESHOLD = .1f; + if (Math.abs(expected - result) >= THRESHOLD) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } + + private static void expectDoubleEquals(double expected, double result) { + final double THRESHOLD = .1; + if (Math.abs(expected - result) >= THRESHOLD) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } +} |