Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc              |  66
-rw-r--r--  compiler/optimizing/code_generator_arm64.h               |  16
-rw-r--r--  compiler/optimizing/code_generator_vector_arm64_neon.cc  |  59
-rw-r--r--  compiler/optimizing/code_generator_vector_arm64_sve.cc   | 203
-rw-r--r--  compiler/optimizing/common_arm64.h                       |   8
-rw-r--r--  compiler/optimizing/loop_optimization.cc                 |  33
-rw-r--r--  compiler/optimizing/loop_optimization.h                  |   2
7 files changed, 210 insertions, 177 deletions
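The overall shape of the change: the ARM64 SIMD register width becomes a runtime query (16-byte Q registers for NEON, the hardware vector length for predicated SVE), and slow-path live-register save/restore moves behind virtual helpers implemented once per backend via a bool-templated worker. A simplified, hypothetical sketch of that dispatch pattern follows — stand-in names only, with both roles folded into one class rather than the real CodeGeneratorARM64 / InstructionCodeGeneratorARM64 split:

#include <cstddef>
#include <cstdint>
#include <iostream>

// Stand-in for the ISA feature query used by the real code generator.
struct IsaFeatures {
  bool has_sve;
  size_t sve_vector_length_bits;  // e.g. 256 on a 256-bit SVE implementation
};

class SimdCodeGen {
 public:
  explicit SimdCodeGen(IsaFeatures features) : features_(features) {}
  virtual ~SimdCodeGen() = default;

  // Width now depends on the target: the full SVE vector for the predicated
  // backend, a fixed 16-byte Q register for NEON.
  size_t GetSIMDRegisterWidth() const {
    return features_.has_sve ? features_.sve_vector_length_bits / 8 : 16u;
  }

  // Each backend supplies its own spill/restore code (NEON pairs of
  // V registers vs. SVE loads/stores of Z registers).
  virtual void SaveLiveRegistersHelper(int64_t spill_offset) = 0;
  virtual void RestoreLiveRegistersHelper(int64_t spill_offset) = 0;

 private:
  IsaFeatures features_;
};

// One worker per backend, parameterized on a compile-time bool, mirroring the
// SaveRestoreLiveRegistersHelper{Neon,Sve}Impl templates in the patch.
template <bool is_save>
void SaveRestoreImpl(const char* backend, int64_t spill_offset) {
  std::cout << backend << (is_save ? ": save at offset " : ": restore at offset ")
            << spill_offset << '\n';
}

class SveCodeGen : public SimdCodeGen {
 public:
  using SimdCodeGen::SimdCodeGen;
  void SaveLiveRegistersHelper(int64_t off) override { SaveRestoreImpl<true>("SVE", off); }
  void RestoreLiveRegistersHelper(int64_t off) override { SaveRestoreImpl<false>("SVE", off); }
};

int main() {
  SveCodeGen sve({/*has_sve=*/true, /*sve_vector_length_bits=*/256});
  std::cout << "SIMD width: " << sve.GetSIMDRegisterWidth() << " bytes\n";  // 32
  sve.SaveLiveRegistersHelper(/*spill_offset=*/64);
  sve.RestoreLiveRegistersHelper(/*spill_offset=*/64);
}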
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index f5d78367fe..bdc5e2d9a3 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -169,54 +169,6 @@ static RegisterSet OneRegInReferenceOutSaveEverythingCallerSaves() { #define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, x).Int32Value() -// Calculate memory accessing operand for save/restore live registers. -static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, - LocationSummary* locations, - int64_t spill_offset, - bool is_save) { - const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); - const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); - DCHECK(ArtVixlRegCodeCoherentForRegSet(core_spills, - codegen->GetNumberOfCoreRegisters(), - fp_spills, - codegen->GetNumberOfFloatingPointRegisters())); - - CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); - const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8; - DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes); - CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills); - - MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler(); - UseScratchRegisterScope temps(masm); - - Register base = masm->StackPointer(); - int64_t core_spill_size = core_list.GetTotalSizeInBytes(); - int64_t fp_spill_size = fp_list.GetTotalSizeInBytes(); - int64_t reg_size = kXRegSizeInBytes; - int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size; - uint32_t ls_access_size = WhichPowerOf2(reg_size); - if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) && - !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) { - // If the offset does not fit in the instruction's immediate field, use an alternate register - // to compute the base address(float point registers spill base address). 
- Register new_base = temps.AcquireSameSizeAs(base); - __ Add(new_base, base, Operand(spill_offset + core_spill_size)); - base = new_base; - spill_offset = -core_spill_size; - int64_t new_max_ls_pair_offset = fp_spill_size - 2 * reg_size; - DCHECK(masm->IsImmLSPair(spill_offset, ls_access_size)); - DCHECK(masm->IsImmLSPair(new_max_ls_pair_offset, ls_access_size)); - } - - if (is_save) { - __ StoreCPURegList(core_list, MemOperand(base, spill_offset)); - __ StoreCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); - } else { - __ LoadCPURegList(core_list, MemOperand(base, spill_offset)); - __ LoadCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); - } -} - void SlowPathCodeARM64::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) { size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath(); const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); @@ -240,15 +192,15 @@ void SlowPathCodeARM64::SaveLiveRegisters(CodeGenerator* codegen, LocationSummar stack_offset += fp_reg_size; } - SaveRestoreLiveRegistersHelper(codegen, - locations, - codegen->GetFirstRegisterSlotInSlowPath(), /* is_save= */ true); + InstructionCodeGeneratorARM64* visitor = + down_cast<CodeGeneratorARM64*>(codegen)->GetInstructionCodeGeneratorArm64(); + visitor->SaveLiveRegistersHelper(locations, codegen->GetFirstRegisterSlotInSlowPath()); } void SlowPathCodeARM64::RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) { - SaveRestoreLiveRegistersHelper(codegen, - locations, - codegen->GetFirstRegisterSlotInSlowPath(), /* is_save= */ false); + InstructionCodeGeneratorARM64* visitor = + down_cast<CodeGeneratorARM64*>(codegen)->GetInstructionCodeGeneratorArm64(); + visitor->RestoreLiveRegistersHelper(locations, codegen->GetFirstRegisterSlotInSlowPath()); } class BoundsCheckSlowPathARM64 : public SlowPathCodeARM64 { @@ -997,6 +949,12 @@ bool CodeGeneratorARM64::ShouldUseSVE() const { return GetInstructionSetFeatures().HasSVE(); } +size_t CodeGeneratorARM64::GetSIMDRegisterWidth() const { + return SupportsPredicatedSIMD() + ? 
GetInstructionSetFeatures().GetSVEVectorLength() / kBitsPerByte + : vixl::aarch64::kQRegSizeInBytes; +} + #define __ GetVIXLAssembler()-> void CodeGeneratorARM64::EmitJumpTables() { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index eb3e9546e0..d4546e5bd5 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -309,6 +309,10 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { virtual void LoadSIMDRegFromStack(Location destination, Location source) = 0; virtual void MoveSIMDRegToSIMDReg(Location destination, Location source) = 0; virtual void MoveToSIMDStackSlot(Location destination, Location source) = 0; + virtual void SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) = 0; + virtual void RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) = 0; protected: void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, @@ -462,6 +466,8 @@ class InstructionCodeGeneratorARM64Neon : public InstructionCodeGeneratorARM64 { void LoadSIMDRegFromStack(Location destination, Location source) override; void MoveSIMDRegToSIMDReg(Location destination, Location source) override; void MoveToSIMDStackSlot(Location destination, Location source) override; + void SaveLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; + void RestoreLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; }; class LocationsBuilderARM64Neon : public LocationsBuilderARM64 { @@ -495,8 +501,14 @@ class InstructionCodeGeneratorARM64Sve : public InstructionCodeGeneratorARM64 { void LoadSIMDRegFromStack(Location destination, Location source) override; void MoveSIMDRegToSIMDReg(Location destination, Location source) override; void MoveToSIMDStackSlot(Location destination, Location source) override; + void SaveLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; + void RestoreLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; private: + // Validate that instruction vector length and packed type are compliant with the SIMD + // register size (full SIMD register is used). + void ValidateVectorLength(HVecOperation* instr) const; + // Returns default predicate register which is used as governing vector predicate // to implement predicated loop execution. // @@ -579,9 +591,7 @@ class CodeGeneratorARM64 : public CodeGenerator { return vixl::aarch64::kDRegSizeInBytes; } - size_t GetSIMDRegisterWidth() const override { - return vixl::aarch64::kQRegSizeInBytes; - } + size_t GetSIMDRegisterWidth() const override; uintptr_t GetAddressOf(HBasicBlock* block) override { vixl::aarch64::Label* block_entry_label = GetLabelOf(block); diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc index bd64166655..0fe9898635 100644 --- a/compiler/optimizing/code_generator_vector_arm64_neon.cc +++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc @@ -17,6 +17,7 @@ #include "code_generator_arm64.h" #include "arch/arm64/instruction_set_features_arm64.h" +#include "base/bit_utils_iterator.h" #include "mirror/array-inl.h" #include "mirror/string.h" @@ -1590,6 +1591,64 @@ void InstructionCodeGeneratorARM64Neon::MoveToSIMDStackSlot(Location destination } } +// Calculate memory accessing operand for save/restore live registers. 
+template <bool is_save> +void SaveRestoreLiveRegistersHelperNeonImpl(CodeGeneratorARM64* codegen, + LocationSummary* locations, + int64_t spill_offset) { + const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); + const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); + DCHECK(helpers::ArtVixlRegCodeCoherentForRegSet(core_spills, + codegen->GetNumberOfCoreRegisters(), + fp_spills, + codegen->GetNumberOfFloatingPointRegisters())); + + CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); + const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8; + DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes); + CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills); + + MacroAssembler* masm = codegen->GetVIXLAssembler(); + UseScratchRegisterScope temps(masm); + + Register base = masm->StackPointer(); + int64_t core_spill_size = core_list.GetTotalSizeInBytes(); + int64_t fp_spill_size = fp_list.GetTotalSizeInBytes(); + int64_t reg_size = kXRegSizeInBytes; + int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size; + uint32_t ls_access_size = WhichPowerOf2(reg_size); + if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) && + !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) { + // If the offset does not fit in the instruction's immediate field, use an alternate register + // to compute the base address(float point registers spill base address). + Register new_base = temps.AcquireSameSizeAs(base); + masm->Add(new_base, base, Operand(spill_offset + core_spill_size)); + base = new_base; + spill_offset = -core_spill_size; + int64_t new_max_ls_pair_offset = fp_spill_size - 2 * reg_size; + DCHECK(masm->IsImmLSPair(spill_offset, ls_access_size)); + DCHECK(masm->IsImmLSPair(new_max_ls_pair_offset, ls_access_size)); + } + + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + masm->StoreCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + masm->LoadCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); + } +} + +void InstructionCodeGeneratorARM64Neon::SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperNeonImpl</* is_save= */ true>(codegen_, locations, spill_offset); +} + +void InstructionCodeGeneratorARM64Neon::RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperNeonImpl</* is_save= */ false>(codegen_, locations, spill_offset); +} + #undef __ } // namespace arm64 diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc index 2254673337..824b6c9476 100644 --- a/compiler/optimizing/code_generator_vector_arm64_sve.cc +++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc @@ -17,6 +17,7 @@ #include "code_generator_arm64.h" #include "arch/arm64/instruction_set_features_arm64.h" +#include "base/bit_utils_iterator.h" #include "mirror/array-inl.h" #include "mirror/string.h" @@ -33,6 +34,7 @@ using helpers::LocationFrom; using helpers::OutputRegister; using helpers::QRegisterFrom; using helpers::StackOperandFrom; +using helpers::SveStackOperandFrom; using helpers::VRegisterFrom; using helpers::ZRegisterFrom; using helpers::XRegisterFrom; @@ -71,6 +73,11 @@ inline Location 
SVEEncodableConstantOrRegister(HInstruction* constant, HInstruct return Location::RequiresRegister(); } +void InstructionCodeGeneratorARM64Sve::ValidateVectorLength(HVecOperation* instr) const { + DCHECK_EQ(DataType::Size(instr->GetPackedType()) * instr->GetVectorLength(), + codegen_->GetSIMDRegisterWidth()); +} + void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); HInstruction* input = instruction->InputAt(0); @@ -107,11 +114,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal LocationSummary* locations = instruction->GetLocations(); Location src_loc = locations->InAt(0); const ZRegister dst = ZRegisterFrom(locations->Out()); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnB(), Int64FromLocation(src_loc)); } else { @@ -120,7 +127,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnH(), Int64FromLocation(src_loc)); } else { @@ -128,7 +134,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnS(), Int64FromLocation(src_loc)); } else { @@ -136,7 +141,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnD(), Int64FromLocation(src_loc)); } else { @@ -144,7 +148,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Fdup(dst.VnS(), src_loc.GetConstant()->AsFloatConstant()->GetValue()); } else { @@ -152,7 +155,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Fdup(dst.VnD(), src_loc.GetConstant()->AsDoubleConstant()->GetValue()); } else { @@ -193,19 +195,16 @@ void InstructionCodeGeneratorARM64Sve::VisitVecExtractScalar(HVecExtractScalar* DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); const VRegister src = VRegisterFrom(locations->InAt(0)); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Umov(OutputRegister(instruction), src.V4S(), 0); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Umov(OutputRegister(instruction), src.V2D(), 0); break; case DataType::Type::kFloat32: case DataType::Type::kFloat64: - DCHECK_LE(2u, instruction->GetVectorLength()); - DCHECK_LE(instruction->GetVectorLength(), 4u); DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required break; default: @@ -251,9 +250,9 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { const ZRegister src = 
ZRegisterFrom(locations->InAt(0)); const VRegister dst = DRegisterFrom(locations->Out()); const PRegister p_reg = LoopPReg(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); switch (instruction->GetReductionKind()) { case HVecReduce::kSum: __ Saddv(dst.S(), p_reg, src.VnS()); @@ -264,7 +263,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { } break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); switch (instruction->GetReductionKind()) { case HVecReduce::kSum: __ Uaddv(dst.D(), p_reg, src.VnD()); @@ -292,8 +290,8 @@ void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) { const PRegisterM p_reg = LoopPReg().Merging(); DataType::Type from = instruction->GetInputType(); DataType::Type to = instruction->GetResultType(); + ValidateVectorLength(instruction); if (from == DataType::Type::kInt32 && to == DataType::Type::kFloat32) { - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Scvtf(dst.VnS(), p_reg, src.VnS()); } else { LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -310,31 +308,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) { const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Neg(dst.VnB(), p_reg, src.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Neg(dst.VnH(), p_reg, src.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Neg(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Neg(dst.VnD(), p_reg, src.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fneg(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fneg(dst.VnD(), p_reg, src.VnD()); break; default: @@ -353,29 +346,24 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) { const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Abs(dst.VnB(), p_reg, src.VnB()); break; case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Abs(dst.VnH(), p_reg, src.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Abs(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Abs(dst.VnD(), p_reg, src.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fabs(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fabs(dst.VnD(), p_reg, src.VnD()); break; default: @@ -394,9 +382,9 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) { const ZRegister src = 
ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: // special case boolean-not - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Dup(dst.VnB(), 1); __ Eor(dst.VnB(), p_reg, dst.VnB(), src.VnB()); break; @@ -454,31 +442,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Add(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Add(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Add(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Add(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fadd(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fadd(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation); break; default: @@ -518,31 +501,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Sub(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Sub(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Sub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Sub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fsub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fsub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: @@ -572,31 +550,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Mul(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Mul(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, 
instruction->GetVectorLength()); __ Mul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Mul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fmul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fmul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation); break; default: @@ -616,15 +589,14 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); // Note: VIXL guarantees StrictNaNPropagation for Fdiv. switch (instruction->GetPackedType()) { case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fdiv(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fdiv(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: @@ -665,6 +637,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -709,6 +682,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -744,6 +718,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -799,23 +774,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Lsl(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Lsl(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Lsl(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Lsl(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -835,23 +807,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = 
locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Asr(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Asr(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Asr(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Asr(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -871,23 +840,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Lsr(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Lsr(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Lsr(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Lsr(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -943,26 +909,23 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSetScalars(HVecSetScalars* instru if (IsZeroBitPattern(instruction->InputAt(0))) { return; } + ValidateVectorLength(instruction); // Set required elements. 
switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Mov(dst.V16B(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Mov(dst.V8H(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Mov(dst.V4S(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Mov(dst.V2D(), 0, InputRegisterAt(instruction, 0)); break; default: @@ -1009,11 +972,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( const PRegisterM p_reg = LoopPReg().Merging(); DCHECK(locations->InAt(0).Equals(locations->Out())); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB()); } else { @@ -1022,7 +985,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnH(), p_reg, acc.VnB(), left.VnH(), right.VnH()); } else { @@ -1030,7 +992,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( } break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnS(), p_reg, acc.VnB(), left.VnS(), right.VnS()); } else { @@ -1077,12 +1038,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), HVecOperation::ToSignedType(b->GetPackedType())); DCHECK_EQ(instruction->GetPackedType(), DataType::Type::kInt32); - DCHECK_EQ(4u, instruction->GetVectorLength()); + ValidateVectorLength(instruction); size_t inputs_data_size = DataType::Size(a->GetPackedType()); switch (inputs_data_size) { case 1u: { - DCHECK_EQ(16u, a->GetVectorLength()); UseScratchRegisterScope temps(GetVIXLAssembler()); const ZRegister tmp0 = temps.AcquireZ(); const ZRegister tmp1 = ZRegisterFrom(locations->GetTemp(0)); @@ -1143,30 +1103,27 @@ void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) { UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; const PRegisterZ p_reg = LoopPReg().Zeroing(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt16: // (short) s.charAt(.) can yield HVecLoad/Int16/StringCharAt. 
case DataType::Type::kUint16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Ld1h(reg.VnH(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Ld1b(reg.VnB(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Ld1w(reg.VnS(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Ld1d(reg.VnD(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; @@ -1188,30 +1145,27 @@ void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) { UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; const PRegisterZ p_reg = LoopPReg().Zeroing(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ St1b(reg.VnB(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ St1h(reg.VnH(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ St1w(reg.VnS(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ St1d(reg.VnD(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; @@ -1237,22 +1191,18 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instru case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnB(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL); break; default: @@ -1295,17 +1245,19 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct Register left = InputRegisterAt(instruction, 0); Register right = InputRegisterAt(instruction, 1); - switch (instruction->GetVectorLength()) { - case 16u: + DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % instruction->GetVectorLength(), 0u); + + switch (codegen_->GetSIMDRegisterWidth() / instruction->GetVectorLength()) { + case 1u: __ Whilelo(LoopPReg().VnB(), left, right); break; - case 8u: + case 2u: __ Whilelo(LoopPReg().VnH(), left, right); break; case 4u: __ Whilelo(LoopPReg().VnS(), left, right); break; - case 2u: + case 8u: __ 
Whilelo(LoopPReg().VnD(), left, right); break; default: @@ -1333,52 +1285,103 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* Location InstructionCodeGeneratorARM64Sve::AllocateSIMDScratchLocation( vixl::aarch64::UseScratchRegisterScope* scope) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - return LocationFrom(scope->AcquireVRegisterOfSize(kQRegSize)); + return LocationFrom(scope->AcquireZ()); } void InstructionCodeGeneratorARM64Sve::FreeSIMDScratchLocation(Location loc, vixl::aarch64::UseScratchRegisterScope* scope) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - scope->Release(QRegisterFrom(loc)); + scope->Release(ZRegisterFrom(loc)); } void InstructionCodeGeneratorARM64Sve::LoadSIMDRegFromStack(Location destination, Location source) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - __ Ldr(QRegisterFrom(destination), StackOperandFrom(source)); + __ Ldr(ZRegisterFrom(destination), SveStackOperandFrom(source)); } void InstructionCodeGeneratorARM64Sve::MoveSIMDRegToSIMDReg(Location destination, Location source) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - __ Mov(QRegisterFrom(destination), QRegisterFrom(source)); + __ Mov(ZRegisterFrom(destination), ZRegisterFrom(source)); } void InstructionCodeGeneratorARM64Sve::MoveToSIMDStackSlot(Location destination, Location source) { DCHECK(destination.IsSIMDStackSlot()); - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); if (source.IsFpuRegister()) { - __ Str(QRegisterFrom(source), StackOperandFrom(destination)); + __ Str(ZRegisterFrom(source), SveStackOperandFrom(destination)); } else { DCHECK(source.IsSIMDStackSlot()); UseScratchRegisterScope temps(GetVIXLAssembler()); if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) { + // Very rare situation, only when there are cycles in ParallelMoveResolver graph. const Register temp = temps.AcquireX(); - __ Ldr(temp, MemOperand(sp, source.GetStackIndex())); - __ Str(temp, MemOperand(sp, destination.GetStackIndex())); - __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize)); - __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize)); + DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % kArm64WordSize, 0u); + // Emit a number of LDR/STR (XRegister, 64-bit) to cover the whole SIMD register size + // when copying a stack slot. 
+ for (size_t offset = 0, e = codegen_->GetSIMDRegisterWidth(); + offset < e; + offset += kArm64WordSize) { + __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + offset)); + __ Str(temp, MemOperand(sp, destination.GetStackIndex() + offset)); + } } else { - const VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize); - __ Ldr(temp, StackOperandFrom(source)); - __ Str(temp, StackOperandFrom(destination)); + const ZRegister temp = temps.AcquireZ(); + __ Ldr(temp, SveStackOperandFrom(source)); + __ Str(temp, SveStackOperandFrom(destination)); } } } +template <bool is_save> +void SaveRestoreLiveRegistersHelperSveImpl(CodeGeneratorARM64* codegen, + LocationSummary* locations, + int64_t spill_offset) { + const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); + const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); + DCHECK(helpers::ArtVixlRegCodeCoherentForRegSet(core_spills, + codegen->GetNumberOfCoreRegisters(), + fp_spills, + codegen->GetNumberOfFloatingPointRegisters())); + MacroAssembler* masm = codegen->GetVIXLAssembler(); + Register base = masm->StackPointer(); + + CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); + int64_t core_spill_size = core_list.GetTotalSizeInBytes(); + int64_t fp_spill_offset = spill_offset + core_spill_size; + + if (codegen->GetGraph()->HasSIMD()) { + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + } + codegen->GetAssembler()->SaveRestoreZRegisterList<is_save>(fp_spills, fp_spill_offset); + return; + } + + // Case when we only need to restore D-registers. + DCHECK(!codegen->GetGraph()->HasSIMD()); + DCHECK_LE(codegen->GetSlowPathFPWidth(), kDRegSizeInBytes); + CPURegList fp_list = CPURegList(CPURegister::kVRegister, kDRegSize, fp_spills); + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + masm->StoreCPURegList(fp_list, MemOperand(base, fp_spill_offset)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + masm->LoadCPURegList(fp_list, MemOperand(base, fp_spill_offset)); + } +} + +void InstructionCodeGeneratorARM64Sve::SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperSveImpl</* is_save= */ true>(codegen_, locations, spill_offset); +} + +void InstructionCodeGeneratorARM64Sve::RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperSveImpl</* is_save= */ false>(codegen_, locations, spill_offset); +} + #undef __ } // namespace arm64 diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h index 72207816e1..81c6561318 100644 --- a/compiler/optimizing/common_arm64.h +++ b/compiler/optimizing/common_arm64.h @@ -182,6 +182,10 @@ inline vixl::aarch64::MemOperand StackOperandFrom(Location location) { return vixl::aarch64::MemOperand(vixl::aarch64::sp, location.GetStackIndex()); } +inline vixl::aarch64::SVEMemOperand SveStackOperandFrom(Location location) { + return vixl::aarch64::SVEMemOperand(vixl::aarch64::sp, location.GetStackIndex()); +} + inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, size_t offset = 0) { // A heap reference must be 32bit, so fit in a W register. 
@@ -215,6 +219,10 @@ inline Location LocationFrom(const vixl::aarch64::VRegister& fpreg) { return Location::FpuRegisterLocation(fpreg.GetCode()); } +inline Location LocationFrom(const vixl::aarch64::ZRegister& zreg) { + return Location::FpuRegisterLocation(zreg.GetCode()); +} + inline vixl::aarch64::Operand OperandFromMemOperand( const vixl::aarch64::MemOperand& mem_op) { if (mem_op.IsImmediateOffset()) { diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 1210dbe67b..02ee4ec057 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -946,9 +946,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6 // make one particular reference aligned), never to exceed (1). // (3) variable to record how many references share same alignment. // (4) variable to record suitable candidate for dynamic loop peeling. - uint32_t desired_alignment = GetVectorSizeInBytes(); - DCHECK_LE(desired_alignment, 16u); - uint32_t peeling_votes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + size_t desired_alignment = GetVectorSizeInBytes(); + ScopedArenaVector<uint32_t> peeling_votes(desired_alignment, 0u, + loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + uint32_t max_num_same_alignment = 0; const ArrayReference* peeling_candidate = nullptr; @@ -1577,14 +1578,6 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, } uint32_t HLoopOptimization::GetVectorSizeInBytes() { - if (kIsDebugBuild) { - InstructionSet isa = compiler_options_->GetInstructionSet(); - // TODO: Remove this check when there are no implicit assumptions on the SIMD reg size. - DCHECK_EQ(simd_register_size_, (isa == InstructionSet::kArm || isa == InstructionSet::kThumb2) - ? 8u - : 16u); - } - return simd_register_size_; } @@ -1616,6 +1609,8 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict if (IsInPredicatedVectorizationMode()) { // SVE vectorization. 
CHECK(features->AsArm64InstructionSetFeatures()->HasSVE()); + size_t vector_length = simd_register_size_ / DataType::Size(type); + DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u); switch (type) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -1625,7 +1620,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict kNoUnsignedHAdd | kNoUnroundedHAdd | kNoSAD; - return TrySetVectorLength(type, 16); + return TrySetVectorLength(type, vector_length); case DataType::Type::kUint16: case DataType::Type::kInt16: *restrictions |= kNoDiv | @@ -1634,19 +1629,19 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict kNoUnroundedHAdd | kNoSAD | kNoDotProd; - return TrySetVectorLength(type, 8); + return TrySetVectorLength(type, vector_length); case DataType::Type::kInt32: *restrictions |= kNoDiv | kNoSAD; - return TrySetVectorLength(type, 4); + return TrySetVectorLength(type, vector_length); case DataType::Type::kInt64: *restrictions |= kNoDiv | kNoSAD; - return TrySetVectorLength(type, 2); + return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat32: *restrictions |= kNoReduction; - return TrySetVectorLength(type, 4); + return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat64: *restrictions |= kNoReduction; - return TrySetVectorLength(type, 2); + return TrySetVectorLength(type, vector_length); default: break; } @@ -2311,12 +2306,12 @@ Alignment HLoopOptimization::ComputeAlignment(HInstruction* offset, return Alignment(DataType::Size(type), 0); } -void HLoopOptimization::SetAlignmentStrategy(uint32_t peeling_votes[], +void HLoopOptimization::SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, const ArrayReference* peeling_candidate) { // Current heuristic: pick the best static loop peeling factor, if any, // or otherwise use dynamic loop peeling on suggested peeling candidate. uint32_t max_vote = 0; - for (int32_t i = 0; i < 16; i++) { + for (size_t i = 0; i < peeling_votes.size(); i++) { if (peeling_votes[i] > max_vote) { max_vote = peeling_votes[i]; vector_static_peeling_factor_ = i; diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 0d76804d9c..d3583ed8a6 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -238,7 +238,7 @@ class HLoopOptimization : public HOptimization { DataType::Type type, bool is_string_char_at, uint32_t peeling = 0); - void SetAlignmentStrategy(uint32_t peeling_votes[], + void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, const ArrayReference* peeling_candidate); uint32_t MaxNumberPeeled(); bool IsVectorizationProfitable(int64_t trip_count); |