author | 2020-04-27 21:02:28 +0100
---|---
committer | 2021-02-05 11:34:38 +0000
commit | 55ab7e84c4682c492b6fa18375b87ffc5d0b23bb (patch)
tree | 5fcc2567a1a4e6ae73dead2f70c69bc03b0a64bb /compiler/optimizing
parent | ac27ac01490f53f9e2413dc9b66fbb2880904c96 (diff)
ARM64: Support SVE VL other than 128-bit.
Arm SVE registers are not fixed in size and can be any
multiple of 128 bits. To support this, the patch removes
the explicit assumption that SIMD registers are 128 bits
wide from the vectorizer and the code generators, and
enables autovectorization for a configurable SVE vector
length, e.g. by extending the SIMD register save/restore
routines.
Test: art SIMD tests on VIXL simulator.
Test: art tests on FVP (steps in test/README.arm_fvp.md)
with FVP arg:
-C SVE.ScalableVectorExtension.veclen=[2,4]
(SVE vector [128,256] bits wide)
Change-Id: Icb46e7eb17f21d3bd38b16dd50f735c29b316427
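The core of the change is to stop treating a SIMD register as a fixed 16-byte quantity and instead query the configured width at code-generation time, falling back to the 128-bit NEON Q register when SVE is not in use. Below is a minimal, self-contained C++ sketch of that idea, modelled loosely on the new CodeGeneratorARM64::GetSIMDRegisterWidth() in the diff; the IsaFeatures struct and the free functions are simplified stand-ins, not the real ART/VIXL API.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdio>

// Hypothetical stand-ins for the real ART/VIXL types and constants.
constexpr size_t kBitsPerByte = 8;
constexpr size_t kQRegSizeInBytes = 16;  // Fixed 128-bit NEON Q register.

struct IsaFeatures {
  bool has_sve;
  size_t sve_vector_length_in_bits;  // Multiple of 128, fixed at configuration time.
};

// Instead of hard-coding 16 bytes, derive the SIMD register width from the
// configured SVE vector length when predicated (SVE) code generation is in use.
size_t GetSIMDRegisterWidth(const IsaFeatures& features) {
  return features.has_sve
             ? features.sve_vector_length_in_bits / kBitsPerByte
             : kQRegSizeInBytes;
}

// Number of lanes for a given packed element size, e.g. 4-byte int32 lanes.
size_t GetVectorLength(const IsaFeatures& features, size_t element_size) {
  size_t width = GetSIMDRegisterWidth(features);
  assert(width % element_size == 0);
  return width / element_size;
}

int main() {
  IsaFeatures neon{false, 0};
  IsaFeatures sve256{true, 256};
  std::printf("NEON:   %zu int32 lanes\n", GetVectorLength(neon, 4));    // 4
  std::printf("SVE256: %zu int32 lanes\n", GetVectorLength(sve256, 4));  // 8
  return 0;
}
```

With a 256-bit SVE configuration the same packed int32 type yields 8 lanes instead of 4, which is why the per-type DCHECKs on a fixed vector length are replaced by a single ValidateVectorLength() check in the SVE code generator below.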
Diffstat (limited to 'compiler/optimizing')
-rw-r--r-- | compiler/optimizing/code_generator_arm64.cc | 66
-rw-r--r-- | compiler/optimizing/code_generator_arm64.h | 16
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm64_neon.cc | 59
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm64_sve.cc | 203
-rw-r--r-- | compiler/optimizing/common_arm64.h | 8
-rw-r--r-- | compiler/optimizing/loop_optimization.cc | 33
-rw-r--r-- | compiler/optimizing/loop_optimization.h | 2
7 files changed, 210 insertions, 177 deletions
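One recurring pattern in the diff that follows: with a scalable register, a lane count such as 16 no longer pins down the element size (16 lanes could be bytes in a 128-bit register or halfwords in a 256-bit one), so the predicated-loop setup in VisitVecPredWhile divides the register width by the vector length to pick the WHILELO lane size. A hedged sketch of that mapping; the SveLane enum here is purely illustrative, the real code selects VnB/VnH/VnS/VnD views of the governing predicate register.

```cpp
#include <cstddef>
#include <stdexcept>

// Illustrative lane-size tags; the real code picks a predicate-register view
// (VnB/VnH/VnS/VnD) before emitting WHILELO.
enum class SveLane { kB, kH, kS, kD };

// Dividing the SIMD register width by the lane count recovers the element
// size, which a fixed lane count alone can no longer identify.
SveLane LaneForPredicatedLoop(size_t simd_register_width_bytes, size_t vector_length) {
  if (vector_length == 0 || simd_register_width_bytes % vector_length != 0) {
    throw std::invalid_argument("vector length must divide the register width");
  }
  switch (simd_register_width_bytes / vector_length) {
    case 1u: return SveLane::kB;  // 8-bit elements
    case 2u: return SveLane::kH;  // 16-bit elements
    case 4u: return SveLane::kS;  // 32-bit elements
    case 8u: return SveLane::kD;  // 64-bit elements
    default: throw std::invalid_argument("unsupported element size");
  }
}
```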
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index f5d78367fe..bdc5e2d9a3 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -169,54 +169,6 @@ static RegisterSet OneRegInReferenceOutSaveEverythingCallerSaves() { #define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, x).Int32Value() -// Calculate memory accessing operand for save/restore live registers. -static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, - LocationSummary* locations, - int64_t spill_offset, - bool is_save) { - const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); - const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); - DCHECK(ArtVixlRegCodeCoherentForRegSet(core_spills, - codegen->GetNumberOfCoreRegisters(), - fp_spills, - codegen->GetNumberOfFloatingPointRegisters())); - - CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); - const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8; - DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes); - CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills); - - MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler(); - UseScratchRegisterScope temps(masm); - - Register base = masm->StackPointer(); - int64_t core_spill_size = core_list.GetTotalSizeInBytes(); - int64_t fp_spill_size = fp_list.GetTotalSizeInBytes(); - int64_t reg_size = kXRegSizeInBytes; - int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size; - uint32_t ls_access_size = WhichPowerOf2(reg_size); - if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) && - !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) { - // If the offset does not fit in the instruction's immediate field, use an alternate register - // to compute the base address(float point registers spill base address). 
- Register new_base = temps.AcquireSameSizeAs(base); - __ Add(new_base, base, Operand(spill_offset + core_spill_size)); - base = new_base; - spill_offset = -core_spill_size; - int64_t new_max_ls_pair_offset = fp_spill_size - 2 * reg_size; - DCHECK(masm->IsImmLSPair(spill_offset, ls_access_size)); - DCHECK(masm->IsImmLSPair(new_max_ls_pair_offset, ls_access_size)); - } - - if (is_save) { - __ StoreCPURegList(core_list, MemOperand(base, spill_offset)); - __ StoreCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); - } else { - __ LoadCPURegList(core_list, MemOperand(base, spill_offset)); - __ LoadCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); - } -} - void SlowPathCodeARM64::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) { size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath(); const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); @@ -240,15 +192,15 @@ void SlowPathCodeARM64::SaveLiveRegisters(CodeGenerator* codegen, LocationSummar stack_offset += fp_reg_size; } - SaveRestoreLiveRegistersHelper(codegen, - locations, - codegen->GetFirstRegisterSlotInSlowPath(), /* is_save= */ true); + InstructionCodeGeneratorARM64* visitor = + down_cast<CodeGeneratorARM64*>(codegen)->GetInstructionCodeGeneratorArm64(); + visitor->SaveLiveRegistersHelper(locations, codegen->GetFirstRegisterSlotInSlowPath()); } void SlowPathCodeARM64::RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) { - SaveRestoreLiveRegistersHelper(codegen, - locations, - codegen->GetFirstRegisterSlotInSlowPath(), /* is_save= */ false); + InstructionCodeGeneratorARM64* visitor = + down_cast<CodeGeneratorARM64*>(codegen)->GetInstructionCodeGeneratorArm64(); + visitor->RestoreLiveRegistersHelper(locations, codegen->GetFirstRegisterSlotInSlowPath()); } class BoundsCheckSlowPathARM64 : public SlowPathCodeARM64 { @@ -997,6 +949,12 @@ bool CodeGeneratorARM64::ShouldUseSVE() const { return GetInstructionSetFeatures().HasSVE(); } +size_t CodeGeneratorARM64::GetSIMDRegisterWidth() const { + return SupportsPredicatedSIMD() + ? 
GetInstructionSetFeatures().GetSVEVectorLength() / kBitsPerByte + : vixl::aarch64::kQRegSizeInBytes; +} + #define __ GetVIXLAssembler()-> void CodeGeneratorARM64::EmitJumpTables() { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index eb3e9546e0..d4546e5bd5 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -309,6 +309,10 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { virtual void LoadSIMDRegFromStack(Location destination, Location source) = 0; virtual void MoveSIMDRegToSIMDReg(Location destination, Location source) = 0; virtual void MoveToSIMDStackSlot(Location destination, Location source) = 0; + virtual void SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) = 0; + virtual void RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) = 0; protected: void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, @@ -462,6 +466,8 @@ class InstructionCodeGeneratorARM64Neon : public InstructionCodeGeneratorARM64 { void LoadSIMDRegFromStack(Location destination, Location source) override; void MoveSIMDRegToSIMDReg(Location destination, Location source) override; void MoveToSIMDStackSlot(Location destination, Location source) override; + void SaveLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; + void RestoreLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; }; class LocationsBuilderARM64Neon : public LocationsBuilderARM64 { @@ -495,8 +501,14 @@ class InstructionCodeGeneratorARM64Sve : public InstructionCodeGeneratorARM64 { void LoadSIMDRegFromStack(Location destination, Location source) override; void MoveSIMDRegToSIMDReg(Location destination, Location source) override; void MoveToSIMDStackSlot(Location destination, Location source) override; + void SaveLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; + void RestoreLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; private: + // Validate that instruction vector length and packed type are compliant with the SIMD + // register size (full SIMD register is used). + void ValidateVectorLength(HVecOperation* instr) const; + // Returns default predicate register which is used as governing vector predicate // to implement predicated loop execution. // @@ -579,9 +591,7 @@ class CodeGeneratorARM64 : public CodeGenerator { return vixl::aarch64::kDRegSizeInBytes; } - size_t GetSIMDRegisterWidth() const override { - return vixl::aarch64::kQRegSizeInBytes; - } + size_t GetSIMDRegisterWidth() const override; uintptr_t GetAddressOf(HBasicBlock* block) override { vixl::aarch64::Label* block_entry_label = GetLabelOf(block); diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc index bd64166655..0fe9898635 100644 --- a/compiler/optimizing/code_generator_vector_arm64_neon.cc +++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc @@ -17,6 +17,7 @@ #include "code_generator_arm64.h" #include "arch/arm64/instruction_set_features_arm64.h" +#include "base/bit_utils_iterator.h" #include "mirror/array-inl.h" #include "mirror/string.h" @@ -1590,6 +1591,64 @@ void InstructionCodeGeneratorARM64Neon::MoveToSIMDStackSlot(Location destination } } +// Calculate memory accessing operand for save/restore live registers. 
+template <bool is_save> +void SaveRestoreLiveRegistersHelperNeonImpl(CodeGeneratorARM64* codegen, + LocationSummary* locations, + int64_t spill_offset) { + const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); + const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); + DCHECK(helpers::ArtVixlRegCodeCoherentForRegSet(core_spills, + codegen->GetNumberOfCoreRegisters(), + fp_spills, + codegen->GetNumberOfFloatingPointRegisters())); + + CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); + const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8; + DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes); + CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills); + + MacroAssembler* masm = codegen->GetVIXLAssembler(); + UseScratchRegisterScope temps(masm); + + Register base = masm->StackPointer(); + int64_t core_spill_size = core_list.GetTotalSizeInBytes(); + int64_t fp_spill_size = fp_list.GetTotalSizeInBytes(); + int64_t reg_size = kXRegSizeInBytes; + int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size; + uint32_t ls_access_size = WhichPowerOf2(reg_size); + if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) && + !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) { + // If the offset does not fit in the instruction's immediate field, use an alternate register + // to compute the base address(float point registers spill base address). + Register new_base = temps.AcquireSameSizeAs(base); + masm->Add(new_base, base, Operand(spill_offset + core_spill_size)); + base = new_base; + spill_offset = -core_spill_size; + int64_t new_max_ls_pair_offset = fp_spill_size - 2 * reg_size; + DCHECK(masm->IsImmLSPair(spill_offset, ls_access_size)); + DCHECK(masm->IsImmLSPair(new_max_ls_pair_offset, ls_access_size)); + } + + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + masm->StoreCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + masm->LoadCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); + } +} + +void InstructionCodeGeneratorARM64Neon::SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperNeonImpl</* is_save= */ true>(codegen_, locations, spill_offset); +} + +void InstructionCodeGeneratorARM64Neon::RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperNeonImpl</* is_save= */ false>(codegen_, locations, spill_offset); +} + #undef __ } // namespace arm64 diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc index 2254673337..824b6c9476 100644 --- a/compiler/optimizing/code_generator_vector_arm64_sve.cc +++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc @@ -17,6 +17,7 @@ #include "code_generator_arm64.h" #include "arch/arm64/instruction_set_features_arm64.h" +#include "base/bit_utils_iterator.h" #include "mirror/array-inl.h" #include "mirror/string.h" @@ -33,6 +34,7 @@ using helpers::LocationFrom; using helpers::OutputRegister; using helpers::QRegisterFrom; using helpers::StackOperandFrom; +using helpers::SveStackOperandFrom; using helpers::VRegisterFrom; using helpers::ZRegisterFrom; using helpers::XRegisterFrom; @@ -71,6 +73,11 @@ inline Location 
SVEEncodableConstantOrRegister(HInstruction* constant, HInstruct return Location::RequiresRegister(); } +void InstructionCodeGeneratorARM64Sve::ValidateVectorLength(HVecOperation* instr) const { + DCHECK_EQ(DataType::Size(instr->GetPackedType()) * instr->GetVectorLength(), + codegen_->GetSIMDRegisterWidth()); +} + void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); HInstruction* input = instruction->InputAt(0); @@ -107,11 +114,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal LocationSummary* locations = instruction->GetLocations(); Location src_loc = locations->InAt(0); const ZRegister dst = ZRegisterFrom(locations->Out()); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnB(), Int64FromLocation(src_loc)); } else { @@ -120,7 +127,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnH(), Int64FromLocation(src_loc)); } else { @@ -128,7 +134,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnS(), Int64FromLocation(src_loc)); } else { @@ -136,7 +141,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnD(), Int64FromLocation(src_loc)); } else { @@ -144,7 +148,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Fdup(dst.VnS(), src_loc.GetConstant()->AsFloatConstant()->GetValue()); } else { @@ -152,7 +155,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Fdup(dst.VnD(), src_loc.GetConstant()->AsDoubleConstant()->GetValue()); } else { @@ -193,19 +195,16 @@ void InstructionCodeGeneratorARM64Sve::VisitVecExtractScalar(HVecExtractScalar* DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); const VRegister src = VRegisterFrom(locations->InAt(0)); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Umov(OutputRegister(instruction), src.V4S(), 0); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Umov(OutputRegister(instruction), src.V2D(), 0); break; case DataType::Type::kFloat32: case DataType::Type::kFloat64: - DCHECK_LE(2u, instruction->GetVectorLength()); - DCHECK_LE(instruction->GetVectorLength(), 4u); DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required break; default: @@ -251,9 +250,9 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { const ZRegister src = 
ZRegisterFrom(locations->InAt(0)); const VRegister dst = DRegisterFrom(locations->Out()); const PRegister p_reg = LoopPReg(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); switch (instruction->GetReductionKind()) { case HVecReduce::kSum: __ Saddv(dst.S(), p_reg, src.VnS()); @@ -264,7 +263,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { } break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); switch (instruction->GetReductionKind()) { case HVecReduce::kSum: __ Uaddv(dst.D(), p_reg, src.VnD()); @@ -292,8 +290,8 @@ void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) { const PRegisterM p_reg = LoopPReg().Merging(); DataType::Type from = instruction->GetInputType(); DataType::Type to = instruction->GetResultType(); + ValidateVectorLength(instruction); if (from == DataType::Type::kInt32 && to == DataType::Type::kFloat32) { - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Scvtf(dst.VnS(), p_reg, src.VnS()); } else { LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -310,31 +308,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) { const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Neg(dst.VnB(), p_reg, src.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Neg(dst.VnH(), p_reg, src.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Neg(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Neg(dst.VnD(), p_reg, src.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fneg(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fneg(dst.VnD(), p_reg, src.VnD()); break; default: @@ -353,29 +346,24 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) { const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Abs(dst.VnB(), p_reg, src.VnB()); break; case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Abs(dst.VnH(), p_reg, src.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Abs(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Abs(dst.VnD(), p_reg, src.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fabs(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fabs(dst.VnD(), p_reg, src.VnD()); break; default: @@ -394,9 +382,9 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) { const ZRegister src = 
ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: // special case boolean-not - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Dup(dst.VnB(), 1); __ Eor(dst.VnB(), p_reg, dst.VnB(), src.VnB()); break; @@ -454,31 +442,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Add(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Add(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Add(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Add(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fadd(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fadd(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation); break; default: @@ -518,31 +501,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Sub(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Sub(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Sub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Sub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fsub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fsub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: @@ -572,31 +550,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Mul(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Mul(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, 
instruction->GetVectorLength()); __ Mul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Mul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fmul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fmul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation); break; default: @@ -616,15 +589,14 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); // Note: VIXL guarantees StrictNaNPropagation for Fdiv. switch (instruction->GetPackedType()) { case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fdiv(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fdiv(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: @@ -665,6 +637,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -709,6 +682,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -744,6 +718,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -799,23 +774,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Lsl(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Lsl(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Lsl(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Lsl(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -835,23 +807,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = 
locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Asr(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Asr(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Asr(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Asr(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -871,23 +840,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Lsr(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Lsr(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Lsr(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Lsr(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -943,26 +909,23 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSetScalars(HVecSetScalars* instru if (IsZeroBitPattern(instruction->InputAt(0))) { return; } + ValidateVectorLength(instruction); // Set required elements. 
switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Mov(dst.V16B(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Mov(dst.V8H(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Mov(dst.V4S(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Mov(dst.V2D(), 0, InputRegisterAt(instruction, 0)); break; default: @@ -1009,11 +972,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( const PRegisterM p_reg = LoopPReg().Merging(); DCHECK(locations->InAt(0).Equals(locations->Out())); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB()); } else { @@ -1022,7 +985,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnH(), p_reg, acc.VnB(), left.VnH(), right.VnH()); } else { @@ -1030,7 +992,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( } break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnS(), p_reg, acc.VnB(), left.VnS(), right.VnS()); } else { @@ -1077,12 +1038,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), HVecOperation::ToSignedType(b->GetPackedType())); DCHECK_EQ(instruction->GetPackedType(), DataType::Type::kInt32); - DCHECK_EQ(4u, instruction->GetVectorLength()); + ValidateVectorLength(instruction); size_t inputs_data_size = DataType::Size(a->GetPackedType()); switch (inputs_data_size) { case 1u: { - DCHECK_EQ(16u, a->GetVectorLength()); UseScratchRegisterScope temps(GetVIXLAssembler()); const ZRegister tmp0 = temps.AcquireZ(); const ZRegister tmp1 = ZRegisterFrom(locations->GetTemp(0)); @@ -1143,30 +1103,27 @@ void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) { UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; const PRegisterZ p_reg = LoopPReg().Zeroing(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt16: // (short) s.charAt(.) can yield HVecLoad/Int16/StringCharAt. 
case DataType::Type::kUint16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Ld1h(reg.VnH(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Ld1b(reg.VnB(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Ld1w(reg.VnS(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Ld1d(reg.VnD(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; @@ -1188,30 +1145,27 @@ void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) { UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; const PRegisterZ p_reg = LoopPReg().Zeroing(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ St1b(reg.VnB(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ St1h(reg.VnH(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ St1w(reg.VnS(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ St1d(reg.VnD(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; @@ -1237,22 +1191,18 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instru case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnB(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL); break; default: @@ -1295,17 +1245,19 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct Register left = InputRegisterAt(instruction, 0); Register right = InputRegisterAt(instruction, 1); - switch (instruction->GetVectorLength()) { - case 16u: + DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % instruction->GetVectorLength(), 0u); + + switch (codegen_->GetSIMDRegisterWidth() / instruction->GetVectorLength()) { + case 1u: __ Whilelo(LoopPReg().VnB(), left, right); break; - case 8u: + case 2u: __ Whilelo(LoopPReg().VnH(), left, right); break; case 4u: __ Whilelo(LoopPReg().VnS(), left, right); break; - case 2u: + case 8u: __ 
Whilelo(LoopPReg().VnD(), left, right); break; default: @@ -1333,52 +1285,103 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* Location InstructionCodeGeneratorARM64Sve::AllocateSIMDScratchLocation( vixl::aarch64::UseScratchRegisterScope* scope) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - return LocationFrom(scope->AcquireVRegisterOfSize(kQRegSize)); + return LocationFrom(scope->AcquireZ()); } void InstructionCodeGeneratorARM64Sve::FreeSIMDScratchLocation(Location loc, vixl::aarch64::UseScratchRegisterScope* scope) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - scope->Release(QRegisterFrom(loc)); + scope->Release(ZRegisterFrom(loc)); } void InstructionCodeGeneratorARM64Sve::LoadSIMDRegFromStack(Location destination, Location source) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - __ Ldr(QRegisterFrom(destination), StackOperandFrom(source)); + __ Ldr(ZRegisterFrom(destination), SveStackOperandFrom(source)); } void InstructionCodeGeneratorARM64Sve::MoveSIMDRegToSIMDReg(Location destination, Location source) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - __ Mov(QRegisterFrom(destination), QRegisterFrom(source)); + __ Mov(ZRegisterFrom(destination), ZRegisterFrom(source)); } void InstructionCodeGeneratorARM64Sve::MoveToSIMDStackSlot(Location destination, Location source) { DCHECK(destination.IsSIMDStackSlot()); - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); if (source.IsFpuRegister()) { - __ Str(QRegisterFrom(source), StackOperandFrom(destination)); + __ Str(ZRegisterFrom(source), SveStackOperandFrom(destination)); } else { DCHECK(source.IsSIMDStackSlot()); UseScratchRegisterScope temps(GetVIXLAssembler()); if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) { + // Very rare situation, only when there are cycles in ParallelMoveResolver graph. const Register temp = temps.AcquireX(); - __ Ldr(temp, MemOperand(sp, source.GetStackIndex())); - __ Str(temp, MemOperand(sp, destination.GetStackIndex())); - __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize)); - __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize)); + DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % kArm64WordSize, 0u); + // Emit a number of LDR/STR (XRegister, 64-bit) to cover the whole SIMD register size + // when copying a stack slot. 
+ for (size_t offset = 0, e = codegen_->GetSIMDRegisterWidth(); + offset < e; + offset += kArm64WordSize) { + __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + offset)); + __ Str(temp, MemOperand(sp, destination.GetStackIndex() + offset)); + } } else { - const VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize); - __ Ldr(temp, StackOperandFrom(source)); - __ Str(temp, StackOperandFrom(destination)); + const ZRegister temp = temps.AcquireZ(); + __ Ldr(temp, SveStackOperandFrom(source)); + __ Str(temp, SveStackOperandFrom(destination)); } } } +template <bool is_save> +void SaveRestoreLiveRegistersHelperSveImpl(CodeGeneratorARM64* codegen, + LocationSummary* locations, + int64_t spill_offset) { + const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); + const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); + DCHECK(helpers::ArtVixlRegCodeCoherentForRegSet(core_spills, + codegen->GetNumberOfCoreRegisters(), + fp_spills, + codegen->GetNumberOfFloatingPointRegisters())); + MacroAssembler* masm = codegen->GetVIXLAssembler(); + Register base = masm->StackPointer(); + + CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); + int64_t core_spill_size = core_list.GetTotalSizeInBytes(); + int64_t fp_spill_offset = spill_offset + core_spill_size; + + if (codegen->GetGraph()->HasSIMD()) { + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + } + codegen->GetAssembler()->SaveRestoreZRegisterList<is_save>(fp_spills, fp_spill_offset); + return; + } + + // Case when we only need to restore D-registers. + DCHECK(!codegen->GetGraph()->HasSIMD()); + DCHECK_LE(codegen->GetSlowPathFPWidth(), kDRegSizeInBytes); + CPURegList fp_list = CPURegList(CPURegister::kVRegister, kDRegSize, fp_spills); + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + masm->StoreCPURegList(fp_list, MemOperand(base, fp_spill_offset)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + masm->LoadCPURegList(fp_list, MemOperand(base, fp_spill_offset)); + } +} + +void InstructionCodeGeneratorARM64Sve::SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperSveImpl</* is_save= */ true>(codegen_, locations, spill_offset); +} + +void InstructionCodeGeneratorARM64Sve::RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperSveImpl</* is_save= */ false>(codegen_, locations, spill_offset); +} + #undef __ } // namespace arm64 diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h index 72207816e1..81c6561318 100644 --- a/compiler/optimizing/common_arm64.h +++ b/compiler/optimizing/common_arm64.h @@ -182,6 +182,10 @@ inline vixl::aarch64::MemOperand StackOperandFrom(Location location) { return vixl::aarch64::MemOperand(vixl::aarch64::sp, location.GetStackIndex()); } +inline vixl::aarch64::SVEMemOperand SveStackOperandFrom(Location location) { + return vixl::aarch64::SVEMemOperand(vixl::aarch64::sp, location.GetStackIndex()); +} + inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, size_t offset = 0) { // A heap reference must be 32bit, so fit in a W register. 
@@ -215,6 +219,10 @@ inline Location LocationFrom(const vixl::aarch64::VRegister& fpreg) { return Location::FpuRegisterLocation(fpreg.GetCode()); } +inline Location LocationFrom(const vixl::aarch64::ZRegister& zreg) { + return Location::FpuRegisterLocation(zreg.GetCode()); +} + inline vixl::aarch64::Operand OperandFromMemOperand( const vixl::aarch64::MemOperand& mem_op) { if (mem_op.IsImmediateOffset()) { diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 1210dbe67b..02ee4ec057 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -946,9 +946,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6 // make one particular reference aligned), never to exceed (1). // (3) variable to record how many references share same alignment. // (4) variable to record suitable candidate for dynamic loop peeling. - uint32_t desired_alignment = GetVectorSizeInBytes(); - DCHECK_LE(desired_alignment, 16u); - uint32_t peeling_votes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + size_t desired_alignment = GetVectorSizeInBytes(); + ScopedArenaVector<uint32_t> peeling_votes(desired_alignment, 0u, + loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + uint32_t max_num_same_alignment = 0; const ArrayReference* peeling_candidate = nullptr; @@ -1577,14 +1578,6 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, } uint32_t HLoopOptimization::GetVectorSizeInBytes() { - if (kIsDebugBuild) { - InstructionSet isa = compiler_options_->GetInstructionSet(); - // TODO: Remove this check when there are no implicit assumptions on the SIMD reg size. - DCHECK_EQ(simd_register_size_, (isa == InstructionSet::kArm || isa == InstructionSet::kThumb2) - ? 8u - : 16u); - } - return simd_register_size_; } @@ -1616,6 +1609,8 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict if (IsInPredicatedVectorizationMode()) { // SVE vectorization. 
CHECK(features->AsArm64InstructionSetFeatures()->HasSVE()); + size_t vector_length = simd_register_size_ / DataType::Size(type); + DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u); switch (type) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -1625,7 +1620,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict kNoUnsignedHAdd | kNoUnroundedHAdd | kNoSAD; - return TrySetVectorLength(type, 16); + return TrySetVectorLength(type, vector_length); case DataType::Type::kUint16: case DataType::Type::kInt16: *restrictions |= kNoDiv | @@ -1634,19 +1629,19 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict kNoUnroundedHAdd | kNoSAD | kNoDotProd; - return TrySetVectorLength(type, 8); + return TrySetVectorLength(type, vector_length); case DataType::Type::kInt32: *restrictions |= kNoDiv | kNoSAD; - return TrySetVectorLength(type, 4); + return TrySetVectorLength(type, vector_length); case DataType::Type::kInt64: *restrictions |= kNoDiv | kNoSAD; - return TrySetVectorLength(type, 2); + return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat32: *restrictions |= kNoReduction; - return TrySetVectorLength(type, 4); + return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat64: *restrictions |= kNoReduction; - return TrySetVectorLength(type, 2); + return TrySetVectorLength(type, vector_length); default: break; } @@ -2311,12 +2306,12 @@ Alignment HLoopOptimization::ComputeAlignment(HInstruction* offset, return Alignment(DataType::Size(type), 0); } -void HLoopOptimization::SetAlignmentStrategy(uint32_t peeling_votes[], +void HLoopOptimization::SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, const ArrayReference* peeling_candidate) { // Current heuristic: pick the best static loop peeling factor, if any, // or otherwise use dynamic loop peeling on suggested peeling candidate. uint32_t max_vote = 0; - for (int32_t i = 0; i < 16; i++) { + for (size_t i = 0; i < peeling_votes.size(); i++) { if (peeling_votes[i] > max_vote) { max_vote = peeling_votes[i]; vector_static_peeling_factor_ = i; diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 0d76804d9c..d3583ed8a6 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -238,7 +238,7 @@ class HLoopOptimization : public HOptimization { DataType::Type type, bool is_string_char_at, uint32_t peeling = 0); - void SetAlignmentStrategy(uint32_t peeling_votes[], + void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, const ArrayReference* peeling_candidate); uint32_t MaxNumberPeeled(); bool IsVectorizationProfitable(int64_t trip_count); |
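On the vectorizer side (loop_optimization.cc above), the number of lanes per packed type and the static loop-peeling vote table are now derived from the queried SIMD register size rather than hard-coded for 128 bits (16/8/4/2 lanes and a uint32_t[16] vote array). A rough sketch of both pieces, with std::vector and free functions standing in for ART's ScopedArenaVector and the HLoopOptimization members:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// The lane count for a packed type is computed from the queried SIMD register
// size instead of being hard-coded per type, e.g. 256-bit SVE / int32 -> 8 lanes.
size_t VectorLengthForType(size_t simd_register_size, size_t type_size) {
  assert(simd_register_size % type_size == 0);
  return simd_register_size / type_size;
}

// Static-peeling bookkeeping: one vote counter per possible misalignment, so
// the table scales with the desired (vector-sized) alignment rather than being
// a fixed 16-entry array.
std::vector<uint32_t> MakePeelingVotes(size_t desired_alignment_bytes) {
  return std::vector<uint32_t>(desired_alignment_bytes, 0u);
}

// Pick the peeling factor with the most votes, mirroring SetAlignmentStrategy.
size_t BestStaticPeelingFactor(const std::vector<uint32_t>& peeling_votes) {
  uint32_t max_vote = 0;
  size_t best = 0;
  for (size_t i = 0; i < peeling_votes.size(); ++i) {
    if (peeling_votes[i] > max_vote) {
      max_vote = peeling_votes[i];
      best = i;
    }
  }
  return best;
}
```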
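Likewise, the scratch-register-free fallback of MoveToSIMDStackSlot (SVE code generator diff above) copies a possibly wider-than-128-bit stack slot in 64-bit chunks instead of assuming exactly two words. A simplified model of that loop in plain C++, with memcpy over a byte buffer standing in for the emitted LDR/STR pairs:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

constexpr size_t kArm64WordSize = 8;  // One X register / LDR-STR chunk.

// Copy a SIMD stack slot of arbitrary (VL-dependent) size in 64-bit chunks,
// mirroring the emitted LDR/STR sequence rather than a fixed pair of loads
// and stores. The width is a multiple of the word size (DCHECKed in the diff).
void CopySimdStackSlot(uint8_t* stack, size_t dst_offset, size_t src_offset,
                       size_t simd_register_width) {
  for (size_t offset = 0; offset < simd_register_width; offset += kArm64WordSize) {
    uint64_t word;
    std::memcpy(&word, stack + src_offset + offset, kArm64WordSize);  // LDR x_tmp
    std::memcpy(stack + dst_offset + offset, &word, kArm64WordSize);  // STR x_tmp
  }
}
```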