Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator.h                          3
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc                  92
-rw-r--r--  compiler/optimizing/code_generator_arm64.h                  112
-rw-r--r--  compiler/optimizing/code_generator_vector_arm64_neon.cc (renamed from compiler/optimizing/code_generator_vector_arm64.cc)  215
-rw-r--r--  compiler/optimizing/code_generator_vector_arm64_sve.cc     1540
-rw-r--r--  compiler/optimizing/nodes.h                                   10
6 files changed, 1834 insertions, 138 deletions
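Note: this change splits ARM64 SIMD code generation into a NEON backend and a (currently disabled) SVE backend. CodeGeneratorARM64 now owns both a NEON and an SVE locations builder / instruction visitor, selects one at construction time via ShouldUseSVE() (kArm64AllowSVE && GetInstructionSetFeatures().HasSVE()), and routes SIMD register moves and scratch allocation through virtual helpers on the common visitor base class. The standalone C++ sketch below illustrates only that dispatch pattern; the class and constant names follow the patch, but the bodies, the cpu_has_sve parameter, and the printed messages are invented stand-ins, not ART code.

// Minimal sketch (not ART code) of the NEON/SVE visitor dispatch this patch introduces.
#include <iostream>

struct Location {};  // stand-in for art::Location

class InstructionCodeGeneratorARM64 {
 public:
  virtual ~InstructionCodeGeneratorARM64() = default;
  // SIMD helper that used to be hard-coded to 128-bit Q registers.
  virtual void MoveToSIMDStackSlot(Location destination, Location source) = 0;
};

class InstructionCodeGeneratorARM64Neon : public InstructionCodeGeneratorARM64 {
 public:
  void MoveToSIMDStackSlot(Location, Location) override {
    std::cout << "NEON: 128-bit Q-register store\n";
  }
};

class InstructionCodeGeneratorARM64Sve : public InstructionCodeGeneratorARM64 {
 public:
  void MoveToSIMDStackSlot(Location, Location) override {
    std::cout << "SVE: vector-length-agnostic store\n";
  }
};

// Mirrors the patch: SVE is gated behind a build-time switch plus a CPU feature check.
constexpr bool kArm64AllowSVE = false;

class CodeGeneratorARM64 {
 public:
  explicit CodeGeneratorARM64(bool cpu_has_sve)
      : cpu_has_sve_(cpu_has_sve),
        visitor_(ShouldUseSVE()
                     ? static_cast<InstructionCodeGeneratorARM64*>(&instruction_visitor_sve_)
                     : &instruction_visitor_neon_) {}

  bool SupportsPredicatedSIMD() const { return ShouldUseSVE(); }

  InstructionCodeGeneratorARM64* GetInstructionCodeGeneratorArm64() { return visitor_; }

 private:
  bool ShouldUseSVE() const { return kArm64AllowSVE && cpu_has_sve_; }

  bool cpu_has_sve_;
  InstructionCodeGeneratorARM64Neon instruction_visitor_neon_;
  InstructionCodeGeneratorARM64Sve instruction_visitor_sve_;
  InstructionCodeGeneratorARM64* visitor_;  // points at the selected backend
};

int main() {
  CodeGeneratorARM64 codegen(/*cpu_has_sve=*/true);
  // With kArm64AllowSVE == false this still picks the NEON visitor.
  codegen.GetInstructionCodeGeneratorArm64()->MoveToSIMDStackSlot({}, {});
}

Keeping both visitors as plain members and pointing location_builder_ / instruction_visitor_ at the selected one matches the structure of the patch below, where the base-class SIMD helpers (AllocateSIMDScratchLocation, MoveToSIMDStackSlot, etc.) become virtual and the vector visit methods move to the per-backend subclasses via FOR_EACH_CONCRETE_INSTRUCTION_VECTOR_COMMON.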
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h index 84bf4914d0..ff2be4740d 100644 --- a/compiler/optimizing/code_generator.h +++ b/compiler/optimizing/code_generator.h @@ -223,6 +223,9 @@ class CodeGenerator : public DeletableArenaObject<kArenaAllocCodeGenerator> { virtual const Assembler& GetAssembler() const = 0; virtual size_t GetWordSize() const = 0; + // Returns whether the target supports predicated SIMD instructions. + virtual bool SupportsPredicatedSIMD() const { return false; } + // Get FP register width in bytes for spilling/restoring in the slow paths. // // Note: In SIMD graphs this should return SIMD register width as all FP and SIMD registers diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index 001fcb1b0a..09c801b454 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -75,7 +75,6 @@ using helpers::OperandFromMemOperand; using helpers::OutputCPURegister; using helpers::OutputFPRegister; using helpers::OutputRegister; -using helpers::QRegisterFrom; using helpers::RegisterFrom; using helpers::StackOperandFrom; using helpers::VIXLRegCodeFromART; @@ -177,6 +176,7 @@ static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8; + DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes); CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills); MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler(); @@ -426,10 +426,10 @@ class SuspendCheckSlowPathARM64 : public SlowPathCodeARM64 { LocationSummary* locations = instruction_->GetLocations(); CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen); __ Bind(GetEntryLabel()); - SaveLiveRegisters(codegen, locations); // Only saves live 128-bit regs for SIMD. + SaveLiveRegisters(codegen, locations); // Only saves live vector regs for SIMD. arm64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this); CheckEntrypointTypes<kQuickTestSuspend, void, void>(); - RestoreLiveRegisters(codegen, locations); // Only restores live 128-bit regs for SIMD. + RestoreLiveRegisters(codegen, locations); // Only restores live vector regs for SIMD. if (successor_ == nullptr) { __ B(GetReturnLabel()); } else { @@ -883,8 +883,10 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, stats), block_labels_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)), jump_tables_(graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)), - location_builder_(graph, this), - instruction_visitor_(graph, this), + location_builder_neon_(graph, this), + instruction_visitor_neon_(graph, this), + location_builder_sve_(graph, this), + instruction_visitor_sve_(graph, this), move_resolver_(graph->GetAllocator(), this), assembler_(graph->GetAllocator(), compiler_options.GetInstructionSetFeatures()->AsArm64InstructionSetFeatures()), @@ -909,6 +911,19 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, graph->GetAllocator()->Adapter(kArenaAllocCodeGenerator)) { // Save the link register (containing the return address) to mimic Quick. 
AddAllocatedRegister(LocationFrom(lr)); + + bool use_sve = ShouldUseSVE(); + if (use_sve) { + location_builder_ = &location_builder_sve_; + instruction_visitor_ = &instruction_visitor_sve_; + } else { + location_builder_ = &location_builder_neon_; + instruction_visitor_ = &instruction_visitor_neon_; + } +} + +bool CodeGeneratorARM64::ShouldUseSVE() const { + return kArm64AllowSVE && GetInstructionSetFeatures().HasSVE(); } #define __ GetVIXLAssembler()-> @@ -1038,9 +1053,9 @@ Location ParallelMoveResolverARM64::AllocateScratchLocationFor(Location::Kind ki scratch = LocationFrom(vixl_temps_.AcquireX()); } else { DCHECK_EQ(kind, Location::kFpuRegister); - scratch = LocationFrom(codegen_->GetGraph()->HasSIMD() - ? vixl_temps_.AcquireVRegisterOfSize(kQRegSize) - : vixl_temps_.AcquireD()); + scratch = codegen_->GetGraph()->HasSIMD() + ? codegen_->GetInstructionCodeGeneratorArm64()->AllocateSIMDScratchLocation(&vixl_temps_) + : LocationFrom(vixl_temps_.AcquireD()); } AddScratchLocation(scratch); return scratch; @@ -1051,7 +1066,11 @@ void ParallelMoveResolverARM64::FreeScratchLocation(Location loc) { vixl_temps_.Release(XRegisterFrom(loc)); } else { DCHECK(loc.IsFpuRegister()); - vixl_temps_.Release(codegen_->GetGraph()->HasSIMD() ? QRegisterFrom(loc) : DRegisterFrom(loc)); + if (codegen_->GetGraph()->HasSIMD()) { + codegen_->GetInstructionCodeGeneratorArm64()->FreeSIMDScratchLocation(loc, &vixl_temps_); + } else { + vixl_temps_.Release(DRegisterFrom(loc)); + } } RemoveScratchLocation(loc); } @@ -1434,7 +1453,7 @@ void CodeGeneratorARM64::MoveLocation(Location destination, DCHECK(dst.Is64Bits() == source.IsDoubleStackSlot()); __ Ldr(dst, StackOperandFrom(source)); } else if (source.IsSIMDStackSlot()) { - __ Ldr(QRegisterFrom(destination), StackOperandFrom(source)); + GetInstructionCodeGeneratorArm64()->LoadSIMDRegFromStack(destination, source); } else if (source.IsConstant()) { DCHECK(CoherentConstantAndType(source, dst_type)); MoveConstant(dst, source.GetConstant()); @@ -1458,30 +1477,14 @@ void CodeGeneratorARM64::MoveLocation(Location destination, } else { DCHECK(destination.IsFpuRegister()); if (GetGraph()->HasSIMD()) { - __ Mov(QRegisterFrom(destination), QRegisterFrom(source)); + GetInstructionCodeGeneratorArm64()->MoveSIMDRegToSIMDReg(destination, source); } else { __ Fmov(VRegister(dst), FPRegisterFrom(source, dst_type)); } } } } else if (destination.IsSIMDStackSlot()) { - if (source.IsFpuRegister()) { - __ Str(QRegisterFrom(source), StackOperandFrom(destination)); - } else { - DCHECK(source.IsSIMDStackSlot()); - UseScratchRegisterScope temps(GetVIXLAssembler()); - if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) { - Register temp = temps.AcquireX(); - __ Ldr(temp, MemOperand(sp, source.GetStackIndex())); - __ Str(temp, MemOperand(sp, destination.GetStackIndex())); - __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize)); - __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize)); - } else { - VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize); - __ Ldr(temp, StackOperandFrom(source)); - __ Str(temp, StackOperandFrom(destination)); - } - } + GetInstructionCodeGeneratorArm64()->MoveToSIMDStackSlot(destination, source); } else { // The destination is not a register. It must be a stack slot. 
DCHECK(destination.IsStackSlot() || destination.IsDoubleStackSlot()); if (source.IsRegister() || source.IsFpuRegister()) { @@ -6372,6 +6375,39 @@ void CodeGeneratorARM64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_ } } +MemOperand InstructionCodeGeneratorARM64::VecNeonAddress( + HVecMemoryOperation* instruction, + UseScratchRegisterScope* temps_scope, + size_t size, + bool is_string_char_at, + /*out*/ Register* scratch) { + LocationSummary* locations = instruction->GetLocations(); + Register base = InputRegisterAt(instruction, 0); + + if (instruction->InputAt(1)->IsIntermediateAddressIndex()) { + DCHECK(!is_string_char_at); + return MemOperand(base.X(), InputRegisterAt(instruction, 1).X()); + } + + Location index = locations->InAt(1); + uint32_t offset = is_string_char_at + ? mirror::String::ValueOffset().Uint32Value() + : mirror::Array::DataOffset(size).Uint32Value(); + size_t shift = ComponentSizeShiftWidth(size); + + // HIntermediateAddress optimization is only applied for scalar ArrayGet and ArraySet. + DCHECK(!instruction->InputAt(0)->IsIntermediateAddress()); + + if (index.IsConstant()) { + offset += Int64FromLocation(index) << shift; + return HeapOperand(base, offset); + } else { + *scratch = temps_scope->AcquireSameSizeAs(base); + __ Add(*scratch, base, Operand(WRegisterFrom(index), LSL, shift)); + return HeapOperand(*scratch, offset); + } +} + #undef __ #undef QUICK_ENTRY_POINT diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 47a019424e..627cf72645 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -53,6 +53,9 @@ static constexpr size_t kArm64WordSize = static_cast<size_t>(kArm64PointerSize); static constexpr int kMaxMacroInstructionSizeInBytes = 15 * vixl::aarch64::kInstructionSize; static constexpr int kInvokeCodeMarginSizeInBytes = 6 * kMaxMacroInstructionSizeInBytes; +// SVE is currently not enabled. +static constexpr bool kArm64AllowSVE = false; + static const vixl::aarch64::Register kParameterCoreRegisters[] = { vixl::aarch64::x1, vixl::aarch64::x2, @@ -262,7 +265,7 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr) override; - FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_SCALAR_COMMON(DECLARE_VISIT_INSTRUCTION) FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) FOR_EACH_CONCRETE_INSTRUCTION_SHARED(DECLARE_VISIT_INSTRUCTION) @@ -276,7 +279,15 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { Arm64Assembler* GetAssembler() const { return assembler_; } vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->GetVIXLAssembler(); } - private: + // SIMD helpers. 
+ virtual Location AllocateSIMDScratchLocation(vixl::aarch64::UseScratchRegisterScope* scope) = 0; + virtual void FreeSIMDScratchLocation(Location loc, + vixl::aarch64::UseScratchRegisterScope* scope) = 0; + virtual void LoadSIMDRegFromStack(Location destination, Location source) = 0; + virtual void MoveSIMDRegToSIMDReg(Location destination, Location source) = 0; + virtual void MoveToSIMDStackSlot(Location destination, Location source) = 0; + + protected: void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, vixl::aarch64::Register class_reg); void GenerateBitstringTypeCheckCompare(HTypeCheckInstruction* check, @@ -340,7 +351,11 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { void GenerateIntRemForPower2Denom(HRem *instruction); void HandleGoto(HInstruction* got, HBasicBlock* successor); - vixl::aarch64::MemOperand VecAddress( + // Helper to set up locations for vector memory operations. Returns the memory operand and, + // if used, sets the output parameter scratch to a temporary register used in this operand, + // so that the client can release it right after the memory operand use. + // Neon version. + vixl::aarch64::MemOperand VecNeonAddress( HVecMemoryOperation* instruction, // This function may acquire a scratch register. vixl::aarch64::UseScratchRegisterScope* temps_scope, @@ -362,7 +377,7 @@ class LocationsBuilderARM64 : public HGraphVisitor { #define DECLARE_VISIT_INSTRUCTION(name, super) \ void Visit##name(H##name* instr) override; - FOR_EACH_CONCRETE_INSTRUCTION_COMMON(DECLARE_VISIT_INSTRUCTION) + FOR_EACH_CONCRETE_INSTRUCTION_SCALAR_COMMON(DECLARE_VISIT_INSTRUCTION) FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION) FOR_EACH_CONCRETE_INSTRUCTION_SHARED(DECLARE_VISIT_INSTRUCTION) @@ -373,7 +388,7 @@ class LocationsBuilderARM64 : public HGraphVisitor { << " (id " << instruction->GetId() << ")"; } - private: + protected: void HandleBinaryOp(HBinaryOperation* instr); void HandleFieldSet(HInstruction* instruction); void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info); @@ -387,6 +402,72 @@ class LocationsBuilderARM64 : public HGraphVisitor { DISALLOW_COPY_AND_ASSIGN(LocationsBuilderARM64); }; +class InstructionCodeGeneratorARM64Neon : public InstructionCodeGeneratorARM64 { + public: + InstructionCodeGeneratorARM64Neon(HGraph* graph, CodeGeneratorARM64* codegen) : + InstructionCodeGeneratorARM64(graph, codegen) {} + +#define DECLARE_VISIT_INSTRUCTION(name, super) \ + void Visit##name(H##name* instr) override; + + FOR_EACH_CONCRETE_INSTRUCTION_VECTOR_COMMON(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION + + Location AllocateSIMDScratchLocation(vixl::aarch64::UseScratchRegisterScope* scope) override; + void FreeSIMDScratchLocation(Location loc, + vixl::aarch64::UseScratchRegisterScope* scope) override; + void LoadSIMDRegFromStack(Location destination, Location source) override; + void MoveSIMDRegToSIMDReg(Location destination, Location source) override; + void MoveToSIMDStackSlot(Location destination, Location source) override; +}; + +class LocationsBuilderARM64Neon : public LocationsBuilderARM64 { + public: + LocationsBuilderARM64Neon(HGraph* graph, CodeGeneratorARM64* codegen) : + LocationsBuilderARM64(graph, codegen) {} + +#define DECLARE_VISIT_INSTRUCTION(name, super) \ + void Visit##name(H##name* instr) override; + + FOR_EACH_CONCRETE_INSTRUCTION_VECTOR_COMMON(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION +}; + +class InstructionCodeGeneratorARM64Sve : public 
InstructionCodeGeneratorARM64 { + public: + InstructionCodeGeneratorARM64Sve(HGraph* graph, CodeGeneratorARM64* codegen) : + InstructionCodeGeneratorARM64(graph, codegen) {} + +#define DECLARE_VISIT_INSTRUCTION(name, super) \ + void Visit##name(H##name* instr) override; + + FOR_EACH_CONCRETE_INSTRUCTION_VECTOR_COMMON(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION + + Location AllocateSIMDScratchLocation(vixl::aarch64::UseScratchRegisterScope* scope) override; + void FreeSIMDScratchLocation(Location loc, + vixl::aarch64::UseScratchRegisterScope* scope) override; + void LoadSIMDRegFromStack(Location destination, Location source) override; + void MoveSIMDRegToSIMDReg(Location destination, Location source) override; + void MoveToSIMDStackSlot(Location destination, Location source) override; +}; + +class LocationsBuilderARM64Sve : public LocationsBuilderARM64 { + public: + LocationsBuilderARM64Sve(HGraph* graph, CodeGeneratorARM64* codegen) : + LocationsBuilderARM64(graph, codegen) {} + +#define DECLARE_VISIT_INSTRUCTION(name, super) \ + void Visit##name(H##name* instr) override; + + FOR_EACH_CONCRETE_INSTRUCTION_VECTOR_COMMON(DECLARE_VISIT_INSTRUCTION) + +#undef DECLARE_VISIT_INSTRUCTION +}; + class ParallelMoveResolverARM64 : public ParallelMoveResolverNoSwap { public: ParallelMoveResolverARM64(ArenaAllocator* allocator, CodeGeneratorARM64* codegen) @@ -435,6 +516,8 @@ class CodeGeneratorARM64 : public CodeGenerator { return kArm64WordSize; } + bool SupportsPredicatedSIMD() const override { return ShouldUseSVE(); } + size_t GetSlowPathFPWidth() const override { return GetGraph()->HasSIMD() ? GetSIMDRegisterWidth() @@ -455,8 +538,11 @@ class CodeGeneratorARM64 : public CodeGenerator { return block_entry_label->GetLocation(); } - HGraphVisitor* GetLocationBuilder() override { return &location_builder_; } - HGraphVisitor* GetInstructionVisitor() override { return &instruction_visitor_; } + HGraphVisitor* GetLocationBuilder() override { return location_builder_; } + InstructionCodeGeneratorARM64* GetInstructionCodeGeneratorArm64() { + return instruction_visitor_; + } + HGraphVisitor* GetInstructionVisitor() override { return GetInstructionCodeGeneratorArm64(); } Arm64Assembler* GetAssembler() override { return &assembler_; } const Arm64Assembler& GetAssembler() const override { return assembler_; } vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->GetVIXLAssembler(); } @@ -899,14 +985,22 @@ class CodeGeneratorARM64 : public CodeGenerator { static void EmitPcRelativeLinkerPatches(const ArenaDeque<PcRelativePatchInfo>& infos, ArenaVector<linker::LinkerPatch>* linker_patches); + // Returns whether SVE features are supported and should be used. + bool ShouldUseSVE() const; + // Labels for each block that will be compiled. // We use a deque so that the `vixl::aarch64::Label` objects do not move in memory. ArenaDeque<vixl::aarch64::Label> block_labels_; // Indexed by block id. 
vixl::aarch64::Label frame_entry_label_; ArenaVector<std::unique_ptr<JumpTableARM64>> jump_tables_; - LocationsBuilderARM64 location_builder_; - InstructionCodeGeneratorARM64 instruction_visitor_; + LocationsBuilderARM64Neon location_builder_neon_; + InstructionCodeGeneratorARM64Neon instruction_visitor_neon_; + LocationsBuilderARM64Sve location_builder_sve_; + InstructionCodeGeneratorARM64Sve instruction_visitor_sve_; + + LocationsBuilderARM64* location_builder_; + InstructionCodeGeneratorARM64* instruction_visitor_; ParallelMoveResolverARM64 move_resolver_; Arm64Assembler assembler_; diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc index df95c88c07..78720c3635 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc @@ -31,9 +31,11 @@ using helpers::DRegisterFrom; using helpers::HeapOperand; using helpers::InputRegisterAt; using helpers::Int64FromLocation; +using helpers::LocationFrom; using helpers::OutputRegister; +using helpers::QRegisterFrom; +using helpers::StackOperandFrom; using helpers::VRegisterFrom; -using helpers::WRegisterFrom; using helpers::XRegisterFrom; #define __ GetVIXLAssembler()-> @@ -47,7 +49,7 @@ static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) return kArm64EmitDotProdInstructions && codegen_->GetInstructionSetFeatures().HasDotProd(); } -void LocationsBuilderARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { +void LocationsBuilderARM64Neon::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); HInstruction* input = instruction->InputAt(0); switch (instruction->GetPackedType()) { @@ -78,7 +80,7 @@ void LocationsBuilderARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruc } } -void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = instruction->GetLocations(); Location src_loc = locations->InAt(0); VRegister dst = VRegisterFrom(locations->Out()); @@ -140,7 +142,7 @@ void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar* } } -void LocationsBuilderARM64::VisitVecExtractScalar(HVecExtractScalar* instruction) { +void LocationsBuilderARM64Neon::VisitVecExtractScalar(HVecExtractScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: @@ -164,7 +166,7 @@ void LocationsBuilderARM64::VisitVecExtractScalar(HVecExtractScalar* instruction } } -void InstructionCodeGeneratorARM64::VisitVecExtractScalar(HVecExtractScalar* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecExtractScalar(HVecExtractScalar* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister src = VRegisterFrom(locations->InAt(0)); switch (instruction->GetPackedType()) { @@ -215,11 +217,11 @@ static void CreateVecUnOpLocations(ArenaAllocator* allocator, HVecUnaryOperation } } -void LocationsBuilderARM64::VisitVecReduce(HVecReduce* instruction) { +void LocationsBuilderARM64Neon::VisitVecReduce(HVecReduce* instruction) { CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecReduce(HVecReduce* instruction) { +void 
InstructionCodeGeneratorARM64Neon::VisitVecReduce(HVecReduce* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister src = VRegisterFrom(locations->InAt(0)); VRegister dst = DRegisterFrom(locations->Out()); @@ -255,11 +257,11 @@ void InstructionCodeGeneratorARM64::VisitVecReduce(HVecReduce* instruction) { } } -void LocationsBuilderARM64::VisitVecCnv(HVecCnv* instruction) { +void LocationsBuilderARM64Neon::VisitVecCnv(HVecCnv* instruction) { CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecCnv(HVecCnv* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecCnv(HVecCnv* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister src = VRegisterFrom(locations->InAt(0)); VRegister dst = VRegisterFrom(locations->Out()); @@ -273,11 +275,11 @@ void InstructionCodeGeneratorARM64::VisitVecCnv(HVecCnv* instruction) { } } -void LocationsBuilderARM64::VisitVecNeg(HVecNeg* instruction) { +void LocationsBuilderARM64Neon::VisitVecNeg(HVecNeg* instruction) { CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecNeg(HVecNeg* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecNeg(HVecNeg* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister src = VRegisterFrom(locations->InAt(0)); VRegister dst = VRegisterFrom(locations->Out()); @@ -314,11 +316,11 @@ void InstructionCodeGeneratorARM64::VisitVecNeg(HVecNeg* instruction) { } } -void LocationsBuilderARM64::VisitVecAbs(HVecAbs* instruction) { +void LocationsBuilderARM64Neon::VisitVecAbs(HVecAbs* instruction) { CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecAbs(HVecAbs* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecAbs(HVecAbs* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister src = VRegisterFrom(locations->InAt(0)); VRegister dst = VRegisterFrom(locations->Out()); @@ -353,11 +355,11 @@ void InstructionCodeGeneratorARM64::VisitVecAbs(HVecAbs* instruction) { } } -void LocationsBuilderARM64::VisitVecNot(HVecNot* instruction) { +void LocationsBuilderARM64Neon::VisitVecNot(HVecNot* instruction) { CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecNot(HVecNot* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecNot(HVecNot* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister src = VRegisterFrom(locations->InAt(0)); VRegister dst = VRegisterFrom(locations->Out()); @@ -404,11 +406,11 @@ static void CreateVecBinOpLocations(ArenaAllocator* allocator, HVecBinaryOperati } } -void LocationsBuilderARM64::VisitVecAdd(HVecAdd* instruction) { +void LocationsBuilderARM64Neon::VisitVecAdd(HVecAdd* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecAdd(HVecAdd* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecAdd(HVecAdd* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -446,11 +448,11 @@ void InstructionCodeGeneratorARM64::VisitVecAdd(HVecAdd* instruction) { } } -void LocationsBuilderARM64::VisitVecSaturationAdd(HVecSaturationAdd* instruction) { +void LocationsBuilderARM64Neon::VisitVecSaturationAdd(HVecSaturationAdd* 
instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecSaturationAdd(HVecSaturationAdd* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecSaturationAdd(HVecSaturationAdd* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -478,11 +480,11 @@ void InstructionCodeGeneratorARM64::VisitVecSaturationAdd(HVecSaturationAdd* ins } } -void LocationsBuilderARM64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { +void LocationsBuilderARM64Neon::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -518,11 +520,11 @@ void InstructionCodeGeneratorARM64::VisitVecHalvingAdd(HVecHalvingAdd* instructi } } -void LocationsBuilderARM64::VisitVecSub(HVecSub* instruction) { +void LocationsBuilderARM64Neon::VisitVecSub(HVecSub* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecSub(HVecSub* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecSub(HVecSub* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -560,11 +562,11 @@ void InstructionCodeGeneratorARM64::VisitVecSub(HVecSub* instruction) { } } -void LocationsBuilderARM64::VisitVecSaturationSub(HVecSaturationSub* instruction) { +void LocationsBuilderARM64Neon::VisitVecSaturationSub(HVecSaturationSub* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecSaturationSub(HVecSaturationSub* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecSaturationSub(HVecSaturationSub* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -592,11 +594,11 @@ void InstructionCodeGeneratorARM64::VisitVecSaturationSub(HVecSaturationSub* ins } } -void LocationsBuilderARM64::VisitVecMul(HVecMul* instruction) { +void LocationsBuilderARM64Neon::VisitVecMul(HVecMul* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecMul(HVecMul* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecMul(HVecMul* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -630,11 +632,11 @@ void InstructionCodeGeneratorARM64::VisitVecMul(HVecMul* instruction) { } } -void LocationsBuilderARM64::VisitVecDiv(HVecDiv* instruction) { +void LocationsBuilderARM64Neon::VisitVecDiv(HVecDiv* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecDiv(HVecDiv* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecDiv(HVecDiv* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister 
lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -654,11 +656,11 @@ void InstructionCodeGeneratorARM64::VisitVecDiv(HVecDiv* instruction) { } } -void LocationsBuilderARM64::VisitVecMin(HVecMin* instruction) { +void LocationsBuilderARM64Neon::VisitVecMin(HVecMin* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecMin(HVecMin* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecMin(HVecMin* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -702,11 +704,11 @@ void InstructionCodeGeneratorARM64::VisitVecMin(HVecMin* instruction) { } } -void LocationsBuilderARM64::VisitVecMax(HVecMax* instruction) { +void LocationsBuilderARM64Neon::VisitVecMax(HVecMax* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecMax(HVecMax* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecMax(HVecMax* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -750,12 +752,12 @@ void InstructionCodeGeneratorARM64::VisitVecMax(HVecMax* instruction) { } } -void LocationsBuilderARM64::VisitVecAnd(HVecAnd* instruction) { +void LocationsBuilderARM64Neon::VisitVecAnd(HVecAnd* instruction) { // TODO: Allow constants supported by BIC (vector, immediate). CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecAnd(HVecAnd* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecAnd(HVecAnd* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -778,20 +780,20 @@ void InstructionCodeGeneratorARM64::VisitVecAnd(HVecAnd* instruction) { } } -void LocationsBuilderARM64::VisitVecAndNot(HVecAndNot* instruction) { +void LocationsBuilderARM64Neon::VisitVecAndNot(HVecAndNot* instruction) { LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); } -void InstructionCodeGeneratorARM64::VisitVecAndNot(HVecAndNot* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecAndNot(HVecAndNot* instruction) { // TODO: Use BIC (vector, register). 
LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); } -void LocationsBuilderARM64::VisitVecOr(HVecOr* instruction) { +void LocationsBuilderARM64Neon::VisitVecOr(HVecOr* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecOr(HVecOr* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecOr(HVecOr* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -814,11 +816,11 @@ void InstructionCodeGeneratorARM64::VisitVecOr(HVecOr* instruction) { } } -void LocationsBuilderARM64::VisitVecXor(HVecXor* instruction) { +void LocationsBuilderARM64Neon::VisitVecXor(HVecXor* instruction) { CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecXor(HVecXor* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecXor(HVecXor* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister rhs = VRegisterFrom(locations->InAt(1)); @@ -861,11 +863,11 @@ static void CreateVecShiftLocations(ArenaAllocator* allocator, HVecBinaryOperati } } -void LocationsBuilderARM64::VisitVecShl(HVecShl* instruction) { +void LocationsBuilderARM64Neon::VisitVecShl(HVecShl* instruction) { CreateVecShiftLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecShl(HVecShl* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecShl(HVecShl* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister dst = VRegisterFrom(locations->Out()); @@ -895,11 +897,11 @@ void InstructionCodeGeneratorARM64::VisitVecShl(HVecShl* instruction) { } } -void LocationsBuilderARM64::VisitVecShr(HVecShr* instruction) { +void LocationsBuilderARM64Neon::VisitVecShr(HVecShr* instruction) { CreateVecShiftLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecShr(HVecShr* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecShr(HVecShr* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister dst = VRegisterFrom(locations->Out()); @@ -929,11 +931,11 @@ void InstructionCodeGeneratorARM64::VisitVecShr(HVecShr* instruction) { } } -void LocationsBuilderARM64::VisitVecUShr(HVecUShr* instruction) { +void LocationsBuilderARM64Neon::VisitVecUShr(HVecUShr* instruction) { CreateVecShiftLocations(GetGraph()->GetAllocator(), instruction); } -void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecUShr(HVecUShr* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister lhs = VRegisterFrom(locations->InAt(0)); VRegister dst = VRegisterFrom(locations->Out()); @@ -963,7 +965,7 @@ void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) { } } -void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* instruction) { +void LocationsBuilderARM64Neon::VisitVecSetScalars(HVecSetScalars* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented @@ -995,7 +997,7 @@ void LocationsBuilderARM64::VisitVecSetScalars(HVecSetScalars* 
instruction) { } } -void InstructionCodeGeneratorARM64::VisitVecSetScalars(HVecSetScalars* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecSetScalars(HVecSetScalars* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister dst = VRegisterFrom(locations->Out()); @@ -1057,14 +1059,14 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { +void LocationsBuilderARM64Neon::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); } // Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a // 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result. // However vector MultiplyAccumulate instruction is not affected. -void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister acc = VRegisterFrom(locations->InAt(0)); VRegister left = VRegisterFrom(locations->InAt(1)); @@ -1105,7 +1107,7 @@ void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccum } } -void LocationsBuilderARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { +void LocationsBuilderARM64Neon::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); // Some conversions require temporary registers. LocationSummary* locations = instruction->GetLocations(); @@ -1147,7 +1149,7 @@ void LocationsBuilderARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction } } -void InstructionCodeGeneratorARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { LocationSummary* locations = instruction->GetLocations(); VRegister acc = VRegisterFrom(locations->InAt(0)); VRegister left = VRegisterFrom(locations->InAt(1)); @@ -1287,7 +1289,7 @@ void InstructionCodeGeneratorARM64::VisitVecSADAccumulate(HVecSADAccumulate* ins } } -void LocationsBuilderARM64::VisitVecDotProd(HVecDotProd* instruction) { +void LocationsBuilderARM64Neon::VisitVecDotProd(HVecDotProd* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); DCHECK(instruction->GetPackedType() == DataType::Type::kInt32); locations->SetInAt(0, Location::RequiresFpuRegister()); @@ -1302,7 +1304,7 @@ void LocationsBuilderARM64::VisitVecDotProd(HVecDotProd* instruction) { } } -void InstructionCodeGeneratorARM64::VisitVecDotProd(HVecDotProd* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecDotProd(HVecDotProd* instruction) { LocationSummary* locations = instruction->GetLocations(); DCHECK(locations->InAt(0).Equals(locations->Out())); VRegister acc = VRegisterFrom(locations->InAt(0)); @@ -1392,47 +1394,11 @@ static void CreateVecMemLocations(ArenaAllocator* allocator, } } -// Helper to set up locations for vector memory operations. Returns the memory operand and, -// if used, sets the output parameter scratch to a temporary register used in this operand, -// so that the client can release it right after the memory operand use. 
-MemOperand InstructionCodeGeneratorARM64::VecAddress( - HVecMemoryOperation* instruction, - UseScratchRegisterScope* temps_scope, - size_t size, - bool is_string_char_at, - /*out*/ Register* scratch) { - LocationSummary* locations = instruction->GetLocations(); - Register base = InputRegisterAt(instruction, 0); - - if (instruction->InputAt(1)->IsIntermediateAddressIndex()) { - DCHECK(!is_string_char_at); - return MemOperand(base.X(), InputRegisterAt(instruction, 1).X()); - } - - Location index = locations->InAt(1); - uint32_t offset = is_string_char_at - ? mirror::String::ValueOffset().Uint32Value() - : mirror::Array::DataOffset(size).Uint32Value(); - size_t shift = ComponentSizeShiftWidth(size); - - // HIntermediateAddress optimization is only applied for scalar ArrayGet and ArraySet. - DCHECK(!instruction->InputAt(0)->IsIntermediateAddress()); - - if (index.IsConstant()) { - offset += Int64FromLocation(index) << shift; - return HeapOperand(base, offset); - } else { - *scratch = temps_scope->AcquireSameSizeAs(base); - __ Add(*scratch, base, Operand(WRegisterFrom(index), LSL, shift)); - return HeapOperand(*scratch, offset); - } -} - -void LocationsBuilderARM64::VisitVecLoad(HVecLoad* instruction) { +void LocationsBuilderARM64Neon::VisitVecLoad(HVecLoad* instruction) { CreateVecMemLocations(GetGraph()->GetAllocator(), instruction, /*is_load*/ true); } -void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecLoad(HVecLoad* instruction) { LocationSummary* locations = instruction->GetLocations(); size_t size = DataType::Size(instruction->GetPackedType()); VRegister reg = VRegisterFrom(locations->Out()); @@ -1456,7 +1422,7 @@ void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { temps.Release(length); // no longer needed // Zero extend 8 compressed bytes into 8 chars. __ Ldr(DRegisterFrom(locations->Out()).V8B(), - VecAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch)); + VecNeonAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch)); __ Uxtl(reg.V8H(), reg.V8B()); __ B(&done); if (scratch.IsValid()) { @@ -1464,7 +1430,8 @@ void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { } // Load 8 direct uncompressed chars. 
__ Bind(&uncompressed_load); - __ Ldr(reg, VecAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch)); + __ Ldr(reg, + VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch)); __ Bind(&done); return; } @@ -1478,7 +1445,8 @@ void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { case DataType::Type::kFloat64: DCHECK_LE(2u, instruction->GetVectorLength()); DCHECK_LE(instruction->GetVectorLength(), 16u); - __ Ldr(reg, VecAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch)); + __ Ldr(reg, + VecNeonAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch)); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1486,11 +1454,11 @@ void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) { } } -void LocationsBuilderARM64::VisitVecStore(HVecStore* instruction) { +void LocationsBuilderARM64Neon::VisitVecStore(HVecStore* instruction) { CreateVecMemLocations(GetGraph()->GetAllocator(), instruction, /*is_load*/ false); } -void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecStore(HVecStore* instruction) { LocationSummary* locations = instruction->GetLocations(); size_t size = DataType::Size(instruction->GetPackedType()); VRegister reg = VRegisterFrom(locations->InAt(2)); @@ -1509,7 +1477,8 @@ void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) { case DataType::Type::kFloat64: DCHECK_LE(2u, instruction->GetVectorLength()); DCHECK_LE(instruction->GetVectorLength(), 16u); - __ Str(reg, VecAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + __ Str(reg, + VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1517,6 +1486,54 @@ void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) { } } +Location InstructionCodeGeneratorARM64Neon::AllocateSIMDScratchLocation( + vixl::aarch64::UseScratchRegisterScope* scope) { + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + return LocationFrom(scope->AcquireVRegisterOfSize(kQRegSize)); +} + +void InstructionCodeGeneratorARM64Neon::FreeSIMDScratchLocation(Location loc, + vixl::aarch64::UseScratchRegisterScope* scope) { + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + scope->Release(QRegisterFrom(loc)); +} + +void InstructionCodeGeneratorARM64Neon::LoadSIMDRegFromStack(Location destination, + Location source) { + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + __ Ldr(QRegisterFrom(destination), StackOperandFrom(source)); +} + +void InstructionCodeGeneratorARM64Neon::MoveSIMDRegToSIMDReg(Location destination, + Location source) { + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + __ Mov(QRegisterFrom(destination), QRegisterFrom(source)); +} + +void InstructionCodeGeneratorARM64Neon::MoveToSIMDStackSlot(Location destination, + Location source) { + DCHECK(destination.IsSIMDStackSlot()); + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + + if (source.IsFpuRegister()) { + __ Str(QRegisterFrom(source), StackOperandFrom(destination)); + } else { + DCHECK(source.IsSIMDStackSlot()); + UseScratchRegisterScope temps(GetVIXLAssembler()); + if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) { + Register temp = temps.AcquireX(); + __ Ldr(temp, MemOperand(sp, source.GetStackIndex())); 
+ __ Str(temp, MemOperand(sp, destination.GetStackIndex())); + __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize)); + __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize)); + } else { + VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize); + __ Ldr(temp, StackOperandFrom(source)); + __ Str(temp, StackOperandFrom(destination)); + } + } +} + #undef __ } // namespace arm64 diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc new file mode 100644 index 0000000000..5460ff28dd --- /dev/null +++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc @@ -0,0 +1,1540 @@ +/* + * Copyright (C) 2020 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "code_generator_arm64.h" + +#include "arch/arm64/instruction_set_features_arm64.h" +#include "mirror/array-inl.h" +#include "mirror/string.h" + +using namespace vixl::aarch64; // NOLINT(build/namespaces) + +namespace art { +namespace arm64 { + +using helpers::ARM64EncodableConstantOrRegister; +using helpers::Arm64CanEncodeConstantAsImmediate; +using helpers::DRegisterFrom; +using helpers::HeapOperand; +using helpers::InputRegisterAt; +using helpers::Int64FromLocation; +using helpers::LocationFrom; +using helpers::OutputRegister; +using helpers::QRegisterFrom; +using helpers::StackOperandFrom; +using helpers::VRegisterFrom; +using helpers::XRegisterFrom; + +#define __ GetVIXLAssembler()-> + +// Build-time switch for Armv8.4-a dot product instructions. +// TODO: Enable dot product when there is a device to test it on. +static constexpr bool kArm64EmitDotProdInstructions = false; + +// Returns whether dot product instructions should be emitted. 
+static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) { + return kArm64EmitDotProdInstructions && codegen_->GetInstructionSetFeatures().HasDotProd(); +} + +void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + HInstruction* input = instruction->InputAt(0); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, ARM64EncodableConstantOrRegister(input, instruction)); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + if (input->IsConstant() && + Arm64CanEncodeConstantAsImmediate(input->AsConstant(), instruction)) { + locations->SetInAt(0, Location::ConstantLocation(input->AsConstant())); + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + Location src_loc = locations->InAt(0); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (src_loc.IsConstant()) { + __ Movi(dst.V16B(), Int64FromLocation(src_loc)); + } else { + __ Dup(dst.V16B(), InputRegisterAt(instruction, 0)); + } + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (src_loc.IsConstant()) { + __ Movi(dst.V8H(), Int64FromLocation(src_loc)); + } else { + __ Dup(dst.V8H(), InputRegisterAt(instruction, 0)); + } + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (src_loc.IsConstant()) { + __ Movi(dst.V4S(), Int64FromLocation(src_loc)); + } else { + __ Dup(dst.V4S(), InputRegisterAt(instruction, 0)); + } + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + if (src_loc.IsConstant()) { + __ Movi(dst.V2D(), Int64FromLocation(src_loc)); + } else { + __ Dup(dst.V2D(), XRegisterFrom(src_loc)); + } + break; + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (src_loc.IsConstant()) { + __ Fmov(dst.V4S(), src_loc.GetConstant()->AsFloatConstant()->GetValue()); + } else { + __ Dup(dst.V4S(), VRegisterFrom(src_loc).V4S(), 0); + } + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + if (src_loc.IsConstant()) { + __ Fmov(dst.V2D(), src_loc.GetConstant()->AsDoubleConstant()->GetValue()); + } else { + __ Dup(dst.V2D(), VRegisterFrom(src_loc).V2D(), 0); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + 
switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorARM64Sve::VisitVecExtractScalar(HVecExtractScalar* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + switch (instruction->GetPackedType()) { + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Umov(OutputRegister(instruction), src.V4S(), 0); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Umov(OutputRegister(instruction), src.V2D(), 0); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 4u); + DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +// Helper to set up locations for vector unary operations. +static void CreateVecUnOpLocations(ArenaAllocator* allocator, HVecUnaryOperation* instruction) { + LocationSummary* locations = new (allocator) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), + instruction->IsVecNot() ? 
Location::kOutputOverlap + : Location::kNoOutputOverlap); + break; + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecReduce(HVecReduce* instruction) { + CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + VRegister dst = DRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + switch (instruction->GetReductionKind()) { + case HVecReduce::kSum: + __ Addv(dst.S(), src.V4S()); + break; + case HVecReduce::kMin: + __ Sminv(dst.S(), src.V4S()); + break; + case HVecReduce::kMax: + __ Smaxv(dst.S(), src.V4S()); + break; + } + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + switch (instruction->GetReductionKind()) { + case HVecReduce::kSum: + __ Addp(dst.D(), src.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD min/max"; + UNREACHABLE(); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecCnv(HVecCnv* instruction) { + CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + VRegister dst = VRegisterFrom(locations->Out()); + DataType::Type from = instruction->GetInputType(); + DataType::Type to = instruction->GetResultType(); + if (from == DataType::Type::kInt32 && to == DataType::Type::kFloat32) { + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Scvtf(dst.V4S(), src.V4S()); + } else { + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + } +} + +void LocationsBuilderARM64Sve::VisitVecNeg(HVecNeg* instruction) { + CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Neg(dst.V16B(), src.V16B()); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Neg(dst.V8H(), src.V8H()); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Neg(dst.V4S(), src.V4S()); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Neg(dst.V2D(), src.V2D()); + break; + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Fneg(dst.V4S(), 
src.V4S()); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fneg(dst.V2D(), src.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecAbs(HVecAbs* instruction) { + CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Abs(dst.V16B(), src.V16B()); + break; + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Abs(dst.V8H(), src.V8H()); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Abs(dst.V4S(), src.V4S()); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Abs(dst.V2D(), src.V2D()); + break; + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Fabs(dst.V4S(), src.V4S()); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fabs(dst.V2D(), src.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecNot(HVecNot* instruction) { + CreateVecUnOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister src = VRegisterFrom(locations->InAt(0)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: // special case boolean-not + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Movi(dst.V16B(), 1); + __ Eor(dst.V16B(), dst.V16B(), src.V16B()); + break; + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + __ Not(dst.V16B(), src.V16B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +// Helper to set up locations for vector binary operations. 
+static void CreateVecBinOpLocations(ArenaAllocator* allocator, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (allocator) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecAdd(HVecAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Add(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Add(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Add(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Add(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Fadd(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fadd(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecSaturationAdd(HVecSaturationAdd* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecSaturationAdd(HVecSaturationAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Uqadd(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Sqadd(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kUint16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Uqadd(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sqadd(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + 
CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + instruction->IsRounded() + ? __ Urhadd(dst.V16B(), lhs.V16B(), rhs.V16B()) + : __ Uhadd(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + instruction->IsRounded() + ? __ Srhadd(dst.V16B(), lhs.V16B(), rhs.V16B()) + : __ Shadd(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kUint16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + instruction->IsRounded() + ? __ Urhadd(dst.V8H(), lhs.V8H(), rhs.V8H()) + : __ Uhadd(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + instruction->IsRounded() + ? __ Srhadd(dst.V8H(), lhs.V8H(), rhs.V8H()) + : __ Shadd(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecSub(HVecSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Sub(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sub(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Sub(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Sub(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Fsub(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fsub(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecSaturationSub(HVecSaturationSub* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecSaturationSub(HVecSaturationSub* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Uqsub(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kInt8: + DCHECK_EQ(16u, 
instruction->GetVectorLength()); + __ Sqsub(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kUint16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Uqsub(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sqsub(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecMul(HVecMul* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Mul(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Mul(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Mul(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Fmul(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fmul(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecDiv(HVecDiv* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Fdiv(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fdiv(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecMin(HVecMin* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecMin(HVecMin* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Umin(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Smin(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kUint16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Umin(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case 
DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Smin(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kUint32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Umin(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Smin(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Fmin(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fmin(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecMax(HVecMax* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecMax(HVecMax* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Umax(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Smax(dst.V16B(), lhs.V16B(), rhs.V16B()); + break; + case DataType::Type::kUint16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Umax(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Smax(dst.V8H(), lhs.V8H(), rhs.V8H()); + break; + case DataType::Type::kUint32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Umax(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Smax(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Fmax(dst.V4S(), lhs.V4S(), rhs.V4S()); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Fmax(dst.V2D(), lhs.V2D(), rhs.V2D()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecAnd(HVecAnd* instruction) { + // TODO: Allow constants supported by BIC (vector, immediate). 
+ CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + __ And(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecAndNot(HVecAndNot* instruction) { + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecAndNot(HVecAndNot* instruction) { + // TODO: Use BIC (vector, register). + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); +} + +void LocationsBuilderARM64Sve::VisitVecOr(HVecOr* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + __ Orr(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecXor(HVecXor* instruction) { + CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister rhs = VRegisterFrom(locations->InAt(1)); + VRegister dst = VRegisterFrom(locations->Out()); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + __ Eor(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +// Helper to set up locations for vector shift operations. 
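The shift visitors that follow take the shift amount as an IR constant (note the ConstantLocation in the helper below) and encode it as an immediate in the instruction. Their per-lane meaning, as a plain C++ sketch (function names are made up for illustration):

#include <cstdint>

// Shl: shift left, zeros shifted in from the right.
uint32_t ShlLane(uint32_t lane, unsigned imm) { return lane << imm; }

// Sshr: arithmetic shift right, the sign bit is replicated.
// (Relies on the usual two's-complement '>>' behaviour for int32_t.)
int32_t SshrLane(int32_t lane, unsigned imm) { return lane >> imm; }

// Ushr: logical shift right, zeros shifted in from the left.
uint32_t UshrLane(uint32_t lane, unsigned imm) { return lane >> imm; }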
+static void CreateVecShiftLocations(ArenaAllocator* allocator, HVecBinaryOperation* instruction) { + LocationSummary* locations = new (allocator) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::ConstantLocation(instruction->InputAt(1)->AsConstant())); + locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecShl(HVecShl* instruction) { + CreateVecShiftLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister dst = VRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Shl(dst.V16B(), lhs.V16B(), value); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Shl(dst.V8H(), lhs.V8H(), value); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Shl(dst.V4S(), lhs.V4S(), value); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Shl(dst.V2D(), lhs.V2D(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecShr(HVecShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister dst = VRegisterFrom(locations->Out()); + int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Sshr(dst.V16B(), lhs.V16B(), value); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sshr(dst.V8H(), lhs.V8H(), value); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Sshr(dst.V4S(), lhs.V4S(), value); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Sshr(dst.V2D(), lhs.V2D(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecUShr(HVecUShr* instruction) { + CreateVecShiftLocations(GetGraph()->GetAllocator(), instruction); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister lhs = VRegisterFrom(locations->InAt(0)); + VRegister dst = VRegisterFrom(locations->Out()); + int32_t value = 
locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Ushr(dst.V16B(), lhs.V16B(), value); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Ushr(dst.V8H(), lhs.V8H(), value); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Ushr(dst.V4S(), lhs.V4S(), value); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Ushr(dst.V2D(), lhs.V2D(), value); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + HInstruction* input = instruction->InputAt(0); + bool is_zero = IsZeroBitPattern(input); + + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt(0, is_zero ? Location::ConstantLocation(input->AsConstant()) + : Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresFpuRegister()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void InstructionCodeGeneratorARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister dst = VRegisterFrom(locations->Out()); + + DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + + // Zero out all other elements first. + __ Movi(dst.V16B(), 0); + + // Shorthand for any type of zero. + if (IsZeroBitPattern(instruction->InputAt(0))) { + return; + } + + // Set required elements. + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Mov(dst.V16B(), 0, InputRegisterAt(instruction, 0)); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Mov(dst.V8H(), 0, InputRegisterAt(instruction, 0)); + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Mov(dst.V4S(), 0, InputRegisterAt(instruction, 0)); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Mov(dst.V2D(), 0, InputRegisterAt(instruction, 0)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +// Helper to set up locations for vector accumulations. 
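In the accumulation visitors below, the accumulator is both an input and the output, which is why the locations helper uses Location::SameAsFirstInput(). Per lane, Mla and Mls behave like this reference sketch (illustrative code, not ART's):

#include <array>
#include <cstddef>

// acc[i] += a[i] * b[i]   (Mla, the HInstruction::kAdd op kind)
template <typename T, std::size_t N>
void MlaRef(std::array<T, N>& acc, const std::array<T, N>& a, const std::array<T, N>& b) {
  for (std::size_t i = 0; i < N; ++i) acc[i] += a[i] * b[i];
}

// acc[i] -= a[i] * b[i]   (Mls, the subtract op kind)
template <typename T, std::size_t N>
void MlsRef(std::array<T, N>& acc, const std::array<T, N>& a, const std::array<T, N>& b) {
  for (std::size_t i = 0; i < N; ++i) acc[i] -= a[i] * b[i];
}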
+static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* instruction) { + LocationSummary* locations = new (allocator) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(2, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { + CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +} + +// Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a +// 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result. +// However vector MultiplyAccumulate instruction is not affected. +void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister acc = VRegisterFrom(locations->InAt(0)); + VRegister left = VRegisterFrom(locations->InAt(1)); + VRegister right = VRegisterFrom(locations->InAt(2)); + + DCHECK(locations->InAt(0).Equals(locations->Out())); + + switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + if (instruction->GetOpKind() == HInstruction::kAdd) { + __ Mla(acc.V16B(), left.V16B(), right.V16B()); + } else { + __ Mls(acc.V16B(), left.V16B(), right.V16B()); + } + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + if (instruction->GetOpKind() == HInstruction::kAdd) { + __ Mla(acc.V8H(), left.V8H(), right.V8H()); + } else { + __ Mls(acc.V8H(), left.V8H(), right.V8H()); + } + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + if (instruction->GetOpKind() == HInstruction::kAdd) { + __ Mla(acc.V4S(), left.V4S(), right.V4S()); + } else { + __ Mls(acc.V4S(), left.V4S(), right.V4S()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { + CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); + // Some conversions require temporary registers. 
+ LocationSummary* locations = instruction->GetLocations(); + HVecOperation* a = instruction->InputAt(1)->AsVecOperation(); + HVecOperation* b = instruction->InputAt(2)->AsVecOperation(); + DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), + HVecOperation::ToSignedType(b->GetPackedType())); + switch (a->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + switch (instruction->GetPackedType()) { + case DataType::Type::kInt64: + locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + FALLTHROUGH_INTENDED; + case DataType::Type::kInt32: + locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + break; + default: + break; + } + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + if (instruction->GetPackedType() == DataType::Type::kInt64) { + locations->AddTemp(Location::RequiresFpuRegister()); + locations->AddTemp(Location::RequiresFpuRegister()); + } + break; + case DataType::Type::kInt32: + case DataType::Type::kInt64: + if (instruction->GetPackedType() == a->GetPackedType()) { + locations->AddTemp(Location::RequiresFpuRegister()); + } + break; + default: + break; + } +} + +void InstructionCodeGeneratorARM64Sve::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { + LocationSummary* locations = instruction->GetLocations(); + VRegister acc = VRegisterFrom(locations->InAt(0)); + VRegister left = VRegisterFrom(locations->InAt(1)); + VRegister right = VRegisterFrom(locations->InAt(2)); + + DCHECK(locations->InAt(0).Equals(locations->Out())); + + // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S). + HVecOperation* a = instruction->InputAt(1)->AsVecOperation(); + HVecOperation* b = instruction->InputAt(2)->AsVecOperation(); + DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), + HVecOperation::ToSignedType(b->GetPackedType())); + switch (a->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, a->GetVectorLength()); + switch (instruction->GetPackedType()) { + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Sabal(acc.V8H(), left.V8B(), right.V8B()); + __ Sabal2(acc.V8H(), left.V16B(), right.V16B()); + break; + case DataType::Type::kInt32: { + DCHECK_EQ(4u, instruction->GetVectorLength()); + VRegister tmp1 = VRegisterFrom(locations->GetTemp(0)); + VRegister tmp2 = VRegisterFrom(locations->GetTemp(1)); + __ Sxtl(tmp1.V8H(), left.V8B()); + __ Sxtl(tmp2.V8H(), right.V8B()); + __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H()); + __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H()); + __ Sxtl2(tmp1.V8H(), left.V16B()); + __ Sxtl2(tmp2.V8H(), right.V16B()); + __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H()); + __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H()); + break; + } + case DataType::Type::kInt64: { + DCHECK_EQ(2u, instruction->GetVectorLength()); + VRegister tmp1 = VRegisterFrom(locations->GetTemp(0)); + VRegister tmp2 = VRegisterFrom(locations->GetTemp(1)); + VRegister tmp3 = VRegisterFrom(locations->GetTemp(2)); + VRegister tmp4 = VRegisterFrom(locations->GetTemp(3)); + __ Sxtl(tmp1.V8H(), left.V8B()); + __ Sxtl(tmp2.V8H(), right.V8B()); + __ Sxtl(tmp3.V4S(), tmp1.V4H()); + __ Sxtl(tmp4.V4S(), tmp2.V4H()); + __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S()); + __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S()); + __ Sxtl2(tmp3.V4S(), tmp1.V8H()); + __ Sxtl2(tmp4.V4S(), tmp2.V8H()); + __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S()); + __ Sabal2(acc.V2D(), tmp3.V4S(), 
tmp4.V4S()); + __ Sxtl2(tmp1.V8H(), left.V16B()); + __ Sxtl2(tmp2.V8H(), right.V16B()); + __ Sxtl(tmp3.V4S(), tmp1.V4H()); + __ Sxtl(tmp4.V4S(), tmp2.V4H()); + __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S()); + __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S()); + __ Sxtl2(tmp3.V4S(), tmp1.V8H()); + __ Sxtl2(tmp4.V4S(), tmp2.V8H()); + __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S()); + __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S()); + break; + } + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, a->GetVectorLength()); + switch (instruction->GetPackedType()) { + case DataType::Type::kInt32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Sabal(acc.V4S(), left.V4H(), right.V4H()); + __ Sabal2(acc.V4S(), left.V8H(), right.V8H()); + break; + case DataType::Type::kInt64: { + DCHECK_EQ(2u, instruction->GetVectorLength()); + VRegister tmp1 = VRegisterFrom(locations->GetTemp(0)); + VRegister tmp2 = VRegisterFrom(locations->GetTemp(1)); + __ Sxtl(tmp1.V4S(), left.V4H()); + __ Sxtl(tmp2.V4S(), right.V4H()); + __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S()); + __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S()); + __ Sxtl2(tmp1.V4S(), left.V8H()); + __ Sxtl2(tmp2.V4S(), right.V8H()); + __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S()); + __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S()); + break; + } + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } + break; + case DataType::Type::kInt32: + DCHECK_EQ(4u, a->GetVectorLength()); + switch (instruction->GetPackedType()) { + case DataType::Type::kInt32: { + DCHECK_EQ(4u, instruction->GetVectorLength()); + VRegister tmp = VRegisterFrom(locations->GetTemp(0)); + __ Sub(tmp.V4S(), left.V4S(), right.V4S()); + __ Abs(tmp.V4S(), tmp.V4S()); + __ Add(acc.V4S(), acc.V4S(), tmp.V4S()); + break; + } + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Sabal(acc.V2D(), left.V2S(), right.V2S()); + __ Sabal2(acc.V2D(), left.V4S(), right.V4S()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, a->GetVectorLength()); + switch (instruction->GetPackedType()) { + case DataType::Type::kInt64: { + DCHECK_EQ(2u, instruction->GetVectorLength()); + VRegister tmp = VRegisterFrom(locations->GetTemp(0)); + __ Sub(tmp.V2D(), left.V2D(), right.V2D()); + __ Abs(tmp.V2D(), tmp.V2D()); + __ Add(acc.V2D(), acc.V2D(), tmp.V2D()); + break; + } + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + } +} + +void LocationsBuilderARM64Sve::VisitVecDotProd(HVecDotProd* instruction) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + DCHECK(instruction->GetPackedType() == DataType::Type::kInt32); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetInAt(2, Location::RequiresFpuRegister()); + locations->SetOut(Location::SameAsFirstInput()); + + // For Int8 and Uint8 general case we need a temp register. 
+ if ((DataType::Size(instruction->InputAt(1)->AsVecOperation()->GetPackedType()) == 1) && + !ShouldEmitDotProductInstructions(codegen_)) { + locations->AddTemp(Location::RequiresFpuRegister()); + } +} + +void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) { + LocationSummary* locations = instruction->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + VRegister acc = VRegisterFrom(locations->InAt(0)); + VRegister left = VRegisterFrom(locations->InAt(1)); + VRegister right = VRegisterFrom(locations->InAt(2)); + HVecOperation* a = instruction->InputAt(1)->AsVecOperation(); + HVecOperation* b = instruction->InputAt(2)->AsVecOperation(); + DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), + HVecOperation::ToSignedType(b->GetPackedType())); + DCHECK_EQ(instruction->GetPackedType(), DataType::Type::kInt32); + DCHECK_EQ(4u, instruction->GetVectorLength()); + + size_t inputs_data_size = DataType::Size(a->GetPackedType()); + switch (inputs_data_size) { + case 1u: { + DCHECK_EQ(16u, a->GetVectorLength()); + if (instruction->IsZeroExtending()) { + if (ShouldEmitDotProductInstructions(codegen_)) { + __ Udot(acc.V4S(), left.V16B(), right.V16B()); + } else { + VRegister tmp = VRegisterFrom(locations->GetTemp(0)); + __ Umull(tmp.V8H(), left.V8B(), right.V8B()); + __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H()); + __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H()); + + __ Umull2(tmp.V8H(), left.V16B(), right.V16B()); + __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H()); + __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H()); + } + } else { + if (ShouldEmitDotProductInstructions(codegen_)) { + __ Sdot(acc.V4S(), left.V16B(), right.V16B()); + } else { + VRegister tmp = VRegisterFrom(locations->GetTemp(0)); + __ Smull(tmp.V8H(), left.V8B(), right.V8B()); + __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H()); + __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H()); + + __ Smull2(tmp.V8H(), left.V16B(), right.V16B()); + __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H()); + __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H()); + } + } + break; + } + case 2u: + DCHECK_EQ(8u, a->GetVectorLength()); + if (instruction->IsZeroExtending()) { + __ Umlal(acc.V4S(), left.V4H(), right.V4H()); + __ Umlal2(acc.V4S(), left.V8H(), right.V8H()); + } else { + __ Smlal(acc.V4S(), left.V4H(), right.V4H()); + __ Smlal2(acc.V4S(), left.V8H(), right.V8H()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type size: " << inputs_data_size; + } +} + +// Helper to set up locations for vector memory operations. 
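For reference, one Sdot/Udot step in the dot-product visitor above folds four adjacent byte products into each 32-bit accumulator lane; the Smull/Saddw widening fallback computes the same sums when the dot-product extension is unavailable. A scalar sketch of the signed case (illustrative, not ART code):

#include <array>
#include <cstddef>
#include <cstdint>

std::array<int32_t, 4> SdotRef(std::array<int32_t, 4> acc,
                               const std::array<int8_t, 16>& a,
                               const std::array<int8_t, 16>& b) {
  for (std::size_t i = 0; i < 4; ++i) {      // one 32-bit accumulator lane
    for (std::size_t j = 0; j < 4; ++j) {    // four adjacent byte pairs
      acc[i] += static_cast<int32_t>(a[4 * i + j]) * static_cast<int32_t>(b[4 * i + j]);
    }
  }
  return acc;
}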
+static void CreateVecMemLocations(ArenaAllocator* allocator, + HVecMemoryOperation* instruction, + bool is_load) { + LocationSummary* locations = new (allocator) LocationSummary(instruction); + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kInt64: + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1))); + if (is_load) { + locations->SetOut(Location::RequiresFpuRegister()); + } else { + locations->SetInAt(2, Location::RequiresFpuRegister()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecLoad(HVecLoad* instruction) { + CreateVecMemLocations(GetGraph()->GetAllocator(), instruction, /*is_load*/ true); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) { + LocationSummary* locations = instruction->GetLocations(); + size_t size = DataType::Size(instruction->GetPackedType()); + VRegister reg = VRegisterFrom(locations->Out()); + UseScratchRegisterScope temps(GetVIXLAssembler()); + Register scratch; + + switch (instruction->GetPackedType()) { + case DataType::Type::kInt16: // (short) s.charAt(.) can yield HVecLoad/Int16/StringCharAt. + case DataType::Type::kUint16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + // Special handling of compressed/uncompressed string load. + if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { + vixl::aarch64::Label uncompressed_load, done; + // Test compression bit. + static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u, + "Expecting 0=compressed, 1=uncompressed"); + uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); + Register length = temps.AcquireW(); + __ Ldr(length, HeapOperand(InputRegisterAt(instruction, 0), count_offset)); + __ Tbnz(length.W(), 0, &uncompressed_load); + temps.Release(length); // no longer needed + // Zero extend 8 compressed bytes into 8 chars. + __ Ldr(DRegisterFrom(locations->Out()).V8B(), + VecNeonAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch)); + __ Uxtl(reg.V8H(), reg.V8B()); + __ B(&done); + if (scratch.IsValid()) { + temps.Release(scratch); // if used, no longer needed + } + // Load 8 direct uncompressed chars. 
+ __ Bind(&uncompressed_load); + __ Ldr(reg, + VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch)); + __ Bind(&done); + return; + } + FALLTHROUGH_INTENDED; + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kInt32: + case DataType::Type::kFloat32: + case DataType::Type::kInt64: + case DataType::Type::kFloat64: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ Ldr(reg, + VecNeonAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecStore(HVecStore* instruction) { + CreateVecMemLocations(GetGraph()->GetAllocator(), instruction, /*is_load*/ false); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) { + LocationSummary* locations = instruction->GetLocations(); + size_t size = DataType::Size(instruction->GetPackedType()); + VRegister reg = VRegisterFrom(locations->InAt(2)); + UseScratchRegisterScope temps(GetVIXLAssembler()); + Register scratch; + + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + case DataType::Type::kUint16: + case DataType::Type::kInt16: + case DataType::Type::kInt32: + case DataType::Type::kFloat32: + case DataType::Type::kInt64: + case DataType::Type::kFloat64: + DCHECK_LE(2u, instruction->GetVectorLength()); + DCHECK_LE(instruction->GetVectorLength(), 16u); + __ Str(reg, + VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +Location InstructionCodeGeneratorARM64Sve::AllocateSIMDScratchLocation( + vixl::aarch64::UseScratchRegisterScope* scope) { + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + return LocationFrom(scope->AcquireVRegisterOfSize(kQRegSize)); +} + +void InstructionCodeGeneratorARM64Sve::FreeSIMDScratchLocation(Location loc, + vixl::aarch64::UseScratchRegisterScope* scope) { + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + scope->Release(QRegisterFrom(loc)); +} + +void InstructionCodeGeneratorARM64Sve::LoadSIMDRegFromStack(Location destination, + Location source) { + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + __ Ldr(QRegisterFrom(destination), StackOperandFrom(source)); +} + +void InstructionCodeGeneratorARM64Sve::MoveSIMDRegToSIMDReg(Location destination, + Location source) { + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + __ Mov(QRegisterFrom(destination), QRegisterFrom(source)); +} + +void InstructionCodeGeneratorARM64Sve::MoveToSIMDStackSlot(Location destination, + Location source) { + DCHECK(destination.IsSIMDStackSlot()); + DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); + + if (source.IsFpuRegister()) { + __ Str(QRegisterFrom(source), StackOperandFrom(destination)); + } else { + DCHECK(source.IsSIMDStackSlot()); + UseScratchRegisterScope temps(GetVIXLAssembler()); + if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) { + Register temp = temps.AcquireX(); + __ Ldr(temp, MemOperand(sp, source.GetStackIndex())); + __ Str(temp, MemOperand(sp, destination.GetStackIndex())); + __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize)); + __ Str(temp, 
MemOperand(sp, destination.GetStackIndex() + kArm64WordSize)); + } else { + VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize); + __ Ldr(temp, StackOperandFrom(source)); + __ Str(temp, StackOperandFrom(destination)); + } + } +} + +#undef __ + +} // namespace arm64 +} // namespace art diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 342789348c..e02a393c1b 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1387,7 +1387,7 @@ class HLoopInformationOutwardIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(HLoopInformationOutwardIterator); }; -#define FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M) \ +#define FOR_EACH_CONCRETE_INSTRUCTION_SCALAR_COMMON(M) \ M(Above, Condition) \ M(AboveOrEqual, Condition) \ M(Abs, UnaryOperation) \ @@ -1477,7 +1477,9 @@ class HLoopInformationOutwardIterator : public ValueObject { M(TryBoundary, Instruction) \ M(TypeConversion, Instruction) \ M(UShr, BinaryOperation) \ - M(Xor, BinaryOperation) \ + M(Xor, BinaryOperation) + +#define FOR_EACH_CONCRETE_INSTRUCTION_VECTOR_COMMON(M) \ M(VecReplicateScalar, VecUnaryOperation) \ M(VecExtractScalar, VecUnaryOperation) \ M(VecReduce, VecUnaryOperation) \ @@ -1508,6 +1510,10 @@ class HLoopInformationOutwardIterator : public ValueObject { M(VecLoad, VecMemoryOperation) \ M(VecStore, VecMemoryOperation) \ +#define FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M) \ + FOR_EACH_CONCRETE_INSTRUCTION_SCALAR_COMMON(M) \ + FOR_EACH_CONCRETE_INSTRUCTION_VECTOR_COMMON(M) + /* * Instructions, shared across several (not all) architectures. */ |
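The nodes.h change above splits the common instruction list so that scalar and vector instructions can be iterated separately, while FOR_EACH_CONCRETE_INSTRUCTION_COMMON still expands to the combined list. A toy sketch of how such X-macro composition behaves (simplified names, not the real lists):

#define FOR_EACH_TOY_SCALAR(M)  M(Add) M(Xor)
#define FOR_EACH_TOY_VECTOR(M)  M(VecAdd) M(VecStore)
#define FOR_EACH_TOY_COMMON(M)  FOR_EACH_TOY_SCALAR(M) FOR_EACH_TOY_VECTOR(M)

#define DECLARE_VISIT(name) void Visit##name();
struct ToyVisitor {
  // Expands to VisitAdd, VisitXor, VisitVecAdd and VisitVecStore declarations,
  // exactly as if the two sub-lists were still a single list.
  FOR_EACH_TOY_COMMON(DECLARE_VISIT)
};
#undef DECLARE_VISIT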