From 8ba4de1a5684686447a578bdc425321fd3bccca6 Mon Sep 17 00:00:00 2001 From: Artem Serov Date: Wed, 4 Dec 2019 21:10:23 +0000 Subject: ART: Implement predicated SIMD vectorization. This CL brings support for predicated execution for auto-vectorizer and implements arm64 SVE vector backend. This version passes all the VIXL simulator-runnable tests in SVE mode with checker off (as all VecOp CHECKs need to be adjusted for an extra input) and all tests in NEON mode. Test: art SIMD tests on VIXL simulator. Test: art tests on FVP (steps in test/README.arm_fvp.md) Change-Id: Ib78bde31a15e6713d875d6668ad4458f5519605f --- compiler/optimizing/code_generator_arm64.cc | 29 +- compiler/optimizing/code_generator_arm64.h | 24 +- .../optimizing/code_generator_vector_arm64_neon.cc | 56 +- .../optimizing/code_generator_vector_arm64_sve.cc | 967 +++++++++------------ compiler/optimizing/common_arm64.h | 7 +- compiler/optimizing/instruction_simplifier.cc | 108 ++- .../optimizing/instruction_simplifier_arm64.cc | 9 +- compiler/optimizing/loop_analysis.cc | 18 +- compiler/optimizing/loop_analysis.h | 7 +- compiler/optimizing/loop_optimization.cc | 187 +++- compiler/optimizing/loop_optimization.h | 20 +- compiler/optimizing/nodes_vector.h | 9 + 12 files changed, 746 insertions(+), 695 deletions(-) (limited to 'compiler/optimizing') diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index b945be208f..f5d78367fe 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -994,7 +994,7 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph, } bool CodeGeneratorARM64::ShouldUseSVE() const { - return kArm64AllowSVE && GetInstructionSetFeatures().HasSVE(); + return GetInstructionSetFeatures().HasSVE(); } #define __ GetVIXLAssembler()-> @@ -6908,7 +6908,7 @@ void CodeGeneratorARM64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_ } } -MemOperand InstructionCodeGeneratorARM64::VecNeonAddress( +MemOperand InstructionCodeGeneratorARM64::VecNEONAddress( HVecMemoryOperation* instruction, UseScratchRegisterScope* temps_scope, size_t size, @@ -6941,6 +6941,31 @@ MemOperand InstructionCodeGeneratorARM64::VecNeonAddress( } } +SVEMemOperand InstructionCodeGeneratorARM64::VecSVEAddress( + HVecMemoryOperation* instruction, + UseScratchRegisterScope* temps_scope, + size_t size, + bool is_string_char_at, + /*out*/ Register* scratch) { + LocationSummary* locations = instruction->GetLocations(); + Register base = InputRegisterAt(instruction, 0); + Location index = locations->InAt(1); + + // TODO: Support intermediate address sharing for SVE accesses. + DCHECK(!instruction->InputAt(1)->IsIntermediateAddressIndex()); + DCHECK(!instruction->InputAt(0)->IsIntermediateAddress()); + DCHECK(!index.IsConstant()); + + uint32_t offset = is_string_char_at + ? 
mirror::String::ValueOffset().Uint32Value() + : mirror::Array::DataOffset(size).Uint32Value(); + size_t shift = ComponentSizeShiftWidth(size); + + *scratch = temps_scope->AcquireSameSizeAs(base); + __ Add(*scratch, base, offset); + return SVEMemOperand(scratch->X(), XRegisterFrom(index), LSL, shift); +} + #undef __ #undef QUICK_ENTRY_POINT diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index affc640f1e..eb3e9546e0 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -54,9 +54,6 @@ static constexpr size_t kArm64WordSize = static_cast(kArm64PointerSize); static constexpr int kMaxMacroInstructionSizeInBytes = 15 * vixl::aarch64::kInstructionSize; static constexpr int kInvokeCodeMarginSizeInBytes = 6 * kMaxMacroInstructionSizeInBytes; -// SVE is currently not enabled. -static constexpr bool kArm64AllowSVE = false; - static const vixl::aarch64::Register kParameterCoreRegisters[] = { vixl::aarch64::x1, vixl::aarch64::x2, @@ -388,11 +385,19 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { void GenerateIntRemForPower2Denom(HRem *instruction); void HandleGoto(HInstruction* got, HBasicBlock* successor); - // Helper to set up locations for vector memory operations. Returns the memory operand and, + // Helpers to set up locations for vector memory operations. Returns the memory operand and, // if used, sets the output parameter scratch to a temporary register used in this operand, // so that the client can release it right after the memory operand use. // Neon version. - vixl::aarch64::MemOperand VecNeonAddress( + vixl::aarch64::MemOperand VecNEONAddress( + HVecMemoryOperation* instruction, + // This function may acquire a scratch register. + vixl::aarch64::UseScratchRegisterScope* temps_scope, + size_t size, + bool is_string_char_at, + /*out*/ vixl::aarch64::Register* scratch); + // SVE version. + vixl::aarch64::SVEMemOperand VecSVEAddress( HVecMemoryOperation* instruction, // This function may acquire a scratch register. vixl::aarch64::UseScratchRegisterScope* temps_scope, @@ -490,6 +495,15 @@ class InstructionCodeGeneratorARM64Sve : public InstructionCodeGeneratorARM64 { void LoadSIMDRegFromStack(Location destination, Location source) override; void MoveSIMDRegToSIMDReg(Location destination, Location source) override; void MoveToSIMDStackSlot(Location destination, Location source) override; + + private: + // Returns default predicate register which is used as governing vector predicate + // to implement predicated loop execution. + // + // TODO: This is a hack to be addressed when register allocator supports SIMD types. 
+ static vixl::aarch64::PRegister LoopPReg() { + return vixl::aarch64::p0; + } }; class LocationsBuilderARM64Sve : public LocationsBuilderARM64 { diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc index 2a4c785ddb..bd64166655 100644 --- a/compiler/optimizing/code_generator_vector_arm64_neon.cc +++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc @@ -25,8 +25,6 @@ using namespace vixl::aarch64; // NOLINT(build/namespaces) namespace art { namespace arm64 { -using helpers::ARM64EncodableConstantOrRegister; -using helpers::Arm64CanEncodeConstantAsImmediate; using helpers::DRegisterFrom; using helpers::HeapOperand; using helpers::InputRegisterAt; @@ -40,6 +38,38 @@ using helpers::XRegisterFrom; #define __ GetVIXLAssembler()-> +// Returns whether the value of the constant can be directly encoded into the instruction as +// immediate. +inline bool NEONCanEncodeConstantAsImmediate(HConstant* constant, HInstruction* instr) { + // TODO: Improve this when IsSIMDConstantEncodable method is implemented in VIXL. + if (instr->IsVecReplicateScalar()) { + if (constant->IsLongConstant()) { + return false; + } else if (constant->IsFloatConstant()) { + return vixl::aarch64::Assembler::IsImmFP32(constant->AsFloatConstant()->GetValue()); + } else if (constant->IsDoubleConstant()) { + return vixl::aarch64::Assembler::IsImmFP64(constant->AsDoubleConstant()->GetValue()); + } + int64_t value = CodeGenerator::GetInt64ValueOf(constant); + return IsUint<8>(value); + } + return false; +} + +// Returns +// - constant location - if 'constant' is an actual constant and its value can be +// encoded into the instruction. +// - register location otherwise. +inline Location NEONEncodableConstantOrRegister(HInstruction* constant, + HInstruction* instr) { + if (constant->IsConstant() + && NEONCanEncodeConstantAsImmediate(constant->AsConstant(), instr)) { + return Location::ConstantLocation(constant->AsConstant()); + } + + return Location::RequiresRegister(); +} + // Returns whether dot product instructions should be emitted. static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) { return codegen_->GetInstructionSetFeatures().HasDotProd(); @@ -56,13 +86,13 @@ void LocationsBuilderARM64Neon::VisitVecReplicateScalar(HVecReplicateScalar* ins case DataType::Type::kInt16: case DataType::Type::kInt32: case DataType::Type::kInt64: - locations->SetInAt(0, ARM64EncodableConstantOrRegister(input, instruction)); + locations->SetInAt(0, NEONEncodableConstantOrRegister(input, instruction)); locations->SetOut(Location::RequiresFpuRegister()); break; case DataType::Type::kFloat32: case DataType::Type::kFloat64: if (input->IsConstant() && - Arm64CanEncodeConstantAsImmediate(input->AsConstant(), instruction)) { + NEONCanEncodeConstantAsImmediate(input->AsConstant(), instruction)) { locations->SetInAt(0, Location::ConstantLocation(input->AsConstant())); locations->SetOut(Location::RequiresFpuRegister()); } else { @@ -1418,7 +1448,7 @@ void InstructionCodeGeneratorARM64Neon::VisitVecLoad(HVecLoad* instruction) { temps.Release(length); // no longer needed // Zero extend 8 compressed bytes into 8 chars. 
__ Ldr(DRegisterFrom(locations->Out()).V8B(), - VecNeonAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch)); + VecNEONAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch)); __ Uxtl(reg.V8H(), reg.V8B()); __ B(&done); if (scratch.IsValid()) { @@ -1427,7 +1457,7 @@ void InstructionCodeGeneratorARM64Neon::VisitVecLoad(HVecLoad* instruction) { // Load 8 direct uncompressed chars. __ Bind(&uncompressed_load); __ Ldr(reg, - VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch)); + VecNEONAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch)); __ Bind(&done); return; } @@ -1442,7 +1472,7 @@ void InstructionCodeGeneratorARM64Neon::VisitVecLoad(HVecLoad* instruction) { DCHECK_LE(2u, instruction->GetVectorLength()); DCHECK_LE(instruction->GetVectorLength(), 16u); __ Ldr(reg, - VecNeonAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch)); + VecNEONAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch)); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1474,7 +1504,7 @@ void InstructionCodeGeneratorARM64Neon::VisitVecStore(HVecStore* instruction) { DCHECK_LE(2u, instruction->GetVectorLength()); DCHECK_LE(instruction->GetVectorLength(), 16u); __ Str(reg, - VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + VecNEONAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1483,13 +1513,13 @@ void InstructionCodeGeneratorARM64Neon::VisitVecStore(HVecStore* instruction) { } void LocationsBuilderARM64Neon::VisitVecPredSetAll(HVecPredSetAll* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); - UNREACHABLE(); + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + DCHECK(instruction->InputAt(0)->IsIntConstant()); + locations->SetInAt(0, Location::NoLocation()); + locations->SetOut(Location::NoLocation()); } -void InstructionCodeGeneratorARM64Neon::VisitVecPredSetAll(HVecPredSetAll* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); - UNREACHABLE(); +void InstructionCodeGeneratorARM64Neon::VisitVecPredSetAll(HVecPredSetAll*) { } void LocationsBuilderARM64Neon::VisitVecPredWhile(HVecPredWhile* instruction) { diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc index 1761dfc792..2254673337 100644 --- a/compiler/optimizing/code_generator_vector_arm64_sve.cc +++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc @@ -25,8 +25,6 @@ using namespace vixl::aarch64; // NOLINT(build/namespaces) namespace art { namespace arm64 { -using helpers::ARM64EncodableConstantOrRegister; -using helpers::Arm64CanEncodeConstantAsImmediate; using helpers::DRegisterFrom; using helpers::HeapOperand; using helpers::InputRegisterAt; @@ -36,13 +34,41 @@ using helpers::OutputRegister; using helpers::QRegisterFrom; using helpers::StackOperandFrom; using helpers::VRegisterFrom; +using helpers::ZRegisterFrom; using helpers::XRegisterFrom; #define __ GetVIXLAssembler()-> -// Returns whether dot product instructions should be emitted. -static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) { - return codegen_->GetInstructionSetFeatures().HasDotProd(); +// Returns whether the value of the constant can be directly encoded into the instruction as +// immediate. 
+static bool SVECanEncodeConstantAsImmediate(HConstant* constant, HInstruction* instr) { + if (instr->IsVecReplicateScalar()) { + if (constant->IsLongConstant()) { + return false; + } else if (constant->IsFloatConstant()) { + return vixl::aarch64::Assembler::IsImmFP32(constant->AsFloatConstant()->GetValue()); + } else if (constant->IsDoubleConstant()) { + return vixl::aarch64::Assembler::IsImmFP64(constant->AsDoubleConstant()->GetValue()); + } + // TODO: Make use of shift part of DUP instruction. + int64_t value = CodeGenerator::GetInt64ValueOf(constant); + return IsInt<8>(value); + } + + return false; +} + +// Returns +// - constant location - if 'constant' is an actual constant and its value can be +// encoded into the instruction. +// - register location otherwise. +inline Location SVEEncodableConstantOrRegister(HInstruction* constant, HInstruction* instr) { + if (constant->IsConstant() + && SVECanEncodeConstantAsImmediate(constant->AsConstant(), instr)) { + return Location::ConstantLocation(constant->AsConstant()); + } + + return Location::RequiresRegister(); } void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { @@ -56,13 +82,13 @@ void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* inst case DataType::Type::kInt16: case DataType::Type::kInt32: case DataType::Type::kInt64: - locations->SetInAt(0, ARM64EncodableConstantOrRegister(input, instruction)); + locations->SetInAt(0, SVEEncodableConstantOrRegister(input, instruction)); locations->SetOut(Location::RequiresFpuRegister()); break; case DataType::Type::kFloat32: case DataType::Type::kFloat64: if (input->IsConstant() && - Arm64CanEncodeConstantAsImmediate(input->AsConstant(), instruction)) { + SVECanEncodeConstantAsImmediate(input->AsConstant(), instruction)) { locations->SetInAt(0, Location::ConstantLocation(input->AsConstant())); locations->SetOut(Location::RequiresFpuRegister()); } else { @@ -77,59 +103,60 @@ void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* inst } void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); Location src_loc = locations->InAt(0); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister dst = ZRegisterFrom(locations->Out()); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { - __ Movi(dst.V16B(), Int64FromLocation(src_loc)); + __ Dup(dst.VnB(), Int64FromLocation(src_loc)); } else { - __ Dup(dst.V16B(), InputRegisterAt(instruction, 0)); + __ Dup(dst.VnB(), InputRegisterAt(instruction, 0)); } break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { - __ Movi(dst.V8H(), Int64FromLocation(src_loc)); + __ Dup(dst.VnH(), Int64FromLocation(src_loc)); } else { - __ Dup(dst.V8H(), InputRegisterAt(instruction, 0)); + __ Dup(dst.VnH(), InputRegisterAt(instruction, 0)); } break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { - __ Movi(dst.V4S(), Int64FromLocation(src_loc)); + __ Dup(dst.VnS(), Int64FromLocation(src_loc)); } else { - __ Dup(dst.V4S(), InputRegisterAt(instruction, 0)); + __ Dup(dst.VnS(), InputRegisterAt(instruction, 0)); } break; case DataType::Type::kInt64: DCHECK_EQ(2u, 
instruction->GetVectorLength()); if (src_loc.IsConstant()) { - __ Movi(dst.V2D(), Int64FromLocation(src_loc)); + __ Dup(dst.VnD(), Int64FromLocation(src_loc)); } else { - __ Dup(dst.V2D(), XRegisterFrom(src_loc)); + __ Dup(dst.VnD(), XRegisterFrom(src_loc)); } break; case DataType::Type::kFloat32: DCHECK_EQ(4u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { - __ Fmov(dst.V4S(), src_loc.GetConstant()->AsFloatConstant()->GetValue()); + __ Fdup(dst.VnS(), src_loc.GetConstant()->AsFloatConstant()->GetValue()); } else { - __ Dup(dst.V4S(), VRegisterFrom(src_loc).V4S(), 0); + __ Dup(dst.VnS(), ZRegisterFrom(src_loc).VnS(), 0); } break; case DataType::Type::kFloat64: DCHECK_EQ(2u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { - __ Fmov(dst.V2D(), src_loc.GetConstant()->AsDoubleConstant()->GetValue()); + __ Fdup(dst.VnD(), src_loc.GetConstant()->AsDoubleConstant()->GetValue()); } else { - __ Dup(dst.V2D(), VRegisterFrom(src_loc).V2D(), 0); + __ Dup(dst.VnD(), ZRegisterFrom(src_loc).VnD(), 0); } break; default: @@ -163,8 +190,9 @@ void LocationsBuilderARM64Sve::VisitVecExtractScalar(HVecExtractScalar* instruct } void InstructionCodeGeneratorARM64Sve::VisitVecExtractScalar(HVecExtractScalar* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister src = VRegisterFrom(locations->InAt(0)); + const VRegister src = VRegisterFrom(locations->InAt(0)); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); @@ -218,32 +246,31 @@ void LocationsBuilderARM64Sve::VisitVecReduce(HVecReduce* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister src = VRegisterFrom(locations->InAt(0)); - VRegister dst = DRegisterFrom(locations->Out()); + const ZRegister src = ZRegisterFrom(locations->InAt(0)); + const VRegister dst = DRegisterFrom(locations->Out()); + const PRegister p_reg = LoopPReg(); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); switch (instruction->GetReductionKind()) { case HVecReduce::kSum: - __ Addv(dst.S(), src.V4S()); - break; - case HVecReduce::kMin: - __ Sminv(dst.S(), src.V4S()); - break; - case HVecReduce::kMax: - __ Smaxv(dst.S(), src.V4S()); + __ Saddv(dst.S(), p_reg, src.VnS()); break; + default: + LOG(FATAL) << "Unsupported SIMD instruction"; + UNREACHABLE(); } break; case DataType::Type::kInt64: DCHECK_EQ(2u, instruction->GetVectorLength()); switch (instruction->GetReductionKind()) { case HVecReduce::kSum: - __ Addp(dst.D(), src.V2D()); + __ Uaddv(dst.D(), p_reg, src.VnD()); break; default: - LOG(FATAL) << "Unsupported SIMD min/max"; + LOG(FATAL) << "Unsupported SIMD instruction"; UNREACHABLE(); } break; @@ -258,14 +285,16 @@ void LocationsBuilderARM64Sve::VisitVecCnv(HVecCnv* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister src = VRegisterFrom(locations->InAt(0)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister src = ZRegisterFrom(locations->InAt(0)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); DataType::Type from = instruction->GetInputType(); DataType::Type to = 
instruction->GetResultType(); if (from == DataType::Type::kInt32 && to == DataType::Type::kFloat32) { DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Scvtf(dst.V4S(), src.V4S()); + __ Scvtf(dst.VnS(), p_reg, src.VnS()); } else { LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); } @@ -276,35 +305,37 @@ void LocationsBuilderARM64Sve::VisitVecNeg(HVecNeg* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister src = VRegisterFrom(locations->InAt(0)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister src = ZRegisterFrom(locations->InAt(0)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Neg(dst.V16B(), src.V16B()); + __ Neg(dst.VnB(), p_reg, src.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Neg(dst.V8H(), src.V8H()); + __ Neg(dst.VnH(), p_reg, src.VnH()); break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Neg(dst.V4S(), src.V4S()); + __ Neg(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kInt64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Neg(dst.V2D(), src.V2D()); + __ Neg(dst.VnD(), p_reg, src.VnD()); break; case DataType::Type::kFloat32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Fneg(dst.V4S(), src.V4S()); + __ Fneg(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kFloat64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Fneg(dst.V2D(), src.V2D()); + __ Fneg(dst.VnD(), p_reg, src.VnD()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -317,33 +348,35 @@ void LocationsBuilderARM64Sve::VisitVecAbs(HVecAbs* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister src = VRegisterFrom(locations->InAt(0)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister src = ZRegisterFrom(locations->InAt(0)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Abs(dst.V16B(), src.V16B()); + __ Abs(dst.VnB(), p_reg, src.VnB()); break; case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Abs(dst.V8H(), src.V8H()); + __ Abs(dst.VnH(), p_reg, src.VnH()); break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Abs(dst.V4S(), src.V4S()); + __ Abs(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kInt64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Abs(dst.V2D(), src.V2D()); + __ Abs(dst.VnD(), p_reg, src.VnD()); break; case DataType::Type::kFloat32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Fabs(dst.V4S(), src.V4S()); + __ Fabs(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kFloat64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Fabs(dst.V2D(), src.V2D()); + __ Fabs(dst.VnD(), p_reg, src.VnD()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -356,22 
+389,30 @@ void LocationsBuilderARM64Sve::VisitVecNot(HVecNot* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister src = VRegisterFrom(locations->InAt(0)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister src = ZRegisterFrom(locations->InAt(0)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kBool: // special case boolean-not DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Movi(dst.V16B(), 1); - __ Eor(dst.V16B(), dst.V16B(), src.V16B()); + __ Dup(dst.VnB(), 1); + __ Eor(dst.VnB(), p_reg, dst.VnB(), src.VnB()); break; case DataType::Type::kUint8: case DataType::Type::kInt8: + __ Not(dst.VnB(), p_reg, src.VnB()); + break; case DataType::Type::kUint16: case DataType::Type::kInt16: + __ Not(dst.VnH(), p_reg, src.VnH()); + break; case DataType::Type::kInt32: + __ Not(dst.VnS(), p_reg, src.VnS()); + break; case DataType::Type::kInt64: - __ Not(dst.V16B(), src.V16B()); // lanes do not matter + __ Not(dst.VnD(), p_reg, src.VnD()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -394,7 +435,7 @@ static void CreateVecBinOpLocations(ArenaAllocator* allocator, HVecBinaryOperati case DataType::Type::kFloat64: locations->SetInAt(0, Location::RequiresFpuRegister()); locations->SetInAt(1, Location::RequiresFpuRegister()); - locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap); + locations->SetOut(Location::SameAsFirstInput()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -407,36 +448,38 @@ void LocationsBuilderARM64Sve::VisitVecAdd(HVecAdd* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Add(dst.V16B(), lhs.V16B(), rhs.V16B()); + __ Add(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Add(dst.V8H(), lhs.V8H(), rhs.V8H()); + __ Add(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Add(dst.V4S(), lhs.V4S(), rhs.V4S()); + __ Add(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Add(dst.V2D(), lhs.V2D(), rhs.V2D()); + __ Add(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Fadd(dst.V4S(), lhs.V4S(), rhs.V4S()); + __ Fadd(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation); break; case DataType::Type::kFloat64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Fadd(dst.V2D(), lhs.V2D(), 
rhs.V2D()); + __ Fadd(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -445,75 +488,23 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) { } void LocationsBuilderARM64Sve::VisitVecSaturationAdd(HVecSaturationAdd* instruction) { - CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void InstructionCodeGeneratorARM64Sve::VisitVecSaturationAdd(HVecSaturationAdd* instruction) { - LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); - switch (instruction->GetPackedType()) { - case DataType::Type::kUint8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Uqadd(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Sqadd(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kUint16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Uqadd(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Sqadd(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - UNREACHABLE(); - } + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void LocationsBuilderARM64Sve::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { - CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void InstructionCodeGeneratorARM64Sve::VisitVecHalvingAdd(HVecHalvingAdd* instruction) { - LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); - switch (instruction->GetPackedType()) { - case DataType::Type::kUint8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - instruction->IsRounded() - ? __ Urhadd(dst.V16B(), lhs.V16B(), rhs.V16B()) - : __ Uhadd(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - instruction->IsRounded() - ? __ Srhadd(dst.V16B(), lhs.V16B(), rhs.V16B()) - : __ Shadd(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kUint16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - instruction->IsRounded() - ? __ Urhadd(dst.V8H(), lhs.V8H(), rhs.V8H()) - : __ Uhadd(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - instruction->IsRounded() - ? 
__ Srhadd(dst.V8H(), lhs.V8H(), rhs.V8H()) - : __ Shadd(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - UNREACHABLE(); - } + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void LocationsBuilderARM64Sve::VisitVecSub(HVecSub* instruction) { @@ -521,36 +512,38 @@ void LocationsBuilderARM64Sve::VisitVecSub(HVecSub* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Sub(dst.V16B(), lhs.V16B(), rhs.V16B()); + __ Sub(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Sub(dst.V8H(), lhs.V8H(), rhs.V8H()); + __ Sub(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Sub(dst.V4S(), lhs.V4S(), rhs.V4S()); + __ Sub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Sub(dst.V2D(), lhs.V2D(), rhs.V2D()); + __ Sub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Fsub(dst.V4S(), lhs.V4S(), rhs.V4S()); + __ Fsub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kFloat64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Fsub(dst.V2D(), lhs.V2D(), rhs.V2D()); + __ Fsub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -559,35 +552,13 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) { } void LocationsBuilderARM64Sve::VisitVecSaturationSub(HVecSaturationSub* instruction) { - CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void InstructionCodeGeneratorARM64Sve::VisitVecSaturationSub(HVecSaturationSub* instruction) { - LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); - switch (instruction->GetPackedType()) { - case DataType::Type::kUint8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Uqsub(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Sqsub(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kUint16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Uqsub(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Sqsub(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - default: - LOG(FATAL) << "Unsupported SIMD type: " 
<< instruction->GetPackedType(); - UNREACHABLE(); - } + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void LocationsBuilderARM64Sve::VisitVecMul(HVecMul* instruction) { @@ -595,32 +566,38 @@ void LocationsBuilderARM64Sve::VisitVecMul(HVecMul* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Mul(dst.V16B(), lhs.V16B(), rhs.V16B()); + __ Mul(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Mul(dst.V8H(), lhs.V8H(), rhs.V8H()); + __ Mul(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Mul(dst.V4S(), lhs.V4S(), rhs.V4S()); + __ Mul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); + break; + case DataType::Type::kInt64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Mul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Fmul(dst.V4S(), lhs.V4S(), rhs.V4S()); + __ Fmul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation); break; case DataType::Type::kFloat64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Fmul(dst.V2D(), lhs.V2D(), rhs.V2D()); + __ Fmul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -633,18 +610,22 @@ void LocationsBuilderARM64Sve::VisitVecDiv(HVecDiv* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); + + // Note: VIXL guarantees StrictNaNPropagation for Fdiv. 
switch (instruction->GetPackedType()) { case DataType::Type::kFloat32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Fdiv(dst.V4S(), lhs.V4S(), rhs.V4S()); + __ Fdiv(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kFloat64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Fdiv(dst.V2D(), lhs.V2D(), rhs.V2D()); + __ Fdiv(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -653,99 +634,23 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) { } void LocationsBuilderARM64Sve::VisitVecMin(HVecMin* instruction) { - CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void InstructionCodeGeneratorARM64Sve::VisitVecMin(HVecMin* instruction) { - LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); - switch (instruction->GetPackedType()) { - case DataType::Type::kUint8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Umin(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Smin(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kUint16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Umin(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Smin(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - case DataType::Type::kUint32: - DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Umin(dst.V4S(), lhs.V4S(), rhs.V4S()); - break; - case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Smin(dst.V4S(), lhs.V4S(), rhs.V4S()); - break; - case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Fmin(dst.V4S(), lhs.V4S(), rhs.V4S()); - break; - case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Fmin(dst.V2D(), lhs.V2D(), rhs.V2D()); - break; - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - UNREACHABLE(); - } + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void LocationsBuilderARM64Sve::VisitVecMax(HVecMax* instruction) { - CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction); + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void InstructionCodeGeneratorARM64Sve::VisitVecMax(HVecMax* instruction) { - LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); - switch (instruction->GetPackedType()) { - case DataType::Type::kUint8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Umax(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Smax(dst.V16B(), lhs.V16B(), rhs.V16B()); - break; - case DataType::Type::kUint16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Umax(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Smax(dst.V8H(), lhs.V8H(), rhs.V8H()); - break; - case DataType::Type::kUint32: - 
DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Umax(dst.V4S(), lhs.V4S(), rhs.V4S()); - break; - case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Smax(dst.V4S(), lhs.V4S(), rhs.V4S()); - break; - case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Fmax(dst.V4S(), lhs.V4S(), rhs.V4S()); - break; - case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Fmax(dst.V2D(), lhs.V2D(), rhs.V2D()); - break; - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - UNREACHABLE(); - } + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void LocationsBuilderARM64Sve::VisitVecAnd(HVecAnd* instruction) { @@ -754,21 +659,29 @@ void LocationsBuilderARM64Sve::VisitVecAnd(HVecAnd* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: + __ And(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); + break; case DataType::Type::kUint16: case DataType::Type::kInt16: + __ And(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); + break; case DataType::Type::kInt32: - case DataType::Type::kInt64: case DataType::Type::kFloat32: + __ And(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); + break; + case DataType::Type::kInt64: case DataType::Type::kFloat64: - __ And(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter + __ And(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -790,21 +703,29 @@ void LocationsBuilderARM64Sve::VisitVecOr(HVecOr* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: + __ Orr(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); + break; case DataType::Type::kUint16: case DataType::Type::kInt16: + __ Orr(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); + break; case DataType::Type::kInt32: - case DataType::Type::kInt64: case DataType::Type::kFloat32: + __ Orr(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); + break; + case DataType::Type::kInt64: case DataType::Type::kFloat64: - __ Orr(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter + __ Orr(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -817,21 +738,29 @@ void 
LocationsBuilderARM64Sve::VisitVecXor(HVecXor* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister rhs = VRegisterFrom(locations->InAt(1)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: + __ Eor(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); + break; case DataType::Type::kUint16: case DataType::Type::kInt16: + __ Eor(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); + break; case DataType::Type::kInt32: - case DataType::Type::kInt64: case DataType::Type::kFloat32: + __ Eor(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); + break; + case DataType::Type::kInt64: case DataType::Type::kFloat64: - __ Eor(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter + __ Eor(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -864,28 +793,30 @@ void LocationsBuilderARM64Sve::VisitVecShl(HVecShl* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Shl(dst.V16B(), lhs.V16B(), value); + __ Lsl(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Shl(dst.V8H(), lhs.V8H(), value); + __ Lsl(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Shl(dst.V4S(), lhs.V4S(), value); + __ Lsl(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Shl(dst.V2D(), lhs.V2D(), value); + __ Lsl(dst.VnD(), p_reg, lhs.VnD(), value); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -898,28 +829,30 @@ void LocationsBuilderARM64Sve::VisitVecShr(HVecShr* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: DCHECK_EQ(16u, 
instruction->GetVectorLength()); - __ Sshr(dst.V16B(), lhs.V16B(), value); + __ Asr(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Sshr(dst.V8H(), lhs.V8H(), value); + __ Asr(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Sshr(dst.V4S(), lhs.V4S(), value); + __ Asr(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Sshr(dst.V2D(), lhs.V2D(), value); + __ Asr(dst.VnD(), p_reg, lhs.VnD(), value); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -932,28 +865,30 @@ void LocationsBuilderARM64Sve::VisitVecUShr(HVecUShr* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister lhs = VRegisterFrom(locations->InAt(0)); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); + const ZRegister dst = ZRegisterFrom(locations->Out()); + const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); - __ Ushr(dst.V16B(), lhs.V16B(), value); + __ Lsr(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Ushr(dst.V8H(), lhs.V8H(), value); + __ Lsr(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Ushr(dst.V4S(), lhs.V4S(), value); + __ Lsr(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Ushr(dst.V2D(), lhs.V2D(), value); + __ Lsr(dst.VnD(), p_reg, lhs.VnD(), value); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -964,7 +899,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) { void LocationsBuilderARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); - DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + DCHECK_EQ(2u, instruction->InputCount()); // only one input currently implemented + predicate. HInstruction* input = instruction->InputAt(0); bool is_zero = IsZeroBitPattern(input); @@ -994,14 +929,16 @@ void LocationsBuilderARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister dst = VRegisterFrom(locations->Out()); + const ZRegister z_dst = ZRegisterFrom(locations->Out()); - DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented + DCHECK_EQ(2u, instruction->InputCount()); // only one input currently implemented + predicate. // Zero out all other elements first. - __ Movi(dst.V16B(), 0); + __ Dup(z_dst.VnB(), 0); + const VRegister dst = VRegisterFrom(locations->Out()); // Shorthand for any type of zero. 
if (IsZeroBitPattern(instruction->InputAt(0))) { return; } @@ -1062,11 +999,14 @@ void LocationsBuilderARM64Sve::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate // Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a // 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result. // However vector MultiplyAccumulate instruction is not affected. -void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { +void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( + HVecMultiplyAccumulate* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); - VRegister acc = VRegisterFrom(locations->InAt(0)); - VRegister left = VRegisterFrom(locations->InAt(1)); - VRegister right = VRegisterFrom(locations->InAt(2)); + const ZRegister acc = ZRegisterFrom(locations->InAt(0)); + const ZRegister left = ZRegisterFrom(locations->InAt(1)); + const ZRegister right = ZRegisterFrom(locations->InAt(2)); + const PRegisterM p_reg = LoopPReg().Merging(); DCHECK(locations->InAt(0).Equals(locations->Out())); @@ -1075,26 +1015,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAc case DataType::Type::kInt8: DCHECK_EQ(16u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { - __ Mla(acc.V16B(), left.V16B(), right.V16B()); + __ Mla(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB()); } else { - __ Mls(acc.V16B(), left.V16B(), right.V16B()); + __ Mls(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB()); } break; case DataType::Type::kUint16: case DataType::Type::kInt16: DCHECK_EQ(8u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { - __ Mla(acc.V8H(), left.V8H(), right.V8H()); + __ Mla(acc.VnH(), p_reg, acc.VnH(), left.VnH(), right.VnH()); } else { - __ Mls(acc.V8H(), left.V8H(), right.V8H()); + __ Mls(acc.VnH(), p_reg, acc.VnH(), left.VnH(), right.VnH()); } break; case DataType::Type::kInt32: DCHECK_EQ(4u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { - __ Mla(acc.V4S(), left.V4S(), right.V4S()); + __ Mla(acc.VnS(), p_reg, acc.VnS(), left.VnS(), right.VnS()); } else { - __ Mls(acc.V4S(), left.V4S(), right.V4S()); + __ Mls(acc.VnS(), p_reg, acc.VnS(), left.VnS(), right.VnS()); } break; default: @@ -1104,185 +1044,13 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAc } void LocationsBuilderARM64Sve::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); - // Some conversions require temporary registers.
- LocationSummary* locations = instruction->GetLocations(); - HVecOperation* a = instruction->InputAt(1)->AsVecOperation(); - HVecOperation* b = instruction->InputAt(2)->AsVecOperation(); - DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), - HVecOperation::ToSignedType(b->GetPackedType())); - switch (a->GetPackedType()) { - case DataType::Type::kUint8: - case DataType::Type::kInt8: - switch (instruction->GetPackedType()) { - case DataType::Type::kInt64: - locations->AddTemp(Location::RequiresFpuRegister()); - locations->AddTemp(Location::RequiresFpuRegister()); - FALLTHROUGH_INTENDED; - case DataType::Type::kInt32: - locations->AddTemp(Location::RequiresFpuRegister()); - locations->AddTemp(Location::RequiresFpuRegister()); - break; - default: - break; - } - break; - case DataType::Type::kUint16: - case DataType::Type::kInt16: - if (instruction->GetPackedType() == DataType::Type::kInt64) { - locations->AddTemp(Location::RequiresFpuRegister()); - locations->AddTemp(Location::RequiresFpuRegister()); - } - break; - case DataType::Type::kInt32: - case DataType::Type::kInt64: - if (instruction->GetPackedType() == a->GetPackedType()) { - locations->AddTemp(Location::RequiresFpuRegister()); - } - break; - default: - break; - } + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void InstructionCodeGeneratorARM64Sve::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { - LocationSummary* locations = instruction->GetLocations(); - VRegister acc = VRegisterFrom(locations->InAt(0)); - VRegister left = VRegisterFrom(locations->InAt(1)); - VRegister right = VRegisterFrom(locations->InAt(2)); - - DCHECK(locations->InAt(0).Equals(locations->Out())); - - // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S). 
- HVecOperation* a = instruction->InputAt(1)->AsVecOperation(); - HVecOperation* b = instruction->InputAt(2)->AsVecOperation(); - DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), - HVecOperation::ToSignedType(b->GetPackedType())); - switch (a->GetPackedType()) { - case DataType::Type::kUint8: - case DataType::Type::kInt8: - DCHECK_EQ(16u, a->GetVectorLength()); - switch (instruction->GetPackedType()) { - case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); - __ Sabal(acc.V8H(), left.V8B(), right.V8B()); - __ Sabal2(acc.V8H(), left.V16B(), right.V16B()); - break; - case DataType::Type::kInt32: { - DCHECK_EQ(4u, instruction->GetVectorLength()); - VRegister tmp1 = VRegisterFrom(locations->GetTemp(0)); - VRegister tmp2 = VRegisterFrom(locations->GetTemp(1)); - __ Sxtl(tmp1.V8H(), left.V8B()); - __ Sxtl(tmp2.V8H(), right.V8B()); - __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H()); - __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H()); - __ Sxtl2(tmp1.V8H(), left.V16B()); - __ Sxtl2(tmp2.V8H(), right.V16B()); - __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H()); - __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H()); - break; - } - case DataType::Type::kInt64: { - DCHECK_EQ(2u, instruction->GetVectorLength()); - VRegister tmp1 = VRegisterFrom(locations->GetTemp(0)); - VRegister tmp2 = VRegisterFrom(locations->GetTemp(1)); - VRegister tmp3 = VRegisterFrom(locations->GetTemp(2)); - VRegister tmp4 = VRegisterFrom(locations->GetTemp(3)); - __ Sxtl(tmp1.V8H(), left.V8B()); - __ Sxtl(tmp2.V8H(), right.V8B()); - __ Sxtl(tmp3.V4S(), tmp1.V4H()); - __ Sxtl(tmp4.V4S(), tmp2.V4H()); - __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S()); - __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S()); - __ Sxtl2(tmp3.V4S(), tmp1.V8H()); - __ Sxtl2(tmp4.V4S(), tmp2.V8H()); - __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S()); - __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S()); - __ Sxtl2(tmp1.V8H(), left.V16B()); - __ Sxtl2(tmp2.V8H(), right.V16B()); - __ Sxtl(tmp3.V4S(), tmp1.V4H()); - __ Sxtl(tmp4.V4S(), tmp2.V4H()); - __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S()); - __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S()); - __ Sxtl2(tmp3.V4S(), tmp1.V8H()); - __ Sxtl2(tmp4.V4S(), tmp2.V8H()); - __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S()); - __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S()); - break; - } - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - UNREACHABLE(); - } - break; - case DataType::Type::kUint16: - case DataType::Type::kInt16: - DCHECK_EQ(8u, a->GetVectorLength()); - switch (instruction->GetPackedType()) { - case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); - __ Sabal(acc.V4S(), left.V4H(), right.V4H()); - __ Sabal2(acc.V4S(), left.V8H(), right.V8H()); - break; - case DataType::Type::kInt64: { - DCHECK_EQ(2u, instruction->GetVectorLength()); - VRegister tmp1 = VRegisterFrom(locations->GetTemp(0)); - VRegister tmp2 = VRegisterFrom(locations->GetTemp(1)); - __ Sxtl(tmp1.V4S(), left.V4H()); - __ Sxtl(tmp2.V4S(), right.V4H()); - __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S()); - __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S()); - __ Sxtl2(tmp1.V4S(), left.V8H()); - __ Sxtl2(tmp2.V4S(), right.V8H()); - __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S()); - __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S()); - break; - } - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - UNREACHABLE(); - } - break; - case DataType::Type::kInt32: - DCHECK_EQ(4u, a->GetVectorLength()); - switch (instruction->GetPackedType()) { - case DataType::Type::kInt32: { - DCHECK_EQ(4u, 
instruction->GetVectorLength()); - VRegister tmp = VRegisterFrom(locations->GetTemp(0)); - __ Sub(tmp.V4S(), left.V4S(), right.V4S()); - __ Abs(tmp.V4S(), tmp.V4S()); - __ Add(acc.V4S(), acc.V4S(), tmp.V4S()); - break; - } - case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); - __ Sabal(acc.V2D(), left.V2S(), right.V2S()); - __ Sabal2(acc.V2D(), left.V4S(), right.V4S()); - break; - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - UNREACHABLE(); - } - break; - case DataType::Type::kInt64: - DCHECK_EQ(2u, a->GetVectorLength()); - switch (instruction->GetPackedType()) { - case DataType::Type::kInt64: { - DCHECK_EQ(2u, instruction->GetVectorLength()); - VRegister tmp = VRegisterFrom(locations->GetTemp(0)); - __ Sub(tmp.V2D(), left.V2D(), right.V2D()); - __ Abs(tmp.V2D(), tmp.V2D()); - __ Add(acc.V2D(), acc.V2D(), tmp.V2D()); - break; - } - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - UNREACHABLE(); - } - break; - default: - LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); - } + LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId(); + UNREACHABLE(); } void LocationsBuilderARM64Sve::VisitVecDotProd(HVecDotProd* instruction) { @@ -1293,19 +1061,17 @@ void LocationsBuilderARM64Sve::VisitVecDotProd(HVecDotProd* instruction) { locations->SetInAt(2, Location::RequiresFpuRegister()); locations->SetOut(Location::SameAsFirstInput()); - // For Int8 and Uint8 general case we need a temp register. - if ((DataType::Size(instruction->InputAt(1)->AsVecOperation()->GetPackedType()) == 1) && - !ShouldEmitDotProductInstructions(codegen_)) { - locations->AddTemp(Location::RequiresFpuRegister()); - } + locations->AddTemp(Location::RequiresFpuRegister()); } void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); DCHECK(locations->InAt(0).Equals(locations->Out())); - VRegister acc = VRegisterFrom(locations->InAt(0)); - VRegister left = VRegisterFrom(locations->InAt(1)); - VRegister right = VRegisterFrom(locations->InAt(2)); + const ZRegister acc = ZRegisterFrom(locations->InAt(0)); + const ZRegister left = ZRegisterFrom(locations->InAt(1)); + const ZRegister right = ZRegisterFrom(locations->InAt(2)); + const PRegisterM p_reg = LoopPReg().Merging(); HVecOperation* a = instruction->InputAt(1)->AsVecOperation(); HVecOperation* b = instruction->InputAt(2)->AsVecOperation(); DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), @@ -1317,45 +1083,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) switch (inputs_data_size) { case 1u: { DCHECK_EQ(16u, a->GetVectorLength()); + UseScratchRegisterScope temps(GetVIXLAssembler()); + const ZRegister tmp0 = temps.AcquireZ(); + const ZRegister tmp1 = ZRegisterFrom(locations->GetTemp(0)); + + __ Dup(tmp1.VnB(), 0u); + __ Sel(tmp0.VnB(), p_reg, left.VnB(), tmp1.VnB()); + __ Sel(tmp1.VnB(), p_reg, right.VnB(), tmp1.VnB()); if (instruction->IsZeroExtending()) { - if (ShouldEmitDotProductInstructions(codegen_)) { - __ Udot(acc.V4S(), left.V16B(), right.V16B()); - } else { - VRegister tmp = VRegisterFrom(locations->GetTemp(0)); - __ Umull(tmp.V8H(), left.V8B(), right.V8B()); - __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H()); - __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H()); - - __ Umull2(tmp.V8H(), left.V16B(), right.V16B()); - __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H()); - __ Uaddw2(acc.V4S(), 
acc.V4S(), tmp.V8H()); - } + __ Udot(acc.VnS(), acc.VnS(), tmp0.VnB(), tmp1.VnB()); } else { - if (ShouldEmitDotProductInstructions(codegen_)) { - __ Sdot(acc.V4S(), left.V16B(), right.V16B()); - } else { - VRegister tmp = VRegisterFrom(locations->GetTemp(0)); - __ Smull(tmp.V8H(), left.V8B(), right.V8B()); - __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H()); - __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H()); - - __ Smull2(tmp.V8H(), left.V16B(), right.V16B()); - __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H()); - __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H()); - } + __ Sdot(acc.VnS(), acc.VnS(), tmp0.VnB(), tmp1.VnB()); } break; } - case 2u: - DCHECK_EQ(8u, a->GetVectorLength()); - if (instruction->IsZeroExtending()) { - __ Umlal(acc.V4S(), left.V4H(), right.V4H()); - __ Umlal2(acc.V4S(), left.V8H(), right.V8H()); - } else { - __ Smlal(acc.V4S(), left.V4H(), right.V4H()); - __ Smlal2(acc.V4S(), left.V8H(), right.V8H()); - } - break; default: LOG(FATAL) << "Unsupported SIMD type size: " << inputs_data_size; } @@ -1395,54 +1136,39 @@ void LocationsBuilderARM64Sve::VisitVecLoad(HVecLoad* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); size_t size = DataType::Size(instruction->GetPackedType()); - VRegister reg = VRegisterFrom(locations->Out()); + const ZRegister reg = ZRegisterFrom(locations->Out()); UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; + const PRegisterZ p_reg = LoopPReg().Zeroing(); switch (instruction->GetPackedType()) { case DataType::Type::kInt16: // (short) s.charAt(.) can yield HVecLoad/Int16/StringCharAt. case DataType::Type::kUint16: DCHECK_EQ(8u, instruction->GetVectorLength()); - // Special handling of compressed/uncompressed string load. - if (mirror::kUseStringCompression && instruction->IsStringCharAt()) { - vixl::aarch64::Label uncompressed_load, done; - // Test compression bit. - static_assert(static_cast(mirror::StringCompressionFlag::kCompressed) == 0u, - "Expecting 0=compressed, 1=uncompressed"); - uint32_t count_offset = mirror::String::CountOffset().Uint32Value(); - Register length = temps.AcquireW(); - __ Ldr(length, HeapOperand(InputRegisterAt(instruction, 0), count_offset)); - __ Tbnz(length.W(), 0, &uncompressed_load); - temps.Release(length); // no longer needed - // Zero extend 8 compressed bytes into 8 chars. - __ Ldr(DRegisterFrom(locations->Out()).V8B(), - VecNeonAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch)); - __ Uxtl(reg.V8H(), reg.V8B()); - __ B(&done); - if (scratch.IsValid()) { - temps.Release(scratch); // if used, no longer needed - } - // Load 8 direct uncompressed chars. 
- __ Bind(&uncompressed_load); - __ Ldr(reg, - VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch)); - __ Bind(&done); - return; - } - FALLTHROUGH_INTENDED; + __ Ld1h(reg.VnH(), p_reg, + VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + break; case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Ld1b(reg.VnB(), p_reg, + VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + break; case DataType::Type::kInt32: case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Ld1w(reg.VnS(), p_reg, + VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_LE(2u, instruction->GetVectorLength()); - DCHECK_LE(instruction->GetVectorLength(), 16u); - __ Ldr(reg, - VecNeonAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch)); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Ld1d(reg.VnD(), p_reg, + VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1455,26 +1181,39 @@ void LocationsBuilderARM64Sve::VisitVecStore(HVecStore* instruction) { } void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) { + DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); size_t size = DataType::Size(instruction->GetPackedType()); - VRegister reg = VRegisterFrom(locations->InAt(2)); + const ZRegister reg = ZRegisterFrom(locations->InAt(2)); UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; + const PRegisterZ p_reg = LoopPReg().Zeroing(); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ St1b(reg.VnB(), p_reg, + VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + break; case DataType::Type::kUint16: case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ St1h(reg.VnH(), p_reg, + VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + break; case DataType::Type::kInt32: case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ St1w(reg.VnS(), p_reg, + VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_LE(2u, instruction->GetVectorLength()); - DCHECK_LE(instruction->GetVectorLength(), 16u); - __ Str(reg, - VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ St1d(reg.VnD(), p_reg, + VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1483,33 +1222,113 @@ void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) { } void LocationsBuilderARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); - UNREACHABLE(); + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + DCHECK(instruction->InputAt(0)->IsIntConstant()); + 
locations->SetInAt(0, Location::NoLocation()); + locations->SetOut(Location::NoLocation()); } void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); - UNREACHABLE(); + // Instruction is not predicated, see nodes_vector.h + DCHECK(!instruction->IsPredicated()); + const PRegister p_reg = LoopPReg(); + + switch (instruction->GetPackedType()) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + DCHECK_EQ(16u, instruction->GetVectorLength()); + __ Ptrue(p_reg.VnB(), vixl::aarch64::SVE_ALL); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + DCHECK_EQ(8u, instruction->GetVectorLength()); + __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL); + break; + case DataType::Type::kInt32: + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instruction->GetVectorLength()); + __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL); + break; + case DataType::Type::kInt64: + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instruction->GetVectorLength()); + __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } } void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); - UNREACHABLE(); + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + // The instruction doesn't really need a core register as out location; this is a hack + // to workaround absence of support for vector predicates in register allocation. + // + // Semantically, the out location of this instruction and predicate inputs locations of + // its users should be a fixed predicate register (similar to + // Location::RegisterLocation(int reg)). But the register allocator (RA) doesn't support + // SIMD regs (e.g. predicate), so LoopPReg() is used explicitly without exposing it + // to the RA. + // + // To make the RA happy Location::NoLocation() was used for all the vector instructions + // predicate inputs; but for the PredSetOperations (e.g. VecPredWhile) Location::NoLocation() + // can't be used without changes to RA - "ssa_liveness_analysis.cc] Check failed: + // input->IsEmittedAtUseSite()" would fire. + // + // Using a core register as a hack is the easiest way to tackle this problem. The RA will + // block one core register for the loop without actually using it; this should not be + // a performance issue as a SIMD loop operates mainly on SIMD registers. + // + // TODO: Support SIMD types in register allocator. + locations->SetOut(Location::RequiresRegister()); } void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); - UNREACHABLE(); + // Instruction is not predicated, see nodes_vector.h + DCHECK(!instruction->IsPredicated()); + // Current implementation of predicated loop execution only supports kLO condition. 
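// Editor's sketch, not part of the patch: a scalar model of the kLO ("lower",
// unsigned) while-predicate generated here, assuming the architectural WHILELO
// semantics: lane i is active iff (left + i) < right, compared as unsigned.
// This is what lets the predicated loop cover the final partial iteration
// without a scalar cleanup loop. Names below are illustrative only.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<bool> WhileLoModel(uint64_t left, uint64_t right, size_t num_lanes) {
  std::vector<bool> active(num_lanes);
  for (size_t i = 0; i < num_lanes; ++i) {
    active[i] = (left + i) < right;  // unsigned per-lane "lower" comparison
  }
  return active;
}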
+ DCHECK(instruction->GetCondKind() == HVecPredWhile::CondKind::kLO); + Register left = InputRegisterAt(instruction, 0); + Register right = InputRegisterAt(instruction, 1); + + switch (instruction->GetVectorLength()) { + case 16u: + __ Whilelo(LoopPReg().VnB(), left, right); + break; + case 8u: + __ Whilelo(LoopPReg().VnH(), left, right); + break; + case 4u: + __ Whilelo(LoopPReg().VnS(), left, right); + break; + case 2u: + __ Whilelo(LoopPReg().VnD(), left, right); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } } void LocationsBuilderARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); - UNREACHABLE(); + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + locations->SetInAt(0, Location::NoLocation()); + // Result of the operation - a boolean value in a core register. + locations->SetOut(Location::RequiresRegister()); } void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) { - LOG(FATAL) << "No SIMD for " << instruction->GetId(); - UNREACHABLE(); + // Instruction is not predicated, see nodes_vector.h + DCHECK(!instruction->IsPredicated()); + Register reg = OutputRegister(instruction); + // Currently VecPredCondition is only used as part of vectorized loop check condition + // evaluation. + DCHECK(instruction->GetPCondKind() == HVecPredCondition::PCondKind::kNFirst); + __ Cset(reg, pl); } Location InstructionCodeGeneratorARM64Sve::AllocateSIMDScratchLocation( @@ -1547,13 +1366,13 @@ void InstructionCodeGeneratorARM64Sve::MoveToSIMDStackSlot(Location destination, DCHECK(source.IsSIMDStackSlot()); UseScratchRegisterScope temps(GetVIXLAssembler()); if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) { - Register temp = temps.AcquireX(); + const Register temp = temps.AcquireX(); __ Ldr(temp, MemOperand(sp, source.GetStackIndex())); __ Str(temp, MemOperand(sp, destination.GetStackIndex())); __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize)); __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize)); } else { - VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize); + const VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize); __ Ldr(temp, StackOperandFrom(source)); __ Str(temp, StackOperandFrom(destination)); } diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h index d652492c24..72207816e1 100644 --- a/compiler/optimizing/common_arm64.h +++ b/compiler/optimizing/common_arm64.h @@ -102,6 +102,11 @@ inline vixl::aarch64::VRegister VRegisterFrom(Location location) { return vixl::aarch64::VRegister(location.reg()); } +inline vixl::aarch64::ZRegister ZRegisterFrom(Location location) { + DCHECK(location.IsFpuRegister()) << location; + return vixl::aarch64::ZRegister(location.reg()); +} + inline vixl::aarch64::VRegister SRegisterFrom(Location location) { DCHECK(location.IsFpuRegister()) << location; return vixl::aarch64::SRegister(location.reg()); @@ -298,7 +303,7 @@ inline bool Arm64CanEncodeConstantAsImmediate(HConstant* constant, HInstruction* } inline Location ARM64EncodableConstantOrRegister(HInstruction* constant, - HInstruction* instr) { + HInstruction* instr) { if (constant->IsConstant() && Arm64CanEncodeConstantAsImmediate(constant->AsConstant(), instr)) { return Location::ConstantLocation(constant->AsConstant()); diff --git a/compiler/optimizing/instruction_simplifier.cc 
b/compiler/optimizing/instruction_simplifier.cc index 71376178b1..8970372b12 100644 --- a/compiler/optimizing/instruction_simplifier.cc +++ b/compiler/optimizing/instruction_simplifier.cc @@ -289,56 +289,72 @@ bool InstructionSimplifierVisitor::TryCombineVecMultiplyAccumulate(HVecMul* mul) } ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator(); + if (!mul->HasOnlyOneNonEnvironmentUse()) { + return false; + } + HInstruction* binop = mul->GetUses().front().GetUser(); + if (!binop->IsVecAdd() && !binop->IsVecSub()) { + return false; + } - if (mul->HasOnlyOneNonEnvironmentUse()) { - HInstruction* use = mul->GetUses().front().GetUser(); - if (use->IsVecAdd() || use->IsVecSub()) { - // Replace code looking like - // VECMUL tmp, x, y - // VECADD/SUB dst, acc, tmp - // with - // VECMULACC dst, acc, x, y - // Note that we do not want to (unconditionally) perform the merge when the - // multiplication has multiple uses and it can be merged in all of them. - // Multiple uses could happen on the same control-flow path, and we would - // then increase the amount of work. In the future we could try to evaluate - // whether all uses are on different control-flow paths (using dominance and - // reverse-dominance information) and only perform the merge when they are. - HInstruction* accumulator = nullptr; - HVecBinaryOperation* binop = use->AsVecBinaryOperation(); - HInstruction* binop_left = binop->GetLeft(); - HInstruction* binop_right = binop->GetRight(); - // This is always true since the `HVecMul` has only one use (which is checked above). - DCHECK_NE(binop_left, binop_right); - if (binop_right == mul) { - accumulator = binop_left; - } else if (use->IsVecAdd()) { - DCHECK_EQ(binop_left, mul); - accumulator = binop_right; - } - - HInstruction::InstructionKind kind = - use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub; - if (accumulator != nullptr) { - HVecMultiplyAccumulate* mulacc = - new (allocator) HVecMultiplyAccumulate(allocator, - kind, - accumulator, - mul->GetLeft(), - mul->GetRight(), - binop->GetPackedType(), - binop->GetVectorLength(), - binop->GetDexPc()); - - binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); - DCHECK(!mul->HasUses()); - mul->GetBlock()->RemoveInstruction(mul); - return true; - } + // Replace code looking like + // VECMUL tmp, x, y + // VECADD/SUB dst, acc, tmp + // with + // VECMULACC dst, acc, x, y + // Note that we do not want to (unconditionally) perform the merge when the + // multiplication has multiple uses and it can be merged in all of them. + // Multiple uses could happen on the same control-flow path, and we would + // then increase the amount of work. In the future we could try to evaluate + // whether all uses are on different control-flow paths (using dominance and + // reverse-dominance information) and only perform the merge when they are. + HInstruction* accumulator = nullptr; + HVecBinaryOperation* vec_binop = binop->AsVecBinaryOperation(); + HInstruction* binop_left = vec_binop->GetLeft(); + HInstruction* binop_right = vec_binop->GetRight(); + // This is always true since the `HVecMul` has only one use (which is checked above). + DCHECK_NE(binop_left, binop_right); + if (binop_right == mul) { + accumulator = binop_left; + } else { + DCHECK_EQ(binop_left, mul); + // Only addition is commutative. + if (!binop->IsVecAdd()) { + return false; } + accumulator = binop_right; } - return false; + DCHECK(accumulator != nullptr); + HInstruction::InstructionKind kind = + binop->IsVecAdd() ? 
HInstruction::kAdd : HInstruction::kSub; + + bool predicated_simd = vec_binop->IsPredicated(); + if (predicated_simd && !HVecOperation::HaveSamePredicate(vec_binop, mul)) { + return false; + } + + HVecMultiplyAccumulate* mulacc = + new (allocator) HVecMultiplyAccumulate(allocator, + kind, + accumulator, + mul->GetLeft(), + mul->GetRight(), + vec_binop->GetPackedType(), + vec_binop->GetVectorLength(), + vec_binop->GetDexPc()); + + + + vec_binop->GetBlock()->ReplaceAndRemoveInstructionWith(vec_binop, mulacc); + if (predicated_simd) { + mulacc->SetGoverningPredicate(vec_binop->GetGoverningPredicate(), + vec_binop->GetPredicationKind()); + } + + DCHECK(!mul->HasUses()); + mul->GetBlock()->RemoveInstruction(mul); + return true; } void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) { diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index 260bfafeaf..ff0859b456 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -277,14 +277,17 @@ void InstructionSimplifierArm64Visitor::VisitXor(HXor* instruction) { } void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) { - if (!instruction->IsStringCharAt() - && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + // TODO: Extract regular HIntermediateAddress. + if (!instruction->IsPredicated() && !instruction->IsStringCharAt() && + TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { RecordSimplification(); } } void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) { - if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { + // TODO: Extract regular HIntermediateAddress. + if (!instruction->IsPredicated() && + TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) { RecordSimplification(); } } diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc index a776c37f36..76bd8493b2 100644 --- a/compiler/optimizing/loop_analysis.cc +++ b/compiler/optimizing/loop_analysis.cc @@ -17,6 +17,7 @@ #include "loop_analysis.h" #include "base/bit_vector-inl.h" +#include "code_generator.h" #include "induction_var_range.h" namespace art { @@ -76,6 +77,7 @@ int64_t LoopAnalysis::GetLoopTripCount(HLoopInformation* loop_info, // is provided. Enables scalar loop peeling and unrolling with the most conservative heuristics. class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper { public: + explicit ArchDefaultLoopHelper(const CodeGenerator& codegen) : ArchNoOptsLoopHelper(codegen) {} // Scalar loop unrolling parameters and heuristics. // // Maximum possible unrolling factor. @@ -132,6 +134,7 @@ class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper { // peeling and unrolling and supports SIMD loop unrolling. class Arm64LoopHelper : public ArchDefaultLoopHelper { public: + explicit Arm64LoopHelper(const CodeGenerator& codegen) : ArchDefaultLoopHelper(codegen) {} // SIMD loop unrolling parameters and heuristics. // // Maximum possible unrolling factor. @@ -157,6 +160,10 @@ class Arm64LoopHelper : public ArchDefaultLoopHelper { // Don't unroll with insufficient iterations. // TODO: Unroll loops with unknown trip count. DCHECK_NE(vector_length, 0u); + // TODO: Unroll loops in predicated vectorization mode. 
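 +      // Editor's note, not part of the patch: predicated loops mask their own
 +      // remainder iterations via the governing predicate, and unrolling of
 +      // predicated loops is not implemented yet, so the trip-count/peeling
 +      // based unrolling heuristic below is bypassed in this mode.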
+ if (codegen_.SupportsPredicatedSIMD()) { + return LoopAnalysisInfo::kNoUnrollingFactor; + } if (trip_count < (2 * vector_length + max_peel)) { return LoopAnalysisInfo::kNoUnrollingFactor; } @@ -309,6 +316,8 @@ class X86_64LoopHelper : public ArchDefaultLoopHelper { uint32_t GetUnrollingFactor(HLoopInformation* loop_info, HBasicBlock* header) const; public: + explicit X86_64LoopHelper(const CodeGenerator& codegen) : ArchDefaultLoopHelper(codegen) {} + uint32_t GetSIMDUnrollingFactor(HBasicBlock* block, int64_t trip_count, uint32_t max_peel, @@ -398,17 +407,18 @@ uint32_t X86_64LoopHelper::GetUnrollingFactor(HLoopInformation* loop_info, return (1 << unrolling_factor); } -ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(InstructionSet isa, +ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(const CodeGenerator& codegen, ArenaAllocator* allocator) { + InstructionSet isa = codegen.GetInstructionSet(); switch (isa) { case InstructionSet::kArm64: { - return new (allocator) Arm64LoopHelper; + return new (allocator) Arm64LoopHelper(codegen); } case InstructionSet::kX86_64: { - return new (allocator) X86_64LoopHelper; + return new (allocator) X86_64LoopHelper(codegen); } default: { - return new (allocator) ArchDefaultLoopHelper; + return new (allocator) ArchDefaultLoopHelper(codegen); } } } diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h index 57509ee410..fbf1516f64 100644 --- a/compiler/optimizing/loop_analysis.h +++ b/compiler/optimizing/loop_analysis.h @@ -21,6 +21,7 @@ namespace art { +class CodeGenerator; class InductionVarRange; class LoopAnalysis; @@ -132,11 +133,12 @@ class LoopAnalysis : public ValueObject { // class ArchNoOptsLoopHelper : public ArenaObject { public: + explicit ArchNoOptsLoopHelper(const CodeGenerator& codegen) : codegen_(codegen) {} virtual ~ArchNoOptsLoopHelper() {} // Creates an instance of specialised helper for the target or default helper if the target // doesn't support loop peeling and unrolling. - static ArchNoOptsLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator); + static ArchNoOptsLoopHelper* Create(const CodeGenerator& codegen, ArenaAllocator* allocator); // Returns whether the loop is not beneficial for loop peeling/unrolling. // @@ -176,6 +178,9 @@ class ArchNoOptsLoopHelper : public ArenaObject { uint32_t vector_length ATTRIBUTE_UNUSED) const { return LoopAnalysisInfo::kNoUnrollingFactor; } + + protected: + const CodeGenerator& codegen_; }; } // namespace art diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 4c9b01c97e..1210dbe67b 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -473,6 +473,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, iset_(nullptr), reductions_(nullptr), simplified_(false), + predicated_vectorization_mode_(codegen.SupportsPredicatedSIMD()), vector_length_(0), vector_refs_(nullptr), vector_static_peeling_factor_(0), @@ -486,10 +487,7 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_header_(nullptr), vector_body_(nullptr), vector_index_(nullptr), - arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_options_ != nullptr - ? 
compiler_options_->GetInstructionSet() - : InstructionSet::kNone, - global_allocator_)) { + arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) { } bool HLoopOptimization::Run() { @@ -1024,8 +1022,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6 } } // for i - // Find a suitable alignment strategy. - SetAlignmentStrategy(peeling_votes, peeling_candidate); + if (!IsInPredicatedVectorizationMode()) { + // Find a suitable alignment strategy. + SetAlignmentStrategy(peeling_votes, peeling_candidate); + } // Does vectorization seem profitable? if (!IsVectorizationProfitable(trip_count)) { @@ -1052,8 +1052,8 @@ void HLoopOptimization::Vectorize(LoopNode* node, // A cleanup loop is needed, at least, for any unknown trip count or // for a known trip count with remainder iterations after vectorization. - bool needs_cleanup = trip_count == 0 || - ((trip_count - vector_static_peeling_factor_) % chunk) != 0; + bool needs_cleanup = !IsInPredicatedVectorizationMode() && + (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0); // Adjust vector bookkeeping. HPhi* main_phi = nullptr; @@ -1071,11 +1071,13 @@ void HLoopOptimization::Vectorize(LoopNode* node, // ptc = ; HInstruction* ptc = nullptr; if (vector_static_peeling_factor_ != 0) { + DCHECK(!IsInPredicatedVectorizationMode()); // Static loop peeling for SIMD alignment (using the most suitable // fixed peeling factor found during prior alignment analysis). DCHECK(vector_dynamic_peeling_candidate_ == nullptr); ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_); } else if (vector_dynamic_peeling_candidate_ != nullptr) { + DCHECK(!IsInPredicatedVectorizationMode()); // Dynamic loop peeling for SIMD alignment (using the most suitable // candidate found during prior alignment analysis): // rem = offset % ALIGN; // adjusted as #elements @@ -1106,6 +1108,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader); HInstruction* vtc = stc; if (needs_cleanup) { + DCHECK(!IsInPredicatedVectorizationMode()); DCHECK(IsPowerOfTwo(chunk)); HInstruction* diff = stc; if (ptc != nullptr) { @@ -1143,6 +1146,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, // moved around during suspend checks, since all analysis was based on // nothing more than the Android runtime alignment conventions. if (ptc != nullptr) { + DCHECK(!IsInPredicatedVectorizationMode()); vector_mode_ = kSequential; GenerateNewLoop(node, block, @@ -1170,6 +1174,7 @@ void HLoopOptimization::Vectorize(LoopNode* node, // for ( ; i < stc; i += 1) // if (needs_cleanup) { + DCHECK(!IsInPredicatedVectorizationMode() || vector_runtime_test_a_ != nullptr); vector_mode_ = kSequential; GenerateNewLoop(node, block, @@ -1227,9 +1232,35 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, // Generate header and prepare body. 
// for (i = lo; i < hi; i += step) // - HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi); - vector_header_->AddPhi(phi); - vector_header_->AddInstruction(cond); + HInstruction* cond = nullptr; + HInstruction* set_pred = nullptr; + if (IsInPredicatedVectorizationMode()) { + HVecPredWhile* pred_while = + new (global_allocator_) HVecPredWhile(global_allocator_, + phi, + hi, + HVecPredWhile::CondKind::kLO, + DataType::Type::kInt32, + vector_length_, + 0u); + + cond = new (global_allocator_) HVecPredCondition(global_allocator_, + pred_while, + HVecPredCondition::PCondKind::kNFirst, + DataType::Type::kInt32, + vector_length_, + 0u); + + vector_header_->AddPhi(phi); + vector_header_->AddInstruction(pred_while); + vector_header_->AddInstruction(cond); + set_pred = pred_while; + } else { + cond = new (global_allocator_) HAboveOrEqual(phi, hi); + vector_header_->AddPhi(phi); + vector_header_->AddInstruction(cond); + } + vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); vector_index_ = phi; vector_permanent_map_->clear(); // preserved over unrolling @@ -1246,6 +1277,10 @@ void HLoopOptimization::GenerateNewLoop(LoopNode* node, auto i = vector_map_->find(it.Current()); if (i != vector_map_->end() && !i->second->IsInBlock()) { Insert(vector_body_, i->second); + if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) { + HVecOperation* op = i->second->AsVecOperation(); + op->SetMergingGoverningPredicate(set_pred); + } // Deal with instructions that need an environment, such as the scalar intrinsics. if (i->second->NeedsEnvironment()) { i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); @@ -1360,7 +1395,10 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, } else if (instruction->IsArrayGet()) { // Deal with vector restrictions. bool is_string_char_at = instruction->AsArrayGet()->IsStringCharAt(); - if (is_string_char_at && HasVectorRestrictions(restrictions, kNoStringCharAt)) { + + if (is_string_char_at && (HasVectorRestrictions(restrictions, kNoStringCharAt) || + IsInPredicatedVectorizationMode())) { + // TODO: Support CharAt for predicated mode. return false; } // Accept a right-hand-side array base[index] for @@ -1575,32 +1613,73 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict } return false; case InstructionSet::kArm64: - // Allow vectorization for all ARM devices, because Android assumes that - // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD). - switch (type) { - case DataType::Type::kBool: - case DataType::Type::kUint8: - case DataType::Type::kInt8: - *restrictions |= kNoDiv; - return TrySetVectorLength(type, 16); - case DataType::Type::kUint16: - case DataType::Type::kInt16: - *restrictions |= kNoDiv; - return TrySetVectorLength(type, 8); - case DataType::Type::kInt32: - *restrictions |= kNoDiv; - return TrySetVectorLength(type, 4); - case DataType::Type::kInt64: - *restrictions |= kNoDiv | kNoMul; - return TrySetVectorLength(type, 2); - case DataType::Type::kFloat32: - *restrictions |= kNoReduction; - return TrySetVectorLength(type, 4); - case DataType::Type::kFloat64: - *restrictions |= kNoReduction; - return TrySetVectorLength(type, 2); - default: - return false; + if (IsInPredicatedVectorizationMode()) { + // SVE vectorization. 
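 +        // Editor's note, not part of the patch: the restriction bits set below
 +        // exclude idioms the predicated SVE path does not handle yet in this CL
 +        // (division, halving adds, SAD, and dot product for 16-bit inputs),
 +        // plus the pre-existing no-reduction rule for floating-point types;
 +        // the NEON path in the else branch keeps its original restriction set.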
+ CHECK(features->AsArm64InstructionSetFeatures()->HasSVE()); + switch (type) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + *restrictions |= kNoDiv | + kNoSignedHAdd | + kNoUnsignedHAdd | + kNoUnroundedHAdd | + kNoSAD; + return TrySetVectorLength(type, 16); + case DataType::Type::kUint16: + case DataType::Type::kInt16: + *restrictions |= kNoDiv | + kNoSignedHAdd | + kNoUnsignedHAdd | + kNoUnroundedHAdd | + kNoSAD | + kNoDotProd; + return TrySetVectorLength(type, 8); + case DataType::Type::kInt32: + *restrictions |= kNoDiv | kNoSAD; + return TrySetVectorLength(type, 4); + case DataType::Type::kInt64: + *restrictions |= kNoDiv | kNoSAD; + return TrySetVectorLength(type, 2); + case DataType::Type::kFloat32: + *restrictions |= kNoReduction; + return TrySetVectorLength(type, 4); + case DataType::Type::kFloat64: + *restrictions |= kNoReduction; + return TrySetVectorLength(type, 2); + default: + break; + } + return false; + } else { + // Allow vectorization for all ARM devices, because Android assumes that + // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD). + switch (type) { + case DataType::Type::kBool: + case DataType::Type::kUint8: + case DataType::Type::kInt8: + *restrictions |= kNoDiv; + return TrySetVectorLength(type, 16); + case DataType::Type::kUint16: + case DataType::Type::kInt16: + *restrictions |= kNoDiv; + return TrySetVectorLength(type, 8); + case DataType::Type::kInt32: + *restrictions |= kNoDiv; + return TrySetVectorLength(type, 4); + case DataType::Type::kInt64: + *restrictions |= kNoDiv | kNoMul; + return TrySetVectorLength(type, 2); + case DataType::Type::kFloat32: + *restrictions |= kNoReduction; + return TrySetVectorLength(type, 4); + case DataType::Type::kFloat64: + *restrictions |= kNoReduction; + return TrySetVectorLength(type, 2); + default: + break; + } + return false; } case InstructionSet::kX86: case InstructionSet::kX86_64: @@ -1693,6 +1772,15 @@ void HLoopOptimization::GenerateVecInv(HInstruction* org, DataType::Type type) { vector = new (global_allocator_) HVecReplicateScalar(global_allocator_, input, type, vector_length_, kNoDexPc); vector_permanent_map_->Put(org, Insert(vector_preheader_, vector)); + if (IsInPredicatedVectorizationMode()) { + HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_, + graph_->GetIntConstant(1), + type, + vector_length_, + 0u); + vector_preheader_->InsertInstructionBefore(set_pred, vector); + vector->AsVecOperation()->SetMergingGoverningPredicate(set_pred); + } } vector_map_->Put(org, vector); } @@ -1821,6 +1909,15 @@ void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* r vector_length, kNoDexPc)); } + if (IsInPredicatedVectorizationMode()) { + HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_, + graph_->GetIntConstant(1), + type, + vector_length, + 0u); + vector_preheader_->InsertInstructionBefore(set_pred, new_init); + new_init->AsVecOperation()->SetMergingGoverningPredicate(set_pred); + } } else { new_init = ReduceAndExtractIfNeeded(new_init); } @@ -1852,6 +1949,17 @@ HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruct instruction = new (global_allocator_) HVecExtractScalar( global_allocator_, reduce, type, vector_length, 0, kNoDexPc); exit->InsertInstructionAfter(instruction, reduce); + + if (IsInPredicatedVectorizationMode()) { + HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_, + graph_->GetIntConstant(1), + type, + 
vector_length, + 0u); + exit->InsertInstructionBefore(set_pred, reduce); + reduce->AsVecOperation()->SetMergingGoverningPredicate(set_pred); + instruction->AsVecOperation()->SetMergingGoverningPredicate(set_pred); + } } } return instruction; @@ -1991,7 +2099,8 @@ bool HLoopOptimization::VectorizeHalvingAddIdiom(LoopNode* node, return false; } // Deal with vector restrictions. - if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) || + if ((is_unsigned && HasVectorRestrictions(restrictions, kNoUnsignedHAdd)) || + (!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) || (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) { return false; } diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 0c35f294d8..0d76804d9c 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -76,13 +76,14 @@ class HLoopOptimization : public HOptimization { kNoShr = 1 << 3, // no arithmetic shift right kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits kNoSignedHAdd = 1 << 5, // no signed halving add - kNoUnroundedHAdd = 1 << 6, // no unrounded halving add - kNoAbs = 1 << 7, // no absolute value - kNoStringCharAt = 1 << 8, // no StringCharAt - kNoReduction = 1 << 9, // no reduction - kNoSAD = 1 << 10, // no sum of absolute differences (SAD) - kNoWideSAD = 1 << 11, // no sum of absolute differences (SAD) with operand widening - kNoDotProd = 1 << 12, // no dot product + kNoUnsignedHAdd = 1 << 6, // no unsigned halving add + kNoUnroundedHAdd = 1 << 7, // no unrounded halving add + kNoAbs = 1 << 8, // no absolute value + kNoStringCharAt = 1 << 9, // no StringCharAt + kNoReduction = 1 << 10, // no reduction + kNoSAD = 1 << 11, // no sum of absolute differences (SAD) + kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening + kNoDotProd = 1 << 13, // no dot product }; /* @@ -270,6 +271,8 @@ class HLoopOptimization : public HOptimization { void RemoveDeadInstructions(const HInstructionList& list); bool CanRemoveCycle(); // Whether the current 'iset_' is removable. + bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; } + // Compiler options (to query ISA features). const CompilerOptions* compiler_options_; @@ -305,6 +308,9 @@ class HLoopOptimization : public HOptimization { // Flag that tracks if any simplifications have occurred. bool simplified_; + // Whether to use predicated loop vectorization (e.g. for arm64 SVE target). + bool predicated_vectorization_mode_; + // Number of "lanes" for selected packed type. uint32_t vector_length_; diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 9c6b422c87..a2cd86dc33 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -145,6 +145,15 @@ class HVecOperation : public HVariableInputSizeInstruction { return pred_input->AsVecPredSetOperation(); } + // Returns whether two vector operations are predicated by the same vector predicate + // with the same predication type. 
+ static bool HaveSamePredicate(HVecOperation* instr0, HVecOperation* instr1) { + HVecPredSetOperation* instr0_predicate = instr0->GetGoverningPredicate(); + HVecOperation::PredicationKind instr0_predicate_kind = instr0->GetPredicationKind(); + return instr1->GetGoverningPredicate() == instr0_predicate && + instr1->GetPredicationKind() == instr0_predicate_kind; + } + // Returns the number of elements packed in a vector. size_t GetVectorLength() const { return vector_length_; -- cgit v1.2.3-59-g8ed1b
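Editor's appendix (not part of the patch): a minimal scalar sketch of the loop shape
the predicated vectorizer generates, with HVecPredWhile building the governing
predicate in the header, HVecPredCondition (kNFirst) deciding whether another
iteration is needed, and every vector operation in the body masked by that predicate.
All names below are illustrative; "lanes" stand in for SVE vector elements and the
body is a placeholder increment.

#include <cstddef>
#include <cstdint>

// Processes [lo, hi) in chunks of num_lanes, masking the trailing lanes of the
// last chunk instead of running a scalar cleanup loop.
// Assumes hi + num_lanes fits in uint32_t and out/in hold at least hi elements.
void PredicatedLoopModel(int32_t* out, const int32_t* in,
                         uint32_t lo, uint32_t hi, size_t num_lanes) {
  for (uint32_t i = lo; ; i += num_lanes) {
    // Loop header: HVecPredWhile (kLO) builds the per-lane predicate;
    // HVecPredCondition (kNFirst) exits once the first lane is inactive,
    // i.e. once i >= hi.
    if (!(i < hi)) {
      break;
    }
    // Loop body: each vector operation is governed by the predicate, so the
    // trailing lanes of the final chunk are simply masked off.
    for (size_t l = 0; l < num_lanes; ++l) {
      if ((static_cast<uint64_t>(i) + l) < hi) {  // lane l active?
        out[i + l] = in[i + l] + 1;               // stands in for the vectorized body
      }
    }
  }
}

// For example, with lo == 0, hi == 10 and num_lanes == 4, the third chunk runs
// with only lanes 0 and 1 active, so no scalar cleanup loop is required.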