ART: Implement predicated SIMD vectorization.
This CL adds predicated execution support to the
auto-vectorizer and implements an arm64 SVE vector backend.
This version passes all VIXL simulator-runnable tests in
SVE mode with the checker off (the VecOp CHECKs still need to
be adjusted for the extra predicate input) and all tests in
NEON mode.
Test: art SIMD tests on VIXL simulator.
Test: art tests on FVP (steps in test/README.arm_fvp.md)
Change-Id: Ib78bde31a15e6713d875d6668ad4458f5519605f
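
A minimal conceptual sketch (illustrative C++ only, not ART or VIXL code; kVL and
the function name are made up) of what predicated loop execution gives the
auto-vectorizer: a governing predicate marks the active lanes, so the final
partial iteration runs through the same vector loop body instead of a scalar
cleanup loop.

  #include <cstddef>
  #include <cstdint>

  constexpr size_t kVL = 4;  // assumed lane count for the sketch

  void AddArraysPredicated(const int32_t* a, const int32_t* b, int32_t* out, size_t n) {
    for (size_t i = 0; i < n; i += kVL) {
      bool pred[kVL];                      // governing predicate for this iteration
      for (size_t lane = 0; lane < kVL; ++lane) {
        pred[lane] = (i + lane) < n;       // WHILELO-like: active while index < limit
      }
      for (size_t lane = 0; lane < kVL; ++lane) {
        if (pred[lane]) {                  // predicated add: inactive lanes untouched
          out[i + lane] = a[i + lane] + b[i + lane];
        }
      }
    }
  }
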
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index b945be2..f5d7836 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -994,7 +994,7 @@
}
bool CodeGeneratorARM64::ShouldUseSVE() const {
- return kArm64AllowSVE && GetInstructionSetFeatures().HasSVE();
+ return GetInstructionSetFeatures().HasSVE();
}
#define __ GetVIXLAssembler()->
@@ -6908,7 +6908,7 @@
}
}
-MemOperand InstructionCodeGeneratorARM64::VecNeonAddress(
+MemOperand InstructionCodeGeneratorARM64::VecNEONAddress(
HVecMemoryOperation* instruction,
UseScratchRegisterScope* temps_scope,
size_t size,
@@ -6941,6 +6941,31 @@
}
}
+SVEMemOperand InstructionCodeGeneratorARM64::VecSVEAddress(
+ HVecMemoryOperation* instruction,
+ UseScratchRegisterScope* temps_scope,
+ size_t size,
+ bool is_string_char_at,
+ /*out*/ Register* scratch) {
+ LocationSummary* locations = instruction->GetLocations();
+ Register base = InputRegisterAt(instruction, 0);
+ Location index = locations->InAt(1);
+
+ // TODO: Support intermediate address sharing for SVE accesses.
+ DCHECK(!instruction->InputAt(1)->IsIntermediateAddressIndex());
+ DCHECK(!instruction->InputAt(0)->IsIntermediateAddress());
+ DCHECK(!index.IsConstant());
+
+ uint32_t offset = is_string_char_at
+ ? mirror::String::ValueOffset().Uint32Value()
+ : mirror::Array::DataOffset(size).Uint32Value();
+ size_t shift = ComponentSizeShiftWidth(size);
+
+ *scratch = temps_scope->AcquireSameSizeAs(base);
+ __ Add(*scratch, base, offset);
+ return SVEMemOperand(scratch->X(), XRegisterFrom(index), LSL, shift);
+}
+
#undef __
#undef QUICK_ENTRY_POINT
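
A quick sketch of the address arithmetic VecSVEAddress produces (illustrative
C++ only, not ART code): the scratch register holds the base plus the
array-data/string-value offset, and the element index is applied with an LSL by
the element-size shift.

  #include <cstdint>

  // Effective address for an SVE vector access of element size 1 << shift.
  uint64_t SveVectorAccessAddress(uint64_t base,         // object reference
                                  uint64_t data_offset,  // Array::DataOffset or String::ValueOffset
                                  uint64_t index,        // element index (register operand)
                                  unsigned shift) {      // ComponentSizeShiftWidth(size)
    uint64_t scratch = base + data_offset;  // __ Add(*scratch, base, offset)
    return scratch + (index << shift);      // SVEMemOperand(scratch, index, LSL, shift)
  }
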
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index affc640..eb3e954 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -54,9 +54,6 @@
static constexpr int kMaxMacroInstructionSizeInBytes = 15 * vixl::aarch64::kInstructionSize;
static constexpr int kInvokeCodeMarginSizeInBytes = 6 * kMaxMacroInstructionSizeInBytes;
-// SVE is currently not enabled.
-static constexpr bool kArm64AllowSVE = false;
-
static const vixl::aarch64::Register kParameterCoreRegisters[] = {
vixl::aarch64::x1,
vixl::aarch64::x2,
@@ -388,11 +385,19 @@
void GenerateIntRemForPower2Denom(HRem *instruction);
void HandleGoto(HInstruction* got, HBasicBlock* successor);
- // Helper to set up locations for vector memory operations. Returns the memory operand and,
+  // Helpers to set up locations for vector memory operations. Each returns the memory operand and,
// if used, sets the output parameter scratch to a temporary register used in this operand,
// so that the client can release it right after the memory operand use.
// Neon version.
- vixl::aarch64::MemOperand VecNeonAddress(
+ vixl::aarch64::MemOperand VecNEONAddress(
+ HVecMemoryOperation* instruction,
+ // This function may acquire a scratch register.
+ vixl::aarch64::UseScratchRegisterScope* temps_scope,
+ size_t size,
+ bool is_string_char_at,
+ /*out*/ vixl::aarch64::Register* scratch);
+ // SVE version.
+ vixl::aarch64::SVEMemOperand VecSVEAddress(
HVecMemoryOperation* instruction,
// This function may acquire a scratch register.
vixl::aarch64::UseScratchRegisterScope* temps_scope,
@@ -490,6 +495,15 @@
void LoadSIMDRegFromStack(Location destination, Location source) override;
void MoveSIMDRegToSIMDReg(Location destination, Location source) override;
void MoveToSIMDStackSlot(Location destination, Location source) override;
+
+ private:
+  // Returns the default predicate register, which is used as the governing vector
+  // predicate for predicated loop execution.
+  //
+  // TODO: This is a hack to be addressed when the register allocator supports SIMD types.
+ static vixl::aarch64::PRegister LoopPReg() {
+ return vixl::aarch64::p0;
+ }
};
class LocationsBuilderARM64Sve : public LocationsBuilderARM64 {
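
Rough lane-wise semantics of the Merging()/Zeroing() forms used with LoopPReg()
in the SVE visitors below (illustrative C++ only, not ART or VIXL code): a
merging predicated op leaves inactive destination lanes unchanged, while a
zeroing form, such as a predicated load, clears them.

  #include <cstddef>
  #include <cstdint>

  // 'pg' plays the role of LoopPReg() (p0), the governing loop predicate.
  void PredicatedNegMerging(const bool* pg, const int32_t* src, int32_t* dst, size_t lanes) {
    for (size_t i = 0; i < lanes; ++i) {
      if (pg[i]) dst[i] = -src[i];  // active lane: result written
      // inactive lane: dst[i] keeps its previous value (merging)
    }
  }

  void PredicatedLoadZeroing(const bool* pg, const int32_t* mem, int32_t* dst, size_t lanes) {
    for (size_t i = 0; i < lanes; ++i) {
      dst[i] = pg[i] ? mem[i] : 0;  // zeroing: inactive lanes become zero
    }
  }
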
diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc
index 2a4c785..bd64166 100644
--- a/compiler/optimizing/code_generator_vector_arm64_neon.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc
@@ -25,8 +25,6 @@
namespace art {
namespace arm64 {
-using helpers::ARM64EncodableConstantOrRegister;
-using helpers::Arm64CanEncodeConstantAsImmediate;
using helpers::DRegisterFrom;
using helpers::HeapOperand;
using helpers::InputRegisterAt;
@@ -40,6 +38,38 @@
#define __ GetVIXLAssembler()->
+// Returns whether the value of the constant can be directly encoded into the instruction as
+// an immediate.
+inline bool NEONCanEncodeConstantAsImmediate(HConstant* constant, HInstruction* instr) {
+ // TODO: Improve this when IsSIMDConstantEncodable method is implemented in VIXL.
+ if (instr->IsVecReplicateScalar()) {
+ if (constant->IsLongConstant()) {
+ return false;
+ } else if (constant->IsFloatConstant()) {
+ return vixl::aarch64::Assembler::IsImmFP32(constant->AsFloatConstant()->GetValue());
+ } else if (constant->IsDoubleConstant()) {
+ return vixl::aarch64::Assembler::IsImmFP64(constant->AsDoubleConstant()->GetValue());
+ }
+ int64_t value = CodeGenerator::GetInt64ValueOf(constant);
+ return IsUint<8>(value);
+ }
+ return false;
+}
+
+// Returns
+// - constant location - if 'constant' is an actual constant and its value can be
+// encoded into the instruction.
+// - register location otherwise.
+inline Location NEONEncodableConstantOrRegister(HInstruction* constant,
+ HInstruction* instr) {
+ if (constant->IsConstant()
+ && NEONCanEncodeConstantAsImmediate(constant->AsConstant(), instr)) {
+ return Location::ConstantLocation(constant->AsConstant());
+ }
+
+ return Location::RequiresRegister();
+}
+
// Returns whether dot product instructions should be emitted.
static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) {
return codegen_->GetInstructionSetFeatures().HasDotProd();
@@ -56,13 +86,13 @@
case DataType::Type::kInt16:
case DataType::Type::kInt32:
case DataType::Type::kInt64:
- locations->SetInAt(0, ARM64EncodableConstantOrRegister(input, instruction));
+ locations->SetInAt(0, NEONEncodableConstantOrRegister(input, instruction));
locations->SetOut(Location::RequiresFpuRegister());
break;
case DataType::Type::kFloat32:
case DataType::Type::kFloat64:
if (input->IsConstant() &&
- Arm64CanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
+ NEONCanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
locations->SetInAt(0, Location::ConstantLocation(input->AsConstant()));
locations->SetOut(Location::RequiresFpuRegister());
} else {
@@ -1418,7 +1448,7 @@
temps.Release(length); // no longer needed
// Zero extend 8 compressed bytes into 8 chars.
__ Ldr(DRegisterFrom(locations->Out()).V8B(),
- VecNeonAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch));
+ VecNEONAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch));
__ Uxtl(reg.V8H(), reg.V8B());
__ B(&done);
if (scratch.IsValid()) {
@@ -1427,7 +1457,7 @@
// Load 8 direct uncompressed chars.
__ Bind(&uncompressed_load);
__ Ldr(reg,
- VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch));
+ VecNEONAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch));
__ Bind(&done);
return;
}
@@ -1442,7 +1472,7 @@
DCHECK_LE(2u, instruction->GetVectorLength());
DCHECK_LE(instruction->GetVectorLength(), 16u);
__ Ldr(reg,
- VecNeonAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch));
+ VecNEONAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch));
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1474,7 +1504,7 @@
DCHECK_LE(2u, instruction->GetVectorLength());
DCHECK_LE(instruction->GetVectorLength(), 16u);
__ Str(reg,
- VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+ VecNEONAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1483,13 +1513,13 @@
}
void LocationsBuilderARM64Neon::VisitVecPredSetAll(HVecPredSetAll* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
- UNREACHABLE();
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ DCHECK(instruction->InputAt(0)->IsIntConstant());
+ locations->SetInAt(0, Location::NoLocation());
+ locations->SetOut(Location::NoLocation());
}
-void InstructionCodeGeneratorARM64Neon::VisitVecPredSetAll(HVecPredSetAll* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
- UNREACHABLE();
+void InstructionCodeGeneratorARM64Neon::VisitVecPredSetAll(HVecPredSetAll*) {
}
void LocationsBuilderARM64Neon::VisitVecPredWhile(HVecPredWhile* instruction) {
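
The two encodability checks differ in their integer immediate range: the NEON
replicate path above accepts an unsigned 8-bit pattern (IsUint<8>), while the
SVE DUP immediate below is signed 8-bit (IsInt<8>), with the DUP shift form left
as a TODO. A small sketch of that distinction (illustrative C++ only, not ART
code):

  #include <cstdint>

  // Which replicate-scalar integer constants each check treats as encodable.
  bool NeonReplicateImmediateOk(int64_t value) {
    return value >= 0 && value <= 255;     // IsUint<8>(value)
  }

  bool SveReplicateImmediateOk(int64_t value) {
    return value >= -128 && value <= 127;  // IsInt<8>(value); DUP shift form not used yet
  }

  // Example: 200 passes the NEON check but not the SVE one; -1 passes the SVE
  // check but not the NEON one.
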
diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc
index 1761dfc..2254673 100644
--- a/compiler/optimizing/code_generator_vector_arm64_sve.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc
@@ -25,8 +25,6 @@
namespace art {
namespace arm64 {
-using helpers::ARM64EncodableConstantOrRegister;
-using helpers::Arm64CanEncodeConstantAsImmediate;
using helpers::DRegisterFrom;
using helpers::HeapOperand;
using helpers::InputRegisterAt;
@@ -36,13 +34,41 @@
using helpers::QRegisterFrom;
using helpers::StackOperandFrom;
using helpers::VRegisterFrom;
+using helpers::ZRegisterFrom;
using helpers::XRegisterFrom;
#define __ GetVIXLAssembler()->
-// Returns whether dot product instructions should be emitted.
-static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) {
- return codegen_->GetInstructionSetFeatures().HasDotProd();
+// Returns whether the value of the constant can be directly encoded into the instruction as
+// an immediate.
+static bool SVECanEncodeConstantAsImmediate(HConstant* constant, HInstruction* instr) {
+ if (instr->IsVecReplicateScalar()) {
+ if (constant->IsLongConstant()) {
+ return false;
+ } else if (constant->IsFloatConstant()) {
+ return vixl::aarch64::Assembler::IsImmFP32(constant->AsFloatConstant()->GetValue());
+ } else if (constant->IsDoubleConstant()) {
+ return vixl::aarch64::Assembler::IsImmFP64(constant->AsDoubleConstant()->GetValue());
+ }
+ // TODO: Make use of shift part of DUP instruction.
+ int64_t value = CodeGenerator::GetInt64ValueOf(constant);
+ return IsInt<8>(value);
+ }
+
+ return false;
+}
+
+// Returns
+// - constant location - if 'constant' is an actual constant and its value can be
+// encoded into the instruction.
+// - register location otherwise.
+inline Location SVEEncodableConstantOrRegister(HInstruction* constant, HInstruction* instr) {
+ if (constant->IsConstant()
+ && SVECanEncodeConstantAsImmediate(constant->AsConstant(), instr)) {
+ return Location::ConstantLocation(constant->AsConstant());
+ }
+
+ return Location::RequiresRegister();
}
void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
@@ -56,13 +82,13 @@
case DataType::Type::kInt16:
case DataType::Type::kInt32:
case DataType::Type::kInt64:
- locations->SetInAt(0, ARM64EncodableConstantOrRegister(input, instruction));
+ locations->SetInAt(0, SVEEncodableConstantOrRegister(input, instruction));
locations->SetOut(Location::RequiresFpuRegister());
break;
case DataType::Type::kFloat32:
case DataType::Type::kFloat64:
if (input->IsConstant() &&
- Arm64CanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
+ SVECanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
locations->SetInAt(0, Location::ConstantLocation(input->AsConstant()));
locations->SetOut(Location::RequiresFpuRegister());
} else {
@@ -77,59 +103,60 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
Location src_loc = locations->InAt(0);
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister dst = ZRegisterFrom(locations->Out());
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
if (src_loc.IsConstant()) {
- __ Movi(dst.V16B(), Int64FromLocation(src_loc));
+ __ Dup(dst.VnB(), Int64FromLocation(src_loc));
} else {
- __ Dup(dst.V16B(), InputRegisterAt(instruction, 0));
+ __ Dup(dst.VnB(), InputRegisterAt(instruction, 0));
}
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
if (src_loc.IsConstant()) {
- __ Movi(dst.V8H(), Int64FromLocation(src_loc));
+ __ Dup(dst.VnH(), Int64FromLocation(src_loc));
} else {
- __ Dup(dst.V8H(), InputRegisterAt(instruction, 0));
+ __ Dup(dst.VnH(), InputRegisterAt(instruction, 0));
}
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
if (src_loc.IsConstant()) {
- __ Movi(dst.V4S(), Int64FromLocation(src_loc));
+ __ Dup(dst.VnS(), Int64FromLocation(src_loc));
} else {
- __ Dup(dst.V4S(), InputRegisterAt(instruction, 0));
+ __ Dup(dst.VnS(), InputRegisterAt(instruction, 0));
}
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
if (src_loc.IsConstant()) {
- __ Movi(dst.V2D(), Int64FromLocation(src_loc));
+ __ Dup(dst.VnD(), Int64FromLocation(src_loc));
} else {
- __ Dup(dst.V2D(), XRegisterFrom(src_loc));
+ __ Dup(dst.VnD(), XRegisterFrom(src_loc));
}
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
if (src_loc.IsConstant()) {
- __ Fmov(dst.V4S(), src_loc.GetConstant()->AsFloatConstant()->GetValue());
+ __ Fdup(dst.VnS(), src_loc.GetConstant()->AsFloatConstant()->GetValue());
} else {
- __ Dup(dst.V4S(), VRegisterFrom(src_loc).V4S(), 0);
+ __ Dup(dst.VnS(), ZRegisterFrom(src_loc).VnS(), 0);
}
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
if (src_loc.IsConstant()) {
- __ Fmov(dst.V2D(), src_loc.GetConstant()->AsDoubleConstant()->GetValue());
+ __ Fdup(dst.VnD(), src_loc.GetConstant()->AsDoubleConstant()->GetValue());
} else {
- __ Dup(dst.V2D(), VRegisterFrom(src_loc).V2D(), 0);
+ __ Dup(dst.VnD(), ZRegisterFrom(src_loc).VnD(), 0);
}
break;
default:
@@ -163,8 +190,9 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecExtractScalar(HVecExtractScalar* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister src = VRegisterFrom(locations->InAt(0));
+ const VRegister src = VRegisterFrom(locations->InAt(0));
switch (instruction->GetPackedType()) {
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
@@ -218,32 +246,31 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister src = VRegisterFrom(locations->InAt(0));
- VRegister dst = DRegisterFrom(locations->Out());
+ const ZRegister src = ZRegisterFrom(locations->InAt(0));
+ const VRegister dst = DRegisterFrom(locations->Out());
+ const PRegister p_reg = LoopPReg();
switch (instruction->GetPackedType()) {
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
switch (instruction->GetReductionKind()) {
case HVecReduce::kSum:
- __ Addv(dst.S(), src.V4S());
+ __ Saddv(dst.S(), p_reg, src.VnS());
break;
- case HVecReduce::kMin:
- __ Sminv(dst.S(), src.V4S());
- break;
- case HVecReduce::kMax:
- __ Smaxv(dst.S(), src.V4S());
- break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD instruction";
+ UNREACHABLE();
}
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
switch (instruction->GetReductionKind()) {
case HVecReduce::kSum:
- __ Addp(dst.D(), src.V2D());
+ __ Uaddv(dst.D(), p_reg, src.VnD());
break;
default:
- LOG(FATAL) << "Unsupported SIMD min/max";
+ LOG(FATAL) << "Unsupported SIMD instruction";
UNREACHABLE();
}
break;
@@ -258,14 +285,16 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister src = VRegisterFrom(locations->InAt(0));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister src = ZRegisterFrom(locations->InAt(0));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
DataType::Type from = instruction->GetInputType();
DataType::Type to = instruction->GetResultType();
if (from == DataType::Type::kInt32 && to == DataType::Type::kFloat32) {
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Scvtf(dst.V4S(), src.V4S());
+ __ Scvtf(dst.VnS(), p_reg, src.VnS());
} else {
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
}
@@ -276,35 +305,37 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister src = VRegisterFrom(locations->InAt(0));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister src = ZRegisterFrom(locations->InAt(0));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Neg(dst.V16B(), src.V16B());
+ __ Neg(dst.VnB(), p_reg, src.VnB());
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Neg(dst.V8H(), src.V8H());
+ __ Neg(dst.VnH(), p_reg, src.VnH());
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Neg(dst.V4S(), src.V4S());
+ __ Neg(dst.VnS(), p_reg, src.VnS());
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Neg(dst.V2D(), src.V2D());
+ __ Neg(dst.VnD(), p_reg, src.VnD());
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Fneg(dst.V4S(), src.V4S());
+ __ Fneg(dst.VnS(), p_reg, src.VnS());
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Fneg(dst.V2D(), src.V2D());
+ __ Fneg(dst.VnD(), p_reg, src.VnD());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -317,33 +348,35 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister src = VRegisterFrom(locations->InAt(0));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister src = ZRegisterFrom(locations->InAt(0));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Abs(dst.V16B(), src.V16B());
+ __ Abs(dst.VnB(), p_reg, src.VnB());
break;
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Abs(dst.V8H(), src.V8H());
+ __ Abs(dst.VnH(), p_reg, src.VnH());
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Abs(dst.V4S(), src.V4S());
+ __ Abs(dst.VnS(), p_reg, src.VnS());
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Abs(dst.V2D(), src.V2D());
+ __ Abs(dst.VnD(), p_reg, src.VnD());
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Fabs(dst.V4S(), src.V4S());
+ __ Fabs(dst.VnS(), p_reg, src.VnS());
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Fabs(dst.V2D(), src.V2D());
+ __ Fabs(dst.VnD(), p_reg, src.VnD());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -356,22 +389,30 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister src = VRegisterFrom(locations->InAt(0));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister src = ZRegisterFrom(locations->InAt(0));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kBool: // special case boolean-not
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Movi(dst.V16B(), 1);
- __ Eor(dst.V16B(), dst.V16B(), src.V16B());
+ __ Dup(dst.VnB(), 1);
+ __ Eor(dst.VnB(), p_reg, dst.VnB(), src.VnB());
break;
case DataType::Type::kUint8:
case DataType::Type::kInt8:
+ __ Not(dst.VnB(), p_reg, src.VnB());
+ break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
+ __ Not(dst.VnH(), p_reg, src.VnH());
+ break;
case DataType::Type::kInt32:
+ __ Not(dst.VnS(), p_reg, src.VnS());
+ break;
case DataType::Type::kInt64:
- __ Not(dst.V16B(), src.V16B()); // lanes do not matter
+ __ Not(dst.VnD(), p_reg, src.VnD());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -394,7 +435,7 @@
case DataType::Type::kFloat64:
locations->SetInAt(0, Location::RequiresFpuRegister());
locations->SetInAt(1, Location::RequiresFpuRegister());
- locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+ locations->SetOut(Location::SameAsFirstInput());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -407,36 +448,38 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Add(dst.V16B(), lhs.V16B(), rhs.V16B());
+ __ Add(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Add(dst.V8H(), lhs.V8H(), rhs.V8H());
+ __ Add(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Add(dst.V4S(), lhs.V4S(), rhs.V4S());
+ __ Add(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Add(dst.V2D(), lhs.V2D(), rhs.V2D());
+ __ Add(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Fadd(dst.V4S(), lhs.V4S(), rhs.V4S());
+ __ Fadd(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Fadd(dst.V2D(), lhs.V2D(), rhs.V2D());
+ __ Fadd(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -445,75 +488,23 @@
}
void LocationsBuilderARM64Sve::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void InstructionCodeGeneratorARM64Sve::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Uqadd(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Sqadd(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kUint16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Uqadd(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Sqadd(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void LocationsBuilderARM64Sve::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void InstructionCodeGeneratorARM64Sve::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- instruction->IsRounded()
- ? __ Urhadd(dst.V16B(), lhs.V16B(), rhs.V16B())
- : __ Uhadd(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- instruction->IsRounded()
- ? __ Srhadd(dst.V16B(), lhs.V16B(), rhs.V16B())
- : __ Shadd(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kUint16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- instruction->IsRounded()
- ? __ Urhadd(dst.V8H(), lhs.V8H(), rhs.V8H())
- : __ Uhadd(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- instruction->IsRounded()
- ? __ Srhadd(dst.V8H(), lhs.V8H(), rhs.V8H())
- : __ Shadd(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void LocationsBuilderARM64Sve::VisitVecSub(HVecSub* instruction) {
@@ -521,36 +512,38 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Sub(dst.V16B(), lhs.V16B(), rhs.V16B());
+ __ Sub(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Sub(dst.V8H(), lhs.V8H(), rhs.V8H());
+ __ Sub(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Sub(dst.V4S(), lhs.V4S(), rhs.V4S());
+ __ Sub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Sub(dst.V2D(), lhs.V2D(), rhs.V2D());
+ __ Sub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Fsub(dst.V4S(), lhs.V4S(), rhs.V4S());
+ __ Fsub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Fsub(dst.V2D(), lhs.V2D(), rhs.V2D());
+ __ Fsub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -559,35 +552,13 @@
}
void LocationsBuilderARM64Sve::VisitVecSaturationSub(HVecSaturationSub* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void InstructionCodeGeneratorARM64Sve::VisitVecSaturationSub(HVecSaturationSub* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Uqsub(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Sqsub(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kUint16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Uqsub(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Sqsub(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void LocationsBuilderARM64Sve::VisitVecMul(HVecMul* instruction) {
@@ -595,32 +566,38 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Mul(dst.V16B(), lhs.V16B(), rhs.V16B());
+ __ Mul(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Mul(dst.V8H(), lhs.V8H(), rhs.V8H());
+ __ Mul(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Mul(dst.V4S(), lhs.V4S(), rhs.V4S());
+ __ Mul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
+ break;
+ case DataType::Type::kInt64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ Mul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
break;
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Fmul(dst.V4S(), lhs.V4S(), rhs.V4S());
+ __ Fmul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation);
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Fmul(dst.V2D(), lhs.V2D(), rhs.V2D());
+ __ Fmul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -633,18 +610,22 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
+
+ // Note: VIXL guarantees StrictNaNPropagation for Fdiv.
switch (instruction->GetPackedType()) {
case DataType::Type::kFloat32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Fdiv(dst.V4S(), lhs.V4S(), rhs.V4S());
+ __ Fdiv(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
break;
case DataType::Type::kFloat64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Fdiv(dst.V2D(), lhs.V2D(), rhs.V2D());
+ __ Fdiv(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -653,99 +634,23 @@
}
void LocationsBuilderARM64Sve::VisitVecMin(HVecMin* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void InstructionCodeGeneratorARM64Sve::VisitVecMin(HVecMin* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Umin(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Smin(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kUint16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Umin(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Smin(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- case DataType::Type::kUint32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Umin(dst.V4S(), lhs.V4S(), rhs.V4S());
- break;
- case DataType::Type::kInt32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Smin(dst.V4S(), lhs.V4S(), rhs.V4S());
- break;
- case DataType::Type::kFloat32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Fmin(dst.V4S(), lhs.V4S(), rhs.V4S());
- break;
- case DataType::Type::kFloat64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Fmin(dst.V2D(), lhs.V2D(), rhs.V2D());
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void LocationsBuilderARM64Sve::VisitVecMax(HVecMax* instruction) {
- CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void InstructionCodeGeneratorARM64Sve::VisitVecMax(HVecMax* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kUint8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Umax(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Smax(dst.V16B(), lhs.V16B(), rhs.V16B());
- break;
- case DataType::Type::kUint16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Umax(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Smax(dst.V8H(), lhs.V8H(), rhs.V8H());
- break;
- case DataType::Type::kUint32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Umax(dst.V4S(), lhs.V4S(), rhs.V4S());
- break;
- case DataType::Type::kInt32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Smax(dst.V4S(), lhs.V4S(), rhs.V4S());
- break;
- case DataType::Type::kFloat32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Fmax(dst.V4S(), lhs.V4S(), rhs.V4S());
- break;
- case DataType::Type::kFloat64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Fmax(dst.V2D(), lhs.V2D(), rhs.V2D());
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void LocationsBuilderARM64Sve::VisitVecAnd(HVecAnd* instruction) {
@@ -754,21 +659,29 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
+ __ And(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
+ break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
+ __ And(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
+ break;
case DataType::Type::kInt32:
- case DataType::Type::kInt64:
case DataType::Type::kFloat32:
+ __ And(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
+ break;
+ case DataType::Type::kInt64:
case DataType::Type::kFloat64:
- __ And(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter
+ __ And(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -790,21 +703,29 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
+ __ Orr(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
+ break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
+ __ Orr(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
+ break;
case DataType::Type::kInt32:
- case DataType::Type::kInt64:
case DataType::Type::kFloat32:
+ __ Orr(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
+ break;
+ case DataType::Type::kInt64:
case DataType::Type::kFloat64:
- __ Orr(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter
+ __ Orr(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -817,21 +738,29 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister rhs = VRegisterFrom(locations->InAt(1));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
+ __ Eor(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
+ break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
+ __ Eor(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
+ break;
case DataType::Type::kInt32:
- case DataType::Type::kInt64:
case DataType::Type::kFloat32:
+ __ Eor(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
+ break;
+ case DataType::Type::kInt64:
case DataType::Type::kFloat64:
- __ Eor(dst.V16B(), lhs.V16B(), rhs.V16B()); // lanes do not matter
+ __ Eor(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -864,28 +793,30 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Shl(dst.V16B(), lhs.V16B(), value);
+ __ Lsl(dst.VnB(), p_reg, lhs.VnB(), value);
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Shl(dst.V8H(), lhs.V8H(), value);
+ __ Lsl(dst.VnH(), p_reg, lhs.VnH(), value);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Shl(dst.V4S(), lhs.V4S(), value);
+ __ Lsl(dst.VnS(), p_reg, lhs.VnS(), value);
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Shl(dst.V2D(), lhs.V2D(), value);
+ __ Lsl(dst.VnD(), p_reg, lhs.VnD(), value);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -898,28 +829,30 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Sshr(dst.V16B(), lhs.V16B(), value);
+ __ Asr(dst.VnB(), p_reg, lhs.VnB(), value);
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Sshr(dst.V8H(), lhs.V8H(), value);
+ __ Asr(dst.VnH(), p_reg, lhs.VnH(), value);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Sshr(dst.V4S(), lhs.V4S(), value);
+ __ Asr(dst.VnS(), p_reg, lhs.VnS(), value);
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Sshr(dst.V2D(), lhs.V2D(), value);
+ __ Asr(dst.VnD(), p_reg, lhs.VnD(), value);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -932,28 +865,30 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister lhs = VRegisterFrom(locations->InAt(0));
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+ const ZRegister dst = ZRegisterFrom(locations->Out());
+ const PRegisterM p_reg = LoopPReg().Merging();
int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
switch (instruction->GetPackedType()) {
case DataType::Type::kUint8:
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
- __ Ushr(dst.V16B(), lhs.V16B(), value);
+ __ Lsr(dst.VnB(), p_reg, lhs.VnB(), value);
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Ushr(dst.V8H(), lhs.V8H(), value);
+ __ Lsr(dst.VnH(), p_reg, lhs.VnH(), value);
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Ushr(dst.V4S(), lhs.V4S(), value);
+ __ Lsr(dst.VnS(), p_reg, lhs.VnS(), value);
break;
case DataType::Type::kInt64:
DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Ushr(dst.V2D(), lhs.V2D(), value);
+ __ Lsr(dst.VnD(), p_reg, lhs.VnD(), value);
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -964,7 +899,7 @@
void LocationsBuilderARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) {
LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
- DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented
+  DCHECK_EQ(2u, instruction->InputCount());  // only one real input currently implemented, plus the predicate input.
HInstruction* input = instruction->InputAt(0);
bool is_zero = IsZeroBitPattern(input);
@@ -994,14 +929,16 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister dst = VRegisterFrom(locations->Out());
+ const ZRegister z_dst = ZRegisterFrom(locations->Out());
- DCHECK_EQ(1u, instruction->InputCount()); // only one input currently implemented
+  DCHECK_EQ(2u, instruction->InputCount());  // only one real input currently implemented, plus the predicate input.
// Zero out all other elements first.
- __ Movi(dst.V16B(), 0);
+ __ Dup(z_dst.VnB(), 0);
+ const VRegister dst = VRegisterFrom(locations->Out());
// Shorthand for any type of zero.
if (IsZeroBitPattern(instruction->InputAt(0))) {
return;
@@ -1062,11 +999,14 @@
// Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
// 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
// However vector MultiplyAccumulate instruction is not affected.
-void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate(
+ HVecMultiplyAccumulate* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
- VRegister acc = VRegisterFrom(locations->InAt(0));
- VRegister left = VRegisterFrom(locations->InAt(1));
- VRegister right = VRegisterFrom(locations->InAt(2));
+ const ZRegister acc = ZRegisterFrom(locations->InAt(0));
+ const ZRegister left = ZRegisterFrom(locations->InAt(1));
+ const ZRegister right = ZRegisterFrom(locations->InAt(2));
+ const PRegisterM p_reg = LoopPReg().Merging();
DCHECK(locations->InAt(0).Equals(locations->Out()));
@@ -1075,26 +1015,26 @@
case DataType::Type::kInt8:
DCHECK_EQ(16u, instruction->GetVectorLength());
if (instruction->GetOpKind() == HInstruction::kAdd) {
- __ Mla(acc.V16B(), left.V16B(), right.V16B());
+ __ Mla(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB());
} else {
- __ Mls(acc.V16B(), left.V16B(), right.V16B());
+ __ Mls(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB());
}
break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
DCHECK_EQ(8u, instruction->GetVectorLength());
if (instruction->GetOpKind() == HInstruction::kAdd) {
- __ Mla(acc.V8H(), left.V8H(), right.V8H());
+        __ Mla(acc.VnH(), p_reg, acc.VnH(), left.VnH(), right.VnH());
} else {
- __ Mls(acc.V8H(), left.V8H(), right.V8H());
+        __ Mls(acc.VnH(), p_reg, acc.VnH(), left.VnH(), right.VnH());
}
break;
case DataType::Type::kInt32:
DCHECK_EQ(4u, instruction->GetVectorLength());
if (instruction->GetOpKind() == HInstruction::kAdd) {
- __ Mla(acc.V4S(), left.V4S(), right.V4S());
+        __ Mla(acc.VnS(), p_reg, acc.VnS(), left.VnS(), right.VnS());
} else {
- __ Mls(acc.V4S(), left.V4S(), right.V4S());
+        __ Mls(acc.VnS(), p_reg, acc.VnS(), left.VnS(), right.VnS());
}
break;
default:
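
The predicated Mla/Mls forms above take the accumulator twice because the SVE
encoding is destructive on the accumulator; all operands use one lane size, and
inactive lanes keep the old accumulator value. Lane-wise the semantics are
roughly (illustrative C++ only, not ART code):

  #include <cstddef>
  #include <cstdint>

  void PredicatedMla(const bool* pg, int32_t* acc,
                     const int32_t* left, const int32_t* right, size_t lanes) {
    for (size_t i = 0; i < lanes; ++i) {
      if (pg[i]) acc[i] = acc[i] + left[i] * right[i];  // Mla: merging multiply-accumulate
    }
  }

  void PredicatedMls(const bool* pg, int32_t* acc,
                     const int32_t* left, const int32_t* right, size_t lanes) {
    for (size_t i = 0; i < lanes; ++i) {
      if (pg[i]) acc[i] = acc[i] - left[i] * right[i];  // Mls: merging multiply-subtract
    }
  }
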
@@ -1104,185 +1044,13 @@
}
void LocationsBuilderARM64Sve::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
- CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
- // Some conversions require temporary registers.
- LocationSummary* locations = instruction->GetLocations();
- HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
- HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
- DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
- HVecOperation::ToSignedType(b->GetPackedType()));
- switch (a->GetPackedType()) {
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- switch (instruction->GetPackedType()) {
- case DataType::Type::kInt64:
- locations->AddTemp(Location::RequiresFpuRegister());
- locations->AddTemp(Location::RequiresFpuRegister());
- FALLTHROUGH_INTENDED;
- case DataType::Type::kInt32:
- locations->AddTemp(Location::RequiresFpuRegister());
- locations->AddTemp(Location::RequiresFpuRegister());
- break;
- default:
- break;
- }
- break;
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- if (instruction->GetPackedType() == DataType::Type::kInt64) {
- locations->AddTemp(Location::RequiresFpuRegister());
- locations->AddTemp(Location::RequiresFpuRegister());
- }
- break;
- case DataType::Type::kInt32:
- case DataType::Type::kInt64:
- if (instruction->GetPackedType() == a->GetPackedType()) {
- locations->AddTemp(Location::RequiresFpuRegister());
- }
- break;
- default:
- break;
- }
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void InstructionCodeGeneratorARM64Sve::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
- LocationSummary* locations = instruction->GetLocations();
- VRegister acc = VRegisterFrom(locations->InAt(0));
- VRegister left = VRegisterFrom(locations->InAt(1));
- VRegister right = VRegisterFrom(locations->InAt(2));
-
- DCHECK(locations->InAt(0).Equals(locations->Out()));
-
- // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S).
- HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
- HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
- DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
- HVecOperation::ToSignedType(b->GetPackedType()));
- switch (a->GetPackedType()) {
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- DCHECK_EQ(16u, a->GetVectorLength());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, instruction->GetVectorLength());
- __ Sabal(acc.V8H(), left.V8B(), right.V8B());
- __ Sabal2(acc.V8H(), left.V16B(), right.V16B());
- break;
- case DataType::Type::kInt32: {
- DCHECK_EQ(4u, instruction->GetVectorLength());
- VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
- VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
- __ Sxtl(tmp1.V8H(), left.V8B());
- __ Sxtl(tmp2.V8H(), right.V8B());
- __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
- __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
- __ Sxtl2(tmp1.V8H(), left.V16B());
- __ Sxtl2(tmp2.V8H(), right.V16B());
- __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
- __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
- break;
- }
- case DataType::Type::kInt64: {
- DCHECK_EQ(2u, instruction->GetVectorLength());
- VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
- VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
- VRegister tmp3 = VRegisterFrom(locations->GetTemp(2));
- VRegister tmp4 = VRegisterFrom(locations->GetTemp(3));
- __ Sxtl(tmp1.V8H(), left.V8B());
- __ Sxtl(tmp2.V8H(), right.V8B());
- __ Sxtl(tmp3.V4S(), tmp1.V4H());
- __ Sxtl(tmp4.V4S(), tmp2.V4H());
- __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
- __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
- __ Sxtl2(tmp3.V4S(), tmp1.V8H());
- __ Sxtl2(tmp4.V4S(), tmp2.V8H());
- __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
- __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
- __ Sxtl2(tmp1.V8H(), left.V16B());
- __ Sxtl2(tmp2.V8H(), right.V16B());
- __ Sxtl(tmp3.V4S(), tmp1.V4H());
- __ Sxtl(tmp4.V4S(), tmp2.V4H());
- __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
- __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
- __ Sxtl2(tmp3.V4S(), tmp1.V8H());
- __ Sxtl2(tmp4.V4S(), tmp2.V8H());
- __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
- __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
- break;
- }
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
- break;
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- DCHECK_EQ(8u, a->GetVectorLength());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kInt32:
- DCHECK_EQ(4u, instruction->GetVectorLength());
- __ Sabal(acc.V4S(), left.V4H(), right.V4H());
- __ Sabal2(acc.V4S(), left.V8H(), right.V8H());
- break;
- case DataType::Type::kInt64: {
- DCHECK_EQ(2u, instruction->GetVectorLength());
- VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
- VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
- __ Sxtl(tmp1.V4S(), left.V4H());
- __ Sxtl(tmp2.V4S(), right.V4H());
- __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
- __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
- __ Sxtl2(tmp1.V4S(), left.V8H());
- __ Sxtl2(tmp2.V4S(), right.V8H());
- __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
- __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
- break;
- }
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
- break;
- case DataType::Type::kInt32:
- DCHECK_EQ(4u, a->GetVectorLength());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kInt32: {
- DCHECK_EQ(4u, instruction->GetVectorLength());
- VRegister tmp = VRegisterFrom(locations->GetTemp(0));
- __ Sub(tmp.V4S(), left.V4S(), right.V4S());
- __ Abs(tmp.V4S(), tmp.V4S());
- __ Add(acc.V4S(), acc.V4S(), tmp.V4S());
- break;
- }
- case DataType::Type::kInt64:
- DCHECK_EQ(2u, instruction->GetVectorLength());
- __ Sabal(acc.V2D(), left.V2S(), right.V2S());
- __ Sabal2(acc.V2D(), left.V4S(), right.V4S());
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
- break;
- case DataType::Type::kInt64:
- DCHECK_EQ(2u, a->GetVectorLength());
- switch (instruction->GetPackedType()) {
- case DataType::Type::kInt64: {
- DCHECK_EQ(2u, instruction->GetVectorLength());
- VRegister tmp = VRegisterFrom(locations->GetTemp(0));
- __ Sub(tmp.V2D(), left.V2D(), right.V2D());
- __ Abs(tmp.V2D(), tmp.V2D());
- __ Add(acc.V2D(), acc.V2D(), tmp.V2D());
- break;
- }
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- UNREACHABLE();
- }
- break;
- default:
- LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
- }
+ LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+ UNREACHABLE();
}
void LocationsBuilderARM64Sve::VisitVecDotProd(HVecDotProd* instruction) {
@@ -1293,19 +1061,17 @@
locations->SetInAt(2, Location::RequiresFpuRegister());
locations->SetOut(Location::SameAsFirstInput());
- // For Int8 and Uint8 general case we need a temp register.
- if ((DataType::Size(instruction->InputAt(1)->AsVecOperation()->GetPackedType()) == 1) &&
- !ShouldEmitDotProductInstructions(codegen_)) {
- locations->AddTemp(Location::RequiresFpuRegister());
- }
+ locations->AddTemp(Location::RequiresFpuRegister());
}
void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
DCHECK(locations->InAt(0).Equals(locations->Out()));
- VRegister acc = VRegisterFrom(locations->InAt(0));
- VRegister left = VRegisterFrom(locations->InAt(1));
- VRegister right = VRegisterFrom(locations->InAt(2));
+ const ZRegister acc = ZRegisterFrom(locations->InAt(0));
+ const ZRegister left = ZRegisterFrom(locations->InAt(1));
+ const ZRegister right = ZRegisterFrom(locations->InAt(2));
+ const PRegisterM p_reg = LoopPReg().Merging();
HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
@@ -1317,45 +1083,20 @@
switch (inputs_data_size) {
case 1u: {
DCHECK_EQ(16u, a->GetVectorLength());
+ UseScratchRegisterScope temps(GetVIXLAssembler());
+ const ZRegister tmp0 = temps.AcquireZ();
+ const ZRegister tmp1 = ZRegisterFrom(locations->GetTemp(0));
+
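+ // Udot/Sdot have no predicated form, so first zero the inactive lanes of both inputs;
+ // those lanes then contribute nothing to the accumulator.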
+ __ Dup(tmp1.VnB(), 0u);
+ __ Sel(tmp0.VnB(), p_reg, left.VnB(), tmp1.VnB());
+ __ Sel(tmp1.VnB(), p_reg, right.VnB(), tmp1.VnB());
if (instruction->IsZeroExtending()) {
- if (ShouldEmitDotProductInstructions(codegen_)) {
- __ Udot(acc.V4S(), left.V16B(), right.V16B());
- } else {
- VRegister tmp = VRegisterFrom(locations->GetTemp(0));
- __ Umull(tmp.V8H(), left.V8B(), right.V8B());
- __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());
- __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());
-
- __ Umull2(tmp.V8H(), left.V16B(), right.V16B());
- __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());
- __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());
- }
+ __ Udot(acc.VnS(), acc.VnS(), tmp0.VnB(), tmp1.VnB());
} else {
- if (ShouldEmitDotProductInstructions(codegen_)) {
- __ Sdot(acc.V4S(), left.V16B(), right.V16B());
- } else {
- VRegister tmp = VRegisterFrom(locations->GetTemp(0));
- __ Smull(tmp.V8H(), left.V8B(), right.V8B());
- __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());
- __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());
-
- __ Smull2(tmp.V8H(), left.V16B(), right.V16B());
- __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());
- __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());
- }
+ __ Sdot(acc.VnS(), acc.VnS(), tmp0.VnB(), tmp1.VnB());
}
break;
}
- case 2u:
- DCHECK_EQ(8u, a->GetVectorLength());
- if (instruction->IsZeroExtending()) {
- __ Umlal(acc.V4S(), left.V4H(), right.V4H());
- __ Umlal2(acc.V4S(), left.V8H(), right.V8H());
- } else {
- __ Smlal(acc.V4S(), left.V4H(), right.V4H());
- __ Smlal2(acc.V4S(), left.V8H(), right.V8H());
- }
- break;
default:
LOG(FATAL) << "Unsupported SIMD type size: " << inputs_data_size;
}
@@ -1395,54 +1136,39 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
size_t size = DataType::Size(instruction->GetPackedType());
- VRegister reg = VRegisterFrom(locations->Out());
+ const ZRegister reg = ZRegisterFrom(locations->Out());
UseScratchRegisterScope temps(GetVIXLAssembler());
Register scratch;
+ const PRegisterZ p_reg = LoopPReg().Zeroing();
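+ // The governing predicate is used in its zeroing form: inactive lanes of the
+ // destination are cleared by the predicated loads below.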
switch (instruction->GetPackedType()) {
case DataType::Type::kInt16: // (short) s.charAt(.) can yield HVecLoad/Int16/StringCharAt.
case DataType::Type::kUint16:
DCHECK_EQ(8u, instruction->GetVectorLength());
- // Special handling of compressed/uncompressed string load.
- if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
- vixl::aarch64::Label uncompressed_load, done;
- // Test compression bit.
- static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
- "Expecting 0=compressed, 1=uncompressed");
- uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
- Register length = temps.AcquireW();
- __ Ldr(length, HeapOperand(InputRegisterAt(instruction, 0), count_offset));
- __ Tbnz(length.W(), 0, &uncompressed_load);
- temps.Release(length); // no longer needed
- // Zero extend 8 compressed bytes into 8 chars.
- __ Ldr(DRegisterFrom(locations->Out()).V8B(),
- VecNeonAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch));
- __ Uxtl(reg.V8H(), reg.V8B());
- __ B(&done);
- if (scratch.IsValid()) {
- temps.Release(scratch); // if used, no longer needed
- }
- // Load 8 direct uncompressed chars.
- __ Bind(&uncompressed_load);
- __ Ldr(reg,
- VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch));
- __ Bind(&done);
- return;
- }
- FALLTHROUGH_INTENDED;
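+ // String.charAt is not vectorized in predicated mode (see loop_optimization.cc), so
+ // no compressed-string handling is needed and is_string_char_at is always false here.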
+ __ Ld1h(reg.VnH(), p_reg,
+ VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+ break;
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ Ld1b(reg.VnB(), p_reg,
+ VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+ break;
case DataType::Type::kInt32:
case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ Ld1w(reg.VnS(), p_reg,
+ VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+ break;
case DataType::Type::kInt64:
case DataType::Type::kFloat64:
- DCHECK_LE(2u, instruction->GetVectorLength());
- DCHECK_LE(instruction->GetVectorLength(), 16u);
- __ Ldr(reg,
- VecNeonAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch));
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ Ld1d(reg.VnD(), p_reg,
+ VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1455,26 +1181,39 @@
}
void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) {
+ DCHECK(instruction->IsPredicated());
LocationSummary* locations = instruction->GetLocations();
size_t size = DataType::Size(instruction->GetPackedType());
- VRegister reg = VRegisterFrom(locations->InAt(2));
+ const ZRegister reg = ZRegisterFrom(locations->InAt(2));
UseScratchRegisterScope temps(GetVIXLAssembler());
Register scratch;
+ const PRegisterZ p_reg = LoopPReg().Zeroing();
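+ // Predicated stores leave the memory for inactive lanes untouched; the predicate is
+ // passed in its zeroing form to the St1* instructions below.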
switch (instruction->GetPackedType()) {
case DataType::Type::kBool:
case DataType::Type::kUint8:
case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ St1b(reg.VnB(), p_reg,
+ VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+ break;
case DataType::Type::kUint16:
case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ St1h(reg.VnH(), p_reg,
+ VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+ break;
case DataType::Type::kInt32:
case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ St1w(reg.VnS(), p_reg,
+ VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+ break;
case DataType::Type::kInt64:
case DataType::Type::kFloat64:
- DCHECK_LE(2u, instruction->GetVectorLength());
- DCHECK_LE(instruction->GetVectorLength(), 16u);
- __ Str(reg,
- VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ St1d(reg.VnD(), p_reg,
+ VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
break;
default:
LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1483,33 +1222,113 @@
}
void LocationsBuilderARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
- UNREACHABLE();
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ DCHECK(instruction->InputAt(0)->IsIntConstant());
+ locations->SetInAt(0, Location::NoLocation());
+ locations->SetOut(Location::NoLocation());
}
void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
- UNREACHABLE();
+ // The instruction is not predicated; see nodes_vector.h.
+ DCHECK(!instruction->IsPredicated());
+ const PRegister p_reg = LoopPReg();
+
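+ // Set every lane of the governing predicate for the given element width
+ // (PTRUE with the SVE_ALL pattern).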
+ switch (instruction->GetPackedType()) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ DCHECK_EQ(16u, instruction->GetVectorLength());
+ __ Ptrue(p_reg.VnB(), vixl::aarch64::SVE_ALL);
+ break;
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ DCHECK_EQ(8u, instruction->GetVectorLength());
+ __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL);
+ break;
+ case DataType::Type::kInt32:
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instruction->GetVectorLength());
+ __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL);
+ break;
+ case DataType::Type::kInt64:
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instruction->GetVectorLength());
+ __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
+ UNREACHABLE();
+ }
}
void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
- UNREACHABLE();
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ locations->SetInAt(0, Location::RequiresRegister());
+ locations->SetInAt(1, Location::RequiresRegister());
+ // The instruction doesn't really need a core register as its out location; this is a hack
+ // to work around the absence of support for vector predicates in register allocation.
+ //
+ // Semantically, the out location of this instruction and the predicate input locations of
+ // its users should be a fixed predicate register (similar to
+ // Location::RegisterLocation(int reg)). But the register allocator (RA) doesn't support
+ // SIMD registers (e.g. predicate registers), so LoopPReg() is used explicitly without
+ // exposing it to the RA.
+ //
+ // To make the RA happy, Location::NoLocation() is used for the predicate inputs of all
+ // vector instructions; but for the PredSetOperations (e.g. VecPredWhile)
+ // Location::NoLocation() can't be used without changes to the RA - the check
+ // "ssa_liveness_analysis.cc] Check failed: input->IsEmittedAtUseSite()" would fire.
+ //
+ // Using a core register as a hack is the easiest way to tackle this problem. The RA will
+ // block one core register for the loop without actually using it; this should not be
+ // a performance issue as a SIMD loop operates mainly on SIMD registers.
+ //
+ // TODO: Support SIMD types in register allocator.
+ locations->SetOut(Location::RequiresRegister());
}
void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
- UNREACHABLE();
+ // The instruction is not predicated; see nodes_vector.h.
+ DCHECK(!instruction->IsPredicated());
+ // The current implementation of predicated loop execution only supports the kLO condition.
+ DCHECK(instruction->GetCondKind() == HVecPredWhile::CondKind::kLO);
+ Register left = InputRegisterAt(instruction, 0);
+ Register right = InputRegisterAt(instruction, 1);
+
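+ // WHILELO sets lane n of the predicate while (left + n) < right, so the predicate
+ // covers exactly the remaining iterations, up to the vector length.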
+ switch (instruction->GetVectorLength()) {
+ case 16u:
+ __ Whilelo(LoopPReg().VnB(), left, right);
+ break;
+ case 8u:
+ __ Whilelo(LoopPReg().VnH(), left, right);
+ break;
+ case 4u:
+ __ Whilelo(LoopPReg().VnS(), left, right);
+ break;
+ case 2u:
+ __ Whilelo(LoopPReg().VnD(), left, right);
+ break;
+ default:
+ LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
+ UNREACHABLE();
+ }
}
void LocationsBuilderARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
- UNREACHABLE();
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+ locations->SetInAt(0, Location::NoLocation());
+ // Result of the operation - a boolean value in a core register.
+ locations->SetOut(Location::RequiresRegister());
}
void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) {
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
- UNREACHABLE();
+ // The instruction is not predicated; see nodes_vector.h.
+ DCHECK(!instruction->IsPredicated());
+ Register reg = OutputRegister(instruction);
+ // Currently VecPredCondition is only used as part of vectorized loop check condition
+ // evaluation.
+ DCHECK(instruction->GetPCondKind() == HVecPredCondition::PCondKind::kNFirst);
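+ // This relies on the condition flags set by the Whilelo emitted just before: N holds the
+ // first lane of the produced predicate, so kNFirst (first lane inactive, i.e. no
+ // iterations left) maps to the pl condition.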
+ __ Cset(reg, pl);
}
Location InstructionCodeGeneratorARM64Sve::AllocateSIMDScratchLocation(
@@ -1547,13 +1366,13 @@
DCHECK(source.IsSIMDStackSlot());
UseScratchRegisterScope temps(GetVIXLAssembler());
if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) {
- Register temp = temps.AcquireX();
+ const Register temp = temps.AcquireX();
__ Ldr(temp, MemOperand(sp, source.GetStackIndex()));
__ Str(temp, MemOperand(sp, destination.GetStackIndex()));
__ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize));
__ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize));
} else {
- VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize);
+ const VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize);
__ Ldr(temp, StackOperandFrom(source));
__ Str(temp, StackOperandFrom(destination));
}
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index d652492..7220781 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -102,6 +102,11 @@
return vixl::aarch64::VRegister(location.reg());
}
+inline vixl::aarch64::ZRegister ZRegisterFrom(Location location) {
+ DCHECK(location.IsFpuRegister()) << location;
+ return vixl::aarch64::ZRegister(location.reg());
+}
+
inline vixl::aarch64::VRegister SRegisterFrom(Location location) {
DCHECK(location.IsFpuRegister()) << location;
return vixl::aarch64::SRegister(location.reg());
@@ -298,7 +303,7 @@
}
inline Location ARM64EncodableConstantOrRegister(HInstruction* constant,
- HInstruction* instr) {
+ HInstruction* instr) {
if (constant->IsConstant()
&& Arm64CanEncodeConstantAsImmediate(constant->AsConstant(), instr)) {
return Location::ConstantLocation(constant->AsConstant());
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 7137617..8970372 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -289,56 +289,72 @@
}
ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator();
-
- if (mul->HasOnlyOneNonEnvironmentUse()) {
- HInstruction* use = mul->GetUses().front().GetUser();
- if (use->IsVecAdd() || use->IsVecSub()) {
- // Replace code looking like
- // VECMUL tmp, x, y
- // VECADD/SUB dst, acc, tmp
- // with
- // VECMULACC dst, acc, x, y
- // Note that we do not want to (unconditionally) perform the merge when the
- // multiplication has multiple uses and it can be merged in all of them.
- // Multiple uses could happen on the same control-flow path, and we would
- // then increase the amount of work. In the future we could try to evaluate
- // whether all uses are on different control-flow paths (using dominance and
- // reverse-dominance information) and only perform the merge when they are.
- HInstruction* accumulator = nullptr;
- HVecBinaryOperation* binop = use->AsVecBinaryOperation();
- HInstruction* binop_left = binop->GetLeft();
- HInstruction* binop_right = binop->GetRight();
- // This is always true since the `HVecMul` has only one use (which is checked above).
- DCHECK_NE(binop_left, binop_right);
- if (binop_right == mul) {
- accumulator = binop_left;
- } else if (use->IsVecAdd()) {
- DCHECK_EQ(binop_left, mul);
- accumulator = binop_right;
- }
-
- HInstruction::InstructionKind kind =
- use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
- if (accumulator != nullptr) {
- HVecMultiplyAccumulate* mulacc =
- new (allocator) HVecMultiplyAccumulate(allocator,
- kind,
- accumulator,
- mul->GetLeft(),
- mul->GetRight(),
- binop->GetPackedType(),
- binop->GetVectorLength(),
- binop->GetDexPc());
-
- binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
- DCHECK(!mul->HasUses());
- mul->GetBlock()->RemoveInstruction(mul);
- return true;
- }
- }
+ if (!mul->HasOnlyOneNonEnvironmentUse()) {
+ return false;
+ }
+ HInstruction* binop = mul->GetUses().front().GetUser();
+ if (!binop->IsVecAdd() && !binop->IsVecSub()) {
+ return false;
}
- return false;
+ // Replace code looking like
+ // VECMUL tmp, x, y
+ // VECADD/SUB dst, acc, tmp
+ // with
+ // VECMULACC dst, acc, x, y
+ // Note that we do not want to (unconditionally) perform the merge when the
+ // multiplication has multiple uses and it can be merged in all of them.
+ // Multiple uses could happen on the same control-flow path, and we would
+ // then increase the amount of work. In the future we could try to evaluate
+ // whether all uses are on different control-flow paths (using dominance and
+ // reverse-dominance information) and only perform the merge when they are.
+ HInstruction* accumulator = nullptr;
+ HVecBinaryOperation* vec_binop = binop->AsVecBinaryOperation();
+ HInstruction* binop_left = vec_binop->GetLeft();
+ HInstruction* binop_right = vec_binop->GetRight();
+ // This is always true since the `HVecMul` has only one use (which is checked above).
+ DCHECK_NE(binop_left, binop_right);
+ if (binop_right == mul) {
+ accumulator = binop_left;
+ } else {
+ DCHECK_EQ(binop_left, mul);
+ // Only addition is commutative.
+ if (!binop->IsVecAdd()) {
+ return false;
+ }
+ accumulator = binop_right;
+ }
+
+ DCHECK(accumulator != nullptr);
+ HInstruction::InstructionKind kind =
+ binop->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
+
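+ // Only fuse predicated operations that are governed by the same predicate with the
+ // same predication kind; otherwise the merge would change which lanes are computed.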
+ bool predicated_simd = vec_binop->IsPredicated();
+ if (predicated_simd && !HVecOperation::HaveSamePredicate(vec_binop, mul)) {
+ return false;
+ }
+
+ HVecMultiplyAccumulate* mulacc =
+ new (allocator) HVecMultiplyAccumulate(allocator,
+ kind,
+ accumulator,
+ mul->GetLeft(),
+ mul->GetRight(),
+ vec_binop->GetPackedType(),
+ vec_binop->GetVectorLength(),
+ vec_binop->GetDexPc());
+
+ vec_binop->GetBlock()->ReplaceAndRemoveInstructionWith(vec_binop, mulacc);
+ if (predicated_simd) {
+ mulacc->SetGoverningPredicate(vec_binop->GetGoverningPredicate(),
+ vec_binop->GetPredicationKind());
+ }
+
+ DCHECK(!mul->HasUses());
+ mul->GetBlock()->RemoveInstruction(mul);
+ return true;
}
void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index 260bfaf..ff0859b 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -277,14 +277,17 @@
}
void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) {
- if (!instruction->IsStringCharAt()
- && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
+ // TODO: Extract regular HIntermediateAddress.
+ if (!instruction->IsPredicated() && !instruction->IsStringCharAt() &&
+ TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
RecordSimplification();
}
}
void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) {
- if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
+ // TODO: Extract regular HIntermediateAddress.
+ if (!instruction->IsPredicated() &&
+ TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
RecordSimplification();
}
}
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index a776c37..76bd849 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -17,6 +17,7 @@
#include "loop_analysis.h"
#include "base/bit_vector-inl.h"
+#include "code_generator.h"
#include "induction_var_range.h"
namespace art {
@@ -76,6 +77,7 @@
// is provided. Enables scalar loop peeling and unrolling with the most conservative heuristics.
class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper {
public:
+ explicit ArchDefaultLoopHelper(const CodeGenerator& codegen) : ArchNoOptsLoopHelper(codegen) {}
// Scalar loop unrolling parameters and heuristics.
//
// Maximum possible unrolling factor.
@@ -132,6 +134,7 @@
// peeling and unrolling and supports SIMD loop unrolling.
class Arm64LoopHelper : public ArchDefaultLoopHelper {
public:
+ explicit Arm64LoopHelper(const CodeGenerator& codegen) : ArchDefaultLoopHelper(codegen) {}
// SIMD loop unrolling parameters and heuristics.
//
// Maximum possible unrolling factor.
@@ -157,6 +160,10 @@
// Don't unroll with insufficient iterations.
// TODO: Unroll loops with unknown trip count.
DCHECK_NE(vector_length, 0u);
+ // TODO: Unroll loops in predicated vectorization mode.
+ if (codegen_.SupportsPredicatedSIMD()) {
+ return LoopAnalysisInfo::kNoUnrollingFactor;
+ }
if (trip_count < (2 * vector_length + max_peel)) {
return LoopAnalysisInfo::kNoUnrollingFactor;
}
@@ -309,6 +316,8 @@
uint32_t GetUnrollingFactor(HLoopInformation* loop_info, HBasicBlock* header) const;
public:
+ explicit X86_64LoopHelper(const CodeGenerator& codegen) : ArchDefaultLoopHelper(codegen) {}
+
uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
int64_t trip_count,
uint32_t max_peel,
@@ -398,17 +407,18 @@
return (1 << unrolling_factor);
}
-ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(InstructionSet isa,
+ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(const CodeGenerator& codegen,
ArenaAllocator* allocator) {
+ InstructionSet isa = codegen.GetInstructionSet();
switch (isa) {
case InstructionSet::kArm64: {
- return new (allocator) Arm64LoopHelper;
+ return new (allocator) Arm64LoopHelper(codegen);
}
case InstructionSet::kX86_64: {
- return new (allocator) X86_64LoopHelper;
+ return new (allocator) X86_64LoopHelper(codegen);
}
default: {
- return new (allocator) ArchDefaultLoopHelper;
+ return new (allocator) ArchDefaultLoopHelper(codegen);
}
}
}
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index 57509ee..fbf1516 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -21,6 +21,7 @@
namespace art {
+class CodeGenerator;
class InductionVarRange;
class LoopAnalysis;
@@ -132,11 +133,12 @@
//
class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> {
public:
+ explicit ArchNoOptsLoopHelper(const CodeGenerator& codegen) : codegen_(codegen) {}
virtual ~ArchNoOptsLoopHelper() {}
// Creates an instance of specialised helper for the target or default helper if the target
// doesn't support loop peeling and unrolling.
- static ArchNoOptsLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
+ static ArchNoOptsLoopHelper* Create(const CodeGenerator& codegen, ArenaAllocator* allocator);
// Returns whether the loop is not beneficial for loop peeling/unrolling.
//
@@ -176,6 +178,9 @@
uint32_t vector_length ATTRIBUTE_UNUSED) const {
return LoopAnalysisInfo::kNoUnrollingFactor;
}
+
+ protected:
+ const CodeGenerator& codegen_;
};
} // namespace art
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 4c9b01c..1210dbe 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -473,6 +473,7 @@
iset_(nullptr),
reductions_(nullptr),
simplified_(false),
+ predicated_vectorization_mode_(codegen.SupportsPredicatedSIMD()),
vector_length_(0),
vector_refs_(nullptr),
vector_static_peeling_factor_(0),
@@ -486,10 +487,7 @@
vector_header_(nullptr),
vector_body_(nullptr),
vector_index_(nullptr),
- arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_options_ != nullptr
- ? compiler_options_->GetInstructionSet()
- : InstructionSet::kNone,
- global_allocator_)) {
+ arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) {
}
bool HLoopOptimization::Run() {
@@ -1024,8 +1022,10 @@
}
} // for i
- // Find a suitable alignment strategy.
- SetAlignmentStrategy(peeling_votes, peeling_candidate);
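+ // Predicated mode does not generate peeling, so there is no alignment strategy to
+ // select (see the DCHECKs on the peeling paths below).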
+ if (!IsInPredicatedVectorizationMode()) {
+ // Find a suitable alignment strategy.
+ SetAlignmentStrategy(peeling_votes, peeling_candidate);
+ }
// Does vectorization seem profitable?
if (!IsVectorizationProfitable(trip_count)) {
@@ -1052,8 +1052,8 @@
// A cleanup loop is needed, at least, for any unknown trip count or
// for a known trip count with remainder iterations after vectorization.
- bool needs_cleanup = trip_count == 0 ||
- ((trip_count - vector_static_peeling_factor_) % chunk) != 0;
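+ // In predicated mode the remainder iterations are executed by the main vector loop
+ // under the governing predicate, so no scalar cleanup loop is needed.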
+ bool needs_cleanup = !IsInPredicatedVectorizationMode() &&
+ (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0);
// Adjust vector bookkeeping.
HPhi* main_phi = nullptr;
@@ -1071,11 +1071,13 @@
// ptc = <peeling factor>;
HInstruction* ptc = nullptr;
if (vector_static_peeling_factor_ != 0) {
+ DCHECK(!IsInPredicatedVectorizationMode());
// Static loop peeling for SIMD alignment (using the most suitable
// fixed peeling factor found during prior alignment analysis).
DCHECK(vector_dynamic_peeling_candidate_ == nullptr);
ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_);
} else if (vector_dynamic_peeling_candidate_ != nullptr) {
+ DCHECK(!IsInPredicatedVectorizationMode());
// Dynamic loop peeling for SIMD alignment (using the most suitable
// candidate found during prior alignment analysis):
// rem = offset % ALIGN; // adjusted as #elements
@@ -1106,6 +1108,7 @@
HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
HInstruction* vtc = stc;
if (needs_cleanup) {
+ DCHECK(!IsInPredicatedVectorizationMode());
DCHECK(IsPowerOfTwo(chunk));
HInstruction* diff = stc;
if (ptc != nullptr) {
@@ -1143,6 +1146,7 @@
// moved around during suspend checks, since all analysis was based on
// nothing more than the Android runtime alignment conventions.
if (ptc != nullptr) {
+ DCHECK(!IsInPredicatedVectorizationMode());
vector_mode_ = kSequential;
GenerateNewLoop(node,
block,
@@ -1170,6 +1174,7 @@
// for ( ; i < stc; i += 1)
// <loop-body>
if (needs_cleanup) {
+ DCHECK(!IsInPredicatedVectorizationMode() || vector_runtime_test_a_ != nullptr);
vector_mode_ = kSequential;
GenerateNewLoop(node,
block,
@@ -1227,9 +1232,35 @@
// Generate header and prepare body.
// for (i = lo; i < hi; i += step)
// <loop-body>
- HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
- vector_header_->AddPhi(phi);
- vector_header_->AddInstruction(cond);
+ HInstruction* cond = nullptr;
+ HInstruction* set_pred = nullptr;
+ if (IsInPredicatedVectorizationMode()) {
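+ // In predicated mode the header recomputes the governing predicate with a
+ // VecPredWhile (i < hi, unsigned) and exits the loop when its first lane is
+ // inactive (kNFirst), instead of using the scalar i >= hi comparison.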
+ HVecPredWhile* pred_while =
+ new (global_allocator_) HVecPredWhile(global_allocator_,
+ phi,
+ hi,
+ HVecPredWhile::CondKind::kLO,
+ DataType::Type::kInt32,
+ vector_length_,
+ 0u);
+
+ cond = new (global_allocator_) HVecPredCondition(global_allocator_,
+ pred_while,
+ HVecPredCondition::PCondKind::kNFirst,
+ DataType::Type::kInt32,
+ vector_length_,
+ 0u);
+
+ vector_header_->AddPhi(phi);
+ vector_header_->AddInstruction(pred_while);
+ vector_header_->AddInstruction(cond);
+ set_pred = pred_while;
+ } else {
+ cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+ vector_header_->AddPhi(phi);
+ vector_header_->AddInstruction(cond);
+ }
+
vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
vector_index_ = phi;
vector_permanent_map_->clear(); // preserved over unrolling
@@ -1246,6 +1277,10 @@
auto i = vector_map_->find(it.Current());
if (i != vector_map_->end() && !i->second->IsInBlock()) {
Insert(vector_body_, i->second);
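+ // In predicated mode every vector operation in the loop body is governed by the
+ // loop predicate computed in the header.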
+ if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) {
+ HVecOperation* op = i->second->AsVecOperation();
+ op->SetMergingGoverningPredicate(set_pred);
+ }
// Deal with instructions that need an environment, such as the scalar intrinsics.
if (i->second->NeedsEnvironment()) {
i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
@@ -1360,7 +1395,10 @@
} else if (instruction->IsArrayGet()) {
// Deal with vector restrictions.
bool is_string_char_at = instruction->AsArrayGet()->IsStringCharAt();
- if (is_string_char_at && HasVectorRestrictions(restrictions, kNoStringCharAt)) {
+
+ if (is_string_char_at && (HasVectorRestrictions(restrictions, kNoStringCharAt) ||
+ IsInPredicatedVectorizationMode())) {
+ // TODO: Support CharAt for predicated mode.
return false;
}
// Accept a right-hand-side array base[index] for
@@ -1575,32 +1613,73 @@
}
return false;
case InstructionSet::kArm64:
- // Allow vectorization for all ARM devices, because Android assumes that
- // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
- switch (type) {
- case DataType::Type::kBool:
- case DataType::Type::kUint8:
- case DataType::Type::kInt8:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(type, 16);
- case DataType::Type::kUint16:
- case DataType::Type::kInt16:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(type, 8);
- case DataType::Type::kInt32:
- *restrictions |= kNoDiv;
- return TrySetVectorLength(type, 4);
- case DataType::Type::kInt64:
- *restrictions |= kNoDiv | kNoMul;
- return TrySetVectorLength(type, 2);
- case DataType::Type::kFloat32:
- *restrictions |= kNoReduction;
- return TrySetVectorLength(type, 4);
- case DataType::Type::kFloat64:
- *restrictions |= kNoReduction;
- return TrySetVectorLength(type, 2);
- default:
- return false;
+ if (IsInPredicatedVectorizationMode()) {
+ // SVE vectorization.
+ CHECK(features->AsArm64InstructionSetFeatures()->HasSVE());
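+ // The restrictions below are more conservative than for NEON while the SVE backend
+ // only implements a subset of predicated operations; the vector lengths correspond
+ // to 128-bit vectors.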
+ switch (type) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ *restrictions |= kNoDiv |
+ kNoSignedHAdd |
+ kNoUnsignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD;
+ return TrySetVectorLength(type, 16);
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ *restrictions |= kNoDiv |
+ kNoSignedHAdd |
+ kNoUnsignedHAdd |
+ kNoUnroundedHAdd |
+ kNoSAD |
+ kNoDotProd;
+ return TrySetVectorLength(type, 8);
+ case DataType::Type::kInt32:
+ *restrictions |= kNoDiv | kNoSAD;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kInt64:
+ *restrictions |= kNoDiv | kNoSAD;
+ return TrySetVectorLength(type, 2);
+ case DataType::Type::kFloat32:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kFloat64:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 2);
+ default:
+ break;
+ }
+ return false;
+ } else {
+ // Allow vectorization for all ARM devices, because Android assumes that
+ // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
+ switch (type) {
+ case DataType::Type::kBool:
+ case DataType::Type::kUint8:
+ case DataType::Type::kInt8:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 16);
+ case DataType::Type::kUint16:
+ case DataType::Type::kInt16:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 8);
+ case DataType::Type::kInt32:
+ *restrictions |= kNoDiv;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kInt64:
+ *restrictions |= kNoDiv | kNoMul;
+ return TrySetVectorLength(type, 2);
+ case DataType::Type::kFloat32:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 4);
+ case DataType::Type::kFloat64:
+ *restrictions |= kNoReduction;
+ return TrySetVectorLength(type, 2);
+ default:
+ break;
+ }
+ return false;
}
case InstructionSet::kX86:
case InstructionSet::kX86_64:
@@ -1693,6 +1772,15 @@
vector = new (global_allocator_)
HVecReplicateScalar(global_allocator_, input, type, vector_length_, kNoDexPc);
vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
+ if (IsInPredicatedVectorizationMode()) {
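+ // Vector operations outside the main loop body (here, in the preheader) are
+ // governed by an all-true predicate.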
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length_,
+ 0u);
+ vector_preheader_->InsertInstructionBefore(set_pred, vector);
+ vector->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
}
vector_map_->Put(org, vector);
}
@@ -1821,6 +1909,15 @@
vector_length,
kNoDexPc));
}
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length,
+ 0u);
+ vector_preheader_->InsertInstructionBefore(set_pred, new_init);
+ new_init->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
} else {
new_init = ReduceAndExtractIfNeeded(new_init);
}
@@ -1852,6 +1949,17 @@
instruction = new (global_allocator_) HVecExtractScalar(
global_allocator_, reduce, type, vector_length, 0, kNoDexPc);
exit->InsertInstructionAfter(instruction, reduce);
+
+ if (IsInPredicatedVectorizationMode()) {
+ HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+ graph_->GetIntConstant(1),
+ type,
+ vector_length,
+ 0u);
+ exit->InsertInstructionBefore(set_pred, reduce);
+ reduce->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ instruction->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+ }
}
}
return instruction;
@@ -1991,7 +2099,8 @@
return false;
}
// Deal with vector restrictions.
- if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
+ if ((is_unsigned && HasVectorRestrictions(restrictions, kNoUnsignedHAdd)) ||
+ (!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
(!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
return false;
}
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 0c35f29..0d76804 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -76,13 +76,14 @@
kNoShr = 1 << 3, // no arithmetic shift right
kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits
kNoSignedHAdd = 1 << 5, // no signed halving add
- kNoUnroundedHAdd = 1 << 6, // no unrounded halving add
- kNoAbs = 1 << 7, // no absolute value
- kNoStringCharAt = 1 << 8, // no StringCharAt
- kNoReduction = 1 << 9, // no reduction
- kNoSAD = 1 << 10, // no sum of absolute differences (SAD)
- kNoWideSAD = 1 << 11, // no sum of absolute differences (SAD) with operand widening
- kNoDotProd = 1 << 12, // no dot product
+ kNoUnsignedHAdd = 1 << 6, // no unsigned halving add
+ kNoUnroundedHAdd = 1 << 7, // no unrounded halving add
+ kNoAbs = 1 << 8, // no absolute value
+ kNoStringCharAt = 1 << 9, // no StringCharAt
+ kNoReduction = 1 << 10, // no reduction
+ kNoSAD = 1 << 11, // no sum of absolute differences (SAD)
+ kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening
+ kNoDotProd = 1 << 13, // no dot product
};
/*
@@ -270,6 +271,8 @@
void RemoveDeadInstructions(const HInstructionList& list);
bool CanRemoveCycle(); // Whether the current 'iset_' is removable.
+ bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; }
+
// Compiler options (to query ISA features).
const CompilerOptions* compiler_options_;
@@ -305,6 +308,9 @@
// Flag that tracks if any simplifications have occurred.
bool simplified_;
+ // Whether to use predicated loop vectorization (e.g. for arm64 SVE target).
+ bool predicated_vectorization_mode_;
+
// Number of "lanes" for selected packed type.
uint32_t vector_length_;
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 9c6b422..a2cd86d 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -145,6 +145,15 @@
return pred_input->AsVecPredSetOperation();
}
+ // Returns whether two vector operations are predicated by the same vector predicate
+ // with the same predication type.
+ static bool HaveSamePredicate(HVecOperation* instr0, HVecOperation* instr1) {
+ HVecPredSetOperation* instr0_predicate = instr0->GetGoverningPredicate();
+ HVecOperation::PredicationKind instr0_predicate_kind = instr0->GetPredicationKind();
+ return instr1->GetGoverningPredicate() == instr0_predicate &&
+ instr1->GetPredicationKind() == instr0_predicate_kind;
+ }
+
// Returns the number of elements packed in a vector.
size_t GetVectorLength() const {
return vector_length_;