| author | 2017-04-03 18:47:32 +0100 |
|---|---|
| committer | 2017-04-10 11:43:33 +0100 |
| commit | d4bccf1ece319a3a99e03ecbcbbf40bb82b9e331 (patch) |
| tree | 2890740d9cab3eee2be223666f528c6707b89f90 /compiler/optimizing |
| parent | 903b8169074c01590ab3f5ad9190d9c7e3fe795b (diff) |
ARM64: Support 128-bit registers for SIMD.
Test: test-art-host, test-art-target
Change-Id: Ifb931a99d34ea77602a0e0781040ed092de9faaa
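Moving the vectorizer from 64-bit D registers to 128-bit Q registers doubles the lane count of every packed type, which is why each `DCHECK_EQ` on `GetVectorLength()` and each `TrySetVectorLength()` constant in the diff below doubles. A minimal standalone sketch of that arithmetic (illustrative only, not part of the patch):

```cpp
#include <cstdio>

// Lanes per vector register = register width / element width.
constexpr unsigned Lanes(unsigned reg_bits, unsigned elem_bits) {
  return reg_bits / elem_bits;
}

int main() {
  // D registers (64-bit) vs. Q registers (128-bit), matching the
  // vector lengths asserted in the hunks below.
  std::printf("boolean/byte: %u -> %u\n", Lanes(64, 8),  Lanes(128, 8));   // 8 -> 16
  std::printf("char/short:   %u -> %u\n", Lanes(64, 16), Lanes(128, 16));  // 4 -> 8
  std::printf("int/float:    %u -> %u\n", Lanes(64, 32), Lanes(128, 32));  // 2 -> 4
  return 0;
}
```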
Diffstat (limited to 'compiler/optimizing')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | compiler/optimizing/code_generator_arm64.cc | 42 |
| -rw-r--r-- | compiler/optimizing/code_generator_arm64.h | 5 |
| -rw-r--r-- | compiler/optimizing/code_generator_vector_arm64.cc | 178 |
| -rw-r--r-- | compiler/optimizing/codegen_test.cc | 39 |
| -rw-r--r-- | compiler/optimizing/common_arm64.h | 5 |
| -rw-r--r-- | compiler/optimizing/loop_optimization.cc | 11 |
6 files changed, 177 insertions, 103 deletions
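Note that the code_generator_arm64.h hunk below is the cost side of the change: once a graph contains SIMD, every floating-point spill slot is Q-sized (16 bytes) rather than D-sized (8 bytes). A small model of `GetFloatingPointSpillSlotSize()` under that assumption (hypothetical helper, not the patch's code):

```cpp
#include <cstddef>

constexpr std::size_t kArm64WordSize = 8;  // bytes, as in ART

// Mirrors GetFloatingPointSpillSlotSize(): Q-sized slots when SIMD is present.
constexpr std::size_t FpSpillSlotSize(bool has_simd) {
  return has_simd ? 2 * kArm64WordSize   // 16 bytes, fits a Q register
                  : 1 * kArm64WordSize;  // 8 bytes, fits a D register
}

static_assert(FpSpillSlotSize(true) == 16, "Q-register spill slot");
static_assert(FpSpillSlotSize(false) == 8, "D-register spill slot");
```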
```diff
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 794e05c670..b39a0e43fa 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -68,6 +68,7 @@ using helpers::OperandFromMemOperand;
 using helpers::OutputCPURegister;
 using helpers::OutputFPRegister;
 using helpers::OutputRegister;
+using helpers::QRegisterFrom;
 using helpers::RegisterFrom;
 using helpers::StackOperandFrom;
 using helpers::VIXLRegCodeFromART;
@@ -1459,9 +1460,12 @@ void ParallelMoveResolverARM64::FinishEmitNativeCode() {
 }
 
 Location ParallelMoveResolverARM64::AllocateScratchLocationFor(Location::Kind kind) {
-  DCHECK(kind == Location::kRegister || kind == Location::kFpuRegister ||
-         kind == Location::kStackSlot || kind == Location::kDoubleStackSlot);
-  kind = (kind == Location::kFpuRegister) ? Location::kFpuRegister : Location::kRegister;
+  DCHECK(kind == Location::kRegister || kind == Location::kFpuRegister
+         || kind == Location::kStackSlot || kind == Location::kDoubleStackSlot
+         || kind == Location::kSIMDStackSlot);
+  kind = (kind == Location::kFpuRegister || kind == Location::kSIMDStackSlot)
+      ? Location::kFpuRegister
+      : Location::kRegister;
   Location scratch = GetScratchLocation(kind);
   if (!scratch.Equals(Location::NoLocation())) {
     return scratch;
@@ -1471,7 +1475,9 @@ Location ParallelMoveResolverARM64::AllocateScratchLocationFor(Location::Kind ki
     scratch = LocationFrom(vixl_temps_.AcquireX());
   } else {
     DCHECK(kind == Location::kFpuRegister);
-    scratch = LocationFrom(vixl_temps_.AcquireD());
+    scratch = LocationFrom(codegen_->GetGraph()->HasSIMD()
+        ? vixl_temps_.AcquireVRegisterOfSize(kQRegSize)
+        : vixl_temps_.AcquireD());
   }
   AddScratchLocation(scratch);
   return scratch;
@@ -1482,7 +1488,7 @@ void ParallelMoveResolverARM64::FreeScratchLocation(Location loc) {
     vixl_temps_.Release(XRegisterFrom(loc));
   } else {
     DCHECK(loc.IsFpuRegister());
-    vixl_temps_.Release(DRegisterFrom(loc));
+    vixl_temps_.Release(codegen_->GetGraph()->HasSIMD() ? QRegisterFrom(loc) : DRegisterFrom(loc));
   }
   RemoveScratchLocation(loc);
 }
@@ -1745,6 +1751,8 @@ void CodeGeneratorARM64::MoveLocation(Location destination,
     if (source.IsStackSlot() || source.IsDoubleStackSlot()) {
       DCHECK(dst.Is64Bits() == source.IsDoubleStackSlot());
       __ Ldr(dst, StackOperandFrom(source));
+    } else if (source.IsSIMDStackSlot()) {
+      __ Ldr(QRegisterFrom(destination), StackOperandFrom(source));
     } else if (source.IsConstant()) {
       DCHECK(CoherentConstantAndType(source, dst_type));
       MoveConstant(dst, source.GetConstant());
@@ -1767,7 +1775,29 @@ void CodeGeneratorARM64::MoveLocation(Location destination,
         __ Fmov(RegisterFrom(destination, dst_type), FPRegisterFrom(source, source_type));
       } else {
         DCHECK(destination.IsFpuRegister());
-        __ Fmov(FPRegister(dst), FPRegisterFrom(source, dst_type));
+        if (GetGraph()->HasSIMD()) {
+          __ Mov(QRegisterFrom(destination), QRegisterFrom(source));
+        } else {
+          __ Fmov(FPRegister(dst), FPRegisterFrom(source, dst_type));
+        }
+      }
+    }
+  } else if (destination.IsSIMDStackSlot()) {
+    if (source.IsFpuRegister()) {
+      __ Str(QRegisterFrom(source), StackOperandFrom(destination));
+    } else {
+      DCHECK(source.IsSIMDStackSlot());
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+      if (GetVIXLAssembler()->GetScratchFPRegisterList()->IsEmpty()) {
+        Register temp = temps.AcquireX();
+        __ Ldr(temp, MemOperand(sp, source.GetStackIndex()));
+        __ Str(temp, MemOperand(sp, destination.GetStackIndex()));
+        __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize));
+        __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize));
+      } else {
+        FPRegister temp = temps.AcquireVRegisterOfSize(kQRegSize);
+        __ Ldr(temp, StackOperandFrom(source));
+        __ Str(temp, StackOperandFrom(destination));
       }
     }
   } else {  // The destination is not a register. It must be a stack slot.
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 10d8b841f8..869aad2942 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -412,8 +412,9 @@ class CodeGeneratorARM64 : public CodeGenerator {
   }
 
   size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
-    // Allocated in D registers, which are word sized.
-    return kArm64WordSize;
+    return GetGraph()->HasSIMD()
+        ? 2 * kArm64WordSize   // 16 bytes == 2 arm64 words for each spill
+        : 1 * kArm64WordSize;  // 8 bytes == 1 arm64 words for each spill
   }
 
   uintptr_t GetAddressOf(HBasicBlock* block) OVERRIDE {
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index f4874fe2bc..11c5e380fe 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -57,21 +57,21 @@ void InstructionCodeGeneratorARM64::VisitVecReplicateScalar(HVecReplicateScalar*
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Dup(dst.V8B(), InputRegisterAt(instruction, 0));
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Dup(dst.V16B(), InputRegisterAt(instruction, 0));
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Dup(dst.V4H(), InputRegisterAt(instruction, 0));
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Dup(dst.V8H(), InputRegisterAt(instruction, 0));
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Dup(dst.V2S(), InputRegisterAt(instruction, 0));
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Dup(dst.V4S(), InputRegisterAt(instruction, 0));
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Dup(dst.V2S(), DRegisterFrom(locations->InAt(0)).V2S(), 0);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Dup(dst.V4S(), DRegisterFrom(locations->InAt(0)).V4S(), 0);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -130,8 +130,8 @@ void InstructionCodeGeneratorARM64::VisitVecCnv(HVecCnv* instruction) {
   Primitive::Type from = instruction->GetInputType();
   Primitive::Type to = instruction->GetResultType();
   if (from == Primitive::kPrimInt && to == Primitive::kPrimFloat) {
-    DCHECK_EQ(2u, instruction->GetVectorLength());
-    __ Scvtf(dst.V2S(), src.V2S());
+    DCHECK_EQ(4u, instruction->GetVectorLength());
+    __ Scvtf(dst.V4S(), src.V4S());
   } else {
     LOG(FATAL) << "Unsupported SIMD type";
   }
@@ -147,21 +147,21 @@ void InstructionCodeGeneratorARM64::VisitVecNeg(HVecNeg* instruction) {
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Neg(dst.V8B(), src.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Neg(dst.V16B(), src.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Neg(dst.V4H(), src.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Neg(dst.V8H(), src.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Neg(dst.V2S(), src.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Neg(dst.V4S(), src.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fneg(dst.V2S(), src.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fneg(dst.V4S(), src.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -179,21 +179,21 @@ void InstructionCodeGeneratorARM64::VisitVecAbs(HVecAbs* instruction) {
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Abs(dst.V8B(), src.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Abs(dst.V16B(), src.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Abs(dst.V4H(), src.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Abs(dst.V8H(), src.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Abs(dst.V2S(), src.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Abs(dst.V4S(), src.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fabs(dst.V2S(), src.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fabs(dst.V4S(), src.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -210,15 +210,15 @@ void InstructionCodeGeneratorARM64::VisitVecNot(HVecNot* instruction) {
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:  // special case boolean-not
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Movi(dst.V8B(), 1);
-      __ Eor(dst.V8B(), dst.V8B(), src.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Movi(dst.V16B(), 1);
+      __ Eor(dst.V16B(), dst.V16B(), src.V16B());
       break;
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-      __ Not(dst.V8B(), src.V8B());  // lanes do not matter
+      __ Not(dst.V16B(), src.V16B());  // lanes do not matter
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -257,21 +257,21 @@ void InstructionCodeGeneratorARM64::VisitVecAdd(HVecAdd* instruction) {
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Add(dst.V8B(), lhs.V8B(), rhs.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Add(dst.V16B(), lhs.V16B(), rhs.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Add(dst.V4H(), lhs.V4H(), rhs.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Add(dst.V8H(), lhs.V8H(), rhs.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Add(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Add(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fadd(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fadd(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -290,21 +290,21 @@ void InstructionCodeGeneratorARM64::VisitVecSub(HVecSub* instruction) {
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Sub(dst.V8B(), lhs.V8B(), rhs.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Sub(dst.V16B(), lhs.V16B(), rhs.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Sub(dst.V4H(), lhs.V4H(), rhs.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Sub(dst.V8H(), lhs.V8H(), rhs.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Sub(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Sub(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fsub(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fsub(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -323,21 +323,21 @@ void InstructionCodeGeneratorARM64::VisitVecMul(HVecMul* instruction) {
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Mul(dst.V8B(), lhs.V8B(), rhs.V8B());
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Mul(dst.V16B(), lhs.V16B(), rhs.V16B());
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Mul(dst.V4H(), lhs.V4H(), rhs.V4H());
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Mul(dst.V8H(), lhs.V8H(), rhs.V8H());
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Mul(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Mul(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fmul(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fmul(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -356,8 +356,8 @@ void InstructionCodeGeneratorARM64::VisitVecDiv(HVecDiv* instruction) {
   FPRegister dst = DRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fdiv(dst.V2S(), lhs.V2S(), rhs.V2S());
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Fdiv(dst.V4S(), lhs.V4S(), rhs.V4S());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -381,7 +381,7 @@ void InstructionCodeGeneratorARM64::VisitVecAnd(HVecAnd* instruction) {
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      __ And(dst.V8B(), lhs.V8B(), rhs.V8B());  // lanes do not matter
+      __ And(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
      break;
    default:
      LOG(FATAL) << "Unsupported SIMD type";
@@ -413,7 +413,7 @@ void InstructionCodeGeneratorARM64::VisitVecOr(HVecOr* instruction) {
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      __ Orr(dst.V8B(), lhs.V8B(), rhs.V8B());  // lanes do not matter
+      __ Orr(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -437,7 +437,7 @@ void InstructionCodeGeneratorARM64::VisitVecXor(HVecXor* instruction) {
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      __ Eor(dst.V8B(), lhs.V8B(), rhs.V8B());  // lanes do not matter
+      __ Eor(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -474,17 +474,17 @@ void InstructionCodeGeneratorARM64::VisitVecShl(HVecShl* instruction) {
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Shl(dst.V8B(), lhs.V8B(), value);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Shl(dst.V16B(), lhs.V16B(), value);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Shl(dst.V4H(), lhs.V4H(), value);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Shl(dst.V8H(), lhs.V8H(), value);
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Shl(dst.V2S(), lhs.V2S(), value);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Shl(dst.V4S(), lhs.V4S(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -503,17 +503,17 @@ void InstructionCodeGeneratorARM64::VisitVecShr(HVecShr* instruction) {
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Sshr(dst.V8B(), lhs.V8B(), value);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Sshr(dst.V16B(), lhs.V16B(), value);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Sshr(dst.V4H(), lhs.V4H(), value);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Sshr(dst.V8H(), lhs.V8H(), value);
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Sshr(dst.V2S(), lhs.V2S(), value);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Sshr(dst.V4S(), lhs.V4S(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -532,17 +532,17 @@ void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) {
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Ushr(dst.V8B(), lhs.V8B(), value);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Ushr(dst.V16B(), lhs.V16B(), value);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Ushr(dst.V4H(), lhs.V4H(), value);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Ushr(dst.V8H(), lhs.V8H(), value);
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Ushr(dst.V2S(), lhs.V2S(), value);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Ushr(dst.V4S(), lhs.V4S(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -617,18 +617,18 @@ void InstructionCodeGeneratorARM64::VisitVecLoad(HVecLoad* instruction) {
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Ld1(reg.V8B(), mem);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Ld1(reg.V16B(), mem);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Ld1(reg.V4H(), mem);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Ld1(reg.V8H(), mem);
       break;
     case Primitive::kPrimInt:
    case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Ld1(reg.V2S(), mem);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Ld1(reg.V4S(), mem);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -647,18 +647,18 @@ void InstructionCodeGeneratorARM64::VisitVecStore(HVecStore* instruction) {
   switch (instruction->GetPackedType()) {
     case Primitive::kPrimBoolean:
     case Primitive::kPrimByte:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ St1(reg.V8B(), mem);
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ St1(reg.V16B(), mem);
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ St1(reg.V4H(), mem);
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ St1(reg.V8H(), mem);
       break;
     case Primitive::kPrimInt:
     case Primitive::kPrimFloat:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ St1(reg.V2S(), mem);
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ St1(reg.V4S(), mem);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index f8bbf68c1c..4ba5c5580f 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -769,6 +769,45 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverB34760542) {
   InternalCodeAllocator code_allocator;
   codegen.Finalize(&code_allocator);
 }
+
+// Check that ParallelMoveResolver works fine for ARM64 for both cases when SIMD is on and off.
+TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) {
+  std::unique_ptr<const Arm64InstructionSetFeatures> features(
+      Arm64InstructionSetFeatures::FromCppDefines());
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HGraph* graph = CreateGraph(&allocator);
+  arm64::CodeGeneratorARM64 codegen(graph, *features.get(), CompilerOptions());
+
+  codegen.Initialize();
+
+  graph->SetHasSIMD(true);
+  for (int i = 0; i < 2; i++) {
+    HParallelMove* move = new (graph->GetArena()) HParallelMove(graph->GetArena());
+    move->AddMove(Location::SIMDStackSlot(0),
+                  Location::SIMDStackSlot(257),
+                  Primitive::kPrimDouble,
+                  nullptr);
+    move->AddMove(Location::SIMDStackSlot(257),
+                  Location::SIMDStackSlot(0),
+                  Primitive::kPrimDouble,
+                  nullptr);
+    move->AddMove(Location::FpuRegisterLocation(0),
+                  Location::FpuRegisterLocation(1),
+                  Primitive::kPrimDouble,
+                  nullptr);
+    move->AddMove(Location::FpuRegisterLocation(1),
+                  Location::FpuRegisterLocation(0),
+                  Primitive::kPrimDouble,
+                  nullptr);
+    codegen.GetMoveResolver()->EmitNativeCode(move);
+    graph->SetHasSIMD(false);
+  }
+
+  InternalCodeAllocator code_allocator;
+  codegen.Finalize(&code_allocator);
+}
+
 #endif
 
 #ifdef ART_ENABLE_CODEGEN_mips
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index d3f431e327..5372b97247 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -92,6 +92,11 @@ inline vixl::aarch64::FPRegister DRegisterFrom(Location location) {
   return vixl::aarch64::FPRegister::GetDRegFromCode(location.reg());
 }
 
+inline vixl::aarch64::FPRegister QRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister()) << location;
+  return vixl::aarch64::FPRegister::GetQRegFromCode(location.reg());
+}
+
 inline vixl::aarch64::FPRegister SRegisterFrom(Location location) {
   DCHECK(location.IsFpuRegister()) << location;
   return vixl::aarch64::FPRegister::GetSRegFromCode(location.reg());
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index bf18cc9bbc..ec02127bee 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -770,22 +770,21 @@ bool HLoopOptimization::TrySetVectorType(Primitive::Type type, uint64_t* restric
       return false;
     case kArm64:
       // Allow vectorization for all ARM devices, because Android assumes that
-      // ARMv8 AArch64 always supports advanced SIMD. For now, only D registers
-      // (64-bit vectors) not Q registers (128-bit vectors).
+      // ARMv8 AArch64 always supports advanced SIMD.
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
           *restrictions |= kNoDiv | kNoAbs;
-          return TrySetVectorLength(8);
+          return TrySetVectorLength(16);
         case Primitive::kPrimChar:
         case Primitive::kPrimShort:
           *restrictions |= kNoDiv | kNoAbs;
-          return TrySetVectorLength(4);
+          return TrySetVectorLength(8);
         case Primitive::kPrimInt:
           *restrictions |= kNoDiv;
-          return TrySetVectorLength(2);
+          return TrySetVectorLength(4);
         case Primitive::kPrimFloat:
-          return TrySetVectorLength(2);
+          return TrySetVectorLength(4);
         default:
           return false;
       }
```
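The subtlest part of the move-resolver changes is the SIMD-slot-to-SIMD-slot case in `MoveLocation()`: when no FP scratch register is free, the 128-bit value is copied as two 64-bit words through an integer scratch register. A rough standalone model of that fallback (plain C++ with hypothetical names, rather than VIXL macro-assembler calls):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

constexpr std::size_t kWordSize = 8;  // kArm64WordSize in ART; a SIMD slot is two words

// Models the integer-scratch fallback: copy a 16-byte SIMD stack slot
// as two 64-bit load/store pairs through a single scratch "register".
void CopySimdSlot(std::uint8_t* stack, std::size_t src, std::size_t dst) {
  std::uint64_t temp;  // stands in for the scratch X register
  std::memcpy(&temp, stack + src, kWordSize);              // Ldr temp, [sp, #src]
  std::memcpy(stack + dst, &temp, kWordSize);              // Str temp, [sp, #dst]
  std::memcpy(&temp, stack + src + kWordSize, kWordSize);  // Ldr temp, [sp, #src + 8]
  std::memcpy(stack + dst + kWordSize, &temp, kWordSize);  // Str temp, [sp, #dst + 8]
}
```

This is the path the new ARM64ParallelMoveResolverSIMD test is designed to exercise with its SIMDStackSlot-to-SIMDStackSlot moves.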