diff options
author | 2017-03-23 16:17:37 -0700 | |
---|---|---|
committer | 2017-03-23 16:51:52 -0700 | |
commit | 5576f3741c58cb8b5fb2f68f3b3a9415efe05f4f (patch) | |
tree | 2187c109d24ae3634416b551e83fef310e975a74 /compiler/optimizing | |
parent | 6efac9929f8952e4871e8c423c923989fc6f2ad2 (diff) |
Implement a SIMD spilling slot.
Rationale:
The last ART vectorizer break-out CL \O/
This ensures spilling on x86 and x86_64 is correct.
Also, it paves the way to wider SIMD on ARM and MIPS.
Test: test-art-host
Bug: 34083438
Change-Id: I5b27d18c2045f3ab70b64c335423b3ff2a507ac2
Diffstat (limited to 'compiler/optimizing')
-rw-r--r-- | compiler/optimizing/code_generator_x86.cc | 18 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_x86_64.cc | 20 | ||||
-rw-r--r-- | compiler/optimizing/graph_visualizer.cc | 6 | ||||
-rw-r--r-- | compiler/optimizing/locations.h | 20 | ||||
-rw-r--r-- | compiler/optimizing/register_allocation_resolver.cc | 5 | ||||
-rw-r--r-- | compiler/optimizing/ssa_liveness_analysis.cc | 3 |
6 files changed, 53 insertions, 19 deletions
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc index 958c1a6fdb..4db4796985 100644 --- a/compiler/optimizing/code_generator_x86.cc +++ b/compiler/optimizing/code_generator_x86.cc @@ -967,7 +967,7 @@ size_t CodeGeneratorX86::RestoreCoreRegister(size_t stack_index, uint32_t reg_id size_t CodeGeneratorX86::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) { if (GetGraph()->HasSIMD()) { - __ movupd(Address(ESP, stack_index), XmmRegister(reg_id)); + __ movups(Address(ESP, stack_index), XmmRegister(reg_id)); } else { __ movsd(Address(ESP, stack_index), XmmRegister(reg_id)); } @@ -976,7 +976,7 @@ size_t CodeGeneratorX86::SaveFloatingPointRegister(size_t stack_index, uint32_t size_t CodeGeneratorX86::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) { if (GetGraph()->HasSIMD()) { - __ movupd(XmmRegister(reg_id), Address(ESP, stack_index)); + __ movups(XmmRegister(reg_id), Address(ESP, stack_index)); } else { __ movsd(XmmRegister(reg_id), Address(ESP, stack_index)); } @@ -5713,9 +5713,8 @@ void LocationsBuilderX86::VisitSuspendCheck(HSuspendCheck* instruction) { // In suspend check slow path, usually there are no caller-save registers at all. // If SIMD instructions are present, however, we force spilling all live SIMD // registers in full width (since the runtime only saves/restores lower part). - locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD() - ? RegisterSet::AllFpu() - : RegisterSet::Empty()); + locations->SetCustomSlowPathCallerSaves( + GetGraph()->HasSIMD() ? 
RegisterSet::AllFpu() : RegisterSet::Empty()); } void InstructionCodeGeneratorX86::VisitSuspendCheck(HSuspendCheck* instruction) { @@ -5818,9 +5817,11 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { __ movd(destination.AsRegisterPairHigh<Register>(), src_reg); } else if (destination.IsStackSlot()) { __ movss(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); - } else { - DCHECK(destination.IsDoubleStackSlot()); + } else if (destination.IsDoubleStackSlot()) { __ movsd(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); + } else { + DCHECK(destination.IsSIMDStackSlot()); + __ movups(Address(ESP, destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); } } else if (source.IsStackSlot()) { if (destination.IsRegister()) { @@ -5842,6 +5843,9 @@ void ParallelMoveResolverX86::EmitMove(size_t index) { DCHECK(destination.IsDoubleStackSlot()) << destination; MoveMemoryToMemory64(destination.GetStackIndex(), source.GetStackIndex()); } + } else if (source.IsSIMDStackSlot()) { + DCHECK(destination.IsFpuRegister()); + __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex())); } else if (source.IsConstant()) { HConstant* constant = source.GetConstant(); if (constant->IsIntConstant() || constant->IsNullConstant()) { diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc index c106d9b06e..2ffc398287 100644 --- a/compiler/optimizing/code_generator_x86_64.cc +++ b/compiler/optimizing/code_generator_x86_64.cc @@ -1162,7 +1162,7 @@ size_t CodeGeneratorX86_64::RestoreCoreRegister(size_t stack_index, uint32_t reg size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) { if (GetGraph()->HasSIMD()) { - __ movupd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id)); + __ movups(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id)); } else { __ movsd(Address(CpuRegister(RSP), 
stack_index), XmmRegister(reg_id)); } @@ -1171,7 +1171,7 @@ size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32 size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) { if (GetGraph()->HasSIMD()) { - __ movupd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index)); + __ movups(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index)); } else { __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index)); } @@ -5166,9 +5166,8 @@ void LocationsBuilderX86_64::VisitSuspendCheck(HSuspendCheck* instruction) { // In suspend check slow path, usually there are no caller-save registers at all. // If SIMD instructions are present, however, we force spilling all live SIMD // registers in full width (since the runtime only saves/restores lower part). - locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD() - ? RegisterSet::AllFpu() - : RegisterSet::Empty()); + locations->SetCustomSlowPathCallerSaves( + GetGraph()->HasSIMD() ? 
RegisterSet::AllFpu() : RegisterSet::Empty()); } void InstructionCodeGeneratorX86_64::VisitSuspendCheck(HSuspendCheck* instruction) { @@ -5257,6 +5256,10 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) { __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex())); __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP)); } + } else if (source.IsSIMDStackSlot()) { + DCHECK(destination.IsFpuRegister()); + __ movups(destination.AsFpuRegister<XmmRegister>(), + Address(CpuRegister(RSP), source.GetStackIndex())); } else if (source.IsConstant()) { HConstant* constant = source.GetConstant(); if (constant->IsIntConstant() || constant->IsNullConstant()) { @@ -5307,10 +5310,13 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) { } else if (destination.IsStackSlot()) { __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); - } else { - DCHECK(destination.IsDoubleStackSlot()) << destination; + } else if (destination.IsDoubleStackSlot()) { __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsFpuRegister<XmmRegister>()); + } else { + DCHECK(destination.IsSIMDStackSlot()); + __ movups(Address(CpuRegister(RSP), destination.GetStackIndex()), + source.AsFpuRegister<XmmRegister>()); } } } diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 2bf5c53e17..0dfae11465 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -322,9 +322,11 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { codegen_.DumpCoreRegister(stream, location.high()); } else if (location.IsUnallocated()) { stream << "unallocated"; - } else { - DCHECK(location.IsDoubleStackSlot()); + } else if (location.IsDoubleStackSlot()) { stream << "2x" << location.GetStackIndex() << "(sp)"; + } else { + DCHECK(location.IsSIMDStackSlot()); + stream << "4x" << location.GetStackIndex() << "(sp)"; } } 
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h index d391f6913c..6f0dbce2df 100644 --- a/compiler/optimizing/locations.h +++ b/compiler/optimizing/locations.h @@ -69,11 +69,13 @@ class Location : public ValueObject { // We do not use the value 9 because it conflicts with kLocationConstantMask. kDoNotUse9 = 9, + kSIMDStackSlot = 10, // 128bit stack slot. TODO: generalize with encoded #bytes? + // Unallocated location represents a location that is not fixed and can be // allocated by a register allocator. Each unallocated location has // a policy that specifies what kind of location is suitable. Payload // contains register allocation policy. - kUnallocated = 10, + kUnallocated = 11, }; Location() : ValueObject(), value_(kInvalid) { @@ -82,6 +84,7 @@ class Location : public ValueObject { static_assert((kUnallocated & kLocationConstantMask) != kConstant, "TagError"); static_assert((kStackSlot & kLocationConstantMask) != kConstant, "TagError"); static_assert((kDoubleStackSlot & kLocationConstantMask) != kConstant, "TagError"); + static_assert((kSIMDStackSlot & kLocationConstantMask) != kConstant, "TagError"); static_assert((kRegister & kLocationConstantMask) != kConstant, "TagError"); static_assert((kFpuRegister & kLocationConstantMask) != kConstant, "TagError"); static_assert((kRegisterPair & kLocationConstantMask) != kConstant, "TagError"); @@ -266,8 +269,20 @@ class Location : public ValueObject { return GetKind() == kDoubleStackSlot; } + static Location SIMDStackSlot(intptr_t stack_index) { + uintptr_t payload = EncodeStackIndex(stack_index); + Location loc(kSIMDStackSlot, payload); + // Ensure that sign is preserved. 
+ DCHECK_EQ(loc.GetStackIndex(), stack_index); + return loc; + } + + bool IsSIMDStackSlot() const { + return GetKind() == kSIMDStackSlot; + } + intptr_t GetStackIndex() const { - DCHECK(IsStackSlot() || IsDoubleStackSlot()); + DCHECK(IsStackSlot() || IsDoubleStackSlot() || IsSIMDStackSlot()); // Decode stack index manually to preserve sign. return GetPayload() - kStackIndexBias; } @@ -315,6 +330,7 @@ class Location : public ValueObject { case kRegister: return "R"; case kStackSlot: return "S"; case kDoubleStackSlot: return "DS"; + case kSIMDStackSlot: return "SIMD"; case kUnallocated: return "U"; case kConstant: return "C"; case kFpuRegister: return "F"; diff --git a/compiler/optimizing/register_allocation_resolver.cc b/compiler/optimizing/register_allocation_resolver.cc index 0d33b49fdb..c6a0b6a0d2 100644 --- a/compiler/optimizing/register_allocation_resolver.cc +++ b/compiler/optimizing/register_allocation_resolver.cc @@ -303,6 +303,7 @@ void RegisterAllocationResolver::ConnectSiblings(LiveInterval* interval) { switch (interval->NumberOfSpillSlotsNeeded()) { case 1: loc = Location::StackSlot(interval->GetParent()->GetSpillSlot()); break; case 2: loc = Location::DoubleStackSlot(interval->GetParent()->GetSpillSlot()); break; + case 4: loc = Location::SIMDStackSlot(interval->GetParent()->GetSpillSlot()); break; default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE(); } InsertMoveAfter(interval->GetDefinedBy(), interval->ToLocation(), loc); @@ -464,6 +465,7 @@ void RegisterAllocationResolver::ConnectSplitSiblings(LiveInterval* interval, switch (parent->NumberOfSpillSlotsNeeded()) { case 1: location_source = Location::StackSlot(parent->GetSpillSlot()); break; case 2: location_source = Location::DoubleStackSlot(parent->GetSpillSlot()); break; + case 4: location_source = Location::SIMDStackSlot(parent->GetSpillSlot()); break; default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE(); } } @@ -496,7 +498,8 @@ static bool 
IsValidDestination(Location destination) { || destination.IsFpuRegister() || destination.IsFpuRegisterPair() || destination.IsStackSlot() - || destination.IsDoubleStackSlot(); + || destination.IsDoubleStackSlot() + || destination.IsSIMDStackSlot(); } void RegisterAllocationResolver::AddMove(HParallelMove* move, diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc index c0a045c33e..36ee5a903a 100644 --- a/compiler/optimizing/ssa_liveness_analysis.cc +++ b/compiler/optimizing/ssa_liveness_analysis.cc @@ -470,6 +470,8 @@ bool LiveInterval::SameRegisterKind(Location other) const { } size_t LiveInterval::NumberOfSpillSlotsNeeded() const { + // TODO: detect vector operation. + // Return number of needed spill slots based on type. return (type_ == Primitive::kPrimLong || type_ == Primitive::kPrimDouble) ? 2 : 1; } @@ -497,6 +499,7 @@ Location LiveInterval::ToLocation() const { switch (NumberOfSpillSlotsNeeded()) { case 1: return Location::StackSlot(GetParent()->GetSpillSlot()); case 2: return Location::DoubleStackSlot(GetParent()->GetSpillSlot()); + case 4: return Location::SIMDStackSlot(GetParent()->GetSpillSlot()); default: LOG(FATAL) << "Unexpected number of spill slots"; UNREACHABLE(); } } else { |