author      2017-03-21 20:14:07 -0700
committer   2017-03-22 20:11:05 +0000
commit      b13c65bb46544821a84ff2106d0710d77b0fb463 (patch)
tree        46ef54ce881e32c80901528cf21a7951cf219023 /compiler/optimizing
parent      2b864659d4399de6c17e93b8df8cdbf08c6a7ac9 (diff)
Saves full XMM state along suspend check's slow path.
Rationale:
Break-out CL of the ART Vectorizer. We need to save 128 bits
of data (the default ABI of the ART runtime only saves 64 bits).
Note that this is *only* done for XMM registers that
are live, so the overhead is not too big.
Bug: 34083438
Test: test-art-host
Change-Id: Ic89988b0acb0c104634271d0c6c3e29b6596d59b
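
The rationale boils down to a simple decision: once a method's graph contains SIMD, each floating-point spill slot doubles from 8 to 16 bytes and spills use a full-width XMM move instead of a low-64-bit one. A minimal standalone sketch of that decision follows; the names (Graph, FpSpillSlotSize, FpSpillInstruction, kWordSize) are illustrative stand-ins, not the actual ART classes.

```cpp
#include <cstddef>
#include <cstdio>
#include <initializer_list>

// Illustrative stand-ins only, not ART code.
constexpr std::size_t kWordSize = 8;  // one x86_64 word, in bytes

struct Graph {
  bool has_simd;
  bool HasSIMD() const { return has_simd; }
};

// Mirrors the idea behind GetFloatingPointSpillSlotSize() in the patch:
// a floating-point spill slot grows from 8 to 16 bytes once SIMD is present.
std::size_t FpSpillSlotSize(const Graph& graph) {
  return graph.HasSIMD() ? 2 * kWordSize : 1 * kWordSize;
}

// The save/restore helpers then emit movupd (full 128 bits, unaligned)
// instead of movsd (low 64 bits only) when the wider slot is in use.
const char* FpSpillInstruction(const Graph& graph) {
  return graph.HasSIMD() ? "movupd" : "movsd";
}

int main() {
  for (bool simd : {false, true}) {
    Graph g{simd};
    std::printf("SIMD=%d -> %zu-byte slots via %s\n",
                static_cast<int>(simd), FpSpillSlotSize(g), FpSpillInstruction(g));
  }
  return 0;
}
```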
Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_x86.cc     | 22
-rw-r--r--  compiler/optimizing/code_generator_x86.h      |  5
-rw-r--r--  compiler/optimizing/code_generator_x86_64.cc  | 26
-rw-r--r--  compiler/optimizing/code_generator_x86_64.h   |  4
-rw-r--r--  compiler/optimizing/locations.h               |  2
-rw-r--r--  compiler/optimizing/nodes.cc                  |  3
-rw-r--r--  compiler/optimizing/nodes.h                   |  9
7 files changed, 60 insertions, 11 deletions
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 0b50619a66..958c1a6fdb 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -183,10 +183,13 @@ class SuspendCheckSlowPathX86 : public SlowPathCode {
       : SlowPathCode(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);  // only saves full width XMM for SIMD
     x86_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
+    RestoreLiveRegisters(codegen, locations);  // only saves full width XMM for SIMD
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
     } else {
@@ -963,12 +966,20 @@ size_t CodeGeneratorX86::RestoreCoreRegister(size_t stack_index, uint32_t reg_id
 }
 
 size_t CodeGeneratorX86::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ movsd(Address(ESP, stack_index), XmmRegister(reg_id));
+  if (GetGraph()->HasSIMD()) {
+    __ movupd(Address(ESP, stack_index), XmmRegister(reg_id));
+  } else {
+    __ movsd(Address(ESP, stack_index), XmmRegister(reg_id));
+  }
   return GetFloatingPointSpillSlotSize();
 }
 
 size_t CodeGeneratorX86::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ movsd(XmmRegister(reg_id), Address(ESP, stack_index));
+  if (GetGraph()->HasSIMD()) {
+    __ movupd(XmmRegister(reg_id), Address(ESP, stack_index));
+  } else {
+    __ movsd(XmmRegister(reg_id), Address(ESP, stack_index));
+  }
   return GetFloatingPointSpillSlotSize();
 }
 
@@ -5699,7 +5710,12 @@ void InstructionCodeGeneratorX86::VisitParallelMove(HParallelMove* instruction)
 void LocationsBuilderX86::VisitSuspendCheck(HSuspendCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath);
-  locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  // In suspend check slow path, usually there are no caller-save registers at all.
+  // If SIMD instructions are present, however, we force spilling all live SIMD
+  // registers in full width (since the runtime only saves/restores lower part).
+  locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
+                                              ? RegisterSet::AllFpu()
+                                              : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorX86::VisitSuspendCheck(HSuspendCheck* instruction) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 65ee383b54..ca3a9eadd2 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -348,8 +348,9 @@ class CodeGeneratorX86 : public CodeGenerator {
   }
 
   size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
-    // 8 bytes == 2 words for each spill.
-    return 2 * kX86WordSize;
+    return GetGraph()->HasSIMD()
+        ? 4 * kX86WordSize   // 16 bytes == 4 words for each spill
+        : 2 * kX86WordSize;  // 8 bytes == 2 words for each spill
   }
 
   HGraphVisitor* GetLocationBuilder() OVERRIDE {
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 08f1adfcff..c106d9b06e 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -140,10 +140,13 @@ class SuspendCheckSlowPathX86_64 : public SlowPathCode {
       : SlowPathCode(instruction), successor_(successor) {}
 
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);  // only saves full width XMM for SIMD
     x86_64_codegen->InvokeRuntime(kQuickTestSuspend, instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
+    RestoreLiveRegisters(codegen, locations);  // only saves full width XMM for SIMD
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
     } else {
@@ -1158,13 +1161,21 @@ size_t CodeGeneratorX86_64::RestoreCoreRegister(size_t stack_index, uint32_t reg
 }
 
 size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
-  return kX86_64WordSize;
+  if (GetGraph()->HasSIMD()) {
+    __ movupd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+  } else {
+    __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+  }
+  return GetFloatingPointSpillSlotSize();
 }
 
 size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
-  __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
-  return kX86_64WordSize;
+  if (GetGraph()->HasSIMD()) {
+    __ movupd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+  } else {
+    __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+  }
+  return GetFloatingPointSpillSlotSize();
 }
 
 void CodeGeneratorX86_64::InvokeRuntime(QuickEntrypointEnum entrypoint,
@@ -5152,7 +5163,12 @@ void InstructionCodeGeneratorX86_64::VisitParallelMove(HParallelMove* instructio
 void LocationsBuilderX86_64::VisitSuspendCheck(HSuspendCheck* instruction) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCallOnSlowPath);
-  locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
+  // In suspend check slow path, usually there are no caller-save registers at all.
+  // If SIMD instructions are present, however, we force spilling all live SIMD
+  // registers in full width (since the runtime only saves/restores lower part).
+  locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
+                                              ? RegisterSet::AllFpu()
+                                              : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorX86_64::VisitSuspendCheck(HSuspendCheck* instruction) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 376c3ce381..c8336dabd9 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -326,7 +326,9 @@ class CodeGeneratorX86_64 : public CodeGenerator {
   }
 
   size_t GetFloatingPointSpillSlotSize() const OVERRIDE {
-    return kX86_64WordSize;
+    return GetGraph()->HasSIMD()
+        ? 2 * kX86_64WordSize   // 16 bytes == 2 x86_64 words for each spill
+        : 1 * kX86_64WordSize;  // 8 bytes == 1 x86_64 words for each spill
   }
 
   HGraphVisitor* GetLocationBuilder() OVERRIDE {
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 091b58a63d..d391f6913c 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -417,6 +417,7 @@ std::ostream& operator<<(std::ostream& os, const Location::Policy& rhs);
 class RegisterSet : public ValueObject {
  public:
   static RegisterSet Empty() { return RegisterSet(); }
+  static RegisterSet AllFpu() { return RegisterSet(0, -1); }
 
   void Add(Location loc) {
     if (loc.IsRegister()) {
@@ -462,6 +463,7 @@ class RegisterSet : public ValueObject {
 
  private:
   RegisterSet() : core_registers_(0), floating_point_registers_(0) {}
+  RegisterSet(uint32_t core, uint32_t fp) : core_registers_(core), floating_point_registers_(fp) {}
 
   uint32_t core_registers_;
   uint32_t floating_point_registers_;
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 020e4463d4..ec706e6694 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -2046,6 +2046,9 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) {
   if (HasTryCatch()) {
     outer_graph->SetHasTryCatch(true);
   }
+  if (HasSIMD()) {
+    outer_graph->SetHasSIMD(true);
+  }
 
   HInstruction* return_value = nullptr;
   if (GetBlocks().size() == 3) {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 542b218cf8..6881d8f6ae 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -323,6 +323,7 @@ class HGraph : public ArenaObject<kArenaAllocGraph> {
         temporaries_vreg_slots_(0),
         has_bounds_checks_(false),
         has_try_catch_(false),
+        has_simd_(false),
         has_loops_(false),
         has_irreducible_loops_(false),
         debuggable_(debuggable),
@@ -560,6 +561,9 @@ class HGraph : public ArenaObject<kArenaAllocGraph> {
   bool HasTryCatch() const { return has_try_catch_; }
   void SetHasTryCatch(bool value) { has_try_catch_ = value; }
 
+  bool HasSIMD() const { return has_simd_; }
+  void SetHasSIMD(bool value) { has_simd_ = value; }
+
   bool HasLoops() const { return has_loops_; }
   void SetHasLoops(bool value) { has_loops_ = value; }
 
@@ -652,6 +656,11 @@ class HGraph : public ArenaObject<kArenaAllocGraph> {
   // false positives.
   bool has_try_catch_;
 
+  // Flag whether SIMD instructions appear in the graph. If true, the
+  // code generators may have to be more careful spilling the wider
+  // contents of SIMD registers.
+  bool has_simd_;
+
   // Flag whether there are any loops in the graph. We can skip loop
   // optimization if it's false. It's only best effort to keep it up
   // to date in the presence of code elimination so there might be false
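
The locations.h part of the change introduces RegisterSet::AllFpu(), which hands the private constructor an all-ones floating-point mask, so on this slow path every live XMM register is treated as caller-save and is therefore spilled and restored by the slow-path code itself. A minimal sketch of that bitmask trick is below; the simplified class and its ContainsFloatingPointRegister/ContainsCoreRegister helpers are assumptions modeled loosely on the real RegisterSet, not its actual implementation.

```cpp
#include <cassert>
#include <cstdint>

// Simplified stand-in for art::RegisterSet, just to show the all-ones mask.
class RegisterSet {
 public:
  static RegisterSet Empty() { return RegisterSet(); }
  // -1 converts to 0xFFFFFFFF, i.e. every floating-point register bit is set.
  static RegisterSet AllFpu() { return RegisterSet(0, -1); }

  bool ContainsCoreRegister(uint32_t id) const {
    return (core_registers_ & (1u << id)) != 0;
  }
  bool ContainsFloatingPointRegister(uint32_t id) const {
    return (floating_point_registers_ & (1u << id)) != 0;
  }

 private:
  RegisterSet() : core_registers_(0), floating_point_registers_(0) {}
  RegisterSet(uint32_t core, uint32_t fp)
      : core_registers_(core), floating_point_registers_(fp) {}

  uint32_t core_registers_;
  uint32_t floating_point_registers_;
};

int main() {
  RegisterSet all = RegisterSet::AllFpu();
  for (uint32_t xmm = 0; xmm < 16; ++xmm) {
    // Every XMM register tests as caller-save, none of the core registers do.
    assert(all.ContainsFloatingPointRegister(xmm));
    assert(!all.ContainsCoreRegister(xmm));
  }
  return 0;
}
```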