Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_x86.cc    | 67
-rw-r--r--  compiler/optimizing/code_generator_x86.h     |  6
-rw-r--r--  compiler/optimizing/code_generator_x86_64.cc | 92
-rw-r--r--  compiler/optimizing/code_generator_x86_64.h  |  5
-rw-r--r--  compiler/optimizing/nodes_vector.h           | 10
-rw-r--r--  compiler/optimizing/optimizing_compiler.cc   | 18
-rw-r--r--  compiler/optimizing/scheduler.h              |  5
-rw-r--r--  compiler/optimizing/scheduler_arm64.h        | 14
8 files changed, 150 insertions, 67 deletions
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 2e8170ecc4..42ee9db167 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -5732,24 +5732,18 @@ X86Assembler* ParallelMoveResolverX86::GetAssembler() const {
   return codegen_->GetAssembler();
 }
 
-void ParallelMoveResolverX86::MoveMemoryToMemory32(int dst, int src) {
+void ParallelMoveResolverX86::MoveMemoryToMemory(int dst, int src, int number_of_words) {
   ScratchRegisterScope ensure_scratch(
       this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
   Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
   int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(temp_reg, Address(ESP, src + stack_offset));
-  __ movl(Address(ESP, dst + stack_offset), temp_reg);
-}
 
-void ParallelMoveResolverX86::MoveMemoryToMemory64(int dst, int src) {
-  ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(temp_reg, Address(ESP, src + stack_offset));
-  __ movl(Address(ESP, dst + stack_offset), temp_reg);
-  __ movl(temp_reg, Address(ESP, src + stack_offset + kX86WordSize));
-  __ movl(Address(ESP, dst + stack_offset + kX86WordSize), temp_reg);
+  // Now that temp register is available (possibly spilled), move blocks of memory.
+  for (int i = 0; i < number_of_words; i++) {
+    __ movl(temp_reg, Address(ESP, src + stack_offset));
+    __ movl(Address(ESP, dst + stack_offset), temp_reg);
+    stack_offset += kX86WordSize;
+  }
 }
 
 void ParallelMoveResolverX86::EmitMove(size_t index) {
@@ -5800,7 +5794,7 @@ void ParallelMoveResolverX86::EmitMove(size_t index) {
       __ movss(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
     } else {
       DCHECK(destination.IsStackSlot());
-      MoveMemoryToMemory32(destination.GetStackIndex(), source.GetStackIndex());
+      MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 1);
     }
   } else if (source.IsDoubleStackSlot()) {
     if (destination.IsRegisterPair()) {
@@ -5811,11 +5805,15 @@ void ParallelMoveResolverX86::EmitMove(size_t index) {
       __ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
     } else {
       DCHECK(destination.IsDoubleStackSlot()) << destination;
-      MoveMemoryToMemory64(destination.GetStackIndex(), source.GetStackIndex());
+      MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 2);
     }
   } else if (source.IsSIMDStackSlot()) {
-    DCHECK(destination.IsFpuRegister());
-    __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
+    if (destination.IsFpuRegister()) {
+      __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
+    } else {
+      DCHECK(destination.IsSIMDStackSlot());
+      MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 4);
+    }
   } else if (source.IsConstant()) {
     HConstant* constant = source.GetConstant();
     if (constant->IsIntConstant() || constant->IsNullConstant()) {
@@ -5915,7 +5913,16 @@ void ParallelMoveResolverX86::Exchange32(XmmRegister reg, int mem) {
   __ movd(reg, temp_reg);
 }
 
-void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
+void ParallelMoveResolverX86::Exchange128(XmmRegister reg, int mem) {
+  size_t extra_slot = 4 * kX86WordSize;
+  __ subl(ESP, Immediate(extra_slot));
+  __ movups(Address(ESP, 0), XmmRegister(reg));
+  ExchangeMemory(0, mem + extra_slot, 4);
+  __ movups(XmmRegister(reg), Address(ESP, 0));
+  __ addl(ESP, Immediate(extra_slot));
+}
+
+void ParallelMoveResolverX86::ExchangeMemory(int mem1, int mem2, int number_of_words) {
   ScratchRegisterScope ensure_scratch1(
       this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
 
@@ -5925,10 +5932,15 @@ void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
 
   int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0;
   stack_offset += ensure_scratch2.IsSpilled() ? kX86WordSize : 0;
-  __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
-  __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset));
-  __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister()));
-  __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister()));
+
+  // Now that temp registers are available (possibly spilled), exchange blocks of memory.
+  for (int i = 0; i < number_of_words; i++) {
+    __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
+    __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset));
+    __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister()));
+    __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister()));
+    stack_offset += kX86WordSize;
+  }
 }
 
 void ParallelMoveResolverX86::EmitSwap(size_t index) {
@@ -5947,7 +5959,7 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) {
   } else if (source.IsStackSlot() && destination.IsRegister()) {
     Exchange(destination.AsRegister<Register>(), source.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsStackSlot()) {
-    Exchange(destination.GetStackIndex(), source.GetStackIndex());
+    ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 1);
   } else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
     // Use XOR Swap algorithm to avoid a temporary.
     DCHECK_NE(source.reg(), destination.reg());
@@ -5983,8 +5995,13 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) {
     // Move the high double to the low double.
     __ psrldq(reg, Immediate(8));
   } else if (destination.IsDoubleStackSlot() && source.IsDoubleStackSlot()) {
-    Exchange(destination.GetStackIndex(), source.GetStackIndex());
-    Exchange(destination.GetHighStackIndex(kX86WordSize), source.GetHighStackIndex(kX86WordSize));
+    ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 2);
+  } else if (source.IsSIMDStackSlot() && destination.IsSIMDStackSlot()) {
+    ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 4);
+  } else if (source.IsFpuRegister() && destination.IsSIMDStackSlot()) {
+    Exchange128(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
+  } else if (destination.IsFpuRegister() && source.IsSIMDStackSlot()) {
+    Exchange128(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
   } else {
     LOG(FATAL) << "Unimplemented: source: " << source << ", destination: " << destination;
   }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 176e4dfda0..40b7e3c54f 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -139,10 +139,10 @@ class ParallelMoveResolverX86 : public ParallelMoveResolverWithSwap {
 
  private:
   void Exchange(Register reg, int mem);
-  void Exchange(int mem1, int mem2);
   void Exchange32(XmmRegister reg, int mem);
-  void MoveMemoryToMemory32(int dst, int src);
-  void MoveMemoryToMemory64(int dst, int src);
+  void Exchange128(XmmRegister reg, int mem);
+  void ExchangeMemory(int mem1, int mem2, int number_of_words);
+  void MoveMemoryToMemory(int dst, int src, int number_of_words);
 
   CodeGeneratorX86* const codegen_;
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index e25688c9a3..02fbf234c1 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -5220,9 +5220,17 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) {
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     }
   } else if (source.IsSIMDStackSlot()) {
-    DCHECK(destination.IsFpuRegister());
-    __ movups(destination.AsFpuRegister<XmmRegister>(),
-              Address(CpuRegister(RSP), source.GetStackIndex()));
+    if (destination.IsFpuRegister()) {
+      __ movups(destination.AsFpuRegister<XmmRegister>(),
+                Address(CpuRegister(RSP), source.GetStackIndex()));
+    } else {
+      DCHECK(destination.IsSIMDStackSlot());
+      size_t high = kX86_64WordSize;
+      __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+      __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex() + high));
+      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex() + high), CpuRegister(TMP));
+    }
   } else if (source.IsConstant()) {
     HConstant* constant = source.GetConstant();
     if (constant->IsIntConstant() || constant->IsNullConstant()) {
@@ -5290,19 +5298,6 @@ void ParallelMoveResolverX86_64::Exchange32(CpuRegister reg, int mem) {
   __ movl(reg, CpuRegister(TMP));
 }
 
-void ParallelMoveResolverX86_64::Exchange32(int mem1, int mem2) {
-  ScratchRegisterScope ensure_scratch(
-      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
-
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
-  __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
-  __ movl(CpuRegister(ensure_scratch.GetRegister()),
-          Address(CpuRegister(RSP), mem2 + stack_offset));
-  __ movl(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
-  __ movl(Address(CpuRegister(RSP), mem1 + stack_offset),
-          CpuRegister(ensure_scratch.GetRegister()));
-}
-
 void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) {
   __ movq(CpuRegister(TMP), reg1);
   __ movq(reg1, reg2);
@@ -5315,19 +5310,6 @@ void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) {
   __ movq(reg, CpuRegister(TMP));
 }
 
-void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) {
-  ScratchRegisterScope ensure_scratch(
-      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
-
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
-  __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
-  __ movq(CpuRegister(ensure_scratch.GetRegister()),
-          Address(CpuRegister(RSP), mem2 + stack_offset));
-  __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
-  __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
-          CpuRegister(ensure_scratch.GetRegister()));
-}
-
 void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
   __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
   __ movss(Address(CpuRegister(RSP), mem), reg);
@@ -5340,6 +5322,48 @@ void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
   __ movd(reg, CpuRegister(TMP));
 }
 
+void ParallelMoveResolverX86_64::Exchange128(XmmRegister reg, int mem) {
+  size_t extra_slot = 2 * kX86_64WordSize;
+  __ subq(CpuRegister(RSP), Immediate(extra_slot));
+  __ movups(Address(CpuRegister(RSP), 0), XmmRegister(reg));
+  ExchangeMemory64(0, mem + extra_slot, 2);
+  __ movups(XmmRegister(reg), Address(CpuRegister(RSP), 0));
+  __ addq(CpuRegister(RSP), Immediate(extra_slot));
+}
+
+void ParallelMoveResolverX86_64::ExchangeMemory32(int mem1, int mem2) {
+  ScratchRegisterScope ensure_scratch(
+      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+
+  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
+  __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
+  __ movl(CpuRegister(ensure_scratch.GetRegister()),
+          Address(CpuRegister(RSP), mem2 + stack_offset));
+  __ movl(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
+  __ movl(Address(CpuRegister(RSP), mem1 + stack_offset),
+          CpuRegister(ensure_scratch.GetRegister()));
+}
+
+void ParallelMoveResolverX86_64::ExchangeMemory64(int mem1, int mem2, int num_of_qwords) {
+  ScratchRegisterScope ensure_scratch(
+      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+
+  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
+
+  // Now that temp registers are available (possibly spilled), exchange blocks of memory.
+  for (int i = 0; i < num_of_qwords; i++) {
+    __ movq(CpuRegister(TMP),
+            Address(CpuRegister(RSP), mem1 + stack_offset));
+    __ movq(CpuRegister(ensure_scratch.GetRegister()),
+            Address(CpuRegister(RSP), mem2 + stack_offset));
+    __ movq(Address(CpuRegister(RSP), mem2 + stack_offset),
+            CpuRegister(TMP));
+    __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
+            CpuRegister(ensure_scratch.GetRegister()));
+    stack_offset += kX86_64WordSize;
+  }
+}
+
 void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
   MoveOperands* move = moves_[index];
   Location source = move->GetSource();
@@ -5352,13 +5376,13 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
   } else if (source.IsStackSlot() && destination.IsRegister()) {
     Exchange32(destination.AsRegister<CpuRegister>(), source.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsStackSlot()) {
-    Exchange32(destination.GetStackIndex(), source.GetStackIndex());
+    ExchangeMemory32(destination.GetStackIndex(), source.GetStackIndex());
   } else if (source.IsRegister() && destination.IsDoubleStackSlot()) {
     Exchange64(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
   } else if (source.IsDoubleStackSlot() && destination.IsRegister()) {
     Exchange64(destination.AsRegister<CpuRegister>(), source.GetStackIndex());
   } else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) {
-    Exchange64(destination.GetStackIndex(), source.GetStackIndex());
+    ExchangeMemory64(destination.GetStackIndex(), source.GetStackIndex(), 1);
   } else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
     __ movd(CpuRegister(TMP), source.AsFpuRegister<XmmRegister>());
     __ movaps(source.AsFpuRegister<XmmRegister>(), destination.AsFpuRegister<XmmRegister>());
@@ -5371,6 +5395,12 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
     Exchange64(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
   } else if (source.IsDoubleStackSlot() && destination.IsFpuRegister()) {
     Exchange64(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
+  } else if (source.IsSIMDStackSlot() && destination.IsSIMDStackSlot()) {
+    ExchangeMemory64(destination.GetStackIndex(), source.GetStackIndex(), 2);
+  } else if (source.IsFpuRegister() && destination.IsSIMDStackSlot()) {
+    Exchange128(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
+  } else if (destination.IsFpuRegister() && source.IsSIMDStackSlot()) {
+    Exchange128(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
   } else {
     LOG(FATAL) << "Unimplemented swap between " << source << " and " << destination;
   }
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 00c5c27470..e86123ef01 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -139,11 +139,12 @@ class ParallelMoveResolverX86_64 : public ParallelMoveResolverWithSwap {
  private:
   void Exchange32(CpuRegister reg, int mem);
   void Exchange32(XmmRegister reg, int mem);
-  void Exchange32(int mem1, int mem2);
   void Exchange64(CpuRegister reg1, CpuRegister reg2);
   void Exchange64(CpuRegister reg, int mem);
   void Exchange64(XmmRegister reg, int mem);
-  void Exchange64(int mem1, int mem2);
+  void Exchange128(XmmRegister reg, int mem);
+  void ExchangeMemory32(int mem1, int mem2);
+  void ExchangeMemory64(int mem1, int mem2, int num_of_qwords);
 
   CodeGeneratorX86_64* const codegen_;
 
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 096349fd73..87dff8403b 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -109,6 +109,16 @@ class HVecOperation : public HVariableInputSizeInstruction {
 
   // Assumes vector nodes cannot be moved by default. Each concrete implementation
   // that can be moved should override this method and return true.
+  //
+  // Note: similar approach is used for instruction scheduling (if it is turned on for the target):
+  // by default HScheduler::IsSchedulable returns false for a particular HVecOperation.
+  // HScheduler${ARCH}::IsSchedulable can be overridden to return true for an instruction (see
+  // scheduler_arm64.h for example) if it is safe to schedule it; in this case one *must* also
+  // look at/update HScheduler${ARCH}::IsSchedulingBarrier for this instruction.
+  //
+  // Note: For newly introduced vector instructions HScheduler${ARCH}::IsSchedulingBarrier must be
+  // altered to return true if the instruction might reside outside the SIMD loop body since SIMD
+  // registers are not kept alive across vector loop boundaries (yet).
   bool CanBeMoved() const OVERRIDE { return false; }
 
   // Tests if all data of a vector node (vector length and packed type) is equal.
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 73c72fc57a..24b1a123ee 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -1224,7 +1224,7 @@ bool OptimizingCompiler::JitCompile(Thread* self,
   }
 
   const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions();
-  if (compiler_options.GetGenerateDebugInfo()) {
+  if (compiler_options.GenerateAnyDebugInfo()) {
     const auto* method_header = reinterpret_cast<const OatQuickMethodHeader*>(code);
     const uintptr_t code_address = reinterpret_cast<uintptr_t>(method_header->GetCode());
     debug::MethodDebugInfo info = {};
@@ -1244,10 +1244,13 @@ bool OptimizingCompiler::JitCompile(Thread* self,
     info.frame_size_in_bytes = method_header->GetFrameSizeInBytes();
     info.code_info = nullptr;
     info.cfi = jni_compiled_method.GetCfi();
-    std::vector<uint8_t> elf_file = debug::WriteDebugElfFileForMethods(
+    // If both flags are passed, generate full debug info.
+    const bool mini_debug_info = !compiler_options.GetGenerateDebugInfo();
+    std::vector<uint8_t> elf_file = debug::MakeElfFileForJIT(
         GetCompilerDriver()->GetInstructionSet(),
         GetCompilerDriver()->GetInstructionSetFeatures(),
-        ArrayRef<const debug::MethodDebugInfo>(&info, 1));
+        mini_debug_info,
+        info);
     CreateJITCodeEntryForAddress(code_address, std::move(elf_file));
   }
 
@@ -1352,7 +1355,7 @@ bool OptimizingCompiler::JitCompile(Thread* self,
   }
 
   const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions();
-  if (compiler_options.GetGenerateDebugInfo()) {
+  if (compiler_options.GenerateAnyDebugInfo()) {
     const auto* method_header = reinterpret_cast<const OatQuickMethodHeader*>(code);
     const uintptr_t code_address = reinterpret_cast<uintptr_t>(method_header->GetCode());
     debug::MethodDebugInfo info = {};
@@ -1372,10 +1375,13 @@ bool OptimizingCompiler::JitCompile(Thread* self,
     info.frame_size_in_bytes = method_header->GetFrameSizeInBytes();
     info.code_info = stack_map_size == 0 ? nullptr : stack_map_data;
     info.cfi = ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data());
-    std::vector<uint8_t> elf_file = debug::WriteDebugElfFileForMethods(
+    // If both flags are passed, generate full debug info.
+    const bool mini_debug_info = !compiler_options.GetGenerateDebugInfo();
+    std::vector<uint8_t> elf_file = debug::MakeElfFileForJIT(
         GetCompilerDriver()->GetInstructionSet(),
         GetCompilerDriver()->GetInstructionSetFeatures(),
-        ArrayRef<const debug::MethodDebugInfo>(&info, 1));
+        mini_debug_info,
+        info);
     CreateJITCodeEntryForAddress(code_address, std::move(elf_file));
   }
 
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index bb7c353bc2..dfa077f7de 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -462,6 +462,11 @@ class HScheduler {
   // containing basic block from being scheduled.
   // This method is used to restrict scheduling to instructions that we know are
   // safe to handle.
+  //
+  // For newly introduced instructions by default HScheduler::IsSchedulable returns false.
+  // HScheduler${ARCH}::IsSchedulable can be overridden to return true for an instruction (see
+  // scheduler_arm64.h for example) if it is safe to schedule it; in this case one *must* also
+  // look at/update HScheduler${ARCH}::IsSchedulingBarrier for this instruction.
   virtual bool IsSchedulable(const HInstruction* instruction) const;
   bool IsSchedulable(const HBasicBlock* block) const;
 
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 32f161f26a..f71cb5b784 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -151,6 +151,20 @@ class HSchedulerARM64 : public HScheduler {
 #undef CASE_INSTRUCTION_KIND
   }
 
+  // Treat as scheduling barriers those vector instructions whose live ranges exceed the vectorized
+  // loop boundaries. This is a workaround for the lack of notion of SIMD register in the compiler;
+  // around a call we have to save/restore all live SIMD&FP registers (only lower 64 bits of
+  // SIMD&FP registers are callee saved) so don't reorder such vector instructions.
+  //
+  // TODO: remove this when a proper support of SIMD registers is introduced to the compiler.
+  bool IsSchedulingBarrier(const HInstruction* instr) const OVERRIDE {
+    return HScheduler::IsSchedulingBarrier(instr) ||
+           instr->IsVecReduce() ||
+           instr->IsVecExtractScalar() ||
+           instr->IsVecSetScalars() ||
+           instr->IsVecReplicateScalar();
+  }
+
  private:
   SchedulingLatencyVisitorARM64 arm64_latency_visitor_;
   DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64);
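Both resolvers above now route stack-to-stack moves and swaps through a single word-by-word loop (MoveMemoryToMemory / ExchangeMemory on x86, ExchangeMemory64 on x86-64). As a rough, standalone sketch of that exchange pattern outside the assembler (illustrative names and plain memory in place of ESP/RSP-relative addressing and scratch-register spilling, not code from the patch):

    #include <cstddef>
    #include <cstdint>

    // Swap two equally sized blocks of stack memory one machine word at a
    // time, using two temporaries -- the same shape as the generated
    // ExchangeMemory loop, minus the scratch-register bookkeeping.
    void ExchangeWords(uint32_t* mem1, uint32_t* mem2, size_t number_of_words) {
      for (size_t i = 0; i < number_of_words; ++i) {
        uint32_t tmp1 = mem1[i];  // first scratch register
        uint32_t tmp2 = mem2[i];  // second scratch register
        mem2[i] = tmp1;
        mem1[i] = tmp2;
      }
    }

A 16-byte SIMD stack slot is then just the number_of_words = 4 (x86) or num_of_qwords = 2 (x86-64) case of the same loop.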