Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_x86.cc      67
-rw-r--r--  compiler/optimizing/code_generator_x86.h         6
-rw-r--r--  compiler/optimizing/code_generator_x86_64.cc    92
-rw-r--r--  compiler/optimizing/code_generator_x86_64.h      5
-rw-r--r--  compiler/optimizing/nodes_vector.h              10
-rw-r--r--  compiler/optimizing/optimizing_compiler.cc      18
-rw-r--r--  compiler/optimizing/scheduler.h                  5
-rw-r--r--  compiler/optimizing/scheduler_arm64.h           14
8 files changed, 150 insertions(+), 67 deletions(-)
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 2e8170ecc4..42ee9db167 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -5732,24 +5732,18 @@ X86Assembler* ParallelMoveResolverX86::GetAssembler() const {
return codegen_->GetAssembler();
}
-void ParallelMoveResolverX86::MoveMemoryToMemory32(int dst, int src) {
+void ParallelMoveResolverX86::MoveMemoryToMemory(int dst, int src, int number_of_words) {
ScratchRegisterScope ensure_scratch(
this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
- __ movl(temp_reg, Address(ESP, src + stack_offset));
- __ movl(Address(ESP, dst + stack_offset), temp_reg);
-}
-void ParallelMoveResolverX86::MoveMemoryToMemory64(int dst, int src) {
- ScratchRegisterScope ensure_scratch(
- this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
- Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
- int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
- __ movl(temp_reg, Address(ESP, src + stack_offset));
- __ movl(Address(ESP, dst + stack_offset), temp_reg);
- __ movl(temp_reg, Address(ESP, src + stack_offset + kX86WordSize));
- __ movl(Address(ESP, dst + stack_offset + kX86WordSize), temp_reg);
+ // Now that the temp register is available (possibly spilled), move blocks of memory.
+ for (int i = 0; i < number_of_words; i++) {
+ __ movl(temp_reg, Address(ESP, src + stack_offset));
+ __ movl(Address(ESP, dst + stack_offset), temp_reg);
+ stack_offset += kX86WordSize;
+ }
}
void ParallelMoveResolverX86::EmitMove(size_t index) {
@@ -5800,7 +5794,7 @@ void ParallelMoveResolverX86::EmitMove(size_t index) {
__ movss(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
} else {
DCHECK(destination.IsStackSlot());
- MoveMemoryToMemory32(destination.GetStackIndex(), source.GetStackIndex());
+ MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 1);
}
} else if (source.IsDoubleStackSlot()) {
if (destination.IsRegisterPair()) {
@@ -5811,11 +5805,15 @@ void ParallelMoveResolverX86::EmitMove(size_t index) {
__ movsd(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
} else {
DCHECK(destination.IsDoubleStackSlot()) << destination;
- MoveMemoryToMemory64(destination.GetStackIndex(), source.GetStackIndex());
+ MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 2);
}
} else if (source.IsSIMDStackSlot()) {
- DCHECK(destination.IsFpuRegister());
- __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
+ if (destination.IsFpuRegister()) {
+ __ movups(destination.AsFpuRegister<XmmRegister>(), Address(ESP, source.GetStackIndex()));
+ } else {
+ DCHECK(destination.IsSIMDStackSlot());
+ MoveMemoryToMemory(destination.GetStackIndex(), source.GetStackIndex(), 4);
+ }
} else if (source.IsConstant()) {
HConstant* constant = source.GetConstant();
if (constant->IsIntConstant() || constant->IsNullConstant()) {
@@ -5915,7 +5913,16 @@ void ParallelMoveResolverX86::Exchange32(XmmRegister reg, int mem) {
__ movd(reg, temp_reg);
}
-void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
+void ParallelMoveResolverX86::Exchange128(XmmRegister reg, int mem) {
+ size_t extra_slot = 4 * kX86WordSize;
+ __ subl(ESP, Immediate(extra_slot));
+ __ movups(Address(ESP, 0), XmmRegister(reg));
+ ExchangeMemory(0, mem + extra_slot, 4);
+ __ movups(XmmRegister(reg), Address(ESP, 0));
+ __ addl(ESP, Immediate(extra_slot));
+}
+
+void ParallelMoveResolverX86::ExchangeMemory(int mem1, int mem2, int number_of_words) {
ScratchRegisterScope ensure_scratch1(
this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
@@ -5925,10 +5932,15 @@ void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0;
stack_offset += ensure_scratch2.IsSpilled() ? kX86WordSize : 0;
- __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
- __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset));
- __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister()));
- __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister()));
+
+ // Now that temp registers are available (possibly spilled), exchange blocks of memory.
+ for (int i = 0; i < number_of_words; i++) {
+ __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
+ __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset));
+ __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister()));
+ __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister()));
+ stack_offset += kX86WordSize;
+ }
}
void ParallelMoveResolverX86::EmitSwap(size_t index) {
@@ -5947,7 +5959,7 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) {
} else if (source.IsStackSlot() && destination.IsRegister()) {
Exchange(destination.AsRegister<Register>(), source.GetStackIndex());
} else if (source.IsStackSlot() && destination.IsStackSlot()) {
- Exchange(destination.GetStackIndex(), source.GetStackIndex());
+ ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 1);
} else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
// Use XOR Swap algorithm to avoid a temporary.
DCHECK_NE(source.reg(), destination.reg());
@@ -5983,8 +5995,13 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) {
// Move the high double to the low double.
__ psrldq(reg, Immediate(8));
} else if (destination.IsDoubleStackSlot() && source.IsDoubleStackSlot()) {
- Exchange(destination.GetStackIndex(), source.GetStackIndex());
- Exchange(destination.GetHighStackIndex(kX86WordSize), source.GetHighStackIndex(kX86WordSize));
+ ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 2);
+ } else if (source.IsSIMDStackSlot() && destination.IsSIMDStackSlot()) {
+ ExchangeMemory(destination.GetStackIndex(), source.GetStackIndex(), 4);
+ } else if (source.IsFpuRegister() && destination.IsSIMDStackSlot()) {
+ Exchange128(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
+ } else if (destination.IsFpuRegister() && source.IsSIMDStackSlot()) {
+ Exchange128(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
} else {
LOG(FATAL) << "Unimplemented: source: " << source << ", destination: " << destination;
}
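
Illustrative sketch (not part of the patch): the word-wise block move and block exchange that MoveMemoryToMemory and ExchangeMemory now emit, modelled on plain uint32_t buffers instead of emitted x86 code. All names here are illustrative only; a 16-byte SIMD stack slot corresponds to number_of_words == 4 on x86.

    // move_exchange_sketch.cc -- assumes a hosted C++ toolchain, not ART headers.
    #include <cstdint>
    #include <cstdio>

    // Copy number_of_words 32-bit words from src to dst one temp at a time,
    // mirroring the movl load/store pair emitted per loop iteration.
    static void MoveWords(uint32_t* dst, const uint32_t* src, int number_of_words) {
      for (int i = 0; i < number_of_words; ++i) {
        uint32_t temp = src[i];  // movl temp_reg, [ESP + src + stack_offset]
        dst[i] = temp;           // movl [ESP + dst + stack_offset], temp_reg
      }
    }

    // Swap two blocks of number_of_words words using two temporaries, mirroring
    // the two scratch registers ExchangeMemory reserves before its loop.
    static void ExchangeWords(uint32_t* mem1, uint32_t* mem2, int number_of_words) {
      for (int i = 0; i < number_of_words; ++i) {
        uint32_t t1 = mem1[i];
        uint32_t t2 = mem2[i];
        mem2[i] = t1;
        mem1[i] = t2;
      }
    }

    int main() {
      uint32_t a[4] = {1, 2, 3, 4};  // one SIMD stack slot (4 words)
      uint32_t b[4] = {5, 6, 7, 8};
      ExchangeWords(a, b, 4);
      MoveWords(a, b, 2);            // copy the first 2 words back
      std::printf("%u %u %u %u\n", a[0], a[1], a[2], a[3]);  // 1 2 7 8
      return 0;
    }
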
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 176e4dfda0..40b7e3c54f 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -139,10 +139,10 @@ class ParallelMoveResolverX86 : public ParallelMoveResolverWithSwap {
private:
void Exchange(Register reg, int mem);
- void Exchange(int mem1, int mem2);
void Exchange32(XmmRegister reg, int mem);
- void MoveMemoryToMemory32(int dst, int src);
- void MoveMemoryToMemory64(int dst, int src);
+ void Exchange128(XmmRegister reg, int mem);
+ void ExchangeMemory(int mem1, int mem2, int number_of_words);
+ void MoveMemoryToMemory(int dst, int src, int number_of_words);
CodeGeneratorX86* const codegen_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index e25688c9a3..02fbf234c1 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -5220,9 +5220,17 @@ void ParallelMoveResolverX86_64::EmitMove(size_t index) {
__ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
}
} else if (source.IsSIMDStackSlot()) {
- DCHECK(destination.IsFpuRegister());
- __ movups(destination.AsFpuRegister<XmmRegister>(),
- Address(CpuRegister(RSP), source.GetStackIndex()));
+ if (destination.IsFpuRegister()) {
+ __ movups(destination.AsFpuRegister<XmmRegister>(),
+ Address(CpuRegister(RSP), source.GetStackIndex()));
+ } else {
+ DCHECK(destination.IsSIMDStackSlot());
+ size_t high = kX86_64WordSize;
+ __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
+ __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
+ __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex() + high));
+ __ movq(Address(CpuRegister(RSP), destination.GetStackIndex() + high), CpuRegister(TMP));
+ }
} else if (source.IsConstant()) {
HConstant* constant = source.GetConstant();
if (constant->IsIntConstant() || constant->IsNullConstant()) {
@@ -5290,19 +5298,6 @@ void ParallelMoveResolverX86_64::Exchange32(CpuRegister reg, int mem) {
__ movl(reg, CpuRegister(TMP));
}
-void ParallelMoveResolverX86_64::Exchange32(int mem1, int mem2) {
- ScratchRegisterScope ensure_scratch(
- this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
-
- int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
- __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
- __ movl(CpuRegister(ensure_scratch.GetRegister()),
- Address(CpuRegister(RSP), mem2 + stack_offset));
- __ movl(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
- __ movl(Address(CpuRegister(RSP), mem1 + stack_offset),
- CpuRegister(ensure_scratch.GetRegister()));
-}
-
void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) {
__ movq(CpuRegister(TMP), reg1);
__ movq(reg1, reg2);
@@ -5315,19 +5310,6 @@ void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) {
__ movq(reg, CpuRegister(TMP));
}
-void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) {
- ScratchRegisterScope ensure_scratch(
- this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
-
- int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
- __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
- __ movq(CpuRegister(ensure_scratch.GetRegister()),
- Address(CpuRegister(RSP), mem2 + stack_offset));
- __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
- __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
- CpuRegister(ensure_scratch.GetRegister()));
-}
-
void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
__ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
__ movss(Address(CpuRegister(RSP), mem), reg);
@@ -5340,6 +5322,48 @@ void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
__ movd(reg, CpuRegister(TMP));
}
+void ParallelMoveResolverX86_64::Exchange128(XmmRegister reg, int mem) {
+ size_t extra_slot = 2 * kX86_64WordSize;
+ __ subq(CpuRegister(RSP), Immediate(extra_slot));
+ __ movups(Address(CpuRegister(RSP), 0), XmmRegister(reg));
+ ExchangeMemory64(0, mem + extra_slot, 2);
+ __ movups(XmmRegister(reg), Address(CpuRegister(RSP), 0));
+ __ addq(CpuRegister(RSP), Immediate(extra_slot));
+}
+
+void ParallelMoveResolverX86_64::ExchangeMemory32(int mem1, int mem2) {
+ ScratchRegisterScope ensure_scratch(
+ this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+
+ int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
+ __ movl(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
+ __ movl(CpuRegister(ensure_scratch.GetRegister()),
+ Address(CpuRegister(RSP), mem2 + stack_offset));
+ __ movl(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
+ __ movl(Address(CpuRegister(RSP), mem1 + stack_offset),
+ CpuRegister(ensure_scratch.GetRegister()));
+}
+
+void ParallelMoveResolverX86_64::ExchangeMemory64(int mem1, int mem2, int num_of_qwords) {
+ ScratchRegisterScope ensure_scratch(
+ this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+
+ int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
+
+ // Now that temp registers are available (possibly spilled), exchange blocks of memory.
+ for (int i = 0; i < num_of_qwords; i++) {
+ __ movq(CpuRegister(TMP),
+ Address(CpuRegister(RSP), mem1 + stack_offset));
+ __ movq(CpuRegister(ensure_scratch.GetRegister()),
+ Address(CpuRegister(RSP), mem2 + stack_offset));
+ __ movq(Address(CpuRegister(RSP), mem2 + stack_offset),
+ CpuRegister(TMP));
+ __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
+ CpuRegister(ensure_scratch.GetRegister()));
+ stack_offset += kX86_64WordSize;
+ }
+}
+
void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
MoveOperands* move = moves_[index];
Location source = move->GetSource();
@@ -5352,13 +5376,13 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
} else if (source.IsStackSlot() && destination.IsRegister()) {
Exchange32(destination.AsRegister<CpuRegister>(), source.GetStackIndex());
} else if (source.IsStackSlot() && destination.IsStackSlot()) {
- Exchange32(destination.GetStackIndex(), source.GetStackIndex());
+ ExchangeMemory32(destination.GetStackIndex(), source.GetStackIndex());
} else if (source.IsRegister() && destination.IsDoubleStackSlot()) {
Exchange64(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
} else if (source.IsDoubleStackSlot() && destination.IsRegister()) {
Exchange64(destination.AsRegister<CpuRegister>(), source.GetStackIndex());
} else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) {
- Exchange64(destination.GetStackIndex(), source.GetStackIndex());
+ ExchangeMemory64(destination.GetStackIndex(), source.GetStackIndex(), 1);
} else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
__ movd(CpuRegister(TMP), source.AsFpuRegister<XmmRegister>());
__ movaps(source.AsFpuRegister<XmmRegister>(), destination.AsFpuRegister<XmmRegister>());
@@ -5371,6 +5395,12 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
Exchange64(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
} else if (source.IsDoubleStackSlot() && destination.IsFpuRegister()) {
Exchange64(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
+ } else if (source.IsSIMDStackSlot() && destination.IsSIMDStackSlot()) {
+ ExchangeMemory64(destination.GetStackIndex(), source.GetStackIndex(), 2);
+ } else if (source.IsFpuRegister() && destination.IsSIMDStackSlot()) {
+ Exchange128(source.AsFpuRegister<XmmRegister>(), destination.GetStackIndex());
+ } else if (destination.IsFpuRegister() && source.IsSIMDStackSlot()) {
+ Exchange128(destination.AsFpuRegister<XmmRegister>(), source.GetStackIndex());
} else {
LOG(FATAL) << "Unimplemented swap between " << source << " and " << destination;
}
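
Illustrative sketch (not part of the patch): the register-to-memory 128-bit swap strategy behind Exchange128, modelled with a byte vector standing in for the stack and a 16-byte array standing in for an XMM register. Types and offsets are illustrative only; the real code emits subq/movups/addq and reuses ExchangeMemory for step 3.

    // exchange128_sketch.cc -- assumes a hosted C++ toolchain, not ART headers.
    #include <algorithm>
    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    using Vec128 = std::array<uint8_t, 16>;

    // Swap the 16-byte "register" with the 16 bytes at stack offset mem.
    void Exchange128(Vec128& reg, std::vector<uint8_t>& stack, size_t mem) {
      const size_t extra_slot = 16;
      // 1) subq RSP, 16: reserve a fresh slot at the top of the stack; every
      //    existing slot is now extra_slot bytes further from the stack pointer.
      stack.insert(stack.begin(), extra_slot, 0);
      // 2) movups [RSP + 0], reg: spill the register into the new slot.
      std::memcpy(stack.data(), reg.data(), 16);
      // 3) ExchangeMemory64(0, mem + extra_slot, 2): swap the spilled copy with
      //    the original slot at its shifted offset.
      std::swap_ranges(stack.begin(), stack.begin() + 16,
                       stack.begin() + mem + extra_slot);
      // 4) movups reg, [RSP + 0]; addq RSP, 16: reload the register and release
      //    the temporary slot.
      std::memcpy(reg.data(), stack.data(), 16);
      stack.erase(stack.begin(), stack.begin() + extra_slot);
    }

    int main() {
      Vec128 reg;
      reg.fill(0xAA);
      std::vector<uint8_t> stack(64, 0x55);
      Exchange128(reg, stack, 32);                    // swap reg with the slot at offset 32
      std::printf("%02x %02x\n", reg[0], stack[32]);  // 55 aa
      return 0;
    }
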
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 00c5c27470..e86123ef01 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -139,11 +139,12 @@ class ParallelMoveResolverX86_64 : public ParallelMoveResolverWithSwap {
private:
void Exchange32(CpuRegister reg, int mem);
void Exchange32(XmmRegister reg, int mem);
- void Exchange32(int mem1, int mem2);
void Exchange64(CpuRegister reg1, CpuRegister reg2);
void Exchange64(CpuRegister reg, int mem);
void Exchange64(XmmRegister reg, int mem);
- void Exchange64(int mem1, int mem2);
+ void Exchange128(XmmRegister reg, int mem);
+ void ExchangeMemory32(int mem1, int mem2);
+ void ExchangeMemory64(int mem1, int mem2, int num_of_qwords);
CodeGeneratorX86_64* const codegen_;
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 096349fd73..87dff8403b 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -109,6 +109,16 @@ class HVecOperation : public HVariableInputSizeInstruction {
// Assumes vector nodes cannot be moved by default. Each concrete implementation
// that can be moved should override this method and return true.
+ //
+ // Note: a similar approach is used for instruction scheduling (if it is turned on for the target):
+ // by default HScheduler::IsSchedulable returns false for a particular HVecOperation.
+ // HScheduler${ARCH}::IsSchedulable can be overridden to return true for an instruction (see
+ // scheduler_arm64.h for example) if it is safe to schedule it; in this case one *must* also
+ // look at/update HScheduler${ARCH}::IsSchedulingBarrier for this instruction.
+ //
+ // Note: For newly introduced vector instructions HScheduler${ARCH}::IsSchedulingBarrier must be
+ // altered to return true if the instruction might reside outside the SIMD loop body since SIMD
+ // registers are not kept alive across vector loop boundaries (yet).
bool CanBeMoved() const OVERRIDE { return false; }
// Tests if all data of a vector node (vector length and packed type) is equal.
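
Illustrative sketch (not part of the patch): the override pattern the comment above describes, using stub types in place of the real HScheduler/HInstruction classes. The target name and the instruction kinds chosen are hypothetical; the real per-target hook is shown in the scheduler_arm64.h hunk below.

    // scheduler_override_sketch.cc -- stubs only, not the ART API.
    #include <cstdio>

    enum class Kind { kAdd, kVecAdd, kVecReplicateScalar };
    struct HInstructionStub { Kind kind; };

    // Base scheduler: conservative defaults, mirroring HScheduler in scheduler.h.
    class HSchedulerStub {
     public:
      virtual ~HSchedulerStub() {}
      virtual bool IsSchedulable(const HInstructionStub*) const { return false; }
      virtual bool IsSchedulingBarrier(const HInstructionStub*) const { return false; }
    };

    // Hypothetical per-target scheduler: opts selected instructions in, and marks
    // the vector instructions whose values may live across the SIMD loop boundary
    // as scheduling barriers.
    class HSchedulerFooStub : public HSchedulerStub {
     public:
      bool IsSchedulable(const HInstructionStub* i) const override {
        return i->kind == Kind::kAdd || i->kind == Kind::kVecAdd;
      }
      bool IsSchedulingBarrier(const HInstructionStub* i) const override {
        return HSchedulerStub::IsSchedulingBarrier(i) ||
               i->kind == Kind::kVecReplicateScalar;
      }
    };

    int main() {
      HSchedulerFooStub scheduler;
      HInstructionStub replicate{Kind::kVecReplicateScalar};
      std::printf("schedulable=%d barrier=%d\n",
                  scheduler.IsSchedulable(&replicate),
                  scheduler.IsSchedulingBarrier(&replicate));  // 0 1
      return 0;
    }
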
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 73c72fc57a..24b1a123ee 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -1224,7 +1224,7 @@ bool OptimizingCompiler::JitCompile(Thread* self,
}
const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions();
- if (compiler_options.GetGenerateDebugInfo()) {
+ if (compiler_options.GenerateAnyDebugInfo()) {
const auto* method_header = reinterpret_cast<const OatQuickMethodHeader*>(code);
const uintptr_t code_address = reinterpret_cast<uintptr_t>(method_header->GetCode());
debug::MethodDebugInfo info = {};
@@ -1244,10 +1244,13 @@ bool OptimizingCompiler::JitCompile(Thread* self,
info.frame_size_in_bytes = method_header->GetFrameSizeInBytes();
info.code_info = nullptr;
info.cfi = jni_compiled_method.GetCfi();
- std::vector<uint8_t> elf_file = debug::WriteDebugElfFileForMethods(
+ // If both flags are passed, generate full debug info.
+ const bool mini_debug_info = !compiler_options.GetGenerateDebugInfo();
+ std::vector<uint8_t> elf_file = debug::MakeElfFileForJIT(
GetCompilerDriver()->GetInstructionSet(),
GetCompilerDriver()->GetInstructionSetFeatures(),
- ArrayRef<const debug::MethodDebugInfo>(&info, 1));
+ mini_debug_info,
+ info);
CreateJITCodeEntryForAddress(code_address, std::move(elf_file));
}
@@ -1352,7 +1355,7 @@ bool OptimizingCompiler::JitCompile(Thread* self,
}
const CompilerOptions& compiler_options = GetCompilerDriver()->GetCompilerOptions();
- if (compiler_options.GetGenerateDebugInfo()) {
+ if (compiler_options.GenerateAnyDebugInfo()) {
const auto* method_header = reinterpret_cast<const OatQuickMethodHeader*>(code);
const uintptr_t code_address = reinterpret_cast<uintptr_t>(method_header->GetCode());
debug::MethodDebugInfo info = {};
@@ -1372,10 +1375,13 @@ bool OptimizingCompiler::JitCompile(Thread* self,
info.frame_size_in_bytes = method_header->GetFrameSizeInBytes();
info.code_info = stack_map_size == 0 ? nullptr : stack_map_data;
info.cfi = ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data());
- std::vector<uint8_t> elf_file = debug::WriteDebugElfFileForMethods(
+ // If both flags are passed, generate full debug info.
+ const bool mini_debug_info = !compiler_options.GetGenerateDebugInfo();
+ std::vector<uint8_t> elf_file = debug::MakeElfFileForJIT(
GetCompilerDriver()->GetInstructionSet(),
GetCompilerDriver()->GetInstructionSetFeatures(),
- ArrayRef<const debug::MethodDebugInfo>(&info, 1));
+ mini_debug_info,
+ info);
CreateJITCodeEntryForAddress(code_address, std::move(elf_file));
}
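
Illustrative sketch (not part of the patch) of the decision the updated JIT path makes, assuming CompilerOptions reports GenerateAnyDebugInfo() as true when either the full or the mini debug-info flag is set (the stub below encodes that assumption explicitly).

    // debug_info_choice_sketch.cc -- CompilerOptionsStub is a stand-in, not the ART class.
    #include <cstdio>

    struct CompilerOptionsStub {
      bool generate_debug_info;        // assumed to map to --generate-debug-info
      bool generate_mini_debug_info;   // assumed to map to --generate-mini-debug-info
      bool GetGenerateDebugInfo() const { return generate_debug_info; }
      bool GenerateAnyDebugInfo() const {
        return generate_debug_info || generate_mini_debug_info;
      }
    };

    int main() {
      const CompilerOptionsStub cases[] = {
          {false, false}, {false, true}, {true, false}, {true, true}};
      for (const CompilerOptionsStub& opts : cases) {
        if (!opts.GenerateAnyDebugInfo()) {
          std::printf("no JIT debug ELF emitted\n");
          continue;
        }
        // If both flags are passed, full debug info wins (mini_debug_info == false).
        const bool mini_debug_info = !opts.GetGenerateDebugInfo();
        std::printf("emit %s debug info\n", mini_debug_info ? "mini" : "full");
      }
      return 0;
    }
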
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index bb7c353bc2..dfa077f7de 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -462,6 +462,11 @@ class HScheduler {
// containing basic block from being scheduled.
// This method is used to restrict scheduling to instructions that we know are
// safe to handle.
+ //
+ // For newly introduced instructions, HScheduler::IsSchedulable returns false by default.
+ // HScheduler${ARCH}::IsSchedulable can be overridden to return true for an instruction (see
+ // scheduler_arm64.h for example) if it is safe to schedule it; in this case one *must* also
+ // look at/update HScheduler${ARCH}::IsSchedulingBarrier for this instruction.
virtual bool IsSchedulable(const HInstruction* instruction) const;
bool IsSchedulable(const HBasicBlock* block) const;
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 32f161f26a..f71cb5b784 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -151,6 +151,20 @@ class HSchedulerARM64 : public HScheduler {
#undef CASE_INSTRUCTION_KIND
}
+ // Treat as scheduling barriers those vector instructions whose live ranges exceed the vectorized
+ // loop boundaries. This is a workaround for the lack of a notion of SIMD registers in the compiler;
+ // around a call we have to save/restore all live SIMD&FP registers (only the lower 64 bits of
+ // SIMD&FP registers are callee-saved), so do not reorder such vector instructions.
+ //
+ // TODO: remove this when a proper support of SIMD registers is introduced to the compiler.
+ bool IsSchedulingBarrier(const HInstruction* instr) const OVERRIDE {
+ return HScheduler::IsSchedulingBarrier(instr) ||
+ instr->IsVecReduce() ||
+ instr->IsVecExtractScalar() ||
+ instr->IsVecSetScalars() ||
+ instr->IsVecReplicateScalar();
+ }
+
private:
SchedulingLatencyVisitorARM64 arm64_latency_visitor_;
DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64);