| author | 2015-11-18 17:47:01 +0000 |
|---|---|
| committer | 2015-11-18 17:47:01 +0000 |
| commit | a30022e1e57dd2c07e738e19f77a80d72eb0d65c (patch) |
| tree | 6e0a4b2ed3560a405af758661a2fce759760bbf7 |
| parent | bc13ef68ca632818e0686cd6ea0a971dddb1a2d3 (diff) |
| parent | 9231730cd0e285373afd73331168b289309ebee4 (diff) |
Merge "Opt compiler: Arm64 packed-switch jump tables."
am: 9231730cd0
* commit '9231730cd0e285373afd73331168b289309ebee4':
Opt compiler: Arm64 packed-switch jump tables.
| -rw-r--r-- | compiler/optimizing/code_generator_arm64.cc | 112 |
|---|---|---|
| -rw-r--r-- | compiler/optimizing/code_generator_arm64.h | 23 |
2 files changed, 119 insertions, 16 deletions
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 70bf7357ee..d1bddf673a 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -68,6 +68,10 @@ using helpers::ARM64EncodableConstantOrRegister;
 using helpers::ArtVixlRegCodeCoherentForRegSet;
 
 static constexpr int kCurrentMethodStackOffset = 0;
+// The compare/jump sequence will generate about (2 * num_entries + 1) instructions. While jump
+// table version generates 7 instructions and num_entries literals. Compare/jump sequence will
+// generates less code/data with a small num_entries.
+static constexpr uint32_t kPackedSwitchJumpTableThreshold = 6;
 
 inline Condition ARM64Condition(IfCondition cond) {
   switch (cond) {
@@ -545,6 +549,28 @@ class ArraySetSlowPathARM64 : public SlowPathCodeARM64 {
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARM64);
 };
 
+void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) {
+  uint32_t num_entries = switch_instr_->GetNumEntries();
+  DCHECK_GE(num_entries, kPackedSwitchJumpTableThreshold);
+
+  // We are about to use the assembler to place literals directly. Make sure we have enough
+  // underlying code buffer and we have generated the jump table with right size.
+  CodeBufferCheckScope scope(codegen->GetVIXLAssembler(), num_entries * sizeof(int32_t),
+                             CodeBufferCheckScope::kCheck, CodeBufferCheckScope::kExactSize);
+
+  __ Bind(&table_start_);
+  const ArenaVector<HBasicBlock*>& successors = switch_instr_->GetBlock()->GetSuccessors();
+  for (uint32_t i = 0; i < num_entries; i++) {
+    vixl::Label* target_label = codegen->GetLabelOf(successors[i]);
+    DCHECK(target_label->IsBound());
+    ptrdiff_t jump_offset = target_label->location() - table_start_.location();
+    DCHECK_GT(jump_offset, std::numeric_limits<int32_t>::min());
+    DCHECK_LE(jump_offset, std::numeric_limits<int32_t>::max());
+    Literal<int32_t> literal(jump_offset);
+    __ place(&literal);
+  }
+}
+
 #undef __
 
 Location InvokeDexCallingConventionVisitorARM64::GetNextLocation(Primitive::Type type) {
@@ -587,6 +613,7 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph,
                     compiler_options,
                     stats),
       block_labels_(nullptr),
+      jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
@@ -603,10 +630,16 @@ CodeGeneratorARM64::CodeGeneratorARM64(HGraph* graph,
   AddAllocatedRegister(LocationFrom(lr));
 }
 
-#undef __
 #define __ GetVIXLAssembler()->
 
+void CodeGeneratorARM64::EmitJumpTables() {
+  for (auto jump_table : jump_tables_) {
+    jump_table->EmitTable(this);
+  }
+}
+
 void CodeGeneratorARM64::Finalize(CodeAllocator* allocator) {
+  EmitJumpTables();
   // Ensure we emit the literal pool.
   __ FinalizeCode();
 
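For context on the hunks above: kPackedSwitchJumpTableThreshold encodes a simple size trade-off between the old compare/jump chain and the new jump table, and EmitTable() is driven from Finalize() (via EmitJumpTables()), so every block label is already bound when the signed 32-bit offsets are placed. Below is a minimal standalone sketch of that cost comparison; the helper functions and the printout are illustrative only and are not part of the patch.

```c++
#include <cstdint>
#include <iostream>

// Illustrative cost model mirroring the comment in the patch: a compare/jump
// chain costs roughly (2 * n + 1) instructions, while a jump table costs about
// 7 instructions plus n 32-bit offset literals (counted here as n "slots").
constexpr uint32_t kPackedSwitchJumpTableThreshold = 6;  // value chosen by the patch

constexpr uint32_t CompareJumpChainSlots(uint32_t n) { return 2 * n + 1; }
constexpr uint32_t JumpTableSlots(uint32_t n) { return 7 + n; }

int main() {
  for (uint32_t n = 2; n <= 10; ++n) {
    std::cout << "entries=" << n
              << "  chain=" << CompareJumpChainSlots(n)
              << "  table=" << JumpTableSlots(n)
              << (n >= kPackedSwitchJumpTableThreshold ? "  -> table" : "  -> chain")
              << "\n";
  }
  return 0;
}
```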
@@ -3848,26 +3881,73 @@ void LocationsBuilderARM64::VisitPackedSwitch(HPackedSwitch* switch_instr) {
 
 void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_instr) {
   int32_t lower_bound = switch_instr->GetStartValue();
-  int32_t num_entries = switch_instr->GetNumEntries();
+  uint32_t num_entries = switch_instr->GetNumEntries();
   Register value_reg = InputRegisterAt(switch_instr, 0);
   HBasicBlock* default_block = switch_instr->GetDefaultBlock();
 
-  // Create a series of compare/jumps.
-  const ArenaVector<HBasicBlock*>& successors = switch_instr->GetBlock()->GetSuccessors();
-  for (int32_t i = 0; i < num_entries; i++) {
-    int32_t case_value = lower_bound + i;
-    vixl::Label* succ = codegen_->GetLabelOf(successors[i]);
-    if (case_value == 0) {
-      __ Cbz(value_reg, succ);
-    } else {
-      __ Cmp(value_reg, vixl::Operand(case_value));
-      __ B(eq, succ);
+  // Roughly set 16 as max average assemblies generated per HIR in a graph.
+  static constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * vixl::kInstructionSize;
+  // ADR has a limited range(+/-1MB), so we set a threshold for the number of HIRs in the graph to
+  // make sure we don't emit it if the target may run out of range.
+  // TODO: Instead of emitting all jump tables at the end of the code, we could keep track of ADR
+  //       ranges and emit the tables only as required.
+  static constexpr int32_t kJumpTableInstructionThreshold = 1* MB / kMaxExpectedSizePerHInstruction;
+
+  if (num_entries < kPackedSwitchJumpTableThreshold ||
+      // Current instruction id is an upper bound of the number of HIRs in the graph.
+      GetGraph()->GetCurrentInstructionId() > kJumpTableInstructionThreshold) {
+    // Create a series of compare/jumps.
+    const ArenaVector<HBasicBlock*>& successors = switch_instr->GetBlock()->GetSuccessors();
+    for (uint32_t i = 0; i < num_entries; i++) {
+      int32_t case_value = lower_bound + i;
+      vixl::Label* succ = codegen_->GetLabelOf(successors[i]);
+      if (case_value == 0) {
+        __ Cbz(value_reg, succ);
+      } else {
+        __ Cmp(value_reg, Operand(case_value));
+        __ B(eq, succ);
+      }
     }
-  }
 
-  // And the default for any other value.
-  if (!codegen_->GoesToNextBlock(switch_instr->GetBlock(), default_block)) {
-    __ B(codegen_->GetLabelOf(default_block));
+    // And the default for any other value.
+    if (!codegen_->GoesToNextBlock(switch_instr->GetBlock(), default_block)) {
+      __ B(codegen_->GetLabelOf(default_block));
+    }
+  } else {
+    JumpTableARM64* jump_table = new (GetGraph()->GetArena()) JumpTableARM64(switch_instr);
+    codegen_->AddJumpTable(jump_table);
+
+    UseScratchRegisterScope temps(codegen_->GetVIXLAssembler());
+
+    // Below instructions should use at most one blocked register. Since there are two blocked
+    // registers, we are free to block one.
+    Register temp_w = temps.AcquireW();
+    Register index;
+    // Remove the bias.
+    if (lower_bound != 0) {
+      index = temp_w;
+      __ Sub(index, value_reg, Operand(lower_bound));
+    } else {
+      index = value_reg;
+    }
+
+    // Jump to default block if index is out of the range.
+    __ Cmp(index, Operand(num_entries));
+    __ B(hs, codegen_->GetLabelOf(default_block));
+
+    // In current VIXL implementation, it won't require any blocked registers to encode the
+    // immediate value for Adr. So we are free to use both VIXL blocked registers to reduce the
+    // register pressure.
+    Register table_base = temps.AcquireX();
+    // Load jump offset from the table.
+    __ Adr(table_base, jump_table->GetTableStartLabel());
+    Register jump_offset = temp_w;
+    __ Ldr(jump_offset, MemOperand(table_base, index, UXTW, 2));
+
+    // Jump to target block by branching to table_base(pc related) + offset.
+    Register target_address = table_base;
+    __ Add(target_address, table_base, Operand(jump_offset, SXTW));
+    __ Br(target_address);
   }
 }
 
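In plain terms, the else branch above removes the bias, bounds-checks against num_entries, loads a signed 32-bit offset from the table (the index is zero-extended and scaled by 4, hence UXTW with shift 2), and branches to the table base plus the sign-extended offset. The following is a rough C++ model of what that emitted Sub/Cmp/B.hs/Adr/Ldr/Add/Br sequence computes at run time; it is an illustration only, not code from the patch, and it ignores register allocation and the VIXL scratch-register bookkeeping.

```c++
#include <cstdint>

// Models what the emitted dispatch sequence computes at run time.
// 'table_base' stands for the address ADR materializes (the label bound in
// EmitTable()), and each table entry is the signed distance from that address
// to a successor block, as placed by EmitTable().
const void* PackedSwitchTarget(int32_t value,
                               int32_t lower_bound,
                               uint32_t num_entries,
                               const int32_t* table_base,
                               const void* default_target) {
  // Sub: remove the bias (wraps modulo 2^32, like the 32-bit register subtract).
  uint32_t index = static_cast<uint32_t>(static_cast<int64_t>(value) - lower_bound);
  if (index >= num_entries) {                       // Cmp + B.hs: out of range.
    return default_target;
  }
  int32_t offset = table_base[index];               // Ldr w, [x, w, UXTW #2]
  // Add with SXTW, then Br: jump to table base plus the sign-extended offset.
  return reinterpret_cast<const char*>(table_base) + offset;
}
```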
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 44036621f8..881afcc123 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -81,6 +81,22 @@ class SlowPathCodeARM64 : public SlowPathCode {
   DISALLOW_COPY_AND_ASSIGN(SlowPathCodeARM64);
 };
 
+class JumpTableARM64 : public ArenaObject<kArenaAllocSwitchTable> {
+ public:
+  explicit JumpTableARM64(HPackedSwitch* switch_instr)
+    : switch_instr_(switch_instr), table_start_() {}
+
+  vixl::Label* GetTableStartLabel() { return &table_start_; }
+
+  void EmitTable(CodeGeneratorARM64* codegen);
+
+ private:
+  HPackedSwitch* const switch_instr_;
+  vixl::Label table_start_;
+
+  DISALLOW_COPY_AND_ASSIGN(JumpTableARM64);
+};
+
 static const vixl::Register kRuntimeParameterCoreRegisters[] =
     { vixl::x0, vixl::x1, vixl::x2, vixl::x3, vixl::x4, vixl::x5, vixl::x6, vixl::x7 };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
@@ -358,6 +374,10 @@ class CodeGeneratorARM64 : public CodeGenerator {
     block_labels_ = CommonInitializeLabels<vixl::Label>();
   }
 
+  void AddJumpTable(JumpTableARM64* jump_table) {
+    jump_tables_.push_back(jump_table);
+  }
+
   void Finalize(CodeAllocator* allocator) OVERRIDE;
 
   // Code generation helpers.
@@ -426,9 +446,12 @@ class CodeGeneratorARM64 : public CodeGenerator {
     vixl::Label* pc_insn_label;
   };
 
+  void EmitJumpTables();
+
   // Labels for each block that will be compiled.
   vixl::Label* block_labels_;  // Indexed by block id.
   vixl::Label frame_entry_label_;
+  ArenaVector<JumpTableARM64*> jump_tables_;
 
   LocationsBuilderARM64 location_builder_;
   InstructionCodeGeneratorARM64 instruction_visitor_;
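One number from the .cc hunk worth double-checking is kJumpTableInstructionThreshold: with Arm64's fixed 4-byte instructions and the assumed worst case of 16 instructions per HInstruction, the +/-1 MB reach of ADR budgets for roughly 16K HIRs, beyond which the code generator falls back to the compare/jump chain. The snippet below restates the constants purely to verify that arithmetic; the values of MB and vixl::kInstructionSize are assumed to match their definitions in ART and VIXL.

```c++
#include <cstdint>

constexpr int64_t MB = 1024 * 1024;      // assumed to match ART's globals
constexpr int32_t kInstructionSize = 4;  // assumed vixl::kInstructionSize on Arm64
constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * kInstructionSize;
constexpr int32_t kJumpTableInstructionThreshold = MB / kMaxExpectedSizePerHInstruction;

static_assert(kJumpTableInstructionThreshold == 16 * 1024,
              "ADR's +/-1 MB range covers about 16K HIRs at 64 bytes each");
```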