commit     b544a1017753bfef7b91a2ea3024ffbde6cd5bb9 (patch)
author     2024-10-07 13:48:54 +0100
committer  2024-10-15 09:27:57 +0000
tree       0b54d507a49e9980337893949ed51e681b82fc83
parent     d3af812dd1c95e9f87c55cfa996354a115c08414 (diff)
Arm64: Fix PackedSwitch codegen for large methods.

This patch fixes a bug in arm64 PackedSwitch code generation for very
large methods: jump tables were emitted at the very end of the method,
which could exceed the range of the Adr instruction. Instead, we now
emit the jump table in place, as part of the PackedSwitch visitor, the
same way it is done in the arm32 backend.
This patch also removes an incorrect assumption that the size of a
method depends linearly on the number of its HIR instructions; this
heuristic was used to decide whether to emit a jump table for a
PackedSwitch.
Test: art/test.py --target --host --optimizing
Test: art/test.py --gtest art_compiler_tests
Change-Id: I0795811a6408a25021879ab6be9e23ef5f1f50e4
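As a rough sketch of the bookkeeping the new JumpTableARM64::FixTable performs
(standalone C++, not ART code; the code-buffer locations below are made up for
illustration), each table entry is the 32-bit byte distance from the table
start to its target block and must fit in int32_t:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

int main() {
  // Hypothetical code-buffer byte locations: the jump table and three target blocks.
  const std::ptrdiff_t table_start = 0x1000;
  const std::vector<std::ptrdiff_t> targets = {0x1040, 0x10c0, 0x20000};

  std::vector<std::int32_t> table;
  for (std::ptrdiff_t target : targets) {
    std::ptrdiff_t jump_offset = target - table_start;
    // Same bounds the patch asserts with DCHECK_GT / DCHECK_LE before storing the entry.
    assert(jump_offset > std::numeric_limits<std::int32_t>::min());
    assert(jump_offset <= std::numeric_limits<std::int32_t>::max());
    table.push_back(static_cast<std::int32_t>(jump_offset));
  }

  // The generated dispatch then does, in effect:
  //   adr table_base, table_start
  //   ldr w_off, [table_base, w_index, uxtw #2]
  //   add target, table_base, w_off, sxtw
  //   br  target
  assert(table[0] == 0x40);
  return 0;
}

Because the table is now emitted immediately after the adr/ldr/add/br dispatch
sequence, the adr only ever has to reach a few bytes ahead regardless of method
size; only the per-entry offsets need the int32_t range check above.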
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc  51
-rw-r--r--  compiler/optimizing/code_generator_arm64.h   22
-rw-r--r--  compiler/optimizing/codegen_test.cc          67
-rw-r--r--  compiler/optimizing/optimizing_unit_test.h    2
4 files changed, 117 insertions, 25 deletions
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 42d955ef9e..d56146eb0e 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -554,11 +554,19 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) {
   // We are about to use the assembler to place literals directly. Make sure we have enough
   // underlying code buffer and we have generated the jump table with right size.
-  EmissionCheckScope scope(codegen->GetVIXLAssembler(),
+  ExactAssemblyScope scope(codegen->GetVIXLAssembler(),
                            num_entries * sizeof(int32_t),
                            CodeBufferCheckScope::kExactSize);
+  codegen->GetVIXLAssembler()->bind(&table_start_);
+  for (uint32_t i = 0; i < num_entries; i++) {
+    codegen->GetVIXLAssembler()->place(jump_targets_[i].get());
+  }
+}
+
+void JumpTableARM64::FixTable(CodeGeneratorARM64* codegen) {
+  uint32_t num_entries = switch_instr_->GetNumEntries();
+  DCHECK_GE(num_entries, kPackedSwitchCompareJumpThreshold);
 
-  __ Bind(&table_start_);
   const ArenaVector<HBasicBlock*>& successors = switch_instr_->GetBlock()->GetSuccessors();
   for (uint32_t i = 0; i < num_entries; i++) {
     vixl::aarch64::Label* target_label = codegen->GetLabelOf(successors[i]);
@@ -566,8 +574,7 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) {
     ptrdiff_t jump_offset = target_label->GetLocation() - table_start_.GetLocation();
     DCHECK_GT(jump_offset, std::numeric_limits<int32_t>::min());
     DCHECK_LE(jump_offset, std::numeric_limits<int32_t>::max());
-    Literal<int32_t> literal(jump_offset);
-    __ place(&literal);
+    jump_targets_[i].get()->UpdateValue(jump_offset, codegen->GetVIXLAssembler());
   }
 }
 
@@ -1072,14 +1079,14 @@ size_t CodeGeneratorARM64::GetSIMDRegisterWidth() const {
 
 #define __ GetVIXLAssembler()->
 
-void CodeGeneratorARM64::EmitJumpTables() {
+void CodeGeneratorARM64::FixJumpTables() {
   for (auto&& jump_table : jump_tables_) {
-    jump_table->EmitTable(this);
+    jump_table->FixTable(this);
   }
 }
 
 void CodeGeneratorARM64::Finalize() {
-  EmitJumpTables();
+  FixJumpTables();
 
   // Emit JIT baker read barrier slow paths.
   DCHECK(GetCompilerOptions().IsJitCompiler() || jit_baker_read_barrier_slow_paths_.empty());
@@ -6709,17 +6716,7 @@ void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_inst
   Register value_reg = InputRegisterAt(switch_instr, 0);
   HBasicBlock* default_block = switch_instr->GetDefaultBlock();
 
-  // Roughly set 16 as max average assemblies generated per HIR in a graph.
-  static constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * kInstructionSize;
-  // ADR has a limited range(+/-1MB), so we set a threshold for the number of HIRs in the graph to
-  // make sure we don't emit it if the target may run out of range.
-  // TODO: Instead of emitting all jump tables at the end of the code, we could keep track of ADR
-  // ranges and emit the tables only as required.
-  static constexpr int32_t kJumpTableInstructionThreshold = 1* MB / kMaxExpectedSizePerHInstruction;
-
-  if (num_entries <= kPackedSwitchCompareJumpThreshold ||
-      // Current instruction id is an upper bound of the number of HIRs in the graph.
-      GetGraph()->GetCurrentInstructionId() > kJumpTableInstructionThreshold) {
+  if (num_entries <= kPackedSwitchCompareJumpThreshold) {
     // Create a series of compare/jumps.
     UseScratchRegisterScope temps(codegen_->GetVIXLAssembler());
     Register temp = temps.AcquireW();
@@ -6771,15 +6768,25 @@ void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_inst
     // immediate value for Adr. So we are free to use both VIXL blocked registers to reduce the
     // register pressure.
     Register table_base = temps.AcquireX();
+
+    const size_t jump_size = switch_instr->GetNumEntries() * sizeof(int32_t);
+    ExactAssemblyScope scope(codegen_->GetVIXLAssembler(),
+                             kInstructionSize * 4 + jump_size,
+                             CodeBufferCheckScope::kExactSize);
+
     // Load jump offset from the table.
-    __ Adr(table_base, jump_table->GetTableStartLabel());
+    // Note: the table start address is always in range as the table is emitted immediately
+    // after these 4 instructions.
+    __ adr(table_base, jump_table->GetTableStartLabel());
     Register jump_offset = temp_w;
-    __ Ldr(jump_offset, MemOperand(table_base, index, UXTW, 2));
+    __ ldr(jump_offset, MemOperand(table_base, index, UXTW, 2));
 
     // Jump to target block by branching to table_base(pc related) + offset.
     Register target_address = table_base;
-    __ Add(target_address, table_base, Operand(jump_offset, SXTW));
-    __ Br(target_address);
+    __ add(target_address, table_base, Operand(jump_offset, SXTW));
+    __ br(target_address);
+
+    jump_table->EmitTable(codegen_);
   }
 }
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 9f226e9e63..c6427b2da3 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -183,17 +183,35 @@ class SlowPathCodeARM64 : public SlowPathCode {
 
 class JumpTableARM64 : public DeletableArenaObject<kArenaAllocSwitchTable> {
  public:
+  using VIXLInt32Literal = vixl::aarch64::Literal<int32_t>;
+
   explicit JumpTableARM64(HPackedSwitch* switch_instr)
-      : switch_instr_(switch_instr), table_start_() {}
+      : switch_instr_(switch_instr),
+        table_start_(),
+        jump_targets_(switch_instr->GetAllocator()->Adapter(kArenaAllocCodeGenerator)) {
+    uint32_t num_entries = switch_instr_->GetNumEntries();
+    for (uint32_t i = 0; i < num_entries; i++) {
+      VIXLInt32Literal* lit = new VIXLInt32Literal(0);
+      jump_targets_.emplace_back(lit);
+    }
+  }
 
   vixl::aarch64::Label* GetTableStartLabel() { return &table_start_; }
 
+  // Emits the jump table into the code buffer; jump target offsets are not yet known.
   void EmitTable(CodeGeneratorARM64* codegen);
+  // Updates the offsets in the jump table, to be used when the jump targets basic blocks
+  // addresses are resolved.
+  void FixTable(CodeGeneratorARM64* codegen);
+
  private:
   HPackedSwitch* const switch_instr_;
   vixl::aarch64::Label table_start_;
+  // Contains literals for the switch's jump targets.
+  ArenaVector<std::unique_ptr<VIXLInt32Literal>> jump_targets_;
+
   DISALLOW_COPY_AND_ASSIGN(JumpTableARM64);
 };
 
@@ -1144,7 +1162,7 @@ class CodeGeneratorARM64 : public CodeGenerator {
                                   vixl::aarch64::Label* adrp_label,
                                   ArenaDeque<PcRelativePatchInfo>* patches);
 
-  void EmitJumpTables();
+  void FixJumpTables();
 
   template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
   static void EmitPcRelativeLinkerPatches(const ArenaDeque<PcRelativePatchInfo>& infos,
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 7365f0fb7f..14a419fcd8 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -73,6 +73,7 @@ class CodegenTest : public CommonCompilerTest, public OptimizingUnitTestHelper {
                       int64_t j,
                       DataType::Type type,
                       const CodegenTargetConfig target_config);
+  void TestPackedSwitch(const CodegenTargetConfig target_config);
 };
 
 void CodegenTest::TestCode(const std::vector<uint16_t>& data, bool has_result, int32_t expected) {
@@ -682,6 +683,72 @@ TEST_F(CodegenTest, ComparisonsLong) {
   }
 }
 
+// Tests a PackedSwitch in a very large HGraph; validates that the switch jump table is in
+// range for the PC-relative load in the codegen visitor.
+void CodegenTest::TestPackedSwitch(const CodegenTargetConfig target_config) {
+  HBasicBlock* return_block = InitEntryMainExitGraph();
+  constexpr DataType::Type data_type = DataType::Type::kInt32;
+
+  // A number of entries - we are interested to test jump table implementation.
+  constexpr size_t kNumSwitchEntries = 10;
+
+  // Number of jump targets (including a 'default' case).
+  constexpr size_t kNumBB = kNumSwitchEntries + 1;
+  // Some arbitrary value to be used as input.
+  constexpr int kInputValue = kNumBB - 4;
+
+  HInstruction* input = graph_->GetIntConstant(kInputValue);
+  HIntConstant* constant_1 = graph_->GetIntConstant(1);
+
+  HBasicBlock* switch_block = AddNewBlock();
+  entry_block_->ReplaceSuccessor(return_block, switch_block);
+
+  HPackedSwitch* hswitch = new (GetAllocator()) HPackedSwitch(0, kNumSwitchEntries, input);
+  switch_block->AddInstruction(hswitch);
+
+  std::vector<HInstruction*> phi_inputs {};
+
+  // Add switch jump target blocks.
+  for (int i = 0; i < kNumBB; i++) {
+    HBasicBlock* case_block = AddNewBlock();
+    case_block->AddPredecessor(switch_block);
+    case_block->AddSuccessor(return_block);
+
+    HIntConstant* case_value = graph_->GetIntConstant(i);
+    HAdd* add = MakeBinOp<HAdd>(case_block, data_type, input, case_value);
+    phi_inputs.emplace_back(add);
+
+    MakeGoto(case_block);
+  }
+
+  HPhi* phi = MakePhi(return_block, phi_inputs);
+  HInstruction* return_val = phi;
+
+  // Emit a huge number of HAdds - to simulate a very large HGraph.
+  constexpr int kNumOfAdds = 2 * 1024 * 1024;
+  for (int i = 0; i < kNumOfAdds; i++) {
+    return_val = MakeBinOp<HAdd>(return_block, data_type, return_val, constant_1);
+  }
+
+  MakeReturn(return_block, return_val);
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  std::unique_ptr<CompilerOptions> compiler_options =
+      CommonCompilerTest::CreateCompilerOptions(target_config.GetInstructionSet(), "default");
+  RunCode(target_config,
+          *compiler_options,
+          graph_,
+          [](HGraph*) {}, true, kNumOfAdds + 2 * kInputValue);
+}
+
+TEST_F(CodegenTest, PackedSwitchInHugeMethod) {
+  for (CodegenTargetConfig target_config : GetTargetConfigs()) {
+    TestPackedSwitch(target_config);
+  }
+}
+
 #ifdef ART_ENABLE_CODEGEN_arm
 TEST_F(CodegenTest, ARMVIXLParallelMoveResolver) {
   std::unique_ptr<CompilerOptions> compiler_options =
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index abd1ffe03e..a7c3558c5f 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -382,7 +382,7 @@ class OptimizingUnitTestHelper {
 
     pre_header->AddSuccessor(loop);
     loop->AddSuccessor(loop_exit);  // true successor
-    loop->AddSuccessor(loop);  // fakse successor
+    loop->AddSuccessor(loop);  // false successor
 
     MakeGoto(pre_header);
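For reference, a standalone sketch (not ART code) of the arithmetic behind the
expected value passed to RunCode in the new test: the switch routes kInputValue
to case block number kInputValue, whose HAdd produces 2 * kInputValue, and the
chain of kNumOfAdds increments then adds kNumOfAdds on top:

#include <cassert>
#include <cstddef>

int main() {
  // Constants mirror the test above.
  constexpr std::size_t kNumSwitchEntries = 10;
  constexpr std::size_t kNumBB = kNumSwitchEntries + 1;  // jump targets + 'default'
  constexpr int kInputValue = kNumBB - 4;                // 7: routed to case block 7
  constexpr int kNumOfAdds = 2 * 1024 * 1024;

  int value = kInputValue + kInputValue;  // HAdd in the selected case block: input + case_value
  for (int i = 0; i < kNumOfAdds; i++) {
    value += 1;                           // the chain of kNumOfAdds HAdds of constant 1
  }

  assert(value == kNumOfAdds + 2 * kInputValue);  // the expected result checked by RunCode
  return 0;
}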