commit     b544a1017753bfef7b91a2ea3024ffbde6cd5bb9 (patch)
author     2024-10-07 13:48:54 +0100
committer  2024-10-15 09:27:57 +0000
tree       0b54d507a49e9980337893949ed51e681b82fc83
parent     d3af812dd1c95e9f87c55cfa996354a115c08414 (diff)
Arm64: Fix PackedSwitch codegen for large methods.

This patch fixes a bug in arm64 PackedSwitch code generation for very
large methods: jump tables were emitted at the very end of the method,
which could exceed the range of the Adr instruction. Instead, we now
emit the jump table in place, as part of the PackedSwitch visitor, the
same way it is done in the arm32 backend.
This patch also removes an incorrect assumption that the size of a
method depends linearly on the number of its HIR instructions; this
heuristic was used to decide whether to emit a jump table for a
PackedSwitch.
Test: art/test.py --target --host --optimizing
Test: art/test.py --gtest art_compiler_tests
Change-Id: I0795811a6408a25021879ab6be9e23ef5f1f50e4
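As a rough sketch of the bookkeeping the new JumpTableARM64::FixTable performs
(standalone C++, not ART code; the code-buffer locations below are made up for
illustration), each table entry is the 32-bit byte distance from the table
start to its target block and must fit in int32_t:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

int main() {
  // Hypothetical code-buffer byte locations: the jump table and three target blocks.
  const std::ptrdiff_t table_start = 0x1000;
  const std::vector<std::ptrdiff_t> targets = {0x1040, 0x10c0, 0x20000};

  std::vector<std::int32_t> table;
  for (std::ptrdiff_t target : targets) {
    std::ptrdiff_t jump_offset = target - table_start;
    // Same bounds the patch asserts with DCHECK_GT / DCHECK_LE before storing the entry.
    assert(jump_offset > std::numeric_limits<std::int32_t>::min());
    assert(jump_offset <= std::numeric_limits<std::int32_t>::max());
    table.push_back(static_cast<std::int32_t>(jump_offset));
  }

  // The generated dispatch then does, in effect:
  //   adr table_base, table_start
  //   ldr w_off, [table_base, w_index, uxtw #2]
  //   add target, table_base, w_off, sxtw
  //   br  target
  assert(table[0] == 0x40);
  return 0;
}

Because the table is now emitted immediately after the adr/ldr/add/br dispatch
sequence, the adr only ever has to reach a few bytes ahead regardless of method
size; only the per-entry offsets need the int32_t range check above.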
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc  51
-rw-r--r--  compiler/optimizing/code_generator_arm64.h   22
-rw-r--r--  compiler/optimizing/codegen_test.cc          67
-rw-r--r--  compiler/optimizing/optimizing_unit_test.h    2
4 files changed, 117 insertions, 25 deletions
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 42d955ef9e..d56146eb0e 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -554,11 +554,19 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) {
   // We are about to use the assembler to place literals directly. Make sure we have enough
   // underlying code buffer and we have generated the jump table with right size.
-  EmissionCheckScope scope(codegen->GetVIXLAssembler(),
+  ExactAssemblyScope scope(codegen->GetVIXLAssembler(),
                            num_entries * sizeof(int32_t),
                            CodeBufferCheckScope::kExactSize);
+  codegen->GetVIXLAssembler()->bind(&table_start_);
+  for (uint32_t i = 0; i < num_entries; i++) {
+    codegen->GetVIXLAssembler()->place(jump_targets_[i].get());
+  }
+}
+
+void JumpTableARM64::FixTable(CodeGeneratorARM64* codegen) {
+  uint32_t num_entries = switch_instr_->GetNumEntries();
+  DCHECK_GE(num_entries, kPackedSwitchCompareJumpThreshold);
 
-  __ Bind(&table_start_);
   const ArenaVector<HBasicBlock*>& successors = switch_instr_->GetBlock()->GetSuccessors();
   for (uint32_t i = 0; i < num_entries; i++) {
     vixl::aarch64::Label* target_label = codegen->GetLabelOf(successors[i]);
@@ -566,8 +574,7 @@ void JumpTableARM64::EmitTable(CodeGeneratorARM64* codegen) {
     ptrdiff_t jump_offset = target_label->GetLocation() - table_start_.GetLocation();
     DCHECK_GT(jump_offset, std::numeric_limits<int32_t>::min());
     DCHECK_LE(jump_offset, std::numeric_limits<int32_t>::max());
-    Literal<int32_t> literal(jump_offset);
-    __ place(&literal);
+    jump_targets_[i].get()->UpdateValue(jump_offset, codegen->GetVIXLAssembler());
   }
 }
 
@@ -1072,14 +1079,14 @@ size_t CodeGeneratorARM64::GetSIMDRegisterWidth() const {
 
 #define __ GetVIXLAssembler()->
 
-void CodeGeneratorARM64::EmitJumpTables() {
+void CodeGeneratorARM64::FixJumpTables() {
   for (auto&& jump_table : jump_tables_) {
-    jump_table->EmitTable(this);
+    jump_table->FixTable(this);
   }
 }
 
 void CodeGeneratorARM64::Finalize() {
-  EmitJumpTables();
+  FixJumpTables();
 
   // Emit JIT baker read barrier slow paths.
   DCHECK(GetCompilerOptions().IsJitCompiler() || jit_baker_read_barrier_slow_paths_.empty());
@@ -6709,17 +6716,7 @@ void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_inst
   Register value_reg = InputRegisterAt(switch_instr, 0);
   HBasicBlock* default_block = switch_instr->GetDefaultBlock();
 
-  // Roughly set 16 as max average assemblies generated per HIR in a graph.
-  static constexpr int32_t kMaxExpectedSizePerHInstruction = 16 * kInstructionSize;
-  // ADR has a limited range(+/-1MB), so we set a threshold for the number of HIRs in the graph to
-  // make sure we don't emit it if the target may run out of range.
-  // TODO: Instead of emitting all jump tables at the end of the code, we could keep track of ADR
-  // ranges and emit the tables only as required.
-  static constexpr int32_t kJumpTableInstructionThreshold = 1* MB / kMaxExpectedSizePerHInstruction;
-
-  if (num_entries <= kPackedSwitchCompareJumpThreshold ||
-      // Current instruction id is an upper bound of the number of HIRs in the graph.
-      GetGraph()->GetCurrentInstructionId() > kJumpTableInstructionThreshold) {
+  if (num_entries <= kPackedSwitchCompareJumpThreshold) {
     // Create a series of compare/jumps.
     UseScratchRegisterScope temps(codegen_->GetVIXLAssembler());
     Register temp = temps.AcquireW();
@@ -6771,15 +6768,25 @@ void InstructionCodeGeneratorARM64::VisitPackedSwitch(HPackedSwitch* switch_inst
     // immediate value for Adr. So we are free to use both VIXL blocked registers to reduce the
     // register pressure.
     Register table_base = temps.AcquireX();
+
+    const size_t jump_size = switch_instr->GetNumEntries() * sizeof(int32_t);
+    ExactAssemblyScope scope(codegen_->GetVIXLAssembler(),
+                             kInstructionSize * 4 + jump_size,
+                             CodeBufferCheckScope::kExactSize);
+
     // Load jump offset from the table.
-    __ Adr(table_base, jump_table->GetTableStartLabel());
+    // Note: the table start address is always in range as the table is emitted immediately
+    // after these 4 instructions.
+    __ adr(table_base, jump_table->GetTableStartLabel());
     Register jump_offset = temp_w;
-    __ Ldr(jump_offset, MemOperand(table_base, index, UXTW, 2));
+    __ ldr(jump_offset, MemOperand(table_base, index, UXTW, 2));
 
     // Jump to target block by branching to table_base(pc related) + offset.
     Register target_address = table_base;
-    __ Add(target_address, table_base, Operand(jump_offset, SXTW));
-    __ Br(target_address);
+    __ add(target_address, table_base, Operand(jump_offset, SXTW));
+    __ br(target_address);
+
+    jump_table->EmitTable(codegen_);
   }
 }
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 9f226e9e63..c6427b2da3 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -183,17 +183,35 @@ class SlowPathCodeARM64 : public SlowPathCode {
 
 class JumpTableARM64 : public DeletableArenaObject<kArenaAllocSwitchTable> {
  public:
+  using VIXLInt32Literal = vixl::aarch64::Literal<int32_t>;
+
   explicit JumpTableARM64(HPackedSwitch* switch_instr)
-      : switch_instr_(switch_instr), table_start_() {}
+      : switch_instr_(switch_instr),
+        table_start_(),
+        jump_targets_(switch_instr->GetAllocator()->Adapter(kArenaAllocCodeGenerator)) {
+    uint32_t num_entries = switch_instr_->GetNumEntries();
+    for (uint32_t i = 0; i < num_entries; i++) {
+      VIXLInt32Literal* lit = new VIXLInt32Literal(0);
+      jump_targets_.emplace_back(lit);
+    }
+  }
 
   vixl::aarch64::Label* GetTableStartLabel() { return &table_start_; }
 
+  // Emits the jump table into the code buffer; jump target offsets are not yet known.
   void EmitTable(CodeGeneratorARM64* codegen);
+  // Updates the offsets in the jump table, to be used when the jump targets basic blocks
+  // addresses are resolved.
+  void FixTable(CodeGeneratorARM64* codegen);
+
  private:
   HPackedSwitch* const switch_instr_;
   vixl::aarch64::Label table_start_;
+  // Contains literals for the switch's jump targets.
+  ArenaVector<std::unique_ptr<VIXLInt32Literal>> jump_targets_;
+
   DISALLOW_COPY_AND_ASSIGN(JumpTableARM64);
 };
 
@@ -1144,7 +1162,7 @@ class CodeGeneratorARM64 : public CodeGenerator {
                                   vixl::aarch64::Label* adrp_label,
                                   ArenaDeque<PcRelativePatchInfo>* patches);
 
-  void EmitJumpTables();
+  void FixJumpTables();
 
   template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
   static void EmitPcRelativeLinkerPatches(const ArenaDeque<PcRelativePatchInfo>& infos,
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 7365f0fb7f..14a419fcd8 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -73,6 +73,7 @@ class CodegenTest : public CommonCompilerTest, public OptimizingUnitTestHelper {
                       int64_t j,
                       DataType::Type type,
                       const CodegenTargetConfig target_config);
+  void TestPackedSwitch(const CodegenTargetConfig target_config);
 };
 
 void CodegenTest::TestCode(const std::vector<uint16_t>& data, bool has_result, int32_t expected) {
@@ -682,6 +683,72 @@ TEST_F(CodegenTest, ComparisonsLong) {
   }
 }
 
+// Tests a PackedSwitch in a very large HGraph; validates that the switch jump table is in
+// range for the PC-relative load in the codegen visitor.
+void CodegenTest::TestPackedSwitch(const CodegenTargetConfig target_config) {
+  HBasicBlock* return_block = InitEntryMainExitGraph();
+  constexpr DataType::Type data_type = DataType::Type::kInt32;
+
+  // A number of entries - we are interested to test jump table implementation.
+  constexpr size_t kNumSwitchEntries = 10;
+
+  // Number of jump targets (including a 'default' case).
+  constexpr size_t kNumBB = kNumSwitchEntries + 1;
+  // Some arbitrary value to be used as input.
+  constexpr int kInputValue = kNumBB - 4;
+
+  HInstruction* input = graph_->GetIntConstant(kInputValue);
+  HIntConstant* constant_1 = graph_->GetIntConstant(1);
+
+  HBasicBlock* switch_block = AddNewBlock();
+  entry_block_->ReplaceSuccessor(return_block, switch_block);
+
+  HPackedSwitch* hswitch = new (GetAllocator()) HPackedSwitch(0, kNumSwitchEntries, input);
+  switch_block->AddInstruction(hswitch);
+
+  std::vector<HInstruction*> phi_inputs {};
+
+  // Add switch jump target blocks.
+  for (int i = 0; i < kNumBB; i++) {
+    HBasicBlock* case_block = AddNewBlock();
+    case_block->AddPredecessor(switch_block);
+    case_block->AddSuccessor(return_block);
+
+    HIntConstant* case_value = graph_->GetIntConstant(i);
+    HAdd* add = MakeBinOp<HAdd>(case_block, data_type, input, case_value);
+    phi_inputs.emplace_back(add);
+
+    MakeGoto(case_block);
+  }
+
+  HPhi* phi = MakePhi(return_block, phi_inputs);
+  HInstruction* return_val = phi;
+
+  // Emit a huge number of HAdds - to simulate a very large HGraph.
+  constexpr int kNumOfAdds = 2 * 1024 * 1024;
+  for (int i = 0; i < kNumOfAdds; i++) {
+    return_val = MakeBinOp<HAdd>(return_block, data_type, return_val, constant_1);
+  }
+
+  MakeReturn(return_block, return_val);
+
+  graph_->BuildDominatorTree();
+  EXPECT_TRUE(CheckGraph());
+
+  std::unique_ptr<CompilerOptions> compiler_options =
+      CommonCompilerTest::CreateCompilerOptions(target_config.GetInstructionSet(), "default");
+  RunCode(target_config,
+          *compiler_options,
+          graph_,
+          [](HGraph*) {}, true, kNumOfAdds + 2 * kInputValue);
+}
+
+TEST_F(CodegenTest, PackedSwitchInHugeMethod) {
+  for (CodegenTargetConfig target_config : GetTargetConfigs()) {
+    TestPackedSwitch(target_config);
+  }
+}
+
 #ifdef ART_ENABLE_CODEGEN_arm
 TEST_F(CodegenTest, ARMVIXLParallelMoveResolver) {
   std::unique_ptr<CompilerOptions> compiler_options =
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index abd1ffe03e..a7c3558c5f 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -382,7 +382,7 @@ class OptimizingUnitTestHelper {
 
     pre_header->AddSuccessor(loop);
     loop->AddSuccessor(loop_exit);  // true successor
-    loop->AddSuccessor(loop);  // fakse successor
+    loop->AddSuccessor(loop);  // false successor
 
     MakeGoto(pre_header);
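For reference, a standalone sketch (not ART code) of the arithmetic behind the
expected value passed to RunCode in the new test: the switch routes kInputValue
to case block number kInputValue, whose HAdd produces 2 * kInputValue, and the
chain of kNumOfAdds increments then adds kNumOfAdds on top:

#include <cassert>
#include <cstddef>

int main() {
  // Constants mirror the test above.
  constexpr std::size_t kNumSwitchEntries = 10;
  constexpr std::size_t kNumBB = kNumSwitchEntries + 1;  // jump targets + 'default'
  constexpr int kInputValue = kNumBB - 4;                // 7: routed to case block 7
  constexpr int kNumOfAdds = 2 * 1024 * 1024;

  int value = kInputValue + kInputValue;  // HAdd in the selected case block: input + case_value
  for (int i = 0; i < kNumOfAdds; i++) {
    value += 1;                           // the chain of kNumOfAdds HAdds of constant 1
  }

  assert(value == kNumOfAdds + 2 * kInputValue);  // the expected result checked by RunCode
  return 0;
}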