author Vladimir Marko <vmarko@google.com> 2024-02-12 11:17:28 +0100
committer Vladimír Marko <vmarko@google.com> 2024-02-13 15:28:35 +0000
commit 77e5997b524a133d38585da8bf58420f2411f7ad (patch)
tree 0273c71559a03593b358f04ef060214796159863
parent 2389869ed372eebd886c2f984f23ec7e342da22b (diff)
Optimizing: Refactor `HScheduler`.
Move `SchedulingLatencyVisitor{ARM,ARM64}` to .cc files.

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Change-Id: I15cb1a4cbef00a328fec947189412c502bf80f46
-rw-r--r--  compiler/optimizing/scheduler.cc        17
-rw-r--r--  compiler/optimizing/scheduler.h         41
-rw-r--r--  compiler/optimizing/scheduler_arm.cc    134
-rw-r--r--  compiler/optimizing/scheduler_arm.h     140
-rw-r--r--  compiler/optimizing/scheduler_arm64.cc  134
-rw-r--r--  compiler/optimizing/scheduler_arm64.h   135
-rw-r--r--  compiler/optimizing/scheduler_test.cc   6
7 files changed, 326 insertions, 281 deletions
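
The shape of the refactoring, condensed: the base `HScheduler` no longer stores a `SchedulingLatencyVisitor*` injected through its constructor. Instead, each backend overrides a pure-virtual `BuildSchedulingGraph()`, creates its (now file-local) latency visitor on the stack, and forwards to a protected template helper in the base class so per-node latency calls are resolved statically. Below is a minimal standalone sketch of that pattern; the types and the `FakeLatencyVisitorARM` class are simplified stand-ins, not the real ART interfaces.

#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

// Simplified stand-ins for the ART types; names and shapes here are
// illustrative only.
struct HInstruction {};
struct HBasicBlock { std::vector<HInstruction*> instructions; };
struct SchedulingNode { uint32_t latency = 0; };
using SchedulingGraph = std::vector<std::unique_ptr<SchedulingNode>>;

class HScheduler {
 public:
  virtual ~HScheduler() = default;

 protected:
  // Each backend decides which latency visitor to use.
  virtual std::pair<SchedulingGraph, std::vector<SchedulingNode*>>
  BuildSchedulingGraph(HBasicBlock* block) = 0;

  // Shared helper: the visitor type is a template parameter, so the per-node
  // latency calls are statically dispatched and the visitor can be stack-local.
  template <typename LatencyVisitor>
  std::pair<SchedulingGraph, std::vector<SchedulingNode*>> BuildSchedulingGraph(
      HBasicBlock* block, LatencyVisitor* latency_visitor) {
    SchedulingGraph graph;
    std::vector<SchedulingNode*> nodes;
    for (HInstruction* instruction : block->instructions) {
      graph.push_back(std::make_unique<SchedulingNode>());
      SchedulingNode* node = graph.back().get();
      node->latency = latency_visitor->CalculateLatency(instruction);
      nodes.push_back(node);
    }
    return {std::move(graph), std::move(nodes)};
  }
};

// In the real change the ARM visitor moved from scheduler_arm.h into
// scheduler_arm.cc; this fake one just returns a fixed cost.
class FakeLatencyVisitorARM {
 public:
  uint32_t CalculateLatency(HInstruction*) { return 2; }
};

class HSchedulerARM final : public HScheduler {
 protected:
  std::pair<SchedulingGraph, std::vector<SchedulingNode*>>
  BuildSchedulingGraph(HBasicBlock* block) override {
    FakeLatencyVisitorARM latency_visitor;  // created per call, on the stack
    return HScheduler::BuildSchedulingGraph(block, &latency_visitor);
  }
};

The caller in `HScheduler::Schedule()` then consumes the result with `auto [scheduling_graph, scheduling_nodes] = BuildSchedulingGraph(...)`, as the first hunk below shows.
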
diff --git a/compiler/optimizing/scheduler.cc b/compiler/optimizing/scheduler.cc
index 4236a545bc..f4cf7b0a49 100644
--- a/compiler/optimizing/scheduler.cc
+++ b/compiler/optimizing/scheduler.cc
@@ -548,20 +548,10 @@ void HScheduler::Schedule(HGraph* graph) {
void HScheduler::Schedule(HBasicBlock* block,
const HeapLocationCollector* heap_location_collector) {
ScopedArenaAllocator allocator(block->GetGraph()->GetArenaStack());
- ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator.Adapter(kArenaAllocScheduler));
// Build the scheduling graph.
- SchedulingGraph scheduling_graph(&allocator, heap_location_collector);
- for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
- HInstruction* instruction = it.Current();
- CHECK_EQ(instruction->GetBlock(), block)
- << instruction->DebugName()
- << " is in block " << instruction->GetBlock()->GetBlockId()
- << ", and expected in block " << block->GetBlockId();
- SchedulingNode* node = scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
- CalculateLatency(node);
- scheduling_nodes.push_back(node);
- }
+ auto [scheduling_graph, scheduling_nodes] =
+ BuildSchedulingGraph(block, &allocator, heap_location_collector);
if (scheduling_graph.Size() <= 1) {
return;
@@ -803,8 +793,7 @@ bool HInstructionScheduling::Run(bool only_optimize_loop_blocks,
#if defined(ART_ENABLE_CODEGEN_arm)
case InstructionSet::kThumb2:
case InstructionSet::kArm: {
- arm::SchedulingLatencyVisitorARM arm_latency_visitor(codegen_);
- arm::HSchedulerARM scheduler(selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(selector, codegen_);
scheduler.SetOnlyOptimizeLoopBlocks(only_optimize_loop_blocks);
scheduler.Schedule(graph_);
break;
diff --git a/compiler/optimizing/scheduler.h b/compiler/optimizing/scheduler.h
index 299fbc93f3..a9672ea732 100644
--- a/compiler/optimizing/scheduler.h
+++ b/compiler/optimizing/scheduler.h
@@ -497,9 +497,8 @@ class CriticalPathSchedulingNodeSelector : public SchedulingNodeSelector {
class HScheduler {
public:
- HScheduler(SchedulingLatencyVisitor* latency_visitor, SchedulingNodeSelector* selector)
- : latency_visitor_(latency_visitor),
- selector_(selector),
+ explicit HScheduler(SchedulingNodeSelector* selector)
+ : selector_(selector),
only_optimize_loop_blocks_(true),
cursor_(nullptr) {}
virtual ~HScheduler() {}
@@ -512,6 +511,35 @@ class HScheduler {
virtual bool IsSchedulingBarrier(const HInstruction* instruction) const;
protected:
+ virtual std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) = 0;
+
+ template <typename LatencyVisitor>
+ std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector,
+ LatencyVisitor* latency_visitor) ALWAYS_INLINE {
+ SchedulingGraph scheduling_graph(allocator, heap_location_collector);
+ ScopedArenaVector<SchedulingNode*> scheduling_nodes(allocator->Adapter(kArenaAllocScheduler));
+ for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+ HInstruction* instruction = it.Current();
+ CHECK_EQ(instruction->GetBlock(), block)
+ << instruction->DebugName()
+ << " is in block " << instruction->GetBlock()->GetBlockId()
+ << ", and expected in block " << block->GetBlockId();
+ SchedulingNode* node =
+ scheduling_graph.AddNode(instruction, IsSchedulingBarrier(instruction));
+ latency_visitor->CalculateLatency(node);
+ node->SetLatency(latency_visitor->GetLastVisitedLatency());
+ node->SetInternalLatency(latency_visitor->GetLastVisitedInternalLatency());
+ scheduling_nodes.push_back(node);
+ }
+ return {std::move(scheduling_graph), std::move(scheduling_nodes)};
+ }
+
void Schedule(HBasicBlock* block, const HeapLocationCollector* heap_location_collector);
void Schedule(SchedulingNode* scheduling_node,
/*inout*/ ScopedArenaVector<SchedulingNode*>* candidates);
@@ -529,13 +557,6 @@ class HScheduler {
virtual bool IsSchedulable(const HInstruction* instruction) const;
bool IsSchedulable(const HBasicBlock* block) const;
- void CalculateLatency(SchedulingNode* node) {
- latency_visitor_->CalculateLatency(node);
- node->SetLatency(latency_visitor_->GetLastVisitedLatency());
- node->SetInternalLatency(latency_visitor_->GetLastVisitedInternalLatency());
- }
-
- SchedulingLatencyVisitor* const latency_visitor_;
SchedulingNodeSelector* const selector_;
bool only_optimize_loop_blocks_;
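
For context on the consumer side (first hunk of scheduler.cc above): the pair returned by the helper is unpacked with C++17 structured bindings, so neither container is copied out of the return value. A tiny standalone illustration of that consumption pattern, with placeholder element types in place of `SchedulingGraph` and `ScopedArenaVector<SchedulingNode*>`:

#include <string>
#include <utility>
#include <vector>

// Placeholders standing in for the real result types.
using Graph = std::vector<std::string>;
using Nodes = std::vector<int>;

std::pair<Graph, Nodes> BuildSchedulingGraph() {
  Graph graph = {"add", "mul"};
  Nodes nodes = {0, 1};
  // The locals are moved into the returned pair.
  return {std::move(graph), std::move(nodes)};
}

int main() {
  // The returned pair is materialized once; `graph` and `nodes` are names for
  // its members, so the containers themselves are never copied.
  auto [graph, nodes] = BuildSchedulingGraph();
  return graph.size() == nodes.size() ? 0 : 1;
}
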
diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc
index 510a0f5496..3ee6f06b46 100644
--- a/compiler/optimizing/scheduler_arm.cc
+++ b/compiler/optimizing/scheduler_arm.cc
@@ -17,6 +17,7 @@
#include "scheduler_arm.h"
#include "arch/arm/instruction_set_features_arm.h"
+#include "code_generator_arm_vixl.h"
#include "code_generator_utils.h"
#include "common_arm.h"
#include "heap_poisoning.h"
@@ -29,6 +30,116 @@ namespace arm {
using helpers::Int32ConstantFrom;
using helpers::Uint64ConstantFrom;
+// AArch32 instruction latencies.
+// We currently assume that all ARM CPUs share the same instruction latency list.
+// The following latencies were tuned based on performance experiments and
+// automatic tuning using differential evolution approach on various benchmarks.
+static constexpr uint32_t kArmIntegerOpLatency = 2;
+static constexpr uint32_t kArmFloatingPointOpLatency = 11;
+static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4;
+static constexpr uint32_t kArmMulIntegerLatency = 6;
+static constexpr uint32_t kArmMulFloatingPointLatency = 11;
+static constexpr uint32_t kArmDivIntegerLatency = 10;
+static constexpr uint32_t kArmDivFloatLatency = 20;
+static constexpr uint32_t kArmDivDoubleLatency = 25;
+static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11;
+static constexpr uint32_t kArmMemoryLoadLatency = 9;
+static constexpr uint32_t kArmMemoryStoreLatency = 9;
+static constexpr uint32_t kArmMemoryBarrierLatency = 6;
+static constexpr uint32_t kArmBranchLatency = 4;
+static constexpr uint32_t kArmCallLatency = 5;
+static constexpr uint32_t kArmCallInternalLatency = 29;
+static constexpr uint32_t kArmLoadStringInternalLatency = 10;
+static constexpr uint32_t kArmNopLatency = 2;
+static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18;
+static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46;
+
+class SchedulingLatencyVisitorARM final : public SchedulingLatencyVisitor {
+ public:
+ explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen)
+ : codegen_(down_cast<CodeGeneratorARMVIXL*>(codegen)) {}
+
+ // Default visitor for instructions not handled specifically below.
+ void VisitInstruction([[maybe_unused]] HInstruction*) override {
+ last_visited_latency_ = kArmIntegerOpLatency;
+ }
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \
+ M(ArrayGet, unused) \
+ M(ArrayLength, unused) \
+ M(ArraySet, unused) \
+ M(Add, unused) \
+ M(Sub, unused) \
+ M(And, unused) \
+ M(Or, unused) \
+ M(Ror, unused) \
+ M(Xor, unused) \
+ M(Shl, unused) \
+ M(Shr, unused) \
+ M(UShr, unused) \
+ M(Mul, unused) \
+ M(Div, unused) \
+ M(Condition, unused) \
+ M(Compare, unused) \
+ M(BoundsCheck, unused) \
+ M(InstanceFieldGet, unused) \
+ M(InstanceFieldSet, unused) \
+ M(InstanceOf, unused) \
+ M(Invoke, unused) \
+ M(LoadString, unused) \
+ M(NewArray, unused) \
+ M(NewInstance, unused) \
+ M(Rem, unused) \
+ M(StaticFieldGet, unused) \
+ M(StaticFieldSet, unused) \
+ M(SuspendCheck, unused) \
+ M(TypeConversion, unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+ M(BitwiseNegatedRight, unused) \
+ M(MultiplyAccumulate, unused) \
+ M(IntermediateAddress, unused) \
+ M(IntermediateAddressIndex, unused) \
+ M(DataProcWithShifterOp, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused) \
+ void Visit##type(H##type* instruction) override;
+
+ FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+ bool CanGenerateTest(HCondition* cond);
+ void HandleGenerateConditionWithZero(IfCondition cond);
+ void HandleGenerateLongTestConstant(HCondition* cond);
+ void HandleGenerateLongTest(HCondition* cond);
+ void HandleGenerateLongComparesAndJumps();
+ void HandleGenerateTest(HCondition* cond);
+ void HandleGenerateConditionGeneric(HCondition* cond);
+ void HandleGenerateEqualLong(HCondition* cond);
+ void HandleGenerateConditionLong(HCondition* cond);
+ void HandleGenerateConditionIntegralOrNonPrimitive(HCondition* cond);
+ void HandleCondition(HCondition* instr);
+ void HandleBinaryOperationLantencies(HBinaryOperation* instr);
+ void HandleBitwiseOperationLantencies(HBinaryOperation* instr);
+ void HandleShiftLatencies(HBinaryOperation* instr);
+ void HandleDivRemConstantIntegralLatencies(int32_t imm);
+ void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+ void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+ void HandleGenerateDataProcInstruction(bool internal_latency = false);
+ void HandleGenerateDataProc(HDataProcWithShifterOp* instruction);
+ void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction);
+
+ // The latency setting for each HInstruction depends on how CodeGenerator may generate code,
+ // latency visitors may query CodeGenerator for such information for accurate latency settings.
+ CodeGeneratorARMVIXL* codegen_;
+};
+
void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) {
switch (instr->GetResultType()) {
case DataType::Type::kInt64:
@@ -1153,5 +1264,28 @@ void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) {
}
}
+bool HSchedulerARM::IsSchedulable(const HInstruction* instruction) const {
+ switch (instruction->GetKind()) {
+#define SCHEDULABLE_CASE(type, unused) \
+ case HInstruction::InstructionKind::k##type: \
+ return true;
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(SCHEDULABLE_CASE)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM(SCHEDULABLE_CASE)
+#undef SCHEDULABLE_CASE
+
+ default:
+ return HScheduler::IsSchedulable(instruction);
+ }
+}
+
+std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> HSchedulerARM::BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) {
+ SchedulingLatencyVisitorARM latency_visitor(codegen_);
+ return HScheduler::BuildSchedulingGraph(
+ block, allocator, heap_location_collector, &latency_visitor);
+}
+
} // namespace arm
} // namespace art
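
The `IsSchedulable()` switch above is generated by reusing the FOR_EACH_* X-macros: each list entry is stamped into a `case ...: return true;` label, and anything not listed falls through to `HScheduler::IsSchedulable()`. A self-contained toy version of that expansion technique follows; the enum values, list contents, and single-parameter macro are made up for illustration (the real lists enumerate `HInstruction` kinds and take a second unused parameter).

#include <iostream>

enum class Kind { kAdd, kMul, kDiv, kOther };

// Toy X-macro list; the real lists enumerate HInstruction kinds.
#define FOR_EACH_SCHEDULED_DEMO_INSTRUCTION(M) \
  M(Add)                                       \
  M(Mul)                                       \
  M(Div)

bool IsSchedulableDemo(Kind kind) {
  switch (kind) {
#define SCHEDULABLE_CASE(type) \
    case Kind::k##type:        \
      return true;
    FOR_EACH_SCHEDULED_DEMO_INSTRUCTION(SCHEDULABLE_CASE)
#undef SCHEDULABLE_CASE
    default:
      return false;  // the real code defers to HScheduler::IsSchedulable()
  }
}

int main() {
  std::cout << IsSchedulableDemo(Kind::kMul) << " "
            << IsSchedulableDemo(Kind::kOther) << "\n";  // prints "1 0"
  return 0;
}
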
diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h
index cf00fa12a3..25eac1b2c4 100644
--- a/compiler/optimizing/scheduler_arm.h
+++ b/compiler/optimizing/scheduler_arm.h
@@ -18,144 +18,32 @@
#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_
#include "base/macros.h"
-#include "code_generator_arm_vixl.h"
#include "scheduler.h"
namespace art HIDDEN {
-namespace arm {
-// AArch32 instruction latencies.
-// We currently assume that all ARM CPUs share the same instruction latency list.
-// The following latencies were tuned based on performance experiments and
-// automatic tuning using differential evolution approach on various benchmarks.
-static constexpr uint32_t kArmIntegerOpLatency = 2;
-static constexpr uint32_t kArmFloatingPointOpLatency = 11;
-static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4;
-static constexpr uint32_t kArmMulIntegerLatency = 6;
-static constexpr uint32_t kArmMulFloatingPointLatency = 11;
-static constexpr uint32_t kArmDivIntegerLatency = 10;
-static constexpr uint32_t kArmDivFloatLatency = 20;
-static constexpr uint32_t kArmDivDoubleLatency = 25;
-static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11;
-static constexpr uint32_t kArmMemoryLoadLatency = 9;
-static constexpr uint32_t kArmMemoryStoreLatency = 9;
-static constexpr uint32_t kArmMemoryBarrierLatency = 6;
-static constexpr uint32_t kArmBranchLatency = 4;
-static constexpr uint32_t kArmCallLatency = 5;
-static constexpr uint32_t kArmCallInternalLatency = 29;
-static constexpr uint32_t kArmLoadStringInternalLatency = 10;
-static constexpr uint32_t kArmNopLatency = 2;
-static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18;
-static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46;
-
-class SchedulingLatencyVisitorARM final : public SchedulingLatencyVisitor {
- public:
- explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen)
- : codegen_(down_cast<CodeGeneratorARMVIXL*>(codegen)) {}
-
- // Default visitor for instructions not handled specifically below.
- void VisitInstruction([[maybe_unused]] HInstruction*) override {
- last_visited_latency_ = kArmIntegerOpLatency;
- }
-
-// We add a second unused parameter to be able to use this macro like the others
-// defined in `nodes.h`.
-#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \
- M(ArrayGet, unused) \
- M(ArrayLength, unused) \
- M(ArraySet, unused) \
- M(Add, unused) \
- M(Sub, unused) \
- M(And, unused) \
- M(Or, unused) \
- M(Ror, unused) \
- M(Xor, unused) \
- M(Shl, unused) \
- M(Shr, unused) \
- M(UShr, unused) \
- M(Mul, unused) \
- M(Div, unused) \
- M(Condition, unused) \
- M(Compare, unused) \
- M(BoundsCheck, unused) \
- M(InstanceFieldGet, unused) \
- M(InstanceFieldSet, unused) \
- M(InstanceOf, unused) \
- M(Invoke, unused) \
- M(LoadString, unused) \
- M(NewArray, unused) \
- M(NewInstance, unused) \
- M(Rem, unused) \
- M(StaticFieldGet, unused) \
- M(StaticFieldSet, unused) \
- M(SuspendCheck, unused) \
- M(TypeConversion, unused)
-
-#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
- M(BitwiseNegatedRight, unused) \
- M(MultiplyAccumulate, unused) \
- M(IntermediateAddress, unused) \
- M(IntermediateAddressIndex, unused) \
- M(DataProcWithShifterOp, unused)
-
-#define DECLARE_VISIT_INSTRUCTION(type, unused) \
- void Visit##type(H##type* instruction) override;
- FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
+class CodeGenerator;
-#undef DECLARE_VISIT_INSTRUCTION
-
- private:
- bool CanGenerateTest(HCondition* cond);
- void HandleGenerateConditionWithZero(IfCondition cond);
- void HandleGenerateLongTestConstant(HCondition* cond);
- void HandleGenerateLongTest(HCondition* cond);
- void HandleGenerateLongComparesAndJumps();
- void HandleGenerateTest(HCondition* cond);
- void HandleGenerateConditionGeneric(HCondition* cond);
- void HandleGenerateEqualLong(HCondition* cond);
- void HandleGenerateConditionLong(HCondition* cond);
- void HandleGenerateConditionIntegralOrNonPrimitive(HCondition* cond);
- void HandleCondition(HCondition* instr);
- void HandleBinaryOperationLantencies(HBinaryOperation* instr);
- void HandleBitwiseOperationLantencies(HBinaryOperation* instr);
- void HandleShiftLatencies(HBinaryOperation* instr);
- void HandleDivRemConstantIntegralLatencies(int32_t imm);
- void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info);
- void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info);
- void HandleGenerateDataProcInstruction(bool internal_latency = false);
- void HandleGenerateDataProc(HDataProcWithShifterOp* instruction);
- void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction);
-
- // The latency setting for each HInstruction depends on how CodeGenerator may generate code,
- // latency visitors may query CodeGenerator for such information for accurate latency settings.
- CodeGeneratorARMVIXL* codegen_;
-};
+namespace arm {
-class HSchedulerARM : public HScheduler {
+class HSchedulerARM final : public HScheduler {
public:
- HSchedulerARM(SchedulingNodeSelector* selector,
- SchedulingLatencyVisitorARM* arm_latency_visitor)
- : HScheduler(arm_latency_visitor, selector) {}
+ HSchedulerARM(SchedulingNodeSelector* selector, CodeGenerator* codegen)
+ : HScheduler(selector), codegen_(codegen) {}
~HSchedulerARM() override {}
- bool IsSchedulable(const HInstruction* instruction) const override {
-#define CASE_INSTRUCTION_KIND(type, unused) case \
- HInstruction::InstructionKind::k##type:
- switch (instruction->GetKind()) {
- FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
- return true;
- FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND)
- return true;
- default:
- return HScheduler::IsSchedulable(instruction);
- }
-#undef CASE_INSTRUCTION_KIND
- }
+ bool IsSchedulable(const HInstruction* instruction) const override;
+
+ protected:
+ std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) override;
private:
DISALLOW_COPY_AND_ASSIGN(HSchedulerARM);
+
+ CodeGenerator* const codegen_;
};
} // namespace arm
diff --git a/compiler/optimizing/scheduler_arm64.cc b/compiler/optimizing/scheduler_arm64.cc
index 5113cf446d..08b8a3fb78 100644
--- a/compiler/optimizing/scheduler_arm64.cc
+++ b/compiler/optimizing/scheduler_arm64.cc
@@ -23,6 +23,115 @@
namespace art HIDDEN {
namespace arm64 {
+static constexpr uint32_t kArm64MemoryLoadLatency = 5;
+static constexpr uint32_t kArm64MemoryStoreLatency = 3;
+
+static constexpr uint32_t kArm64CallInternalLatency = 10;
+static constexpr uint32_t kArm64CallLatency = 5;
+
+// AArch64 instruction latency.
+// We currently assume that all arm64 CPUs share the same instruction latency list.
+static constexpr uint32_t kArm64IntegerOpLatency = 2;
+static constexpr uint32_t kArm64FloatingPointOpLatency = 5;
+
+static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3;
+static constexpr uint32_t kArm64DivDoubleLatency = 30;
+static constexpr uint32_t kArm64DivFloatLatency = 15;
+static constexpr uint32_t kArm64DivIntegerLatency = 5;
+static constexpr uint32_t kArm64LoadStringInternalLatency = 7;
+static constexpr uint32_t kArm64MulFloatingPointLatency = 6;
+static constexpr uint32_t kArm64MulIntegerLatency = 6;
+static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5;
+static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency;
+
+static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10;
+static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6;
+static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10;
+static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6;
+static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12;
+static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12;
+static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16;
+static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60;
+static constexpr uint32_t kArm64SIMDDivFloatLatency = 30;
+static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10;
+
+class SchedulingLatencyVisitorARM64 final : public SchedulingLatencyVisitor {
+ public:
+ // Default visitor for instructions not handled specifically below.
+ void VisitInstruction([[maybe_unused]] HInstruction*) override {
+ last_visited_latency_ = kArm64IntegerOpLatency;
+ }
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \
+ M(ArrayGet , unused) \
+ M(ArrayLength , unused) \
+ M(ArraySet , unused) \
+ M(BoundsCheck , unused) \
+ M(Div , unused) \
+ M(InstanceFieldGet , unused) \
+ M(InstanceOf , unused) \
+ M(LoadString , unused) \
+ M(Mul , unused) \
+ M(NewArray , unused) \
+ M(NewInstance , unused) \
+ M(Rem , unused) \
+ M(StaticFieldGet , unused) \
+ M(SuspendCheck , unused) \
+ M(TypeConversion , unused) \
+ M(VecReplicateScalar , unused) \
+ M(VecExtractScalar , unused) \
+ M(VecReduce , unused) \
+ M(VecCnv , unused) \
+ M(VecNeg , unused) \
+ M(VecAbs , unused) \
+ M(VecNot , unused) \
+ M(VecAdd , unused) \
+ M(VecHalvingAdd , unused) \
+ M(VecSub , unused) \
+ M(VecMul , unused) \
+ M(VecDiv , unused) \
+ M(VecMin , unused) \
+ M(VecMax , unused) \
+ M(VecAnd , unused) \
+ M(VecAndNot , unused) \
+ M(VecOr , unused) \
+ M(VecXor , unused) \
+ M(VecShl , unused) \
+ M(VecShr , unused) \
+ M(VecUShr , unused) \
+ M(VecSetScalars , unused) \
+ M(VecMultiplyAccumulate, unused) \
+ M(VecLoad , unused) \
+ M(VecStore , unused)
+
+#define FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(M) \
+ M(BinaryOperation , unused) \
+ M(Invoke , unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+ M(BitwiseNegatedRight, unused) \
+ M(MultiplyAccumulate, unused) \
+ M(IntermediateAddress, unused) \
+ M(IntermediateAddressIndex, unused) \
+ M(DataProcWithShifterOp, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused) \
+ void Visit##type(H##type* instruction) override;
+
+ FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+ void HandleSimpleArithmeticSIMD(HVecOperation *instr);
+ void HandleVecAddress(HVecMemoryOperation* instruction, size_t size);
+};
+
void SchedulingLatencyVisitorARM64::VisitBinaryOperation(HBinaryOperation* instr) {
last_visited_latency_ = DataType::IsFloatingPointType(instr->GetResultType())
? kArm64FloatingPointOpLatency
@@ -348,5 +457,30 @@ void SchedulingLatencyVisitorARM64::VisitVecStore(HVecStore* instr) {
last_visited_latency_ = kArm64SIMDMemoryStoreLatency;
}
+bool HSchedulerARM64::IsSchedulable(const HInstruction* instruction) const {
+ switch (instruction->GetKind()) {
+#define SCHEDULABLE_CASE(type, unused) \
+ case HInstruction::InstructionKind::k##type: \
+ return true;
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(SCHEDULABLE_CASE)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM64(SCHEDULABLE_CASE)
+ FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(SCHEDULABLE_CASE)
+#undef SCHEDULABLE_CASE
+
+ default:
+ return HScheduler::IsSchedulable(instruction);
+ }
+}
+
+std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>>
+HSchedulerARM64::BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) {
+ SchedulingLatencyVisitorARM64 latency_visitor;
+ return HScheduler::BuildSchedulingGraph(
+ block, allocator, heap_location_collector, &latency_visitor);
+}
+
} // namespace arm64
} // namespace art
diff --git a/compiler/optimizing/scheduler_arm64.h b/compiler/optimizing/scheduler_arm64.h
index 7ce00e00ab..044aa48a5a 100644
--- a/compiler/optimizing/scheduler_arm64.h
+++ b/compiler/optimizing/scheduler_arm64.h
@@ -23,137 +23,13 @@
namespace art HIDDEN {
namespace arm64 {
-static constexpr uint32_t kArm64MemoryLoadLatency = 5;
-static constexpr uint32_t kArm64MemoryStoreLatency = 3;
-
-static constexpr uint32_t kArm64CallInternalLatency = 10;
-static constexpr uint32_t kArm64CallLatency = 5;
-
-// AArch64 instruction latency.
-// We currently assume that all arm64 CPUs share the same instruction latency list.
-static constexpr uint32_t kArm64IntegerOpLatency = 2;
-static constexpr uint32_t kArm64FloatingPointOpLatency = 5;
-
-
-static constexpr uint32_t kArm64DataProcWithShifterOpLatency = 3;
-static constexpr uint32_t kArm64DivDoubleLatency = 30;
-static constexpr uint32_t kArm64DivFloatLatency = 15;
-static constexpr uint32_t kArm64DivIntegerLatency = 5;
-static constexpr uint32_t kArm64LoadStringInternalLatency = 7;
-static constexpr uint32_t kArm64MulFloatingPointLatency = 6;
-static constexpr uint32_t kArm64MulIntegerLatency = 6;
-static constexpr uint32_t kArm64TypeConversionFloatingPointIntegerLatency = 5;
-static constexpr uint32_t kArm64BranchLatency = kArm64IntegerOpLatency;
-
-static constexpr uint32_t kArm64SIMDFloatingPointOpLatency = 10;
-static constexpr uint32_t kArm64SIMDIntegerOpLatency = 6;
-static constexpr uint32_t kArm64SIMDMemoryLoadLatency = 10;
-static constexpr uint32_t kArm64SIMDMemoryStoreLatency = 6;
-static constexpr uint32_t kArm64SIMDMulFloatingPointLatency = 12;
-static constexpr uint32_t kArm64SIMDMulIntegerLatency = 12;
-static constexpr uint32_t kArm64SIMDReplicateOpLatency = 16;
-static constexpr uint32_t kArm64SIMDDivDoubleLatency = 60;
-static constexpr uint32_t kArm64SIMDDivFloatLatency = 30;
-static constexpr uint32_t kArm64SIMDTypeConversionInt2FPLatency = 10;
-
-class SchedulingLatencyVisitorARM64 final : public SchedulingLatencyVisitor {
- public:
- // Default visitor for instructions not handled specifically below.
- void VisitInstruction([[maybe_unused]] HInstruction*) override {
- last_visited_latency_ = kArm64IntegerOpLatency;
- }
-
-// We add a second unused parameter to be able to use this macro like the others
-// defined in `nodes.h`.
-#define FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(M) \
- M(ArrayGet , unused) \
- M(ArrayLength , unused) \
- M(ArraySet , unused) \
- M(BoundsCheck , unused) \
- M(Div , unused) \
- M(InstanceFieldGet , unused) \
- M(InstanceOf , unused) \
- M(LoadString , unused) \
- M(Mul , unused) \
- M(NewArray , unused) \
- M(NewInstance , unused) \
- M(Rem , unused) \
- M(StaticFieldGet , unused) \
- M(SuspendCheck , unused) \
- M(TypeConversion , unused) \
- M(VecReplicateScalar , unused) \
- M(VecExtractScalar , unused) \
- M(VecReduce , unused) \
- M(VecCnv , unused) \
- M(VecNeg , unused) \
- M(VecAbs , unused) \
- M(VecNot , unused) \
- M(VecAdd , unused) \
- M(VecHalvingAdd , unused) \
- M(VecSub , unused) \
- M(VecMul , unused) \
- M(VecDiv , unused) \
- M(VecMin , unused) \
- M(VecMax , unused) \
- M(VecAnd , unused) \
- M(VecAndNot , unused) \
- M(VecOr , unused) \
- M(VecXor , unused) \
- M(VecShl , unused) \
- M(VecShr , unused) \
- M(VecUShr , unused) \
- M(VecSetScalars , unused) \
- M(VecMultiplyAccumulate, unused) \
- M(VecLoad , unused) \
- M(VecStore , unused)
-
-#define FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(M) \
- M(BinaryOperation , unused) \
- M(Invoke , unused)
-
-#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
- M(BitwiseNegatedRight, unused) \
- M(MultiplyAccumulate, unused) \
- M(IntermediateAddress, unused) \
- M(IntermediateAddressIndex, unused) \
- M(DataProcWithShifterOp, unused)
-
-#define DECLARE_VISIT_INSTRUCTION(type, unused) \
- void Visit##type(H##type* instruction) override;
-
- FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_SCHEDULED_ABSTRACT_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
- FOR_EACH_CONCRETE_INSTRUCTION_ARM64(DECLARE_VISIT_INSTRUCTION)
-
-#undef DECLARE_VISIT_INSTRUCTION
-
- private:
- void HandleSimpleArithmeticSIMD(HVecOperation *instr);
- void HandleVecAddress(HVecMemoryOperation* instruction, size_t size);
-};
-
class HSchedulerARM64 : public HScheduler {
public:
explicit HSchedulerARM64(SchedulingNodeSelector* selector)
- : HScheduler(&arm64_latency_visitor_, selector) {}
+ : HScheduler(selector) {}
~HSchedulerARM64() override {}
- bool IsSchedulable(const HInstruction* instruction) const override {
-#define CASE_INSTRUCTION_KIND(type, unused) case \
- HInstruction::InstructionKind::k##type:
- switch (instruction->GetKind()) {
- FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
- return true;
- FOR_EACH_CONCRETE_INSTRUCTION_ARM64(CASE_INSTRUCTION_KIND)
- return true;
- FOR_EACH_SCHEDULED_COMMON_INSTRUCTION(CASE_INSTRUCTION_KIND)
- return true;
- default:
- return HScheduler::IsSchedulable(instruction);
- }
-#undef CASE_INSTRUCTION_KIND
- }
+ bool IsSchedulable(const HInstruction* instruction) const override;
// Treat as scheduling barriers those vector instructions whose live ranges exceed the vectorized
// loop boundaries. This is a workaround for the lack of notion of SIMD register in the compiler;
@@ -169,8 +45,13 @@ class HSchedulerARM64 : public HScheduler {
instr->IsVecReplicateScalar();
}
+ protected:
+ std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) override;
+
private:
- SchedulingLatencyVisitorARM64 arm64_latency_visitor_;
DISALLOW_COPY_AND_ASSIGN(HSchedulerARM64);
};
diff --git a/compiler/optimizing/scheduler_test.cc b/compiler/optimizing/scheduler_test.cc
index c2b1fd6f7c..0b020f1460 100644
--- a/compiler/optimizing/scheduler_test.cc
+++ b/compiler/optimizing/scheduler_test.cc
@@ -406,15 +406,13 @@ TEST_F(SchedulerTest, ArrayAccessAliasingARM64) {
#if defined(ART_ENABLE_CODEGEN_arm)
TEST_F(SchedulerTest, DependencyGraphAndSchedulerARM) {
CriticalPathSchedulingNodeSelector critical_path_selector;
- arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
- arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(&critical_path_selector, /*codegen=*/ nullptr);
TestBuildDependencyGraphAndSchedule(&scheduler);
}
TEST_F(SchedulerTest, ArrayAccessAliasingARM) {
CriticalPathSchedulingNodeSelector critical_path_selector;
- arm::SchedulingLatencyVisitorARM arm_latency_visitor(/*CodeGenerator*/ nullptr);
- arm::HSchedulerARM scheduler(&critical_path_selector, &arm_latency_visitor);
+ arm::HSchedulerARM scheduler(&critical_path_selector, /*codegen=*/ nullptr);
TestDependencyGraphOnAliasingArrayAccesses(&scheduler);
}
#endif