author Vladimir Marko <vmarko@google.com> 2024-02-12 11:17:28 +0100
committer Vladimír Marko <vmarko@google.com> 2024-02-13 15:28:35 +0000
commit 77e5997b524a133d38585da8bf58420f2411f7ad (patch)
tree 0273c71559a03593b358f04ef060214796159863 /compiler/optimizing/scheduler_arm.cc
parent 2389869ed372eebd886c2f984f23ec7e342da22b (diff)
Optimizing: Refactor `HScheduler`.
Move `SchedulingLatencyVisitor{ARM,ARM64}` to .cc files. Test: m test-art-host-gtest Test: testrunner.py --host --optimizing Test: run-gtests.sh Test: testrunner.py --target --optimizing Change-Id: I15cb1a4cbef00a328fec947189412c502bf80f46
Diffstat (limited to 'compiler/optimizing/scheduler_arm.cc')
-rw-r--r--  compiler/optimizing/scheduler_arm.cc  134
1 file changed, 134 insertions, 0 deletions
diff --git a/compiler/optimizing/scheduler_arm.cc b/compiler/optimizing/scheduler_arm.cc
index 510a0f5496..3ee6f06b46 100644
--- a/compiler/optimizing/scheduler_arm.cc
+++ b/compiler/optimizing/scheduler_arm.cc
@@ -17,6 +17,7 @@
#include "scheduler_arm.h"
#include "arch/arm/instruction_set_features_arm.h"
+#include "code_generator_arm_vixl.h"
#include "code_generator_utils.h"
#include "common_arm.h"
#include "heap_poisoning.h"
@@ -29,6 +30,116 @@ namespace arm {
using helpers::Int32ConstantFrom;
using helpers::Uint64ConstantFrom;
+// AArch32 instruction latencies.
+// We currently assume that all ARM CPUs share the same instruction latency list.
+// The following latencies were tuned through performance experiments and
+// automatic tuning using a differential evolution approach on various benchmarks.
+static constexpr uint32_t kArmIntegerOpLatency = 2;
+static constexpr uint32_t kArmFloatingPointOpLatency = 11;
+static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4;
+static constexpr uint32_t kArmMulIntegerLatency = 6;
+static constexpr uint32_t kArmMulFloatingPointLatency = 11;
+static constexpr uint32_t kArmDivIntegerLatency = 10;
+static constexpr uint32_t kArmDivFloatLatency = 20;
+static constexpr uint32_t kArmDivDoubleLatency = 25;
+static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11;
+static constexpr uint32_t kArmMemoryLoadLatency = 9;
+static constexpr uint32_t kArmMemoryStoreLatency = 9;
+static constexpr uint32_t kArmMemoryBarrierLatency = 6;
+static constexpr uint32_t kArmBranchLatency = 4;
+static constexpr uint32_t kArmCallLatency = 5;
+static constexpr uint32_t kArmCallInternalLatency = 29;
+static constexpr uint32_t kArmLoadStringInternalLatency = 10;
+static constexpr uint32_t kArmNopLatency = 2;
+static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18;
+static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46;
+
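These constants are consumed by the visitor below, which records one latency per visited instruction. As a toy illustration (not ART code) of how such per-node latencies typically drive list scheduling, a scheduler can rank nodes by the latency-weighted length of the longest path from each node to the end of the block:

    // critical_path.cc -- toy sketch; Node and CriticalPaths are invented names.
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct Node {
      uint32_t latency;             // e.g. 2 for an integer op, 9 for a load
      std::vector<int> successors;  // indices of nodes that depend on this one
    };

    // Nodes are assumed topologically ordered (dependencies before users).
    std::vector<uint32_t> CriticalPaths(const std::vector<Node>& nodes) {
      std::vector<uint32_t> path(nodes.size(), 0);
      for (int i = static_cast<int>(nodes.size()) - 1; i >= 0; --i) {
        uint32_t longest_successor_path = 0;
        for (int succ : nodes[i].successors) {
          longest_successor_path = std::max(longest_successor_path, path[succ]);
        }
        // A node's priority is its own latency plus the longest chain after
        // it, so long dependency chains get scheduled early.
        path[i] = nodes[i].latency + longest_successor_path;
      }
      return path;
    }

    int main() {
      // add (latency 2) feeding mul (latency 6): critical path of the add
      // is 2 + 6 = 8.
      std::vector<Node> nodes = {{2, {1}}, {6, {}}};
      return CriticalPaths(nodes)[0] == 8 ? 0 : 1;
    }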
+class SchedulingLatencyVisitorARM final : public SchedulingLatencyVisitor {
+ public:
+ explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen)
+ : codegen_(down_cast<CodeGeneratorARMVIXL*>(codegen)) {}
+
+ // Default visitor for instructions not handled specifically below.
+ void VisitInstruction([[maybe_unused]] HInstruction* instruction) override {
+ last_visited_latency_ = kArmIntegerOpLatency;
+ }
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M) \
+ M(ArrayGet, unused) \
+ M(ArrayLength, unused) \
+ M(ArraySet, unused) \
+ M(Add, unused) \
+ M(Sub, unused) \
+ M(And, unused) \
+ M(Or, unused) \
+ M(Ror, unused) \
+ M(Xor, unused) \
+ M(Shl, unused) \
+ M(Shr, unused) \
+ M(UShr, unused) \
+ M(Mul, unused) \
+ M(Div, unused) \
+ M(Condition, unused) \
+ M(Compare, unused) \
+ M(BoundsCheck, unused) \
+ M(InstanceFieldGet, unused) \
+ M(InstanceFieldSet, unused) \
+ M(InstanceOf, unused) \
+ M(Invoke, unused) \
+ M(LoadString, unused) \
+ M(NewArray, unused) \
+ M(NewInstance, unused) \
+ M(Rem, unused) \
+ M(StaticFieldGet, unused) \
+ M(StaticFieldSet, unused) \
+ M(SuspendCheck, unused) \
+ M(TypeConversion, unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+ M(BitwiseNegatedRight, unused) \
+ M(MultiplyAccumulate, unused) \
+ M(IntermediateAddress, unused) \
+ M(IntermediateAddressIndex, unused) \
+ M(DataProcWithShifterOp, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused) \
+ void Visit##type(H##type* instruction) override;
+
+ FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+ bool CanGenerateTest(HCondition* cond);
+ void HandleGenerateConditionWithZero(IfCondition cond);
+ void HandleGenerateLongTestConstant(HCondition* cond);
+ void HandleGenerateLongTest(HCondition* cond);
+ void HandleGenerateLongComparesAndJumps();
+ void HandleGenerateTest(HCondition* cond);
+ void HandleGenerateConditionGeneric(HCondition* cond);
+ void HandleGenerateEqualLong(HCondition* cond);
+ void HandleGenerateConditionLong(HCondition* cond);
+ void HandleGenerateConditionIntegralOrNonPrimitive(HCondition* cond);
+ void HandleCondition(HCondition* instr);
+ void HandleBinaryOperationLantencies(HBinaryOperation* instr);
+ void HandleBitwiseOperationLantencies(HBinaryOperation* instr);
+ void HandleShiftLatencies(HBinaryOperation* instr);
+ void HandleDivRemConstantIntegralLatencies(int32_t imm);
+ void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+ void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+ void HandleGenerateDataProcInstruction(bool internal_latency = false);
+ void HandleGenerateDataProc(HDataProcWithShifterOp* instruction);
+ void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction);
+
+ // The latency setting for each HInstruction depends on how CodeGenerator may generate code;
+ // latency visitors may query CodeGenerator for such information to set latencies accurately.
+ CodeGeneratorARMVIXL* codegen_;
+};
+
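The FOR_EACH_* lists above are X-macros: the same instruction list is expanded with different callbacks to generate the Visit##type declarations here and the switch cases in IsSchedulable further down, so the list is written only once. A self-contained sketch of the technique, with generic names (the two-argument form mirrors the `M(type, unused)` shape used in this file):

    // x_macro.cc -- generic illustration of the X-macro pattern, not ART code.
    #include <cstdio>

    #define FOR_EACH_OP(M) \
      M(Add, unused)       \
      M(Mul, unused)

    // Expansion 1: generate one declaration per listed op.
    #define DECLARE_HANDLER(type, unused) void Handle##type();
    FOR_EACH_OP(DECLARE_HANDLER)  // void HandleAdd(); void HandleMul();
    #undef DECLARE_HANDLER

    void HandleAdd() { std::puts("add"); }
    void HandleMul() { std::puts("mul"); }

    // Expansion 2: reuse the same list to generate calls.
    #define CALL_HANDLER(type, unused) Handle##type();
    int main() {
      FOR_EACH_OP(CALL_HANDLER)  // HandleAdd(); HandleMul();
    }
    #undef CALL_HANDLER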
void SchedulingLatencyVisitorARM::HandleBinaryOperationLantencies(HBinaryOperation* instr) {
switch (instr->GetResultType()) {
case DataType::Type::kInt64:
@@ -1153,5 +1264,28 @@ void SchedulingLatencyVisitorARM::VisitTypeConversion(HTypeConversion* instr) {
}
}
+bool HSchedulerARM::IsSchedulable(const HInstruction* instruction) const {
+ switch (instruction->GetKind()) {
+#define SCHEDULABLE_CASE(type, unused) \
+ case HInstruction::InstructionKind::k##type: \
+ return true;
+ FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(SCHEDULABLE_CASE)
+ FOR_EACH_CONCRETE_INSTRUCTION_ARM(SCHEDULABLE_CASE)
+#undef SCHEDULABLE_CASE
+
+ default:
+ return HScheduler::IsSchedulable(instruction);
+ }
+}
+
+std::pair<SchedulingGraph, ScopedArenaVector<SchedulingNode*>> HSchedulerARM::BuildSchedulingGraph(
+ HBasicBlock* block,
+ ScopedArenaAllocator* allocator,
+ const HeapLocationCollector* heap_location_collector) {
+ SchedulingLatencyVisitorARM latency_visitor(codegen_);
+ return HScheduler::BuildSchedulingGraph(
+ block, allocator, heap_location_collector, &latency_visitor);
+}
+
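The design choice visible in the two functions above: graph construction stays in the HScheduler base class and takes the latency visitor as a parameter, so each backend keeps its visitor as a private detail of its own .cc file and instantiates it on the stack per call. A sketch of that shape under invented names (Scheduler, ArmScheduler, Graph are illustrative, not ART's API):

    // scheduler_shape.cc -- toy sketch of the structure, not ART code.
    struct Graph {};

    class LatencyVisitor {
     public:
      virtual ~LatencyVisitor() = default;
    };

    class Scheduler {
     protected:
      // Shared implementation: builds the graph with whatever visitor it is given.
      Graph BuildGraph(LatencyVisitor* visitor) {
        (void)visitor;  // graph construction elided in this toy
        return Graph{};
      }
    };

    class ArmScheduler : public Scheduler {
     public:
      Graph BuildGraph() {
        // The concrete visitor never escapes this .cc file; a short-lived
        // stack instance suffices, exactly as in the function above.
        class ArmLatencyVisitor final : public LatencyVisitor {};
        ArmLatencyVisitor visitor;
        return Scheduler::BuildGraph(&visitor);
      }
    };

    int main() {
      ArmScheduler scheduler;
      (void)scheduler.BuildGraph();
    }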
} // namespace arm
} // namespace art