Instruction scheduling for ARM.

Performance improvements on various benchmarks with this CL:

benchmarks     improvements
---------------------------
algorithm                1%
benchmarksgame           2%
caffeinemark             2%
math                     3%
stanford                 4%

Tested on an ARM Cortex-A53 CPU.

The code size impact is negligible.

Test: m test-art-host
Test: m test-art-target
Change-Id: I314c90c09ce27e3d224fc686ef73c7d94a6b5a2c
diff --git a/compiler/optimizing/scheduler_arm.h b/compiler/optimizing/scheduler_arm.h
new file mode 100644
index 0000000..8d5e4f3
--- /dev/null
+++ b/compiler/optimizing/scheduler_arm.h
+@@ -0,0 +1,197 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_
+#define ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_
+
+#include "code_generator_arm_vixl.h"
+#include "scheduler.h"
+
+namespace art {
+namespace arm {
+#ifdef ART_USE_OLD_ARM_BACKEND
+typedef CodeGeneratorARM CodeGeneratorARMType;
+#else
+typedef CodeGeneratorARMVIXL CodeGeneratorARMType;
+#endif
+
+// AArch32 instruction latencies.
+// We currently assume that all ARM CPUs share the same instruction latency list.
+// The following latencies were derived from performance experiments and from
+// automatic tuning with a differential evolution approach on various benchmarks.
+static constexpr uint32_t kArmIntegerOpLatency = 2;
+static constexpr uint32_t kArmFloatingPointOpLatency = 11;
+static constexpr uint32_t kArmDataProcWithShifterOpLatency = 4;
+static constexpr uint32_t kArmMulIntegerLatency = 6;
+static constexpr uint32_t kArmMulFloatingPointLatency = 11;
+static constexpr uint32_t kArmDivIntegerLatency = 10;
+static constexpr uint32_t kArmDivFloatLatency = 20;
+static constexpr uint32_t kArmDivDoubleLatency = 25;
+static constexpr uint32_t kArmTypeConversionFloatingPointIntegerLatency = 11;
+static constexpr uint32_t kArmMemoryLoadLatency = 9;
+static constexpr uint32_t kArmMemoryStoreLatency = 9;
+static constexpr uint32_t kArmMemoryBarrierLatency = 6;
+static constexpr uint32_t kArmBranchLatency = 4;
+static constexpr uint32_t kArmCallLatency = 5;
+static constexpr uint32_t kArmCallInternalLatency = 29;
+static constexpr uint32_t kArmLoadStringInternalLatency = 10;
+static constexpr uint32_t kArmNopLatency = 2;
+static constexpr uint32_t kArmLoadWithBakerReadBarrierLatency = 18;
+static constexpr uint32_t kArmRuntimeTypeCheckLatency = 46;
+
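+// A visitor for an HInstruction that expands to several machine instructions
+// accumulates the cost of the intermediate instructions in
+// `last_visited_internal_latency_` and reports the latency of the final,
+// result-producing instruction in `last_visited_latency_`. For illustration,
+// a load preceded by address computation might be modeled as follows (a
+// sketch of the convention, not actual code from this CL):
+//
+//   last_visited_internal_latency_ += kArmIntegerOpLatency;  // address setup
+//   last_visited_latency_ = kArmMemoryLoadLatency;           // the load itself
+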
+class SchedulingLatencyVisitorARM : public SchedulingLatencyVisitor {
+ public:
+  explicit SchedulingLatencyVisitorARM(CodeGenerator* codegen)
+      : codegen_(down_cast<CodeGeneratorARMType*>(codegen)) {}
+
+  // Default visitor for instructions not handled specifically below.
+  void VisitInstruction(HInstruction* ATTRIBUTE_UNUSED) OVERRIDE {
+    last_visited_latency_ = kArmIntegerOpLatency;
+  }
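+
+  // For illustration, a concrete visitor (defined in scheduler_arm.cc) would
+  // typically pick a latency based on the instruction's result type. A
+  // minimal sketch, assuming the usual `Primitive` helpers, and not
+  // necessarily the actual implementation:
+  //
+  //   void SchedulingLatencyVisitorARM::VisitAdd(HAdd* instr) {
+  //     last_visited_latency_ =
+  //         Primitive::IsFloatingPointType(instr->GetResultType())
+  //             ? kArmFloatingPointOpLatency
+  //             : kArmIntegerOpLatency;
+  //   }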
+
+// We add a second unused parameter to be able to use this macro like the others
+// defined in `nodes.h`.
+#define FOR_EACH_SCHEDULED_ARM_INSTRUCTION(M)    \
+  M(ArrayGet         , unused)                   \
+  M(ArrayLength      , unused)                   \
+  M(ArraySet         , unused)                   \
+  M(Add              , unused)                   \
+  M(Sub              , unused)                   \
+  M(And              , unused)                   \
+  M(Or               , unused)                   \
+  M(Ror              , unused)                   \
+  M(Xor              , unused)                   \
+  M(Shl              , unused)                   \
+  M(Shr              , unused)                   \
+  M(UShr             , unused)                   \
+  M(Mul              , unused)                   \
+  M(Div              , unused)                   \
+  M(Condition        , unused)                   \
+  M(Compare          , unused)                   \
+  M(BoundsCheck      , unused)                   \
+  M(InstanceFieldGet , unused)                   \
+  M(InstanceFieldSet , unused)                   \
+  M(InstanceOf       , unused)                   \
+  M(Invoke           , unused)                   \
+  M(LoadString       , unused)                   \
+  M(NewArray         , unused)                   \
+  M(NewInstance      , unused)                   \
+  M(Rem              , unused)                   \
+  M(StaticFieldGet   , unused)                   \
+  M(StaticFieldSet   , unused)                   \
+  M(SuspendCheck     , unused)                   \
+  M(TypeConversion   , unused)
+
+#define FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(M) \
+  M(BitwiseNegatedRight, unused)                 \
+  M(MultiplyAccumulate, unused)                  \
+  M(IntermediateAddress, unused)                 \
+  M(DataProcWithShifterOp, unused)
+
+#define DECLARE_VISIT_INSTRUCTION(type, unused)  \
+  void Visit##type(H##type* instruction) OVERRIDE;
+
+  FOR_EACH_SCHEDULED_ARM_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+  FOR_EACH_CONCRETE_INSTRUCTION_ARM(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+  void HandleBinaryOperationLatencies(HBinaryOperation* instr);
+  void HandleBitwiseOperationLatencies(HBinaryOperation* instr);
+  void HandleShiftLatencies(HBinaryOperation* instr);
+  void HandleDivRemConstantIntegralLatencies(int32_t imm);
+  void HandleFieldSetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+  void HandleFieldGetLatencies(HInstruction* instruction, const FieldInfo& field_info);
+  void HandleGenerateDataProcInstruction(bool internal_latency = false);
+  void HandleGenerateDataProc(HDataProcWithShifterOp* instruction);
+  void HandleGenerateLongDataProc(HDataProcWithShifterOp* instruction);
+
+  // The latency assigned to an HInstruction depends on how the code generator
+  // will emit code for it, so latency visitors may query the code generator
+  // for that information in order to set latencies accurately.
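+  // For example (a hypothetical illustration, not necessarily how this CL
+  // implements it): an integer HDiv may be emitted as a hardware `sdiv` or as
+  // a call to a runtime stub, depending on
+  // `codegen_->GetInstructionSetFeatures().HasDivideInstruction()`, so the
+  // reported latency should differ accordingly:
+  //
+  //   last_visited_latency_ = has_hw_div ? kArmDivIntegerLatency
+  //                                      : kArmCallInternalLatency;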
+  CodeGeneratorARMType* codegen_;
+};
+
+class HSchedulerARM : public HScheduler {
+ public:
+  HSchedulerARM(ArenaAllocator* arena,
+                SchedulingNodeSelector* selector,
+                SchedulingLatencyVisitorARM* arm_latency_visitor)
+      : HScheduler(arena, arm_latency_visitor, selector) {}
+  ~HSchedulerARM() OVERRIDE {}
+
+  bool IsSchedulable(const HInstruction* instruction) const OVERRIDE {
+#define CASE_INSTRUCTION_KIND(type, unused) \
+  case HInstruction::InstructionKind::k##type:
+    switch (instruction->GetKind()) {
+      FOR_EACH_SCHEDULED_SHARED_INSTRUCTION(CASE_INSTRUCTION_KIND)
+        return true;
+      FOR_EACH_CONCRETE_INSTRUCTION_ARM(CASE_INSTRUCTION_KIND)
+        return true;
+      default:
+        return HScheduler::IsSchedulable(instruction);
+    }
+#undef CASE_INSTRUCTION_KIND
+  }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HSchedulerARM);
+};
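+
+// For illustration only: a hypothetical way a compiler pass could wire up the
+// ARM scheduler (a sketch; the actual instantiation lives outside this
+// header):
+//
+//   SchedulingLatencyVisitorARM arm_latency_visitor(codegen);
+//   CriticalPathSchedulingNodeSelector critical_path_selector;
+//   HSchedulerARM scheduler(arena, &critical_path_selector, &arm_latency_visitor);
+//   scheduler.Schedule(graph);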
+
+}  // namespace arm
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_SCHEDULER_ARM_H_