Emit vector multiply and accumulate instructions for x86.

This patch adds a new CPU variant named kabylake and performs instruction simplification to generate VectorMultiplyAccumulate. Test: ./test.py --host --64 Change-Id: Ie6cc882dadf1322dd4d3ae49bfdb600b0c447765 Signed-off-by: Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com>
author: Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com> 2018-06-29 13:06:35 +0530
committer: Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com> 2018-07-02 15:37:38 +0530
commit: 61908880e6565acfadbafe93fa64de000014f1a6 (patch)
tree: 40b535db9175f3d959364d5bc30eaab4e2c4b4c4 /compiler/optimizing
parent: b5271dd44a30f498689e503340d3c8d01bf31f07 (diff)
8 files changed, 314 insertions, 12 deletions
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 086ae07a06..58808769e2 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1125,13 +1125,59 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in
   }
 }
 
-void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
+// All three inputs go in FPU registers; the output reuses the accumulator (first input).
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr);
+  switch (instr->GetPackedType()) {
+    case DataType::Type::kFloat32:
+    case DataType::Type::kFloat64:
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
+      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    default:
+      // Only single- and double-precision floating-point vectors are supported;
+      // integral types are not converted to VecMultiplyAccumulate yet.
+      LOG(FATAL) << "Unsupported SIMD Type";
+  }
 }
 
-void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  // TODO: pmaddwd?
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+// Emits a 231-form FMA chosen by op kind (kAdd: vfmadd, otherwise vfmsub) into the accumulator.
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = instr->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister accumulator = locations->InAt(
+      HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>();
+  XmmRegister mul_left = locations->InAt(
+      HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>();
+  XmmRegister mul_right = locations->InAt(
+      HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>();
+  switch (instr->GetPackedType()) {
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) {
+        __ vfmadd231ps(accumulator, mul_left, mul_right);
+      } else {
+        __ vfmsub231ps(accumulator, mul_left, mul_right);
+      }
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) {
+        __ vfmadd231pd(accumulator, mul_left, mul_right);
+      } else {
+        __ vfmsub231pd(accumulator, mul_left, mul_right);
+      }
+      break;
+    default:
+      // Only float and double vectors are supported; integral types are not converted yet.
+      LOG(FATAL) << "Unsupported SIMD Type";
+  }
 }
 
 void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 4d31ab68d1..4795e86933 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1098,13 +1098,61 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in
   }
 }
 
-void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
+// Requests FPU registers for all three inputs and ties the output to the accumulator input.
+void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* summary = new (GetGraph()->GetAllocator()) LocationSummary(instr);
+  const DataType::Type packed_type = instr->GetPackedType();
+  const bool is_fp_vector = packed_type == DataType::Type::kFloat32 ||
+                            packed_type == DataType::Type::kFloat64;
+  if (!is_fp_vector) {
+    // Only single- and double-precision floating-point vectors are handled;
+    // integral types are not converted to VecMultiplyAccumulate yet.
+    LOG(FATAL) << "Unsupported SIMD type";
+    return;
+  }
+
+  const Location fpu_register = Location::RequiresFpuRegister();
+  summary->SetInAt(HVecMultiplyAccumulate::kInputAccumulatorIndex, fpu_register);
+  summary->SetInAt(HVecMultiplyAccumulate::kInputMulLeftIndex, fpu_register);
+  summary->SetInAt(HVecMultiplyAccumulate::kInputMulRightIndex, fpu_register);
+  // SameAsFirstInput() below only aliases the accumulator if it is input 0.
+  DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+  summary->SetOut(Location::SameAsFirstInput());
 }
 
-void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  // TODO: pmaddwd?
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+// Emits a 231-form FMA chosen by op kind (kAdd: vfmadd, otherwise vfmsub) into the accumulator.
+void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = instr->GetLocations();
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+  XmmRegister accumulator = locations->InAt(
+      HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>();
+  XmmRegister mul_left = locations->InAt(
+      HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>();
+  XmmRegister mul_right = locations->InAt(
+      HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>();
+
+  switch (instr->GetPackedType()) {
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) {
+        __ vfmadd231ps(accumulator, mul_left, mul_right);
+      } else {
+        __ vfmsub231ps(accumulator, mul_left, mul_right);
+      }
+      break;
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) {
+        __ vfmadd231pd(accumulator, mul_left, mul_right);
+      } else {
+        __ vfmsub231pd(accumulator, mul_left, mul_right);
+      }
+      break;
+    default:
+      // Only single- and double-precision floating-point vectors are supported;
+      // integral types are not converted to VecMultiplyAccumulate yet.
+      LOG(FATAL) << "Unsupported SIMD Type";
+  }
 }
 
 void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc
new file mode 100644
index 0000000000..b3f67d6e84
--- /dev/null
+++ b/compiler/optimizing/instruction_simplifier_x86.cc
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "instruction_simplifier_x86.h"
+#include "arch/x86/instruction_set_features_x86.h"
+#include "mirror/array-inl.h"
+#include "code_generator.h"
+
+
+namespace art {
+
+namespace x86 {
+
+// Graph visitor that tries to merge vector multiplies into HVecMultiplyAccumulate on x86.
+class InstructionSimplifierX86Visitor : public HGraphVisitor {
+ public:
+  InstructionSimplifierX86Visitor(HGraph* graph,
+                                  CodeGeneratorX86* codegen,
+                                  OptimizingCompilerStats* stats)
+      : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {}
+
+ private:
+  void RecordSimplification() {
+    MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch);
+  }
+
+  // Gate for the simplification: true when the codegen's feature set reports AVX2.
+  bool HasCpuFeatureFlag() {
+    return codegen_->GetInstructionSetFeatures().HasAVX2();
+  }
+
+  // This simplifier uses a special-purpose basic-block visitor:
+  // (1) No need to visit Phi nodes.
+  // (2) Since instructions can be removed in a "forward" fashion, each one is
+  //     checked to still be in its block before it is visited.
+  void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
+    // TODO: fragile iteration, provide more robust iterators?
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      HInstruction* instruction = it.Current();
+      if (instruction->IsInBlock()) {
+        instruction->Accept(this);
+      }
+    }
+  }
+
+  bool TryGenerateVecMultiplyAccumulate(HVecMul* mul);
+  void VisitVecMul(HVecMul* instruction) OVERRIDE;
+
+  CodeGeneratorX86* codegen_;
+  OptimizingCompilerStats* stats_;
+};
+
+// Generic expressions for FMA:
+//   a = (b * c) + a
+//   a = (b * c) - a
+// Returns true when `mul` and its single user were replaced by an HVecMultiplyAccumulate.
+bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) {
+  // NOTE(review): a fused multiply-add rounds once, not twice; confirm that is acceptable here.
+  if (!(mul->GetPackedType() == DataType::Type::kFloat32 ||
+        mul->GetPackedType() == DataType::Type::kFloat64)) {
+    return false;
+  }
+  ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator();
+  if (mul->HasOnlyOneNonEnvironmentUse()) {
+    HInstruction* use = mul->GetUses().front().GetUser();
+    if (use->IsVecAdd() || use->IsVecSub()) {
+      // Replace code looking like
+      //    VECMUL tmp, x, y
+      //    VECADD dst, acc, tmp or VECADD dst, tmp, acc
+      //      or
+      //    VECSUB dst, tmp, acc
+      // with
+      //    VECMULACC dst, acc, x, y
+
+      // Note that we do not want to (unconditionally) perform the merge when the multiplication
+      // has multiple uses and it can be merged in all of them. Multiple uses could happen on the
+      // same control-flow path, and we would then increase the amount of work. In the future we
+      // could try to evaluate whether all uses are on different control-flow paths (using
+      // dominance and reverse-dominance information) and only perform the merge when they are.
+      HInstruction* accumulator = nullptr;
+      HVecBinaryOperation* binop = use->AsVecBinaryOperation();
+      HInstruction* binop_left = binop->GetLeft();
+      HInstruction* binop_right = binop->GetRight();
+      DCHECK_NE(binop_left, binop_right);
+      if (use->IsVecSub()) {
+        if (binop_left == mul) {  // Only `tmp - acc` is handled; `acc - tmp` is left alone.
+          accumulator = binop_right;
+        }
+      } else {
+        // VecAdd
+        if (binop_right == mul) {
+          accumulator = binop_left;
+        } else {
+          DCHECK_EQ(binop_left, mul);
+          accumulator = binop_right;
+        }
+      }
+      HInstruction::InstructionKind kind =
+          use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
+
+      if (accumulator != nullptr) {
+        HVecMultiplyAccumulate* mulacc =
+            new (allocator) HVecMultiplyAccumulate(allocator,
+                                                   kind,
+                                                   accumulator,
+                                                   mul->GetLeft(),
+                                                   mul->GetRight(),
+                                                   binop->GetPackedType(),
+                                                   binop->GetVectorLength(),
+                                                   binop->GetDexPc());
+        binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
+        DCHECK(!mul->HasUses());
+        mul->GetBlock()->RemoveInstruction(mul);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Records a simplification when the feature-gated multiply-accumulate rewrite succeeds.
+void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) {
+  // Short-circuit: the rewrite is attempted only when the CPU feature check passes.
+  if (HasCpuFeatureFlag() && TryGenerateVecMultiplyAccumulate(instruction)) {
+    RecordSimplification();
+  }
+}
+
+// Runs the simplifier visitor over the graph in reverse post order and always returns true.
+bool InstructionSimplifierX86::Run() {
+  InstructionSimplifierX86Visitor(graph_, codegen_, stats_).VisitReversePostOrder();
+  return true;
+}
+
+}  // namespace x86
+}  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h
new file mode 100644
index 0000000000..1fb199f728
--- /dev/null
+++ b/compiler/optimizing/instruction_simplifier_x86.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
+#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
+
+#include "nodes.h"
+#include "optimization.h"
+#include "code_generator_x86.h"
+
+namespace art {
+namespace x86 {
+
+class InstructionSimplifierX86 : public HOptimization {
+ public:
+  static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86";
+
+  // NOTE(review): unconditional down_cast; confirm every caller passes a CodeGeneratorX86.
+  InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats)
+      : HOptimization(graph, kInstructionSimplifierX86PassName, stats),
+        codegen_(down_cast<CodeGeneratorX86*>(codegen)) {}
+  bool Run() OVERRIDE;
+
+ private:
+  CodeGeneratorX86* codegen_;
+};
+
+}  // namespace x86
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c5e9a8d036..b4f9993ad6 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -958,6 +958,10 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation {
     SetRawInputAt(2, mul_right);
   }
 
+  static constexpr int kInputAccumulatorIndex = 0;  // Input index of the accumulator operand.
+  static constexpr int kInputMulLeftIndex = 1;  // Input index of the left multiplicand.
+  static constexpr int kInputMulRightIndex = 2;  // Input index of mul_right (set via SetRawInputAt).
+
   bool CanBeMoved() const OVERRIDE { return true; }
 
   bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc
index a38bd2464d..3ad2c6b3f6 100644
--- a/compiler/optimizing/optimization.cc
+++ b/compiler/optimizing/optimization.cc
@@ -28,6 +28,7 @@
 #endif
 #ifdef ART_ENABLE_CODEGEN_x86
 #include "pc_relative_fixups_x86.h"
+#include "instruction_simplifier_x86.h"
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
 #include "x86_memory_gen.h"
@@ -121,6 +122,8 @@ const char* OptimizationPassName(OptimizationPass pass) {
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
     case OptimizationPass::kX86MemoryOperandGeneration:
       return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName;
+    case OptimizationPass::kInstructionSimplifierX86:
+      return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName;
 #endif
     case OptimizationPass::kNone:
       LOG(FATAL) << "kNone does not represent an actual pass";
@@ -163,6 +166,7 @@ OptimizationPass OptimizationPassByName(const std::string& pass_name) {
 #ifdef ART_ENABLE_CODEGEN_x86
   X(OptimizationPass::kPcRelativeFixupsX86);
   X(OptimizationPass::kX86MemoryOperandGeneration);
+  X(OptimizationPass::kInstructionSimplifierX86);
 #endif
   LOG(FATAL) << "Cannot find optimization " << pass_name;
   UNREACHABLE();
@@ -323,6 +327,10 @@ ArenaVector<HOptimization*> ConstructOptimizations(
         DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name";
         opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats);
         break;
+      case OptimizationPass::kInstructionSimplifierX86:
+        DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name";
+        opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats);
+        break;
 #endif
       case OptimizationPass::kNone:
         LOG(FATAL) << "kNone does not represent an actual pass";
diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h
index 88b283cebf..a9fafa0864 100644
--- a/compiler/optimizing/optimization.h
+++ b/compiler/optimizing/optimization.h
@@ -101,6 +101,7 @@ enum class OptimizationPass {
 #endif
 #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
   kX86MemoryOperandGeneration,
+  kInstructionSimplifierX86,
 #endif
   kNone,
   kLast = kNone
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 84863e4357..bb33ba3564 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -530,7 +530,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph,
         OptDef(OptimizationPass::kSideEffectsAnalysis),
         OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"),
         OptDef(OptimizationPass::kPcRelativeFixupsX86),
-        OptDef(OptimizationPass::kX86MemoryOperandGeneration)
+        OptDef(OptimizationPass::kX86MemoryOperandGeneration),
+        OptDef(OptimizationPass::kInstructionSimplifierX86)
       };
       return RunOptimizations(graph,
                               codegen,
@@ -545,7 +546,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph,
       OptimizationDef x86_64_optimizations[] = {
         OptDef(OptimizationPass::kSideEffectsAnalysis),
         OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"),
-        OptDef(OptimizationPass::kX86MemoryOperandGeneration)
+        OptDef(OptimizationPass::kX86MemoryOperandGeneration),
+        OptDef(OptimizationPass::kInstructionSimplifierX86)
       };
       return RunOptimizations(graph,
                               codegen,
author	Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com>	2018-06-29 13:06:35 +0530
committer	Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com>	2018-07-02 15:37:38 +0530
commit	61908880e6565acfadbafe93fa64de000014f1a6 (patch)
tree	40b535db9175f3d959364d5bc30eaab4e2c4b4c4 /compiler/optimizing
parent	b5271dd44a30f498689e503340d3c8d01bf31f07 (diff)