Emit vector multiply-accumulate instructions for x86.
This patch adds a new CPU variant named kabylake and performs
instruction simplification to generate HVecMultiplyAccumulate nodes.
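For example (illustrative source, not taken from an existing test), a
vectorized reduction such as
    acc += a[i] * b[i];
previously produced a VecMul followed by a VecAdd; the new simplifier pass
fuses the pair into a single HVecMultiplyAccumulate, which the code
generators emit as vfmadd231ps/vfmadd231pd (or the vfmsub231 forms for the
subtracting pattern) when AVX2 is available.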
Test: ./test.py --host --64
Change-Id: Ie6cc882dadf1322dd4d3ae49bfdb600b0c447765
Signed-off-by: Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com>
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 11521e6..45dea1c 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -161,6 +161,7 @@
"utils/x86/assembler_x86.cc",
"utils/x86/jni_macro_assembler_x86.cc",
"utils/x86/managed_register_x86.cc",
+ "optimizing/instruction_simplifier_x86.cc",
],
},
x86_64: {
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 086ae07..5880876 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -1125,13 +1125,59 @@
}
}
-void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
- CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr);
+ switch (instr->GetPackedType()) {
+ case DataType::Type::kFloat32:
+ case DataType::Type::kFloat64:
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
+ DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ // VecMultiplyAccumulate is supported only for single- and
+ // double-precision floating-point vectors, so integral types are
+ // not converted yet.
+ LOG(FATAL) << "Unsupported SIMD type";
+ }
}
-void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
- // TODO: pmaddwd?
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LocationSummary* locations = instr->GetLocations();
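+ // The "231" FMA forms accumulate into their first (destination) operand,
+ // which is why the output location was constrained to the accumulator input.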
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+ XmmRegister accumulator = locations->InAt(
+ HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>();
+ XmmRegister mul_left = locations->InAt(
+ HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>();
+ XmmRegister mul_right = locations->InAt(
+ HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>();
+ switch (instr->GetPackedType()) {
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instr->GetVectorLength());
+ if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd)
+ __ vfmadd231ps(accumulator, mul_left, mul_right);
+ else
+ __ vfmsub231ps(accumulator, mul_left, mul_right);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instr->GetVectorLength());
+ if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd)
+ __ vfmadd231pd(accumulator, mul_left, mul_right);
+ else
+ __ vfmsub231pd(accumulator, mul_left, mul_right);
+ break;
+ default:
+ // VecMultiplyAccumulate is supported only for single- and
+ // double-precision floating-point vectors, so integral types are
+ // not converted yet.
+ LOG(FATAL) << "Unsupported SIMD type";
+ }
}
void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index 4d31ab6..4795e86 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1098,13 +1098,61 @@
}
}
-void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
- CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
+void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr);
+ switch (instr->GetPackedType()) {
+ case DataType::Type::kFloat32:
+ case DataType::Type::kFloat64:
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
+ locations->SetInAt(
+ HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
+ DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+ locations->SetOut(Location::SameAsFirstInput());
+ break;
+ default:
+ // VecMultiplyAccumulate is supported only for single- and
+ // double-precision floating-point vectors, so integral types are
+ // not converted yet.
+ LOG(FATAL) << "Unsupported SIMD type";
+ }
}
-void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
- // TODO: pmaddwd?
- LOG(FATAL) << "No SIMD for " << instruction->GetId();
+
+void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+ LocationSummary* locations = instr->GetLocations();
+ DCHECK(locations->InAt(0).Equals(locations->Out()));
+ XmmRegister accumulator = locations->InAt(
+ HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>();
+ XmmRegister mul_left = locations->InAt(
+ HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>();
+ XmmRegister mul_right = locations->InAt(
+ HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>();
+
+ switch (instr->GetPackedType()) {
+ case DataType::Type::kFloat32:
+ DCHECK_EQ(4u, instr->GetVectorLength());
+ if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd)
+ __ vfmadd231ps(accumulator, mul_left, mul_right);
+ else
+ __ vfmsub231ps(accumulator, mul_left, mul_right);
+ break;
+ case DataType::Type::kFloat64:
+ DCHECK_EQ(2u, instr->GetVectorLength());
+ if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd)
+ __ vfmadd231pd(accumulator, mul_left, mul_right);
+ else
+ __ vfmsub231pd(accumulator, mul_left, mul_right);
+ break;
+ default:
+ // VecMultiplyAccumulate is supported only for single- and
+ // double-precision floating-point vectors, so integral types are
+ // not converted yet.
+ LOG(FATAL) << "Unsupported SIMD type";
+ }
}
void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc
new file mode 100644
index 0000000..b3f67d6
--- /dev/null
+++ b/compiler/optimizing/instruction_simplifier_x86.cc
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "instruction_simplifier_x86.h"
+#include "arch/x86/instruction_set_features_x86.h"
+#include "mirror/array-inl.h"
+#include "code_generator.h"
+
+
+namespace art {
+
+namespace x86 {
+
+class InstructionSimplifierX86Visitor : public HGraphVisitor {
+ public:
+ InstructionSimplifierX86Visitor(HGraph* graph,
+ CodeGeneratorX86 *codegen,
+ OptimizingCompilerStats* stats)
+ : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {}
+
+ private:
+ void RecordSimplification() {
+ MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch);
+ }
+
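+ // FMA generation is gated on the AVX2 feature flag; in this patch only the
+ // kabylake variant sets it, and that variant also supports FMA.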
+ bool HasCpuFeatureFlag() {
+ return (codegen_->GetInstructionSetFeatures().HasAVX2());
+ }
+
+ /**
+ * This simplifier uses a special-purpose BB visitor.
+ * (1) No need to visit Phi nodes.
+ * (2) Since statements can be removed in a "forward" fashion,
+ * the visitor should test if each statement is still there.
+ */
+ void VisitBasicBlock(HBasicBlock* block) OVERRIDE {
+ // TODO: fragile iteration, provide more robust iterators?
+ for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+ HInstruction* instruction = it.Current();
+ if (instruction->IsInBlock()) {
+ instruction->Accept(this);
+ }
+ }
+ }
+
+ bool TryGenerateVecMultiplyAccumulate(HVecMul* mul);
+ void VisitVecMul(HVecMul* instruction) OVERRIDE;
+
+ CodeGeneratorX86* codegen_;
+ OptimizingCompilerStats* stats_;
+};
+
+// Generic expressions handled here for FMA:
+//   a = (b * c) + a
+//   a = (b * c) - a
+bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) {
+ if (!(mul->GetPackedType() == DataType::Type::kFloat32 ||
+ mul->GetPackedType() == DataType::Type::kFloat64)) {
+ return false;
+ }
+ ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator();
+ if (mul->HasOnlyOneNonEnvironmentUse()) {
+ HInstruction* use = mul->GetUses().front().GetUser();
+ if (use->IsVecAdd() || use->IsVecSub()) {
+ // Replace code looking like
+ // VECMUL tmp, x, y
+ // VECADD dst, acc, tmp or VECADD dst, tmp, acc
+ // or
+ // VECSUB dst, tmp, acc
+ // with
+ // VECMULACC dst, acc, x, y
+
+ // Note that we do not want to (unconditionally) perform the merge when the
+ // multiplication has multiple uses and it can be merged in all of them.
+ // Multiple uses could happen on the same control-flow path, and we would
+ // then increase the amount of work. In the future we could try to evaluate
+ // whether all uses are on different control-flow paths (using dominance and
+ // reverse-dominance information) and only perform the merge when they are.
+ HInstruction* accumulator = nullptr;
+ HVecBinaryOperation* binop = use->AsVecBinaryOperation();
+ HInstruction* binop_left = binop->GetLeft();
+ HInstruction* binop_right = binop->GetRight();
+ DCHECK_NE(binop_left, binop_right);
+ if (use->IsVecSub()) {
+ if (binop_left == mul) {
+ accumulator = binop_right;
+ }
+ } else {
+ // VecAdd
+ if (binop_right == mul) {
+ accumulator = binop_left;
+ } else {
+ DCHECK_EQ(binop_left, mul);
+ accumulator = binop_right;
+ }
+ }
+ HInstruction::InstructionKind kind =
+ use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
+
+ if (accumulator != nullptr) {
+ HVecMultiplyAccumulate* mulacc =
+ new (allocator) HVecMultiplyAccumulate(allocator,
+ kind,
+ accumulator,
+ mul->GetLeft(),
+ mul->GetRight(),
+ binop->GetPackedType(),
+ binop->GetVectorLength(),
+ binop->GetDexPc());
+ binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
+ DCHECK(!mul->HasUses());
+ mul->GetBlock()->RemoveInstruction(mul);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) {
+ if (HasCpuFeatureFlag()) {
+ if (TryGenerateVecMultiplyAccumulate(instruction)) {
+ RecordSimplification();
+ }
+ }
+}
+
+bool InstructionSimplifierX86::Run() {
+ InstructionSimplifierX86Visitor visitor(graph_, codegen_, stats_);
+ visitor.VisitReversePostOrder();
+ return true;
+}
+
+} // namespace x86
+} // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h
new file mode 100644
index 0000000..1fb199f
--- /dev/null
+++ b/compiler/optimizing/instruction_simplifier_x86.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
+#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
+
+#include "nodes.h"
+#include "optimization.h"
+#include "code_generator_x86.h"
+
+namespace art {
+namespace x86 {
+
+class InstructionSimplifierX86 : public HOptimization {
+ public:
+ InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats)
+ : HOptimization(graph, kInstructionSimplifierX86PassName, stats),
+ codegen_(down_cast<CodeGeneratorX86*>(codegen)) {}
+
+ static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86";
+
+ bool Run() OVERRIDE;
+
+ private:
+ CodeGeneratorX86* codegen_;
+};
+
+} // namespace x86
+} // namespace art
+
+#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index c5e9a8d..b4f9993 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -958,6 +958,10 @@
SetRawInputAt(2, mul_right);
}
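+ // Indices of the inputs: the accumulator comes first, followed by the
+ // two multiplication operands (see SetRawInputAt above).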
+ static constexpr int kInputAccumulatorIndex = 0;
+ static constexpr int kInputMulLeftIndex = 1;
+ static constexpr int kInputMulRightIndex = 2;
+
bool CanBeMoved() const OVERRIDE { return true; }
bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc
index a38bd24..3ad2c6b 100644
--- a/compiler/optimizing/optimization.cc
+++ b/compiler/optimizing/optimization.cc
@@ -28,6 +28,7 @@
#endif
#ifdef ART_ENABLE_CODEGEN_x86
#include "pc_relative_fixups_x86.h"
+#include "instruction_simplifier_x86.h"
#endif
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
#include "x86_memory_gen.h"
@@ -121,6 +122,8 @@
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
case OptimizationPass::kX86MemoryOperandGeneration:
return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName;
+ case OptimizationPass::kInstructionSimplifierX86:
+ return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName;
#endif
case OptimizationPass::kNone:
LOG(FATAL) << "kNone does not represent an actual pass";
@@ -163,6 +166,7 @@
#ifdef ART_ENABLE_CODEGEN_x86
X(OptimizationPass::kPcRelativeFixupsX86);
X(OptimizationPass::kX86MemoryOperandGeneration);
+ X(OptimizationPass::kInstructionSimplifierX86);
#endif
LOG(FATAL) << "Cannot find optimization " << pass_name;
UNREACHABLE();
@@ -323,6 +327,10 @@
DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name";
opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats);
break;
+ case OptimizationPass::kInstructionSimplifierX86:
+ DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name";
+ opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats);
+ break;
#endif
case OptimizationPass::kNone:
LOG(FATAL) << "kNone does not represent an actual pass";
diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h
index 88b283c..a9fafa0 100644
--- a/compiler/optimizing/optimization.h
+++ b/compiler/optimizing/optimization.h
@@ -101,6 +101,7 @@
#endif
#if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64)
kX86MemoryOperandGeneration,
+ kInstructionSimplifierX86,
#endif
kNone,
kLast = kNone
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 84863e4..bb33ba3 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -530,7 +530,8 @@
OptDef(OptimizationPass::kSideEffectsAnalysis),
OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"),
OptDef(OptimizationPass::kPcRelativeFixupsX86),
- OptDef(OptimizationPass::kX86MemoryOperandGeneration)
+ OptDef(OptimizationPass::kX86MemoryOperandGeneration),
+ OptDef(OptimizationPass::kInstructionSimplifierX86)
};
return RunOptimizations(graph,
codegen,
@@ -545,7 +546,8 @@
OptimizationDef x86_64_optimizations[] = {
OptDef(OptimizationPass::kSideEffectsAnalysis),
OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"),
- OptDef(OptimizationPass::kX86MemoryOperandGeneration)
+ OptDef(OptimizationPass::kX86MemoryOperandGeneration),
+ OptDef(OptimizationPass::kInstructionSimplifierX86)
};
return RunOptimizations(graph,
codegen,
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 86f9010..c2ce03b 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -525,6 +525,58 @@
EmitOperand(dst, src);
}
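+// FMA encodings emitted below (Intel SDM, Vol. 2C):
+//   vfmadd231ps: VEX.128.66.0F38.W0 B8 /r   acc = mul_left * mul_right + acc
+//   vfmsub231ps: VEX.128.66.0F38.W0 BA /r   acc = mul_left * mul_right - acc
+//   vfmadd231pd: VEX.128.66.0F38.W1 B8 /r   acc = mul_left * mul_right + acc
+//   vfmsub231pd: VEX.128.66.0F38.W1 BA /r   acc = mul_left * mul_right - acc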
+void X86Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+ uint8_t byte_one = EmitVexByte1(false, false, false, 2);
+ uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1);
+ EmitUint8(byte_zero);
+ EmitUint8(byte_one);
+ EmitUint8(byte_two);
+ // Opcode field.
+ EmitUint8(0xB8);
+ EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+ uint8_t byte_one = EmitVexByte1(false, false, false, 2);
+ uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1);
+ EmitUint8(byte_zero);
+ EmitUint8(byte_one);
+ EmitUint8(byte_two);
+ // Opcode field.
+ EmitUint8(0xBA);
+ EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+ uint8_t byte_one = EmitVexByte1(false, false, false, 2);
+ uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1);
+ EmitUint8(byte_zero);
+ EmitUint8(byte_one);
+ EmitUint8(byte_two);
+ // Opcode field.
+ EmitUint8(0xB8);
+ EmitXmmRegisterOperand(acc, mul_right);
+}
+
+void X86Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+ uint8_t byte_one = EmitVexByte1(false, false, false, 2);
+ uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1);
+ EmitUint8(byte_zero);
+ EmitUint8(byte_one);
+ EmitUint8(byte_two);
+ // Opcode field.
+ EmitUint8(0xBA);
+ EmitXmmRegisterOperand(acc, mul_right);
+}
+
void X86Assembler::addps(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
@@ -2898,6 +2950,99 @@
}
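+// Layout of the 3-byte VEX prefix (see the Intel SDM, VEX prefix encoding):
+//   Byte 0: 0xC4
+//   Byte 1: ~R ~X ~B m-mmmm   (R/X/B are stored inverted)
+//   Byte 2: W ~vvvv L pp      (vvvv is the inverted register specifier)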
+uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) {
+ uint8_t vex_zero = 0xC0;
+ if (!is_two_byte) {
+ vex_zero |= 0xC4;
+ } else {
+ vex_zero |= 0xC5;
+ }
+ return vex_zero;
+}
+
+uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) {
+ // VEX Byte 1.
+ uint8_t vex_prefix = 0;
+ if (!r) {
+ vex_prefix |= 0x80; // VEX.R .
+ }
+ if (!x) {
+ vex_prefix |= 0x40; // VEX.X .
+ }
+ if (!b) {
+ vex_prefix |= 0x20; // VEX.B .
+ }
+
+ // VEX.mmmmm.
+ switch (mmmmm) {
+ case 1:
+ // Implied 0F leading opcode byte.
+ vex_prefix |= 0x01;
+ break;
+ case 2:
+ // Implied leading 0F 38 opcode byte.
+ vex_prefix |= 0x02;
+ break;
+ case 3:
+ // Implied leading 0F 3A opcode byte.
+ vex_prefix |= 0x03;
+ break;
+ default:
+ LOG(FATAL) << "unknown opcode bytes";
+ }
+ return vex_prefix;
+}
+
+uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) {
+ uint8_t vex_prefix = 0;
+ // VEX Byte 2.
+ if (w) {
+ vex_prefix |= 0x80;
+ }
+
+ // VEX.vvvv.
+ if (operand.IsXmmRegister()) {
+ XmmRegister vvvv = operand.AsXmmRegister();
+ int inverted_reg = 15 - static_cast<int>(vvvv);
+ uint8_t reg = static_cast<uint8_t>(inverted_reg);
+ vex_prefix |= ((reg & 0x0F) << 3);
+ } else if (operand.IsCpuRegister()) {
+ Register vvvv = operand.AsCpuRegister();
+ int inverted_reg = 15 - static_cast<int>(vvvv);
+ uint8_t reg = static_cast<uint8_t>(inverted_reg);
+ vex_prefix |= ((reg & 0x0F) << 3);
+ }
+
+ // VEX.L.
+ if (l == 256) {
+ vex_prefix |= 0x04;
+ }
+
+ // VEX.pp.
+ switch (pp) {
+ case 0:
+ // SIMD Prefix - None.
+ vex_prefix |= 0x00;
+ break;
+ case 1:
+ // SIMD Prefix - 66.
+ vex_prefix |= 0x01;
+ break;
+ case 2:
+ // SIMD Prefix - F3.
+ vex_prefix |= 0x02;
+ break;
+ case 3:
+ // SIMD Prefix - F2.
+ vex_prefix |= 0x03;
+ break;
+ default:
+ LOG(FATAL) << "unknown SIMD Prefix";
+ }
+
+ return vex_prefix;
+}
+
void X86Assembler::EmitGenericShift(int reg_or_opcode,
const Operand& operand,
const Immediate& imm) {
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index e42c4c9..8c9ce82 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -397,6 +397,12 @@
void divss(XmmRegister dst, XmmRegister src);
void divss(XmmRegister dst, const Address& src);
+ // FMA multiply-accumulate (MAC) instructions.
+ void vfmadd231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vfmadd231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vfmsub231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+ void vfmsub231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2);
+
void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now)
void subps(XmmRegister dst, XmmRegister src);
void mulps(XmmRegister dst, XmmRegister src);
@@ -834,6 +840,11 @@
void EmitLabelLink(Label* label);
void EmitLabelLink(NearLabel* label);
+ // Emit a 3-byte VEX prefix.
+ uint8_t EmitVexByteZero(bool is_two_byte);
+ uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm);
+ uint8_t EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp);
+
void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm);
void EmitGenericShift(int rm, const Operand& operand, Register shifter);
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index bd31561..9983eae 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -603,6 +603,56 @@
}
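+// Same FMA encodings as the x86 assembler (VEX.128.66.0F38 B8/BA, W0 for ps
+// and W1 for pd); here VEX.R and VEX.B additionally carry the REX bits of
+// the accumulator and mul_right registers.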
+void X86_64Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+ uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2);
+ uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1);
+ EmitUint8(byte_zero);
+ EmitUint8(byte_one);
+ EmitUint8(byte_two);
+ // Opcode field.
+ EmitUint8(0xB8);
+ EmitXmmRegisterOperand(acc.LowBits(), mul_right);
+}
+
+
+void X86_64Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+ uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2);
+ uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1);
+ EmitUint8(byte_zero);
+ EmitUint8(byte_one);
+ EmitUint8(byte_two);
+ // Opcode field
+ EmitUint8(0xBA);
+ EmitXmmRegisterOperand(acc.LowBits(), mul_right);
+}
+
+void X86_64Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+ uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2);
+ uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1);
+ EmitUint8(byte_zero);
+ EmitUint8(byte_one);
+ EmitUint8(byte_two);
+ EmitUint8(0xB8);
+ EmitXmmRegisterOperand(acc.LowBits(), mul_right);
+}
+
+void X86_64Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) {
+ AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+ uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/);
+ uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2);
+ uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1);
+ EmitUint8(byte_zero);
+ EmitUint8(byte_one);
+ EmitUint8(byte_two);
+ EmitUint8(0xBA);
+ EmitXmmRegisterOperand(acc.LowBits(), mul_right);
+}
void X86_64Assembler::addps(XmmRegister dst, XmmRegister src) {
AssemblerBuffer::EnsureCapacity ensured(&buffer_);
EmitOptionalRex32(dst, src);
@@ -3544,6 +3594,98 @@
label->LinkTo(position);
}
+uint8_t X86_64Assembler::EmitVexByteZero(bool is_two_byte) {
+ uint8_t vex_zero = 0xC0;
+ if (!is_two_byte) {
+ vex_zero |= 0xC4;
+ } else {
+ vex_zero |= 0xC5;
+ }
+ return vex_zero;
+}
+
+uint8_t X86_64Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) {
+ // VEX Byte 1.
+ uint8_t vex_prefix = 0;
+ if (!r) {
+ vex_prefix |= 0x80; // VEX.R .
+ }
+ if (!x) {
+ vex_prefix |= 0x40; // VEX.X .
+ }
+ if (!b) {
+ vex_prefix |= 0x20; // VEX.B .
+ }
+
+ // VEX.mmmmm.
+ switch (mmmmm) {
+ case 1:
+ // Implied 0F leading opcode byte.
+ vex_prefix |= 0x01;
+ break;
+ case 2:
+ // Implied leading 0F 38 opcode byte.
+ vex_prefix |= 0x02;
+ break;
+ case 3:
+ // Implied leading 0F 3A opcode byte.
+ vex_prefix |= 0x03;
+ break;
+ default:
+ LOG(FATAL) << "unknown opcode bytes";
+ }
+
+ return vex_prefix;
+}
+
+uint8_t X86_64Assembler::EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp) {
+ // VEX Byte 2.
+ uint8_t vex_prefix = 0;
+ if (w) {
+ vex_prefix |= 0x80;
+ }
+ // VEX.vvvv.
+ if (operand.IsXmmRegister()) {
+ XmmRegister vvvv = operand.AsXmmRegister();
+ int inverted_reg = 15 - static_cast<int>(vvvv.AsFloatRegister());
+ uint8_t reg = static_cast<uint8_t>(inverted_reg);
+ vex_prefix |= ((reg & 0x0F) << 3);
+ } else if (operand.IsCpuRegister()) {
+ CpuRegister vvvv = operand.AsCpuRegister();
+ int inverted_reg = 15 - static_cast<int>(vvvv.AsRegister());
+ uint8_t reg = static_cast<uint8_t>(inverted_reg);
+ vex_prefix |= ((reg & 0x0F) << 3);
+ }
+
+ // VEX.L.
+ if (l == 256) {
+ vex_prefix |= 0x04;
+ }
+
+ // VEX.pp.
+ switch (pp) {
+ case 0:
+ // SIMD Prefix - None.
+ vex_prefix |= 0x00;
+ break;
+ case 1:
+ // SIMD Prefix - 66.
+ vex_prefix |= 0x01;
+ break;
+ case 2:
+ // SIMD Prefix - F3.
+ vex_prefix |= 0x02;
+ break;
+ case 3:
+ // SIMD Prefix - F2.
+ vex_prefix |= 0x03;
+ break;
+ default:
+ LOG(FATAL) << "unknown SIMD Prefix";
+ }
+
+ return vex_prefix;
+}
void X86_64Assembler::EmitGenericShift(bool wide,
int reg_or_opcode,
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index e4d72a7..d5779aa 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -436,6 +436,16 @@
void divss(XmmRegister dst, XmmRegister src);
void divss(XmmRegister dst, const Address& src);
+ // FMA multiply-accumulate (MAC) instructions.
+ // For reference, see the Intel Instruction Set Reference, Volume 2C
+ // (the URL below is split across two lines):
+ // https://www.intel.com/content/www/us/en/architecture-and-technology/
+ // 64-ia-32-architectures-software-developer-vol-2c-manual.html
+ void vfmadd231ps(XmmRegister acc, XmmRegister left, XmmRegister right);
+ void vfmadd231pd(XmmRegister acc, XmmRegister left, XmmRegister right);
+ void vfmsub231ps(XmmRegister acc, XmmRegister left, XmmRegister right);
+ void vfmsub231pd(XmmRegister acc, XmmRegister left, XmmRegister right);
+
void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now)
void subps(XmmRegister dst, XmmRegister src);
void mulps(XmmRegister dst, XmmRegister src);
@@ -921,6 +931,11 @@
void EmitLabelLink(Label* label);
void EmitLabelLink(NearLabel* label);
+ // Emit a 3-byte VEX prefix.
+ uint8_t EmitVexByteZero(bool is_two_byte);
+ uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm);
+ uint8_t EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp);
+
void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm);
void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter);
diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc
index 9846251..745e925 100644
--- a/runtime/arch/x86/instruction_set_features_x86.cc
+++ b/runtime/arch/x86/instruction_set_features_x86.cc
@@ -35,6 +35,7 @@
"atom",
"sandybridge",
"silvermont",
+ "kabylake",
};
static constexpr const char* x86_variants_with_ssse3[] = {
@@ -46,16 +47,27 @@
static constexpr const char* x86_variants_with_sse4_1[] = {
"sandybridge",
"silvermont",
+ "kabylake",
};
static constexpr const char* x86_variants_with_sse4_2[] = {
"sandybridge",
"silvermont",
+ "kabylake",
};
static constexpr const char* x86_variants_with_popcnt[] = {
"sandybridge",
"silvermont",
+ "kabylake",
+};
+
+static constexpr const char* x86_variants_with_avx[] = {
+ "kabylake",
+};
+
+static constexpr const char* x86_variants_with_avx2[] = {
+ "kabylake",
};
X86FeaturesUniquePtr X86InstructionSetFeatures::Create(bool x86_64,
@@ -93,9 +105,12 @@
bool has_SSE4_2 = FindVariantInArray(x86_variants_with_sse4_2,
arraysize(x86_variants_with_sse4_2),
variant);
- bool has_AVX = false;
- bool has_AVX2 = false;
-
+ bool has_AVX = FindVariantInArray(x86_variants_with_avx,
+ arraysize(x86_variants_with_avx),
+ variant);
+ bool has_AVX2 = FindVariantInArray(x86_variants_with_avx2,
+ arraysize(x86_variants_with_avx2),
+ variant);
bool has_POPCNT = FindVariantInArray(x86_variants_with_popcnt,
arraysize(x86_variants_with_popcnt),
variant);
diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h
index 57cf4b2..f5974cc 100644
--- a/runtime/arch/x86/instruction_set_features_x86.h
+++ b/runtime/arch/x86/instruction_set_features_x86.h
@@ -67,6 +67,8 @@
bool HasPopCnt() const { return has_POPCNT_; }
+ bool HasAVX2() const { return has_AVX2_; }
+
protected:
// Parse a string of the form "ssse3" adding these to a new InstructionSetFeatures.
virtual std::unique_ptr<const InstructionSetFeatures>