author    | 2018-06-29 13:06:35 +0530
committer | 2018-07-02 15:37:38 +0530
commit    | 61908880e6565acfadbafe93fa64de000014f1a6 (patch)
tree      | 40b535db9175f3d959364d5bc30eaab4e2c4b4c4
parent    | b5271dd44a30f498689e503340d3c8d01bf31f07 (diff)
Emit vector multiply and accumulate instructions for x86.
This patch adds a new CPU variant named kabylake and performs
instruction simplification to generate VecMultiplyAccumulate.
Test: ./test.py --host --64
Change-Id: Ie6cc882dadf1322dd4d3ae49bfdb600b0c447765
Signed-off-by: Gupta Kumar, Sanjiv <sanjiv.kumar.gupta@intel.com>
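
For reference, the fused node introduced here has the usual FMA semantics: the accumulator input is combined in place with the element-wise product of the two multiply inputs. A minimal scalar model of those semantics (illustrative only, not part of the patch; the function name and signature are invented for this sketch):

```cpp
#include <cstddef>

// Scalar model of the two fused forms emitted by this patch:
//   vfmadd231ps/pd:  acc[i] = acc[i] + left[i] * right[i]
//   vfmsub231ps/pd:  acc[i] = left[i] * right[i] - acc[i]
template <typename T>
void MultiplyAccumulateReference(T* acc, const T* left, const T* right,
                                 size_t n, bool is_add) {
  for (size_t i = 0; i != n; ++i) {
    acc[i] = is_add ? acc[i] + left[i] * right[i]
                    : left[i] * right[i] - acc[i];
  }
}
```

The new x86 simplifier produces this node only when a vector multiply has a single non-environment use that is a vector add or subtract, so the merge never duplicates work across paths.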
-rw-r--r-- | compiler/Android.bp | 1
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86.cc | 56
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86_64.cc | 58
-rw-r--r-- | compiler/optimizing/instruction_simplifier_x86.cc | 149
-rw-r--r-- | compiler/optimizing/instruction_simplifier_x86.h | 44
-rw-r--r-- | compiler/optimizing/nodes_vector.h | 4
-rw-r--r-- | compiler/optimizing/optimization.cc | 8
-rw-r--r-- | compiler/optimizing/optimization.h | 1
-rw-r--r-- | compiler/optimizing/optimizing_compiler.cc | 6
-rw-r--r-- | compiler/utils/x86/assembler_x86.cc | 145
-rw-r--r-- | compiler/utils/x86/assembler_x86.h | 11
-rw-r--r-- | compiler/utils/x86_64/assembler_x86_64.cc | 142
-rw-r--r-- | compiler/utils/x86_64/assembler_x86_64.h | 15
-rw-r--r-- | runtime/arch/x86/instruction_set_features_x86.cc | 21
-rw-r--r-- | runtime/arch/x86/instruction_set_features_x86.h | 2 |
15 files changed, 648 insertions, 15 deletions
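
Before the diff itself, a note on the new assembler helpers (EmitVexByteZero/EmitVexByte1/EmitVexByte2): they build the three-byte VEX prefix for the FMA forms, which live in the 0F 38 opcode map with a 66 operand-size prefix and 128-bit vector length. The standalone sketch below models that encoding for the register-register case only; EncodeVfmadd231ps and its restriction to xmm0–xmm7 are assumptions of this sketch, not code from the patch:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Model of VEX.128.66.0F38.W0 B8 /r (vfmadd231ps acc, mul_left, mul_right),
// assuming no REX-extended registers, so the inverted VEX.R/X/B bits stay 1.
std::vector<uint8_t> EncodeVfmadd231ps(int acc, int mul_left, int mul_right) {
  assert(acc < 8 && mul_left < 8 && mul_right < 8);
  uint8_t byte0 = 0xC4;         // Three-byte VEX escape.
  uint8_t byte1 = 0xE0 | 0x02;  // ~R = ~X = ~B = 1, mmmmm = 2 (0F 38 map).
  // W = 0, VEX.vvvv = inverted mul_left, L = 0 (128-bit), pp = 01 (66 prefix).
  uint8_t byte2 = static_cast<uint8_t>(((~mul_left & 0xF) << 3) | 0x01);
  // ModRM with register-direct addressing: reg = acc, rm = mul_right.
  uint8_t modrm = static_cast<uint8_t>(0xC0 | (acc << 3) | mul_right);
  return {byte0, byte1, byte2, 0xB8, modrm};  // 0xB8 is the vfmadd231ps opcode.
}
```

For example, `vfmadd231ps xmm1, xmm2, xmm3` comes out as C4 E2 69 B8 CB; the subtract form differs only in the opcode byte (0xBA instead of 0xB8), matching the EmitUint8 calls in the diff below.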
diff --git a/compiler/Android.bp b/compiler/Android.bp index 11521e68d0..45dea1c6ba 100644 --- a/compiler/Android.bp +++ b/compiler/Android.bp @@ -161,6 +161,7 @@ art_cc_defaults { "utils/x86/assembler_x86.cc", "utils/x86/jni_macro_assembler_x86.cc", "utils/x86/managed_register_x86.cc", + "optimizing/instruction_simplifier_x86.cc", ], }, x86_64: { diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 086ae07a06..58808769e2 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -1125,13 +1125,59 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD Type"; + } } -void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - // TODO: pmaddwd? - LOG(FATAL) << "No SIMD for " << instruction->GetId(); +void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister accumulator = locations->InAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_left = locations->InAt( + HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_right = locations->InAt( + HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231ps(accumulator, mul_left, mul_right); + else + __ vfmsub231ps(accumulator, mul_left, mul_right); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231pd(accumulator, mul_left, mul_right); + else + __ vfmsub231pd(accumulator, mul_left, mul_right); + break; + default: + + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. 
+ LOG(FATAL) << "Unsupported SIMD Type"; + } } void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index 4d31ab68d1..4795e86933 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -1098,13 +1098,61 @@ static void CreateVecAccumLocations(ArenaAllocator* allocator, HVecOperation* in } } -void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction); +void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + case DataType::Type::kFloat64: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. + LOG(FATAL) << "Unsupported SIMD type"; + } } -void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) { - // TODO: pmaddwd? - LOG(FATAL) << "No SIMD for " << instruction->GetId(); + +void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + DCHECK(locations->InAt(0).Equals(locations->Out())); + XmmRegister accumulator = locations->InAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_left = locations->InAt( + HVecMultiplyAccumulate::kInputMulLeftIndex).AsFpuRegister<XmmRegister>(); + XmmRegister mul_right = locations->InAt( + HVecMultiplyAccumulate::kInputMulRightIndex).AsFpuRegister<XmmRegister>(); + + switch (instr->GetPackedType()) { + case DataType::Type::kFloat32: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231ps(accumulator, mul_left, mul_right); + else + __ vfmsub231ps(accumulator, mul_left, mul_right); + break; + case DataType::Type::kFloat64: + DCHECK_EQ(2u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::InstructionKind::kAdd) + __ vfmadd231pd(accumulator, mul_left, mul_right); + else + __ vfmsub231pd(accumulator, mul_left, mul_right); + break; + default: + + // VecMultiplyAccumulate is supported only for single and + // double precision floating points. Hence integral types + // are still not converted. 
+ LOG(FATAL) << "Unsupported SIMD Type"; + } } void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) { diff --git a/compiler/optimizing/instruction_simplifier_x86.cc b/compiler/optimizing/instruction_simplifier_x86.cc new file mode 100644 index 0000000000..b3f67d6e84 --- /dev/null +++ b/compiler/optimizing/instruction_simplifier_x86.cc @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "instruction_simplifier_x86.h" +#include "arch/x86/instruction_set_features_x86.h" +#include "mirror/array-inl.h" +#include "code_generator.h" + + +namespace art { + +namespace x86 { + +class InstructionSimplifierX86Visitor : public HGraphVisitor { + public: + InstructionSimplifierX86Visitor(HGraph* graph, + CodeGeneratorX86 *codegen, + OptimizingCompilerStats* stats) + : HGraphVisitor(graph), codegen_(codegen), stats_(stats) {} + + private: + void RecordSimplification() { + MaybeRecordStat(stats_, MethodCompilationStat::kInstructionSimplificationsArch); + } + + bool HasCpuFeatureFlag() { + return (codegen_->GetInstructionSetFeatures().HasAVX2()); + } + + /** + * This simplifier uses a special-purpose BB visitor. + * (1) No need to visit Phi nodes. + * (2) Since statements can be removed in a "forward" fashion, + * the visitor should test if each statement is still there. + */ + void VisitBasicBlock(HBasicBlock* block) OVERRIDE { + // TODO: fragile iteration, provide more robust iterators? + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + HInstruction* instruction = it.Current(); + if (instruction->IsInBlock()) { + instruction->Accept(this); + } + } + } + + bool TryGenerateVecMultiplyAccumulate(HVecMul* mul); + void VisitVecMul(HVecMul* instruction) OVERRIDE; + + CodeGeneratorX86* codegen_; + OptimizingCompilerStats* stats_; +}; + +/* generic expressions for FMA +a = (b * c) + a +a = (b * c) – a +*/ +bool InstructionSimplifierX86Visitor::TryGenerateVecMultiplyAccumulate(HVecMul* mul) { + if (!(mul->GetPackedType() == DataType::Type::kFloat32 || + mul->GetPackedType() == DataType::Type::kFloat64)) { + return false; + } + ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator(); + if (mul->HasOnlyOneNonEnvironmentUse()) { + HInstruction* use = mul->GetUses().front().GetUser(); + if (use->IsVecAdd() || use->IsVecSub()) { + // Replace code looking like + // VECMUL tmp, x, y + // VECADD dst, acc, tmp or VECADD dst, tmp, acc + // or + // VECSUB dst, tmp, acc + // with + // VECMULACC dst, acc, x, y + + // Note that we do not want to (unconditionally) perform the merge when the + // multiplication has multiple uses and it can be merged in all of them. + // Multiple uses could happen on the same control-flow path, and we would + // then increase the amount of work. 
In the future we could try to evaluate + // whether all uses are on different control-flow paths (using dominance and + // reverse-dominance information) and only perform the merge when they are. + HInstruction* accumulator = nullptr; + HVecBinaryOperation* binop = use->AsVecBinaryOperation(); + HInstruction* binop_left = binop->GetLeft(); + HInstruction* binop_right = binop->GetRight(); + DCHECK_NE(binop_left, binop_right); + if (use->IsVecSub()) { + if (binop_left == mul) { + accumulator = binop_right; + } + } else { + // VecAdd + if (binop_right == mul) { + accumulator = binop_left; + } else { + DCHECK_EQ(binop_left, mul); + accumulator = binop_right; + } + } + HInstruction::InstructionKind kind = + use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub; + + if (accumulator != nullptr) { + HVecMultiplyAccumulate* mulacc = + new (allocator) HVecMultiplyAccumulate(allocator, + kind, + accumulator, + mul->GetLeft(), + mul->GetRight(), + binop->GetPackedType(), + binop->GetVectorLength(), + binop->GetDexPc()); + binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); + DCHECK(!mul->HasUses()); + mul->GetBlock()->RemoveInstruction(mul); + return true; + } + } + } + return false; +} + +void InstructionSimplifierX86Visitor::VisitVecMul(HVecMul* instruction) { + if (HasCpuFeatureFlag()) { + if (TryGenerateVecMultiplyAccumulate(instruction)) { + RecordSimplification(); + } + } +} + +bool InstructionSimplifierX86::Run() { + InstructionSimplifierX86Visitor visitor(graph_, codegen_, stats_); + visitor.VisitReversePostOrder(); + return true; +} + +} // namespace x86 +} // namespace art diff --git a/compiler/optimizing/instruction_simplifier_x86.h b/compiler/optimizing/instruction_simplifier_x86.h new file mode 100644 index 0000000000..1fb199f728 --- /dev/null +++ b/compiler/optimizing/instruction_simplifier_x86.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ +#define ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ + +#include "nodes.h" +#include "optimization.h" +#include "code_generator_x86.h" + +namespace art { +namespace x86 { + +class InstructionSimplifierX86 : public HOptimization { + public: + InstructionSimplifierX86(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats) + : HOptimization(graph, kInstructionSimplifierX86PassName, stats), + codegen_(down_cast<CodeGeneratorX86*>(codegen)) {} + + static constexpr const char* kInstructionSimplifierX86PassName = "instruction_simplifier_x86"; + + bool Run() OVERRIDE; + + private: + CodeGeneratorX86* codegen_; +}; + +} // namespace x86 +} // namespace art + +#endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_X86_H_ diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index c5e9a8d036..b4f9993ad6 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -958,6 +958,10 @@ class HVecMultiplyAccumulate FINAL : public HVecOperation { SetRawInputAt(2, mul_right); } + static constexpr int kInputAccumulatorIndex = 0; + static constexpr int kInputMulLeftIndex = 1; + static constexpr int kInputMulRightIndex = 2; + bool CanBeMoved() const OVERRIDE { return true; } bool InstructionDataEquals(const HInstruction* other) const OVERRIDE { diff --git a/compiler/optimizing/optimization.cc b/compiler/optimizing/optimization.cc index a38bd2464d..3ad2c6b3f6 100644 --- a/compiler/optimizing/optimization.cc +++ b/compiler/optimizing/optimization.cc @@ -28,6 +28,7 @@ #endif #ifdef ART_ENABLE_CODEGEN_x86 #include "pc_relative_fixups_x86.h" +#include "instruction_simplifier_x86.h" #endif #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) #include "x86_memory_gen.h" @@ -121,6 +122,8 @@ const char* OptimizationPassName(OptimizationPass pass) { #if defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) case OptimizationPass::kX86MemoryOperandGeneration: return x86::X86MemoryOperandGeneration::kX86MemoryOperandGenerationPassName; + case OptimizationPass::kInstructionSimplifierX86: + return x86::InstructionSimplifierX86::kInstructionSimplifierX86PassName; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; @@ -163,6 +166,7 @@ OptimizationPass OptimizationPassByName(const std::string& pass_name) { #ifdef ART_ENABLE_CODEGEN_x86 X(OptimizationPass::kPcRelativeFixupsX86); X(OptimizationPass::kX86MemoryOperandGeneration); + X(OptimizationPass::kInstructionSimplifierX86); #endif LOG(FATAL) << "Cannot find optimization " << pass_name; UNREACHABLE(); @@ -323,6 +327,10 @@ ArenaVector<HOptimization*> ConstructOptimizations( DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; opt = new (allocator) x86::X86MemoryOperandGeneration(graph, codegen, stats); break; + case OptimizationPass::kInstructionSimplifierX86: + DCHECK(alt_name == nullptr) << "arch-specific pass does not support alternative name"; + opt = new (allocator) x86::InstructionSimplifierX86(graph, codegen, stats); + break; #endif case OptimizationPass::kNone: LOG(FATAL) << "kNone does not represent an actual pass"; diff --git a/compiler/optimizing/optimization.h b/compiler/optimizing/optimization.h index 88b283cebf..a9fafa0864 100644 --- a/compiler/optimizing/optimization.h +++ b/compiler/optimizing/optimization.h @@ -101,6 +101,7 @@ enum class OptimizationPass { #endif #if 
defined(ART_ENABLE_CODEGEN_x86) || defined(ART_ENABLE_CODEGEN_x86_64) kX86MemoryOperandGeneration, + kInstructionSimplifierX86, #endif kNone, kLast = kNone diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc index 84863e4357..bb33ba3564 100644 --- a/compiler/optimizing/optimizing_compiler.cc +++ b/compiler/optimizing/optimizing_compiler.cc @@ -530,7 +530,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), OptDef(OptimizationPass::kPcRelativeFixupsX86), - OptDef(OptimizationPass::kX86MemoryOperandGeneration) + OptDef(OptimizationPass::kX86MemoryOperandGeneration), + OptDef(OptimizationPass::kInstructionSimplifierX86) }; return RunOptimizations(graph, codegen, @@ -545,7 +546,8 @@ bool OptimizingCompiler::RunArchOptimizations(HGraph* graph, OptimizationDef x86_64_optimizations[] = { OptDef(OptimizationPass::kSideEffectsAnalysis), OptDef(OptimizationPass::kGlobalValueNumbering, "GVN$after_arch"), - OptDef(OptimizationPass::kX86MemoryOperandGeneration) + OptDef(OptimizationPass::kX86MemoryOperandGeneration), + OptDef(OptimizationPass::kInstructionSimplifierX86) }; return RunOptimizations(graph, codegen, diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc index 86f9010ea3..c2ce03b1f2 100644 --- a/compiler/utils/x86/assembler_x86.cc +++ b/compiler/utils/x86/assembler_x86.cc @@ -525,6 +525,58 @@ void X86Assembler::divss(XmmRegister dst, const Address& src) { EmitOperand(dst, src); } +void X86Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. 
+ EmitUint8(0xB8); + EmitXmmRegisterOperand(acc, mul_right); +} + +void X86Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(false, false, false, 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86ManagedRegister::FromXmmRegister(mul_left), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc, mul_right); +} + void X86Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); @@ -2898,6 +2950,99 @@ void X86Assembler::EmitLabelLink(NearLabel* label) { } +uint8_t X86Assembler::EmitVexByteZero(bool is_two_byte) { + uint8_t vex_zero = 0xC0; + if (!is_two_byte) { + vex_zero |= 0xC4; + } else { + vex_zero |= 0xC5; + } + return vex_zero; +} + +uint8_t X86Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm ) { + // VEX Byte 1. + uint8_t vex_prefix = 0; + if (!r) { + vex_prefix |= 0x80; // VEX.R . + } + if (!x) { + vex_prefix |= 0x40; // VEX.X . + } + if (!b) { + vex_prefix |= 0x20; // VEX.B . + } + + // VEX.mmmmm. + switch (mmmmm) { + case 1: + // Implied 0F leading opcode byte. + vex_prefix |= 0x01; + break; + case 2: + // Implied leading 0F 38 opcode byte. + vex_prefix |= 0x02; + break; + case 3: + // Implied leading OF 3A opcode byte. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown opcode bytes"; + } + return vex_prefix; +} + +uint8_t X86Assembler::EmitVexByte2(bool w, int l, X86ManagedRegister operand, int pp) { + uint8_t vex_prefix = 0; + // VEX Byte 2. + if (w) { + vex_prefix |= 0x80; + } + + // VEX.vvvv. + if (operand.IsXmmRegister()) { + XmmRegister vvvv = operand.AsXmmRegister(); + int inverted_reg = 15-static_cast<int>(vvvv); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } else if (operand.IsCpuRegister()) { + Register vvvv = operand.AsCpuRegister(); + int inverted_reg = 15 - static_cast<int>(vvvv); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } + + // VEX.L. + if (l == 256) { + vex_prefix |= 0x04; + } + + // VEX.pp. + switch (pp) { + case 0: + // SIMD Pefix - None. + vex_prefix |= 0x00; + break; + case 1: + // SIMD Prefix - 66. + vex_prefix |= 0x01; + break; + case 2: + // SIMD Prefix - F3. + vex_prefix |= 0x02; + break; + case 3: + // SIMD Prefix - F2. 
+ vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown SIMD Prefix"; + } + + return vex_prefix; +} + void X86Assembler::EmitGenericShift(int reg_or_opcode, const Operand& operand, const Immediate& imm) { diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h index e42c4c986a..8c9ce82687 100644 --- a/compiler/utils/x86/assembler_x86.h +++ b/compiler/utils/x86/assembler_x86.h @@ -397,6 +397,12 @@ class X86Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); + // FMA Mac Instructions + void vfmadd231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmadd231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmsub231ps(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void vfmsub231pd(XmmRegister dst, XmmRegister src1, XmmRegister src2); + void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -834,6 +840,11 @@ class X86Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); + // Emit a 3 byte VEX Prefix + uint8_t EmitVexByteZero(bool is_two_byte); + uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); + uint8_t EmitVexByte2(bool w , int l , X86ManagedRegister vvv, int pp); + void EmitGenericShift(int rm, const Operand& operand, const Immediate& imm); void EmitGenericShift(int rm, const Operand& operand, Register shifter); diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc index bd31561937..9983eaeeea 100644 --- a/compiler/utils/x86_64/assembler_x86_64.cc +++ b/compiler/utils/x86_64/assembler_x86_64.cc @@ -603,6 +603,56 @@ void X86_64Assembler::divss(XmmRegister dst, const Address& src) { } +void X86_64Assembler::vfmadd231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field. 
+ EmitUint8(0xB8); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + + +void X86_64Assembler::vfmsub231ps(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(false, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + // Opcode field + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + +void X86_64Assembler::vfmadd231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + EmitUint8(0xB8); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} + +void X86_64Assembler::vfmsub231pd(XmmRegister acc, XmmRegister mul_left, XmmRegister mul_right) { + AssemblerBuffer::EnsureCapacity ensured(&buffer_); + uint8_t byte_zero = EmitVexByteZero(false /*is_two_byte*/); + uint8_t byte_one = EmitVexByte1(acc.NeedsRex(), false, mul_right.NeedsRex(), 2); + uint8_t byte_two = EmitVexByte2(true, 128, X86_64ManagedRegister::FromXmmRegister(mul_left.AsFloatRegister()), 1); + EmitUint8(byte_zero); + EmitUint8(byte_one); + EmitUint8(byte_two); + EmitUint8(0xBA); + EmitXmmRegisterOperand(acc.LowBits(), mul_right); +} void X86_64Assembler::addps(XmmRegister dst, XmmRegister src) { AssemblerBuffer::EnsureCapacity ensured(&buffer_); EmitOptionalRex32(dst, src); @@ -3544,6 +3594,98 @@ void X86_64Assembler::EmitLabelLink(NearLabel* label) { label->LinkTo(position); } +uint8_t X86_64Assembler::EmitVexByteZero(bool is_two_byte) { + uint8_t vex_zero = 0xC0; + if (!is_two_byte) { + vex_zero |= 0xC4; + } else { + vex_zero |= 0xC5; + } + return vex_zero; +} + +uint8_t X86_64Assembler::EmitVexByte1(bool r, bool x, bool b, int mmmmm) { + // VEX Byte 1. + uint8_t vex_prefix = 0; + if (!r) { + vex_prefix |= 0x80; // VEX.R . + } + if (!x) { + vex_prefix |= 0x40; // VEX.X . + } + if (!b) { + vex_prefix |= 0x20; // VEX.B . + } + + // VEX.mmmmm. + switch (mmmmm) { + case 1: + // Implied 0F leading opcode byte. + vex_prefix |= 0x01; + break; + case 2: + // Implied leading 0F 38 opcode byte. + vex_prefix |= 0x02; + break; + case 3: + // Implied leading OF 3A opcode byte. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown opcode bytes"; + } + + return vex_prefix; +} + +uint8_t X86_64Assembler::EmitVexByte2(bool w, int l, X86_64ManagedRegister operand, int pp) { + // VEX Byte 2. + uint8_t vex_prefix = 0; + if (w) { + vex_prefix |= 0x80; + } + // VEX.vvvv. + if (operand.IsXmmRegister()) { + XmmRegister vvvv = operand.AsXmmRegister(); + int inverted_reg = 15-static_cast<int>(vvvv.AsFloatRegister()); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } else if (operand.IsCpuRegister()) { + CpuRegister vvvv = operand.AsCpuRegister(); + int inverted_reg = 15 - static_cast<int>(vvvv.AsRegister()); + uint8_t reg = static_cast<uint8_t>(inverted_reg); + vex_prefix |= ((reg & 0x0F) << 3); + } + + // VEX.L. 
+ if (l == 256) { + vex_prefix |= 0x04; + } + + // VEX.pp. + switch (pp) { + case 0: + // SIMD Pefix - None. + vex_prefix |= 0x00; + break; + case 1: + // SIMD Prefix - 66. + vex_prefix |= 0x01; + break; + case 2: + // SIMD Prefix - F3. + vex_prefix |= 0x02; + break; + case 3: + // SIMD Prefix - F2. + vex_prefix |= 0x03; + break; + default: + LOG(FATAL) << "unknown SIMD Prefix"; + } + + return vex_prefix; +} void X86_64Assembler::EmitGenericShift(bool wide, int reg_or_opcode, diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h index e4d72a7ba2..d5779aa786 100644 --- a/compiler/utils/x86_64/assembler_x86_64.h +++ b/compiler/utils/x86_64/assembler_x86_64.h @@ -436,6 +436,16 @@ class X86_64Assembler FINAL : public Assembler { void divss(XmmRegister dst, XmmRegister src); void divss(XmmRegister dst, const Address& src); + // Mac Instructions + // For reference look at the Instruction reference volume 2C. + // The below URL is broken down in two lines. + // https://www.intel.com/content/www/us/en/architecture-and-technology/ + // 64-ia-32-architectures-software-developer-vol-2c-manual.html + void vfmadd231ps(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmadd231pd(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmsub231ps(XmmRegister acc, XmmRegister left, XmmRegister right); + void vfmsub231pd(XmmRegister acc, XmmRegister left, XmmRegister right); + void addps(XmmRegister dst, XmmRegister src); // no addr variant (for now) void subps(XmmRegister dst, XmmRegister src); void mulps(XmmRegister dst, XmmRegister src); @@ -921,6 +931,11 @@ class X86_64Assembler FINAL : public Assembler { void EmitLabelLink(Label* label); void EmitLabelLink(NearLabel* label); + // Emit a 3 byte VEX Prefix. 
+ uint8_t EmitVexByteZero(bool is_two_byte); + uint8_t EmitVexByte1(bool r, bool x, bool b, int mmmmm); + uint8_t EmitVexByte2(bool w , int l , X86_64ManagedRegister operand, int pp); + void EmitGenericShift(bool wide, int rm, CpuRegister reg, const Immediate& imm); void EmitGenericShift(bool wide, int rm, CpuRegister operand, CpuRegister shifter); diff --git a/runtime/arch/x86/instruction_set_features_x86.cc b/runtime/arch/x86/instruction_set_features_x86.cc index 98462512da..745e925611 100644 --- a/runtime/arch/x86/instruction_set_features_x86.cc +++ b/runtime/arch/x86/instruction_set_features_x86.cc @@ -35,6 +35,7 @@ static constexpr const char* x86_known_variants[] = { "atom", "sandybridge", "silvermont", + "kabylake" }; static constexpr const char* x86_variants_with_ssse3[] = { @@ -46,16 +47,27 @@ static constexpr const char* x86_variants_with_ssse3[] = { static constexpr const char* x86_variants_with_sse4_1[] = { "sandybridge", "silvermont", + "kabylake" }; static constexpr const char* x86_variants_with_sse4_2[] = { "sandybridge", "silvermont", + "kabylake" }; static constexpr const char* x86_variants_with_popcnt[] = { "sandybridge", "silvermont", + "kabylake" +}; + +static constexpr const char* x86_variants_with_avx[] = { + "kabylake", +}; + +static constexpr const char* x86_variants_with_avx2[] = { + "kabylake", }; X86FeaturesUniquePtr X86InstructionSetFeatures::Create(bool x86_64, @@ -93,9 +105,12 @@ X86FeaturesUniquePtr X86InstructionSetFeatures::FromVariant( bool has_SSE4_2 = FindVariantInArray(x86_variants_with_sse4_2, arraysize(x86_variants_with_sse4_2), variant); - bool has_AVX = false; - bool has_AVX2 = false; - + bool has_AVX = FindVariantInArray(x86_variants_with_avx, + arraysize(x86_variants_with_avx), + variant); + bool has_AVX2 = FindVariantInArray(x86_variants_with_avx2, + arraysize(x86_variants_with_avx2), + variant); bool has_POPCNT = FindVariantInArray(x86_variants_with_popcnt, arraysize(x86_variants_with_popcnt), variant); diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h index 57cf4b2741..f5974cc2e1 100644 --- a/runtime/arch/x86/instruction_set_features_x86.h +++ b/runtime/arch/x86/instruction_set_features_x86.h @@ -67,6 +67,8 @@ class X86InstructionSetFeatures : public InstructionSetFeatures { bool HasPopCnt() const { return has_POPCNT_; } + bool HasAVX2() const { return has_AVX2_; } + protected: // Parse a string of the form "ssse3" adding these to a new InstructionSetFeatures. virtual std::unique_ptr<const InstructionSetFeatures> |
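
A closing note on enabling the new path: the simplifier only fires when the target features report AVX2, and with this change that is derived from the CPU variant, so the pass is exercised when compiling for the new kabylake variant (for example via dex2oat's --instruction-set-variant=kabylake). A simplified stand-in for the variant lookup added to instruction_set_features_x86.cc (the helper below is illustrative; the actual code uses ART's FindVariantInArray over the tables shown in the diff):

```cpp
#include <cstddef>
#include <string>

// Illustrative model of the variant -> AVX2 mapping added by this patch:
// kabylake is the only listed variant carrying AVX and AVX2, so it is the
// only variant for which HasAVX2() lets the FMA simplification run.
static constexpr const char* x86_variants_with_avx2[] = { "kabylake" };

bool VariantHasAVX2(const std::string& variant) {
  for (size_t i = 0; i < sizeof(x86_variants_with_avx2) / sizeof(const char*); ++i) {
    if (variant == x86_variants_with_avx2[i]) {
      return true;
    }
  }
  return false;
}
```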