diff options
author | 2017-04-20 17:28:00 +0000 | |
---|---|---|
committer | 2017-04-20 17:28:01 +0000 | |
commit | f99f62f8e04aecbbe1615e242a19ac475f66e565 (patch) | |
tree | 43f4758d8462343395028fd634430da770a61b83 /compiler/optimizing | |
parent | 4c408ca7262122729fc9b1e53ad439507bd2ec19 (diff) | |
parent | f34dd206d0073fb3949be872224420a8488f551f (diff) |
Merge "ARM64: Support MultiplyAccumulate for SIMD."
Diffstat (limited to 'compiler/optimizing')
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm.cc | 8 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm64.cc | 61 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_vector_arm_vixl.cc | 8 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_vector_mips.cc | 8 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_vector_mips64.cc | 8 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86.cc | 8 | ||||
-rw-r--r-- | compiler/optimizing/code_generator_vector_x86_64.cc | 8 | ||||
-rw-r--r-- | compiler/optimizing/graph_visualizer.cc | 4 | ||||
-rw-r--r-- | compiler/optimizing/instruction_simplifier_arm64.cc | 6 | ||||
-rw-r--r-- | compiler/optimizing/instruction_simplifier_arm64.h | 1 | ||||
-rw-r--r-- | compiler/optimizing/instruction_simplifier_shared.cc | 66 | ||||
-rw-r--r-- | compiler/optimizing/instruction_simplifier_shared.h | 2 | ||||
-rw-r--r-- | compiler/optimizing/nodes.h | 1 | ||||
-rw-r--r-- | compiler/optimizing/nodes_vector.h | 57 |
14 files changed, 246 insertions, 0 deletions
diff --git a/compiler/optimizing/code_generator_vector_arm.cc b/compiler/optimizing/code_generator_vector_arm.cc index 6e82123e56..f8552dcfc9 100644 --- a/compiler/optimizing/code_generator_vector_arm.cc +++ b/compiler/optimizing/code_generator_vector_arm.cc @@ -245,6 +245,14 @@ void InstructionCodeGeneratorARM::VisitVecUShr(HVecUShr* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderARM::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + +void InstructionCodeGeneratorARM::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + void LocationsBuilderARM::VisitVecLoad(HVecLoad* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc index 2dfccfff85..b3eb639142 100644 --- a/compiler/optimizing/code_generator_vector_arm64.cc +++ b/compiler/optimizing/code_generator_vector_arm64.cc @@ -681,6 +681,67 @@ void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr); + switch (instr->GetPackedType()) { + case Primitive::kPrimByte: + case Primitive::kPrimChar: + case Primitive::kPrimShort: + case Primitive::kPrimInt: + locations->SetInAt( + HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister()); + locations->SetInAt( + HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister()); + DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0); + locations->SetOut(Location::SameAsFirstInput()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + 
UNREACHABLE(); + } +} + +// Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a +// 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result. +// However, the vector multiply-accumulate instructions (MLA/MLS) emitted here are not affected. +void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LocationSummary* locations = instr->GetLocations(); + VRegister acc = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex)); + VRegister left = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex)); + VRegister right = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex)); + switch (instr->GetPackedType()) { + case Primitive::kPrimByte: + DCHECK_EQ(16u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::kAdd) { + __ Mla(acc.V16B(), left.V16B(), right.V16B()); + } else { + __ Mls(acc.V16B(), left.V16B(), right.V16B()); + } + break; + case Primitive::kPrimChar: + case Primitive::kPrimShort: + DCHECK_EQ(8u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::kAdd) { + __ Mla(acc.V8H(), left.V8H(), right.V8H()); + } else { + __ Mls(acc.V8H(), left.V8H(), right.V8H()); + } + break; + case Primitive::kPrimInt: + DCHECK_EQ(4u, instr->GetVectorLength()); + if (instr->GetOpKind() == HInstruction::kAdd) { + __ Mla(acc.V4S(), left.V4S(), right.V4S()); + } else { + __ Mls(acc.V4S(), left.V4S(), right.V4S()); + } + break; + default: + LOG(FATAL) << "Unsupported SIMD type"; + } +} + // Helper to set up locations for vector memory operations. 
static void CreateVecMemLocations(ArenaAllocator* arena, HVecMemoryOperation* instruction, diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc index 990178b31b..53f314ec40 100644 --- a/compiler/optimizing/code_generator_vector_arm_vixl.cc +++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc @@ -245,6 +245,14 @@ void InstructionCodeGeneratorARMVIXL::VisitVecUShr(HVecUShr* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + void LocationsBuilderARMVIXL::VisitVecLoad(HVecLoad* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc index 8ea1ca7d90..c4a32252d9 100644 --- a/compiler/optimizing/code_generator_vector_mips.cc +++ b/compiler/optimizing/code_generator_vector_mips.cc @@ -245,6 +245,14 @@ void InstructionCodeGeneratorMIPS::VisitVecUShr(HVecUShr* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + +void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + void LocationsBuilderMIPS::VisitVecLoad(HVecLoad* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc index a484bb4774..50b95c17cb 100644 --- a/compiler/optimizing/code_generator_vector_mips64.cc +++ 
b/compiler/optimizing/code_generator_vector_mips64.cc @@ -245,6 +245,14 @@ void InstructionCodeGeneratorMIPS64::VisitVecUShr(HVecUShr* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } +void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + +void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + void LocationsBuilderMIPS64::VisitVecLoad(HVecLoad* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); } diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index a86d060821..013b092b5a 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -730,6 +730,14 @@ void InstructionCodeGeneratorX86::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + +void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + // Helper to set up locations for vector memory operations. 
static void CreateVecMemLocations(ArenaAllocator* arena, HVecMemoryOperation* instruction, diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index 696735367e..66f19a4376 100644 --- a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -719,6 +719,14 @@ void InstructionCodeGeneratorX86_64::VisitVecUShr(HVecUShr* instruction) { } } +void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + +void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) { + LOG(FATAL) << "No SIMD for " << instr->GetId(); +} + // Helper to set up locations for vector memory operations. static void CreateVecMemLocations(ArenaAllocator* arena, HVecMemoryOperation* instruction, diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 1b2b9f80ac..e5d94c3504 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -514,6 +514,10 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { StartAttributeStream("rounded") << std::boolalpha << hadd->IsRounded() << std::noboolalpha; } + void VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) OVERRIDE { + StartAttributeStream("kind") << instruction->GetOpKind(); + } + #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64) void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE { StartAttributeStream("kind") << instruction->GetOpKind(); diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc index 73b7b2bd95..f16e3727c8 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.cc +++ b/compiler/optimizing/instruction_simplifier_arm64.cc @@ -210,5 +210,11 @@ void 
InstructionSimplifierArm64Visitor::VisitXor(HXor* instruction) { } } +void InstructionSimplifierArm64Visitor::VisitVecMul(HVecMul* instruction) { + if (TryCombineVecMultiplyAccumulate(instruction, kArm64)) { + RecordSimplification(); + } +} + } // namespace arm64 } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h index 65654f50f4..eec4e49792 100644 --- a/compiler/optimizing/instruction_simplifier_arm64.h +++ b/compiler/optimizing/instruction_simplifier_arm64.h @@ -74,6 +74,7 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor { void VisitTypeConversion(HTypeConversion* instruction) OVERRIDE; void VisitUShr(HUShr* instruction) OVERRIDE; void VisitXor(HXor* instruction) OVERRIDE; + void VisitVecMul(HVecMul* instruction) OVERRIDE; OptimizingCompilerStats* stats_; }; diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc index c2b1374f62..7d1f146587 100644 --- a/compiler/optimizing/instruction_simplifier_shared.cc +++ b/compiler/optimizing/instruction_simplifier_shared.cc @@ -278,5 +278,71 @@ bool TryExtractArrayAccessAddress(HInstruction* access, return true; } +bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa) { + Primitive::Type type = mul->GetPackedType(); + switch (isa) { + case kArm64: + if (!(type == Primitive::kPrimByte || + type == Primitive::kPrimChar || + type == Primitive::kPrimShort || + type == Primitive::kPrimInt)) { + return false; + } + break; + default: + return false; + } + + ArenaAllocator* arena = mul->GetBlock()->GetGraph()->GetArena(); + + if (mul->HasOnlyOneNonEnvironmentUse()) { + HInstruction* use = mul->GetUses().front().GetUser(); + if (use->IsVecAdd() || use->IsVecSub()) { + // Replace code looking like + // VECMUL tmp, x, y + // VECADD/SUB dst, acc, tmp + // with + // VECMULACC dst, acc, x, y + // Note that we do not want to (unconditionally) perform the 
merge when the + // multiplication has multiple uses, even if it can be merged into all of them. + // Multiple uses could happen on the same control-flow path, and we would + // then increase the amount of work. In the future we could try to evaluate + // whether all uses are on different control-flow paths (using dominance and + // reverse-dominance information) and only perform the merge when they are. + HInstruction* accumulator = nullptr; + HVecBinaryOperation* binop = use->AsVecBinaryOperation(); + HInstruction* binop_left = binop->GetLeft(); + HInstruction* binop_right = binop->GetRight(); + // Holds because the `HVecMul` has exactly one non-environment use (checked above). + DCHECK_NE(binop_left, binop_right); + if (binop_right == mul) { + accumulator = binop_left; + } else if (use->IsVecAdd()) { + DCHECK_EQ(binop_left, mul); + accumulator = binop_right; + } + + HInstruction::InstructionKind kind = + use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub; + if (accumulator != nullptr) { + HVecMultiplyAccumulate* mulacc = + new (arena) HVecMultiplyAccumulate(arena, + kind, + accumulator, + mul->GetLeft(), + mul->GetRight(), + binop->GetPackedType(), + binop->GetVectorLength()); + + binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc); + DCHECK(!mul->HasUses()); + mul->GetBlock()->RemoveInstruction(mul); + return true; + } + } + } + + return false; +} } // namespace art diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h index 83e3ffca57..2ea103a518 100644 --- a/compiler/optimizing/instruction_simplifier_shared.h +++ b/compiler/optimizing/instruction_simplifier_shared.h @@ -58,6 +58,8 @@ bool TryExtractArrayAccessAddress(HInstruction* access, HInstruction* index, size_t data_offset); +bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa); + } // namespace art #endif // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_ diff --git a/compiler/optimizing/nodes.h 
b/compiler/optimizing/nodes.h index 6be237e612..af953c8f99 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -1382,6 +1382,7 @@ class HLoopInformationOutwardIterator : public ValueObject { M(VecShl, VecBinaryOperation) \ M(VecShr, VecBinaryOperation) \ M(VecUShr, VecBinaryOperation) \ + M(VecMultiplyAccumulate, VecOperation) \ M(VecLoad, VecMemoryOperation) \ M(VecStore, VecMemoryOperation) \ diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index bff58d0910..450691c1ea 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -143,6 +143,10 @@ class HVecBinaryOperation : public HVecOperation { /*number_of_inputs*/ 2, vector_length, dex_pc) { } + + HInstruction* GetLeft() const { return InputAt(0); } + HInstruction* GetRight() const { return InputAt(1); } + DECLARE_ABSTRACT_INSTRUCTION(VecBinaryOperation); private: DISALLOW_COPY_AND_ASSIGN(HVecBinaryOperation); @@ -627,6 +631,59 @@ class HVecUShr FINAL : public HVecBinaryOperation { DISALLOW_COPY_AND_ASSIGN(HVecUShr); }; +// Multiplies the two vectors component-wise and adds the products to (kAdd) or subtracts them +// from (kSub) the accumulator, viz. [ acc1, .., accn ] +/- [ x1, .. , xn ] * [ y1, .. , yn ] = +// [ acc1 +/- x1 * y1, .. , accn +/- xn * yn ]. 
+class HVecMultiplyAccumulate FINAL : public HVecOperation { + public: + HVecMultiplyAccumulate(ArenaAllocator* arena, + InstructionKind op, + HInstruction* accumulator, + HInstruction* mul_left, + HInstruction* mul_right, + Primitive::Type packed_type, + size_t vector_length, + uint32_t dex_pc = kNoDexPc) + : HVecOperation(arena, + packed_type, + SideEffects::None(), + /*number_of_inputs*/ 3, + vector_length, + dex_pc), + op_kind_(op) { + DCHECK(op == InstructionKind::kAdd || op == InstructionKind::kSub); + DCHECK(accumulator->IsVecOperation()); + DCHECK(mul_left->IsVecOperation() && mul_right->IsVecOperation()); + DCHECK_EQ(accumulator->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(mul_left->AsVecOperation()->GetPackedType(), packed_type); + DCHECK_EQ(mul_right->AsVecOperation()->GetPackedType(), packed_type); + + SetRawInputAt(kInputAccumulatorIndex, accumulator); + SetRawInputAt(kInputMulLeftIndex, mul_left); + SetRawInputAt(kInputMulRightIndex, mul_right); + } + + static constexpr int kInputAccumulatorIndex = 0; + static constexpr int kInputMulLeftIndex = 1; + static constexpr int kInputMulRightIndex = 2; + + bool CanBeMoved() const OVERRIDE { return true; } + + bool InstructionDataEquals(const HInstruction* other) const OVERRIDE { + return op_kind_ == other->AsVecMultiplyAccumulate()->op_kind_; + } + + InstructionKind GetOpKind() const { return op_kind_; } + + DECLARE_INSTRUCTION(VecMultiplyAccumulate); + + private: + // Whether the product is added to (kAdd) or subtracted from (kSub) the accumulator. + const InstructionKind op_kind_; + + DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate); +}; + // Loads a vector from memory, viz. load(mem, 1) // yield the vector [ mem(1), .. , mem(n) ]. class HVecLoad FINAL : public HVecMemoryOperation { |