Blame - compiler/optimizing/loop_analysis.cc - LeafOS-Project/android_art

blob: 78505171cb884badfe5994aad5aba6871d908fc5 [file] [log] [blame]

Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2018 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include "loop_analysis.h"
				18
Artem Serov	72411e6	2017-10-19 16:18:07 +0100	[diff] [blame]	19	#include "base/bit_vector-inl.h"
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	20	#include "induction_var_range.h"
Artem Serov	72411e6	2017-10-19 16:18:07 +0100	[diff] [blame]	21
Vladimir Marko	0a51605	2019-10-14 13:00:44 +0000	[diff] [blame]	22	namespace art {
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	23
				24	void LoopAnalysis::CalculateLoopBasicProperties(HLoopInformation* loop_info,
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	25	LoopAnalysisInfo* analysis_results,
				26	int64_t trip_count) {
				27	analysis_results->trip_count_ = trip_count;
				28
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	29	for (HBlocksInLoopIterator block_it(*loop_info);
				30	!block_it.Done();
				31	block_it.Advance()) {
				32	HBasicBlock* block = block_it.Current();
				33
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	34	// Check whether one of the successor is loop exit.
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	35	for (HBasicBlock* successor : block->GetSuccessors()) {
				36	if (!loop_info->Contains(*successor)) {
				37	analysis_results->exits_num_++;
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	38
				39	// We track number of invariant loop exits which correspond to HIf instruction and
				40	// can be eliminated by loop peeling; other control flow instruction are ignored and will
				41	// not cause loop peeling to happen as they either cannot be inside a loop, or by
				42	// definition cannot be loop exits (unconditional instructions), or are not beneficial for
				43	// the optimization.
				44	HIf* hif = block->GetLastInstruction()->AsIf();
				45	if (hif != nullptr && !loop_info->Contains(*hif->InputAt(0)->GetBlock())) {
				46	analysis_results->invariant_exits_num_++;
				47	}
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	48	}
				49	}
				50
				51	for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
				52	HInstruction* instruction = it.Current();
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	53	if (it.Current()->GetType() == DataType::Type::kInt64) {
				54	analysis_results->has_long_type_instructions_ = true;
				55	}
Artem Serov	72411e6	2017-10-19 16:18:07 +0100	[diff] [blame]	56	if (MakesScalarPeelingUnrollingNonBeneficial(instruction)) {
				57	analysis_results->has_instructions_preventing_scalar_peeling_ = true;
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	58	analysis_results->has_instructions_preventing_scalar_unrolling_ = true;
				59	}
				60	analysis_results->instr_num_++;
				61	}
				62	analysis_results->bb_num_++;
				63	}
				64	}
				65
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	66	int64_t LoopAnalysis::GetLoopTripCount(HLoopInformation* loop_info,
				67	const InductionVarRange* induction_range) {
				68	int64_t trip_count;
				69	if (!induction_range->HasKnownTripCount(loop_info, &trip_count)) {
				70	trip_count = LoopAnalysisInfo::kUnknownTripCount;
Artem Serov	72411e6	2017-10-19 16:18:07 +0100	[diff] [blame]	71	}
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	72	return trip_count;
Artem Serov	72411e6	2017-10-19 16:18:07 +0100	[diff] [blame]	73	}
				74
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	75	// Default implementation of loop helper; used for all targets unless a custom implementation
				76	// is provided. Enables scalar loop peeling and unrolling with the most conservative heuristics.
				77	class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper {
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	78	public:
				79	// Scalar loop unrolling parameters and heuristics.
				80	//
				81	// Maximum possible unrolling factor.
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	82	static constexpr uint32_t kScalarMaxUnrollFactor = 2;
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	83	// Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled.
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	84	static constexpr uint32_t kScalarHeuristicMaxBodySizeInstr = 17;
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	85	// Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	86	static constexpr uint32_t kScalarHeuristicMaxBodySizeBlocks = 6;
Artem Serov	18ba1da	2018-05-16 19:06:32 +0100	[diff] [blame]	87	// Maximum number of instructions to be created as a result of full unrolling.
				88	static constexpr uint32_t kScalarHeuristicFullyUnrolledMaxInstrThreshold = 35;
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	89
Roland Levillain	bbc6e7e	2018-08-24 16:58:47 +0100	[diff] [blame]	90	bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* analysis_info) const override {
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	91	return analysis_info->HasLongTypeInstructions() \|\|
				92	IsLoopTooBig(analysis_info,
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	93	kScalarHeuristicMaxBodySizeInstr,
				94	kScalarHeuristicMaxBodySizeBlocks);
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	95	}
				96
Roland Levillain	bbc6e7e	2018-08-24 16:58:47 +0100	[diff] [blame]	97	uint32_t GetScalarUnrollingFactor(const LoopAnalysisInfo* analysis_info) const override {
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	98	int64_t trip_count = analysis_info->GetTripCount();
				99	// Unroll only loops with known trip count.
				100	if (trip_count == LoopAnalysisInfo::kUnknownTripCount) {
				101	return LoopAnalysisInfo::kNoUnrollingFactor;
				102	}
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	103	uint32_t desired_unrolling_factor = kScalarMaxUnrollFactor;
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	104	if (trip_count < desired_unrolling_factor \|\| trip_count % desired_unrolling_factor != 0) {
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	105	return LoopAnalysisInfo::kNoUnrollingFactor;
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	106	}
				107
				108	return desired_unrolling_factor;
				109	}
				110
Roland Levillain	bbc6e7e	2018-08-24 16:58:47 +0100	[diff] [blame]	111	bool IsLoopPeelingEnabled() const override { return true; }
Artem Serov	72411e6	2017-10-19 16:18:07 +0100	[diff] [blame]	112
Roland Levillain	bbc6e7e	2018-08-24 16:58:47 +0100	[diff] [blame]	113	bool IsFullUnrollingBeneficial(LoopAnalysisInfo* analysis_info) const override {
Artem Serov	18ba1da	2018-05-16 19:06:32 +0100	[diff] [blame]	114	int64_t trip_count = analysis_info->GetTripCount();
				115	// We assume that trip count is known.
				116	DCHECK_NE(trip_count, LoopAnalysisInfo::kUnknownTripCount);
				117	size_t instr_num = analysis_info->GetNumberOfInstructions();
				118	return (trip_count * instr_num < kScalarHeuristicFullyUnrolledMaxInstrThreshold);
				119	}
				120
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	121	protected:
				122	bool IsLoopTooBig(LoopAnalysisInfo* loop_analysis_info,
				123	size_t instr_threshold,
				124	size_t bb_threshold) const {
				125	size_t instr_num = loop_analysis_info->GetNumberOfInstructions();
				126	size_t bb_num = loop_analysis_info->GetNumberOfBasicBlocks();
				127	return (instr_num >= instr_threshold \|\| bb_num >= bb_threshold);
				128	}
				129	};
				130
				131	// Custom implementation of loop helper for arm64 target. Enables heuristics for scalar loop
				132	// peeling and unrolling and supports SIMD loop unrolling.
				133	class Arm64LoopHelper : public ArchDefaultLoopHelper {
				134	public:
				135	// SIMD loop unrolling parameters and heuristics.
				136	//
				137	// Maximum possible unrolling factor.
				138	static constexpr uint32_t kArm64SimdMaxUnrollFactor = 8;
				139	// Loop's maximum instruction count. Loops with higher count will not be unrolled.
				140	static constexpr uint32_t kArm64SimdHeuristicMaxBodySizeInstr = 50;
				141
				142	// Loop's maximum instruction count. Loops with higher count will not be peeled/unrolled.
				143	static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeInstr = 40;
				144	// Loop's maximum basic block count. Loops with higher count will not be peeled/unrolled.
				145	static constexpr uint32_t kArm64ScalarHeuristicMaxBodySizeBlocks = 8;
				146
Roland Levillain	bbc6e7e	2018-08-24 16:58:47 +0100	[diff] [blame]	147	bool IsLoopNonBeneficialForScalarOpts(LoopAnalysisInfo* loop_analysis_info) const override {
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	148	return IsLoopTooBig(loop_analysis_info,
				149	kArm64ScalarHeuristicMaxBodySizeInstr,
				150	kArm64ScalarHeuristicMaxBodySizeBlocks);
				151	}
				152
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	153	uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
				154	int64_t trip_count,
				155	uint32_t max_peel,
Roland Levillain	bbc6e7e	2018-08-24 16:58:47 +0100	[diff] [blame]	156	uint32_t vector_length) const override {
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	157	// Don't unroll with insufficient iterations.
				158	// TODO: Unroll loops with unknown trip count.
				159	DCHECK_NE(vector_length, 0u);
				160	if (trip_count < (2 * vector_length + max_peel)) {
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	161	return LoopAnalysisInfo::kNoUnrollingFactor;
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	162	}
				163	// Don't unroll for large loop body size.
				164	uint32_t instruction_count = block->GetInstructions().CountSize();
				165	if (instruction_count >= kArm64SimdHeuristicMaxBodySizeInstr) {
Artem Serov	0e32908	2018-06-12 10:23:27 +0100	[diff] [blame]	166	return LoopAnalysisInfo::kNoUnrollingFactor;
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	167	}
				168	// Find a beneficial unroll factor with the following restrictions:
				169	// - At least one iteration of the transformed loop should be executed.
				170	// - The loop body shouldn't be "too big" (heuristic).
				171
				172	uint32_t uf1 = kArm64SimdHeuristicMaxBodySizeInstr / instruction_count;
				173	uint32_t uf2 = (trip_count - max_peel) / vector_length;
				174	uint32_t unroll_factor =
				175	TruncToPowerOfTwo(std::min({uf1, uf2, kArm64SimdMaxUnrollFactor}));
				176	DCHECK_GE(unroll_factor, 1u);
				177	return unroll_factor;
				178	}
				179	};
				180
Neeraj Solanki	26f6330	2019-10-15 14:04:15 +0530	[diff] [blame]	181	// Custom implementation of loop helper for X86_64 target. Enables heuristics for scalar loop
				182	// peeling and unrolling and supports SIMD loop unrolling.
				183	class X86_64LoopHelper : public ArchDefaultLoopHelper {
				184	// mapping of machine instruction count for most used IR instructions
				185	// Few IRs generate different number of instructions based on input and result type.
				186	// We checked top java apps, benchmarks and used the most generated instruction count.
				187	uint32_t GetMachineInstructionCount(HInstruction* inst) const {
				188	switch (inst->GetKind()) {
				189	case HInstruction::InstructionKind::kAbs:
				190	return 3;
				191	case HInstruction::InstructionKind::kAdd:
				192	return 1;
				193	case HInstruction::InstructionKind::kAnd:
				194	return 1;
				195	case HInstruction::InstructionKind::kArrayLength:
				196	return 1;
				197	case HInstruction::InstructionKind::kArrayGet:
				198	return 1;
				199	case HInstruction::InstructionKind::kArraySet:
				200	return 1;
				201	case HInstruction::InstructionKind::kBoundsCheck:
				202	return 2;
				203	case HInstruction::InstructionKind::kCheckCast:
				204	return 9;
				205	case HInstruction::InstructionKind::kDiv:
				206	return 8;
				207	case HInstruction::InstructionKind::kDivZeroCheck:
				208	return 2;
				209	case HInstruction::InstructionKind::kEqual:
				210	return 3;
				211	case HInstruction::InstructionKind::kGreaterThan:
				212	return 3;
				213	case HInstruction::InstructionKind::kGreaterThanOrEqual:
				214	return 3;
				215	case HInstruction::InstructionKind::kIf:
				216	return 2;
				217	case HInstruction::InstructionKind::kInstanceFieldGet:
				218	return 2;
				219	case HInstruction::InstructionKind::kInstanceFieldSet:
				220	return 1;
				221	case HInstruction::InstructionKind::kLessThan:
				222	return 3;
				223	case HInstruction::InstructionKind::kLessThanOrEqual:
				224	return 3;
				225	case HInstruction::InstructionKind::kMax:
				226	return 2;
				227	case HInstruction::InstructionKind::kMin:
				228	return 2;
				229	case HInstruction::InstructionKind::kMul:
				230	return 1;
				231	case HInstruction::InstructionKind::kNotEqual:
				232	return 3;
				233	case HInstruction::InstructionKind::kOr:
				234	return 1;
				235	case HInstruction::InstructionKind::kRem:
				236	return 11;
				237	case HInstruction::InstructionKind::kSelect:
				238	return 2;
				239	case HInstruction::InstructionKind::kShl:
				240	return 1;
				241	case HInstruction::InstructionKind::kShr:
				242	return 1;
				243	case HInstruction::InstructionKind::kSub:
				244	return 1;
				245	case HInstruction::InstructionKind::kTypeConversion:
				246	return 1;
				247	case HInstruction::InstructionKind::kUShr:
				248	return 1;
				249	case HInstruction::InstructionKind::kVecReplicateScalar:
				250	return 2;
				251	case HInstruction::InstructionKind::kVecExtractScalar:
				252	return 1;
				253	case HInstruction::InstructionKind::kVecReduce:
				254	return 4;
				255	case HInstruction::InstructionKind::kVecNeg:
				256	return 2;
				257	case HInstruction::InstructionKind::kVecAbs:
				258	return 4;
				259	case HInstruction::InstructionKind::kVecNot:
				260	return 3;
				261	case HInstruction::InstructionKind::kVecAdd:
				262	return 1;
				263	case HInstruction::InstructionKind::kVecSub:
				264	return 1;
				265	case HInstruction::InstructionKind::kVecMul:
				266	return 1;
				267	case HInstruction::InstructionKind::kVecDiv:
				268	return 1;
				269	case HInstruction::InstructionKind::kVecMax:
				270	return 1;
				271	case HInstruction::InstructionKind::kVecMin:
				272	return 1;
				273	case HInstruction::InstructionKind::kVecOr:
				274	return 1;
				275	case HInstruction::InstructionKind::kVecXor:
				276	return 1;
				277	case HInstruction::InstructionKind::kVecShl:
				278	return 1;
				279	case HInstruction::InstructionKind::kVecShr:
				280	return 1;
				281	case HInstruction::InstructionKind::kVecLoad:
				282	return 1;
				283	case HInstruction::InstructionKind::kVecStore:
				284	return 1;
				285	case HInstruction::InstructionKind::kXor:
				286	return 1;
				287	default:
				288	return 1;
				289	}
				290	}
				291
				292	// Maximum possible unrolling factor.
				293	static constexpr uint32_t kX86_64MaxUnrollFactor = 2; // pow(2,2) = 4
				294
				295	// According to Intel® 64 and IA-32 Architectures Optimization Reference Manual,
				296	// avoid excessive loop unrolling to ensure LSD (loop stream decoder) is operating efficiently.
				297	// This variable takes care that unrolled loop instructions should not exceed LSD size.
				298	// For Intel Atom processors (silvermont & goldmont), LSD size is 28
				299	// TODO - identify architecture and LSD size at runtime
				300	static constexpr uint32_t kX86_64UnrolledMaxBodySizeInstr = 28;
				301
				302	// Loop's maximum basic block count. Loops with higher count will not be partial
				303	// unrolled (unknown iterations).
				304	static constexpr uint32_t kX86_64UnknownIterMaxBodySizeBlocks = 2;
				305
				306	uint32_t GetUnrollingFactor(HLoopInformation* loop_info, HBasicBlock* header) const;
				307
				308	public:
				309	uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
				310	int64_t trip_count,
				311	uint32_t max_peel,
				312	uint32_t vector_length) const override {
				313	DCHECK_NE(vector_length, 0u);
				314	HLoopInformation* loop_info = block->GetLoopInformation();
				315	DCHECK(loop_info);
				316	HBasicBlock* header = loop_info->GetHeader();
				317	DCHECK(header);
				318	uint32_t unroll_factor = 0;
				319
				320	if ((trip_count == 0) \|\| (trip_count == LoopAnalysisInfo::kUnknownTripCount)) {
				321	// Don't unroll for large loop body size.
				322	unroll_factor = GetUnrollingFactor(loop_info, header);
				323	if (unroll_factor <= 1) {
				324	return LoopAnalysisInfo::kNoUnrollingFactor;
				325	}
				326	} else {
				327	// Don't unroll with insufficient iterations.
				328	if (trip_count < (2 * vector_length + max_peel)) {
				329	return LoopAnalysisInfo::kNoUnrollingFactor;
				330	}
				331
				332	// Don't unroll for large loop body size.
				333	uint32_t unroll_cnt = GetUnrollingFactor(loop_info, header);
				334	if (unroll_cnt <= 1) {
				335	return LoopAnalysisInfo::kNoUnrollingFactor;
				336	}
				337
				338	// Find a beneficial unroll factor with the following restrictions:
				339	// - At least one iteration of the transformed loop should be executed.
				340	// - The loop body shouldn't be "too big" (heuristic).
				341	uint32_t uf2 = (trip_count - max_peel) / vector_length;
				342	unroll_factor = TruncToPowerOfTwo(std::min(uf2, unroll_cnt));
				343	DCHECK_GE(unroll_factor, 1u);
				344	}
				345
				346	return unroll_factor;
				347	}
				348	};
				349
				350	uint32_t X86_64LoopHelper::GetUnrollingFactor(HLoopInformation* loop_info,
				351	HBasicBlock* header) const {
				352	uint32_t num_inst = 0, num_inst_header = 0, num_inst_loop_body = 0;
				353	for (HBlocksInLoopIterator it(*loop_info); !it.Done(); it.Advance()) {
				354	HBasicBlock* block = it.Current();
				355	DCHECK(block);
				356	num_inst = 0;
				357
				358	for (HInstructionIterator it1(block->GetInstructions()); !it1.Done(); it1.Advance()) {
				359	HInstruction* inst = it1.Current();
				360	DCHECK(inst);
				361
				362	// SuspendCheck inside loop is handled with Goto.
				363	// Ignoring SuspendCheck & Goto as partially unrolled loop body will have only one Goto.
				364	// Instruction count for Goto is being handled during unroll factor calculation below.
				365	if (inst->IsSuspendCheck() \|\| inst->IsGoto()) {
				366	continue;
				367	}
				368
				369	num_inst += GetMachineInstructionCount(inst);
				370	}
				371
				372	if (block == header) {
				373	num_inst_header = num_inst;
				374	} else {
				375	num_inst_loop_body += num_inst;
				376	}
				377	}
				378
				379	// Calculate actual unroll factor.
				380	uint32_t unrolling_factor = kX86_64MaxUnrollFactor;
				381	uint32_t unrolling_inst = kX86_64UnrolledMaxBodySizeInstr;
				382	// "-3" for one Goto instruction.
				383	uint32_t desired_size = unrolling_inst - num_inst_header - 3;
				384	if (desired_size < (2 * num_inst_loop_body)) {
				385	return 1;
				386	}
				387
				388	while (unrolling_factor > 0) {
				389	if ((desired_size >> unrolling_factor) >= num_inst_loop_body) {
				390	break;
				391	}
				392	unrolling_factor--;
				393	}
				394
				395	return (1 << unrolling_factor);
				396	}
				397
Artem Serov	cf43fb6	2018-02-15 14:43:48 +0000	[diff] [blame]	398	ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(InstructionSet isa,
				399	ArenaAllocator* allocator) {
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	400	switch (isa) {
				401	case InstructionSet::kArm64: {
				402	return new (allocator) Arm64LoopHelper;
				403	}
Neeraj Solanki	26f6330	2019-10-15 14:04:15 +0530	[diff] [blame]	404	case InstructionSet::kX86_64: {
				405	return new (allocator) X86_64LoopHelper;
				406	}
Artem Serov	121f203	2017-10-23 19:19:06 +0100	[diff] [blame]	407	default: {
				408	return new (allocator) ArchDefaultLoopHelper;
				409	}
				410	}
				411	}
				412
				413	} // namespace art