From f26bb6c74a973fde3d2783ac35324d5ce8def814 Mon Sep 17 00:00:00 2001
From: Artem Serov
Date: Fri, 1 Sep 2017 10:59:03 +0100
Subject: ARM64: Tune SIMD loop unrolling factor heuristic.

Improve SIMD loop unrolling factor heuristic for ARM64 by accounting
for max desired loop size, trip_count, etc. Loop bodies that are empty
or already exceed the desired size are left un-unrolled.

The following example shows 21% perf increase:
  for (int i = 0; i < LENGTH; i++) {
    bc[i] = ba[i];  // Byte arrays
  }

Test: test-art-host, test-art-target.

Change-Id: Ic587759c51aa4354df621ffb1c7ce4ebd798dfc1
---
 compiler/optimizing/loop_optimization.cc | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

(limited to 'compiler/optimizing')

diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index a249cacc93..e150b65628 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -1761,21 +1761,45 @@ void HLoopOptimization::SetPeelingCandidate(const ArrayReference* candidate,
   vector_peeling_candidate_ = candidate;
 }
 
+// Upper bound on the unroll factor chosen for ARM64 SIMD loops.
+static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
+// Heuristic cap on the instruction count of an unrolled ARM64 SIMD loop body.
+static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
+
+// Returns the unrolling factor for the loop body `block`: a power of two in
+// [1, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR] on ARM64, and 1 (no unrolling) elsewhere.
 uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
-  // counts and small loop bodies.
-  // TODO: refine with operation count, remaining iterations, etc.
-  //       Artem had some really cool ideas for this already.
   switch (compiler_driver_->GetInstructionSet()) {
-    case kArm64:
-    case kX86:
-    case kX86_64: {
-      size_t num_instructions = block->GetInstructions().CountSize();
-      if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
-        return 2;
+    case kArm64: {
+      DCHECK_NE(vector_length_, 0u);
+      // TODO: Unroll loops with unknown trip count.
+      if (trip_count < 2 * vector_length_) {
+        return 1;
       }
-      return 1;
+
+      uint32_t instruction_count = block->GetInstructions().CountSize();
+      // A body that is empty or already over budget cannot be unrolled: the former
+      // would divide by zero below and the latter would yield a factor of 0.
+      if (instruction_count == 0u ||
+          instruction_count > ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE) {
+        return 1;
+      }
+
+      // Find a beneficial unroll factor with the following restrictions:
+      //  - At least one iteration of the transformed loop should be executed.
+      //  - The loop body shouldn't be "too big" (heuristic).
+      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
+      // Clamp in 64 bits before narrowing so a huge trip count cannot wrap to 0.
+      uint32_t uf2 = static_cast<uint32_t>(std::min<int64_t>(
+          trip_count / vector_length_, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR));
+      uint32_t unroll_factor =
+          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
+      DCHECK_GE(unroll_factor, 1u);
+
+      return unroll_factor;
     }
+    case kX86:
+    case kX86_64:
     default:
       return 1;
   }
-- 
cgit v1.2.3-59-g8ed1b