ARM64: Tune SIMD loop unrolling factor heuristic.

Improve SIMD loop unrolling factor heuristic for ARM64 by
accounting for max desired loop size, trip_count, etc. The
following example shows 21% perf increase:

  for (int i = 0; i < LENGTH; i++) {
    bc[i] = ba[i];  // Byte arrays
  }

Test: test-art-host, test-art-target.
Change-Id: Ic587759c51aa4354df621ffb1c7ce4ebd798dfc1
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index a249cac..e150b65 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -1761,21 +1761,33 @@
   vector_peeling_candidate_ = candidate;
 }
 
+static constexpr uint32_t ARM64_SIMD_MAXIMUM_UNROLL_FACTOR = 8;
+static constexpr uint32_t ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE = 50;
+
 uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
-  // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
-  // counts and small loop bodies.
-  // TODO: refine with operation count, remaining iterations, etc.
-  //       Artem had some really cool ideas for this already.
   switch (compiler_driver_->GetInstructionSet()) {
-    case kArm64:
-    case kX86:
-    case kX86_64: {
-      size_t num_instructions = block->GetInstructions().CountSize();
-      if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
-        return 2;
+    case kArm64: {
+      DCHECK_NE(vector_length_, 0u);
+      // TODO: Unroll loops with unknown trip count.
+      if (trip_count < 2 * vector_length_) {
+        return 1;
       }
-      return 1;
+
+      uint32_t instruction_count = block->GetInstructions().CountSize();
+
+      // Find a beneficial unroll factor with the following restrictions:
+      //  - At least one iteration of the transformed loop should be executed.
+      //  - The loop body shouldn't be "too big" (heuristic).
+      uint32_t uf1 = ARM64_SIMD_HEURISTIC_MAX_BODY_SIZE / instruction_count;
+      uint32_t uf2 = trip_count / vector_length_;
+      uint32_t unroll_factor =
+          TruncToPowerOfTwo(std::min({uf1, uf2, ARM64_SIMD_MAXIMUM_UNROLL_FACTOR}));
+      DCHECK_GE(unroll_factor, 1u);
+
+      return unroll_factor;
     }
+    case kX86:
+    case kX86_64:
     default:
       return 1;
   }