Unrolling and dynamic loop peeling framework in vectorizer.

Rationale:
This CL introduces the basic framework for dynamically peeling
(to obtain aligned access) and unrolling the vector loop (to reduce
looping overhead and allow more target-specific optimizations
on e.g. SIMD loads and stores).
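
For illustration, with ptc, vtc, and stc denoting the peeling,
vector, and scalar trip counts, and chunk = VL * unroll, the
transformation conceptually rewrites a vectorizable loop into:

    for ( ; i < ptc; i += 1)      // dynamic peeling loop (for alignment)
      <loop-body>
    for ( ; i < vtc; i += chunk)  // vector loop, possibly unrolled
      <vectorized-loop-body>
    for ( ; i < stc; i += 1)      // cleanup loop
      <loop-body>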

NOTE:
The current heuristics are "bogus" and merely meant to exercise
the new framework. This CL focuses on generating correct code in
the vectorizer; the actual heuristics and the memory computations
needed for alignment will be implemented later.

Test: test-art-target, test-art-host

Change-Id: I010af1475f42f92fd1daa6a967d7a85922beace8
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index d249313..d39bc16 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -31,6 +31,9 @@
 // Enables vectorization (SIMDization) in the loop optimizer.
 static constexpr bool kEnableVectorization = true;
 
+// All current SIMD targets want 16-byte alignment.
+static constexpr size_t kAlignedBase = 16;
+
 // Remove the instruction from the graph. A bit more elaborate than the usual
 // instruction removal, since there may be a cycle in the use structure.
 static void RemoveFromCycle(HInstruction* instruction) {
@@ -283,6 +286,9 @@
       simplified_(false),
       vector_length_(0),
       vector_refs_(nullptr),
+      vector_peeling_candidate_(nullptr),
+      vector_runtime_test_a_(nullptr),
+      vector_runtime_test_b_(nullptr),
       vector_map_(nullptr) {
 }
 
@@ -422,23 +428,6 @@
 // Optimization.
 //
 
-bool HLoopOptimization::CanRemoveCycle() {
-  for (HInstruction* i : *iset_) {
-    // We can never remove instructions that have environment
-    // uses when we compile 'debuggable'.
-    if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
-      return false;
-    }
-    // A deoptimization should never have an environment input removed.
-    for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
-      if (use.GetUser()->GetHolder()->IsDeoptimize()) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
 void HLoopOptimization::SimplifyInduction(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
@@ -565,7 +554,7 @@
   if (kEnableVectorization) {
     iset_->clear();  // prepare phi induction
     if (TrySetSimpleLoopHeader(header) &&
-        CanVectorize(node, body, trip_count) &&
+        ShouldVectorize(node, body, trip_count) &&
         TryAssignLastValue(node->loop_info, phi, preheader, /*collect_loop_uses*/ true)) {
       Vectorize(node, body, exit, trip_count);
       graph_->SetHasSIMD(true);  // flag SIMD usage
@@ -580,10 +569,11 @@
 // Intel Press, June, 2004 (http://www.aartbik.com/).
 //
 
-bool HLoopOptimization::CanVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
+bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count) {
   // Reset vector bookkeeping.
   vector_length_ = 0;
   vector_refs_->clear();
+  vector_peeling_candidate_ = nullptr;
   vector_runtime_test_a_ =
   vector_runtime_test_b_ = nullptr;
 
@@ -600,12 +590,9 @@
     }
   }
 
-  // Heuristics. Does vectorization seem profitable?
-  // TODO: refine
-  if (vector_length_ == 0) {
-    return false;  // nothing found
-  } else if (0 < trip_count && trip_count < vector_length_) {
-    return false;  // insufficient iterations
+  // Does vectorization seem profitable?
+  if (!IsVectorizationProfitable(trip_count)) {
+    return false;
   }
 
   // Data dependence analysis. Find each pair of references with same type, where
@@ -645,6 +632,9 @@
     }
   }
 
+  // Consider dynamic loop peeling for alignment.
+  SetPeelingCandidate(trip_count);
+
   // Success!
   return true;
 }
@@ -657,28 +647,52 @@
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
-  // A cleanup is needed for any unknown trip count or for a known trip count
-  // with remainder iterations after vectorization.
-  bool needs_cleanup = trip_count == 0 || (trip_count % vector_length_) != 0;
+  // Pick a loop unrolling factor for the vector loop.
+  uint32_t unroll = GetUnrollingFactor(block, trip_count);
+  uint32_t chunk = vector_length_ * unroll;
+
+  // A cleanup loop is needed, at least, for any unknown trip count or
+  // for a known trip count with remainder iterations after vectorization.
+  bool needs_cleanup = trip_count == 0 || (trip_count % chunk) != 0;
 
   // Adjust vector bookkeeping.
   iset_->clear();  // prepare phi induction
   bool is_simple_loop_header = TrySetSimpleLoopHeader(header);  // fills iset_
   DCHECK(is_simple_loop_header);
+  vector_header_ = header;
+  vector_body_ = block;
 
-  // Generate preheader:
+  // Generate dynamic loop peeling trip count, if needed:
+  // ptc = <peeling-needed-for-candidate>
+  HInstruction* ptc = nullptr;
+  if (vector_peeling_candidate_ != nullptr) {
+    DCHECK_LT(vector_length_, trip_count) << "dynamic peeling currently requires known trip count";
+    //
+    // TODO: Implement this. Compute address of first access memory location and
+    //       compute peeling factor to obtain kAlignedBase alignment.
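+    //       A possible sketch (names hypothetical): with addr the address
+    //       of the first accessed element and size its element size,
+    //         ptc = ((kAlignedBase - addr % kAlignedBase) % kAlignedBase) / size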
+    //
+    needs_cleanup = true;
+  }
+
+  // Generate loop control:
   // stc = <trip-count>;
-  // vtc = stc - stc % VL;
+  // vtc = stc - (stc - ptc) % chunk;
+  // i = 0;
   HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
   HInstruction* vtc = stc;
   if (needs_cleanup) {
-    DCHECK(IsPowerOfTwo(vector_length_));
+    DCHECK(IsPowerOfTwo(chunk));
+    HInstruction* diff = stc;
+    if (ptc != nullptr) {
+      diff = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, ptc));
+    }
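+    // Since chunk is a power of two (checked above), x % chunk is
+    // computed as x & (chunk - 1).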
     HInstruction* rem = Insert(
         preheader, new (global_allocator_) HAnd(induc_type,
-                                                stc,
-                                                graph_->GetIntConstant(vector_length_ - 1)));
+                                                diff,
+                                                graph_->GetIntConstant(chunk - 1)));
     vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
   }
+  vector_index_ = graph_->GetIntConstant(0);
 
   // Generate runtime disambiguation test:
   // vtc = a != b ? vtc : 0;
@@ -691,16 +705,31 @@
     needs_cleanup = true;
   }
 
-  // Generate vector loop:
-  // for (i = 0; i < vtc; i += VL)
+  // Generate dynamic peeling loop for alignment, if needed:
+  // for ( ; i < ptc; i += 1)
+  //    <loop-body>
+  if (ptc != nullptr) {
+    vector_mode_ = kSequential;
+    GenerateNewLoop(node,
+                    block,
+                    graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                    vector_index_,
+                    ptc,
+                    graph_->GetIntConstant(1),
+                    /*unroll*/ 1);
+  }
+
+  // Generate vector loop, possibly further unrolled:
+  // for ( ; i < vtc; i += chunk)
   //    <vectorized-loop-body>
   vector_mode_ = kVector;
   GenerateNewLoop(node,
                   block,
-                  graph_->TransformLoopForVectorization(header, block, exit),
-                  graph_->GetIntConstant(0),
+                  graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
+                  vector_index_,
                   vtc,
-                  graph_->GetIntConstant(vector_length_));
+                  graph_->GetIntConstant(vector_length_),  // increment per unroll
+                  unroll);
   HLoopInformation* vloop = vector_header_->GetLoopInformation();
 
   // Generate cleanup loop, if needed:
@@ -711,9 +740,10 @@
     GenerateNewLoop(node,
                     block,
                     graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
-                    vector_phi_,
+                    vector_index_,
                     stc,
-                    graph_->GetIntConstant(1));
+                    graph_->GetIntConstant(1),
+                    /*unroll*/ 1);
   }
 
   // Remove the original loop by disconnecting the body block
@@ -722,8 +752,9 @@
   while (!header->GetFirstInstruction()->IsGoto()) {
     header->RemoveInstruction(header->GetFirstInstruction());
   }
-  // Update loop hierarchy: the old header now resides in the
-  // same outer loop as the old preheader.
+  // Update loop hierarchy: the old header now resides in the same outer loop
+  // as the old preheader. Note that we don't bother putting sequential
+  // loops back in the hierarchy at this point.
   header->SetLoopInformation(preheader->GetLoopInformation());  // outward
   node->loop_info = vloop;
 }
@@ -733,44 +764,64 @@
                                         HBasicBlock* new_preheader,
                                         HInstruction* lo,
                                         HInstruction* hi,
-                                        HInstruction* step) {
+                                        HInstruction* step,
+                                        uint32_t unroll) {
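+  // Unrolling (unroll > 1) only applies to the vector loop; the sequential
+  // peeling and cleanup loops are always generated with unroll == 1.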
+  DCHECK(unroll == 1 || vector_mode_ == kVector);
   Primitive::Type induc_type = Primitive::kPrimInt;
   // Prepare new loop.
-  vector_map_->clear();
   vector_preheader_ = new_preheader;
   vector_header_ = vector_preheader_->GetSingleSuccessor();
   vector_body_ = vector_header_->GetSuccessors()[1];
-  vector_phi_ = new (global_allocator_) HPhi(global_allocator_,
-                                             kNoRegNumber,
-                                             0,
-                                             HPhi::ToPhiType(induc_type));
+  HPhi* phi = new (global_allocator_) HPhi(global_allocator_,
+                                           kNoRegNumber,
+                                           0,
+                                           HPhi::ToPhiType(induc_type));
   // Generate header and prepare body.
   // for (i = lo; i < hi; i += step)
   //    <loop-body>
-  HInstruction* cond = new (global_allocator_) HAboveOrEqual(vector_phi_, hi);
-  vector_header_->AddPhi(vector_phi_);
+  HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+  vector_header_->AddPhi(phi);
   vector_header_->AddInstruction(cond);
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
-  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
-    DCHECK(vectorized_def);
-  }
-  // Generate body from the instruction map, but in original program order.
-  HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
-  for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
-    auto i = vector_map_->find(it.Current());
-    if (i != vector_map_->end() && !i->second->IsInBlock()) {
-      Insert(vector_body_, i->second);
-      // Deal with instructions that need an environment, such as the scalar intrinsics.
-      if (i->second->NeedsEnvironment()) {
-        i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+  vector_index_ = phi;
+  for (uint32_t u = 0; u < unroll; u++) {
+    // Clear the map, but retain loop invariants set up during unrolling.
+    if (u == 0) {
+      vector_map_->clear();
+    } else {
+      for (auto i = vector_map_->begin(); i != vector_map_->end(); ) {
+        if (i->second->IsVecReplicateScalar()) {
+          DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first));
+          ++i;
+        } else {
+          i = vector_map_->erase(i);
+        }
       }
     }
+    // Generate instruction map.
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
+      DCHECK(vectorized_def);
+    }
+    // Generate body from the instruction map, but in original program order.
+    HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment();
+    for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
+      auto i = vector_map_->find(it.Current());
+      if (i != vector_map_->end() && !i->second->IsInBlock()) {
+        Insert(vector_body_, i->second);
+        // Deal with instructions that need an environment, such as the scalar intrinsics.
+        if (i->second->NeedsEnvironment()) {
+          i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
+        }
+      }
+    }
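+    // Advance the loop index by one step per unrolled copy, so that each
+    // subsequent copy of the body addresses the next portion of elements.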
+    vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
+    Insert(vector_body_, vector_index_);
   }
-  // Finalize increment and phi.
-  HInstruction* inc = new (global_allocator_) HAdd(induc_type, vector_phi_, step);
-  vector_phi_->AddInput(lo);
-  vector_phi_->AddInput(Insert(vector_body_, inc));
+  // Finalize phi for the loop index.
+  phi->AddInput(lo);
+  phi->AddInput(vector_index_);
+  vector_index_ = phi;
 }
 
 // TODO: accept reductions at left-hand-side, mixed-type store idioms, etc.
@@ -795,7 +846,7 @@
         VectorizeUse(node, value, generate_code, type, restrictions)) {
       if (generate_code) {
         GenerateVecSub(index, offset);
-        GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), type);
+        GenerateVecMem(instruction, vector_map_->Get(index), vector_map_->Get(value), offset, type);
       } else {
         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ true));
       }
@@ -852,7 +903,7 @@
         induction_range_.IsUnitStride(instruction, index, &offset)) {
       if (generate_code) {
         GenerateVecSub(index, offset);
-        GenerateVecMem(instruction, vector_map_->Get(index), nullptr, type);
+        GenerateVecMem(instruction, vector_map_->Get(index), nullptr, offset, type);
       } else {
         vector_refs_->insert(ArrayReference(base, offset, type, /*lhs*/ false));
       }
@@ -1164,7 +1215,7 @@
 
 void HLoopOptimization::GenerateVecSub(HInstruction* org, HInstruction* offset) {
   if (vector_map_->find(org) == vector_map_->end()) {
-    HInstruction* subscript = vector_phi_;
+    HInstruction* subscript = vector_index_;
     if (offset != nullptr) {
       subscript = new (global_allocator_) HAdd(Primitive::kPrimInt, subscript, offset);
       if (org->IsPhi()) {
@@ -1178,17 +1229,27 @@
 void HLoopOptimization::GenerateVecMem(HInstruction* org,
                                        HInstruction* opa,
                                        HInstruction* opb,
+                                       HInstruction* offset,
                                        Primitive::Type type) {
   HInstruction* vector = nullptr;
   if (vector_mode_ == kVector) {
     // Vector store or load.
+    HInstruction* base = org->InputAt(0);
     if (opb != nullptr) {
       vector = new (global_allocator_) HVecStore(
-          global_allocator_, org->InputAt(0), opa, opb, type, vector_length_);
+          global_allocator_, base, opa, opb, type, vector_length_);
     } else {
       bool is_string_char_at = org->AsArrayGet()->IsStringCharAt();
       vector = new (global_allocator_) HVecLoad(
-          global_allocator_, org->InputAt(0), opa, type, vector_length_, is_string_char_at);
+          global_allocator_, base, opa, type, vector_length_, is_string_char_at);
+    }
+    // Known dynamically enforced alignment?
+    // TODO: detect offset + constant differences.
+    // TODO: long run, static alignment analysis?
+    if (vector_peeling_candidate_ != nullptr &&
+        vector_peeling_candidate_->base == base &&
+        vector_peeling_candidate_->offset == offset) {
+      vector->AsVecMemoryOperation()->SetAlignment(Alignment(kAlignedBase, 0));
     }
   } else {
     // Scalar store or load.
@@ -1444,6 +1505,47 @@
 }
 
 //
+// Vectorization heuristics.
+//
+
+bool HLoopOptimization::IsVectorizationProfitable(int64_t trip_count) {
+  // Current heuristic: non-empty body with sufficient number
+  // of iterations (if known).
+  // TODO: refine by looking at e.g. operation count, alignment, etc.
+  if (vector_length_ == 0) {
+    return false;  // nothing found
+  } else if (0 < trip_count && trip_count < vector_length_) {
+    return false;  // insufficient iterations
+  }
+  return true;
+}
+
+void HLoopOptimization::SetPeelingCandidate(int64_t trip_count ATTRIBUTE_UNUSED) {
+  // Current heuristic: none.
+  // TODO: implement
+}
+
+uint32_t HLoopOptimization::GetUnrollingFactor(HBasicBlock* block, int64_t trip_count) {
+  // Current heuristic: unroll by 2 on ARM64/X86 for large known trip
+  // counts and small loop bodies.
+  // TODO: refine with operation count, remaining iterations, etc.
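+  //       E.g., with vector_length_ == 4, only a loop with a known trip
+  //       count of at least 16 and a body of at most 10 instructions is
+  //       unrolled (by 2).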
+  switch (compiler_driver_->GetInstructionSet()) {
+    case kArm64:
+    case kX86:
+    case kX86_64: {
+      size_t num_instructions = block->GetInstructions().CountSize();
+      if (num_instructions <= 10 && trip_count >= 4 * vector_length_) {
+        return 2;
+      }
+      return 1;
+    }
+    default:
+      return 1;
+  }
+}
+
+//
 // Helpers.
 //
 
@@ -1576,8 +1678,8 @@
       size_t index = it->GetIndex();
       ++it;  // increment before replacing
       if (iset_->find(user->GetHolder()) == iset_->end()) {  // not excluded?
-        HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
         // Only update environment uses after the loop.
+        HLoopInformation* other_loop_info = user->GetHolder()->GetBlock()->GetLoopInformation();
         if (other_loop_info == nullptr || !other_loop_info->IsIn(*loop_info)) {
           user->RemoveAsUserOfInput(index);
           user->SetRawEnvAt(index, replacement);
@@ -1614,4 +1716,21 @@
   }
 }
 
+bool HLoopOptimization::CanRemoveCycle() {
+  for (HInstruction* i : *iset_) {
+    // We can never remove instructions that have environment
+    // uses when we compile 'debuggable'.
+    if (i->HasEnvironmentUses() && graph_->IsDebuggable()) {
+      return false;
+    }
+    // A deoptimization should never have an environment input removed.
+    for (const HUseListNode<HEnvironment*>& use : i->GetEnvUses()) {
+      if (use.GetUser()->GetHolder()->IsDeoptimize()) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 }  // namespace art