Basic SIMD reduction support.

Rationale:
Enables vectorization of x += .... for very basic (simple, same-type)
constructs. Paves the way for more complex (narrower and/or mixed-type)
constructs, which will be handled by the next CL.

This is  a revert   of Icb5d6c805516db0a1d911c3ede9a246ccef89a22
and thus a revert^2 of I2454778dd0ef1da915c178c7274e1cf33e271d0f
and thus a revert^3 of I1c1c87b6323e01442e8fbd94869ddc9e760ea1fc
and thus a revert^4 of I7880c135aee3ed0a39da9ae5b468cbf80e613766

PS1-2 shows what needed to change

Test: test-art-host test-art-target

Bug: 64091002
Change-Id: I647889e0da0959ca405b70081b79c7d3c9bcb2e9
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 027ba77..f8f4eb2 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -285,6 +285,19 @@
   return false;
 }
 
+// Translates operation to reduction kind.
+static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) {
+  if (reduction->IsVecAdd() || reduction->IsVecSub()) {
+    return HVecReduce::kSum;
+  } else if (reduction->IsVecMin()) {
+    return HVecReduce::kMin;
+  } else if (reduction->IsVecMax()) {
+    return HVecReduce::kMax;
+  }
+  LOG(FATAL) << "Unsupported SIMD reduction";
+  UNREACHABLE();
+}
+
 // Test vector restrictions.
 static bool HasVectorRestrictions(uint64_t restrictions, uint64_t tested) {
   return (restrictions & tested) != 0;
@@ -334,7 +347,8 @@
       vector_peeling_candidate_(nullptr),
       vector_runtime_test_a_(nullptr),
       vector_runtime_test_b_(nullptr),
-      vector_map_(nullptr) {
+      vector_map_(nullptr),
+      vector_permanent_map_(nullptr) {
 }
 
 void HLoopOptimization::Run() {
@@ -388,11 +402,14 @@
     ArenaSet<ArrayReference> refs(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
     ArenaSafeMap<HInstruction*, HInstruction*> map(
         std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+    ArenaSafeMap<HInstruction*, HInstruction*> perm(
+        std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization));
     // Attach.
     iset_ = &iset;
     reductions_ = &reds;
     vector_refs_ = &refs;
     vector_map_ = &map;
+    vector_permanent_map_ = &perm;
     // Traverse.
     TraverseLoopsInnerToOuter(top_loop_);
     // Detach.
@@ -400,6 +417,7 @@
     reductions_ = nullptr;
     vector_refs_ = nullptr;
     vector_map_ = nullptr;
+    vector_permanent_map_ = nullptr;
   }
 }
 
@@ -603,7 +621,6 @@
   // Vectorize loop, if possible and valid.
   if (kEnableVectorization &&
       TrySetSimpleLoopHeader(header, &main_phi) &&
-      reductions_->empty() &&  // TODO: possible with some effort
       ShouldVectorize(node, body, trip_count) &&
       TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) {
     Vectorize(node, body, exit, trip_count);
@@ -802,6 +819,13 @@
                     /*unroll*/ 1);
   }
 
+  // Link reductions to their final uses.
+  for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
+    if (i->first->IsPhi()) {
+      i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second));
+    }
+  }
+
   // Remove the original loop by disconnecting the body block
   // and removing all instructions from the header.
   block->DisconnectAndDelete();
@@ -841,21 +865,10 @@
   vector_header_->AddInstruction(cond);
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
   vector_index_ = phi;
+  vector_permanent_map_->clear();  // preserved over unrolling
   for (uint32_t u = 0; u < unroll; u++) {
-    // Clear map, leaving loop invariants setup during unrolling.
-    if (u == 0) {
-      vector_map_->clear();
-    } else {
-      for (auto i = vector_map_->begin(); i != vector_map_->end(); ) {
-        if (i->second->IsVecReplicateScalar()) {
-          DCHECK(node->loop_info->IsDefinedOutOfTheLoop(i->first));
-          ++i;
-        } else {
-          i = vector_map_->erase(i);
-        }
-      }
-    }
     // Generate instruction map.
+    vector_map_->clear();
     for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
       bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true);
       DCHECK(vectorized_def);
@@ -872,9 +885,17 @@
         }
       }
     }
+    // Generate the induction.
     vector_index_ = new (global_allocator_) HAdd(induc_type, vector_index_, step);
     Insert(vector_body_, vector_index_);
   }
+  // Finalize phi inputs for the reductions (if any).
+  for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
+    if (!i->first->IsPhi()) {
+      DCHECK(i->second->IsPhi());
+      GenerateVecReductionPhiInputs(i->second->AsPhi(), i->first);
+    }
+  }
   // Finalize phi inputs for the loop index.
   phi->AddInput(lo);
   phi->AddInput(vector_index_);
@@ -910,6 +931,23 @@
     }
     return false;
   }
+  // Accept a left-hand-side reduction for
+  // (1) supported vector type,
+  // (2) vectorizable right-hand-side value.
+  auto redit = reductions_->find(instruction);
+  if (redit != reductions_->end()) {
+    Primitive::Type type = instruction->GetType();
+    if (TrySetVectorType(type, &restrictions) &&
+        VectorizeUse(node, instruction, generate_code, type, restrictions)) {
+      if (generate_code) {
+        HInstruction* new_red = vector_map_->Get(instruction);
+        vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second));
+        vector_permanent_map_->Overwrite(redit->second, new_red);
+      }
+      return true;
+    }
+    return false;
+  }
   // Branch back okay.
   if (instruction->IsGoto()) {
     return true;
@@ -965,6 +1003,21 @@
       }
       return true;
     }
+  } else if (instruction->IsPhi()) {
+    // Accept particular phi operations.
+    if (reductions_->find(instruction) != reductions_->end()) {
+      // Deal with vector restrictions.
+      if (HasVectorRestrictions(restrictions, kNoReduction)) {
+        return false;
+      }
+      // Accept a reduction.
+      if (generate_code) {
+        GenerateVecReductionPhi(instruction->AsPhi());
+      }
+      return true;
+    }
+    // TODO: accept right-hand-side induction?
+    return false;
   } else if (instruction->IsTypeConversion()) {
     // Accept particular type conversions.
     HTypeConversion* conversion = instruction->AsTypeConversion();
@@ -1155,14 +1208,14 @@
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoReduction;
           return TrySetVectorLength(8);
         case Primitive::kPrimChar:
         case Primitive::kPrimShort:
-          *restrictions |= kNoDiv | kNoStringCharAt;
+          *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
           return TrySetVectorLength(4);
         case Primitive::kPrimInt:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoReduction;
           return TrySetVectorLength(2);
         default:
           break;
@@ -1174,11 +1227,11 @@
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoReduction;
           return TrySetVectorLength(16);
         case Primitive::kPrimChar:
         case Primitive::kPrimShort:
-          *restrictions |= kNoDiv;
+          *restrictions |= kNoDiv | kNoReduction;
           return TrySetVectorLength(8);
         case Primitive::kPrimInt:
           *restrictions |= kNoDiv;
@@ -1187,8 +1240,10 @@
           *restrictions |= kNoDiv | kNoMul | kNoMinMax;
           return TrySetVectorLength(2);
         case Primitive::kPrimFloat:
+          *restrictions |= kNoReduction;
           return TrySetVectorLength(4);
         case Primitive::kPrimDouble:
+          *restrictions |= kNoReduction;
           return TrySetVectorLength(2);
         default:
           return false;
@@ -1200,11 +1255,12 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
+            *restrictions |=
+                kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd;
+            *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
             *restrictions |= kNoDiv;
@@ -1213,10 +1269,10 @@
             *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
-            *restrictions |= kNoMinMax;  // -0.0 vs +0.0
+            *restrictions |= kNoMinMax | kNoReduction;  // minmax: -0.0 vs +0.0
             return TrySetVectorLength(4);
           case Primitive::kPrimDouble:
-            *restrictions |= kNoMinMax;  // -0.0 vs +0.0
+            *restrictions |= kNoMinMax | kNoReduction;  // minmax: -0.0 vs +0.0
             return TrySetVectorLength(2);
           default:
             break;
@@ -1228,23 +1284,23 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoStringCharAt;
+            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(4);
           case Primitive::kPrimLong:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
-            *restrictions |= kNoMinMax;  // min/max(x, NaN)
+            *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
             return TrySetVectorLength(4);
           case Primitive::kPrimDouble:
-            *restrictions |= kNoMinMax;  // min/max(x, NaN)
+            *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
             return TrySetVectorLength(2);
           default:
             break;
@@ -1256,23 +1312,23 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoStringCharAt;
+            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(4);
           case Primitive::kPrimLong:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoReduction;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
-            *restrictions |= kNoMinMax;  // min/max(x, NaN)
+            *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
             return TrySetVectorLength(4);
           case Primitive::kPrimDouble:
-            *restrictions |= kNoMinMax;  // min/max(x, NaN)
+            *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
             return TrySetVectorLength(2);
           default:
             break;
@@ -1305,9 +1361,16 @@
       return;
     }
     // In vector code, explicit scalar expansion is needed.
-    HInstruction* vector = new (global_allocator_) HVecReplicateScalar(
-        global_allocator_, org, type, vector_length_);
-    vector_map_->Put(org, Insert(vector_preheader_, vector));
+    HInstruction* vector = nullptr;
+    auto it = vector_permanent_map_->find(org);
+    if (it != vector_permanent_map_->end()) {
+      vector = it->second;  // reuse during unrolling
+    } else {
+      vector = new (global_allocator_) HVecReplicateScalar(
+          global_allocator_, org, type, vector_length_);
+      vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
+    }
+    vector_map_->Put(org, vector);
   }
 }
 
@@ -1362,6 +1425,78 @@
   vector_map_->Put(org, vector);
 }
 
+void HLoopOptimization::GenerateVecReductionPhi(HPhi* phi) {
+  DCHECK(reductions_->find(phi) != reductions_->end());
+  DCHECK(reductions_->Get(phi->InputAt(1)) == phi);
+  HInstruction* vector = nullptr;
+  if (vector_mode_ == kSequential) {
+    HPhi* new_phi = new (global_allocator_) HPhi(
+        global_allocator_, kNoRegNumber, 0, phi->GetType());
+    vector_header_->AddPhi(new_phi);
+    vector = new_phi;
+  } else {
+    // Link vector reduction back to prior unrolled update, or a first phi.
+    auto it = vector_permanent_map_->find(phi);
+    if (it != vector_permanent_map_->end()) {
+      vector = it->second;
+    } else {
+      HPhi* new_phi = new (global_allocator_) HPhi(
+          global_allocator_, kNoRegNumber, 0, HVecOperation::kSIMDType);
+      vector_header_->AddPhi(new_phi);
+      vector = new_phi;
+    }
+  }
+  vector_map_->Put(phi, vector);
+}
+
+void HLoopOptimization::GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction) {
+  HInstruction* new_phi = vector_map_->Get(phi);
+  HInstruction* new_init = reductions_->Get(phi);
+  HInstruction* new_red = vector_map_->Get(reduction);
+  // Link unrolled vector loop back to new phi.
+  for (; !new_phi->IsPhi(); new_phi = vector_permanent_map_->Get(new_phi)) {
+    DCHECK(new_phi->IsVecOperation());
+  }
+  // Prepare the new initialization.
+  if (vector_mode_ == kVector) {
+    // Generate a [initial, 0, .., 0] vector.
+    new_init = Insert(
+            vector_preheader_,
+            new (global_allocator_) HVecSetScalars(
+                global_allocator_, &new_init, phi->GetType(), vector_length_, 1));
+  } else {
+    new_init = ReduceAndExtractIfNeeded(new_init);
+  }
+  // Set the phi inputs.
+  DCHECK(new_phi->IsPhi());
+  new_phi->AsPhi()->AddInput(new_init);
+  new_phi->AsPhi()->AddInput(new_red);
+  // New feed value for next phi (safe mutation in iteration).
+  reductions_->find(phi)->second = new_phi;
+}
+
+HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruction) {
+  if (instruction->IsPhi()) {
+    HInstruction* input = instruction->InputAt(1);
+    if (input->IsVecOperation()) {
+      Primitive::Type type = input->AsVecOperation()->GetPackedType();
+      HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0];
+      // Generate a vector reduction and scalar extract
+      //    x = REDUCE( [x_1, .., x_n] )
+      //    y = x_1
+      // along the exit of the defining loop.
+      HVecReduce::ReductionKind kind = GetReductionKind(input);
+      HInstruction* reduce = new (global_allocator_) HVecReduce(
+          global_allocator_, instruction, type, vector_length_, kind);
+      exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction());
+      instruction = new (global_allocator_) HVecExtractScalar(
+          global_allocator_, reduce, type, vector_length_, 0);
+      exit->InsertInstructionAfter(instruction, reduce);
+    }
+  }
+  return instruction;
+}
+
 #define GENERATE_VEC(x, y) \
   if (vector_mode_ == kVector) { \
     vector = (x); \
@@ -1542,10 +1677,9 @@
   // Test for top level arithmetic shift right x >> 1 or logical shift right x >>> 1
   // (note whether the sign bit in wider precision is shifted in has no effect
   // on the narrow precision computed by the idiom).
-  int64_t distance = 0;
   if ((instruction->IsShr() ||
        instruction->IsUShr()) &&
-      IsInt64AndGet(instruction->InputAt(1), /*out*/ &distance) && distance == 1) {
+      IsInt64Value(instruction->InputAt(1), 1)) {
     // Test for (a + b + c) >> 1 for optional constant c.
     HInstruction* a = nullptr;
     HInstruction* b = nullptr;