ART: Implement predicated SIMD vectorization.

This CL brings support for predicated execution to the
auto-vectorizer and implements an arm64 SVE vector backend.

This version passes all the VIXL simulator-runnable tests in
SVE mode with the checker off (as all VecOp CHECKs still need to
be adjusted for the extra predicate input) and all tests in NEON mode.

Test: art SIMD tests on VIXL simulator.
Test: art tests on FVP (steps in test/README.arm_fvp.md)

Change-Id: Ib78bde31a15e6713d875d6668ad4458f5519605f
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 4c9b01c..1210dbe 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -473,6 +473,7 @@
       iset_(nullptr),
       reductions_(nullptr),
       simplified_(false),
+      predicated_vectorization_mode_(codegen.SupportsPredicatedSIMD()),
       vector_length_(0),
       vector_refs_(nullptr),
       vector_static_peeling_factor_(0),
@@ -486,10 +487,7 @@
       vector_header_(nullptr),
       vector_body_(nullptr),
       vector_index_(nullptr),
-      arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_options_ != nullptr
-                                                          ? compiler_options_->GetInstructionSet()
-                                                          : InstructionSet::kNone,
-                                                      global_allocator_)) {
+      arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) {
 }
 
 bool HLoopOptimization::Run() {
@@ -1024,8 +1022,10 @@
     }
   }  // for i
 
-  // Find a suitable alignment strategy.
-  SetAlignmentStrategy(peeling_votes, peeling_candidate);
+  if (!IsInPredicatedVectorizationMode()) {
+    // Find a suitable alignment strategy.
+    SetAlignmentStrategy(peeling_votes, peeling_candidate);
+  }
 
   // Does vectorization seem profitable?
   if (!IsVectorizationProfitable(trip_count)) {
@@ -1052,8 +1052,8 @@
 
   // A cleanup loop is needed, at least, for any unknown trip count or
   // for a known trip count with remainder iterations after vectorization.
-  bool needs_cleanup = trip_count == 0 ||
-      ((trip_count - vector_static_peeling_factor_) % chunk) != 0;
+  bool needs_cleanup = !IsInPredicatedVectorizationMode() &&
+      (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0);
 
   // Adjust vector bookkeeping.
   HPhi* main_phi = nullptr;
@@ -1071,11 +1071,13 @@
   // ptc = <peeling factor>;
   HInstruction* ptc = nullptr;
   if (vector_static_peeling_factor_ != 0) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     // Static loop peeling for SIMD alignment (using the most suitable
     // fixed peeling factor found during prior alignment analysis).
     DCHECK(vector_dynamic_peeling_candidate_ == nullptr);
     ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_);
   } else if (vector_dynamic_peeling_candidate_ != nullptr) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     // Dynamic loop peeling for SIMD alignment (using the most suitable
     // candidate found during prior alignment analysis):
     // rem = offset % ALIGN;    // adjusted as #elements
@@ -1106,6 +1108,7 @@
   HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
   HInstruction* vtc = stc;
   if (needs_cleanup) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     DCHECK(IsPowerOfTwo(chunk));
     HInstruction* diff = stc;
     if (ptc != nullptr) {
@@ -1143,6 +1146,7 @@
   //       moved around during suspend checks, since all analysis was based on
   //       nothing more than the Android runtime alignment conventions.
   if (ptc != nullptr) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     vector_mode_ = kSequential;
     GenerateNewLoop(node,
                     block,
@@ -1170,6 +1174,7 @@
   // for ( ; i < stc; i += 1)
   //    <loop-body>
   if (needs_cleanup) {
+    DCHECK(!IsInPredicatedVectorizationMode() || vector_runtime_test_a_ != nullptr);
     vector_mode_ = kSequential;
     GenerateNewLoop(node,
                     block,
@@ -1227,9 +1232,35 @@
   // Generate header and prepare body.
   // for (i = lo; i < hi; i += step)
   //    <loop-body>
-  HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
-  vector_header_->AddPhi(phi);
-  vector_header_->AddInstruction(cond);
+  HInstruction* cond = nullptr;
+  HInstruction* set_pred = nullptr;
+  if (IsInPredicatedVectorizationMode()) {
+    HVecPredWhile* pred_while =
+        new (global_allocator_) HVecPredWhile(global_allocator_,
+                                              phi,
+                                              hi,
+                                              HVecPredWhile::CondKind::kLO,
+                                              DataType::Type::kInt32,
+                                              vector_length_,
+                                              0u);
+
+    cond = new (global_allocator_) HVecPredCondition(global_allocator_,
+                                                     pred_while,
+                                                     HVecPredCondition::PCondKind::kNFirst,
+                                                     DataType::Type::kInt32,
+                                                     vector_length_,
+                                                     0u);
+
+    vector_header_->AddPhi(phi);
+    vector_header_->AddInstruction(pred_while);
+    vector_header_->AddInstruction(cond);
+    set_pred = pred_while;
+  } else {
+    cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+    vector_header_->AddPhi(phi);
+    vector_header_->AddInstruction(cond);
+  }
+
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
   vector_index_ = phi;
   vector_permanent_map_->clear();  // preserved over unrolling
@@ -1246,6 +1277,10 @@
       auto i = vector_map_->find(it.Current());
       if (i != vector_map_->end() && !i->second->IsInBlock()) {
         Insert(vector_body_, i->second);
+        if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) {
+          HVecOperation* op = i->second->AsVecOperation();
+          op->SetMergingGoverningPredicate(set_pred);
+        }
         // Deal with instructions that need an environment, such as the scalar intrinsics.
         if (i->second->NeedsEnvironment()) {
           i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
@@ -1360,7 +1395,10 @@
   } else if (instruction->IsArrayGet()) {
     // Deal with vector restrictions.
     bool is_string_char_at = instruction->AsArrayGet()->IsStringCharAt();
-    if (is_string_char_at && HasVectorRestrictions(restrictions, kNoStringCharAt)) {
+
+    if (is_string_char_at && (HasVectorRestrictions(restrictions, kNoStringCharAt) ||
+                              IsInPredicatedVectorizationMode())) {
+      // TODO: Support CharAt for predicated mode.
       return false;
     }
     // Accept a right-hand-side array base[index] for
@@ -1575,32 +1613,73 @@
       }
       return false;
     case InstructionSet::kArm64:
-      // Allow vectorization for all ARM devices, because Android assumes that
-      // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
-      switch (type) {
-        case DataType::Type::kBool:
-        case DataType::Type::kUint8:
-        case DataType::Type::kInt8:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(type, 16);
-        case DataType::Type::kUint16:
-        case DataType::Type::kInt16:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(type, 8);
-        case DataType::Type::kInt32:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(type, 4);
-        case DataType::Type::kInt64:
-          *restrictions |= kNoDiv | kNoMul;
-          return TrySetVectorLength(type, 2);
-        case DataType::Type::kFloat32:
-          *restrictions |= kNoReduction;
-          return TrySetVectorLength(type, 4);
-        case DataType::Type::kFloat64:
-          *restrictions |= kNoReduction;
-          return TrySetVectorLength(type, 2);
-        default:
-          return false;
+      if (IsInPredicatedVectorizationMode()) {
+        // SVE vectorization.
+        CHECK(features->AsArm64InstructionSetFeatures()->HasSVE());
+        switch (type) {
+          case DataType::Type::kBool:
+          case DataType::Type::kUint8:
+          case DataType::Type::kInt8:
+            *restrictions |= kNoDiv |
+                             kNoSignedHAdd |
+                             kNoUnsignedHAdd |
+                             kNoUnroundedHAdd |
+                             kNoSAD;
+            return TrySetVectorLength(type, 16);
+          case DataType::Type::kUint16:
+          case DataType::Type::kInt16:
+            *restrictions |= kNoDiv |
+                             kNoSignedHAdd |
+                             kNoUnsignedHAdd |
+                             kNoUnroundedHAdd |
+                             kNoSAD |
+                             kNoDotProd;
+            return TrySetVectorLength(type, 8);
+          case DataType::Type::kInt32:
+            *restrictions |= kNoDiv | kNoSAD;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kInt64:
+            *restrictions |= kNoDiv | kNoSAD;
+            return TrySetVectorLength(type, 2);
+          case DataType::Type::kFloat32:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kFloat64:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 2);
+          default:
+            break;
+        }
+        return false;
+      } else {
+        // Allow vectorization for all ARM devices, because Android assumes that
+        // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
+        switch (type) {
+          case DataType::Type::kBool:
+          case DataType::Type::kUint8:
+          case DataType::Type::kInt8:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 16);
+          case DataType::Type::kUint16:
+          case DataType::Type::kInt16:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 8);
+          case DataType::Type::kInt32:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kInt64:
+            *restrictions |= kNoDiv | kNoMul;
+            return TrySetVectorLength(type, 2);
+          case DataType::Type::kFloat32:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kFloat64:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 2);
+          default:
+            break;
+        }
+        return false;
       }
     case InstructionSet::kX86:
     case InstructionSet::kX86_64:
@@ -1693,6 +1772,15 @@
       vector = new (global_allocator_)
           HVecReplicateScalar(global_allocator_, input, type, vector_length_, kNoDexPc);
       vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
+      if (IsInPredicatedVectorizationMode()) {
+        HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                          graph_->GetIntConstant(1),
+                                                                          type,
+                                                                          vector_length_,
+                                                                          0u);
+        vector_preheader_->InsertInstructionBefore(set_pred, vector);
+        vector->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+      }
     }
     vector_map_->Put(org, vector);
   }
@@ -1821,6 +1909,15 @@
                                                                     vector_length,
                                                                     kNoDexPc));
     }
+    if (IsInPredicatedVectorizationMode()) {
+      HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                        graph_->GetIntConstant(1),
+                                                                        type,
+                                                                        vector_length,
+                                                                        0u);
+      vector_preheader_->InsertInstructionBefore(set_pred, new_init);
+      new_init->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+    }
   } else {
     new_init = ReduceAndExtractIfNeeded(new_init);
   }
@@ -1852,6 +1949,17 @@
       instruction = new (global_allocator_) HVecExtractScalar(
           global_allocator_, reduce, type, vector_length, 0, kNoDexPc);
       exit->InsertInstructionAfter(instruction, reduce);
+
+      if (IsInPredicatedVectorizationMode()) {
+        HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                          graph_->GetIntConstant(1),
+                                                                          type,
+                                                                          vector_length,
+                                                                          0u);
+        exit->InsertInstructionBefore(set_pred, reduce);
+        reduce->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+        instruction->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+      }
     }
   }
   return instruction;
@@ -1991,7 +2099,8 @@
         return false;
       }
       // Deal with vector restrictions.
-      if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
+      if ((is_unsigned && HasVectorRestrictions(restrictions, kNoUnsignedHAdd)) ||
+          (!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
           (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
         return false;
       }