diff options
26 files changed, 1831 insertions, 216 deletions
diff --git a/TEST_MAPPING b/TEST_MAPPING index 38ff5eda55..d4ea0d8428 100644 --- a/TEST_MAPPING +++ b/TEST_MAPPING @@ -1181,6 +1181,9 @@ "name": "art-run-test-660-checker-simd-sad[com.google.android.art.apex]" }, { + "name": "art-run-test-661-checker-simd-cf-loops[com.google.android.art.apex]" + }, + { "name": "art-run-test-661-checker-simd-reduc[com.google.android.art.apex]" }, { @@ -2600,6 +2603,9 @@ "name": "art-run-test-660-checker-simd-sad" }, { + "name": "art-run-test-661-checker-simd-cf-loops" + }, + { "name": "art-run-test-661-checker-simd-reduc" }, { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index 13734d7fb8..bf25418a2f 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -554,12 +554,31 @@ class InstructionCodeGeneratorARM64Sve : public InstructionCodeGeneratorARM64 { // register size (full SIMD register is used). void ValidateVectorLength(HVecOperation* instr) const; - // Returns default predicate register which is used as governing vector predicate - // to implement predicated loop execution. + vixl::aarch64::PRegister GetVecGoverningPReg(HVecOperation* instr) { + return GetVecPredSetFixedOutPReg(instr->GetGoverningPredicate()); + } + + // Returns a fixed p-reg for predicate setting instruction. // - // TODO: This is a hack to be addressed when register allocator supports SIMD types. - static vixl::aarch64::PRegister LoopPReg() { - return vixl::aarch64::p0; + // Currently we only support diamond CF loops for predicated vectorization; also we don't have + // register allocator support for vector predicates. Thus we use fixed P-regs for loop main, + // True and False predicates as a temporary solution. + // + // TODO: Support SIMD types and registers in ART. 
+ static vixl::aarch64::PRegister GetVecPredSetFixedOutPReg(HVecPredSetOperation* instr) { + if (instr->IsVecPredWhile() || instr->IsVecPredSetAll()) { + // VecPredWhile and VecPredSetAll live ranges never overlap due to the current vectorization + // scheme: the former only is live inside a vectorized loop and the later is never in a + // loop and never spans across loops. + return vixl::aarch64::p0; + } else if (instr->IsVecPredNot()) { + // This relies on the fact that we only use PredNot manually in the autovectorizer, + // so there is only one of them in each loop. + return vixl::aarch64::p1; + } else { + DCHECK(instr->IsVecCondition()); + return vixl::aarch64::p2; + } } }; diff --git a/compiler/optimizing/code_generator_riscv64.cc b/compiler/optimizing/code_generator_riscv64.cc index c7cd26c83f..df40337f51 100644 --- a/compiler/optimizing/code_generator_riscv64.cc +++ b/compiler/optimizing/code_generator_riscv64.cc @@ -2017,12 +2017,32 @@ void InstructionCodeGeneratorRISCV64::VisitVecPredWhile(HVecPredWhile* instructi LOG(FATAL) << "Unimplemented"; } -void LocationsBuilderRISCV64::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderRISCV64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { UNUSED(instruction); LOG(FATAL) << "Unimplemented"; } -void InstructionCodeGeneratorRISCV64::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorRISCV64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + UNUSED(instruction); + LOG(FATAL) << "Unimplemented"; +} + +void LocationsBuilderRISCV64::VisitVecCondition(HVecCondition* instruction) { + UNUSED(instruction); + LOG(FATAL) << "Unimplemented"; +} + +void InstructionCodeGeneratorRISCV64::VisitVecCondition(HVecCondition* instruction) { + UNUSED(instruction); + LOG(FATAL) << "Unimplemented"; +} + +void LocationsBuilderRISCV64::VisitVecPredNot(HVecPredNot* instruction) { + UNUSED(instruction); + LOG(FATAL) << "Unimplemented"; +} + +void 
InstructionCodeGeneratorRISCV64::VisitVecPredNot(HVecPredNot* instruction) { UNUSED(instruction); LOG(FATAL) << "Unimplemented"; } diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc index ce02bfa21a..848b5e7567 100644 --- a/compiler/optimizing/code_generator_vector_arm64_neon.cc +++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc @@ -1531,12 +1531,32 @@ void InstructionCodeGeneratorARM64Neon::VisitVecPredWhile(HVecPredWhile* instruc UNREACHABLE(); } -void LocationsBuilderARM64Neon::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderARM64Neon::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } -void InstructionCodeGeneratorARM64Neon::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorARM64Neon::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderARM64Neon::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorARM64Neon::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderARM64Neon::VisitVecPredNot(HVecPredNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorARM64Neon::VisitVecPredNot(HVecPredNot* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc index 4c16c3eb38..ef79932899 100644 --- a/compiler/optimizing/code_generator_vector_arm64_sve.cc +++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc @@ -245,7 +245,7 
@@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const VRegister dst = DRegisterFrom(locations->Out()); - const PRegister p_reg = LoopPReg(); + const PRegister p_reg = GetVecGoverningPReg(instruction); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: @@ -283,7 +283,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); DataType::Type from = instruction->GetInputType(); DataType::Type to = instruction->GetResultType(); ValidateVectorLength(instruction); @@ -303,7 +303,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: @@ -341,7 +341,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt8: @@ -377,7 +377,7 @@ void 
InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: // special case boolean-not @@ -437,7 +437,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: @@ -496,7 +496,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: @@ -545,7 +545,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: @@ -584,7 +584,7 @@ void 
InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); // Note: VIXL guarantees StrictNaNPropagation for Fdiv. @@ -632,7 +632,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: @@ -677,7 +677,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: @@ -713,7 +713,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) { const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: @@ -768,7 +768,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* 
instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -801,7 +801,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -834,7 +834,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) { LocationSummary* locations = instruction->GetLocations(); const ZRegister lhs = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -965,7 +965,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( const ZRegister acc = ZRegisterFrom(locations->InAt(0)); const ZRegister left = ZRegisterFrom(locations->InAt(1)); const ZRegister right = ZRegisterFrom(locations->InAt(2)); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); DCHECK(locations->InAt(0).Equals(locations->Out())); 
ValidateVectorLength(instruction); @@ -1028,7 +1028,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) const ZRegister acc = ZRegisterFrom(locations->InAt(0)); const ZRegister left = ZRegisterFrom(locations->InAt(1)); const ZRegister right = ZRegisterFrom(locations->InAt(2)); - const PRegisterM p_reg = LoopPReg().Merging(); + const PRegisterM p_reg = GetVecGoverningPReg(instruction).Merging(); HVecOperation* a = instruction->InputAt(1)->AsVecOperation(); HVecOperation* b = instruction->InputAt(2)->AsVecOperation(); DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), @@ -1098,7 +1098,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) { const ZRegister reg = ZRegisterFrom(locations->Out()); UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; - const PRegisterZ p_reg = LoopPReg().Zeroing(); + const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -1140,7 +1140,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) { const ZRegister reg = ZRegisterFrom(locations->InAt(2)); UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; - const PRegisterZ p_reg = LoopPReg().Zeroing(); + const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing(); ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { @@ -1181,25 +1181,25 @@ void LocationsBuilderARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) { void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) { // Instruction is not predicated, see nodes_vector.h DCHECK(!instruction->IsPredicated()); - const PRegister p_reg = LoopPReg(); + const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - __ Ptrue(p_reg.VnB(), 
vixl::aarch64::SVE_ALL); + __ Ptrue(output_p_reg.VnB(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL); + __ Ptrue(output_p_reg.VnH(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL); + __ Ptrue(output_p_reg.VnS(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL); + __ Ptrue(output_p_reg.VnD(), vixl::aarch64::SVE_ALL); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1207,6 +1207,67 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instru } } +void LocationsBuilderARM64Sve::VisitVecCondition(HVecCondition* instruction) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + locations->SetInAt(0, Location::RequiresFpuRegister()); + locations->SetInAt(1, Location::RequiresFpuRegister()); + locations->SetOut(Location::RequiresRegister()); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecCondition(HVecCondition* instruction) { + DCHECK(instruction->IsPredicated()); + LocationSummary* locations = instruction->GetLocations(); + const ZRegister left = ZRegisterFrom(locations->InAt(0)); + const ZRegister right = ZRegisterFrom(locations->InAt(1)); + const PRegisterZ p_reg = GetVecGoverningPReg(instruction).Zeroing(); + const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction); + + HVecOperation* a = instruction->InputAt(0)->AsVecOperation(); + HVecOperation* b = instruction->InputAt(1)->AsVecOperation(); + DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), + HVecOperation::ToSignedType(b->GetPackedType())); + ValidateVectorLength(instruction); + + // TODO: Support other condition OPs and types. 
+ switch (instruction->GetPackedType()) { + case DataType::Type::kUint8: + case DataType::Type::kInt8: + __ Cmpeq(output_p_reg.VnB(), p_reg, left.VnB(), right.VnB()); + break; + case DataType::Type::kUint16: + case DataType::Type::kInt16: + __ Cmpeq(output_p_reg.VnH(), p_reg, left.VnH(), right.VnH()); + break; + case DataType::Type::kInt32: + __ Cmpeq(output_p_reg.VnS(), p_reg, left.VnS(), right.VnS()); + break; + case DataType::Type::kInt64: + __ Cmpeq(output_p_reg.VnD(), p_reg, left.VnD(), right.VnD()); + break; + default: + LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); + UNREACHABLE(); + } +} + +void LocationsBuilderARM64Sve::VisitVecPredNot(HVecPredNot* instruction) { + LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); + DCHECK(instruction->InputAt(0)->IsVecPredSetOperation()); + locations->SetInAt(0, Location::NoLocation()); + locations->SetOut(Location::RequiresRegister()); +} + +void InstructionCodeGeneratorARM64Sve::VisitVecPredNot(HVecPredNot* instruction) { + DCHECK(instruction->IsPredicated()); + + const PRegister input_p_reg = GetVecPredSetFixedOutPReg( + instruction->InputAt(0)->AsVecPredSetOperation()); + const PRegister control_p_reg = GetVecGoverningPReg(instruction); + const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction); + + __ Not(output_p_reg.VnB(), control_p_reg.Zeroing(), input_p_reg.VnB()); +} + void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); locations->SetInAt(0, Location::RequiresRegister()); @@ -1217,8 +1278,8 @@ void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) { // Semantically, the out location of this instruction and predicate inputs locations of // its users should be a fixed predicate register (similar to // Location::RegisterLocation(int reg)). 
But the register allocator (RA) doesn't support - // SIMD regs (e.g. predicate), so LoopPReg() is used explicitly without exposing it - // to the RA. + // SIMD regs (e.g. predicate), so fixed registers are used explicitly without exposing it + // to the RA (through GetVecPredSetFixedOutPReg()). // // To make the RA happy Location::NoLocation() was used for all the vector instructions // predicate inputs; but for the PredSetOperations (e.g. VecPredWhile) Location::NoLocation() @@ -1240,21 +1301,22 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct DCHECK(instruction->GetCondKind() == HVecPredWhile::CondKind::kLO); Register left = InputRegisterAt(instruction, 0); Register right = InputRegisterAt(instruction, 1); + const PRegister output_p_reg = GetVecPredSetFixedOutPReg(instruction); DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % instruction->GetVectorLength(), 0u); switch (codegen_->GetSIMDRegisterWidth() / instruction->GetVectorLength()) { case 1u: - __ Whilelo(LoopPReg().VnB(), left, right); + __ Whilelo(output_p_reg.VnB(), left, right); break; case 2u: - __ Whilelo(LoopPReg().VnH(), left, right); + __ Whilelo(output_p_reg.VnH(), left, right); break; case 4u: - __ Whilelo(LoopPReg().VnS(), left, right); + __ Whilelo(output_p_reg.VnS(), left, right); break; case 8u: - __ Whilelo(LoopPReg().VnD(), left, right); + __ Whilelo(output_p_reg.VnD(), left, right); break; default: LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -1262,20 +1324,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct } } -void LocationsBuilderARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderARM64Sve::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); locations->SetInAt(0, Location::NoLocation()); // Result of the operation - a boolean value in a core register. 
locations->SetOut(Location::RequiresRegister()); } -void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorARM64Sve::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { // Instruction is not predicated, see nodes_vector.h DCHECK(!instruction->IsPredicated()); Register reg = OutputRegister(instruction); - // Currently VecPredCondition is only used as part of vectorized loop check condition + // Currently VecPredToBoolean is only used as part of vectorized loop check condition // evaluation. - DCHECK(instruction->GetPCondKind() == HVecPredCondition::PCondKind::kNFirst); + DCHECK(instruction->GetPCondKind() == HVecPredToBoolean::PCondKind::kNFirst); __ Cset(reg, pl); } diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc index e8ecf28386..70f22af17b 100644 --- a/compiler/optimizing/code_generator_vector_arm_vixl.cc +++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc @@ -1069,12 +1069,32 @@ void InstructionCodeGeneratorARMVIXL::VisitVecPredWhile(HVecPredWhile* instructi UNREACHABLE(); } -void LocationsBuilderARMVIXL::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderARMVIXL::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } -void InstructionCodeGeneratorARMVIXL::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorARMVIXL::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderARMVIXL::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void 
LocationsBuilderARMVIXL::VisitVecPredNot(HVecPredNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorARMVIXL::VisitVecPredNot(HVecPredNot* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc index 343a6e1af4..1f9b2578ac 100644 --- a/compiler/optimizing/code_generator_vector_x86.cc +++ b/compiler/optimizing/code_generator_vector_x86.cc @@ -1401,12 +1401,32 @@ void InstructionCodeGeneratorX86::VisitVecPredWhile(HVecPredWhile* instruction) UNREACHABLE(); } -void LocationsBuilderX86::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderX86::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } -void InstructionCodeGeneratorX86::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorX86::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderX86::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorX86::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderX86::VisitVecPredNot(HVecPredNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorX86::VisitVecPredNot(HVecPredNot* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc index fb6e4e753f..47afa3b4a1 100644 --- 
a/compiler/optimizing/code_generator_vector_x86_64.cc +++ b/compiler/optimizing/code_generator_vector_x86_64.cc @@ -1374,12 +1374,32 @@ void InstructionCodeGeneratorX86_64::VisitVecPredWhile(HVecPredWhile* instructio UNREACHABLE(); } -void LocationsBuilderX86_64::VisitVecPredCondition(HVecPredCondition* instruction) { +void LocationsBuilderX86_64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } -void InstructionCodeGeneratorX86_64::VisitVecPredCondition(HVecPredCondition* instruction) { +void InstructionCodeGeneratorX86_64::VisitVecPredToBoolean(HVecPredToBoolean* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderX86_64::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorX86_64::VisitVecCondition(HVecCondition* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void LocationsBuilderX86_64::VisitVecPredNot(HVecPredNot* instruction) { + LOG(FATAL) << "No SIMD for " << instruction->GetId(); + UNREACHABLE(); +} + +void InstructionCodeGeneratorX86_64::VisitVecPredNot(HVecPredNot* instruction) { LOG(FATAL) << "No SIMD for " << instruction->GetId(); UNREACHABLE(); } diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc index f890ba9cc0..c72d3ea24a 100644 --- a/compiler/optimizing/codegen_test.cc +++ b/compiler/optimizing/codegen_test.cc @@ -796,7 +796,7 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) { codegen.Initialize(); - graph->SetHasSIMD(true); + graph->SetHasTraditionalSIMD(true); for (int i = 0; i < 2; i++) { HParallelMove* move = new (graph->GetAllocator()) HParallelMove(graph->GetAllocator()); move->AddMove(Location::SIMDStackSlot(0), @@ -816,7 +816,7 @@ TEST_F(CodegenTest, ARM64ParallelMoveResolverSIMD) { DataType::Type::kFloat64, 
nullptr); codegen.GetMoveResolver()->EmitNativeCode(move); - graph->SetHasSIMD(false); + graph->SetHasTraditionalSIMD(false); } codegen.Finalize(); @@ -864,7 +864,7 @@ TEST_F(CodegenTest, ARM64FrameSizeSIMD) { arm64::CodeGeneratorARM64 codegen(graph, *compiler_options); codegen.Initialize(); - graph->SetHasSIMD(true); + graph->SetHasTraditionalSIMD(true); DCHECK_EQ(arm64::callee_saved_fp_registers.GetCount(), 8); vixl::aarch64::CPURegList reg_list = arm64::callee_saved_fp_registers; @@ -884,7 +884,8 @@ TEST_F(CodegenTest, ARM64FrameSizeNoSIMD) { arm64::CodeGeneratorARM64 codegen(graph, *compiler_options); codegen.Initialize(); - graph->SetHasSIMD(false); + graph->SetHasTraditionalSIMD(false); + graph->SetHasPredicatedSIMD(false); DCHECK_EQ(arm64::callee_saved_fp_registers.GetCount(), 8); vixl::aarch64::CPURegList reg_list = arm64::callee_saved_fp_registers; diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc index cf49e39849..8e6b6db236 100644 --- a/compiler/optimizing/dead_code_elimination.cc +++ b/compiler/optimizing/dead_code_elimination.cc @@ -24,6 +24,7 @@ #include "base/scoped_arena_containers.h" #include "base/stl_util.h" #include "optimizing/nodes.h" +#include "optimizing/nodes_vector.h" #include "ssa_phi_elimination.h" namespace art HIDDEN { @@ -842,7 +843,8 @@ void HDeadCodeElimination::RemoveDeadInstructions() { void HDeadCodeElimination::UpdateGraphFlags() { bool has_monitor_operations = false; - bool has_simd = false; + bool has_traditional_simd = false; + bool has_predicated_simd = false; bool has_bounds_checks = false; bool has_always_throwing_invokes = false; @@ -852,7 +854,12 @@ void HDeadCodeElimination::UpdateGraphFlags() { if (instruction->IsMonitorOperation()) { has_monitor_operations = true; } else if (instruction->IsVecOperation()) { - has_simd = true; + HVecOperation* vec_instruction = instruction->AsVecOperation(); + if (vec_instruction->IsPredicated()) { + has_predicated_simd = true; + } 
else { + has_traditional_simd = true; + } } else if (instruction->IsBoundsCheck()) { has_bounds_checks = true; } else if (instruction->IsInvoke() && instruction->AsInvoke()->AlwaysThrows()) { @@ -862,7 +869,8 @@ void HDeadCodeElimination::UpdateGraphFlags() { } graph_->SetHasMonitorOperations(has_monitor_operations); - graph_->SetHasSIMD(has_simd); + graph_->SetHasTraditionalSIMD(has_traditional_simd); + graph_->SetHasPredicatedSIMD(has_predicated_simd); graph_->SetHasBoundsChecks(has_bounds_checks); graph_->SetHasAlwaysThrowingInvokes(has_always_throwing_invokes); } diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc index 73bdd1e223..bd33fde907 100644 --- a/compiler/optimizing/graph_visualizer.cc +++ b/compiler/optimizing/graph_visualizer.cc @@ -610,6 +610,7 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor { } void VisitVecMemoryOperation(HVecMemoryOperation* vec_mem_operation) override { + VisitVecOperation(vec_mem_operation); StartAttributeStream("alignment") << vec_mem_operation->GetAlignment().ToString(); } diff --git a/compiler/optimizing/load_store_analysis.cc b/compiler/optimizing/load_store_analysis.cc index b46e3e18d9..75000c8b91 100644 --- a/compiler/optimizing/load_store_analysis.cc +++ b/compiler/optimizing/load_store_analysis.cc @@ -268,6 +268,13 @@ bool HeapLocationCollector::CanArrayElementsAlias(const HInstruction* idx1, } bool LoadStoreAnalysis::Run() { + // Currently load_store analysis can't handle predicated load/stores; specifically pairs of + // memory operations with different predicates. + // TODO: support predicated SIMD. 
+ if (graph_->HasPredicatedSIMD()) { + return false; + } + for (HBasicBlock* block : graph_->GetReversePostOrder()) { heap_location_collector_.VisitBasicBlock(block); } diff --git a/compiler/optimizing/load_store_analysis.h b/compiler/optimizing/load_store_analysis.h index c46a5b9cc1..ee425454a0 100644 --- a/compiler/optimizing/load_store_analysis.h +++ b/compiler/optimizing/load_store_analysis.h @@ -610,6 +610,7 @@ class HeapLocationCollector : public HGraphVisitor { } void VisitVecLoad(HVecLoad* instruction) override { + DCHECK(!instruction->IsPredicated()); HInstruction* array = instruction->InputAt(0); HInstruction* index = instruction->InputAt(1); DataType::Type type = instruction->GetPackedType(); @@ -618,6 +619,7 @@ class HeapLocationCollector : public HGraphVisitor { } void VisitVecStore(HVecStore* instruction) override { + DCHECK(!instruction->IsPredicated()); HInstruction* array = instruction->InputAt(0); HInstruction* index = instruction->InputAt(1); DataType::Type type = instruction->GetPackedType(); diff --git a/compiler/optimizing/load_store_elimination.cc b/compiler/optimizing/load_store_elimination.cc index 9cabb12a9f..58fdd1cd05 100644 --- a/compiler/optimizing/load_store_elimination.cc +++ b/compiler/optimizing/load_store_elimination.cc @@ -1080,10 +1080,12 @@ class LSEVisitor final : private HGraphDelegateVisitor { } void VisitVecLoad(HVecLoad* instruction) override { + DCHECK(!instruction->IsPredicated()); VisitGetLocation(instruction, heap_location_collector_.GetArrayHeapLocation(instruction)); } void VisitVecStore(HVecStore* instruction) override { + DCHECK(!instruction->IsPredicated()); size_t idx = heap_location_collector_.GetArrayHeapLocation(instruction); VisitSetLocation(instruction, idx, instruction->GetValue()); } @@ -4041,6 +4043,13 @@ bool LoadStoreElimination::Run(bool enable_partial_lse) { return false; } + // Currently load_store analysis can't handle predicated load/stores; specifically pairs of + // memory operations with 
different predicates. + // TODO: support predicated SIMD. + if (graph_->HasPredicatedSIMD()) { + return false; + } + std::unique_ptr<LSEVisitorWrapper> lse_visitor(new (&allocator) LSEVisitorWrapper( graph_, heap_location_collector, enable_partial_lse, stats_)); lse_visitor->Run(); diff --git a/compiler/optimizing/load_store_elimination_test.cc b/compiler/optimizing/load_store_elimination_test.cc index 9fcc8ddef6..d3cf8bfa2a 100644 --- a/compiler/optimizing/load_store_elimination_test.cc +++ b/compiler/optimizing/load_store_elimination_test.cc @@ -573,7 +573,8 @@ TEST_F(LoadStoreEliminationTest, SameHeapValue2) { AddVecStore(entry_block_, array_, j_); HInstruction* vstore = AddVecStore(entry_block_, array_, i_); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vstore)); @@ -589,7 +590,8 @@ TEST_F(LoadStoreEliminationTest, SameHeapValue3) { AddVecStore(entry_block_, array_, i_add1_); HInstruction* vstore = AddVecStore(entry_block_, array_, i_); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vstore)); @@ -634,7 +636,8 @@ TEST_F(LoadStoreEliminationTest, OverlappingLoadStore) { AddArraySet(entry_block_, array_, i_, c1); HInstruction* vload5 = AddVecLoad(entry_block_, array_, i_); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(load1)); @@ -668,7 +671,8 @@ TEST_F(LoadStoreEliminationTest, StoreAfterLoopWithoutSideEffects) { // a[j] = 1; HInstruction* array_set = AddArraySet(return_block_, array_, j_, c1); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. 
+ graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(array_set)); @@ -706,7 +710,8 @@ TEST_F(LoadStoreEliminationTest, StoreAfterSIMDLoopWithSideEffects) { // a[j] = 0; HInstruction* a_set = AddArraySet(return_block_, array_, j_, c0); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(vload)); @@ -745,7 +750,8 @@ TEST_F(LoadStoreEliminationTest, LoadAfterSIMDLoopWithSideEffects) { // x = a[j]; HInstruction* load = AddArrayGet(return_block_, array_, j_); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(vload)); @@ -786,7 +792,8 @@ TEST_F(LoadStoreEliminationTest, MergePredecessorVecStores) { // down: a[i,... i + 3] = [1,...1] HInstruction* vstore4 = AddVecStore(down, array_, i_, vdata); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_TRUE(IsRemoved(vstore2)); @@ -877,7 +884,8 @@ TEST_F(LoadStoreEliminationTest, RedundantVStoreVLoadInLoop) { HInstruction* vstore2 = AddVecStore(loop_, array_b, phi_, vload); HInstruction* vstore3 = AddVecStore(loop_, array_a, phi_, vstore1->InputAt(2)); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vstore1)); @@ -965,7 +973,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueInLoopWithoutWriteSideEffects) HInstruction* vload = AddVecLoad(loop_, array_a, phi_); HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. 
+ graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload)); @@ -989,7 +998,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValue) { HInstruction* vload = AddVecLoad(pre_header_, array_a, c0); HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload)); @@ -1066,7 +1076,8 @@ TEST_F(LoadStoreEliminationTest, VLoadAndLoadDefaultValueInLoopWithoutWriteSideE HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload); HInstruction* store = AddArraySet(return_block_, array_, c0, load); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload)); @@ -1097,7 +1108,8 @@ TEST_F(LoadStoreEliminationTest, VLoadAndLoadDefaultValue) { HInstruction* vstore = AddVecStore(return_block_, array_, c0, vload); HInstruction* store = AddArraySet(return_block_, array_, c0, load); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload)); @@ -1129,7 +1141,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueAndVLoadInLoopWithoutWriteSide HInstruction* vstore1 = AddVecStore(return_block_, array_, c0, vload1); HInstruction* vstore2 = AddVecStore(return_block_, array_, c128, vload2); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. + graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload1)); @@ -1160,7 +1173,8 @@ TEST_F(LoadStoreEliminationTest, VLoadDefaultValueAndVLoad) { HInstruction* vstore1 = AddVecStore(return_block_, array_, c0, vload1); HInstruction* vstore2 = AddVecStore(return_block_, array_, c128, vload2); - graph_->SetHasSIMD(true); + // TODO: enable LSE for graphs with predicated SIMD. 
+ graph_->SetHasTraditionalSIMD(true); PerformLSE(); ASSERT_FALSE(IsRemoved(vload1)); diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index aef6f1f5bd..f62a355ae4 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -453,6 +453,54 @@ static DataType::Type GetNarrowerType(HInstruction* a, HInstruction* b) { return type; } +// Returns whether the loop is of a diamond structure: +// +// header <----------------+ +// | | +// diamond_hif | +// / \ | +// diamond_true diamond_false | +// \ / | +// back_edge | +// | | +// +---------------------+ +static bool HasLoopDiamondStructure(HLoopInformation* loop_info) { + HBasicBlock* header = loop_info->GetHeader(); + if (loop_info->NumberOfBackEdges() != 1 || header->GetSuccessors().size() != 2) { + return false; + } + HBasicBlock* header_succ_0 = header->GetSuccessors()[0]; + HBasicBlock* header_succ_1 = header->GetSuccessors()[1]; + HBasicBlock* diamond_top = loop_info->Contains(*header_succ_0) ? + header_succ_0 : + header_succ_1; + if (!diamond_top->GetLastInstruction()->IsIf()) { + return false; + } + + HIf* diamond_hif = diamond_top->GetLastInstruction()->AsIf(); + HBasicBlock* diamond_true = diamond_hif->IfTrueSuccessor(); + HBasicBlock* diamond_false = diamond_hif->IfFalseSuccessor(); + + if (diamond_true->GetSuccessors().size() != 1 || diamond_false->GetSuccessors().size() != 1) { + return false; + } + + HBasicBlock* back_edge = diamond_true->GetSingleSuccessor(); + if (back_edge != diamond_false->GetSingleSuccessor() || + back_edge != loop_info->GetBackEdges()[0]) { + return false; + } + + DCHECK_EQ(loop_info->GetBlocks().NumSetBits(), 5u); + return true; +} + +static bool IsPredicatedLoopControlFlowSupported(HLoopInformation* loop_info) { + size_t num_of_blocks = loop_info->GetBlocks().NumSetBits(); + return num_of_blocks == 2 || HasLoopDiamondStructure(loop_info); +} + // // Public methods. 
// @@ -483,12 +531,12 @@ HLoopOptimization::HLoopOptimization(HGraph* graph, vector_map_(nullptr), vector_permanent_map_(nullptr), vector_external_set_(nullptr), + predicate_info_map_(nullptr), vector_mode_(kSequential), vector_preheader_(nullptr), vector_header_(nullptr), vector_body_(nullptr), vector_index_(nullptr), - loop_main_pred_(nullptr), arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) { } @@ -545,6 +593,8 @@ bool HLoopOptimization::LocalRun() { ScopedArenaSafeMap<HInstruction*, HInstruction*> perm( std::less<HInstruction*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); ScopedArenaSet<HInstruction*> ext_set(loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + ScopedArenaSafeMap<HBasicBlock*, BlockPredicateInfo*> pred( + std::less<HBasicBlock*>(), loop_allocator_->Adapter(kArenaAllocLoopOptimization)); // Attach. iset_ = &iset; reductions_ = &reds; @@ -552,6 +602,7 @@ bool HLoopOptimization::LocalRun() { vector_map_ = ↦ vector_permanent_map_ = &perm; vector_external_set_ = &ext_set; + predicate_info_map_ = &pred; // Traverse. const bool did_loop_opt = TraverseLoopsInnerToOuter(top_loop_); // Detach. @@ -561,6 +612,7 @@ bool HLoopOptimization::LocalRun() { vector_map_ = nullptr; vector_permanent_map_ = nullptr; vector_external_set_ = nullptr; + predicate_info_map_ = nullptr; return did_loop_opt; } @@ -793,6 +845,37 @@ void HLoopOptimization::SimplifyBlocks(LoopNode* node) { } } +// Checks whether the loop has exit structure suitable for InnerLoopFinite optimization: +// - has single loop exit. +// - the exit block has only single predecessor - a block inside the loop. +// +// In that case returns single exit basic block (outside the loop); otherwise nullptr. 
+static HBasicBlock* GetInnerLoopFiniteSingleExit(HLoopInformation* loop_info) { + HBasicBlock* exit = nullptr; + for (HBlocksInLoopIterator block_it(*loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* block = block_it.Current(); + + // Check whether one of the successor is loop exit. + for (HBasicBlock* successor : block->GetSuccessors()) { + if (!loop_info->Contains(*successor)) { + if (exit != nullptr) { + // The loop has more than one exit. + return nullptr; + } + exit = successor; + + // Ensure exit can only be reached by exiting loop. + if (successor->GetPredecessors().size() != 1) { + return nullptr; + } + } + } + } + return exit; +} + bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); @@ -801,33 +884,22 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { if (!induction_range_.IsFinite(node->loop_info, &trip_count)) { return false; } - // Ensure there is only a single loop-body (besides the header). - HBasicBlock* body = nullptr; - for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) { - if (it.Current() != header) { - if (body != nullptr) { - return false; - } - body = it.Current(); - } - } - CHECK(body != nullptr); - // Ensure there is only a single exit point. - if (header->GetSuccessors().size() != 2) { - return false; - } - HBasicBlock* exit = (header->GetSuccessors()[0] == body) - ? header->GetSuccessors()[1] - : header->GetSuccessors()[0]; - // Ensure exit can only be reached by exiting loop. - if (exit->GetPredecessors().size() != 1) { + // Check loop exits. + HBasicBlock* exit = GetInnerLoopFiniteSingleExit(node->loop_info); + if (exit == nullptr) { return false; } + + HBasicBlock* body = (header->GetSuccessors()[0] == exit) + ? 
header->GetSuccessors()[1] + : header->GetSuccessors()[0]; // Detect either an empty loop (no side effects other than plain iteration) or // a trivial loop (just iterating once). Replace subsequent index uses, if any, // with the last value and remove the loop, possibly after unrolling its body. HPhi* main_phi = nullptr; - if (TrySetSimpleLoopHeader(header, &main_phi)) { + size_t num_of_blocks = header->GetLoopInformation()->GetBlocks().NumSetBits(); + + if (num_of_blocks == 2 && TrySetSimpleLoopHeader(header, &main_phi)) { bool is_empty = IsEmptyBody(body); if (reductions_->empty() && // TODO: possible with some effort (is_empty || trip_count == 1) && @@ -850,32 +922,61 @@ bool HLoopOptimization::TryOptimizeInnerLoopFinite(LoopNode* node) { return true; } } - - bool enable_alignment_strategies = !IsInPredicatedVectorizationMode(); // Vectorize loop, if possible and valid. if (!kEnableVectorization || // Disable vectorization for debuggable graphs: this is a workaround for the bug // in 'GenerateNewLoop' which caused the SuspendCheck environment to be invalid. // TODO: b/138601207, investigate other possible cases with wrong environment values and // possibly switch back vectorization on for debuggable graphs. 
- graph_->IsDebuggable() || - !TrySetSimpleLoopHeader(header, &main_phi) || - !CanVectorizeDataFlow(node, body, enable_alignment_strategies)) { + graph_->IsDebuggable()) { return false; } - if (!IsVectorizationProfitable(trip_count) || - !TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) { + if (IsInPredicatedVectorizationMode()) { + return TryVectorizePredicated(node, body, exit, main_phi, trip_count); + } else { + return TryVectorizedTraditional(node, body, exit, main_phi, trip_count); + } +} + +bool HLoopOptimization::TryVectorizePredicated(LoopNode* node, + HBasicBlock* body, + HBasicBlock* exit, + HPhi* main_phi, + int64_t trip_count) { + if (!IsPredicatedLoopControlFlowSupported(node->loop_info) || + !ShouldVectorizeCommon(node, main_phi, trip_count)) { + return false; + } + + // Currently we can only generate cleanup loops for loops with 2 basic blocks. + // + // TODO: Support array disambiguation tests for CF loops. 
+ if (NeedsArrayRefsDisambiguationTest() && + node->loop_info->GetBlocks().NumSetBits() != 2) { + return false; + } + + VectorizePredicated(node, body, exit); + MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized); + graph_->SetHasPredicatedSIMD(true); // flag SIMD usage + return true; +} + +bool HLoopOptimization::TryVectorizedTraditional(LoopNode* node, + HBasicBlock* body, + HBasicBlock* exit, + HPhi* main_phi, + int64_t trip_count) { + HBasicBlock* header = node->loop_info->GetHeader(); + size_t num_of_blocks = header->GetLoopInformation()->GetBlocks().NumSetBits(); + + if (num_of_blocks != 2 || !ShouldVectorizeCommon(node, main_phi, trip_count)) { + return false; } - graph_->SetHasSIMD(true); // flag SIMD usage + VectorizeTraditional(node, body, exit, trip_count); MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorized); + graph_->SetHasTraditionalSIMD(true); // flag SIMD usage return true; } @@ -1023,8 +1124,9 @@ bool HLoopOptimization::TryLoopScalarOpts(LoopNode* node) { // Intel Press, June, 2004 (http://www.aartbik.com/). // + bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node, - HBasicBlock* block, + HBasicBlock* header, bool collect_alignment_info) { // Reset vector bookkeeping. vector_length_ = 0; @@ -1034,16 +1136,30 @@ bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node, vector_runtime_test_a_ = vector_runtime_test_b_ = nullptr; - // Phis in the loop-body prevent vectorization. - if (!block->GetPhis().IsEmpty()) { - return false; - } + // Traverse the data flow of the loop, in the original program order. + for (HBlocksInLoopReversePostOrderIterator block_it(*header->GetLoopInformation()); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* block = block_it.Current(); - // Scan the loop-body, starting a right-hand-side tree traversal at each left-hand-side - // occurrence, which allows passing down attributes down the use tree. 
- for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { - if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) { - return false; // failure to vectorize a left-hand-side + if (block == header) { + // The header is of a certain structure (TrySetSimpleLoopHeader) and doesn't need to be + // processed here. + continue; + } + + // Phis in the loop-body prevent vectorization. + // TODO: Enable vectorization of CF loops with Phis. + if (!block->GetPhis().IsEmpty()) { + return false; + } + + // Scan the loop-body instructions, starting a right-hand-side tree traversal at each + // left-hand-side occurrence, which allows passing down attributes down the use tree. + for (HInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) { + if (!VectorizeDef(node, it.Current(), /*generate_code*/ false)) { + return false; // failure to vectorize a left-hand-side + } } } @@ -1139,6 +1255,23 @@ bool HLoopOptimization::CanVectorizeDataFlow(LoopNode* node, return true; } +bool HLoopOptimization::ShouldVectorizeCommon(LoopNode* node, + HPhi* main_phi, + int64_t trip_count) { + HBasicBlock* header = node->loop_info->GetHeader(); + HBasicBlock* preheader = node->loop_info->GetPreHeader(); + + bool enable_alignment_strategies = !IsInPredicatedVectorizationMode(); + if (!TrySetSimpleLoopHeader(header, &main_phi) || + !CanVectorizeDataFlow(node, header, enable_alignment_strategies) || + !IsVectorizationProfitable(trip_count) || + !TryAssignLastValue(node->loop_info, main_phi, preheader, /*collect_loop_uses*/ true)) { + return false; + } + + return true; +} + void HLoopOptimization::VectorizePredicated(LoopNode* node, HBasicBlock* block, HBasicBlock* exit) { @@ -1185,7 +1318,6 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node, graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); vector_mode_ = kVector; GenerateNewLoopPredicated(node, - block, preheader_for_vector_loop, vector_index_, vtc, @@ -1200,7 
+1332,6 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node, graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); // Use "Traditional" version for the sequential loop. GenerateNewLoopScalarOrTraditional(node, - block, preheader_for_cleanup_loop, vector_index_, stc, @@ -1208,7 +1339,7 @@ void HLoopOptimization::VectorizePredicated(LoopNode* node, LoopAnalysisInfo::kNoUnrollingFactor); } - FinalizeVectorization(node, block); + FinalizeVectorization(node); // Assign governing predicates for the predicated instructions inserted during vectorization // outside the loop. @@ -1339,7 +1470,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node, HBasicBlock* preheader_for_peeling_loop = graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); GenerateNewLoopScalarOrTraditional(node, - block, preheader_for_peeling_loop, vector_index_, ptc, @@ -1354,7 +1484,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node, HBasicBlock* preheader_for_vector_loop = graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); GenerateNewLoopScalarOrTraditional(node, - block, preheader_for_vector_loop, vector_index_, vtc, @@ -1369,7 +1498,6 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node, HBasicBlock* preheader_for_cleanup_loop = graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit); GenerateNewLoopScalarOrTraditional(node, - block, preheader_for_cleanup_loop, vector_index_, stc, @@ -1377,10 +1505,10 @@ void HLoopOptimization::VectorizeTraditional(LoopNode* node, LoopAnalysisInfo::kNoUnrollingFactor); } - FinalizeVectorization(node, block); + FinalizeVectorization(node); } -void HLoopOptimization::FinalizeVectorization(LoopNode* node, HBasicBlock* block) { +void HLoopOptimization::FinalizeVectorization(LoopNode* node) { HBasicBlock* header = node->loop_info->GetHeader(); HBasicBlock* preheader = node->loop_info->GetPreHeader(); HLoopInformation* vloop = 
vector_header_->GetLoopInformation(); @@ -1397,9 +1525,16 @@ void HLoopOptimization::FinalizeVectorization(LoopNode* node, HBasicBlock* block } } - // Remove the original loop by disconnecting the body block - // and removing all instructions from the header. - block->DisconnectAndDelete(); + // Remove the original loop. + for (HBlocksInLoopPostOrderIterator it_loop(*node->loop_info); + !it_loop.Done(); + it_loop.Advance()) { + HBasicBlock* cur_block = it_loop.Current(); + if (cur_block == node->loop_info->GetHeader()) { + continue; + } + cur_block->DisconnectAndDelete(); + } while (!header->GetFirstInstruction()->IsGoto()) { header->RemoveInstruction(header->GetFirstInstruction()); @@ -1426,12 +1561,12 @@ HPhi* HLoopOptimization::InitializeForNewLoop(HBasicBlock* new_preheader, HInstr vector_index_ = phi; vector_permanent_map_->clear(); vector_external_set_->clear(); + predicate_info_map_->clear(); return phi; } void HLoopOptimization::GenerateNewLoopScalarOrTraditional(LoopNode* node, - HBasicBlock* body, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, @@ -1447,14 +1582,13 @@ void HLoopOptimization::GenerateNewLoopScalarOrTraditional(LoopNode* node, vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); for (uint32_t u = 0; u < unroll; u++) { - GenerateNewLoopBodyOnce(node, body, induc_type, step); + GenerateNewLoopBodyOnce(node, induc_type, step); } FinalizePhisForNewLoop(phi, lo); } void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node, - HBasicBlock* body, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, @@ -1475,44 +1609,50 @@ void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node, 0u); HInstruction* cond = - new (global_allocator_) HVecPredCondition(global_allocator_, + new (global_allocator_) HVecPredToBoolean(global_allocator_, pred_while, - HVecPredCondition::PCondKind::kNFirst, + HVecPredToBoolean::PCondKind::kNFirst, DataType::Type::kInt32, vector_length_, 0u); 
vector_header_->AddInstruction(pred_while); vector_header_->AddInstruction(cond); - loop_main_pred_ = pred_while; - - vector_header_->AddInstruction(new (global_allocator_) HIf(cond)); - GenerateNewLoopBodyOnce(node, body, induc_type, step); + PreparePredicateInfoMap(node); + GenerateNewLoopBodyOnce(node, induc_type, step); + InitPredicateInfoMap(node, pred_while); - // Assign governing predicates for instructions in the loop. - for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { - auto i = vector_map_->find(it.Current()); - if (i != vector_map_->end()) { - HInstruction* instr = i->second; + // Assign governing predicates for instructions in the loop; the traversal order doesn't matter. + for (HBlocksInLoopIterator block_it(*node->loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* cur_block = block_it.Current(); - if (!instr->IsVecOperation()) { - continue; - } - // There are cases when a vector instruction, which corresponds to some instruction in the - // original scalar loop, is located not in the newly created vector loop but - // in the vector loop preheader (and hence recorded in vector_external_set_). - // - // Governing predicates will be set for such instructions separately. 
- bool in_vector_loop = vector_header_->GetLoopInformation()->Contains(*instr->GetBlock()); - DCHECK_IMPLIES(!in_vector_loop, - vector_external_set_->find(instr) != vector_external_set_->end()); - - if (in_vector_loop && - !instr->AsVecOperation()->IsPredicated()) { - HVecOperation* op = instr->AsVecOperation(); - op->SetMergingGoverningPredicate(loop_main_pred_); + for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end()) { + HInstruction* instr = i->second; + + if (!instr->IsVecOperation()) { + continue; + } + // There are cases when a vector instruction, which corresponds to some instruction in the + // original scalar loop, is located not in the newly created vector loop but + // in the vector loop preheader (and hence recorded in vector_external_set_). + // + // Governing predicates will be set for such instructions separately. + bool in_vector_loop = vector_header_->GetLoopInformation()->Contains(*instr->GetBlock()); + DCHECK_IMPLIES(!in_vector_loop, + vector_external_set_->find(instr) != vector_external_set_->end()); + + if (in_vector_loop && + !instr->AsVecOperation()->IsPredicated()) { + HVecOperation* op = instr->AsVecOperation(); + HVecPredSetOperation* pred = predicate_info_map_->Get(cur_block)->GetControlPredicate(); + op->SetMergingGoverningPredicate(pred); + } } } } @@ -1521,24 +1661,47 @@ void HLoopOptimization::GenerateNewLoopPredicated(LoopNode* node, } void HLoopOptimization::GenerateNewLoopBodyOnce(LoopNode* node, - HBasicBlock* body, DataType::Type induc_type, HInstruction* step) { // Generate instruction map. vector_map_->clear(); - for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { - bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); - DCHECK(vectorized_def); + HLoopInformation* loop_info = node->loop_info; + + // Traverse the data flow of the loop, in the original program order. 
+ for (HBlocksInLoopReversePostOrderIterator block_it(*loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* cur_block = block_it.Current(); + + if (cur_block == loop_info->GetHeader()) { + continue; + } + + for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) { + bool vectorized_def = VectorizeDef(node, it.Current(), /*generate_code*/ true); + DCHECK(vectorized_def); + } } - // Generate body from the instruction map, but in original program order. + + // Generate body from the instruction map, in the original program order. HEnvironment* env = vector_header_->GetFirstInstruction()->GetEnvironment(); - for (HInstructionIterator it(body->GetInstructions()); !it.Done(); it.Advance()) { - auto i = vector_map_->find(it.Current()); - if (i != vector_map_->end() && !i->second->IsInBlock()) { - Insert(vector_body_, i->second); - // Deal with instructions that need an environment, such as the scalar intrinsics. - if (i->second->NeedsEnvironment()) { - i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + for (HBlocksInLoopReversePostOrderIterator block_it(*loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* cur_block = block_it.Current(); + + if (cur_block == loop_info->GetHeader()) { + continue; + } + + for (HInstructionIterator it(cur_block->GetInstructions()); !it.Done(); it.Advance()) { + auto i = vector_map_->find(it.Current()); + if (i != vector_map_->end() && !i->second->IsInBlock()) { + Insert(vector_body_, i->second); + // Deal with instructions that need an environment, such as the scalar intrinsics. 
+ if (i->second->NeedsEnvironment()) { + i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_); + } } } } @@ -1626,6 +1789,10 @@ bool HLoopOptimization::VectorizeDef(LoopNode* node, if (instruction->IsGoto()) { return true; } + + if (instruction->IsIf()) { + return VectorizeIfCondition(node, instruction, generate_code, restrictions); + } // Otherwise accept only expressions with no effects outside the immediate loop-body. // Note that actual uses are inspected during right-hand-side tree traversal. return !IsUsedOutsideLoop(node->loop_info, instruction) @@ -1845,6 +2012,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict case InstructionSet::kThumb2: // Allow vectorization for all ARM devices, because Android assumes that // ARM 32-bit always supports advanced SIMD (64-bit SIMD). + *restrictions |= kNoIfCond; switch (type) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -1870,6 +2038,13 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u); switch (type) { case DataType::Type::kBool: + *restrictions |= kNoDiv | + kNoSignedHAdd | + kNoUnsignedHAdd | + kNoUnroundedHAdd | + kNoSAD | + kNoIfCond; + return TrySetVectorLength(type, vector_length); case DataType::Type::kUint8: case DataType::Type::kInt8: *restrictions |= kNoDiv | @@ -1892,13 +2067,13 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict *restrictions |= kNoDiv | kNoSAD; return TrySetVectorLength(type, vector_length); case DataType::Type::kInt64: - *restrictions |= kNoDiv | kNoSAD; + *restrictions |= kNoDiv | kNoSAD | kNoIfCond; return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat32: - *restrictions |= kNoReduction; + *restrictions |= kNoReduction | kNoIfCond; return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat64: - *restrictions |= kNoReduction; + *restrictions |= kNoReduction 
| kNoIfCond; return TrySetVectorLength(type, vector_length); default: break; @@ -1907,6 +2082,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict } else { // Allow vectorization for all ARM devices, because Android assumes that // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD). + *restrictions |= kNoIfCond; switch (type) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -1937,6 +2113,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict case InstructionSet::kX86: case InstructionSet::kX86_64: // Allow vectorization for SSE4.1-enabled X86 devices only (128-bit SIMD). + *restrictions |= kNoIfCond; if (features->AsX86InstructionSetFeatures()->HasSSE4_1()) { switch (type) { case DataType::Type::kBool: @@ -2203,10 +2380,10 @@ HInstruction* HLoopOptimization::ReduceAndExtractIfNeeded(HInstruction* instruct } \ break; -void HLoopOptimization::GenerateVecOp(HInstruction* org, - HInstruction* opa, - HInstruction* opb, - DataType::Type type) { +HInstruction* HLoopOptimization::GenerateVecOp(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + DataType::Type type) { uint32_t dex_pc = org->GetDexPc(); HInstruction* vector = nullptr; DataType::Type org_type = org->GetType(); @@ -2276,11 +2453,23 @@ void HLoopOptimization::GenerateVecOp(HInstruction* org, GENERATE_VEC( new (global_allocator_) HVecAbs(global_allocator_, opa, type, vector_length_, dex_pc), new (global_allocator_) HAbs(org_type, opa, dex_pc)); + case HInstruction::kEqual: { + // Special case. 
+ if (vector_mode_ == kVector) { + vector = new (global_allocator_) HVecCondition( + global_allocator_, opa, opb, type, vector_length_, dex_pc); + } else { + DCHECK(vector_mode_ == kSequential); + UNREACHABLE(); + } + } + break; default: break; } // switch CHECK(vector != nullptr) << "Unsupported SIMD operator"; vector_map_->Put(org, vector); + return vector; } #undef GENERATE_VEC @@ -2520,6 +2709,89 @@ bool HLoopOptimization::VectorizeDotProdIdiom(LoopNode* node, return false; } +bool HLoopOptimization::VectorizeIfCondition(LoopNode* node, + HInstruction* hif, + bool generate_code, + uint64_t restrictions) { + DCHECK(hif->IsIf()); + HInstruction* if_input = hif->InputAt(0); + + if (!if_input->HasOnlyOneNonEnvironmentUse()) { + // Avoid the complications of the condition used as materialized boolean. + return false; + } + + if (!if_input->IsEqual()) { + // TODO: Support other condition types. + return false; + } + + HCondition* cond = if_input->AsCondition(); + HInstruction* opa = cond->InputAt(0); + HInstruction* opb = cond->InputAt(1); + DataType::Type type = GetNarrowerType(opa, opb); + + if (!DataType::IsIntegralType(type)) { + return false; + } + + bool is_unsigned = false; + HInstruction* opa_promoted = opa; + HInstruction* opb_promoted = opb; + bool is_int_case = DataType::Type::kInt32 == opa->GetType() && + DataType::Type::kInt32 == opb->GetType(); + + // Condition arguments should be either both int32 or consistently extended signed/unsigned + // narrower operands. + if (!is_int_case && + !IsNarrowerOperands(opa, opb, type, &opa_promoted, &opb_promoted, &is_unsigned)) { + return false; + } + type = HVecOperation::ToProperType(type, is_unsigned); + + // For narrow types, explicit type conversion may have been + // optimized away, so set the no hi bits restriction here. 
+ if (DataType::Size(type) <= 2) { + restrictions |= kNoHiBits; + } + + if (!TrySetVectorType(type, &restrictions) || + HasVectorRestrictions(restrictions, kNoIfCond)) { + return false; + } + + if (generate_code && vector_mode_ != kVector) { // de-idiom + opa_promoted = opa; + opb_promoted = opb; + } + + if (VectorizeUse(node, opa_promoted, generate_code, type, restrictions) && + VectorizeUse(node, opb_promoted, generate_code, type, restrictions)) { + if (generate_code) { + HInstruction* vec_cond = GenerateVecOp(cond, + vector_map_->Get(opa_promoted), + vector_map_->Get(opb_promoted), + type); + + if (vector_mode_ == kVector) { + HInstruction* vec_pred_not = new (global_allocator_) HVecPredNot( + global_allocator_, vec_cond, type, vector_length_, hif->GetDexPc()); + + vector_map_->Put(hif, vec_pred_not); + BlockPredicateInfo* pred_info = predicate_info_map_->Get(hif->GetBlock()); + pred_info->SetControlFlowInfo(vec_cond->AsVecPredSetOperation(), + vec_pred_not->AsVecPredSetOperation()); + } else { + DCHECK(vector_mode_ == kSequential); + UNREACHABLE(); + } + } + return true; + } + + return false; +} + // // Vectorization heuristics. 
// @@ -2834,4 +3106,67 @@ bool HLoopOptimization::CanRemoveCycle() { return true; } +void HLoopOptimization::PreparePredicateInfoMap(LoopNode* node) { + HLoopInformation* loop_info = node->loop_info; + + DCHECK(IsPredicatedLoopControlFlowSupported(loop_info)); + + for (HBlocksInLoopIterator block_it(*loop_info); + !block_it.Done(); + block_it.Advance()) { + HBasicBlock* cur_block = block_it.Current(); + BlockPredicateInfo* pred_info = new (loop_allocator_) BlockPredicateInfo(); + + predicate_info_map_->Put(cur_block, pred_info); + } +} + +void HLoopOptimization::InitPredicateInfoMap(LoopNode* node, + HVecPredSetOperation* loop_main_pred) { + HLoopInformation* loop_info = node->loop_info; + HBasicBlock* header = loop_info->GetHeader(); + BlockPredicateInfo* header_info = predicate_info_map_->Get(header); + // Loop header is a special case; it doesn't have a false predicate because we + // would just exit the loop then. + header_info->SetControlFlowInfo(loop_main_pred, loop_main_pred); + + size_t blocks_in_loop = header->GetLoopInformation()->GetBlocks().NumSetBits(); + if (blocks_in_loop == 2) { + for (HBasicBlock* successor : header->GetSuccessors()) { + if (loop_info->Contains(*successor)) { + // This is loop second block - body. + BlockPredicateInfo* body_info = predicate_info_map_->Get(successor); + body_info->SetControlPredicate(loop_main_pred); + return; + } + } + UNREACHABLE(); + } + + // TODO: support predicated vectorization of CF loop of more complex structure. + DCHECK(HasLoopDiamondStructure(loop_info)); + HBasicBlock* header_succ_0 = header->GetSuccessors()[0]; + HBasicBlock* header_succ_1 = header->GetSuccessors()[1]; + HBasicBlock* diamond_top = loop_info->Contains(*header_succ_0) ? 
+ header_succ_0 : + header_succ_1; + + HIf* diamond_hif = diamond_top->GetLastInstruction()->AsIf(); + HBasicBlock* diamond_true = diamond_hif->IfTrueSuccessor(); + HBasicBlock* diamond_false = diamond_hif->IfFalseSuccessor(); + HBasicBlock* back_edge = diamond_true->GetSingleSuccessor(); + + BlockPredicateInfo* diamond_top_info = predicate_info_map_->Get(diamond_top); + BlockPredicateInfo* diamond_true_info = predicate_info_map_->Get(diamond_true); + BlockPredicateInfo* diamond_false_info = predicate_info_map_->Get(diamond_false); + BlockPredicateInfo* back_edge_info = predicate_info_map_->Get(back_edge); + + diamond_top_info->SetControlPredicate(header_info->GetTruePredicate()); + + diamond_true_info->SetControlPredicate(diamond_top_info->GetTruePredicate()); + diamond_false_info->SetControlPredicate(diamond_top_info->GetFalsePredicate()); + + back_edge_info->SetControlPredicate(header_info->GetTruePredicate()); +} + } // namespace art diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 3da8f8fe39..86a9f0fcb8 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -101,6 +101,7 @@ class HLoopOptimization : public HOptimization { kNoSAD = 1 << 11, // no sum of absolute differences (SAD) kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening kNoDotProd = 1 << 13, // no dot product + kNoIfCond = 1 << 14, // no if condition conversion }; /* @@ -136,6 +137,95 @@ class HLoopOptimization : public HOptimization { bool is_string_char_at; // compressed string read }; + // This structure describes the control flow (CF) -> data flow (DF) conversion of the loop + // with control flow (see below) for the purpose of predicated autovectorization. + // + // Lets define "loops without control-flow" (or non-CF loops) as loops with two consecutive + // blocks and without the branching structure except for the loop exit. 
And + // "loop with control-flow" (or CF-loops) - all other loops. + // + // In the execution of the original CF-loop on each iteration some basic block Y will be + // either executed or not executed, depending on the control flow of the loop. More + // specifically, a block will be executed if all the conditional branches of the nodes in + // the control dependency graph for that block Y are taken according to the path from the loop + // header to that basic block. + // + // This is the key idea of CF->DF conversion: a boolean value + // 'ctrl_pred == cond1 && cond2 && ...' will determine whether the basic block Y will be + // executed, where cond_K is whether the branch of the node K in the control dependency + // graph upward traversal was taken in the 'right' direction. + // + // Def.: BB Y is control dependent on BB X iff + // (1) there exists a directed path P from X to Y with any basic block Z in P (excluding X + // and Y) post-dominated by Y and + // (2) X is not post-dominated by Y. + // ... + // X + // false / \ true + // / \ + // ... + // | + // Y + // ... + // + // When doing predicated autovectorization of a CF loop, we use the CF->DF conversion approach: + // 1) do the data analysis and vector operation creation as if it was a non-CF loop. + // 2) for each HIf block create two vector predicate setting instructions - for True and False + // edges/paths. + // 3) assign a governing vector predicate (see comments near HVecPredSetOperation) + // to each vector operation Alpha in the loop (including to those vector predicate setting + // instructions created in #2); do this by: + // - finding the immediate control dependent block of the instruction Alpha's block. + // - choosing the True or False predicate setting instruction (created in #2) depending + // on the path to the instruction. 
+  //
+  // For more information check the papers:
+  //
+  // - Allen, John R and Kennedy, Ken and Porterfield, Carrie and Warren, Joe,
+  //   “Conversion of Control Dependence to Data Dependence,” in Proceedings of the 10th ACM
+  //   SIGACT-SIGPLAN Symposium on Principles of Programming Languages, 1983, pp. 177–189.
+  // - JEANNE FERRANTE, KARL J. OTTENSTEIN, JOE D. WARREN,
+  //   "The Program Dependence Graph and Its Use in Optimization"
+  //
+  class BlockPredicateInfo : public ArenaObject<kArenaAllocLoopOptimization> {
+   public:
+    BlockPredicateInfo() :
+        control_predicate_(nullptr),
+        true_predicate_(nullptr),
+        false_predicate_(nullptr) {}
+
+    void SetControlFlowInfo(HVecPredSetOperation* true_predicate,
+                            HVecPredSetOperation* false_predicate) {
+      DCHECK(!HasControlFlowOps());
+      true_predicate_ = true_predicate;
+      false_predicate_ = false_predicate;
+    }
+
+    bool HasControlFlowOps() const {
+      // Note: a block must have both T/F predicates set or none of them.
+      DCHECK_EQ(true_predicate_ == nullptr, false_predicate_ == nullptr);
+      return true_predicate_ != nullptr;
+    }
+
+    HVecPredSetOperation* GetControlPredicate() const { return control_predicate_; }
+    void SetControlPredicate(HVecPredSetOperation* control_predicate) {
+      control_predicate_ = control_predicate;
+    }
+
+    HVecPredSetOperation* GetTruePredicate() const { return true_predicate_; }
+    HVecPredSetOperation* GetFalsePredicate() const { return false_predicate_; }
+
+   private:
+    // Vector control predicate operation, associated with the block which will determine
+    // the active lanes for all vector operations, originated from this block.
+    HVecPredSetOperation* control_predicate_;
+
+    // Vector predicate instruction, associated with the true successor of the block.
+    HVecPredSetOperation* true_predicate_;
+    // Vector predicate instruction, associated with the false successor of the block.
+    HVecPredSetOperation* false_predicate_;
+  };
+
   //
   // Loop setup and traversal.
// @@ -208,10 +298,12 @@ class HLoopOptimization : public HOptimization { // - checks whether instructions are vectorizable for the target. // - conducts data dependence analysis for array references. // - additionally, collects info on peeling and aligment strategy. - bool CanVectorizeDataFlow(LoopNode* node, HBasicBlock* block, bool collect_alignment_info); + bool CanVectorizeDataFlow(LoopNode* node, HBasicBlock* header, bool collect_alignment_info); + // Does the checks (common for predicated and traditional mode) for the loop. + bool ShouldVectorizeCommon(LoopNode* node, HPhi* main_phi, int64_t trip_count); - // Vectorizes the loop for which all checks have been already done. + // Try to vectorize the loop, returns whether it was successful. // // There are two versions/algorithms: // - Predicated: all the vector operations have governing predicates which control @@ -220,6 +312,19 @@ class HLoopOptimization : public HOptimization { // - Traditional: a regular mode in which all vector operations lanes are unconditionally // active. // Example: vectoriation using AArch64 NEON. + bool TryVectorizePredicated(LoopNode* node, + HBasicBlock* body, + HBasicBlock* exit, + HPhi* main_phi, + int64_t trip_count); + + bool TryVectorizedTraditional(LoopNode* node, + HBasicBlock* body, + HBasicBlock* exit, + HPhi* main_phi, + int64_t trip_count); + + // Vectorizes the loop for which all checks have been already done. void VectorizePredicated(LoopNode* node, HBasicBlock* block, HBasicBlock* exit); @@ -230,14 +335,13 @@ class HLoopOptimization : public HOptimization { // Performs final steps for whole vectorization process: links reduction, removes the original // scalar loop, updates loop info. - void FinalizeVectorization(LoopNode* node, HBasicBlock* block); + void FinalizeVectorization(LoopNode* node); // Helpers that do the vector instruction synthesis for the previously created loop; create // and fill the loop body with instructions. 
// // A version to generate a vector loop in predicated mode. void GenerateNewLoopPredicated(LoopNode* node, - HBasicBlock* block, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, @@ -246,7 +350,6 @@ class HLoopOptimization : public HOptimization { // A version to generate a vector loop in traditional mode or to generate // a scalar loop for both modes. void GenerateNewLoopScalarOrTraditional(LoopNode* node, - HBasicBlock* block, HBasicBlock* new_preheader, HInstruction* lo, HInstruction* hi, @@ -264,9 +367,15 @@ class HLoopOptimization : public HOptimization { // Finalizes reduction and induction phis' inputs for the newly created loop. void FinalizePhisForNewLoop(HPhi* phi, HInstruction* lo); + // Creates empty predicate info object for each basic block and puts it into the map. + void PreparePredicateInfoMap(LoopNode* node); + + // Set up block true/false predicates using info, collected through data flow and control + // dependency analysis. + void InitPredicateInfoMap(LoopNode* node, HVecPredSetOperation* loop_main_pred); + // Performs instruction synthesis for the loop body. void GenerateNewLoopBodyOnce(LoopNode* node, - HBasicBlock* body, DataType::Type induc_type, HInstruction* step); @@ -300,10 +409,10 @@ class HLoopOptimization : public HOptimization { void GenerateVecReductionPhi(HPhi* phi); void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); - void GenerateVecOp(HInstruction* org, - HInstruction* opa, - HInstruction* opb, - DataType::Type type); + HInstruction* GenerateVecOp(HInstruction* org, + HInstruction* opa, + HInstruction* opb, + DataType::Type type); // Vectorization idioms. 
bool VectorizeSaturationIdiom(LoopNode* node, @@ -326,6 +435,10 @@ class HLoopOptimization : public HOptimization { bool generate_code, DataType::Type type, uint64_t restrictions); + bool VectorizeIfCondition(LoopNode* node, + HInstruction* instruction, + bool generate_code, + uint64_t restrictions); // Vectorization heuristics. Alignment ComputeAlignment(HInstruction* offset, @@ -435,13 +548,17 @@ class HLoopOptimization : public HOptimization { // for loop reductions). ScopedArenaSet<HInstruction*>* vector_external_set_; + // A mapping between a basic block of the original loop and its associated PredicateInfo. + // + // Only used in predicated loop vectorization mode. + ScopedArenaSafeMap<HBasicBlock*, BlockPredicateInfo*>* predicate_info_map_; + // Temporary vectorization bookkeeping. VectorMode vector_mode_; // synthesis mode HBasicBlock* vector_preheader_; // preheader of the new loop HBasicBlock* vector_header_; // header of the new loop HBasicBlock* vector_body_; // body of the new loop HInstruction* vector_index_; // normalized index of the new loop - HInstruction* loop_main_pred_; // Loop main predicate - for predicated mode. // Helper for target-specific behaviour for loop optimizations. 
ArchNoOptsLoopHelper* arch_loop_helper_; diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc index 83b58763a4..9caa766858 100644 --- a/compiler/optimizing/nodes.cc +++ b/compiler/optimizing/nodes.cc @@ -2797,8 +2797,11 @@ HInstruction* HGraph::InlineInto(HGraph* outer_graph, HInvoke* invoke) { if (HasMonitorOperations()) { outer_graph->SetHasMonitorOperations(true); } - if (HasSIMD()) { - outer_graph->SetHasSIMD(true); + if (HasTraditionalSIMD()) { + outer_graph->SetHasTraditionalSIMD(true); + } + if (HasPredicatedSIMD()) { + outer_graph->SetHasPredicatedSIMD(true); } if (HasAlwaysThrowingInvokes()) { outer_graph->SetHasAlwaysThrowingInvokes(true); @@ -3125,6 +3128,8 @@ void HGraph::TransformLoopHeaderForBCE(HBasicBlock* header) { new_pre_header, old_pre_header, /* replace_if_back_edge= */ false); } +// Creates a new two-basic-block loop and inserts it between original loop header and +// original loop exit; also adjusts dominators, post order and new LoopInformation. 
HBasicBlock* HGraph::TransformLoopForVectorization(HBasicBlock* header, HBasicBlock* body, HBasicBlock* exit) { diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h index 9643e4c789..dec86e30dd 100644 --- a/compiler/optimizing/nodes.h +++ b/compiler/optimizing/nodes.h @@ -403,7 +403,8 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { has_bounds_checks_(false), has_try_catch_(false), has_monitor_operations_(false), - has_simd_(false), + has_traditional_simd_(false), + has_predicated_simd_(false), has_loops_(false), has_irreducible_loops_(false), has_direct_critical_native_call_(false), @@ -708,8 +709,13 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { bool HasMonitorOperations() const { return has_monitor_operations_; } void SetHasMonitorOperations(bool value) { has_monitor_operations_ = value; } - bool HasSIMD() const { return has_simd_; } - void SetHasSIMD(bool value) { has_simd_ = value; } + bool HasTraditionalSIMD() { return has_traditional_simd_; } + void SetHasTraditionalSIMD(bool value) { has_traditional_simd_ = value; } + + bool HasPredicatedSIMD() { return has_predicated_simd_; } + void SetHasPredicatedSIMD(bool value) { has_predicated_simd_ = value; } + + bool HasSIMD() const { return has_traditional_simd_ || has_predicated_simd_; } bool HasLoops() const { return has_loops_; } void SetHasLoops(bool value) { has_loops_ = value; } @@ -822,10 +828,11 @@ class HGraph : public ArenaObject<kArenaAllocGraph> { // DexRegisterMap to be present to allow deadlock analysis for non-debuggable code. bool has_monitor_operations_; - // Flag whether SIMD instructions appear in the graph. If true, the - // code generators may have to be more careful spilling the wider + // Flags whether SIMD (traditional or predicated) instructions appear in the graph. + // If either is true, the code generators may have to be more careful spilling the wider // contents of SIMD registers. 
- bool has_simd_; + bool has_traditional_simd_; + bool has_predicated_simd_; // Flag whether there are any loops in the graph. We can skip loop // optimization if it's false. @@ -1636,7 +1643,9 @@ class HLoopInformationOutwardIterator : public ValueObject { M(VecStore, VecMemoryOperation) \ M(VecPredSetAll, VecPredSetOperation) \ M(VecPredWhile, VecPredSetOperation) \ - M(VecPredCondition, VecOperation) \ + M(VecPredToBoolean, VecOperation) \ + M(VecCondition, VecPredSetOperation) \ + M(VecPredNot, VecPredSetOperation) \ #define FOR_EACH_CONCRETE_INSTRUCTION_COMMON(M) \ FOR_EACH_CONCRETE_INSTRUCTION_SCALAR_COMMON(M) \ @@ -8634,7 +8643,7 @@ class CloneAndReplaceInstructionVisitor final : public HGraphDelegateVisitor { DISALLOW_COPY_AND_ASSIGN(CloneAndReplaceInstructionVisitor); }; -// Iterator over the blocks that art part of the loop. Includes blocks part +// Iterator over the blocks that are part of the loop; includes blocks which are part // of an inner loop. The order in which the blocks are iterated is on their // block id. class HBlocksInLoopIterator : public ValueObject { @@ -8667,7 +8676,7 @@ class HBlocksInLoopIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopIterator); }; -// Iterator over the blocks that art part of the loop. Includes blocks part +// Iterator over the blocks that are part of the loop; includes blocks which are part // of an inner loop. The order in which the blocks are iterated is reverse // post order. class HBlocksInLoopReversePostOrderIterator : public ValueObject { @@ -8700,6 +8709,39 @@ class HBlocksInLoopReversePostOrderIterator : public ValueObject { DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopReversePostOrderIterator); }; +// Iterator over the blocks that are part of the loop; includes blocks which are part +// of an inner loop. The order in which the blocks are iterated is post order. 
+class HBlocksInLoopPostOrderIterator : public ValueObject { + public: + explicit HBlocksInLoopPostOrderIterator(const HLoopInformation& info) + : blocks_in_loop_(info.GetBlocks()), + blocks_(info.GetHeader()->GetGraph()->GetReversePostOrder()), + index_(blocks_.size() - 1) { + if (!blocks_in_loop_.IsBitSet(blocks_[index_]->GetBlockId())) { + Advance(); + } + } + + bool Done() const { return index_ < 0; } + HBasicBlock* Current() const { return blocks_[index_]; } + void Advance() { + --index_; + for (; index_ >= 0; --index_) { + if (blocks_in_loop_.IsBitSet(blocks_[index_]->GetBlockId())) { + break; + } + } + } + + private: + const BitVector& blocks_in_loop_; + const ArenaVector<HBasicBlock*>& blocks_; + + int32_t index_; + + DISALLOW_COPY_AND_ASSIGN(HBlocksInLoopPostOrderIterator); +}; + // Returns int64_t value of a properly typed constant. inline int64_t Int64FromConstant(HConstant* constant) { if (constant->IsIntConstant()) { diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h index 73f6c40a0d..6a60d6be01 100644 --- a/compiler/optimizing/nodes_vector.h +++ b/compiler/optimizing/nodes_vector.h @@ -1384,8 +1384,8 @@ class HVecPredWhile final : public HVecPredSetOperation { static constexpr size_t kCondKind = HVecOperation::kNumberOfVectorOpPackedBits; static constexpr size_t kCondKindSize = MinimumBitsToStore(static_cast<size_t>(CondKind::kLast)); - static constexpr size_t kNumberOfVecPredConditionPackedBits = kCondKind + kCondKindSize; - static_assert(kNumberOfVecPredConditionPackedBits <= kMaxNumberOfPackedBits, + static constexpr size_t kNumberOfVecPredWhilePackedBits = kCondKind + kCondKindSize; + static_assert(kNumberOfVecPredWhilePackedBits <= kMaxNumberOfPackedBits, "Too many packed fields."); using CondKindField = BitField<CondKind, kCondKind, kCondKindSize>; @@ -1395,13 +1395,13 @@ class HVecPredWhile final : public HVecPredSetOperation { // Evaluates the predicate condition (PCondKind) for a vector predicate; outputs // 
a scalar boolean value result. // -// Note: as VecPredCondition can be also predicated, only active elements (determined by the +// Note: as VecPredToBoolean can be also predicated, only active elements (determined by the // instruction's governing predicate) of the input vector predicate are used for condition // evaluation. // // Note: this instruction is currently used as a workaround for the fact that IR instructions // can't have more than one output. -class HVecPredCondition final : public HVecOperation { +class HVecPredToBoolean final : public HVecOperation { public: // To get more info on the condition kinds please see "2.2 Process state, PSTATE" section of // "ARM Architecture Reference Manual Supplement. The Scalable Vector Extension (SVE), @@ -1418,13 +1418,13 @@ class HVecPredCondition final : public HVecOperation { kEnumLast = kPLast }; - HVecPredCondition(ArenaAllocator* allocator, + HVecPredToBoolean(ArenaAllocator* allocator, HInstruction* input, PCondKind pred_cond, DataType::Type packed_type, size_t vector_length, uint32_t dex_pc) - : HVecOperation(kVecPredCondition, + : HVecOperation(kVecPredToBoolean, allocator, packed_type, SideEffects::None(), @@ -1447,19 +1447,86 @@ class HVecPredCondition final : public HVecOperation { return GetPackedField<CondKindField>(); } - DECLARE_INSTRUCTION(VecPredCondition); + DECLARE_INSTRUCTION(VecPredToBoolean); protected: // Additional packed bits. 
static constexpr size_t kCondKind = HVecOperation::kNumberOfVectorOpPackedBits; static constexpr size_t kCondKindSize = MinimumBitsToStore(static_cast<size_t>(PCondKind::kEnumLast)); - static constexpr size_t kNumberOfVecPredConditionPackedBits = kCondKind + kCondKindSize; - static_assert(kNumberOfVecPredConditionPackedBits <= kMaxNumberOfPackedBits, + static constexpr size_t kNumberOfVecPredToBooleanPackedBits = kCondKind + kCondKindSize; + static_assert(kNumberOfVecPredToBooleanPackedBits <= kMaxNumberOfPackedBits, "Too many packed fields."); using CondKindField = BitField<PCondKind, kCondKind, kCondKindSize>; - DEFAULT_COPY_CONSTRUCTOR(VecPredCondition); + DEFAULT_COPY_CONSTRUCTOR(VecPredToBoolean); +}; + +// Evaluates condition for pairwise elements in two input vectors and sets the result +// as an output predicate vector. +// +// viz. [ p1, .. , pn ] = [ x1 OP y1 , x2 OP y2, .. , xn OP yn] where OP is CondKind +// condition. +// +// Currently only kEqual is supported by this vector instruction - we don't even define +// the kCondType here. +// TODO: support other condition ops. +class HVecCondition final : public HVecPredSetOperation { + public: + HVecCondition(ArenaAllocator* allocator, + HInstruction* left, + HInstruction* right, + DataType::Type packed_type, + size_t vector_length, + uint32_t dex_pc) : + HVecPredSetOperation(kVecCondition, + allocator, + packed_type, + SideEffects::None(), + /* number_of_inputs= */ 2, + vector_length, + dex_pc) { + DCHECK(left->IsVecOperation()); + DCHECK(!left->IsVecPredSetOperation()); + DCHECK(right->IsVecOperation()); + DCHECK(!right->IsVecPredSetOperation()); + SetRawInputAt(0, left); + SetRawInputAt(1, right); + } + + DECLARE_INSTRUCTION(VecCondition); + + protected: + DEFAULT_COPY_CONSTRUCTOR(VecCondition); +}; + +// Inverts every component in the predicate vector. +// +// viz. [ p1, .. , pn ] = [ !px1 , !px2 , .. , !pxn ]. 
+class HVecPredNot final : public HVecPredSetOperation { + public: + HVecPredNot(ArenaAllocator* allocator, + HInstruction* input, + DataType::Type packed_type, + size_t vector_length, + uint32_t dex_pc) : + HVecPredSetOperation(kVecPredNot, + allocator, + packed_type, + SideEffects::None(), + /* number_of_inputs= */ 1, + vector_length, + dex_pc) { + DCHECK(input->IsVecOperation()); + DCHECK(input->IsVecPredSetOperation()); + + SetRawInputAt(0, input); + } + + DECLARE_INSTRUCTION(VecPredNot); + + protected: + DEFAULT_COPY_CONSTRUCTOR(VecPredNot); }; } // namespace art diff --git a/test/530-checker-lse-simd/src/Main.java b/test/530-checker-lse-simd/src/Main.java index 619ac281b3..ec2faf51ec 100644 --- a/test/530-checker-lse-simd/src/Main.java +++ b/test/530-checker-lse-simd/src/Main.java @@ -38,10 +38,17 @@ public class Main { /// CHECK-NEXT: Sub /// CHECK-NEXT: Mul /// CHECK-NEXT: ArraySet + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NEXT: ArrayGet + // + /// CHECK-FI: /// CHECK-NEXT: LessThanOrEqual /// CHECK-NEXT: Select /// CHECK-NEXT: Add /// CHECK-NEXT: Goto loop:{{B\d+}} + // + // TODO: reenable LSE for graphs with Predicated SIMD. static double $noinline$vecgen(double a[], double b[], int n) { double norma = 0.0; int init = 1325; @@ -87,10 +94,17 @@ public class Main { /// CHECK-NEXT: ArrayGet /// CHECK-NEXT: Mul /// CHECK-NEXT: ArraySet + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NEXT: ArrayGet + // + /// CHECK-FI: /// CHECK-NEXT: ArrayLength /// CHECK-NEXT: BelowOrEqual // /// CHECK: Return + // + // TODO: reenable LSE for graphs with Predicated SIMD. 
static double $noinline$test02(double a[], int n) { double b[] = new double[n]; a[0] = a[0] / 2; @@ -120,7 +134,13 @@ public class Main { /// CHECK-NEXT: Return /// CHECK-START: double Main.$noinline$test03(int) load_store_elimination (after) - /// CHECK-NOT: ArrayGet loop:none + /// CHECK-IF: not hasIsaFeature("sve") + // + /// CHECK-NOT: ArrayGet loop:none + // + /// CHECK-FI: + // + // TODO: reenable LSE for graphs with Predicated SIMD. static double $noinline$test03(int n) { double a[] = new double[n]; double b[] = new double[n]; @@ -164,7 +184,14 @@ public class Main { /// CHECK: Add /// CHECK: Goto loop:{{B\d+}} // - /// CHECK-NOT: VecStore + + /// CHECK-IF: not hasIsaFeature("sve") + // + /// CHECK-NOT: VecStore + // + /// CHECK-FI: + // + // TODO: reenable LSE for graphs with Predicated SIMD. static double[] $noinline$test04(int n) { double a[] = new double[n]; double b[] = new double[n]; @@ -194,18 +221,14 @@ public class Main { /// CHECK: Goto loop:{{B\d+}} /// CHECK-START-ARM64: double[] Main.$noinline$test05(int) load_store_elimination (after) - /// CHECK-IF: not hasIsaFeature("sve") - // - // In NEON case there is a post-loop which prevents the store to be removed. - /// CHECK: VecStore - // - /// CHECK-FI: - // + /// CHECK: VecStore /// CHECK: VecStore /// CHECK: Add /// CHECK: Goto loop:{{B\d+}} // /// CHECK-NOT: VecStore + // + // TODO: reenable LSE for graphs with Predicated SIMD. static double[] $noinline$test05(int n) { double a[] = new double[n]; double b[] = new double[n]; @@ -249,7 +272,13 @@ public class Main { /// CHECK: VecAdd /// CHECK: VecStore // - /// CHECK-NOT: VecStore + /// CHECK-IF: not hasIsaFeature("sve") + // + /// CHECK-NOT: VecStore + // + /// CHECK-FI: + // + // TODO: reenable LSE for graphs with Predicated SIMD. 
static double[] $noinline$test06(int n) { double a[] = new double[n]; double b[] = new double[n]; diff --git a/test/661-checker-simd-cf-loops/Android.bp b/test/661-checker-simd-cf-loops/Android.bp new file mode 100644 index 0000000000..8d2c193c02 --- /dev/null +++ b/test/661-checker-simd-cf-loops/Android.bp @@ -0,0 +1,43 @@ +// Generated by `regen-test-files`. Do not edit manually. + +// Build rules for ART run-test `661-checker-simd-cf-loops`. + +package { + // See: http://go/android-license-faq + // A large-scale-change added 'default_applicable_licenses' to import + // all of the 'license_kinds' from "art_license" + // to get the below license kinds: + // SPDX-license-identifier-Apache-2.0 + default_applicable_licenses: ["art_license"], +} + +// Test's Dex code. +java_test { + name: "art-run-test-661-checker-simd-cf-loops", + defaults: ["art-run-test-defaults"], + test_config_template: ":art-run-test-target-template", + srcs: ["src/**/*.java"], + data: [ + ":art-run-test-661-checker-simd-cf-loops-expected-stdout", + ":art-run-test-661-checker-simd-cf-loops-expected-stderr", + ], + // Include the Java source files in the test's artifacts, to make Checker assertions + // available to the TradeFed test runner. + include_srcs: true, +} + +// Test's expected standard output. +genrule { + name: "art-run-test-661-checker-simd-cf-loops-expected-stdout", + out: ["art-run-test-661-checker-simd-cf-loops-expected-stdout.txt"], + srcs: ["expected-stdout.txt"], + cmd: "cp -f $(in) $(out)", +} + +// Test's expected standard error. 
+genrule { + name: "art-run-test-661-checker-simd-cf-loops-expected-stderr", + out: ["art-run-test-661-checker-simd-cf-loops-expected-stderr.txt"], + srcs: ["expected-stderr.txt"], + cmd: "cp -f $(in) $(out)", +} diff --git a/test/661-checker-simd-cf-loops/expected-stderr.txt b/test/661-checker-simd-cf-loops/expected-stderr.txt new file mode 100644 index 0000000000..e69de29bb2 --- /dev/null +++ b/test/661-checker-simd-cf-loops/expected-stderr.txt diff --git a/test/661-checker-simd-cf-loops/expected-stdout.txt b/test/661-checker-simd-cf-loops/expected-stdout.txt new file mode 100644 index 0000000000..b0aad4deb5 --- /dev/null +++ b/test/661-checker-simd-cf-loops/expected-stdout.txt @@ -0,0 +1 @@ +passed diff --git a/test/661-checker-simd-cf-loops/info.txt b/test/661-checker-simd-cf-loops/info.txt new file mode 100644 index 0000000000..bc7881d16a --- /dev/null +++ b/test/661-checker-simd-cf-loops/info.txt @@ -0,0 +1 @@ +Functional tests on vectorization of loops with control flow. diff --git a/test/661-checker-simd-cf-loops/src/Main.java b/test/661-checker-simd-cf-loops/src/Main.java new file mode 100644 index 0000000000..95c09490bb --- /dev/null +++ b/test/661-checker-simd-cf-loops/src/Main.java @@ -0,0 +1,746 @@ +/* + * Copyright (C) 2023 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests for autovectorization of loops with control flow. 
+ */ +public class Main { + + public static final int ARRAY_LENGTH = 128; + public static final int USED_ARRAY_LENGTH = ARRAY_LENGTH - 1; + + public static boolean[] booleanArray = new boolean[ARRAY_LENGTH]; + public static boolean[] booleanArray2 = new boolean[ARRAY_LENGTH]; + public static byte[] byteArray = new byte[ARRAY_LENGTH]; + public static short[] shortArray = new short[ARRAY_LENGTH]; + public static char[] charArray = new char[ARRAY_LENGTH]; + public static int[] intArray = new int[ARRAY_LENGTH]; + public static long[] longArray = new long[ARRAY_LENGTH]; + public static float[] floatArray = new float[ARRAY_LENGTH]; + public static double[] doubleArray = new double[ARRAY_LENGTH]; + + public static final int MAGIC_VALUE_A = 2; + public static final int MAGIC_VALUE_B = 10; + public static final int MAGIC_VALUE_C = 100; + + public static final int MAGIC_ADD_CONST = 99; + + public static final float MAGIC_FLOAT_VALUE_A = 2.0f; + public static final float MAGIC_FLOAT_VALUE_B = 10.0f; + public static final float MAGIC_FLOAT_VALUE_C = 100.0f; + + public static final float MAGIC_FLOAT_ADD_CONST = 99.0f; + + /// CHECK-START-ARM64: int Main.$compile$noinline$FullDiamond(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: <<C0:i\d+>> IntConstant 0 loop:none + /// CHECK-DAG: <<C4:i\d+>> IntConstant 4 loop:none + /// CHECK-DAG: <<C99:i\d+>> IntConstant 99 loop:none + /// CHECK-DAG: <<C100:i\d+>> IntConstant 100 loop:none + /// CHECK-DAG: <<Vec4:d\d+>> VecReplicateScalar [<<C4>>,{{j\d+}}] loop:none + /// CHECK-DAG: <<Vec99:d\d+>> VecReplicateScalar [<<C99>>,{{j\d+}}] loop:none + /// CHECK-DAG: <<Vec100:d\d+>> VecReplicateScalar [<<C100>>,{{j\d+}}] loop:none + // + /// CHECK-DAG: <<Phi:i\d+>> Phi [<<C0>>,{{i\d+}}] loop:<<Loop:B\d+>> outer_loop:none + /// CHECK-DAG: <<LoopP:j\d+>> VecPredWhile [<<Phi>>,{{i\d+}}] loop:<<Loop>> outer_loop:none + // + /// CHECK-DAG: <<Load1:d\d+>> VecLoad [<<Arr:l\d+>>,<<Phi>>,<<LoopP>>] loop:<<Loop>> 
outer_loop:none + /// CHECK-DAG: <<Cond:j\d+>> VecCondition [<<Load1>>,<<Vec100>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<CondR:j\d+>> VecPredNot [<<Cond>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<AddT:d\d+>> VecAdd [<<Load1>>,<<Vec99>>,<<CondR>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<StT:d\d+>> VecStore [<<Arr>>,<<Phi>>,<<AddT>>,<<CondR>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<StF:d\d+>> VecStore [<<Arr>>,<<Phi>>,{{d\d+}},<<Cond>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Ld2:d\d+>> VecLoad [<<Arr>>,<<Phi>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<Add2:d\d+>> VecAdd [<<Ld2>>,<<Vec4>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + /// CHECK-DAG: <<St21:d\d+>> VecStore [<<Arr>>,<<Phi>>,<<Add2>>,<<LoopP>>] loop:<<Loop>> outer_loop:none + // + /// CHECK-ELSE: + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + public static int $compile$noinline$FullDiamond(int[] x) { + int i = 0; + for (; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } else { + x[i] += 3; + } + x[i] += 4; + } + return i; + } + + // + // Test various types. + // + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleBoolean(boolean[], boolean[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support extra condition types and boolean comparisons. 
+ public static void $compile$noinline$SimpleBoolean(boolean[] x, boolean[] y) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + boolean val = x[i]; + if (val != y[i]) { + x[i] |= y[i]; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleByte(byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleByte(byte[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + byte val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleUByte(byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleUByte(byte[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + if ((x[i] & 0xFF) != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleShort(short[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleShort(short[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + short val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleChar(char[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleChar(char[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + char val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleInt(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$SimpleInt(int[] x) { + for (int 
i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleLong(long[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support long comparisons. + public static void $compile$noinline$SimpleLong(long[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + long val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleFloat(float[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support FP comparisons. + public static void $compile$noinline$SimpleFloat(float[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + float val = x[i]; + if (val > 10.0f) { + x[i] += 99.1f; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleDouble(double[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support FP comparisons. + public static void $compile$noinline$SimpleDouble(double[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + double val = x[i]; + if (val != 10.0) { + x[i] += 99.1; + } + } + } + + // + // Narrowing types. 
+ // + + /// CHECK-START-ARM64: void Main.$compile$noinline$ByteConv(byte[], byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$ByteConv(byte[] x, byte[] y) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + byte val = (byte)(x[i] + 1); + if (val != y[i]) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$UByteAndWrongConst(byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // 'NarrowerOperands' not met: the constant is not a ubyte one. + public static void $compile$noinline$UByteAndWrongConst(byte[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + if ((x[i] & 0xFF) != (MAGIC_VALUE_C | 0x100)) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$ByteNoHiBits(byte[], byte[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // Check kNoHiBits case when "wider" operations cannot bring in higher order bits. + public static void $compile$noinline$ByteNoHiBits(byte[] x, byte[] y) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + byte val = x[i]; + if ((val >>> 3) != y[i]) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + // + // Test condition types. + // + + /// CHECK-START-ARM64: void Main.$compile$noinline$SimpleBelow(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: Support other conditions. + public static void $compile$noinline$SimpleBelow(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val < MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + } + } + + // + // Test vectorization idioms. 
+ // + + /// CHECK-START-ARM64: void Main.$compile$noinline$Select(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: vectorize loops with select in the body. + public static void $compile$noinline$Select(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + val += MAGIC_ADD_CONST; + } + x[i] = val; + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$Phi(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: vectorize loops with phis in the body. + public static void $compile$noinline$Phi(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + val += MAGIC_ADD_CONST; + x[i] += val; + } + x[i] += val; + } + } + + // TODO: when Phis are supported, test dotprod and sad idioms. + + /// CHECK-START-ARM64: int Main.$compile$noinline$Reduction(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // TODO: vectorize loops with phis and reductions in the body. + private static int $compile$noinline$Reduction(int[] x) { + int sum = 0; + for (int i = 0; i < ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + sum += val + x[i]; + } + } + return sum; + } + + /// CHECK-START-ARM64: int Main.$compile$noinline$ReductionBackEdge(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-DAG: VecLoad + // + /// CHECK-FI: + // + // Reduction in the back edge block, non-CF-dependent. + public static int $compile$noinline$ReductionBackEdge(int[] x) { + int sum = 0; + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + } + sum += x[i]; + } + return sum; + } + + // + // Negative compile tests. 
+ // + + public static final int STENCIL_ARRAY_SIZE = 130; + + /// CHECK-START-ARM64: void Main.$compile$noinline$stencilAlike(int[], int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // This loop needs a runtime test for array references disambiguation and a scalar cleanup loop. + // Currently we can't generate a scalar clean up loop with control flow. + private static void $compile$noinline$stencilAlike(int[] a, int[] b) { + for (int i = 1; i < STENCIL_ARRAY_SIZE - 1; i++) { + int val0 = b[i - 1]; + int val1 = b[i]; + int val2 = b[i + 1]; + int un = a[i]; + if (val1 != MAGIC_VALUE_C) { + a[i] = val0 + val1 + val2; + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$NotDiamondCf(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + // + // Loops with complex CF are not supported. + public static void $compile$noinline$NotDiamondCf(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + if (val != 1234) { + x[i] += MAGIC_ADD_CONST; + } + } + } + } + + /// CHECK-START-ARM64: void Main.$compile$noinline$BrokenInduction(int[]) loop_optimization (after) + /// CHECK-IF: hasIsaFeature("sve") + // + /// CHECK-NOT: VecLoad + // + /// CHECK-FI: + public static void $compile$noinline$BrokenInduction(int[] x) { + for (int i = 0; i < USED_ARRAY_LENGTH; i++) { + int val = x[i]; + if (val != MAGIC_VALUE_C) { + x[i] += MAGIC_ADD_CONST; + i++; + } + } + } + + // + // Main driver. + // + + public static void main(String[] args) { + initIntArray(intArray); + int final_ind_value = $compile$noinline$FullDiamond(intArray); + expectIntEquals(23755, IntArraySum(intArray)); + expectIntEquals(USED_ARRAY_LENGTH, final_ind_value); + + // Types. 
+ initBooleanArray(booleanArray); + booleanArray2[12] = true; + $compile$noinline$SimpleBoolean(booleanArray, booleanArray2); + expectIntEquals(86, BooleanArraySum(booleanArray)); + + initByteArray(byteArray); + $compile$noinline$SimpleByte(byteArray); + expectIntEquals(-64, ByteArraySum(byteArray)); + + initByteArray(byteArray); + $compile$noinline$SimpleUByte(byteArray); + expectIntEquals(-64, ByteArraySum(byteArray)); + + initShortArray(shortArray); + $compile$noinline$SimpleShort(shortArray); + expectIntEquals(23121, ShortArraySum(shortArray)); + + initCharArray(charArray); + $compile$noinline$SimpleChar(charArray); + expectIntEquals(23121, CharArraySum(charArray)); + + initIntArray(intArray); + $compile$noinline$SimpleInt(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + + initLongArray(longArray); + $compile$noinline$SimpleLong(longArray); + expectLongEquals(23121, LongArraySum(longArray)); + + initFloatArray(floatArray); + $compile$noinline$SimpleFloat(floatArray); + expectFloatEquals(18868.2f, FloatArraySum(floatArray)); + + initDoubleArray(doubleArray); + $compile$noinline$SimpleDouble(doubleArray); + expectDoubleEquals(23129.5, DoubleArraySum(doubleArray)); + + // Narrowing types. + initByteArray(byteArray); + $compile$noinline$ByteConv(byteArray, byteArray); + expectIntEquals(-2, ByteArraySum(byteArray)); + + initByteArray(byteArray); + $compile$noinline$UByteAndWrongConst(byteArray); + expectIntEquals(-2, ByteArraySum(byteArray)); + + initByteArray(byteArray); + $compile$noinline$ByteNoHiBits(byteArray, byteArray); + expectIntEquals(-2, ByteArraySum(byteArray)); + + // Conditions. + initIntArray(intArray); + $compile$noinline$SimpleBelow(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + + // Idioms. 
+ initIntArray(intArray); + $compile$noinline$Select(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + + initIntArray(intArray); + $compile$noinline$Phi(intArray); + expectIntEquals(36748, IntArraySum(intArray)); + + int reduction_result = 0; + + initIntArray(intArray); + reduction_result = $compile$noinline$Reduction(intArray); + expectIntEquals(14706, IntArraySum(intArray)); + expectIntEquals(21012, reduction_result); + + initIntArray(intArray); + reduction_result = $compile$noinline$ReductionBackEdge(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + expectIntEquals(13121, reduction_result); + + int[] stencilArrayA = new int[STENCIL_ARRAY_SIZE]; + int[] stencilArrayB = new int[STENCIL_ARRAY_SIZE]; + initIntArray(stencilArrayA); + initIntArray(stencilArrayB); + $compile$noinline$stencilAlike(stencilArrayA, stencilArrayB); + expectIntEquals(43602, IntArraySum(stencilArrayA)); + + initIntArray(intArray); + $compile$noinline$NotDiamondCf(intArray); + expectIntEquals(23121, IntArraySum(intArray)); + + initIntArray(intArray); + $compile$noinline$BrokenInduction(intArray); + expectIntEquals(18963, IntArraySum(intArray)); + + System.out.println("passed"); + } + + public static void initBooleanArray(boolean[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 != 0) { + a[i] = true; + } + } + } + + public static void initByteArray(byte[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = (byte)MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = (byte)MAGIC_VALUE_B; + } else { + a[i] = (byte)MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 127; + } + + public static void initShortArray(short[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = (short)MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = (short)MAGIC_VALUE_B; + } else { + a[i] = (short)MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000; + } + + public static void initCharArray(char[] a) { + for (int i = 0; i < ARRAY_LENGTH; 
i++) { + if (i % 3 == 0) { + a[i] = (char)MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = (char)MAGIC_VALUE_B; + } else { + a[i] = (char)MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000; + } + + public static void initIntArray(int[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = MAGIC_VALUE_B; + } else { + a[i] = MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000; + } + + public static void initLongArray(long[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = MAGIC_VALUE_A; + } else if (i % 3 == 1) { + a[i] = MAGIC_VALUE_B; + } else { + a[i] = MAGIC_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000; + } + + public static void initFloatArray(float[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = MAGIC_FLOAT_VALUE_A; + } else if (i % 3 == 1) { + a[i] = MAGIC_FLOAT_VALUE_B; + } else { + a[i] = MAGIC_FLOAT_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000.0f; + } + + public static void initDoubleArray(double[] a) { + for (int i = 0; i < ARRAY_LENGTH; i++) { + if (i % 3 == 0) { + a[i] = MAGIC_FLOAT_VALUE_A; + } else if (i % 3 == 1) { + a[i] = MAGIC_FLOAT_VALUE_B; + } else { + a[i] = MAGIC_FLOAT_VALUE_C; + } + } + a[USED_ARRAY_LENGTH] = 10000.0f; + } + + public static byte BooleanArraySum(boolean[] a) { + byte sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i] ? 
1 : 0; + } + return sum; + } + + public static byte ByteArraySum(byte[] a) { + byte sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static short ShortArraySum(short[] a) { + short sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static char CharArraySum(char[] a) { + char sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static int IntArraySum(int[] a) { + int sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static long LongArraySum(long[] a) { + long sum = 0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static float FloatArraySum(float[] a) { + float sum = 0.0f; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + public static double DoubleArraySum(double[] a) { + double sum = 0.0; + for (int i = 0; i < a.length; i++) { + sum += a[i]; + } + return sum; + } + + private static void expectIntEquals(int expected, int result) { + if (expected != result) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } + + private static void expectLongEquals(long expected, long result) { + if (expected != result) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } + + private static void expectFloatEquals(float expected, float result) { + final float THRESHOLD = .1f; + if (Math.abs(expected - result) >= THRESHOLD) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } + + private static void expectDoubleEquals(double expected, double result) { + final double THRESHOLD = .1; + if (Math.abs(expected - result) >= THRESHOLD) { + throw new Error("Expected: " + expected + ", found: " + result); + } + } +} |