ART: Implement predicated SIMD vectorization.

This CL adds support for predicated execution to the auto-vectorizer
and implements an arm64 SVE vector backend.

This version passes all the VIXL simulator-runnable tests in SVE mode
with the checker off (all VecOp CHECKs still need to be adjusted for
the extra predicate input) and all tests in NEON mode.

Test: art SIMD tests on VIXL simulator.
Test: art tests on FVP (steps in test/README.arm_fvp.md)
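
For illustration, here is a minimal sketch of the predicated loop shape
this CL enables, written against the VIXL SVE MacroAssembler. This is
not code from the CL: the register assignment (x0/x1 = array bases,
x2 = index, x3 = length, z0/z1 = data, p0 = the governing predicate,
cf. LoopPReg() below) and the label are illustrative only.

  // for (i = 0; i < n; ++i) a[i] += b[i], with int32_t elements.
  vixl::aarch64::Label loop;
  __ Mov(x2, 0);
  __ Whilelo(p0.VnS(), x2, x3);  // Lane k of p0 is active while i + k < n.
  __ Bind(&loop);
  __ Ld1w(z0.VnS(), p0.Zeroing(), SVEMemOperand(x0, x2, LSL, 2));
  __ Ld1w(z1.VnS(), p0.Zeroing(), SVEMemOperand(x1, x2, LSL, 2));
  __ Add(z0.VnS(), p0.Merging(), z0.VnS(), z1.VnS());
  __ St1w(z0.VnS(), p0, SVEMemOperand(x0, x2, LSL, 2));
  __ Incw(x2);                   // i += number of 32-bit lanes per vector.
  __ Whilelo(p0.VnS(), x2, x3);
  __ B(mi, &loop);               // b.first: loop while any lane is active.

The last iteration simply runs under a partially-true predicate, so no
scalar cleanup loop is needed.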

Change-Id: Ib78bde31a15e6713d875d6668ad4458f5519605f
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index b945be2..f5d7836 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -994,7 +994,7 @@
 }
 
 bool CodeGeneratorARM64::ShouldUseSVE() const {
-  return kArm64AllowSVE && GetInstructionSetFeatures().HasSVE();
+  return GetInstructionSetFeatures().HasSVE();
 }
 
 #define __ GetVIXLAssembler()->
@@ -6908,7 +6908,7 @@
   }
 }
 
-MemOperand InstructionCodeGeneratorARM64::VecNeonAddress(
+MemOperand InstructionCodeGeneratorARM64::VecNEONAddress(
     HVecMemoryOperation* instruction,
     UseScratchRegisterScope* temps_scope,
     size_t size,
@@ -6941,6 +6941,33 @@
   }
 }
 
+SVEMemOperand InstructionCodeGeneratorARM64::VecSVEAddress(
+    HVecMemoryOperation* instruction,
+    UseScratchRegisterScope* temps_scope,
+    size_t size,
+    bool is_string_char_at,
+    /*out*/ Register* scratch) {
+  LocationSummary* locations = instruction->GetLocations();
+  Register base = InputRegisterAt(instruction, 0);
+  Location index = locations->InAt(1);
+
+  // TODO: Support intermediate address sharing for SVE accesses.
+  DCHECK(!instruction->InputAt(1)->IsIntermediateAddressIndex());
+  DCHECK(!instruction->InputAt(0)->IsIntermediateAddress());
+  DCHECK(!index.IsConstant());
+
+  uint32_t offset = is_string_char_at
+      ? mirror::String::ValueOffset().Uint32Value()
+      : mirror::Array::DataOffset(size).Uint32Value();
+  size_t shift = ComponentSizeShiftWidth(size);
+
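+  // Materialize (base + data offset) in a scratch register; the returned memory
+  // operand then scales the index register by the element size (LSL #shift).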
+  *scratch = temps_scope->AcquireSameSizeAs(base);
+  __ Add(*scratch, base, offset);
+  return SVEMemOperand(scratch->X(), XRegisterFrom(index), LSL, shift);
+}
+
 #undef __
 #undef QUICK_ENTRY_POINT
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index affc640..eb3e954 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -54,9 +54,6 @@
 static constexpr int kMaxMacroInstructionSizeInBytes = 15 * vixl::aarch64::kInstructionSize;
 static constexpr int kInvokeCodeMarginSizeInBytes = 6 * kMaxMacroInstructionSizeInBytes;
 
-// SVE is currently not enabled.
-static constexpr bool kArm64AllowSVE = false;
-
 static const vixl::aarch64::Register kParameterCoreRegisters[] = {
   vixl::aarch64::x1,
   vixl::aarch64::x2,
@@ -388,11 +385,19 @@
   void GenerateIntRemForPower2Denom(HRem *instruction);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
 
-  // Helper to set up locations for vector memory operations. Returns the memory operand and,
+  // Helpers to set up locations for vector memory operations. Each returns the memory operand and,
   // if used, sets the output parameter scratch to a temporary register used in this operand,
   // so that the client can release it right after the memory operand use.
-  // Neon version.
+  // NEON version.
-  vixl::aarch64::MemOperand VecNeonAddress(
+  vixl::aarch64::MemOperand VecNEONAddress(
+      HVecMemoryOperation* instruction,
+      // This function may acquire a scratch register.
+      vixl::aarch64::UseScratchRegisterScope* temps_scope,
+      size_t size,
+      bool is_string_char_at,
+      /*out*/ vixl::aarch64::Register* scratch);
+  // SVE version.
+  vixl::aarch64::SVEMemOperand VecSVEAddress(
       HVecMemoryOperation* instruction,
       // This function may acquire a scratch register.
       vixl::aarch64::UseScratchRegisterScope* temps_scope,
@@ -490,6 +495,15 @@
   void LoadSIMDRegFromStack(Location destination, Location source) override;
   void MoveSIMDRegToSIMDReg(Location destination, Location source) override;
   void MoveToSIMDStackSlot(Location destination, Location source) override;
+
+ private:
+  // Returns the default predicate register, which is used as the governing vector
+  // predicate to implement predicated loop execution.
+  //
+  // TODO: This is a hack to be addressed when register allocator supports SIMD types.
+  static vixl::aarch64::PRegister LoopPReg() {
+    return vixl::aarch64::p0;
+  }
 };
 
 class LocationsBuilderARM64Sve : public LocationsBuilderARM64 {
diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc
index 2a4c785..bd64166 100644
--- a/compiler/optimizing/code_generator_vector_arm64_neon.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc
@@ -25,8 +25,6 @@
 namespace art {
 namespace arm64 {
 
-using helpers::ARM64EncodableConstantOrRegister;
-using helpers::Arm64CanEncodeConstantAsImmediate;
 using helpers::DRegisterFrom;
 using helpers::HeapOperand;
 using helpers::InputRegisterAt;
@@ -40,6 +38,38 @@
 
 #define __ GetVIXLAssembler()->
 
+// Returns whether the value of the constant can be directly encoded into the instruction as
+// immediate.
+inline bool NEONCanEncodeConstantAsImmediate(HConstant* constant, HInstruction* instr) {
+  // TODO: Improve this when IsSIMDConstantEncodable method is implemented in VIXL.
+  if (instr->IsVecReplicateScalar()) {
+    if (constant->IsLongConstant()) {
+      return false;
+    } else if (constant->IsFloatConstant()) {
+      return vixl::aarch64::Assembler::IsImmFP32(constant->AsFloatConstant()->GetValue());
+    } else if (constant->IsDoubleConstant()) {
+      return vixl::aarch64::Assembler::IsImmFP64(constant->AsDoubleConstant()->GetValue());
+    }
+    int64_t value = CodeGenerator::GetInt64ValueOf(constant);
+    return IsUint<8>(value);
+  }
+  return false;
+}
+
+// Returns
+//  - constant location - if 'constant' is an actual constant and its value can be
+//    encoded into the instruction.
+//  - register location otherwise.
+inline Location NEONEncodableConstantOrRegister(HInstruction* constant,
+                                                HInstruction* instr) {
+  if (constant->IsConstant() &&
+      NEONCanEncodeConstantAsImmediate(constant->AsConstant(), instr)) {
+    return Location::ConstantLocation(constant->AsConstant());
+  }
+
+  return Location::RequiresRegister();
+}
+
 // Returns whether dot product instructions should be emitted.
 static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) {
   return codegen_->GetInstructionSetFeatures().HasDotProd();
@@ -56,13 +86,13 @@
     case DataType::Type::kInt16:
     case DataType::Type::kInt32:
     case DataType::Type::kInt64:
-      locations->SetInAt(0, ARM64EncodableConstantOrRegister(input, instruction));
+      locations->SetInAt(0, NEONEncodableConstantOrRegister(input, instruction));
       locations->SetOut(Location::RequiresFpuRegister());
       break;
     case DataType::Type::kFloat32:
     case DataType::Type::kFloat64:
       if (input->IsConstant() &&
-          Arm64CanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
+          NEONCanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
         locations->SetInAt(0, Location::ConstantLocation(input->AsConstant()));
         locations->SetOut(Location::RequiresFpuRegister());
       } else {
@@ -1418,7 +1448,7 @@
         temps.Release(length);  // no longer needed
         // Zero extend 8 compressed bytes into 8 chars.
         __ Ldr(DRegisterFrom(locations->Out()).V8B(),
-               VecNeonAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch));
+               VecNEONAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch));
         __ Uxtl(reg.V8H(), reg.V8B());
         __ B(&done);
         if (scratch.IsValid()) {
@@ -1427,7 +1457,7 @@
         // Load 8 direct uncompressed chars.
         __ Bind(&uncompressed_load);
         __ Ldr(reg,
-               VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch));
+               VecNEONAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch));
         __ Bind(&done);
         return;
       }
@@ -1442,7 +1472,7 @@
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
       __ Ldr(reg,
-             VecNeonAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch));
+             VecNEONAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch));
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1474,7 +1504,7 @@
       DCHECK_LE(2u, instruction->GetVectorLength());
       DCHECK_LE(instruction->GetVectorLength(), 16u);
       __ Str(reg,
-             VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+             VecNEONAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1483,13 +1513,14 @@
 }
 
 void LocationsBuilderARM64Neon::VisitVecPredSetAll(HVecPredSetAll* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  UNREACHABLE();
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+  DCHECK(instruction->InputAt(0)->IsIntConstant());
+  locations->SetInAt(0, Location::NoLocation());
+  locations->SetOut(Location::NoLocation());
 }
 
-void InstructionCodeGeneratorARM64Neon::VisitVecPredSetAll(HVecPredSetAll* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  UNREACHABLE();
+void InstructionCodeGeneratorARM64Neon::VisitVecPredSetAll(HVecPredSetAll*) {
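+  // No-op: NEON has no predicate registers; operations implicitly act on all lanes.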
 }
 
 void LocationsBuilderARM64Neon::VisitVecPredWhile(HVecPredWhile* instruction) {
diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc
index 1761dfc..2254673 100644
--- a/compiler/optimizing/code_generator_vector_arm64_sve.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc
@@ -25,8 +25,6 @@
 namespace art {
 namespace arm64 {
 
-using helpers::ARM64EncodableConstantOrRegister;
-using helpers::Arm64CanEncodeConstantAsImmediate;
 using helpers::DRegisterFrom;
 using helpers::HeapOperand;
 using helpers::InputRegisterAt;
@@ -36,13 +34,41 @@
 using helpers::QRegisterFrom;
 using helpers::StackOperandFrom;
 using helpers::VRegisterFrom;
 using helpers::XRegisterFrom;
+using helpers::ZRegisterFrom;
 
 #define __ GetVIXLAssembler()->
 
-// Returns whether dot product instructions should be emitted.
-static bool ShouldEmitDotProductInstructions(const CodeGeneratorARM64* codegen_) {
-  return codegen_->GetInstructionSetFeatures().HasDotProd();
+// Returns whether the value of the constant can be directly encoded into the instruction as
+// immediate.
+static bool SVECanEncodeConstantAsImmediate(HConstant* constant, HInstruction* instr) {
+  if (instr->IsVecReplicateScalar()) {
+    if (constant->IsLongConstant()) {
+      return false;
+    } else if (constant->IsFloatConstant()) {
+      return vixl::aarch64::Assembler::IsImmFP32(constant->AsFloatConstant()->GetValue());
+    } else if (constant->IsDoubleConstant()) {
+      return vixl::aarch64::Assembler::IsImmFP64(constant->AsDoubleConstant()->GetValue());
+    }
+    // TODO: Make use of shift part of DUP instruction.
+    int64_t value = CodeGenerator::GetInt64ValueOf(constant);
+    return IsInt<8>(value);
+  }
+
+  return false;
+}
+
+// Returns
+//  - constant location - if 'constant' is an actual constant and its value can be
+//    encoded into the instruction.
+//  - register location otherwise.
+inline Location SVEEncodableConstantOrRegister(HInstruction* constant, HInstruction* instr) {
+  if (constant->IsConstant() &&
+      SVECanEncodeConstantAsImmediate(constant->AsConstant(), instr)) {
+    return Location::ConstantLocation(constant->AsConstant());
+  }
+
+  return Location::RequiresRegister();
 }
 
 void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
@@ -56,13 +82,13 @@
     case DataType::Type::kInt16:
     case DataType::Type::kInt32:
     case DataType::Type::kInt64:
-      locations->SetInAt(0, ARM64EncodableConstantOrRegister(input, instruction));
+      locations->SetInAt(0, SVEEncodableConstantOrRegister(input, instruction));
       locations->SetOut(Location::RequiresFpuRegister());
       break;
     case DataType::Type::kFloat32:
     case DataType::Type::kFloat64:
       if (input->IsConstant() &&
-          Arm64CanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
+          SVECanEncodeConstantAsImmediate(input->AsConstant(), instruction)) {
         locations->SetInAt(0, Location::ConstantLocation(input->AsConstant()));
         locations->SetOut(Location::RequiresFpuRegister());
       } else {
@@ -77,59 +103,60 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
   Location src_loc = locations->InAt(0);
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister dst = ZRegisterFrom(locations->Out());
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
-        __ Movi(dst.V16B(), Int64FromLocation(src_loc));
+        __ Dup(dst.VnB(), Int64FromLocation(src_loc));
       } else {
-        __ Dup(dst.V16B(), InputRegisterAt(instruction, 0));
+        __ Dup(dst.VnB(), InputRegisterAt(instruction, 0));
       }
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
-        __ Movi(dst.V8H(), Int64FromLocation(src_loc));
+        __ Dup(dst.VnH(), Int64FromLocation(src_loc));
       } else {
-        __ Dup(dst.V8H(), InputRegisterAt(instruction, 0));
+        __ Dup(dst.VnH(), InputRegisterAt(instruction, 0));
       }
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
-        __ Movi(dst.V4S(), Int64FromLocation(src_loc));
+        __ Dup(dst.VnS(), Int64FromLocation(src_loc));
       } else {
-        __ Dup(dst.V4S(), InputRegisterAt(instruction, 0));
+        __ Dup(dst.VnS(), InputRegisterAt(instruction, 0));
       }
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
-        __ Movi(dst.V2D(), Int64FromLocation(src_loc));
+        __ Dup(dst.VnD(), Int64FromLocation(src_loc));
       } else {
-        __ Dup(dst.V2D(), XRegisterFrom(src_loc));
+        __ Dup(dst.VnD(), XRegisterFrom(src_loc));
       }
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
-        __ Fmov(dst.V4S(), src_loc.GetConstant()->AsFloatConstant()->GetValue());
+        __ Fdup(dst.VnS(), src_loc.GetConstant()->AsFloatConstant()->GetValue());
       } else {
-        __ Dup(dst.V4S(), VRegisterFrom(src_loc).V4S(), 0);
+        __ Dup(dst.VnS(), ZRegisterFrom(src_loc).VnS(), 0);
       }
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
-        __ Fmov(dst.V2D(), src_loc.GetConstant()->AsDoubleConstant()->GetValue());
+        __ Fdup(dst.VnD(), src_loc.GetConstant()->AsDoubleConstant()->GetValue());
       } else {
-        __ Dup(dst.V2D(), VRegisterFrom(src_loc).V2D(), 0);
+        __ Dup(dst.VnD(), ZRegisterFrom(src_loc).VnD(), 0);
       }
       break;
     default:
@@ -163,8 +190,9 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecExtractScalar(HVecExtractScalar* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister src = VRegisterFrom(locations->InAt(0));
+  const VRegister src = VRegisterFrom(locations->InAt(0));
   switch (instruction->GetPackedType()) {
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
@@ -218,32 +246,31 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister src = VRegisterFrom(locations->InAt(0));
-  VRegister dst = DRegisterFrom(locations->Out());
+  const ZRegister src = ZRegisterFrom(locations->InAt(0));
+  const VRegister dst = DRegisterFrom(locations->Out());
+  const PRegister p_reg = LoopPReg();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       switch (instruction->GetReductionKind()) {
         case HVecReduce::kSum:
-          __ Addv(dst.S(), src.V4S());
+          __ Saddv(dst.S(), p_reg, src.VnS());
           break;
-        case HVecReduce::kMin:
-          __ Sminv(dst.S(), src.V4S());
-          break;
-        case HVecReduce::kMax:
-          __ Smaxv(dst.S(), src.V4S());
-          break;
+        default:
+          LOG(FATAL) << "Unsupported SIMD instruction";
+          UNREACHABLE();
       }
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
       switch (instruction->GetReductionKind()) {
         case HVecReduce::kSum:
-          __ Addp(dst.D(), src.V2D());
+          __ Uaddv(dst.D(), p_reg, src.VnD());
           break;
         default:
-          LOG(FATAL) << "Unsupported SIMD min/max";
+          LOG(FATAL) << "Unsupported SIMD instruction";
           UNREACHABLE();
       }
       break;
@@ -258,14 +285,16 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister src = VRegisterFrom(locations->InAt(0));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister src = ZRegisterFrom(locations->InAt(0));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   DataType::Type from = instruction->GetInputType();
   DataType::Type to = instruction->GetResultType();
   if (from == DataType::Type::kInt32 && to == DataType::Type::kFloat32) {
     DCHECK_EQ(4u, instruction->GetVectorLength());
-    __ Scvtf(dst.V4S(), src.V4S());
+    __ Scvtf(dst.VnS(), p_reg, src.VnS());
   } else {
     LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
   }
@@ -276,35 +305,37 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister src = VRegisterFrom(locations->InAt(0));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister src = ZRegisterFrom(locations->InAt(0));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Neg(dst.V16B(), src.V16B());
+      __ Neg(dst.VnB(), p_reg, src.VnB());
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Neg(dst.V8H(), src.V8H());
+      __ Neg(dst.VnH(), p_reg, src.VnH());
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Neg(dst.V4S(), src.V4S());
+      __ Neg(dst.VnS(), p_reg, src.VnS());
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Neg(dst.V2D(), src.V2D());
+      __ Neg(dst.VnD(), p_reg, src.VnD());
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Fneg(dst.V4S(), src.V4S());
+      __ Fneg(dst.VnS(), p_reg, src.VnS());
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fneg(dst.V2D(), src.V2D());
+      __ Fneg(dst.VnD(), p_reg, src.VnD());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -317,33 +348,35 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister src = VRegisterFrom(locations->InAt(0));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister src = ZRegisterFrom(locations->InAt(0));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Abs(dst.V16B(), src.V16B());
+      __ Abs(dst.VnB(), p_reg, src.VnB());
       break;
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Abs(dst.V8H(), src.V8H());
+      __ Abs(dst.VnH(), p_reg, src.VnH());
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Abs(dst.V4S(), src.V4S());
+      __ Abs(dst.VnS(), p_reg, src.VnS());
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Abs(dst.V2D(), src.V2D());
+      __ Abs(dst.VnD(), p_reg, src.VnD());
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Fabs(dst.V4S(), src.V4S());
+      __ Fabs(dst.VnS(), p_reg, src.VnS());
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fabs(dst.V2D(), src.V2D());
+      __ Fabs(dst.VnD(), p_reg, src.VnD());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -356,22 +389,30 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister src = VRegisterFrom(locations->InAt(0));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister src = ZRegisterFrom(locations->InAt(0));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:  // special case boolean-not
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Movi(dst.V16B(), 1);
-      __ Eor(dst.V16B(), dst.V16B(), src.V16B());
+      __ Dup(dst.VnB(), 1);
+      __ Eor(dst.VnB(), p_reg, dst.VnB(), src.VnB());
       break;
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
+      __ Not(dst.VnB(), p_reg, src.VnB());
+      break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
+      __ Not(dst.VnH(), p_reg, src.VnH());
+      break;
     case DataType::Type::kInt32:
+      __ Not(dst.VnS(), p_reg, src.VnS());
+      break;
     case DataType::Type::kInt64:
-      __ Not(dst.V16B(), src.V16B());  // lanes do not matter
+      __ Not(dst.VnD(), p_reg, src.VnD());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -394,7 +435,7 @@
     case DataType::Type::kFloat64:
       locations->SetInAt(0, Location::RequiresFpuRegister());
       locations->SetInAt(1, Location::RequiresFpuRegister());
-      locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+      locations->SetOut(Location::SameAsFirstInput());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -407,36 +448,39 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Add(dst.V16B(), lhs.V16B(), rhs.V16B());
+      __ Add(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Add(dst.V8H(), lhs.V8H(), rhs.V8H());
+      __ Add(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Add(dst.V4S(), lhs.V4S(), rhs.V4S());
+      __ Add(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Add(dst.V2D(), lhs.V2D(), rhs.V2D());
+      __ Add(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Fadd(dst.V4S(), lhs.V4S(), rhs.V4S());
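+      // StrictNaNPropagation: keep IEEE NaN propagation even if VIXL commutes the operands.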
+      __ Fadd(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fadd(dst.V2D(), lhs.V2D(), rhs.V2D());
+      __ Fadd(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -445,75 +488,23 @@
 }
 
 void LocationsBuilderARM64Sve::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecSaturationAdd(HVecSaturationAdd* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Uqadd(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Sqadd(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kUint16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Uqadd(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Sqadd(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
-  }
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void LocationsBuilderARM64Sve::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecHalvingAdd(HVecHalvingAdd* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      instruction->IsRounded()
-          ? __ Urhadd(dst.V16B(), lhs.V16B(), rhs.V16B())
-          : __ Uhadd(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      instruction->IsRounded()
-          ? __ Srhadd(dst.V16B(), lhs.V16B(), rhs.V16B())
-          : __ Shadd(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kUint16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      instruction->IsRounded()
-          ? __ Urhadd(dst.V8H(), lhs.V8H(), rhs.V8H())
-          : __ Uhadd(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      instruction->IsRounded()
-          ? __ Srhadd(dst.V8H(), lhs.V8H(), rhs.V8H())
-          : __ Shadd(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
-  }
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void LocationsBuilderARM64Sve::VisitVecSub(HVecSub* instruction) {
@@ -521,36 +512,38 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Sub(dst.V16B(), lhs.V16B(), rhs.V16B());
+      __ Sub(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Sub(dst.V8H(), lhs.V8H(), rhs.V8H());
+      __ Sub(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Sub(dst.V4S(), lhs.V4S(), rhs.V4S());
+      __ Sub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Sub(dst.V2D(), lhs.V2D(), rhs.V2D());
+      __ Sub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Fsub(dst.V4S(), lhs.V4S(), rhs.V4S());
+      __ Fsub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fsub(dst.V2D(), lhs.V2D(), rhs.V2D());
+      __ Fsub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -559,35 +552,13 @@
 }
 
 void LocationsBuilderARM64Sve::VisitVecSaturationSub(HVecSaturationSub* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecSaturationSub(HVecSaturationSub* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Uqsub(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Sqsub(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kUint16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Uqsub(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Sqsub(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
-  }
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void LocationsBuilderARM64Sve::VisitVecMul(HVecMul* instruction) {
@@ -595,32 +566,38 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Mul(dst.V16B(), lhs.V16B(), rhs.V16B());
+      __ Mul(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Mul(dst.V8H(), lhs.V8H(), rhs.V8H());
+      __ Mul(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Mul(dst.V4S(), lhs.V4S(), rhs.V4S());
+      __ Mul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
+      break;
+    case DataType::Type::kInt64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Mul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Fmul(dst.V4S(), lhs.V4S(), rhs.V4S());
+      __ Fmul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation);
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fmul(dst.V2D(), lhs.V2D(), rhs.V2D());
+      __ Fmul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -633,18 +610,22 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
+
+  // Note: VIXL guarantees StrictNaNPropagation for Fdiv.
   switch (instruction->GetPackedType()) {
     case DataType::Type::kFloat32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Fdiv(dst.V4S(), lhs.V4S(), rhs.V4S());
+      __ Fdiv(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kFloat64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fdiv(dst.V2D(), lhs.V2D(), rhs.V2D());
+      __ Fdiv(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -653,99 +634,23 @@
 }
 
 void LocationsBuilderARM64Sve::VisitVecMin(HVecMin* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecMin(HVecMin* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Umin(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Smin(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kUint16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Umin(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Smin(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    case DataType::Type::kUint32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Umin(dst.V4S(), lhs.V4S(), rhs.V4S());
-      break;
-    case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Smin(dst.V4S(), lhs.V4S(), rhs.V4S());
-      break;
-    case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Fmin(dst.V4S(), lhs.V4S(), rhs.V4S());
-      break;
-    case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fmin(dst.V2D(), lhs.V2D(), rhs.V2D());
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
-  }
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void LocationsBuilderARM64Sve::VisitVecMax(HVecMax* instruction) {
-  CreateVecBinOpLocations(GetGraph()->GetAllocator(), instruction);
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecMax(HVecMax* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
-  switch (instruction->GetPackedType()) {
-    case DataType::Type::kUint8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Umax(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Smax(dst.V16B(), lhs.V16B(), rhs.V16B());
-      break;
-    case DataType::Type::kUint16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Umax(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Smax(dst.V8H(), lhs.V8H(), rhs.V8H());
-      break;
-    case DataType::Type::kUint32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Umax(dst.V4S(), lhs.V4S(), rhs.V4S());
-      break;
-    case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Smax(dst.V4S(), lhs.V4S(), rhs.V4S());
-      break;
-    case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Fmax(dst.V4S(), lhs.V4S(), rhs.V4S());
-      break;
-    case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Fmax(dst.V2D(), lhs.V2D(), rhs.V2D());
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-      UNREACHABLE();
-  }
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void LocationsBuilderARM64Sve::VisitVecAnd(HVecAnd* instruction) {
@@ -754,21 +659,29 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
+      __ And(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
+      break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
+      __ And(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
+      break;
     case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
     case DataType::Type::kFloat32:
+      __ And(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
+      break;
+    case DataType::Type::kInt64:
     case DataType::Type::kFloat64:
-      __ And(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
+      __ And(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -790,21 +703,29 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
+      __ Orr(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
+      break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
+      __ Orr(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
+      break;
     case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
     case DataType::Type::kFloat32:
+      __ Orr(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
+      break;
+    case DataType::Type::kInt64:
     case DataType::Type::kFloat64:
-      __ Orr(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
+      __ Orr(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -817,21 +738,29 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister rhs = VRegisterFrom(locations->InAt(1));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
+      __ Eor(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
+      break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
+      __ Eor(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
+      break;
     case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
     case DataType::Type::kFloat32:
+      __ Eor(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
+      break;
+    case DataType::Type::kInt64:
     case DataType::Type::kFloat64:
-      __ Eor(dst.V16B(), lhs.V16B(), rhs.V16B());  // lanes do not matter
+      __ Eor(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -864,28 +793,30 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Shl(dst.V16B(), lhs.V16B(), value);
+      __ Lsl(dst.VnB(), p_reg, lhs.VnB(), value);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Shl(dst.V8H(), lhs.V8H(), value);
+      __ Lsl(dst.VnH(), p_reg, lhs.VnH(), value);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Shl(dst.V4S(), lhs.V4S(), value);
+      __ Lsl(dst.VnS(), p_reg, lhs.VnS(), value);
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Shl(dst.V2D(), lhs.V2D(), value);
+      __ Lsl(dst.VnD(), p_reg, lhs.VnD(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -898,28 +829,30 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Sshr(dst.V16B(), lhs.V16B(), value);
+      __ Asr(dst.VnB(), p_reg, lhs.VnB(), value);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Sshr(dst.V8H(), lhs.V8H(), value);
+      __ Asr(dst.VnH(), p_reg, lhs.VnH(), value);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Sshr(dst.V4S(), lhs.V4S(), value);
+      __ Asr(dst.VnS(), p_reg, lhs.VnS(), value);
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Sshr(dst.V2D(), lhs.V2D(), value);
+      __ Asr(dst.VnD(), p_reg, lhs.VnD(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -932,28 +865,30 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister lhs = VRegisterFrom(locations->InAt(0));
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister lhs = ZRegisterFrom(locations->InAt(0));
+  const ZRegister dst = ZRegisterFrom(locations->Out());
+  const PRegisterM p_reg = LoopPReg().Merging();
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
-      __ Ushr(dst.V16B(), lhs.V16B(), value);
+      __ Lsr(dst.VnB(), p_reg, lhs.VnB(), value);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      __ Ushr(dst.V8H(), lhs.V8H(), value);
+      __ Lsr(dst.VnH(), p_reg, lhs.VnH(), value);
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
-      __ Ushr(dst.V4S(), lhs.V4S(), value);
+      __ Lsr(dst.VnS(), p_reg, lhs.VnS(), value);
       break;
     case DataType::Type::kInt64:
       DCHECK_EQ(2u, instruction->GetVectorLength());
-      __ Ushr(dst.V2D(), lhs.V2D(), value);
+      __ Lsr(dst.VnD(), p_reg, lhs.VnD(), value);
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -964,7 +899,7 @@
 void LocationsBuilderARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) {
   LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
 
-  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+  DCHECK_EQ(2u, instruction->InputCount());  // only one input currently implemented + predicate.
 
   HInstruction* input = instruction->InputAt(0);
   bool is_zero = IsZeroBitPattern(input);
@@ -994,14 +929,16 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecSetScalars(HVecSetScalars* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister dst = VRegisterFrom(locations->Out());
+  const ZRegister z_dst = ZRegisterFrom(locations->Out());
 
-  DCHECK_EQ(1u, instruction->InputCount());  // only one input currently implemented
+  DCHECK_EQ(2u, instruction->InputCount());  // only one input currently implemented + predicate.
 
   // Zero out all other elements first.
-  __ Movi(dst.V16B(), 0);
+  __ Dup(z_dst.VnB(), 0);
 
+  const VRegister dst = VRegisterFrom(locations->Out());
   // Shorthand for any type of zero.
   if (IsZeroBitPattern(instruction->InputAt(0))) {
     return;
@@ -1062,11 +999,14 @@
 // Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
 // 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
 // However vector MultiplyAccumulate instruction is not affected.
-void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate(
+    HVecMultiplyAccumulate* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
-  VRegister acc = VRegisterFrom(locations->InAt(0));
-  VRegister left = VRegisterFrom(locations->InAt(1));
-  VRegister right = VRegisterFrom(locations->InAt(2));
+  const ZRegister acc = ZRegisterFrom(locations->InAt(0));
+  const ZRegister left = ZRegisterFrom(locations->InAt(1));
+  const ZRegister right = ZRegisterFrom(locations->InAt(2));
+  const PRegisterM p_reg = LoopPReg().Merging();
 
   DCHECK(locations->InAt(0).Equals(locations->Out()));
 
@@ -1075,26 +1015,26 @@
     case DataType::Type::kInt8:
       DCHECK_EQ(16u, instruction->GetVectorLength());
       if (instruction->GetOpKind() == HInstruction::kAdd) {
-        __ Mla(acc.V16B(), left.V16B(), right.V16B());
+        __ Mla(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB());
       } else {
-        __ Mls(acc.V16B(), left.V16B(), right.V16B());
+        __ Mls(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB());
       }
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
       if (instruction->GetOpKind() == HInstruction::kAdd) {
-        __ Mla(acc.V8H(), left.V8H(), right.V8H());
+        __ Mla(acc.VnH(), p_reg, acc.VnH(), left.VnH(), right.VnH());
       } else {
-        __ Mls(acc.V8H(), left.V8H(), right.V8H());
+        __ Mls(acc.VnH(), p_reg, acc.VnH(), left.VnH(), right.VnH());
       }
       break;
     case DataType::Type::kInt32:
       DCHECK_EQ(4u, instruction->GetVectorLength());
       if (instruction->GetOpKind() == HInstruction::kAdd) {
-        __ Mla(acc.V4S(), left.V4S(), right.V4S());
+        __ Mla(acc.VnS(), p_reg, acc.VnS(), left.VnS(), right.VnS());
       } else {
-        __ Mls(acc.V4S(), left.V4S(), right.V4S());
+        __ Mls(acc.VnS(), p_reg, acc.VnS(), left.VnS(), right.VnS());
       }
       break;
     default:
@@ -1104,185 +1044,13 @@
 }
 
 void LocationsBuilderARM64Sve::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
-  CreateVecAccumLocations(GetGraph()->GetAllocator(), instruction);
-  // Some conversions require temporary registers.
-  LocationSummary* locations = instruction->GetLocations();
-  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
-  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
-  DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
-            HVecOperation::ToSignedType(b->GetPackedType()));
-  switch (a->GetPackedType()) {
-    case DataType::Type::kUint8:
-    case DataType::Type::kInt8:
-      switch (instruction->GetPackedType()) {
-        case DataType::Type::kInt64:
-          locations->AddTemp(Location::RequiresFpuRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
-          FALLTHROUGH_INTENDED;
-        case DataType::Type::kInt32:
-          locations->AddTemp(Location::RequiresFpuRegister());
-          locations->AddTemp(Location::RequiresFpuRegister());
-          break;
-        default:
-          break;
-      }
-      break;
-    case DataType::Type::kUint16:
-    case DataType::Type::kInt16:
-      if (instruction->GetPackedType() == DataType::Type::kInt64) {
-        locations->AddTemp(Location::RequiresFpuRegister());
-        locations->AddTemp(Location::RequiresFpuRegister());
-      }
-      break;
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      if (instruction->GetPackedType() == a->GetPackedType()) {
-        locations->AddTemp(Location::RequiresFpuRegister());
-      }
-      break;
-    default:
-      break;
-  }
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
-  LocationSummary* locations = instruction->GetLocations();
-  VRegister acc = VRegisterFrom(locations->InAt(0));
-  VRegister left = VRegisterFrom(locations->InAt(1));
-  VRegister right = VRegisterFrom(locations->InAt(2));
-
-  DCHECK(locations->InAt(0).Equals(locations->Out()));
-
-  // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S).
-  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
-  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
-  DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
-            HVecOperation::ToSignedType(b->GetPackedType()));
-  switch (a->GetPackedType()) {
-    case DataType::Type::kUint8:
-    case DataType::Type::kInt8:
-      DCHECK_EQ(16u, a->GetVectorLength());
-      switch (instruction->GetPackedType()) {
-        case DataType::Type::kInt16:
-          DCHECK_EQ(8u, instruction->GetVectorLength());
-          __ Sabal(acc.V8H(), left.V8B(), right.V8B());
-          __ Sabal2(acc.V8H(), left.V16B(), right.V16B());
-          break;
-        case DataType::Type::kInt32: {
-          DCHECK_EQ(4u, instruction->GetVectorLength());
-          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
-          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
-          __ Sxtl(tmp1.V8H(), left.V8B());
-          __ Sxtl(tmp2.V8H(), right.V8B());
-          __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
-          __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
-          __ Sxtl2(tmp1.V8H(), left.V16B());
-          __ Sxtl2(tmp2.V8H(), right.V16B());
-          __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
-          __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
-          break;
-        }
-        case DataType::Type::kInt64: {
-          DCHECK_EQ(2u, instruction->GetVectorLength());
-          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
-          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
-          VRegister tmp3 = VRegisterFrom(locations->GetTemp(2));
-          VRegister tmp4 = VRegisterFrom(locations->GetTemp(3));
-          __ Sxtl(tmp1.V8H(), left.V8B());
-          __ Sxtl(tmp2.V8H(), right.V8B());
-          __ Sxtl(tmp3.V4S(), tmp1.V4H());
-          __ Sxtl(tmp4.V4S(), tmp2.V4H());
-          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
-          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
-          __ Sxtl2(tmp3.V4S(), tmp1.V8H());
-          __ Sxtl2(tmp4.V4S(), tmp2.V8H());
-          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
-          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
-          __ Sxtl2(tmp1.V8H(), left.V16B());
-          __ Sxtl2(tmp2.V8H(), right.V16B());
-          __ Sxtl(tmp3.V4S(), tmp1.V4H());
-          __ Sxtl(tmp4.V4S(), tmp2.V4H());
-          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
-          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
-          __ Sxtl2(tmp3.V4S(), tmp1.V8H());
-          __ Sxtl2(tmp4.V4S(), tmp2.V8H());
-          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
-          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
-          break;
-        }
-        default:
-          LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-          UNREACHABLE();
-      }
-      break;
-    case DataType::Type::kUint16:
-    case DataType::Type::kInt16:
-      DCHECK_EQ(8u, a->GetVectorLength());
-      switch (instruction->GetPackedType()) {
-        case DataType::Type::kInt32:
-          DCHECK_EQ(4u, instruction->GetVectorLength());
-          __ Sabal(acc.V4S(), left.V4H(), right.V4H());
-          __ Sabal2(acc.V4S(), left.V8H(), right.V8H());
-          break;
-        case DataType::Type::kInt64: {
-          DCHECK_EQ(2u, instruction->GetVectorLength());
-          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
-          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
-          __ Sxtl(tmp1.V4S(), left.V4H());
-          __ Sxtl(tmp2.V4S(), right.V4H());
-          __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
-          __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
-          __ Sxtl2(tmp1.V4S(), left.V8H());
-          __ Sxtl2(tmp2.V4S(), right.V8H());
-          __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
-          __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
-          break;
-        }
-        default:
-          LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-          UNREACHABLE();
-      }
-      break;
-    case DataType::Type::kInt32:
-      DCHECK_EQ(4u, a->GetVectorLength());
-      switch (instruction->GetPackedType()) {
-        case DataType::Type::kInt32: {
-          DCHECK_EQ(4u, instruction->GetVectorLength());
-          VRegister tmp = VRegisterFrom(locations->GetTemp(0));
-          __ Sub(tmp.V4S(), left.V4S(), right.V4S());
-          __ Abs(tmp.V4S(), tmp.V4S());
-          __ Add(acc.V4S(), acc.V4S(), tmp.V4S());
-          break;
-        }
-        case DataType::Type::kInt64:
-          DCHECK_EQ(2u, instruction->GetVectorLength());
-          __ Sabal(acc.V2D(), left.V2S(), right.V2S());
-          __ Sabal2(acc.V2D(), left.V4S(), right.V4S());
-          break;
-        default:
-          LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-          UNREACHABLE();
-      }
-      break;
-    case DataType::Type::kInt64:
-      DCHECK_EQ(2u, a->GetVectorLength());
-      switch (instruction->GetPackedType()) {
-        case DataType::Type::kInt64: {
-          DCHECK_EQ(2u, instruction->GetVectorLength());
-          VRegister tmp = VRegisterFrom(locations->GetTemp(0));
-          __ Sub(tmp.V2D(), left.V2D(), right.V2D());
-          __ Abs(tmp.V2D(), tmp.V2D());
-          __ Add(acc.V2D(), acc.V2D(), tmp.V2D());
-          break;
-        }
-        default:
-          LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-          UNREACHABLE();
-      }
-      break;
-    default:
-      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
-  }
+  LOG(FATAL) << "Unsupported SIMD instruction " << instruction->GetId();
+  UNREACHABLE();
 }
 
 void LocationsBuilderARM64Sve::VisitVecDotProd(HVecDotProd* instruction) {
@@ -1293,19 +1061,17 @@
   locations->SetInAt(2, Location::RequiresFpuRegister());
   locations->SetOut(Location::SameAsFirstInput());
 
-  // For Int8 and Uint8 general case we need a temp register.
-  if ((DataType::Size(instruction->InputAt(1)->AsVecOperation()->GetPackedType()) == 1) &&
-      !ShouldEmitDotProductInstructions(codegen_)) {
-    locations->AddTemp(Location::RequiresFpuRegister());
-  }
+  locations->AddTemp(Location::RequiresFpuRegister());
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
   DCHECK(locations->InAt(0).Equals(locations->Out()));
-  VRegister acc = VRegisterFrom(locations->InAt(0));
-  VRegister left = VRegisterFrom(locations->InAt(1));
-  VRegister right = VRegisterFrom(locations->InAt(2));
+  const ZRegister acc = ZRegisterFrom(locations->InAt(0));
+  const ZRegister left = ZRegisterFrom(locations->InAt(1));
+  const ZRegister right = ZRegisterFrom(locations->InAt(2));
+  const PRegisterM p_reg = LoopPReg().Merging();
   HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
   HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
   DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
@@ -1317,45 +1083,20 @@
   switch (inputs_data_size) {
     case 1u: {
       DCHECK_EQ(16u, a->GetVectorLength());
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+      const ZRegister tmp0 = temps.AcquireZ();
+      const ZRegister tmp1 = ZRegisterFrom(locations->GetTemp(0));
+
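+      // Udot/Sdot is an unpredicated operation, so use Sel to zero out the
+      // inactive lanes of both inputs first; those lanes then contribute
+      // nothing to the accumulator.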
+      __ Dup(tmp1.VnB(), 0u);
+      __ Sel(tmp0.VnB(), p_reg, left.VnB(), tmp1.VnB());
+      __ Sel(tmp1.VnB(), p_reg, right.VnB(), tmp1.VnB());
       if (instruction->IsZeroExtending()) {
-        if (ShouldEmitDotProductInstructions(codegen_)) {
-          __ Udot(acc.V4S(), left.V16B(), right.V16B());
-        } else {
-          VRegister tmp = VRegisterFrom(locations->GetTemp(0));
-          __ Umull(tmp.V8H(), left.V8B(), right.V8B());
-          __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());
-          __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());
-
-          __ Umull2(tmp.V8H(), left.V16B(), right.V16B());
-          __ Uaddw(acc.V4S(), acc.V4S(), tmp.V4H());
-          __ Uaddw2(acc.V4S(), acc.V4S(), tmp.V8H());
-        }
+        __ Udot(acc.VnS(), acc.VnS(), tmp0.VnB(), tmp1.VnB());
       } else {
-        if (ShouldEmitDotProductInstructions(codegen_)) {
-          __ Sdot(acc.V4S(), left.V16B(), right.V16B());
-        } else {
-          VRegister tmp = VRegisterFrom(locations->GetTemp(0));
-          __ Smull(tmp.V8H(), left.V8B(), right.V8B());
-          __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());
-          __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());
-
-          __ Smull2(tmp.V8H(), left.V16B(), right.V16B());
-          __ Saddw(acc.V4S(), acc.V4S(), tmp.V4H());
-          __ Saddw2(acc.V4S(), acc.V4S(), tmp.V8H());
-        }
+        __ Sdot(acc.VnS(), acc.VnS(), tmp0.VnB(), tmp1.VnB());
       }
       break;
     }
-    case 2u:
-      DCHECK_EQ(8u, a->GetVectorLength());
-      if (instruction->IsZeroExtending()) {
-        __ Umlal(acc.V4S(), left.V4H(), right.V4H());
-        __ Umlal2(acc.V4S(), left.V8H(), right.V8H());
-      } else {
-        __ Smlal(acc.V4S(), left.V4H(), right.V4H());
-        __ Smlal2(acc.V4S(), left.V8H(), right.V8H());
-      }
-      break;
     default:
       LOG(FATAL) << "Unsupported SIMD type size: " << inputs_data_size;
   }
@@ -1395,54 +1136,39 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
   size_t size = DataType::Size(instruction->GetPackedType());
-  VRegister reg = VRegisterFrom(locations->Out());
+  const ZRegister reg = ZRegisterFrom(locations->Out());
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register scratch;
+  const PRegisterZ p_reg = LoopPReg().Zeroing();
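+  // SVE Ld1* instructions are governed by the loop predicate: inactive lanes
+  // of the destination register are zeroed.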
 
   switch (instruction->GetPackedType()) {
     case DataType::Type::kInt16:  // (short) s.charAt(.) can yield HVecLoad/Int16/StringCharAt.
     case DataType::Type::kUint16:
       DCHECK_EQ(8u, instruction->GetVectorLength());
-      // Special handling of compressed/uncompressed string load.
-      if (mirror::kUseStringCompression && instruction->IsStringCharAt()) {
-        vixl::aarch64::Label uncompressed_load, done;
-        // Test compression bit.
-        static_assert(static_cast<uint32_t>(mirror::StringCompressionFlag::kCompressed) == 0u,
-                      "Expecting 0=compressed, 1=uncompressed");
-        uint32_t count_offset = mirror::String::CountOffset().Uint32Value();
-        Register length = temps.AcquireW();
-        __ Ldr(length, HeapOperand(InputRegisterAt(instruction, 0), count_offset));
-        __ Tbnz(length.W(), 0, &uncompressed_load);
-        temps.Release(length);  // no longer needed
-        // Zero extend 8 compressed bytes into 8 chars.
-        __ Ldr(DRegisterFrom(locations->Out()).V8B(),
-               VecNeonAddress(instruction, &temps, 1, /*is_string_char_at*/ true, &scratch));
-        __ Uxtl(reg.V8H(), reg.V8B());
-        __ B(&done);
-        if (scratch.IsValid()) {
-          temps.Release(scratch);  // if used, no longer needed
-        }
-        // Load 8 direct uncompressed chars.
-        __ Bind(&uncompressed_load);
-        __ Ldr(reg,
-               VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ true, &scratch));
-        __ Bind(&done);
-        return;
-      }
-      FALLTHROUGH_INTENDED;
+      __ Ld1h(reg.VnH(), p_reg,
+              VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+      break;
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Ld1b(reg.VnB(), p_reg,
+              VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+      break;
     case DataType::Type::kInt32:
     case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Ld1w(reg.VnS(), p_reg,
+              VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+      break;
     case DataType::Type::kInt64:
     case DataType::Type::kFloat64:
-      DCHECK_LE(2u, instruction->GetVectorLength());
-      DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ Ldr(reg,
-             VecNeonAddress(instruction, &temps, size, instruction->IsStringCharAt(), &scratch));
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Ld1d(reg.VnD(), p_reg,
+              VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1455,26 +1181,39 @@
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) {
+  DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
   size_t size = DataType::Size(instruction->GetPackedType());
-  VRegister reg = VRegisterFrom(locations->InAt(2));
+  const ZRegister reg = ZRegisterFrom(locations->InAt(2));
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register scratch;
+  const PRegisterZ p_reg = LoopPReg().Zeroing();
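+  // SVE St1* instructions store active lanes only; memory corresponding to
+  // inactive lanes is left untouched.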
 
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ St1b(reg.VnB(), p_reg,
+              VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+      break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ St1h(reg.VnH(), p_reg,
+              VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+      break;
     case DataType::Type::kInt32:
     case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ St1w(reg.VnS(), p_reg,
+              VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+      break;
     case DataType::Type::kInt64:
     case DataType::Type::kFloat64:
-      DCHECK_LE(2u, instruction->GetVectorLength());
-      DCHECK_LE(instruction->GetVectorLength(), 16u);
-      __ Str(reg,
-             VecNeonAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ St1d(reg.VnD(), p_reg,
+              VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -1483,33 +1222,113 @@
 }
 
 void LocationsBuilderARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  UNREACHABLE();
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+  DCHECK(instruction->InputAt(0)->IsIntConstant());
+  locations->SetInAt(0, Location::NoLocation());
+  locations->SetOut(Location::NoLocation());
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  UNREACHABLE();
+  // The instruction is not predicated; see nodes_vector.h.
+  DCHECK(!instruction->IsPredicated());
+  const PRegister p_reg = LoopPReg();
+
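+  // Set all lanes of the governing predicate to active (the SVE_ALL pattern)
+  // for the lane size of the packed type.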
+  switch (instruction->GetPackedType()) {
+    case DataType::Type::kBool:
+    case DataType::Type::kUint8:
+    case DataType::Type::kInt8:
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      __ Ptrue(p_reg.VnB(), vixl::aarch64::SVE_ALL);
+      break;
+    case DataType::Type::kUint16:
+    case DataType::Type::kInt16:
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL);
+      break;
+    case DataType::Type::kInt32:
+    case DataType::Type::kFloat32:
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL);
+      break;
+    case DataType::Type::kInt64:
+    case DataType::Type::kFloat64:
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  UNREACHABLE();
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  // The instruction doesn't really need a core register as its out location; this is
+  // a hack to work around the absence of support for vector predicates in register
+  // allocation.
+  //
+  // Semantically, the out location of this instruction and the predicate input locations
+  // of its users should be a fixed predicate register (similar to
+  // Location::RegisterLocation(int reg)). But the register allocator (RA) doesn't support
+  // SIMD registers (e.g. predicates), so LoopPReg() is used explicitly without exposing
+  // it to the RA.
+  //
+  // To make the RA happy, Location::NoLocation() is used for the predicate inputs of all
+  // vector instructions; but for the PredSetOperations (e.g. VecPredWhile)
+  // Location::NoLocation() can't be used without changes to the RA -
+  // "ssa_liveness_analysis.cc] Check failed: input->IsEmittedAtUseSite()" would fire.
+  //
+  // Using a core register as a hack is the easiest way to tackle this problem. The RA
+  // will block one core register for the loop without actually using it; this should not
+  // be a performance issue as a SIMD loop operates mainly on SIMD registers.
+  //
+  // TODO: Support SIMD types in register allocator.
+  locations->SetOut(Location::RequiresRegister());
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  UNREACHABLE();
+  // The instruction is not predicated; see nodes_vector.h.
+  DCHECK(!instruction->IsPredicated());
+  // The current implementation of predicated loop execution only supports the kLO condition.
+  DCHECK(instruction->GetCondKind() == HVecPredWhile::CondKind::kLO);
+  Register left = InputRegisterAt(instruction, 0);
+  Register right = InputRegisterAt(instruction, 1);
+
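+  // WHILELO makes lane n of the predicate active while (left + n) < right
+  // (unsigned), i.e. one active lane per remaining loop iteration, up to the
+  // vector length.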
+  switch (instruction->GetVectorLength()) {
+    case 16u:
+      __ Whilelo(LoopPReg().VnB(), left, right);
+      break;
+    case 8u:
+      __ Whilelo(LoopPReg().VnH(), left, right);
+      break;
+    case 4u:
+      __ Whilelo(LoopPReg().VnS(), left, right);
+      break;
+    case 2u:
+      __ Whilelo(LoopPReg().VnD(), left, right);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
+      UNREACHABLE();
+  }
 }
 
 void LocationsBuilderARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  UNREACHABLE();
+  LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
+  locations->SetInAt(0, Location::NoLocation());
+  // Result of the operation - a boolean value in a core register.
+  locations->SetOut(Location::RequiresRegister());
 }
 
 void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
-  UNREACHABLE();
+  // The instruction is not predicated; see nodes_vector.h.
+  DCHECK(!instruction->IsPredicated());
+  Register reg = OutputRegister(instruction);
+  // Currently VecPredCondition is only used when evaluating the vectorized loop's
+  // exit condition.
+  DCHECK(instruction->GetPCondKind() == HVecPredCondition::PCondKind::kNFirst);
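+  // The preceding Whilelo sets the N flag to the state of the first lane;
+  // "pl" (N clear) is therefore true exactly when the first lane is
+  // inactive, i.e. the kNFirst condition: no loop iterations remain.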
+  __ Cset(reg, pl);
 }
 
 Location InstructionCodeGeneratorARM64Sve::AllocateSIMDScratchLocation(
@@ -1547,13 +1366,13 @@
     DCHECK(source.IsSIMDStackSlot());
     UseScratchRegisterScope temps(GetVIXLAssembler());
     if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) {
-      Register temp = temps.AcquireX();
+      const Register temp = temps.AcquireX();
       __ Ldr(temp, MemOperand(sp, source.GetStackIndex()));
       __ Str(temp, MemOperand(sp, destination.GetStackIndex()));
       __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize));
       __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize));
     } else {
-      VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize);
+      const VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize);
       __ Ldr(temp, StackOperandFrom(source));
       __ Str(temp, StackOperandFrom(destination));
     }
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index d652492..7220781 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -102,6 +102,11 @@
   return vixl::aarch64::VRegister(location.reg());
 }
 
+inline vixl::aarch64::ZRegister ZRegisterFrom(Location location) {
+  DCHECK(location.IsFpuRegister()) << location;
+  return vixl::aarch64::ZRegister(location.reg());
+}
+
 inline vixl::aarch64::VRegister SRegisterFrom(Location location) {
   DCHECK(location.IsFpuRegister()) << location;
   return vixl::aarch64::SRegister(location.reg());
@@ -298,7 +303,7 @@
 }
 
 inline Location ARM64EncodableConstantOrRegister(HInstruction* constant,
-                                                        HInstruction* instr) {
+                                                 HInstruction* instr) {
   if (constant->IsConstant()
       && Arm64CanEncodeConstantAsImmediate(constant->AsConstant(), instr)) {
     return Location::ConstantLocation(constant->AsConstant());
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 7137617..8970372 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -289,56 +289,72 @@
   }
 
   ArenaAllocator* allocator = mul->GetBlock()->GetGraph()->GetAllocator();
-
-  if (mul->HasOnlyOneNonEnvironmentUse()) {
-    HInstruction* use = mul->GetUses().front().GetUser();
-    if (use->IsVecAdd() || use->IsVecSub()) {
-      // Replace code looking like
-      //    VECMUL tmp, x, y
-      //    VECADD/SUB dst, acc, tmp
-      // with
-      //    VECMULACC dst, acc, x, y
-      // Note that we do not want to (unconditionally) perform the merge when the
-      // multiplication has multiple uses and it can be merged in all of them.
-      // Multiple uses could happen on the same control-flow path, and we would
-      // then increase the amount of work. In the future we could try to evaluate
-      // whether all uses are on different control-flow paths (using dominance and
-      // reverse-dominance information) and only perform the merge when they are.
-      HInstruction* accumulator = nullptr;
-      HVecBinaryOperation* binop = use->AsVecBinaryOperation();
-      HInstruction* binop_left = binop->GetLeft();
-      HInstruction* binop_right = binop->GetRight();
-      // This is always true since the `HVecMul` has only one use (which is checked above).
-      DCHECK_NE(binop_left, binop_right);
-      if (binop_right == mul) {
-        accumulator = binop_left;
-      } else if (use->IsVecAdd()) {
-        DCHECK_EQ(binop_left, mul);
-        accumulator = binop_right;
-      }
-
-      HInstruction::InstructionKind kind =
-          use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
-      if (accumulator != nullptr) {
-        HVecMultiplyAccumulate* mulacc =
-            new (allocator) HVecMultiplyAccumulate(allocator,
-                                                   kind,
-                                                   accumulator,
-                                                   mul->GetLeft(),
-                                                   mul->GetRight(),
-                                                   binop->GetPackedType(),
-                                                   binop->GetVectorLength(),
-                                                   binop->GetDexPc());
-
-        binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
-        DCHECK(!mul->HasUses());
-        mul->GetBlock()->RemoveInstruction(mul);
-        return true;
-      }
-    }
+  if (!mul->HasOnlyOneNonEnvironmentUse()) {
+    return false;
+  }
+  HInstruction* binop = mul->GetUses().front().GetUser();
+  if (!binop->IsVecAdd() && !binop->IsVecSub()) {
+    return false;
   }
 
-  return false;
+  // Replace code looking like
+  //    VECMUL tmp, x, y
+  //    VECADD/SUB dst, acc, tmp
+  // with
+  //    VECMULACC dst, acc, x, y
+  // Note that we do not want to (unconditionally) perform the merge when the
+  // multiplication has multiple uses and it can be merged in all of them.
+  // Multiple uses could happen on the same control-flow path, and we would
+  // then increase the amount of work. In the future we could try to evaluate
+  // whether all uses are on different control-flow paths (using dominance and
+  // reverse-dominance information) and only perform the merge when they are.
+  HInstruction* accumulator = nullptr;
+  HVecBinaryOperation* vec_binop = binop->AsVecBinaryOperation();
+  HInstruction* binop_left = vec_binop->GetLeft();
+  HInstruction* binop_right = vec_binop->GetRight();
+  // This is always true since the `HVecMul` has only one use (which is checked above).
+  DCHECK_NE(binop_left, binop_right);
+  if (binop_right == mul) {
+    accumulator = binop_left;
+  } else {
+    DCHECK_EQ(binop_left, mul);
+    // Only addition is commutative.
+    if (!binop->IsVecAdd()) {
+      return false;
+    }
+    accumulator = binop_right;
+  }
+
+  DCHECK(accumulator != nullptr);
+  HInstruction::InstructionKind kind =
+      binop->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
+
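+  // A predicated multiply can be fused with a predicated add/sub only when
+  // both operations are governed by the same predicate in the same way.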
+  bool predicated_simd = vec_binop->IsPredicated();
+  if (predicated_simd && !HVecOperation::HaveSamePredicate(vec_binop, mul)) {
+    return false;
+  }
+
+  HVecMultiplyAccumulate* mulacc =
+      new (allocator) HVecMultiplyAccumulate(allocator,
+                                             kind,
+                                             accumulator,
+                                             mul->GetLeft(),
+                                             mul->GetRight(),
+                                             vec_binop->GetPackedType(),
+                                             vec_binop->GetVectorLength(),
+                                             vec_binop->GetDexPc());
+
+  vec_binop->GetBlock()->ReplaceAndRemoveInstructionWith(vec_binop, mulacc);
+  if (predicated_simd) {
+    mulacc->SetGoverningPredicate(vec_binop->GetGoverningPredicate(),
+                                  vec_binop->GetPredicationKind());
+  }
+
+  DCHECK(!mul->HasUses());
+  mul->GetBlock()->RemoveInstruction(mul);
+  return true;
 }
 
 void InstructionSimplifierVisitor::VisitShift(HBinaryOperation* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index 260bfaf..ff0859b 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -277,14 +277,17 @@
 }
 
 void InstructionSimplifierArm64Visitor::VisitVecLoad(HVecLoad* instruction) {
-  if (!instruction->IsStringCharAt()
-      && TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
+  // TODO: Extract regular HIntermediateAddress.
+  if (!instruction->IsPredicated() && !instruction->IsStringCharAt() &&
+      TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
     RecordSimplification();
   }
 }
 
 void InstructionSimplifierArm64Visitor::VisitVecStore(HVecStore* instruction) {
-  if (TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
+  // TODO: Extract regular HIntermediateAddress.
+  if (!instruction->IsPredicated() &&
+      TryExtractVecArrayAccessAddress(instruction, instruction->GetIndex())) {
     RecordSimplification();
   }
 }
diff --git a/compiler/optimizing/loop_analysis.cc b/compiler/optimizing/loop_analysis.cc
index a776c37..76bd849 100644
--- a/compiler/optimizing/loop_analysis.cc
+++ b/compiler/optimizing/loop_analysis.cc
@@ -17,6 +17,7 @@
 #include "loop_analysis.h"
 
 #include "base/bit_vector-inl.h"
+#include "code_generator.h"
 #include "induction_var_range.h"
 
 namespace art {
@@ -76,6 +77,7 @@
 // is provided. Enables scalar loop peeling and unrolling with the most conservative heuristics.
 class ArchDefaultLoopHelper : public ArchNoOptsLoopHelper {
  public:
+  explicit ArchDefaultLoopHelper(const CodeGenerator& codegen) : ArchNoOptsLoopHelper(codegen) {}
   // Scalar loop unrolling parameters and heuristics.
   //
   // Maximum possible unrolling factor.
@@ -132,6 +134,7 @@
 // peeling and unrolling and supports SIMD loop unrolling.
 class Arm64LoopHelper : public ArchDefaultLoopHelper {
  public:
+  explicit Arm64LoopHelper(const CodeGenerator& codegen) : ArchDefaultLoopHelper(codegen) {}
   // SIMD loop unrolling parameters and heuristics.
   //
   // Maximum possible unrolling factor.
@@ -157,6 +160,10 @@
     // Don't unroll with insufficient iterations.
     // TODO: Unroll loops with unknown trip count.
     DCHECK_NE(vector_length, 0u);
+    // TODO: Unroll loops in predicated vectorization mode.
+    if (codegen_.SupportsPredicatedSIMD()) {
+      return LoopAnalysisInfo::kNoUnrollingFactor;
+    }
     if (trip_count < (2 * vector_length + max_peel)) {
       return LoopAnalysisInfo::kNoUnrollingFactor;
     }
@@ -309,6 +316,8 @@
   uint32_t GetUnrollingFactor(HLoopInformation* loop_info, HBasicBlock* header) const;
 
  public:
+  explicit X86_64LoopHelper(const CodeGenerator& codegen) : ArchDefaultLoopHelper(codegen) {}
+
   uint32_t GetSIMDUnrollingFactor(HBasicBlock* block,
                                   int64_t trip_count,
                                   uint32_t max_peel,
@@ -398,17 +407,18 @@
   return (1 << unrolling_factor);
 }
 
-ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(InstructionSet isa,
+ArchNoOptsLoopHelper* ArchNoOptsLoopHelper::Create(const CodeGenerator& codegen,
                                                    ArenaAllocator* allocator) {
+  InstructionSet isa = codegen.GetInstructionSet();
   switch (isa) {
     case InstructionSet::kArm64: {
-      return new (allocator) Arm64LoopHelper;
+      return new (allocator) Arm64LoopHelper(codegen);
     }
     case InstructionSet::kX86_64: {
-      return new (allocator) X86_64LoopHelper;
+      return new (allocator) X86_64LoopHelper(codegen);
     }
     default: {
-      return new (allocator) ArchDefaultLoopHelper;
+      return new (allocator) ArchDefaultLoopHelper(codegen);
     }
   }
 }
diff --git a/compiler/optimizing/loop_analysis.h b/compiler/optimizing/loop_analysis.h
index 57509ee..fbf1516 100644
--- a/compiler/optimizing/loop_analysis.h
+++ b/compiler/optimizing/loop_analysis.h
@@ -21,6 +21,7 @@
 
 namespace art {
 
+class CodeGenerator;
 class InductionVarRange;
 class LoopAnalysis;
 
@@ -132,11 +133,12 @@
 //
 class ArchNoOptsLoopHelper : public ArenaObject<kArenaAllocOptimization> {
  public:
+  explicit ArchNoOptsLoopHelper(const CodeGenerator& codegen) : codegen_(codegen) {}
   virtual ~ArchNoOptsLoopHelper() {}
 
   // Creates an instance of specialised helper for the target or default helper if the target
   // doesn't support loop peeling and unrolling.
-  static ArchNoOptsLoopHelper* Create(InstructionSet isa, ArenaAllocator* allocator);
+  static ArchNoOptsLoopHelper* Create(const CodeGenerator& codegen, ArenaAllocator* allocator);
 
   // Returns whether the loop is not beneficial for loop peeling/unrolling.
   //
@@ -176,6 +178,9 @@
                                           uint32_t vector_length ATTRIBUTE_UNUSED) const {
     return LoopAnalysisInfo::kNoUnrollingFactor;
   }
+
+ protected:
+  const CodeGenerator& codegen_;
 };
 
 }  // namespace art
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 4c9b01c..1210dbe 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -473,6 +473,7 @@
       iset_(nullptr),
       reductions_(nullptr),
       simplified_(false),
+      predicated_vectorization_mode_(codegen.SupportsPredicatedSIMD()),
       vector_length_(0),
       vector_refs_(nullptr),
       vector_static_peeling_factor_(0),
@@ -486,10 +487,7 @@
       vector_header_(nullptr),
       vector_body_(nullptr),
       vector_index_(nullptr),
-      arch_loop_helper_(ArchNoOptsLoopHelper::Create(compiler_options_ != nullptr
-                                                          ? compiler_options_->GetInstructionSet()
-                                                          : InstructionSet::kNone,
-                                                      global_allocator_)) {
+      arch_loop_helper_(ArchNoOptsLoopHelper::Create(codegen, global_allocator_)) {
 }
 
 bool HLoopOptimization::Run() {
@@ -1024,8 +1022,10 @@
     }
   }  // for i
 
-  // Find a suitable alignment strategy.
-  SetAlignmentStrategy(peeling_votes, peeling_candidate);
+  if (!IsInPredicatedVectorizationMode()) {
+    // Find a suitable alignment strategy.
+    SetAlignmentStrategy(peeling_votes, peeling_candidate);
+  }
 
   // Does vectorization seem profitable?
   if (!IsVectorizationProfitable(trip_count)) {
@@ -1052,8 +1052,8 @@
 
   // A cleanup loop is needed, at least, for any unknown trip count or
   // for a known trip count with remainder iterations after vectorization.
-  bool needs_cleanup = trip_count == 0 ||
-      ((trip_count - vector_static_peeling_factor_) % chunk) != 0;
+  bool needs_cleanup = !IsInPredicatedVectorizationMode() &&
+      (trip_count == 0 || ((trip_count - vector_static_peeling_factor_) % chunk) != 0);
 
   // Adjust vector bookkeeping.
   HPhi* main_phi = nullptr;
@@ -1071,11 +1071,13 @@
   // ptc = <peeling factor>;
   HInstruction* ptc = nullptr;
   if (vector_static_peeling_factor_ != 0) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     // Static loop peeling for SIMD alignment (using the most suitable
     // fixed peeling factor found during prior alignment analysis).
     DCHECK(vector_dynamic_peeling_candidate_ == nullptr);
     ptc = graph_->GetConstant(induc_type, vector_static_peeling_factor_);
   } else if (vector_dynamic_peeling_candidate_ != nullptr) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     // Dynamic loop peeling for SIMD alignment (using the most suitable
     // candidate found during prior alignment analysis):
     // rem = offset % ALIGN;    // adjusted as #elements
@@ -1106,6 +1108,7 @@
   HInstruction* stc = induction_range_.GenerateTripCount(node->loop_info, graph_, preheader);
   HInstruction* vtc = stc;
   if (needs_cleanup) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     DCHECK(IsPowerOfTwo(chunk));
     HInstruction* diff = stc;
     if (ptc != nullptr) {
@@ -1143,6 +1146,7 @@
   //       moved around during suspend checks, since all analysis was based on
   //       nothing more than the Android runtime alignment conventions.
   if (ptc != nullptr) {
+    DCHECK(!IsInPredicatedVectorizationMode());
     vector_mode_ = kSequential;
     GenerateNewLoop(node,
                     block,
@@ -1170,6 +1174,7 @@
   // for ( ; i < stc; i += 1)
   //    <loop-body>
   if (needs_cleanup) {
+    DCHECK(!IsInPredicatedVectorizationMode() || vector_runtime_test_a_ != nullptr);
     vector_mode_ = kSequential;
     GenerateNewLoop(node,
                     block,
@@ -1227,9 +1232,35 @@
   // Generate header and prepare body.
   // for (i = lo; i < hi; i += step)
   //    <loop-body>
-  HInstruction* cond = new (global_allocator_) HAboveOrEqual(phi, hi);
-  vector_header_->AddPhi(phi);
-  vector_header_->AddInstruction(cond);
+  HInstruction* cond = nullptr;
+  HInstruction* set_pred = nullptr;
+  if (IsInPredicatedVectorizationMode()) {
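+    // Predicated mode: the header computes the loop governing predicate
+    // p = WHILELO(i, hi) and exits the loop once the first lane of p is
+    // inactive (the kNFirst condition), i.e. when no iterations remain.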
+    HVecPredWhile* pred_while =
+        new (global_allocator_) HVecPredWhile(global_allocator_,
+                                              phi,
+                                              hi,
+                                              HVecPredWhile::CondKind::kLO,
+                                              DataType::Type::kInt32,
+                                              vector_length_,
+                                              0u);
+
+    cond = new (global_allocator_) HVecPredCondition(global_allocator_,
+                                                     pred_while,
+                                                     HVecPredCondition::PCondKind::kNFirst,
+                                                     DataType::Type::kInt32,
+                                                     vector_length_,
+                                                     0u);
+
+    vector_header_->AddPhi(phi);
+    vector_header_->AddInstruction(pred_while);
+    vector_header_->AddInstruction(cond);
+    set_pred = pred_while;
+  } else {
+    cond = new (global_allocator_) HAboveOrEqual(phi, hi);
+    vector_header_->AddPhi(phi);
+    vector_header_->AddInstruction(cond);
+  }
+
   vector_header_->AddInstruction(new (global_allocator_) HIf(cond));
   vector_index_ = phi;
   vector_permanent_map_->clear();  // preserved over unrolling
@@ -1246,6 +1277,10 @@
       auto i = vector_map_->find(it.Current());
       if (i != vector_map_->end() && !i->second->IsInBlock()) {
         Insert(vector_body_, i->second);
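+        // In predicated mode, every vectorized instruction in the loop body
+        // is governed by the loop predicate computed in the header.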
+        if (IsInPredicatedVectorizationMode() && i->second->IsVecOperation()) {
+          HVecOperation* op = i->second->AsVecOperation();
+          op->SetMergingGoverningPredicate(set_pred);
+        }
         // Deal with instructions that need an environment, such as the scalar intrinsics.
         if (i->second->NeedsEnvironment()) {
           i->second->CopyEnvironmentFromWithLoopPhiAdjustment(env, vector_header_);
@@ -1360,7 +1395,10 @@
   } else if (instruction->IsArrayGet()) {
     // Deal with vector restrictions.
     bool is_string_char_at = instruction->AsArrayGet()->IsStringCharAt();
-    if (is_string_char_at && HasVectorRestrictions(restrictions, kNoStringCharAt)) {
+
+    if (is_string_char_at && (HasVectorRestrictions(restrictions, kNoStringCharAt) ||
+                              IsInPredicatedVectorizationMode())) {
+      // TODO: Support CharAt for predicated mode.
       return false;
     }
     // Accept a right-hand-side array base[index] for
@@ -1575,32 +1613,73 @@
       }
       return false;
     case InstructionSet::kArm64:
-      // Allow vectorization for all ARM devices, because Android assumes that
-      // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
-      switch (type) {
-        case DataType::Type::kBool:
-        case DataType::Type::kUint8:
-        case DataType::Type::kInt8:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(type, 16);
-        case DataType::Type::kUint16:
-        case DataType::Type::kInt16:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(type, 8);
-        case DataType::Type::kInt32:
-          *restrictions |= kNoDiv;
-          return TrySetVectorLength(type, 4);
-        case DataType::Type::kInt64:
-          *restrictions |= kNoDiv | kNoMul;
-          return TrySetVectorLength(type, 2);
-        case DataType::Type::kFloat32:
-          *restrictions |= kNoReduction;
-          return TrySetVectorLength(type, 4);
-        case DataType::Type::kFloat64:
-          *restrictions |= kNoReduction;
-          return TrySetVectorLength(type, 2);
-        default:
-          return false;
+      if (IsInPredicatedVectorizationMode()) {
+        // SVE vectorization.
+        CHECK(features->AsArm64InstructionSetFeatures()->HasSVE());
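+        // The fixed lane counts below assume a 128-bit vector length; SVE
+        // operations not yet implemented by the backend are disabled via
+        // restrictions.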
+        switch (type) {
+          case DataType::Type::kBool:
+          case DataType::Type::kUint8:
+          case DataType::Type::kInt8:
+            *restrictions |= kNoDiv |
+                             kNoSignedHAdd |
+                             kNoUnsignedHAdd |
+                             kNoUnroundedHAdd |
+                             kNoSAD;
+            return TrySetVectorLength(type, 16);
+          case DataType::Type::kUint16:
+          case DataType::Type::kInt16:
+            *restrictions |= kNoDiv |
+                             kNoSignedHAdd |
+                             kNoUnsignedHAdd |
+                             kNoUnroundedHAdd |
+                             kNoSAD |
+                             kNoDotProd;
+            return TrySetVectorLength(type, 8);
+          case DataType::Type::kInt32:
+            *restrictions |= kNoDiv | kNoSAD;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kInt64:
+            *restrictions |= kNoDiv | kNoSAD;
+            return TrySetVectorLength(type, 2);
+          case DataType::Type::kFloat32:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kFloat64:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 2);
+          default:
+            break;
+        }
+        return false;
+      } else {
+        // Allow vectorization for all ARM devices, because Android assumes that
+        // ARMv8 AArch64 always supports advanced SIMD (128-bit SIMD).
+        switch (type) {
+          case DataType::Type::kBool:
+          case DataType::Type::kUint8:
+          case DataType::Type::kInt8:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 16);
+          case DataType::Type::kUint16:
+          case DataType::Type::kInt16:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 8);
+          case DataType::Type::kInt32:
+            *restrictions |= kNoDiv;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kInt64:
+            *restrictions |= kNoDiv | kNoMul;
+            return TrySetVectorLength(type, 2);
+          case DataType::Type::kFloat32:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 4);
+          case DataType::Type::kFloat64:
+            *restrictions |= kNoReduction;
+            return TrySetVectorLength(type, 2);
+          default:
+            break;
+        }
+        return false;
       }
     case InstructionSet::kX86:
     case InstructionSet::kX86_64:
@@ -1693,6 +1772,15 @@
       vector = new (global_allocator_)
           HVecReplicateScalar(global_allocator_, input, type, vector_length_, kNoDexPc);
       vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
+      if (IsInPredicatedVectorizationMode()) {
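+        // The preheader executes before the loop predicate is computed, so
+        // the replicated scalar is governed by an all-true predicate.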
+        HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                          graph_->GetIntConstant(1),
+                                                                          type,
+                                                                          vector_length_,
+                                                                          0u);
+        vector_preheader_->InsertInstructionBefore(set_pred, vector);
+        vector->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+      }
     }
     vector_map_->Put(org, vector);
   }
@@ -1821,6 +1909,15 @@
                                                                     vector_length,
                                                                     kNoDexPc));
     }
+    if (IsInPredicatedVectorizationMode()) {
+      HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                        graph_->GetIntConstant(1),
+                                                                        type,
+                                                                        vector_length,
+                                                                        0u);
+      vector_preheader_->InsertInstructionBefore(set_pred, new_init);
+      new_init->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+    }
   } else {
     new_init = ReduceAndExtractIfNeeded(new_init);
   }
@@ -1852,6 +1949,17 @@
       instruction = new (global_allocator_) HVecExtractScalar(
           global_allocator_, reduce, type, vector_length, 0, kNoDexPc);
       exit->InsertInstructionAfter(instruction, reduce);
+
+      if (IsInPredicatedVectorizationMode()) {
+        HVecPredSetAll* set_pred = new (global_allocator_) HVecPredSetAll(global_allocator_,
+                                                                          graph_->GetIntConstant(1),
+                                                                          type,
+                                                                          vector_length,
+                                                                          0u);
+        exit->InsertInstructionBefore(set_pred, reduce);
+        reduce->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+        instruction->AsVecOperation()->SetMergingGoverningPredicate(set_pred);
+      }
     }
   }
   return instruction;
@@ -1991,7 +2099,8 @@
         return false;
       }
       // Deal with vector restrictions.
-      if ((!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
+      if ((is_unsigned && HasVectorRestrictions(restrictions, kNoUnsignedHAdd)) ||
+          (!is_unsigned && HasVectorRestrictions(restrictions, kNoSignedHAdd)) ||
           (!is_rounded && HasVectorRestrictions(restrictions, kNoUnroundedHAdd))) {
         return false;
       }
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 0c35f29..0d76804 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -76,13 +76,14 @@
     kNoShr           = 1 << 3,   // no arithmetic shift right
     kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
     kNoSignedHAdd    = 1 << 5,   // no signed halving add
-    kNoUnroundedHAdd = 1 << 6,   // no unrounded halving add
-    kNoAbs           = 1 << 7,   // no absolute value
-    kNoStringCharAt  = 1 << 8,   // no StringCharAt
-    kNoReduction     = 1 << 9,   // no reduction
-    kNoSAD           = 1 << 10,  // no sum of absolute differences (SAD)
-    kNoWideSAD       = 1 << 11,  // no sum of absolute differences (SAD) with operand widening
-    kNoDotProd       = 1 << 12,  // no dot product
+    kNoUnsignedHAdd  = 1 << 6,   // no unsigned halving add
+    kNoUnroundedHAdd = 1 << 7,   // no unrounded halving add
+    kNoAbs           = 1 << 8,   // no absolute value
+    kNoStringCharAt  = 1 << 9,   // no StringCharAt
+    kNoReduction     = 1 << 10,  // no reduction
+    kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
+    kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
+    kNoDotProd       = 1 << 13,  // no dot product
   };
 
   /*
@@ -270,6 +271,8 @@
   void RemoveDeadInstructions(const HInstructionList& list);
   bool CanRemoveCycle();  // Whether the current 'iset_' is removable.
 
+  bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; }
+
   // Compiler options (to query ISA features).
   const CompilerOptions* compiler_options_;
 
@@ -305,6 +308,9 @@
   // Flag that tracks if any simplifications have occurred.
   bool simplified_;
 
+  // Whether to use predicated loop vectorization (e.g. for the arm64 SVE target).
+  bool predicated_vectorization_mode_;
+
   // Number of "lanes" for selected packed type.
   uint32_t vector_length_;
 
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 9c6b422..a2cd86d 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -145,6 +145,15 @@
     return pred_input->AsVecPredSetOperation();
   }
 
+  // Returns whether two vector operations are predicated by the same vector predicate
+  // with the same predication type.
+  static bool HaveSamePredicate(HVecOperation* instr0, HVecOperation* instr1) {
+    HVecPredSetOperation* instr0_predicate = instr0->GetGoverningPredicate();
+    HVecOperation::PredicationKind instr0_predicate_kind = instr0->GetPredicationKind();
+    return instr1->GetGoverningPredicate() == instr0_predicate &&
+           instr1->GetPredicationKind() == instr0_predicate_kind;
+  }
+
   // Returns the number of elements packed in a vector.
   size_t GetVectorLength() const {
     return vector_length_;