Diffstat (limited to 'compiler/optimizing')
-rw-r--r--  compiler/optimizing/code_generator_arm64.cc              |  66
-rw-r--r--  compiler/optimizing/code_generator_arm64.h               |  16
-rw-r--r--  compiler/optimizing/code_generator_vector_arm64_neon.cc  |  59
-rw-r--r--  compiler/optimizing/code_generator_vector_arm64_sve.cc   | 203
-rw-r--r--  compiler/optimizing/common_arm64.h                       |   8
-rw-r--r--  compiler/optimizing/loop_optimization.cc                 |  33
-rw-r--r--  compiler/optimizing/loop_optimization.h                  |   2
7 files changed, 210 insertions, 177 deletions
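The overall shape of the change: the ARM64 SIMD register width becomes a runtime query (16-byte Q registers for NEON, the hardware vector length for predicated SVE), and slow-path live-register save/restore moves behind virtual helpers implemented once per backend via a bool-templated worker. A simplified, hypothetical sketch of that dispatch pattern follows — stand-in names only, with both roles folded into one class rather than the real CodeGeneratorARM64 / InstructionCodeGeneratorARM64 split:

#include <cstddef>
#include <cstdint>
#include <iostream>

// Stand-in for the ISA feature query used by the real code generator.
struct IsaFeatures {
  bool has_sve;
  size_t sve_vector_length_bits;  // e.g. 256 on a 256-bit SVE implementation
};

class SimdCodeGen {
 public:
  explicit SimdCodeGen(IsaFeatures features) : features_(features) {}
  virtual ~SimdCodeGen() = default;

  // Width now depends on the target: the full SVE vector for the predicated
  // backend, a fixed 16-byte Q register for NEON.
  size_t GetSIMDRegisterWidth() const {
    return features_.has_sve ? features_.sve_vector_length_bits / 8 : 16u;
  }

  // Each backend supplies its own spill/restore code (NEON pairs of
  // V registers vs. SVE loads/stores of Z registers).
  virtual void SaveLiveRegistersHelper(int64_t spill_offset) = 0;
  virtual void RestoreLiveRegistersHelper(int64_t spill_offset) = 0;

 private:
  IsaFeatures features_;
};

// One worker per backend, parameterized on a compile-time bool, mirroring the
// SaveRestoreLiveRegistersHelper{Neon,Sve}Impl templates in the patch.
template <bool is_save>
void SaveRestoreImpl(const char* backend, int64_t spill_offset) {
  std::cout << backend << (is_save ? ": save at offset " : ": restore at offset ")
            << spill_offset << '\n';
}

class SveCodeGen : public SimdCodeGen {
 public:
  using SimdCodeGen::SimdCodeGen;
  void SaveLiveRegistersHelper(int64_t off) override { SaveRestoreImpl<true>("SVE", off); }
  void RestoreLiveRegistersHelper(int64_t off) override { SaveRestoreImpl<false>("SVE", off); }
};

int main() {
  SveCodeGen sve({/*has_sve=*/true, /*sve_vector_length_bits=*/256});
  std::cout << "SIMD width: " << sve.GetSIMDRegisterWidth() << " bytes\n";  // 32
  sve.SaveLiveRegistersHelper(/*spill_offset=*/64);
  sve.RestoreLiveRegistersHelper(/*spill_offset=*/64);
}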
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc index f5d78367fe..bdc5e2d9a3 100644 --- a/compiler/optimizing/code_generator_arm64.cc +++ b/compiler/optimizing/code_generator_arm64.cc @@ -169,54 +169,6 @@ static RegisterSet OneRegInReferenceOutSaveEverythingCallerSaves() { #define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()-> // NOLINT #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, x).Int32Value() -// Calculate memory accessing operand for save/restore live registers. -static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen, - LocationSummary* locations, - int64_t spill_offset, - bool is_save) { - const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); - const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); - DCHECK(ArtVixlRegCodeCoherentForRegSet(core_spills, - codegen->GetNumberOfCoreRegisters(), - fp_spills, - codegen->GetNumberOfFloatingPointRegisters())); - - CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); - const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8; - DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes); - CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills); - - MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler(); - UseScratchRegisterScope temps(masm); - - Register base = masm->StackPointer(); - int64_t core_spill_size = core_list.GetTotalSizeInBytes(); - int64_t fp_spill_size = fp_list.GetTotalSizeInBytes(); - int64_t reg_size = kXRegSizeInBytes; - int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size; - uint32_t ls_access_size = WhichPowerOf2(reg_size); - if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) && - !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) { - // If the offset does not fit in the instruction's immediate field, use an alternate register - // to compute the base address(float point registers spill base address). 
- Register new_base = temps.AcquireSameSizeAs(base); - __ Add(new_base, base, Operand(spill_offset + core_spill_size)); - base = new_base; - spill_offset = -core_spill_size; - int64_t new_max_ls_pair_offset = fp_spill_size - 2 * reg_size; - DCHECK(masm->IsImmLSPair(spill_offset, ls_access_size)); - DCHECK(masm->IsImmLSPair(new_max_ls_pair_offset, ls_access_size)); - } - - if (is_save) { - __ StoreCPURegList(core_list, MemOperand(base, spill_offset)); - __ StoreCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); - } else { - __ LoadCPURegList(core_list, MemOperand(base, spill_offset)); - __ LoadCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); - } -} - void SlowPathCodeARM64::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) { size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath(); const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); @@ -240,15 +192,15 @@ void SlowPathCodeARM64::SaveLiveRegisters(CodeGenerator* codegen, LocationSummar stack_offset += fp_reg_size; } - SaveRestoreLiveRegistersHelper(codegen, - locations, - codegen->GetFirstRegisterSlotInSlowPath(), /* is_save= */ true); + InstructionCodeGeneratorARM64* visitor = + down_cast<CodeGeneratorARM64*>(codegen)->GetInstructionCodeGeneratorArm64(); + visitor->SaveLiveRegistersHelper(locations, codegen->GetFirstRegisterSlotInSlowPath()); } void SlowPathCodeARM64::RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) { - SaveRestoreLiveRegistersHelper(codegen, - locations, - codegen->GetFirstRegisterSlotInSlowPath(), /* is_save= */ false); + InstructionCodeGeneratorARM64* visitor = + down_cast<CodeGeneratorARM64*>(codegen)->GetInstructionCodeGeneratorArm64(); + visitor->RestoreLiveRegistersHelper(locations, codegen->GetFirstRegisterSlotInSlowPath()); } class BoundsCheckSlowPathARM64 : public SlowPathCodeARM64 { @@ -997,6 +949,12 @@ bool CodeGeneratorARM64::ShouldUseSVE() const { return GetInstructionSetFeatures().HasSVE(); } +size_t CodeGeneratorARM64::GetSIMDRegisterWidth() const { + return SupportsPredicatedSIMD() + ? 
GetInstructionSetFeatures().GetSVEVectorLength() / kBitsPerByte + : vixl::aarch64::kQRegSizeInBytes; +} + #define __ GetVIXLAssembler()-> void CodeGeneratorARM64::EmitJumpTables() { diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h index eb3e9546e0..d4546e5bd5 100644 --- a/compiler/optimizing/code_generator_arm64.h +++ b/compiler/optimizing/code_generator_arm64.h @@ -309,6 +309,10 @@ class InstructionCodeGeneratorARM64 : public InstructionCodeGenerator { virtual void LoadSIMDRegFromStack(Location destination, Location source) = 0; virtual void MoveSIMDRegToSIMDReg(Location destination, Location source) = 0; virtual void MoveToSIMDStackSlot(Location destination, Location source) = 0; + virtual void SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) = 0; + virtual void RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) = 0; protected: void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path, @@ -462,6 +466,8 @@ class InstructionCodeGeneratorARM64Neon : public InstructionCodeGeneratorARM64 { void LoadSIMDRegFromStack(Location destination, Location source) override; void MoveSIMDRegToSIMDReg(Location destination, Location source) override; void MoveToSIMDStackSlot(Location destination, Location source) override; + void SaveLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; + void RestoreLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; }; class LocationsBuilderARM64Neon : public LocationsBuilderARM64 { @@ -495,8 +501,14 @@ class InstructionCodeGeneratorARM64Sve : public InstructionCodeGeneratorARM64 { void LoadSIMDRegFromStack(Location destination, Location source) override; void MoveSIMDRegToSIMDReg(Location destination, Location source) override; void MoveToSIMDStackSlot(Location destination, Location source) override; + void SaveLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; + void RestoreLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override; private: + // Validate that instruction vector length and packed type are compliant with the SIMD + // register size (full SIMD register is used). + void ValidateVectorLength(HVecOperation* instr) const; + // Returns default predicate register which is used as governing vector predicate // to implement predicated loop execution. // @@ -579,9 +591,7 @@ class CodeGeneratorARM64 : public CodeGenerator { return vixl::aarch64::kDRegSizeInBytes; } - size_t GetSIMDRegisterWidth() const override { - return vixl::aarch64::kQRegSizeInBytes; - } + size_t GetSIMDRegisterWidth() const override; uintptr_t GetAddressOf(HBasicBlock* block) override { vixl::aarch64::Label* block_entry_label = GetLabelOf(block); diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc index bd64166655..0fe9898635 100644 --- a/compiler/optimizing/code_generator_vector_arm64_neon.cc +++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc @@ -17,6 +17,7 @@ #include "code_generator_arm64.h" #include "arch/arm64/instruction_set_features_arm64.h" +#include "base/bit_utils_iterator.h" #include "mirror/array-inl.h" #include "mirror/string.h" @@ -1590,6 +1591,64 @@ void InstructionCodeGeneratorARM64Neon::MoveToSIMDStackSlot(Location destination } } +// Calculate memory accessing operand for save/restore live registers. 
+template <bool is_save> +void SaveRestoreLiveRegistersHelperNeonImpl(CodeGeneratorARM64* codegen, + LocationSummary* locations, + int64_t spill_offset) { + const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); + const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); + DCHECK(helpers::ArtVixlRegCodeCoherentForRegSet(core_spills, + codegen->GetNumberOfCoreRegisters(), + fp_spills, + codegen->GetNumberOfFloatingPointRegisters())); + + CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); + const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8; + DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes); + CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills); + + MacroAssembler* masm = codegen->GetVIXLAssembler(); + UseScratchRegisterScope temps(masm); + + Register base = masm->StackPointer(); + int64_t core_spill_size = core_list.GetTotalSizeInBytes(); + int64_t fp_spill_size = fp_list.GetTotalSizeInBytes(); + int64_t reg_size = kXRegSizeInBytes; + int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size; + uint32_t ls_access_size = WhichPowerOf2(reg_size); + if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) && + !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) { + // If the offset does not fit in the instruction's immediate field, use an alternate register + // to compute the base address(float point registers spill base address). + Register new_base = temps.AcquireSameSizeAs(base); + masm->Add(new_base, base, Operand(spill_offset + core_spill_size)); + base = new_base; + spill_offset = -core_spill_size; + int64_t new_max_ls_pair_offset = fp_spill_size - 2 * reg_size; + DCHECK(masm->IsImmLSPair(spill_offset, ls_access_size)); + DCHECK(masm->IsImmLSPair(new_max_ls_pair_offset, ls_access_size)); + } + + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + masm->StoreCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + masm->LoadCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size)); + } +} + +void InstructionCodeGeneratorARM64Neon::SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperNeonImpl</* is_save= */ true>(codegen_, locations, spill_offset); +} + +void InstructionCodeGeneratorARM64Neon::RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperNeonImpl</* is_save= */ false>(codegen_, locations, spill_offset); +} + #undef __ } // namespace arm64 diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc index 2254673337..824b6c9476 100644 --- a/compiler/optimizing/code_generator_vector_arm64_sve.cc +++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc @@ -17,6 +17,7 @@ #include "code_generator_arm64.h" #include "arch/arm64/instruction_set_features_arm64.h" +#include "base/bit_utils_iterator.h" #include "mirror/array-inl.h" #include "mirror/string.h" @@ -33,6 +34,7 @@ using helpers::LocationFrom; using helpers::OutputRegister; using helpers::QRegisterFrom; using helpers::StackOperandFrom; +using helpers::SveStackOperandFrom; using helpers::VRegisterFrom; using helpers::ZRegisterFrom; using helpers::XRegisterFrom; @@ -71,6 +73,11 @@ inline Location 
SVEEncodableConstantOrRegister(HInstruction* constant, HInstruct return Location::RequiresRegister(); } +void InstructionCodeGeneratorARM64Sve::ValidateVectorLength(HVecOperation* instr) const { + DCHECK_EQ(DataType::Size(instr->GetPackedType()) * instr->GetVectorLength(), + codegen_->GetSIMDRegisterWidth()); +} + void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) { LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction); HInstruction* input = instruction->InputAt(0); @@ -107,11 +114,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal LocationSummary* locations = instruction->GetLocations(); Location src_loc = locations->InAt(0); const ZRegister dst = ZRegisterFrom(locations->Out()); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnB(), Int64FromLocation(src_loc)); } else { @@ -120,7 +127,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnH(), Int64FromLocation(src_loc)); } else { @@ -128,7 +134,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnS(), Int64FromLocation(src_loc)); } else { @@ -136,7 +141,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Dup(dst.VnD(), Int64FromLocation(src_loc)); } else { @@ -144,7 +148,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Fdup(dst.VnS(), src_loc.GetConstant()->AsFloatConstant()->GetValue()); } else { @@ -152,7 +155,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReplicateScalar(HVecReplicateScal } break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); if (src_loc.IsConstant()) { __ Fdup(dst.VnD(), src_loc.GetConstant()->AsDoubleConstant()->GetValue()); } else { @@ -193,19 +195,16 @@ void InstructionCodeGeneratorARM64Sve::VisitVecExtractScalar(HVecExtractScalar* DCHECK(instruction->IsPredicated()); LocationSummary* locations = instruction->GetLocations(); const VRegister src = VRegisterFrom(locations->InAt(0)); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Umov(OutputRegister(instruction), src.V4S(), 0); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Umov(OutputRegister(instruction), src.V2D(), 0); break; case DataType::Type::kFloat32: case DataType::Type::kFloat64: - DCHECK_LE(2u, instruction->GetVectorLength()); - DCHECK_LE(instruction->GetVectorLength(), 4u); DCHECK(locations->InAt(0).Equals(locations->Out())); // no code required break; default: @@ -251,9 +250,9 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { const ZRegister src = 
ZRegisterFrom(locations->InAt(0)); const VRegister dst = DRegisterFrom(locations->Out()); const PRegister p_reg = LoopPReg(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); switch (instruction->GetReductionKind()) { case HVecReduce::kSum: __ Saddv(dst.S(), p_reg, src.VnS()); @@ -264,7 +263,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecReduce(HVecReduce* instruction) { } break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); switch (instruction->GetReductionKind()) { case HVecReduce::kSum: __ Uaddv(dst.D(), p_reg, src.VnD()); @@ -292,8 +290,8 @@ void InstructionCodeGeneratorARM64Sve::VisitVecCnv(HVecCnv* instruction) { const PRegisterM p_reg = LoopPReg().Merging(); DataType::Type from = instruction->GetInputType(); DataType::Type to = instruction->GetResultType(); + ValidateVectorLength(instruction); if (from == DataType::Type::kInt32 && to == DataType::Type::kFloat32) { - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Scvtf(dst.VnS(), p_reg, src.VnS()); } else { LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType(); @@ -310,31 +308,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNeg(HVecNeg* instruction) { const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Neg(dst.VnB(), p_reg, src.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Neg(dst.VnH(), p_reg, src.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Neg(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Neg(dst.VnD(), p_reg, src.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fneg(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fneg(dst.VnD(), p_reg, src.VnD()); break; default: @@ -353,29 +346,24 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAbs(HVecAbs* instruction) { const ZRegister src = ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Abs(dst.VnB(), p_reg, src.VnB()); break; case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Abs(dst.VnH(), p_reg, src.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Abs(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Abs(dst.VnD(), p_reg, src.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fabs(dst.VnS(), p_reg, src.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fabs(dst.VnD(), p_reg, src.VnD()); break; default: @@ -394,9 +382,9 @@ void InstructionCodeGeneratorARM64Sve::VisitVecNot(HVecNot* instruction) { const ZRegister src = 
ZRegisterFrom(locations->InAt(0)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: // special case boolean-not - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Dup(dst.VnB(), 1); __ Eor(dst.VnB(), p_reg, dst.VnB(), src.VnB()); break; @@ -454,31 +442,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAdd(HVecAdd* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Add(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Add(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Add(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Add(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fadd(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fadd(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation); break; default: @@ -518,31 +501,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSub(HVecSub* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Sub(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Sub(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Sub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Sub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fsub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fsub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: @@ -572,31 +550,26 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMul(HVecMul* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Mul(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB()); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Mul(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH()); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, 
instruction->GetVectorLength()); __ Mul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Mul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fmul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fmul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation); break; default: @@ -616,15 +589,14 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDiv(HVecDiv* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); // Note: VIXL guarantees StrictNaNPropagation for Fdiv. switch (instruction->GetPackedType()) { case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Fdiv(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS()); break; case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Fdiv(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD()); break; default: @@ -665,6 +637,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecAnd(HVecAnd* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -709,6 +682,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecOr(HVecOr* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -744,6 +718,7 @@ void InstructionCodeGeneratorARM64Sve::VisitVecXor(HVecXor* instruction) { const ZRegister rhs = ZRegisterFrom(locations->InAt(1)); const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -799,23 +774,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShl(HVecShl* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Lsl(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Lsl(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Lsl(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Lsl(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -835,23 +807,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecShr(HVecShr* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = 
locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Asr(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Asr(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Asr(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Asr(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -871,23 +840,20 @@ void InstructionCodeGeneratorARM64Sve::VisitVecUShr(HVecUShr* instruction) { const ZRegister dst = ZRegisterFrom(locations->Out()); const PRegisterM p_reg = LoopPReg().Merging(); int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Lsr(dst.VnB(), p_reg, lhs.VnB(), value); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Lsr(dst.VnH(), p_reg, lhs.VnH(), value); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Lsr(dst.VnS(), p_reg, lhs.VnS(), value); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Lsr(dst.VnD(), p_reg, lhs.VnD(), value); break; default: @@ -943,26 +909,23 @@ void InstructionCodeGeneratorARM64Sve::VisitVecSetScalars(HVecSetScalars* instru if (IsZeroBitPattern(instruction->InputAt(0))) { return; } + ValidateVectorLength(instruction); // Set required elements. 
switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Mov(dst.V16B(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Mov(dst.V8H(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Mov(dst.V4S(), 0, InputRegisterAt(instruction, 0)); break; case DataType::Type::kInt64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Mov(dst.V2D(), 0, InputRegisterAt(instruction, 0)); break; default: @@ -1009,11 +972,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( const PRegisterM p_reg = LoopPReg().Merging(); DCHECK(locations->InAt(0).Equals(locations->Out())); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB()); } else { @@ -1022,7 +985,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnH(), p_reg, acc.VnB(), left.VnH(), right.VnH()); } else { @@ -1030,7 +992,6 @@ void InstructionCodeGeneratorARM64Sve::VisitVecMultiplyAccumulate( } break; case DataType::Type::kInt32: - DCHECK_EQ(4u, instruction->GetVectorLength()); if (instruction->GetOpKind() == HInstruction::kAdd) { __ Mla(acc.VnS(), p_reg, acc.VnB(), left.VnS(), right.VnS()); } else { @@ -1077,12 +1038,11 @@ void InstructionCodeGeneratorARM64Sve::VisitVecDotProd(HVecDotProd* instruction) DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()), HVecOperation::ToSignedType(b->GetPackedType())); DCHECK_EQ(instruction->GetPackedType(), DataType::Type::kInt32); - DCHECK_EQ(4u, instruction->GetVectorLength()); + ValidateVectorLength(instruction); size_t inputs_data_size = DataType::Size(a->GetPackedType()); switch (inputs_data_size) { case 1u: { - DCHECK_EQ(16u, a->GetVectorLength()); UseScratchRegisterScope temps(GetVIXLAssembler()); const ZRegister tmp0 = temps.AcquireZ(); const ZRegister tmp1 = ZRegisterFrom(locations->GetTemp(0)); @@ -1143,30 +1103,27 @@ void InstructionCodeGeneratorARM64Sve::VisitVecLoad(HVecLoad* instruction) { UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; const PRegisterZ p_reg = LoopPReg().Zeroing(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kInt16: // (short) s.charAt(.) can yield HVecLoad/Int16/StringCharAt. 
case DataType::Type::kUint16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Ld1h(reg.VnH(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Ld1b(reg.VnB(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Ld1w(reg.VnS(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Ld1d(reg.VnD(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; @@ -1188,30 +1145,27 @@ void InstructionCodeGeneratorARM64Sve::VisitVecStore(HVecStore* instruction) { UseScratchRegisterScope temps(GetVIXLAssembler()); Register scratch; const PRegisterZ p_reg = LoopPReg().Zeroing(); + ValidateVectorLength(instruction); switch (instruction->GetPackedType()) { case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ St1b(reg.VnB(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ St1h(reg.VnH(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ St1w(reg.VnS(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ St1d(reg.VnD(), p_reg, VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch)); break; @@ -1237,22 +1191,18 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredSetAll(HVecPredSetAll* instru case DataType::Type::kBool: case DataType::Type::kUint8: case DataType::Type::kInt8: - DCHECK_EQ(16u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnB(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kUint16: case DataType::Type::kInt16: - DCHECK_EQ(8u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt32: case DataType::Type::kFloat32: - DCHECK_EQ(4u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL); break; case DataType::Type::kInt64: case DataType::Type::kFloat64: - DCHECK_EQ(2u, instruction->GetVectorLength()); __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL); break; default: @@ -1295,17 +1245,19 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredWhile(HVecPredWhile* instruct Register left = InputRegisterAt(instruction, 0); Register right = InputRegisterAt(instruction, 1); - switch (instruction->GetVectorLength()) { - case 16u: + DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % instruction->GetVectorLength(), 0u); + + switch (codegen_->GetSIMDRegisterWidth() / instruction->GetVectorLength()) { + case 1u: __ Whilelo(LoopPReg().VnB(), left, right); break; - case 8u: + case 2u: __ Whilelo(LoopPReg().VnH(), left, right); break; case 4u: __ Whilelo(LoopPReg().VnS(), left, right); break; - case 2u: + case 8u: __ 
Whilelo(LoopPReg().VnD(), left, right); break; default: @@ -1333,52 +1285,103 @@ void InstructionCodeGeneratorARM64Sve::VisitVecPredCondition(HVecPredCondition* Location InstructionCodeGeneratorARM64Sve::AllocateSIMDScratchLocation( vixl::aarch64::UseScratchRegisterScope* scope) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - return LocationFrom(scope->AcquireVRegisterOfSize(kQRegSize)); + return LocationFrom(scope->AcquireZ()); } void InstructionCodeGeneratorARM64Sve::FreeSIMDScratchLocation(Location loc, vixl::aarch64::UseScratchRegisterScope* scope) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - scope->Release(QRegisterFrom(loc)); + scope->Release(ZRegisterFrom(loc)); } void InstructionCodeGeneratorARM64Sve::LoadSIMDRegFromStack(Location destination, Location source) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - __ Ldr(QRegisterFrom(destination), StackOperandFrom(source)); + __ Ldr(ZRegisterFrom(destination), SveStackOperandFrom(source)); } void InstructionCodeGeneratorARM64Sve::MoveSIMDRegToSIMDReg(Location destination, Location source) { - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); - __ Mov(QRegisterFrom(destination), QRegisterFrom(source)); + __ Mov(ZRegisterFrom(destination), ZRegisterFrom(source)); } void InstructionCodeGeneratorARM64Sve::MoveToSIMDStackSlot(Location destination, Location source) { DCHECK(destination.IsSIMDStackSlot()); - DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes); if (source.IsFpuRegister()) { - __ Str(QRegisterFrom(source), StackOperandFrom(destination)); + __ Str(ZRegisterFrom(source), SveStackOperandFrom(destination)); } else { DCHECK(source.IsSIMDStackSlot()); UseScratchRegisterScope temps(GetVIXLAssembler()); if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) { + // Very rare situation, only when there are cycles in ParallelMoveResolver graph. const Register temp = temps.AcquireX(); - __ Ldr(temp, MemOperand(sp, source.GetStackIndex())); - __ Str(temp, MemOperand(sp, destination.GetStackIndex())); - __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize)); - __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize)); + DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % kArm64WordSize, 0u); + // Emit a number of LDR/STR (XRegister, 64-bit) to cover the whole SIMD register size + // when copying a stack slot. 
+ for (size_t offset = 0, e = codegen_->GetSIMDRegisterWidth(); + offset < e; + offset += kArm64WordSize) { + __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + offset)); + __ Str(temp, MemOperand(sp, destination.GetStackIndex() + offset)); + } } else { - const VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize); - __ Ldr(temp, StackOperandFrom(source)); - __ Str(temp, StackOperandFrom(destination)); + const ZRegister temp = temps.AcquireZ(); + __ Ldr(temp, SveStackOperandFrom(source)); + __ Str(temp, SveStackOperandFrom(destination)); } } } +template <bool is_save> +void SaveRestoreLiveRegistersHelperSveImpl(CodeGeneratorARM64* codegen, + LocationSummary* locations, + int64_t spill_offset) { + const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true); + const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false); + DCHECK(helpers::ArtVixlRegCodeCoherentForRegSet(core_spills, + codegen->GetNumberOfCoreRegisters(), + fp_spills, + codegen->GetNumberOfFloatingPointRegisters())); + MacroAssembler* masm = codegen->GetVIXLAssembler(); + Register base = masm->StackPointer(); + + CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills); + int64_t core_spill_size = core_list.GetTotalSizeInBytes(); + int64_t fp_spill_offset = spill_offset + core_spill_size; + + if (codegen->GetGraph()->HasSIMD()) { + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + } + codegen->GetAssembler()->SaveRestoreZRegisterList<is_save>(fp_spills, fp_spill_offset); + return; + } + + // Case when we only need to restore D-registers. + DCHECK(!codegen->GetGraph()->HasSIMD()); + DCHECK_LE(codegen->GetSlowPathFPWidth(), kDRegSizeInBytes); + CPURegList fp_list = CPURegList(CPURegister::kVRegister, kDRegSize, fp_spills); + if (is_save) { + masm->StoreCPURegList(core_list, MemOperand(base, spill_offset)); + masm->StoreCPURegList(fp_list, MemOperand(base, fp_spill_offset)); + } else { + masm->LoadCPURegList(core_list, MemOperand(base, spill_offset)); + masm->LoadCPURegList(fp_list, MemOperand(base, fp_spill_offset)); + } +} + +void InstructionCodeGeneratorARM64Sve::SaveLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperSveImpl</* is_save= */ true>(codegen_, locations, spill_offset); +} + +void InstructionCodeGeneratorARM64Sve::RestoreLiveRegistersHelper(LocationSummary* locations, + int64_t spill_offset) { + SaveRestoreLiveRegistersHelperSveImpl</* is_save= */ false>(codegen_, locations, spill_offset); +} + #undef __ } // namespace arm64 diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h index 72207816e1..81c6561318 100644 --- a/compiler/optimizing/common_arm64.h +++ b/compiler/optimizing/common_arm64.h @@ -182,6 +182,10 @@ inline vixl::aarch64::MemOperand StackOperandFrom(Location location) { return vixl::aarch64::MemOperand(vixl::aarch64::sp, location.GetStackIndex()); } +inline vixl::aarch64::SVEMemOperand SveStackOperandFrom(Location location) { + return vixl::aarch64::SVEMemOperand(vixl::aarch64::sp, location.GetStackIndex()); +} + inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base, size_t offset = 0) { // A heap reference must be 32bit, so fit in a W register. 
@@ -215,6 +219,10 @@ inline Location LocationFrom(const vixl::aarch64::VRegister& fpreg) { return Location::FpuRegisterLocation(fpreg.GetCode()); } +inline Location LocationFrom(const vixl::aarch64::ZRegister& zreg) { + return Location::FpuRegisterLocation(zreg.GetCode()); +} + inline vixl::aarch64::Operand OperandFromMemOperand( const vixl::aarch64::MemOperand& mem_op) { if (mem_op.IsImmediateOffset()) { diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc index 1210dbe67b..02ee4ec057 100644 --- a/compiler/optimizing/loop_optimization.cc +++ b/compiler/optimizing/loop_optimization.cc @@ -946,9 +946,10 @@ bool HLoopOptimization::ShouldVectorize(LoopNode* node, HBasicBlock* block, int6 // make one particular reference aligned), never to exceed (1). // (3) variable to record how many references share same alignment. // (4) variable to record suitable candidate for dynamic loop peeling. - uint32_t desired_alignment = GetVectorSizeInBytes(); - DCHECK_LE(desired_alignment, 16u); - uint32_t peeling_votes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + size_t desired_alignment = GetVectorSizeInBytes(); + ScopedArenaVector<uint32_t> peeling_votes(desired_alignment, 0u, + loop_allocator_->Adapter(kArenaAllocLoopOptimization)); + uint32_t max_num_same_alignment = 0; const ArrayReference* peeling_candidate = nullptr; @@ -1577,14 +1578,6 @@ bool HLoopOptimization::VectorizeUse(LoopNode* node, } uint32_t HLoopOptimization::GetVectorSizeInBytes() { - if (kIsDebugBuild) { - InstructionSet isa = compiler_options_->GetInstructionSet(); - // TODO: Remove this check when there are no implicit assumptions on the SIMD reg size. - DCHECK_EQ(simd_register_size_, (isa == InstructionSet::kArm || isa == InstructionSet::kThumb2) - ? 8u - : 16u); - } - return simd_register_size_; } @@ -1616,6 +1609,8 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict if (IsInPredicatedVectorizationMode()) { // SVE vectorization. 
CHECK(features->AsArm64InstructionSetFeatures()->HasSVE()); + size_t vector_length = simd_register_size_ / DataType::Size(type); + DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u); switch (type) { case DataType::Type::kBool: case DataType::Type::kUint8: @@ -1625,7 +1620,7 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict kNoUnsignedHAdd | kNoUnroundedHAdd | kNoSAD; - return TrySetVectorLength(type, 16); + return TrySetVectorLength(type, vector_length); case DataType::Type::kUint16: case DataType::Type::kInt16: *restrictions |= kNoDiv | @@ -1634,19 +1629,19 @@ bool HLoopOptimization::TrySetVectorType(DataType::Type type, uint64_t* restrict kNoUnroundedHAdd | kNoSAD | kNoDotProd; - return TrySetVectorLength(type, 8); + return TrySetVectorLength(type, vector_length); case DataType::Type::kInt32: *restrictions |= kNoDiv | kNoSAD; - return TrySetVectorLength(type, 4); + return TrySetVectorLength(type, vector_length); case DataType::Type::kInt64: *restrictions |= kNoDiv | kNoSAD; - return TrySetVectorLength(type, 2); + return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat32: *restrictions |= kNoReduction; - return TrySetVectorLength(type, 4); + return TrySetVectorLength(type, vector_length); case DataType::Type::kFloat64: *restrictions |= kNoReduction; - return TrySetVectorLength(type, 2); + return TrySetVectorLength(type, vector_length); default: break; } @@ -2311,12 +2306,12 @@ Alignment HLoopOptimization::ComputeAlignment(HInstruction* offset, return Alignment(DataType::Size(type), 0); } -void HLoopOptimization::SetAlignmentStrategy(uint32_t peeling_votes[], +void HLoopOptimization::SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, const ArrayReference* peeling_candidate) { // Current heuristic: pick the best static loop peeling factor, if any, // or otherwise use dynamic loop peeling on suggested peeling candidate. uint32_t max_vote = 0; - for (int32_t i = 0; i < 16; i++) { + for (size_t i = 0; i < peeling_votes.size(); i++) { if (peeling_votes[i] > max_vote) { max_vote = peeling_votes[i]; vector_static_peeling_factor_ = i; diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h index 0d76804d9c..d3583ed8a6 100644 --- a/compiler/optimizing/loop_optimization.h +++ b/compiler/optimizing/loop_optimization.h @@ -238,7 +238,7 @@ class HLoopOptimization : public HOptimization { DataType::Type type, bool is_string_char_at, uint32_t peeling = 0); - void SetAlignmentStrategy(uint32_t peeling_votes[], + void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes, const ArrayReference* peeling_candidate); uint32_t MaxNumberPeeled(); bool IsVectorizationProfitable(int64_t trip_count); |