ARM64: Support SVE VL other than 128-bit.

The Arm SVE register size is not fixed; it can be
any multiple of 128 bits. To support this, the patch
removes the explicit assumption of a 128-bit SIMD
register size from the vectorizer and the code
generators and enables autovectorization with a
configurable SVE vector length, e.g. by extending
the SIMD register save/restore routines.
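
The number of vector lanes for a packed type is now derived from
the configured register width instead of being hard-coded (see
ValidateVectorLength() and TrySetVectorLength() below). A minimal
standalone sketch of that relation (illustrative C++, not ART code):

    #include <cassert>
    #include <cstddef>

    // Lanes per vector when the full SIMD register is used: NEON has a
    // fixed 16-byte register, SVE uses the configured vector length
    // (a multiple of 16 bytes).
    size_t Lanes(size_t element_size, size_t reg_width_in_bytes) {
      assert(reg_width_in_bytes % element_size == 0);
      return reg_width_in_bytes / element_size;
    }
    // e.g. Lanes(4, 16) == 4 Int32 lanes at VL=128,
    //      Lanes(4, 32) == 8 Int32 lanes at VL=256.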

Test: art SIMD tests on VIXL simulator.
Test: art tests on FVP (steps in test/README.arm_fvp.md)
      with FVP arg:
      -C SVE.ScalableVectorExtension.veclen=[2,4]
      (SVE vector [128,256] bits wide)

Change-Id: Icb46e7eb17f21d3bd38b16dd50f735c29b316427
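
For context, the extended save/restore routines spill Z registers with
VL-scaled addressing, so the generated code stays correct for any
hardware vector length. A hedged sketch of the store used by the new
SaveRestoreZRegisterList() helper in the diff below (the temp base
register and slot_no counter follow the patch):

    // Store the Z register with code `i` at [temp + slot_no * VL] bytes;
    // SVE_MUL_VL scales the immediate offset by the vector length.
    vixl_masm_.Str(vixl::aarch64::ZRegister(i),
                   vixl::aarch64::SVEMemOperand(temp, slot_no, vixl::aarch64::SVE_MUL_VL));
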
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index f5d7836..bdc5e2d 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -169,54 +169,6 @@
 #define __ down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler()->  // NOLINT
 #define QUICK_ENTRY_POINT(x) QUICK_ENTRYPOINT_OFFSET(kArm64PointerSize, x).Int32Value()
 
-// Calculate memory accessing operand for save/restore live registers.
-static void SaveRestoreLiveRegistersHelper(CodeGenerator* codegen,
-                                           LocationSummary* locations,
-                                           int64_t spill_offset,
-                                           bool is_save) {
-  const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true);
-  const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false);
-  DCHECK(ArtVixlRegCodeCoherentForRegSet(core_spills,
-                                         codegen->GetNumberOfCoreRegisters(),
-                                         fp_spills,
-                                         codegen->GetNumberOfFloatingPointRegisters()));
-
-  CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills);
-  const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8;
-  DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes);
-  CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills);
-
-  MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler();
-  UseScratchRegisterScope temps(masm);
-
-  Register base = masm->StackPointer();
-  int64_t core_spill_size = core_list.GetTotalSizeInBytes();
-  int64_t fp_spill_size = fp_list.GetTotalSizeInBytes();
-  int64_t reg_size = kXRegSizeInBytes;
-  int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size;
-  uint32_t ls_access_size = WhichPowerOf2(reg_size);
-  if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) &&
-      !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) {
-    // If the offset does not fit in the instruction's immediate field, use an alternate register
-    // to compute the base address(float point registers spill base address).
-    Register new_base = temps.AcquireSameSizeAs(base);
-    __ Add(new_base, base, Operand(spill_offset + core_spill_size));
-    base = new_base;
-    spill_offset = -core_spill_size;
-    int64_t new_max_ls_pair_offset = fp_spill_size - 2 * reg_size;
-    DCHECK(masm->IsImmLSPair(spill_offset, ls_access_size));
-    DCHECK(masm->IsImmLSPair(new_max_ls_pair_offset, ls_access_size));
-  }
-
-  if (is_save) {
-    __ StoreCPURegList(core_list, MemOperand(base, spill_offset));
-    __ StoreCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size));
-  } else {
-    __ LoadCPURegList(core_list, MemOperand(base, spill_offset));
-    __ LoadCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size));
-  }
-}
-
 void SlowPathCodeARM64::SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) {
   size_t stack_offset = codegen->GetFirstRegisterSlotInSlowPath();
   const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true);
@@ -240,15 +192,15 @@
     stack_offset += fp_reg_size;
   }
 
-  SaveRestoreLiveRegistersHelper(codegen,
-                                 locations,
-                                 codegen->GetFirstRegisterSlotInSlowPath(), /* is_save= */ true);
+  InstructionCodeGeneratorARM64* visitor =
+      down_cast<CodeGeneratorARM64*>(codegen)->GetInstructionCodeGeneratorArm64();
+  visitor->SaveLiveRegistersHelper(locations, codegen->GetFirstRegisterSlotInSlowPath());
 }
 
 void SlowPathCodeARM64::RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations) {
-  SaveRestoreLiveRegistersHelper(codegen,
-                                 locations,
-                                 codegen->GetFirstRegisterSlotInSlowPath(), /* is_save= */ false);
+  InstructionCodeGeneratorARM64* visitor =
+      down_cast<CodeGeneratorARM64*>(codegen)->GetInstructionCodeGeneratorArm64();
+  visitor->RestoreLiveRegistersHelper(locations, codegen->GetFirstRegisterSlotInSlowPath());
 }
 
 class BoundsCheckSlowPathARM64 : public SlowPathCodeARM64 {
@@ -997,6 +949,12 @@
   return GetInstructionSetFeatures().HasSVE();
 }
 
+size_t CodeGeneratorARM64::GetSIMDRegisterWidth() const {
+  return SupportsPredicatedSIMD()
+      ? GetInstructionSetFeatures().GetSVEVectorLength() / kBitsPerByte
+      : vixl::aarch64::kQRegSizeInBytes;
+}
+
 #define __ GetVIXLAssembler()->
 
 void CodeGeneratorARM64::EmitJumpTables() {
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index eb3e954..d4546e5 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -309,6 +309,10 @@
   virtual void LoadSIMDRegFromStack(Location destination, Location source) = 0;
   virtual void MoveSIMDRegToSIMDReg(Location destination, Location source) = 0;
   virtual void MoveToSIMDStackSlot(Location destination, Location source) = 0;
+  virtual void SaveLiveRegistersHelper(LocationSummary* locations,
+                                       int64_t spill_offset) = 0;
+  virtual void RestoreLiveRegistersHelper(LocationSummary* locations,
+                                          int64_t spill_offset) = 0;
 
  protected:
   void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path,
@@ -462,6 +466,8 @@
   void LoadSIMDRegFromStack(Location destination, Location source) override;
   void MoveSIMDRegToSIMDReg(Location destination, Location source) override;
   void MoveToSIMDStackSlot(Location destination, Location source) override;
+  void SaveLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override;
+  void RestoreLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override;
 };
 
 class LocationsBuilderARM64Neon : public LocationsBuilderARM64 {
@@ -495,8 +501,14 @@
   void LoadSIMDRegFromStack(Location destination, Location source) override;
   void MoveSIMDRegToSIMDReg(Location destination, Location source) override;
   void MoveToSIMDStackSlot(Location destination, Location source) override;
+  void SaveLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override;
+  void RestoreLiveRegistersHelper(LocationSummary* locations, int64_t spill_offset) override;
 
  private:
+  // Validate that the instruction's vector length and packed type are consistent with the
+  // SIMD register size (i.e. the full SIMD register is used).
+  void ValidateVectorLength(HVecOperation* instr) const;
+
   // Returns default predicate register which is used as governing vector predicate
   // to implement predicated loop execution.
   //
@@ -579,9 +591,7 @@
     return vixl::aarch64::kDRegSizeInBytes;
   }
 
-  size_t GetSIMDRegisterWidth() const override {
-    return vixl::aarch64::kQRegSizeInBytes;
-  }
+  size_t GetSIMDRegisterWidth() const override;
 
   uintptr_t GetAddressOf(HBasicBlock* block) override {
     vixl::aarch64::Label* block_entry_label = GetLabelOf(block);
diff --git a/compiler/optimizing/code_generator_vector_arm64_neon.cc b/compiler/optimizing/code_generator_vector_arm64_neon.cc
index bd64166..0fe9898 100644
--- a/compiler/optimizing/code_generator_vector_arm64_neon.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_neon.cc
@@ -17,6 +17,7 @@
 #include "code_generator_arm64.h"
 
 #include "arch/arm64/instruction_set_features_arm64.h"
+#include "base/bit_utils_iterator.h"
 #include "mirror/array-inl.h"
 #include "mirror/string.h"
 
@@ -1590,6 +1591,64 @@
   }
 }
 
+// Save or restore the live registers of a slow path at the given spill offset.
+template <bool is_save>
+void SaveRestoreLiveRegistersHelperNeonImpl(CodeGeneratorARM64* codegen,
+                                            LocationSummary* locations,
+                                            int64_t spill_offset) {
+  const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true);
+  const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false);
+  DCHECK(helpers::ArtVixlRegCodeCoherentForRegSet(core_spills,
+                                                  codegen->GetNumberOfCoreRegisters(),
+                                                  fp_spills,
+                                                  codegen->GetNumberOfFloatingPointRegisters()));
+
+  CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills);
+  const unsigned v_reg_size_in_bits = codegen->GetSlowPathFPWidth() * 8;
+  DCHECK_LE(codegen->GetSIMDRegisterWidth(), kQRegSizeInBytes);
+  CPURegList fp_list = CPURegList(CPURegister::kVRegister, v_reg_size_in_bits, fp_spills);
+
+  MacroAssembler* masm = codegen->GetVIXLAssembler();
+  UseScratchRegisterScope temps(masm);
+
+  Register base = masm->StackPointer();
+  int64_t core_spill_size = core_list.GetTotalSizeInBytes();
+  int64_t fp_spill_size = fp_list.GetTotalSizeInBytes();
+  int64_t reg_size = kXRegSizeInBytes;
+  int64_t max_ls_pair_offset = spill_offset + core_spill_size + fp_spill_size - 2 * reg_size;
+  uint32_t ls_access_size = WhichPowerOf2(reg_size);
+  if (((core_list.GetCount() > 1) || (fp_list.GetCount() > 1)) &&
+      !masm->IsImmLSPair(max_ls_pair_offset, ls_access_size)) {
+    // If the offset does not fit in the instruction's immediate field, use an alternate register
+    // to compute the base address (the floating-point registers' spill base address).
+    Register new_base = temps.AcquireSameSizeAs(base);
+    masm->Add(new_base, base, Operand(spill_offset + core_spill_size));
+    base = new_base;
+    spill_offset = -core_spill_size;
+    int64_t new_max_ls_pair_offset = fp_spill_size - 2 * reg_size;
+    DCHECK(masm->IsImmLSPair(spill_offset, ls_access_size));
+    DCHECK(masm->IsImmLSPair(new_max_ls_pair_offset, ls_access_size));
+  }
+
+  if (is_save) {
+    masm->StoreCPURegList(core_list, MemOperand(base, spill_offset));
+    masm->StoreCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size));
+  } else {
+    masm->LoadCPURegList(core_list, MemOperand(base, spill_offset));
+    masm->LoadCPURegList(fp_list, MemOperand(base, spill_offset + core_spill_size));
+  }
+}
+
+void InstructionCodeGeneratorARM64Neon::SaveLiveRegistersHelper(LocationSummary* locations,
+                                                                int64_t spill_offset) {
+  SaveRestoreLiveRegistersHelperNeonImpl</* is_save= */ true>(codegen_, locations, spill_offset);
+}
+
+void InstructionCodeGeneratorARM64Neon::RestoreLiveRegistersHelper(LocationSummary* locations,
+                                                                   int64_t spill_offset) {
+  SaveRestoreLiveRegistersHelperNeonImpl</* is_save= */ false>(codegen_, locations, spill_offset);
+}
+
 #undef __
 
 }  // namespace arm64
diff --git a/compiler/optimizing/code_generator_vector_arm64_sve.cc b/compiler/optimizing/code_generator_vector_arm64_sve.cc
index 2254673..824b6c9 100644
--- a/compiler/optimizing/code_generator_vector_arm64_sve.cc
+++ b/compiler/optimizing/code_generator_vector_arm64_sve.cc
@@ -17,6 +17,7 @@
 #include "code_generator_arm64.h"
 
 #include "arch/arm64/instruction_set_features_arm64.h"
+#include "base/bit_utils_iterator.h"
 #include "mirror/array-inl.h"
 #include "mirror/string.h"
 
@@ -33,6 +34,7 @@
 using helpers::OutputRegister;
 using helpers::QRegisterFrom;
 using helpers::StackOperandFrom;
+using helpers::SveStackOperandFrom;
 using helpers::VRegisterFrom;
 using helpers::ZRegisterFrom;
 using helpers::XRegisterFrom;
@@ -71,6 +73,11 @@
   return Location::RequiresRegister();
 }
 
+void InstructionCodeGeneratorARM64Sve::ValidateVectorLength(HVecOperation* instr) const {
+  DCHECK_EQ(DataType::Size(instr->GetPackedType()) * instr->GetVectorLength(),
+            codegen_->GetSIMDRegisterWidth());
+}
+
 void LocationsBuilderARM64Sve::VisitVecReplicateScalar(HVecReplicateScalar* instruction) {
   LocationSummary* locations = new (GetGraph()->GetAllocator()) LocationSummary(instruction);
   HInstruction* input = instruction->InputAt(0);
@@ -107,11 +114,11 @@
   LocationSummary* locations = instruction->GetLocations();
   Location src_loc = locations->InAt(0);
   const ZRegister dst = ZRegisterFrom(locations->Out());
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
         __ Dup(dst.VnB(), Int64FromLocation(src_loc));
       } else {
@@ -120,7 +127,6 @@
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
         __ Dup(dst.VnH(), Int64FromLocation(src_loc));
       } else {
@@ -128,7 +134,6 @@
       }
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
         __ Dup(dst.VnS(), Int64FromLocation(src_loc));
       } else {
@@ -136,7 +141,6 @@
       }
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
         __ Dup(dst.VnD(), Int64FromLocation(src_loc));
       } else {
@@ -144,7 +148,6 @@
       }
       break;
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
         __ Fdup(dst.VnS(), src_loc.GetConstant()->AsFloatConstant()->GetValue());
       } else {
@@ -152,7 +155,6 @@
       }
       break;
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       if (src_loc.IsConstant()) {
         __ Fdup(dst.VnD(), src_loc.GetConstant()->AsDoubleConstant()->GetValue());
       } else {
@@ -193,19 +195,16 @@
   DCHECK(instruction->IsPredicated());
   LocationSummary* locations = instruction->GetLocations();
   const VRegister src = VRegisterFrom(locations->InAt(0));
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Umov(OutputRegister(instruction), src.V4S(), 0);
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Umov(OutputRegister(instruction), src.V2D(), 0);
       break;
     case DataType::Type::kFloat32:
     case DataType::Type::kFloat64:
-      DCHECK_LE(2u, instruction->GetVectorLength());
-      DCHECK_LE(instruction->GetVectorLength(), 4u);
       DCHECK(locations->InAt(0).Equals(locations->Out()));  // no code required
       break;
     default:
@@ -251,9 +250,9 @@
   const ZRegister src = ZRegisterFrom(locations->InAt(0));
   const VRegister dst = DRegisterFrom(locations->Out());
   const PRegister p_reg = LoopPReg();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       switch (instruction->GetReductionKind()) {
         case HVecReduce::kSum:
           __ Saddv(dst.S(), p_reg, src.VnS());
@@ -264,7 +263,6 @@
       }
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       switch (instruction->GetReductionKind()) {
         case HVecReduce::kSum:
           __ Uaddv(dst.D(), p_reg, src.VnD());
@@ -292,8 +290,8 @@
   const PRegisterM p_reg = LoopPReg().Merging();
   DataType::Type from = instruction->GetInputType();
   DataType::Type to = instruction->GetResultType();
+  ValidateVectorLength(instruction);
   if (from == DataType::Type::kInt32 && to == DataType::Type::kFloat32) {
-    DCHECK_EQ(4u, instruction->GetVectorLength());
     __ Scvtf(dst.VnS(), p_reg, src.VnS());
   } else {
     LOG(FATAL) << "Unsupported SIMD type: " << instruction->GetPackedType();
@@ -310,31 +308,26 @@
   const ZRegister src = ZRegisterFrom(locations->InAt(0));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Neg(dst.VnB(), p_reg, src.VnB());
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Neg(dst.VnH(), p_reg, src.VnH());
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Neg(dst.VnS(), p_reg, src.VnS());
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Neg(dst.VnD(), p_reg, src.VnD());
       break;
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fneg(dst.VnS(), p_reg, src.VnS());
       break;
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Fneg(dst.VnD(), p_reg, src.VnD());
       break;
     default:
@@ -353,29 +346,24 @@
   const ZRegister src = ZRegisterFrom(locations->InAt(0));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Abs(dst.VnB(), p_reg, src.VnB());
       break;
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Abs(dst.VnH(), p_reg, src.VnH());
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Abs(dst.VnS(), p_reg, src.VnS());
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Abs(dst.VnD(), p_reg, src.VnD());
       break;
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fabs(dst.VnS(), p_reg, src.VnS());
       break;
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Fabs(dst.VnD(), p_reg, src.VnD());
       break;
     default:
@@ -394,9 +382,9 @@
   const ZRegister src = ZRegisterFrom(locations->InAt(0));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:  // special case boolean-not
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Dup(dst.VnB(), 1);
       __ Eor(dst.VnB(), p_reg, dst.VnB(), src.VnB());
       break;
@@ -454,31 +442,26 @@
   const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Add(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Add(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Add(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Add(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fadd(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation);
       break;
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Fadd(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation);
       break;
     default:
@@ -518,31 +501,26 @@
   const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Sub(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Sub(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Sub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Sub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fsub(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Fsub(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     default:
@@ -572,31 +550,26 @@
   const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Mul(dst.VnB(), p_reg, lhs.VnB(), rhs.VnB());
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Mul(dst.VnH(), p_reg, lhs.VnH(), rhs.VnH());
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Mul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Mul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fmul(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS(), StrictNaNPropagation);
       break;
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Fmul(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD(), StrictNaNPropagation);
       break;
     default:
@@ -616,15 +589,14 @@
   const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
 
   // Note: VIXL guarantees StrictNaNPropagation for Fdiv.
   switch (instruction->GetPackedType()) {
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Fdiv(dst.VnS(), p_reg, lhs.VnS(), rhs.VnS());
       break;
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Fdiv(dst.VnD(), p_reg, lhs.VnD(), rhs.VnD());
       break;
     default:
@@ -665,6 +637,7 @@
   const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
@@ -709,6 +682,7 @@
   const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
@@ -744,6 +718,7 @@
   const ZRegister rhs = ZRegisterFrom(locations->InAt(1));
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
@@ -799,23 +774,20 @@
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Lsl(dst.VnB(), p_reg, lhs.VnB(), value);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Lsl(dst.VnH(), p_reg, lhs.VnH(), value);
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Lsl(dst.VnS(), p_reg, lhs.VnS(), value);
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Lsl(dst.VnD(), p_reg, lhs.VnD(), value);
       break;
     default:
@@ -835,23 +807,20 @@
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Asr(dst.VnB(), p_reg, lhs.VnB(), value);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Asr(dst.VnH(), p_reg, lhs.VnH(), value);
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Asr(dst.VnS(), p_reg, lhs.VnS(), value);
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Asr(dst.VnD(), p_reg, lhs.VnD(), value);
       break;
     default:
@@ -871,23 +840,20 @@
   const ZRegister dst = ZRegisterFrom(locations->Out());
   const PRegisterM p_reg = LoopPReg().Merging();
   int32_t value = locations->InAt(1).GetConstant()->AsIntConstant()->GetValue();
+  ValidateVectorLength(instruction);
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Lsr(dst.VnB(), p_reg, lhs.VnB(), value);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Lsr(dst.VnH(), p_reg, lhs.VnH(), value);
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Lsr(dst.VnS(), p_reg, lhs.VnS(), value);
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Lsr(dst.VnD(), p_reg, lhs.VnD(), value);
       break;
     default:
@@ -943,26 +909,23 @@
   if (IsZeroBitPattern(instruction->InputAt(0))) {
     return;
   }
+  ValidateVectorLength(instruction);
 
   // Set required elements.
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Mov(dst.V16B(), 0, InputRegisterAt(instruction, 0));
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Mov(dst.V8H(), 0, InputRegisterAt(instruction, 0));
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Mov(dst.V4S(), 0, InputRegisterAt(instruction, 0));
       break;
     case DataType::Type::kInt64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Mov(dst.V2D(), 0, InputRegisterAt(instruction, 0));
       break;
     default:
@@ -1009,11 +972,11 @@
   const PRegisterM p_reg = LoopPReg().Merging();
 
   DCHECK(locations->InAt(0).Equals(locations->Out()));
+  ValidateVectorLength(instruction);
 
   switch (instruction->GetPackedType()) {
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.VnB(), p_reg, acc.VnB(), left.VnB(), right.VnB());
       } else {
@@ -1022,7 +985,6 @@
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.VnH(), p_reg, acc.VnB(), left.VnH(), right.VnH());
       } else {
@@ -1030,7 +992,6 @@
       }
       break;
     case DataType::Type::kInt32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.VnS(), p_reg, acc.VnB(), left.VnS(), right.VnS());
       } else {
@@ -1077,12 +1038,11 @@
   DCHECK_EQ(HVecOperation::ToSignedType(a->GetPackedType()),
             HVecOperation::ToSignedType(b->GetPackedType()));
   DCHECK_EQ(instruction->GetPackedType(), DataType::Type::kInt32);
-  DCHECK_EQ(4u, instruction->GetVectorLength());
+  ValidateVectorLength(instruction);
 
   size_t inputs_data_size = DataType::Size(a->GetPackedType());
   switch (inputs_data_size) {
     case 1u: {
-      DCHECK_EQ(16u, a->GetVectorLength());
       UseScratchRegisterScope temps(GetVIXLAssembler());
       const ZRegister tmp0 = temps.AcquireZ();
       const ZRegister tmp1 = ZRegisterFrom(locations->GetTemp(0));
@@ -1143,30 +1103,27 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register scratch;
   const PRegisterZ p_reg = LoopPReg().Zeroing();
+  ValidateVectorLength(instruction);
 
   switch (instruction->GetPackedType()) {
     case DataType::Type::kInt16:  // (short) s.charAt(.) can yield HVecLoad/Int16/StringCharAt.
     case DataType::Type::kUint16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Ld1h(reg.VnH(), p_reg,
               VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Ld1b(reg.VnB(), p_reg,
               VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     case DataType::Type::kInt32:
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Ld1w(reg.VnS(), p_reg,
               VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     case DataType::Type::kInt64:
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Ld1d(reg.VnD(), p_reg,
               VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
@@ -1188,30 +1145,27 @@
   UseScratchRegisterScope temps(GetVIXLAssembler());
   Register scratch;
   const PRegisterZ p_reg = LoopPReg().Zeroing();
+  ValidateVectorLength(instruction);
 
   switch (instruction->GetPackedType()) {
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ St1b(reg.VnB(), p_reg,
           VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ St1h(reg.VnH(), p_reg,
           VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     case DataType::Type::kInt32:
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ St1w(reg.VnS(), p_reg,
           VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
     case DataType::Type::kInt64:
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ St1d(reg.VnD(), p_reg,
           VecSVEAddress(instruction, &temps, size, /*is_string_char_at*/ false, &scratch));
       break;
@@ -1237,22 +1191,18 @@
     case DataType::Type::kBool:
     case DataType::Type::kUint8:
     case DataType::Type::kInt8:
-      DCHECK_EQ(16u, instruction->GetVectorLength());
       __ Ptrue(p_reg.VnB(), vixl::aarch64::SVE_ALL);
       break;
     case DataType::Type::kUint16:
     case DataType::Type::kInt16:
-      DCHECK_EQ(8u, instruction->GetVectorLength());
       __ Ptrue(p_reg.VnH(), vixl::aarch64::SVE_ALL);
       break;
     case DataType::Type::kInt32:
     case DataType::Type::kFloat32:
-      DCHECK_EQ(4u, instruction->GetVectorLength());
       __ Ptrue(p_reg.VnS(), vixl::aarch64::SVE_ALL);
       break;
     case DataType::Type::kInt64:
     case DataType::Type::kFloat64:
-      DCHECK_EQ(2u, instruction->GetVectorLength());
       __ Ptrue(p_reg.VnD(), vixl::aarch64::SVE_ALL);
       break;
     default:
@@ -1295,17 +1245,19 @@
   Register left = InputRegisterAt(instruction, 0);
   Register right = InputRegisterAt(instruction, 1);
 
-  switch (instruction->GetVectorLength()) {
-    case 16u:
+  DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % instruction->GetVectorLength(), 0u);
+
+  switch (codegen_->GetSIMDRegisterWidth() / instruction->GetVectorLength()) {
+    case 1u:
       __ Whilelo(LoopPReg().VnB(), left, right);
       break;
-    case 8u:
+    case 2u:
       __ Whilelo(LoopPReg().VnH(), left, right);
       break;
     case 4u:
       __ Whilelo(LoopPReg().VnS(), left, right);
       break;
-    case 2u:
+    case 8u:
       __ Whilelo(LoopPReg().VnD(), left, right);
       break;
     default:
@@ -1333,52 +1285,103 @@
 
 Location InstructionCodeGeneratorARM64Sve::AllocateSIMDScratchLocation(
     vixl::aarch64::UseScratchRegisterScope* scope) {
-  DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes);
-  return LocationFrom(scope->AcquireVRegisterOfSize(kQRegSize));
+  return LocationFrom(scope->AcquireZ());
 }
 
 void InstructionCodeGeneratorARM64Sve::FreeSIMDScratchLocation(Location loc,
     vixl::aarch64::UseScratchRegisterScope* scope) {
-  DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes);
-  scope->Release(QRegisterFrom(loc));
+  scope->Release(ZRegisterFrom(loc));
 }
 
 void InstructionCodeGeneratorARM64Sve::LoadSIMDRegFromStack(Location destination,
                                                             Location source) {
-  DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes);
-  __ Ldr(QRegisterFrom(destination), StackOperandFrom(source));
+  __ Ldr(ZRegisterFrom(destination), SveStackOperandFrom(source));
 }
 
 void InstructionCodeGeneratorARM64Sve::MoveSIMDRegToSIMDReg(Location destination,
                                                             Location source) {
-  DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes);
-  __ Mov(QRegisterFrom(destination), QRegisterFrom(source));
+  __ Mov(ZRegisterFrom(destination), ZRegisterFrom(source));
 }
 
 void InstructionCodeGeneratorARM64Sve::MoveToSIMDStackSlot(Location destination,
                                                            Location source) {
   DCHECK(destination.IsSIMDStackSlot());
-  DCHECK_EQ(codegen_->GetSIMDRegisterWidth(), kQRegSizeInBytes);
 
   if (source.IsFpuRegister()) {
-    __ Str(QRegisterFrom(source), StackOperandFrom(destination));
+    __ Str(ZRegisterFrom(source), SveStackOperandFrom(destination));
   } else {
     DCHECK(source.IsSIMDStackSlot());
     UseScratchRegisterScope temps(GetVIXLAssembler());
     if (GetVIXLAssembler()->GetScratchVRegisterList()->IsEmpty()) {
+      // Very rare case: this only happens when there are cycles in the ParallelMoveResolver graph.
       const Register temp = temps.AcquireX();
-      __ Ldr(temp, MemOperand(sp, source.GetStackIndex()));
-      __ Str(temp, MemOperand(sp, destination.GetStackIndex()));
-      __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + kArm64WordSize));
-      __ Str(temp, MemOperand(sp, destination.GetStackIndex() + kArm64WordSize));
+      DCHECK_EQ(codegen_->GetSIMDRegisterWidth() % kArm64WordSize, 0u);
+      // Emit a sequence of 64-bit (X register) loads and stores to copy the whole
+      // SIMD register between stack slots.
+      for (size_t offset = 0, e = codegen_->GetSIMDRegisterWidth();
+           offset < e;
+           offset += kArm64WordSize) {
+        __ Ldr(temp, MemOperand(sp, source.GetStackIndex() + offset));
+        __ Str(temp, MemOperand(sp, destination.GetStackIndex() + offset));
+      }
     } else {
-      const VRegister temp = temps.AcquireVRegisterOfSize(kQRegSize);
-      __ Ldr(temp, StackOperandFrom(source));
-      __ Str(temp, StackOperandFrom(destination));
+      const ZRegister temp = temps.AcquireZ();
+      __ Ldr(temp, SveStackOperandFrom(source));
+      __ Str(temp, SveStackOperandFrom(destination));
     }
   }
 }
 
+template <bool is_save>
+void SaveRestoreLiveRegistersHelperSveImpl(CodeGeneratorARM64* codegen,
+                                           LocationSummary* locations,
+                                           int64_t spill_offset) {
+  const uint32_t core_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ true);
+  const uint32_t fp_spills = codegen->GetSlowPathSpills(locations, /* core_registers= */ false);
+  DCHECK(helpers::ArtVixlRegCodeCoherentForRegSet(core_spills,
+                                                  codegen->GetNumberOfCoreRegisters(),
+                                                  fp_spills,
+                                                  codegen->GetNumberOfFloatingPointRegisters()));
+  MacroAssembler* masm = codegen->GetVIXLAssembler();
+  Register base = masm->StackPointer();
+
+  CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize, core_spills);
+  int64_t core_spill_size = core_list.GetTotalSizeInBytes();
+  int64_t fp_spill_offset = spill_offset + core_spill_size;
+
+  if (codegen->GetGraph()->HasSIMD()) {
+    if (is_save) {
+      masm->StoreCPURegList(core_list, MemOperand(base, spill_offset));
+    } else {
+      masm->LoadCPURegList(core_list, MemOperand(base, spill_offset));
+    }
+    codegen->GetAssembler()->SaveRestoreZRegisterList<is_save>(fp_spills, fp_spill_offset);
+    return;
+  }
+
+  // Case when we only need to save/restore D-registers.
+  DCHECK(!codegen->GetGraph()->HasSIMD());
+  DCHECK_LE(codegen->GetSlowPathFPWidth(), kDRegSizeInBytes);
+  CPURegList fp_list = CPURegList(CPURegister::kVRegister, kDRegSize, fp_spills);
+  if (is_save) {
+    masm->StoreCPURegList(core_list, MemOperand(base, spill_offset));
+    masm->StoreCPURegList(fp_list, MemOperand(base, fp_spill_offset));
+  } else {
+    masm->LoadCPURegList(core_list, MemOperand(base, spill_offset));
+    masm->LoadCPURegList(fp_list, MemOperand(base, fp_spill_offset));
+  }
+}
+
+void InstructionCodeGeneratorARM64Sve::SaveLiveRegistersHelper(LocationSummary* locations,
+                                                               int64_t spill_offset) {
+  SaveRestoreLiveRegistersHelperSveImpl</* is_save= */ true>(codegen_, locations, spill_offset);
+}
+
+void InstructionCodeGeneratorARM64Sve::RestoreLiveRegistersHelper(LocationSummary* locations,
+                                                                  int64_t spill_offset) {
+  SaveRestoreLiveRegistersHelperSveImpl</* is_save= */ false>(codegen_, locations, spill_offset);
+}
+
 #undef __
 
 }  // namespace arm64
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index 7220781..81c6561 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -182,6 +182,10 @@
   return vixl::aarch64::MemOperand(vixl::aarch64::sp, location.GetStackIndex());
 }
 
+inline vixl::aarch64::SVEMemOperand SveStackOperandFrom(Location location) {
+  return vixl::aarch64::SVEMemOperand(vixl::aarch64::sp, location.GetStackIndex());
+}
+
 inline vixl::aarch64::MemOperand HeapOperand(const vixl::aarch64::Register& base,
                                                     size_t offset = 0) {
   // A heap reference must be 32bit, so fit in a W register.
@@ -215,6 +219,10 @@
   return Location::FpuRegisterLocation(fpreg.GetCode());
 }
 
+inline Location LocationFrom(const vixl::aarch64::ZRegister& zreg) {
+  return Location::FpuRegisterLocation(zreg.GetCode());
+}
+
 inline vixl::aarch64::Operand OperandFromMemOperand(
     const vixl::aarch64::MemOperand& mem_op) {
   if (mem_op.IsImmediateOffset()) {
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 1210dbe..02ee4ec 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -946,9 +946,10 @@
   //     make one particular reference aligned), never to exceed (1).
   // (3) variable to record how many references share same alignment.
   // (4) variable to record suitable candidate for dynamic loop peeling.
-  uint32_t desired_alignment = GetVectorSizeInBytes();
-  DCHECK_LE(desired_alignment, 16u);
-  uint32_t peeling_votes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  size_t desired_alignment = GetVectorSizeInBytes();
+  ScopedArenaVector<uint32_t> peeling_votes(desired_alignment, 0u,
+      loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+
   uint32_t max_num_same_alignment = 0;
   const ArrayReference* peeling_candidate = nullptr;
 
@@ -1577,14 +1578,6 @@
 }
 
 uint32_t HLoopOptimization::GetVectorSizeInBytes() {
-  if (kIsDebugBuild) {
-    InstructionSet isa = compiler_options_->GetInstructionSet();
-    // TODO: Remove this check when there are no implicit assumptions on the SIMD reg size.
-    DCHECK_EQ(simd_register_size_, (isa == InstructionSet::kArm || isa == InstructionSet::kThumb2)
-                                   ? 8u
-                                   : 16u);
-  }
-
   return simd_register_size_;
 }
 
@@ -1616,6 +1609,8 @@
       if (IsInPredicatedVectorizationMode()) {
         // SVE vectorization.
         CHECK(features->AsArm64InstructionSetFeatures()->HasSVE());
+        size_t vector_length = simd_register_size_ / DataType::Size(type);
+        DCHECK_EQ(simd_register_size_ % DataType::Size(type), 0u);
         switch (type) {
           case DataType::Type::kBool:
           case DataType::Type::kUint8:
@@ -1625,7 +1620,7 @@
                              kNoUnsignedHAdd |
                              kNoUnroundedHAdd |
                              kNoSAD;
-            return TrySetVectorLength(type, 16);
+            return TrySetVectorLength(type, vector_length);
           case DataType::Type::kUint16:
           case DataType::Type::kInt16:
             *restrictions |= kNoDiv |
@@ -1634,19 +1629,19 @@
                              kNoUnroundedHAdd |
                              kNoSAD |
                              kNoDotProd;
-            return TrySetVectorLength(type, 8);
+            return TrySetVectorLength(type, vector_length);
           case DataType::Type::kInt32:
             *restrictions |= kNoDiv | kNoSAD;
-            return TrySetVectorLength(type, 4);
+            return TrySetVectorLength(type, vector_length);
           case DataType::Type::kInt64:
             *restrictions |= kNoDiv | kNoSAD;
-            return TrySetVectorLength(type, 2);
+            return TrySetVectorLength(type, vector_length);
           case DataType::Type::kFloat32:
             *restrictions |= kNoReduction;
-            return TrySetVectorLength(type, 4);
+            return TrySetVectorLength(type, vector_length);
           case DataType::Type::kFloat64:
             *restrictions |= kNoReduction;
-            return TrySetVectorLength(type, 2);
+            return TrySetVectorLength(type, vector_length);
           default:
             break;
         }
@@ -2311,12 +2306,12 @@
   return Alignment(DataType::Size(type), 0);
 }
 
-void HLoopOptimization::SetAlignmentStrategy(uint32_t peeling_votes[],
+void HLoopOptimization::SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
                                              const ArrayReference* peeling_candidate) {
   // Current heuristic: pick the best static loop peeling factor, if any,
   // or otherwise use dynamic loop peeling on suggested peeling candidate.
   uint32_t max_vote = 0;
-  for (int32_t i = 0; i < 16; i++) {
+  for (size_t i = 0; i < peeling_votes.size(); i++) {
     if (peeling_votes[i] > max_vote) {
       max_vote = peeling_votes[i];
       vector_static_peeling_factor_ = i;
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 0d76804..d3583ed 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -238,7 +238,7 @@
                              DataType::Type type,
                              bool is_string_char_at,
                              uint32_t peeling = 0);
-  void SetAlignmentStrategy(uint32_t peeling_votes[],
+  void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
                             const ArrayReference* peeling_candidate);
   uint32_t MaxNumberPeeled();
   bool IsVectorizationProfitable(int64_t trip_count);
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index 7ab767f..6100ed9 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -16,6 +16,7 @@
 
 #include "arch/arm64/instruction_set_features_arm64.h"
 #include "assembler_arm64.h"
+#include "base/bit_utils_iterator.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "heap_poisoning.h"
 #include "offsets.h"
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 5442b6a..b49a13a 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -24,6 +24,7 @@
 #include <android-base/logging.h>
 
 #include "base/arena_containers.h"
+#include "base/bit_utils_iterator.h"
 #include "base/macros.h"
 #include "dwarf/register.h"
 #include "offsets.h"
@@ -98,6 +99,28 @@
   void SpillRegisters(vixl::aarch64::CPURegList registers, int offset);
   void UnspillRegisters(vixl::aarch64::CPURegList registers, int offset);
 
+  // A helper to save/restore a list of ZRegisters to/from a given stack offset.
+  template <bool is_save>
+  void SaveRestoreZRegisterList(uint32_t vreg_bit_vector, int64_t stack_offset) {
+    if (vreg_bit_vector == 0) {
+      return;
+    }
+    vixl::aarch64::UseScratchRegisterScope temps(GetVIXLAssembler());
+    vixl::aarch64::Register temp = temps.AcquireX();
+    vixl_masm_.Add(temp, vixl::aarch64::sp, stack_offset);
+    size_t slot_no = 0;
+    for (uint32_t i : LowToHighBits(vreg_bit_vector)) {
+      if (is_save) {
+        vixl_masm_.Str(vixl::aarch64::ZRegister(i),
+                       vixl::aarch64::SVEMemOperand(temp, slot_no, vixl::aarch64::SVE_MUL_VL));
+      } else {
+        vixl_masm_.Ldr(vixl::aarch64::ZRegister(i),
+                       vixl::aarch64::SVEMemOperand(temp, slot_no, vixl::aarch64::SVE_MUL_VL));
+      }
+      slot_no++;
+    }
+  }
+
   // Jump to address (not setting link register)
   void JumpTo(ManagedRegister m_base, Offset offs, ManagedRegister m_scratch);