ARM64: Improve the code generated to spill/restore registers for slow paths.

Aligning the first slow-path register slot to the size of the registers being stored
allows generating better code: STP and LDP can only encode immediate offsets that are
multiples of the access size, so an unaligned slot forces the address to be
materialized in a scratch register first.

Before:

    add x16, sp, #0x44 (68)
    stp x0, x1, [x16, #-16]

After:

    stp x0, x1, [sp, #56]

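For concreteness, a minimal standalone sketch of the rounding (not the ART
sources; the slot counts are hypothetical, picked so the numbers match the
disassembly above, with kVRegSize = 4 and kXRegSizeInBytes = 8):

    // Sketch of the first_register_slot_in_slow_path_ alignment math.
    #include <cstdint>
    #include <cstdio>

    // Round |value| up to the next multiple of |alignment| (a power of two),
    // as RoundUp does in the patch below.
    constexpr uint32_t RoundUp(uint32_t value, uint32_t alignment) {
      return (value + alignment - 1) & ~(alignment - 1);
    }

    int main() {
      constexpr uint32_t kVRegSize = 4;          // size of one vreg slot
      constexpr uint32_t kXRegSizeInBytes = 8;   // STP/LDP offsets for X regs must be multiples of 8
      constexpr uint32_t number_of_out_slots = 5;    // hypothetical count
      constexpr uint32_t number_of_spill_slots = 8;  // hypothetical count

      uint32_t unaligned = (number_of_out_slots + number_of_spill_slots) * kVRegSize;  // 52
      uint32_t aligned = RoundUp(unaligned, kXRegSizeInBytes);                         // 56
      // 52 is not a multiple of 8, so "stp x0, x1, [sp, #52]" is not encodable and
      // the address must go through a scratch register; sp + 56 encodes directly.
      printf("first slow-path register slot: %u -> %u\n", unaligned, aligned);
      return 0;
    }

The padding costs at most kXRegSizeInBytes - kVRegSize bytes, and the total frame
size is rounded up afterwards anyway (visible in the first hunk below).
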
Change-Id: I3e20ad3fa59d00aee4b4d14ea9d59c7cd546509e
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index d40e2b9..9c6dcaa 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -291,7 +291,8 @@
   DCHECK(!block_order.empty());
   DCHECK(block_order[0] == GetGraph()->GetEntryBlock());
   ComputeSpillMask();
-  first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize;
+  first_register_slot_in_slow_path_ = RoundUp(
+      (number_of_out_slots + number_of_spill_slots) * kVRegSize, GetPreferredSlotsAlignment());
 
   if (number_of_spill_slots == 0
       && !HasAllocatedCalleeSaveRegisters()
@@ -302,8 +303,7 @@
     SetFrameSize(CallPushesPC() ? GetWordSize() : 0);
   } else {
     SetFrameSize(RoundUp(
-        number_of_spill_slots * kVRegSize
-        + number_of_out_slots * kVRegSize
+        first_register_slot_in_slow_path_
         + maximum_number_of_live_core_registers * GetWordSize()
         + maximum_number_of_live_fpu_registers * GetFloatingPointSpillSlotSize()
         + FrameEntrySpillSize(),
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index b8540ba..47e6625 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -211,6 +211,8 @@
                                 size_t maximum_number_of_live_fpu_registers,
                                 size_t number_of_out_slots,
                                 const ArenaVector<HBasicBlock*>& block_order);
+  // Backends can override this as necessary. For most, no special alignment is required.
+  virtual uint32_t GetPreferredSlotsAlignment() const { return 1; }
 
   uint32_t GetFrameSize() const { return frame_size_; }
   void SetFrameSize(uint32_t size) { frame_size_ = size; }
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index c2f055a..b35c520 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -418,6 +418,10 @@
     block_labels_.resize(GetGraph()->GetBlocks().size());
   }
 
+  // We want to use the STP and LDP instructions to spill and restore registers for slow paths.
+  // These instructions can only encode immediate offsets that are multiples of the access size.
+  uint32_t GetPreferredSlotsAlignment() const OVERRIDE { return vixl::kXRegSizeInBytes; }
+
   JumpTableARM64* CreateJumpTable(HPackedSwitch* switch_instr) {
     jump_tables_.emplace_back(new (GetGraph()->GetArena()) JumpTableARM64(switch_instr));
     return jump_tables_.back().get();