Implement a SIMD spill slot.

Rationale:
The last ART vectorizer break-out CL \O/
This ensures that spilling on x86 and x86_64 is correct.
It also paves the way toward wider SIMD on ARM and MIPS.
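
For context, the core correctness issue is spill width: movsd stores only
the low 64 bits of an XMM register, so a 128-bit SIMD value spilled that
way loses its upper lanes. A minimal standalone sketch (SSE intrinsics;
not part of this CL) demonstrating the difference:

  #include <immintrin.h>
  #include <cstdio>

  int main() {
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  // four live lanes
    alignas(16) float slot[4] = {};                  // stands in for a stack slot
    float out[4];

    // movsd-style 64-bit spill/fill: only the low two floats round-trip.
    _mm_store_sd(reinterpret_cast<double*>(slot), _mm_castps_pd(v));
    _mm_storeu_ps(out, _mm_load_ps(slot));
    printf("64-bit spill:  %g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 0 0

    // movups-style 128-bit spill/fill: all four lanes survive.
    _mm_storeu_ps(slot, v);
    _mm_storeu_ps(out, _mm_load_ps(slot));
    printf("128-bit spill: %g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
    return 0;
  }

The movupd -> movups switch itself does not change width (both move 128
bits unaligned); movups is presumably preferred because a spill slot is
type-agnostic and the movups encoding is one byte shorter (no 0x66 prefix).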

Test: test-art-host
Bug: 34083438

Change-Id: I5b27d18c2045f3ab70b64c335423b3ff2a507ac2
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index c106d9b..2ffc398 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1162,7 +1162,7 @@
 
 size_t CodeGeneratorX86_64::SaveFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
   if (GetGraph()->HasSIMD()) {
-    __ movupd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
+    __ movups(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
   } else {
     __ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
   }
@@ -1171,7 +1171,7 @@
 
 size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
   if (GetGraph()->HasSIMD()) {
-    __ movupd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
+    __ movups(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
   } else {
     __ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
   }
@@ -5166,9 +5166,8 @@
   // In suspend check slow path, usually there are no caller-save registers at all.
   // If SIMD instructions are present, however, we force spilling all live SIMD
   // registers in full width (since the runtime only saves/restores lower part).
-  locations->SetCustomSlowPathCallerSaves(GetGraph()->HasSIMD()
-                                          ? RegisterSet::AllFpu()
-                                          : RegisterSet::Empty());
+  locations->SetCustomSlowPathCallerSaves(
+      GetGraph()->HasSIMD() ? RegisterSet::AllFpu() : RegisterSet::Empty());
 }
 
 void InstructionCodeGeneratorX86_64::VisitSuspendCheck(HSuspendCheck* instruction) {
@@ -5257,6 +5256,10 @@
       __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), source.GetStackIndex()));
       __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(TMP));
     }
+  } else if (source.IsSIMDStackSlot()) {
+    DCHECK(destination.IsFpuRegister());
+    __ movups(destination.AsFpuRegister<XmmRegister>(),
+              Address(CpuRegister(RSP), source.GetStackIndex()));
   } else if (source.IsConstant()) {
     HConstant* constant = source.GetConstant();
     if (constant->IsIntConstant() || constant->IsNullConstant()) {
@@ -5307,10 +5310,13 @@
     } else if (destination.IsStackSlot()) {
       __ movss(Address(CpuRegister(RSP), destination.GetStackIndex()),
                source.AsFpuRegister<XmmRegister>());
-    } else {
-      DCHECK(destination.IsDoubleStackSlot()) << destination;
+    } else if (destination.IsDoubleStackSlot()) {
       __ movsd(Address(CpuRegister(RSP), destination.GetStackIndex()),
                source.AsFpuRegister<XmmRegister>());
+    } else {
+      DCHECK(destination.IsSIMDStackSlot());
+      __ movups(Address(CpuRegister(RSP), destination.GetStackIndex()),
+                source.AsFpuRegister<XmmRegister>());
     }
   }
 }
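
Note on the suspend-check hunk: forcing RegisterSet::AllFpu() as the
custom slow-path caller-save set follows from the invariant stated in the
comment there, namely that the runtime saves/restores only the lower part
of each FP register. A self-contained model of that invariant (all names
below are hypothetical; SSE2 intrinsics, x86-64 only):

  #include <immintrin.h>
  #include <cstdio>

  // Stands in for the runtime call: it "preserves" only the low 64 bits
  // of the vector, zeroing the upper lanes on return.
  static __m128i RuntimeLowHalfOnly(__m128i v) {
    return _mm_cvtsi64_si128(_mm_cvtsi128_si64(v));
  }

  int main() {
    __m128i live = _mm_set_epi32(4, 3, 2, 1);  // lanes 1 2 3 4, low to high
    int out[4];

    // Trusting the runtime: the upper lanes are lost across the call.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out), RuntimeLowHalfOnly(live));
    printf("runtime only: %d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 1 2 0 0

    // Caller-side full-width spill/fill, which the forced caller-save set
    // makes the allocator emit: the whole 128-bit value survives.
    alignas(16) char slot[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(slot), live);
    RuntimeLowHalfOnly(live);  // result deliberately ignored
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out),
                     _mm_loadu_si128(reinterpret_cast<const __m128i*>(slot)));
    printf("full spill:   %d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
    return 0;
  }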