MIPS32: Improve stack alignment, use sdc1/ldc1, where possible.

- Ensure that SP is a multiple of 16 at all times, and
- Use ldc1/sdc1 to load/store FPU registers from/to 8-byte-aligned
  locations wherever possible.

Use `export ART_MIPS32_CHECK_ALIGNMENT=true` when building Android
to enable the new runtime alignment checks.

Test: Boot & run tests on 32-bit version of QEMU, and CI-20.
Test: test/testrunner/testrunner.py --target --optimizing --32
Test: test-art-host-gtest
Test: test-art-target-gtest

Change-Id: Ia667004573f419fd006098fcfadf5834239cb485
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 3ba107a..2f65e8c 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1300,7 +1300,7 @@
   // automatically unspilled when the scratch scope object is destroyed).
   ScratchRegisterScope ensure_scratch(this, TMP, V0, codegen_->GetNumberOfCoreRegisters());
   // If V0 spills onto the stack, SP-relative offsets need to be adjusted.
-  int stack_offset = ensure_scratch.IsSpilled() ? kMipsWordSize : 0;
+  int stack_offset = ensure_scratch.IsSpilled() ? kStackAlignment : 0;
   for (int i = 0; i <= (double_slot ? 1 : 0); i++, stack_offset += kMipsWordSize) {
     __ LoadFromOffset(kLoadWord,
                       Register(ensure_scratch.GetRegister()),