MIPS32: Improve stack alignment; use sdc1/ldc1 where possible.

- Ensure that SP is a multiple of 16 at all times.
- Use ldc1/sdc1 to load/store FPU registers from/to 8-byte-aligned
  locations wherever possible.

Use `export ART_MIPS32_CHECK_ALIGNMENT=true` when building Android
to enable the new runtime alignment checks.

Test: Boot & run tests on 32-bit version of QEMU, and CI-20.
Test: test/testrunner/testrunner.py --target --optimizing --32
Test: test-art-host-gtest
Test: test-art-target-gtest

Change-Id: Ia667004573f419fd006098fcfadf5834239cb485
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 3ba107a..2f65e8c 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1300,7 +1300,7 @@
   // automatically unspilled when the scratch scope object is destroyed).
   ScratchRegisterScope ensure_scratch(this, TMP, V0, codegen_->GetNumberOfCoreRegisters());
   // If V0 spills onto the stack, SP-relative offsets need to be adjusted.
-  int stack_offset = ensure_scratch.IsSpilled() ? kMipsWordSize : 0;
+  int stack_offset = ensure_scratch.IsSpilled() ? kStackAlignment : 0;
   for (int i = 0; i <= (double_slot ? 1 : 0); i++, stack_offset += kMipsWordSize) {
     __ LoadFromOffset(kLoadWord,
                       Register(ensure_scratch.GetRegister()),
diff --git a/compiler/optimizing/emit_swap_mips_test.cc b/compiler/optimizing/emit_swap_mips_test.cc
index 36e932c..b63914f 100644
--- a/compiler/optimizing/emit_swap_mips_test.cc
+++ b/compiler/optimizing/emit_swap_mips_test.cc
@@ -238,14 +238,14 @@
       DataType::Type::kInt32,
       nullptr);
   const char* expected =
-      "addiu $sp, $sp, -4\n"
+      "addiu $sp, $sp, -16\n"
       "sw $v0, 0($sp)\n"
-      "lw $v0, 56($sp)\n"
-      "lw $t8, 52($sp)\n"
-      "sw $v0, 52($sp)\n"
-      "sw $t8, 56($sp)\n"
+      "lw $v0, 68($sp)\n"
+      "lw $t8, 64($sp)\n"
+      "sw $v0, 64($sp)\n"
+      "sw $t8, 68($sp)\n"
       "lw $v0, 0($sp)\n"
-      "addiu $sp, $sp, 4\n";
+      "addiu $sp, $sp, 16\n";
   DriverWrapper(moves_, expected, "TwoStackSlots");
 }
 
@@ -261,18 +261,18 @@
       DataType::Type::kInt64,
       nullptr);
   const char* expected =
-      "addiu $sp, $sp, -4\n"
+      "addiu $sp, $sp, -16\n"
       "sw $v0, 0($sp)\n"
-      "lw $v0, 60($sp)\n"
-      "lw $t8, 52($sp)\n"
-      "sw $v0, 52($sp)\n"
-      "sw $t8, 60($sp)\n"
-      "lw $v0, 64($sp)\n"
-      "lw $t8, 56($sp)\n"
-      "sw $v0, 56($sp)\n"
-      "sw $t8, 64($sp)\n"
+      "lw $v0, 72($sp)\n"
+      "lw $t8, 64($sp)\n"
+      "sw $v0, 64($sp)\n"
+      "sw $t8, 72($sp)\n"
+      "lw $v0, 76($sp)\n"
+      "lw $t8, 68($sp)\n"
+      "sw $v0, 68($sp)\n"
+      "sw $t8, 76($sp)\n"
       "lw $v0, 0($sp)\n"
-      "addiu $sp, $sp, 4\n";
+      "addiu $sp, $sp, 16\n";
   DriverWrapper(moves_, expected, "TwoDoubleStackSlots");
 }
 
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index fde55cb..1e82c4b 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -330,10 +330,10 @@
 static constexpr uint8_t expected_asm_kMips_adjust_head[] = {
     0xC0, 0xFF, 0xBD, 0x27, 0x3C, 0x00, 0xBF, 0xAF, 0x38, 0x00, 0xB1, 0xAF,
     0x34, 0x00, 0xB0, 0xAF, 0x28, 0x00, 0xB6, 0xF7, 0x20, 0x00, 0xB4, 0xF7,
-    0x08, 0x00, 0x80, 0x14, 0xFC, 0xFF, 0xBD, 0x27,
+    0x08, 0x00, 0x80, 0x14, 0xF0, 0xFF, 0xBD, 0x27,
     0x00, 0x00, 0xBF, 0xAF, 0x00, 0x00, 0x10, 0x04, 0x02, 0x00, 0x01, 0x3C,
     0x18, 0x00, 0x21, 0x34, 0x21, 0x08, 0x3F, 0x00, 0x00, 0x00, 0xBF, 0x8F,
-    0x09, 0x00, 0x20, 0x00, 0x04, 0x00, 0xBD, 0x27,
+    0x09, 0x00, 0x20, 0x00, 0x10, 0x00, 0xBD, 0x27,
 };
 static constexpr uint8_t expected_asm_kMips_adjust_tail[] = {
     0x3C, 0x00, 0xBF, 0x8F, 0x38, 0x00, 0xB1, 0x8F, 0x34, 0x00, 0xB0, 0x8F,
@@ -342,7 +342,7 @@
 };
 static constexpr uint8_t expected_cfi_kMips_adjust[] = {
     0x44, 0x0E, 0x40, 0x44, 0x9F, 0x01, 0x44, 0x91, 0x02, 0x44, 0x90, 0x03,
-    0x50, 0x0E, 0x44, 0x60, 0x0E, 0x40, 0x04, 0x04, 0x00, 0x02, 0x00, 0x0A,
+    0x50, 0x0E, 0x50, 0x60, 0x0E, 0x40, 0x04, 0x04, 0x00, 0x02, 0x00, 0x0A,
     0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x50, 0x0E, 0x00, 0x0B, 0x0E, 0x40,
 };
 // 0x00000000: addiu sp, sp, -64
@@ -356,8 +356,8 @@
 // 0x00000010: sdc1 f22, +40(sp)
 // 0x00000014: sdc1 f20, +32(sp)
 // 0x00000018: bnez a0, 0x0000003c ; +36
-// 0x0000001c: addiu sp, sp, -4
-// 0x00000020: .cfi_def_cfa_offset: 68
+// 0x0000001c: addiu sp, sp, -16
+// 0x00000020: .cfi_def_cfa_offset: 80
 // 0x00000020: sw ra, +0(sp)
 // 0x00000024: nal
 // 0x00000028: lui at, 2
@@ -365,7 +365,7 @@
 // 0x00000030: addu at, at, ra
 // 0x00000034: lw ra, +0(sp)
 // 0x00000038: jr at
-// 0x0000003c: addiu sp, sp, 4
+// 0x0000003c: addiu sp, sp, 16
 // 0x00000040: .cfi_def_cfa_offset: 64
 // 0x00000040: nop
 //             ...
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index cbb2c0e..9545ca6 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -1863,20 +1863,20 @@
 }
 
 void MipsAssembler::Push(Register rs) {
-  IncreaseFrameSize(kMipsWordSize);
+  IncreaseFrameSize(kStackAlignment);
   Sw(rs, SP, 0);
 }
 
 void MipsAssembler::Pop(Register rd) {
   Lw(rd, SP, 0);
-  DecreaseFrameSize(kMipsWordSize);
+  DecreaseFrameSize(kStackAlignment);
 }
 
 void MipsAssembler::PopAndReturn(Register rd, Register rt) {
   bool reordering = SetReorder(false);
   Lw(rd, SP, 0);
   Jr(rt);
-  DecreaseFrameSize(kMipsWordSize);  // Single instruction in delay slot.
+  DecreaseFrameSize(kStackAlignment);  // Single instruction in delay slot.
   SetReorder(reordering);
 }
 
@@ -4588,7 +4588,7 @@
       Addu(AT, AT, RA);
       Lw(RA, SP, 0);
       Jr(AT);
-      DecreaseFrameSize(kMipsWordSize);
+      DecreaseFrameSize(kStackAlignment);
       break;
     case Branch::kLongCondBranch:
       // The comment on case 'Branch::kLongUncondBranch' applies here as well.
@@ -4608,7 +4608,7 @@
       Addu(AT, AT, RA);
       Lw(RA, SP, 0);
       Jr(AT);
-      DecreaseFrameSize(kMipsWordSize);
+      DecreaseFrameSize(kStackAlignment);
       break;
     case Branch::kLongCall:
       DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);
diff --git a/compiler/utils/mips/assembler_mips_test.cc b/compiler/utils/mips/assembler_mips_test.cc
index 9397be4..b027d3a 100644
--- a/compiler/utils/mips/assembler_mips_test.cc
+++ b/compiler/utils/mips/assembler_mips_test.cc
@@ -2803,7 +2803,7 @@
   oss <<
       ".set noreorder\n"
       "addiu $t0, $t1, 0x5678\n"
-      "addiu $sp, $sp, -4\n"
+      "addiu $sp, $sp, -16\n"
       "sw $ra, 0($sp)\n"
       "bltzal $zero, .+4\n"
       "lui $at, 0x" << std::hex << High16Bits(offset_forward) << "\n"
@@ -2811,11 +2811,11 @@
       "addu $at, $at, $ra\n"
       "lw $ra, 0($sp)\n"
       "jalr $zero, $at\n"
-      "addiu $sp, $sp, 4\n" <<
+      "addiu $sp, $sp, 16\n" <<
       RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") <<
       RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") <<
       "addiu $t0, $t1, 0x5678\n"
-      "addiu $sp, $sp, -4\n"
+      "addiu $sp, $sp, -16\n"
       "sw $ra, 0($sp)\n"
       "bltzal $zero, .+4\n"
       "lui $at, 0x" << std::hex << High16Bits(offset_back) << "\n"
@@ -2823,7 +2823,7 @@
       "addu $at, $at, $ra\n"
       "lw $ra, 0($sp)\n"
       "jalr $zero, $at\n"
-      "addiu $sp, $sp, 4\n";
+      "addiu $sp, $sp, 16\n";
   std::string expected = oss.str();
   DriverStr(expected, "LongBranchReorder");
   EXPECT_EQ(__ GetLabelLocation(&patcher_label1), 0 * 4u);