MIPS32: Fill branch delay slots

Test: booted MIPS32 in QEMU
Test: test-art-host-gtest
Test: test-art-target-gtest
Test: test-art-target-run-test-optimizing on CI20

Change-Id: I727e80753395ab99fff004cb5d2e0a06409150d7
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 0d3f849..b0de964 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -753,7 +753,7 @@
   }
 
   // Collect PC infos for the mapping table.
-  uint32_t native_pc = GetAssembler()->CodeSize();
+  uint32_t native_pc = GetAssembler()->CodePosition();
 
   if (instruction == nullptr) {
     // For stack overflow checks and native-debug-info entries without dex register
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 8a2f90d..e0de03b 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -792,12 +792,24 @@
       // TODO: __ cfi().Restore(DWARFReg(reg));
     }
 
-    __ DecreaseFrameSize(GetFrameSize());
+    size_t frame_size = GetFrameSize();
+    // Adjust the stack pointer in the delay slot if doing so doesn't break CFI.
+    bool exchange = IsInt<16>(static_cast<int32_t>(frame_size));
+    bool reordering = __ SetReorder(false);
+    if (exchange) {
+      __ Jr(RA);
+      __ DecreaseFrameSize(frame_size);  // Single instruction in delay slot.
+    } else {
+      __ DecreaseFrameSize(frame_size);
+      __ Jr(RA);
+      __ Nop();  // In delay slot.
+    }
+    __ SetReorder(reordering);
+  } else {
+    __ Jr(RA);
+    __ NopIfNoReordering();
   }
 
-  __ Jr(RA);
-  __ Nop();
-
   __ cfi().RestoreState();
   __ cfi().DefCFAOffset(GetFrameSize());
 }
@@ -1251,6 +1263,7 @@
                                       uint32_t dex_pc,
                                       SlowPathCode* slow_path,
                                       bool is_direct_entrypoint) {
+  bool reordering = __ SetReorder(false);
   __ LoadFromOffset(kLoadWord, T9, TR, entry_point_offset);
   __ Jalr(T9);
   if (is_direct_entrypoint) {
@@ -1262,6 +1275,7 @@
   } else {
     __ Nop();  // In delay slot.
   }
+  __ SetReorder(reordering);
   RecordPcInfo(instruction, dex_pc, slow_path);
 }
 
@@ -3953,7 +3967,7 @@
   __ LoadFromOffset(kLoadWord, T9, temp, entry_point.Int32Value());
   // T9();
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
@@ -4254,7 +4268,7 @@
       // T9 prepared above for better instruction scheduling.
       // T9()
       __ Jalr(T9);
-      __ Nop();
+      __ NopIfNoReordering();
       break;
     case HInvokeStaticOrDirect::CodePtrLocation::kCallPCRelative:
       // TODO: Implement this type.
@@ -4270,7 +4284,7 @@
                             kMipsPointerSize).Int32Value());
       // T9()
       __ Jalr(T9);
-      __ Nop();
+      __ NopIfNoReordering();
       break;
   }
   DCHECK(!IsLeafMethod());
@@ -4312,7 +4326,7 @@
   __ LoadFromOffset(kLoadWord, T9, temp, entry_point.Int32Value());
   // T9();
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
 }
 
 void InstructionCodeGeneratorMIPS::VisitInvokeVirtual(HInvokeVirtual* invoke) {
@@ -4421,6 +4435,7 @@
       DCHECK(!kEmitCompilerReadBarrier);
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex());
+      bool reordering = __ SetReorder(false);
       if (isR6) {
         __ Bind(&info->high_label);
         __ Bind(&info->pc_rel_label);
@@ -4436,6 +4451,7 @@
         // Add a 32-bit offset to PC.
         __ Addu(out, out, base_or_current_method_reg);
       }
+      __ SetReorder(reordering);
       break;
     }
     case HLoadClass::LoadKind::kBootImageAddress: {
@@ -4579,6 +4595,7 @@
       DCHECK(!kEmitCompilerReadBarrier);
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
+      bool reordering = __ SetReorder(false);
       if (isR6) {
         __ Bind(&info->high_label);
         __ Bind(&info->pc_rel_label);
@@ -4594,6 +4611,7 @@
         // Add a 32-bit offset to PC.
         __ Addu(out, out, base_or_current_method_reg);
       }
+      __ SetReorder(reordering);
       return;  // No dex cache slow path.
     }
     case HLoadString::LoadKind::kBootImageAddress: {
@@ -4851,7 +4869,7 @@
     __ LoadFromOffset(kLoadWord, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString));
     __ LoadFromOffset(kLoadWord, T9, temp, code_offset.Int32Value());
     __ Jalr(T9);
-    __ Nop();
+    __ NopIfNoReordering();
     codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
   } else {
     codegen_->InvokeRuntime(
@@ -5751,7 +5769,7 @@
   Register reg = base->GetLocations()->Out().AsRegister<Register>();
   CodeGeneratorMIPS::PcRelativePatchInfo* info =
       codegen_->NewPcRelativeDexCacheArrayPatch(base->GetDexFile(), base->GetElementOffset());
-
+  bool reordering = __ SetReorder(false);
   if (codegen_->GetInstructionSetFeatures().IsR6()) {
     __ Bind(&info->high_label);
     __ Bind(&info->pc_rel_label);
@@ -5769,6 +5787,7 @@
     __ Addu(reg, reg, RA);
     // TODO: Can we share this code with that of VisitMipsComputeBaseMethodAddress()?
   }
+  __ SetReorder(reordering);
 }
 
 void LocationsBuilderMIPS::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 6e5eb66..862a93f 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -1901,7 +1901,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pStringCompareTo).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   __ Bind(slow_path->GetExitLabel());
 }
 
@@ -2060,7 +2060,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pIndexOf).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
 
   if (slow_path != nullptr) {
     __ Bind(slow_path->GetExitLabel());
@@ -2146,7 +2146,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromBytes).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
   __ Bind(slow_path->GetExitLabel());
 }
@@ -2179,7 +2179,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromChars).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
@@ -2208,7 +2208,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromString).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
   __ Bind(slow_path->GetExitLabel());
 }
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index 05eb063..6c5030c 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -144,12 +144,12 @@
     0x34, 0x00, 0xB0, 0xAF, 0x28, 0x00, 0xB6, 0xF7, 0x20, 0x00, 0xB4, 0xF7,
     0x00, 0x00, 0xA4, 0xAF, 0x3C, 0x00, 0xBF, 0x8F, 0x38, 0x00, 0xB1, 0x8F,
     0x34, 0x00, 0xB0, 0x8F, 0x28, 0x00, 0xB6, 0xD7, 0x20, 0x00, 0xB4, 0xD7,
-    0x40, 0x00, 0xBD, 0x27, 0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00,
+    0x09, 0x00, 0xE0, 0x03, 0x40, 0x00, 0xBD, 0x27,
 };
 static constexpr uint8_t expected_cfi_kMips[] = {
     0x44, 0x0E, 0x40, 0x44, 0x9F, 0x01, 0x44, 0x91, 0x02, 0x44, 0x90, 0x03,
-    0x4C, 0x0A, 0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x4C, 0x0E, 0x00, 0x48,
-    0x0B, 0x0E, 0x40,
+    0x4C, 0x0A, 0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x50, 0x0E, 0x00, 0x0B,
+    0x0E, 0x40,
 };
 // 0x00000000: addiu r29, r29, -64
 // 0x00000004: .cfi_def_cfa_offset: 64
@@ -171,12 +171,11 @@
 // 0x00000028: .cfi_restore: r16
 // 0x00000028: ldc1 f22, +40(r29)
 // 0x0000002c: ldc1 f20, +32(r29)
-// 0x00000030: addiu r29, r29, 64
-// 0x00000034: .cfi_def_cfa_offset: 0
-// 0x00000034: jr r31
-// 0x00000038: nop
-// 0x0000003c: .cfi_restore_state
-// 0x0000003c: .cfi_def_cfa_offset: 64
+// 0x00000030: jr r31
+// 0x00000034: addiu r29, r29, 64
+// 0x00000038: .cfi_def_cfa_offset: 0
+// 0x00000038: .cfi_restore_state
+// 0x00000038: .cfi_def_cfa_offset: 64
 
 static constexpr uint8_t expected_asm_kMips64[] = {
     0xD8, 0xFF, 0xBD, 0x67, 0x20, 0x00, 0xBF, 0xFF, 0x18, 0x00, 0xB1, 0xFF,
@@ -348,14 +347,13 @@
 };
 static constexpr uint8_t expected_asm_kMips_adjust_tail[] = {
     0x3C, 0x00, 0xBF, 0x8F, 0x38, 0x00, 0xB1, 0x8F, 0x34, 0x00, 0xB0, 0x8F,
-    0x28, 0x00, 0xB6, 0xD7, 0x20, 0x00, 0xB4, 0xD7, 0x40, 0x00, 0xBD, 0x27,
-    0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00,
+    0x28, 0x00, 0xB6, 0xD7, 0x20, 0x00, 0xB4, 0xD7, 0x09, 0x00, 0xE0, 0x03,
+    0x40, 0x00, 0xBD, 0x27,
 };
 static constexpr uint8_t expected_cfi_kMips_adjust[] = {
     0x44, 0x0E, 0x40, 0x44, 0x9F, 0x01, 0x44, 0x91, 0x02, 0x44, 0x90, 0x03,
     0x54, 0x0E, 0x44, 0x60, 0x0E, 0x40, 0x04, 0x04, 0x00, 0x02, 0x00, 0x0A,
-    0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x4C, 0x0E, 0x00, 0x48, 0x0B, 0x0E,
-    0x40,
+    0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x50, 0x0E, 0x00, 0x0B, 0x0E, 0x40,
 };
 // 0x00000000: addiu r29, r29, -64
 // 0x00000004: .cfi_def_cfa_offset: 64
@@ -392,12 +390,11 @@
 // 0x00020054: .cfi_restore: r16
 // 0x00020054: ldc1 f22, +40(r29)
 // 0x00020058: ldc1 f20, +32(r29)
-// 0x0002005c: addiu r29, r29, 64
-// 0x00020060: .cfi_def_cfa_offset: 0
-// 0x00020060: jr r31
-// 0x00020064: nop
-// 0x00020068: .cfi_restore_state
-// 0x00020068: .cfi_def_cfa_offset: 64
+// 0x0002005c: jr r31
+// 0x00020060: addiu r29, r29, 64
+// 0x00020064: .cfi_def_cfa_offset: 0
+// 0x00020064: .cfi_restore_state
+// 0x00020064: .cfi_def_cfa_offset: 64
 
 static constexpr uint8_t expected_asm_kMips64_adjust_head[] = {
     0xD8, 0xFF, 0xBD, 0x67, 0x20, 0x00, 0xBF, 0xFF, 0x18, 0x00, 0xB1, 0xFF,