ARM/ARM64: Improve frame entry/exit codegen.

On ARM64, use STP pre-index for the method and the lowest
spilled core register for method entry if there is no gap
or FP spill in between. On exit, use LDP post-index to
restore in this case, ignoring the method by loading it to
XZR. Thus, we save one instruction for both entry and exit
for such methods and the performance should be the same or
better.
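
As an illustrative sketch, for a method whose only core spill
is LR, with no gap and no FP spills, the emitted ARM64 entry
and exit would look roughly like this (the 32-byte frame size
and the use of x0 for the ArtMethod* are assumptions for the
example):
    entry:  stp x0, lr, [sp, #-32]!   // method + LR, SP pre-decremented
    exit:   ldp xzr, lr, [sp], #32    // stale method slot discarded into XZR
            ret
Previously the entry needed a separate store for the method
and for LR, and the exit needed a load plus an SP adjustment,
hence the one-instruction saving on each path.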

On ARM, use a single PUSH/POP for method entry and core
spills if the gap between them is 2 words or less and we
have one or no FP spill, spilling args as filler if needed.
On exit, load the FP spill if any and do a single POP for
core registers and return in this situation, clobbering as
many registers from r2-r4 as needed; these caller-save
registers are not used to pass return values. If we cannot
do this because of FP spills but the gap between the method
and the FP spills is 2 words or less, we adjust SP and save
the method in one PUSH after spilling; there is no similar
handling for method exit as the method does not need to be
restored. This may improve or degrade performance a bit
depending on the particular situation; in the worst case we
PUSH/POP three additional registers as the cost of smaller
code size.
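
As an illustrative sketch, assuming a frame consisting only
of the method slot and core spills of r5, r6 and LR (with r0
holding the method on entry), the emitted ARM code would look
roughly like:
    entry:  push {r0, r5, r6, lr}   // one PUSH stores the method and the core spills
    exit:   pop  {r2, r5, r6, pc}   // r2 absorbs the stale method slot; loading PC returns
A gap of up to two words between the method and the spills
would be filled the same way, using argument registers on
entry and registers from r2-r4 on exit.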

aosp_taimen-userdebug prebuilts:
- before:
arm/boot*.oat: 19147484
arm64/boot*.oat: 22558344
oat/arm/services.odex: 21922256
- after:
arm/boot*.oat: 19105436 (-41KiB, -0.2%)
arm64/boot*.oat: 22549624 (-9KiB, -0.04%)
oat/arm/services.odex: 21914128 (-8KiB, -0.04%)

Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Bug: 136144107
Change-Id: Id36c67b4e735418fb18bcd3269b72b25695fbaa2
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 7493507..cf596c7 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1090,27 +1090,42 @@
}
if (!HasEmptyFrame()) {
- int frame_size = GetFrameSize();
// Stack layout:
// sp[frame_size - 8] : lr.
// ... : other preserved core registers.
// ... : other preserved fp registers.
// ... : reserved frame space.
// sp[0] : current method.
+ int32_t frame_size = dchecked_integral_cast<int32_t>(GetFrameSize());
+ uint32_t core_spills_offset = frame_size - GetCoreSpillSize();
+ CPURegList preserved_core_registers = GetFramePreservedCoreRegisters();
+ DCHECK(!preserved_core_registers.IsEmpty());
+ uint32_t fp_spills_offset = frame_size - FrameEntrySpillSize();
+ CPURegList preserved_fp_registers = GetFramePreservedFPRegisters();
- // Save the current method if we need it. Note that we do not
- // do this in HCurrentMethod, as the instruction might have been removed
- // in the SSA graph.
- if (RequiresCurrentMethod()) {
+ // Save the current method if we need it, or if using STP reduces code
+ // size. Note that we do not do this in HCurrentMethod, as the
+ // instruction might have been removed in the SSA graph.
+ CPURegister lowest_spill;
+ if (core_spills_offset == kXRegSizeInBytes) {
+ // If there is no gap between the method and the lowest core spill, use
+ // aligned STP pre-index to store both. Max difference is 512. We do
+ // that to reduce code size even if we do not have to save the method.
+ DCHECK_LE(frame_size, 512); // 32 core registers are only 256 bytes.
+ lowest_spill = preserved_core_registers.PopLowestIndex();
+ __ Stp(kArtMethodRegister, lowest_spill, MemOperand(sp, -frame_size, PreIndex));
+ } else if (RequiresCurrentMethod()) {
__ Str(kArtMethodRegister, MemOperand(sp, -frame_size, PreIndex));
} else {
__ Claim(frame_size);
}
GetAssembler()->cfi().AdjustCFAOffset(frame_size);
- GetAssembler()->SpillRegisters(GetFramePreservedCoreRegisters(),
- frame_size - GetCoreSpillSize());
- GetAssembler()->SpillRegisters(GetFramePreservedFPRegisters(),
- frame_size - FrameEntrySpillSize());
+ if (lowest_spill.IsValid()) {
+ GetAssembler()->cfi().RelOffset(DWARFReg(lowest_spill), core_spills_offset);
+ core_spills_offset += kXRegSizeInBytes;
+ }
+ GetAssembler()->SpillRegisters(preserved_core_registers, core_spills_offset);
+ GetAssembler()->SpillRegisters(preserved_fp_registers, fp_spills_offset);
if (GetGraph()->HasShouldDeoptimizeFlag()) {
// Initialize should_deoptimize flag to 0.
@@ -1125,12 +1140,30 @@
void CodeGeneratorARM64::GenerateFrameExit() {
GetAssembler()->cfi().RememberState();
if (!HasEmptyFrame()) {
- int frame_size = GetFrameSize();
- GetAssembler()->UnspillRegisters(GetFramePreservedFPRegisters(),
- frame_size - FrameEntrySpillSize());
- GetAssembler()->UnspillRegisters(GetFramePreservedCoreRegisters(),
- frame_size - GetCoreSpillSize());
- __ Drop(frame_size);
+ int32_t frame_size = dchecked_integral_cast<int32_t>(GetFrameSize());
+ uint32_t core_spills_offset = frame_size - GetCoreSpillSize();
+ CPURegList preserved_core_registers = GetFramePreservedCoreRegisters();
+ DCHECK(!preserved_core_registers.IsEmpty());
+ uint32_t fp_spills_offset = frame_size - FrameEntrySpillSize();
+ CPURegList preserved_fp_registers = GetFramePreservedFPRegisters();
+
+ CPURegister lowest_spill;
+ if (core_spills_offset == kXRegSizeInBytes) {
+ // If there is no gap between the method and the lowest core spill, use
+ // aligned LDP post-index to pop both. Max difference is 504. We do
+ // that to reduce code size even though the loaded method is unused.
+ DCHECK_LE(frame_size, 504); // 32 core registers are only 256 bytes.
+ lowest_spill = preserved_core_registers.PopLowestIndex();
+ core_spills_offset += kXRegSizeInBytes;
+ }
+ GetAssembler()->UnspillRegisters(preserved_fp_registers, fp_spills_offset);
+ GetAssembler()->UnspillRegisters(preserved_core_registers, core_spills_offset);
+ if (lowest_spill.IsValid()) {
+ __ Ldp(xzr, lowest_spill, MemOperand(sp, frame_size, PostIndex));
+ GetAssembler()->cfi().Restore(DWARFReg(lowest_spill));
+ } else {
+ __ Drop(frame_size);
+ }
GetAssembler()->cfi().AdjustCFAOffset(-frame_size);
}
__ Ret();