ART: ARM64: Optimize frame size for SIMD graphs.
For SIMD graphs allocate 64 bits instead of 128 bits of stack space
for each FP register to be preserved by the callee in the frame
entry, as the ABI prescribes. Previously only the 64-bit halves of
these registers were actually preserved, yet stack space for the
full 128-bit registers was allocated.
Note: slow paths still require spilling the full 128-bit Q-registers
for SIMD graphs due to register allocator restrictions.
Test: test-art-target.
Change-Id: Ie0b12e4b769158445f3d0f4562c70d4fb0ea7744
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index d932c6a..917d97d 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -222,7 +222,19 @@
virtual Assembler* GetAssembler() = 0;
virtual const Assembler& GetAssembler() const = 0;
virtual size_t GetWordSize() const = 0;
- virtual size_t GetFloatingPointSpillSlotSize() const = 0;
+
+ // Get the FP register width in bytes for spilling/restoring in the slow paths.
+ //
+ // Note: In SIMD graphs this should return the SIMD register width, since all FP and SIMD
+ // registers alias and live SIMD registers must be spilled at full width in the slow paths.
+ virtual size_t GetSlowPathFPWidth() const {
+ // Default implementation.
+ return GetCalleePreservedFPWidth();
+ }
+
+ // Get the FP register width required to be preserved by the target ABI.
+ virtual size_t GetCalleePreservedFPWidth() const = 0;
+
virtual uintptr_t GetAddressOf(HBasicBlock* block) = 0;
void InitializeCodeGeneration(size_t number_of_spill_slots,
size_t maximum_safepoint_spill_size,
@@ -675,7 +687,7 @@
}
uint32_t GetFpuSpillSize() const {
- return POPCOUNT(fpu_spill_mask_) * GetFloatingPointSpillSlotSize();
+ return POPCOUNT(fpu_spill_mask_) * GetCalleePreservedFPWidth();
}
uint32_t GetCoreSpillSize() const {
@@ -793,6 +805,8 @@
std::unique_ptr<CodeGenerationData> code_generation_data_;
friend class OptimizingCFITest;
+ ART_FRIEND_TEST(CodegenTest, ARM64FrameSizeSIMD);
+ ART_FRIEND_TEST(CodegenTest, ARM64FrameSizeNoSIMD);
DISALLOW_COPY_AND_ASSIGN(CodeGenerator);
};
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 51e16ce..e477f7c 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -435,10 +435,14 @@
return kArm64WordSize;
}
- size_t GetFloatingPointSpillSlotSize() const override {
+ size_t GetSlowPathFPWidth() const override {
return GetGraph()->HasSIMD()
- ? 2 * kArm64WordSize // 16 bytes == 2 arm64 words for each spill
- : 1 * kArm64WordSize; // 8 bytes == 1 arm64 words for each spill
+ ? vixl::aarch64::kQRegSizeInBytes
+ : vixl::aarch64::kDRegSizeInBytes;
+ }
+
+ size_t GetCalleePreservedFPWidth() const override {
+ return vixl::aarch64::kDRegSizeInBytes;
}
uintptr_t GetAddressOf(HBasicBlock* block) override {
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index 6cd0bd1..b541351 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -446,7 +446,9 @@
return static_cast<size_t>(kArmPointerSize);
}
- size_t GetFloatingPointSpillSlotSize() const override { return vixl::aarch32::kRegSizeInBytes; }
+ size_t GetCalleePreservedFPWidth() const override {
+ return vixl::aarch32::kSRegSizeInBytes;
+ }
HGraphVisitor* GetLocationBuilder() override { return &location_builder_; }
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index eafd051..dc657b6 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1956,7 +1956,7 @@
} else {
__ StoreDToOffset(FRegister(reg_id), SP, stack_index);
}
- return GetFloatingPointSpillSlotSize();
+ return GetSlowPathFPWidth();
}
size_t CodeGeneratorMIPS::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
@@ -1965,7 +1965,7 @@
} else {
__ LoadDFromOffset(FRegister(reg_id), SP, stack_index);
}
- return GetFloatingPointSpillSlotSize();
+ return GetSlowPathFPWidth();
}
void CodeGeneratorMIPS::DumpCoreRegister(std::ostream& stream, int reg) const {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index e287588..d6cefbc 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -385,12 +385,16 @@
size_t GetWordSize() const override { return kMipsWordSize; }
- size_t GetFloatingPointSpillSlotSize() const override {
+ size_t GetSlowPathFPWidth() const override {
return GetGraph()->HasSIMD()
? 2 * kMipsDoublewordSize // 16 bytes for each spill.
: 1 * kMipsDoublewordSize; // 8 bytes for each spill.
}
+ size_t GetCalleePreservedFPWidth() const override {
+ return 1 * kMipsDoublewordSize;
+ }
+
uintptr_t GetAddressOf(HBasicBlock* block) override {
return assembler_.GetLabelLocation(GetLabelOf(block));
}
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 7b91a19..75bca4e 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -1817,7 +1817,7 @@
FpuRegister(reg_id),
SP,
stack_index);
- return GetFloatingPointSpillSlotSize();
+ return GetSlowPathFPWidth();
}
size_t CodeGeneratorMIPS64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
@@ -1825,7 +1825,7 @@
FpuRegister(reg_id),
SP,
stack_index);
- return GetFloatingPointSpillSlotSize();
+ return GetSlowPathFPWidth();
}
void CodeGeneratorMIPS64::DumpCoreRegister(std::ostream& stream, int reg) const {
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 15667e0..bbaebb5 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -363,12 +363,16 @@
size_t GetWordSize() const override { return kMips64DoublewordSize; }
- size_t GetFloatingPointSpillSlotSize() const override {
+ size_t GetSlowPathFPWidth() const override {
return GetGraph()->HasSIMD()
? 2 * kMips64DoublewordSize // 16 bytes for each spill.
: 1 * kMips64DoublewordSize; // 8 bytes for each spill.
}
+ size_t GetCalleePreservedFPWidth() const override {
+ return 1 * kMips64DoublewordSize;
+ }
+
uintptr_t GetAddressOf(HBasicBlock* block) override {
return assembler_.GetLabelLocation(GetLabelOf(block));
}
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 6728dc9..4ab398e 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -987,7 +987,7 @@
} else {
__ movsd(Address(ESP, stack_index), XmmRegister(reg_id));
}
- return GetFloatingPointSpillSlotSize();
+ return GetSlowPathFPWidth();
}
size_t CodeGeneratorX86::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
@@ -996,7 +996,7 @@
} else {
__ movsd(XmmRegister(reg_id), Address(ESP, stack_index));
}
- return GetFloatingPointSpillSlotSize();
+ return GetSlowPathFPWidth();
}
void CodeGeneratorX86::InvokeRuntime(QuickEntrypointEnum entrypoint,
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 368c584..6bf6b0b 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -357,12 +357,16 @@
return kX86WordSize;
}
- size_t GetFloatingPointSpillSlotSize() const override {
+ size_t GetSlowPathFPWidth() const override {
return GetGraph()->HasSIMD()
? 4 * kX86WordSize // 16 bytes == 4 words for each spill
: 2 * kX86WordSize; // 8 bytes == 2 words for each spill
}
+ size_t GetCalleePreservedFPWidth() const override {
+ return 2 * kX86WordSize;
+ }
+
HGraphVisitor* GetLocationBuilder() override {
return &location_builder_;
}
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 48a8320..a75c745 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1249,7 +1249,7 @@
} else {
__ movsd(Address(CpuRegister(RSP), stack_index), XmmRegister(reg_id));
}
- return GetFloatingPointSpillSlotSize();
+ return GetSlowPathFPWidth();
}
size_t CodeGeneratorX86_64::RestoreFloatingPointRegister(size_t stack_index, uint32_t reg_id) {
@@ -1258,7 +1258,7 @@
} else {
__ movsd(XmmRegister(reg_id), Address(CpuRegister(RSP), stack_index));
}
- return GetFloatingPointSpillSlotSize();
+ return GetSlowPathFPWidth();
}
void CodeGeneratorX86_64::InvokeRuntime(QuickEntrypointEnum entrypoint,
@@ -1377,7 +1377,7 @@
__ subq(CpuRegister(RSP), Immediate(adjust));
__ cfi().AdjustCFAOffset(adjust);
uint32_t xmm_spill_location = GetFpuSpillStart();
- size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize();
+ size_t xmm_spill_slot_size = GetCalleePreservedFPWidth();
for (int i = arraysize(kFpuCalleeSaves) - 1; i >= 0; --i) {
if (allocated_registers_.ContainsFloatingPointRegister(kFpuCalleeSaves[i])) {
@@ -1405,7 +1405,7 @@
__ cfi().RememberState();
if (!HasEmptyFrame()) {
uint32_t xmm_spill_location = GetFpuSpillStart();
- size_t xmm_spill_slot_size = GetFloatingPointSpillSlotSize();
+ size_t xmm_spill_slot_size = GetCalleePreservedFPWidth();
for (size_t i = 0; i < arraysize(kFpuCalleeSaves); ++i) {
if (allocated_registers_.ContainsFloatingPointRegister(kFpuCalleeSaves[i])) {
int offset = xmm_spill_location + (xmm_spill_slot_size * i);
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index d3b49ea..ef8f5ac 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -338,12 +338,16 @@
return kX86_64WordSize;
}
- size_t GetFloatingPointSpillSlotSize() const override {
+ size_t GetSlowPathFPWidth() const override {
return GetGraph()->HasSIMD()
? 2 * kX86_64WordSize // 16 bytes == 2 x86_64 words for each spill
: 1 * kX86_64WordSize; // 8 bytes == 1 x86_64 words for each spill
}
+ size_t GetCalleePreservedFPWidth() const override {
+ return 1 * kX86_64WordSize;
+ }
+
HGraphVisitor* GetLocationBuilder() override {
return &location_builder_;
}
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index b5a7c13..26d07bd 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -850,6 +850,49 @@
EXPECT_FALSE(features->Has(vixl::CPUFeatures::kAtomics));
}
+constexpr static size_t kExpectedFPSpillSize = 8 * vixl::aarch64::kDRegSizeInBytes;
+
+// The following two tests check that for both SIMD and non-SIMD graphs exactly 64 bits are
+// allocated on the stack per callee-saved FP register to be preserved in the frame entry,
+// as the ABI requires.
+TEST_F(CodegenTest, ARM64FrameSizeSIMD) {
+ OverrideInstructionSetFeatures(InstructionSet::kArm64, "default");
+ HGraph* graph = CreateGraph();
+ arm64::CodeGeneratorARM64 codegen(graph, *compiler_options_);
+
+ codegen.Initialize();
+ graph->SetHasSIMD(true);
+
+ DCHECK_EQ(arm64::callee_saved_fp_registers.GetCount(), 8);
+ vixl::aarch64::CPURegList reg_list = arm64::callee_saved_fp_registers;
+ while (!reg_list.IsEmpty()) {
+ uint32_t reg_code = reg_list.PopLowestIndex().GetCode();
+ codegen.AddAllocatedRegister(Location::FpuRegisterLocation(reg_code));
+ }
+ codegen.ComputeSpillMask();
+
+ EXPECT_EQ(codegen.GetFpuSpillSize(), kExpectedFPSpillSize);
+}
+
+TEST_F(CodegenTest, ARM64FrameSizeNoSIMD) {
+ OverrideInstructionSetFeatures(InstructionSet::kArm64, "default");
+ HGraph* graph = CreateGraph();
+ arm64::CodeGeneratorARM64 codegen(graph, *compiler_options_);
+
+ codegen.Initialize();
+ graph->SetHasSIMD(false);
+
+ DCHECK_EQ(arm64::callee_saved_fp_registers.GetCount(), 8);
+ vixl::aarch64::CPURegList reg_list = arm64::callee_saved_fp_registers;
+ while (!reg_list.IsEmpty()) {
+ uint32_t reg_code = reg_list.PopLowestIndex().GetCode();
+ codegen.AddAllocatedRegister(Location::FpuRegisterLocation(reg_code));
+ }
+ codegen.ComputeSpillMask();
+
+ EXPECT_EQ(codegen.GetFpuSpillSize(), kExpectedFPSpillSize);
+}
+
#endif
#ifdef ART_ENABLE_CODEGEN_mips
diff --git a/compiler/optimizing/register_allocation_resolver.cc b/compiler/optimizing/register_allocation_resolver.cc
index b1f0a1a..1786048 100644
--- a/compiler/optimizing/register_allocation_resolver.cc
+++ b/compiler/optimizing/register_allocation_resolver.cc
@@ -274,7 +274,7 @@
size_t RegisterAllocationResolver::CalculateMaximumSafepointSpillSize(
ArrayRef<HInstruction* const> safepoints) {
size_t core_register_spill_size = codegen_->GetWordSize();
- size_t fp_register_spill_size = codegen_->GetFloatingPointSpillSlotSize();
+ size_t fp_register_spill_size = codegen_->GetSlowPathFPWidth();
size_t maximum_safepoint_spill_size = 0u;
for (HInstruction* instruction : safepoints) {
LocationSummary* locations = instruction->GetLocations();
diff --git a/test/661-checker-simd-reduc/src/Main.java b/test/661-checker-simd-reduc/src/Main.java
index 7b6f957..c31b17c 100644
--- a/test/661-checker-simd-reduc/src/Main.java
+++ b/test/661-checker-simd-reduc/src/Main.java
@@ -71,6 +71,12 @@
/// CHECK-DAG: Add [<<I>>,<<Cons>>] loop:<<Loop>> outer_loop:none
/// CHECK-DAG: <<Red:d\d+>> VecReduce [<<Phi>>] loop:none
/// CHECK-DAG: <<Extr:i\d+>> VecExtractScalar [<<Red>>] loop:none
+
+ // Check that the full 128-bit Q-registers are saved across the SuspendCheck slow path.
+ /// CHECK-START-ARM64: int Main.reductionInt(int[]) disassembly (after)
+ /// CHECK: SuspendCheckSlowPathARM64
+ /// CHECK: stur q<<RegNo:\d+>>, [sp, #<<Offset:\d+>>]
+ /// CHECK: ldur q<<RegNo>>, [sp, #<<Offset>>]
private static int reductionInt(int[] x) {
int sum = 0;
for (int i = 0; i < x.length; i++) {