Support for saving and restoring live registers in a slow path.

Use it in the suspend check slow paths. The register allocator
synthesizes a "slow path safepoint" interval for each safepoint that
only calls on a slow path, so that the maximum number of live registers
at such safepoints is known before locations are updated and the frame
size is computed. ComputeFrameSize then reserves stack space for that
many registers past the spill and out slots, and the ARM, x86 and
x86-64 suspend check slow paths save and restore the live registers
around the runtime call.

Change-Id: I79caf28f334c145a36180c79a6e2fceae3990c31
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 2547a29..3231c99 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -44,6 +44,7 @@
   ComputeFrameSize(GetGraph()->GetNumberOfLocalVRegs()
                      + GetGraph()->GetNumberOfTemporaries()
                      + 1 /* filler */,
+                   0 /* the baseline compiler does not have live registers at slow paths */,
                    GetGraph()->GetMaximumNumberOfOutVRegs()
                      + 1 /* current method */);
   GenerateFrameEntry();
@@ -111,10 +112,15 @@
   return -1;
 }
 
-void CodeGenerator::ComputeFrameSize(size_t number_of_spill_slots, size_t number_of_out_slots) {
+void CodeGenerator::ComputeFrameSize(size_t number_of_spill_slots,
+                                     size_t maximum_number_of_live_registers,
+                                     size_t number_of_out_slots) {
+  first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize;
+
   SetFrameSize(RoundUp(
       number_of_spill_slots * kVRegSize
       + number_of_out_slots * kVRegSize
+      + maximum_number_of_live_registers * GetWordSize()
       + FrameEntrySpillSize(),
       kStackAlignment));
 }
@@ -468,4 +474,48 @@
   }
 }
 
+size_t CodeGenerator::GetStackOffsetOfSavedRegister(size_t index) {
+  return first_register_slot_in_slow_path_ + index * GetWordSize();
+}
+
+void CodeGenerator::SaveLiveRegisters(LocationSummary* locations) {
+  RegisterSet* register_set = locations->GetLiveRegisters();
+  uint32_t count = 0;
+  for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
+    if (register_set->ContainsCoreRegister(i)) {
+      size_t stack_offset = GetStackOffsetOfSavedRegister(count);
+      ++count;
+      SaveCoreRegister(Location::StackSlot(stack_offset), i);
+      // If the register holds an object, update the stack mask.
+      if (locations->RegisterContainsObject(i)) {
+        locations->SetStackBit(stack_offset / kVRegSize);
+      }
+    }
+  }
+
+  for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
+    if (register_set->ContainsFloatingPointRegister(i)) {
+      LOG(FATAL) << "Unimplemented";
+    }
+  }
+}
+
+void CodeGenerator::RestoreLiveRegisters(LocationSummary* locations) {
+  RegisterSet* register_set = locations->GetLiveRegisters();
+  uint32_t count = 0;
+  for (size_t i = 0, e = GetNumberOfCoreRegisters(); i < e; ++i) {
+    if (register_set->ContainsCoreRegister(i)) {
+      size_t stack_offset = GetStackOffsetOfSavedRegister(count);
+      ++count;
+      RestoreCoreRegister(Location::StackSlot(stack_offset), i);
+    }
+  }
+
+  for (size_t i = 0, e = GetNumberOfFloatingPointRegisters(); i < e; ++i) {
+    if (register_set->ContainsFloatingPointRegister(i)) {
+      LOG(FATAL) << "Unimplemented";
+    }
+  }
+}
+
 }  // namespace art
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index a83d703..55f5d8d 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -98,7 +98,9 @@
   virtual HGraphVisitor* GetInstructionVisitor() = 0;
   virtual Assembler* GetAssembler() = 0;
   virtual size_t GetWordSize() const = 0;
-  void ComputeFrameSize(size_t number_of_spill_slots, size_t number_of_out_slots);
+  void ComputeFrameSize(size_t number_of_spill_slots,
+                        size_t maximum_number_of_live_registers,
+                        size_t number_of_out_slots);
   virtual size_t FrameEntrySpillSize() const = 0;
   int32_t GetStackSlot(HLocal* local) const;
   Location GetTemporaryLocation(HTemporary* temp) const;
@@ -114,6 +116,8 @@
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0;
   virtual InstructionSet GetInstructionSet() const = 0;
+  virtual void SaveCoreRegister(Location stack_location, uint32_t reg_id) = 0;
+  virtual void RestoreCoreRegister(Location stack_location, uint32_t reg_id) = 0;
 
   void RecordPcInfo(HInstruction* instruction, uint32_t dex_pc);
 
@@ -128,6 +132,8 @@
   void BuildNativeGCMap(
       std::vector<uint8_t>* vector, const DexCompilationUnit& dex_compilation_unit) const;
   void BuildStackMaps(std::vector<uint8_t>* vector);
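+  // Saves the live core registers recorded in the given locations into the
+  // stack slots reserved for slow paths, updating the stack mask for any
+  // register known to hold an object.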
+  void SaveLiveRegisters(LocationSummary* locations);
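+  // Restores the live core registers previously saved by SaveLiveRegisters.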
+  void RestoreLiveRegisters(LocationSummary* locations);
 
   bool IsLeafMethod() const {
     return is_leaf_;
@@ -141,6 +147,7 @@
   CodeGenerator(HGraph* graph, size_t number_of_registers)
       : frame_size_(kUninitializedFrameSize),
         core_spill_mask_(0),
+        first_register_slot_in_slow_path_(0),
         graph_(graph),
         block_labels_(graph->GetArena(), 0),
         pc_infos_(graph->GetArena(), 32),
@@ -166,9 +173,11 @@
   // Frame size required for this method.
   uint32_t frame_size_;
   uint32_t core_spill_mask_;
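+  // Offset from SP of the first stack slot used for saving live registers
+  // on a slow path, located just past the spill and out slots.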
+  uint32_t first_register_slot_in_slow_path_;
 
  private:
   void InitLocations(HInstruction* instruction);
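+  // Returns the offset from SP of the stack slot holding the `index`-th
+  // register saved on a slow path.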
+  size_t GetStackOffsetOfSavedRegister(size_t index);
 
   HGraph* const graph_;
 
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index ad62279..ce1c73d 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -98,10 +98,12 @@
 
   virtual void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     __ Bind(GetEntryLabel());
+    codegen->SaveLiveRegisters(instruction_->GetLocations());
     int32_t offset = QUICK_ENTRYPOINT_OFFSET(kArmWordSize, pTestSuspend).Int32Value();
     __ ldr(LR, Address(TR, offset));
     __ blx(LR);
     codegen->RecordPcInfo(instruction_, instruction_->GetDexPc());
+    codegen->RestoreLiveRegisters(instruction_->GetLocations());
     __ b(GetReturnLabel());
   }
 
@@ -182,6 +184,14 @@
   stream << ArmManagedRegister::FromDRegister(DRegister(reg));
 }
 
+void CodeGeneratorARM::SaveCoreRegister(Location stack_location, uint32_t reg_id) {
+  __ str(static_cast<Register>(reg_id), Address(SP, stack_location.GetStackIndex()));
+}
+
+void CodeGeneratorARM::RestoreCoreRegister(Location stack_location, uint32_t reg_id) {
+  __ ldr(static_cast<Register>(reg_id), Address(SP, stack_location.GetStackIndex()));
+}
+
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph)
     : CodeGenerator(graph, kNumberOfRegIds),
       location_builder_(graph, this),
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 2480960..0902fb8 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -132,6 +132,8 @@
   virtual void GenerateFrameExit() OVERRIDE;
   virtual void Bind(Label* label) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
+  virtual void SaveCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
+  virtual void RestoreCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
 
   virtual size_t GetWordSize() const OVERRIDE {
     return kArmWordSize;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 3383cb2..9b36a97 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -122,8 +122,10 @@
 
   virtual void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     __ Bind(GetEntryLabel());
+    codegen->SaveLiveRegisters(instruction_->GetLocations());
     __ fs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86WordSize, pTestSuspend)));
     codegen->RecordPcInfo(instruction_, instruction_->GetDexPc());
+    codegen->RestoreLiveRegisters(instruction_->GetLocations());
     __ jmp(GetReturnLabel());
   }
 
@@ -161,6 +163,14 @@
   stream << X86ManagedRegister::FromXmmRegister(XmmRegister(reg));
 }
 
+void CodeGeneratorX86::SaveCoreRegister(Location stack_location, uint32_t reg_id) {
+  __ movl(Address(ESP, stack_location.GetStackIndex()), static_cast<Register>(reg_id));
+}
+
+void CodeGeneratorX86::RestoreCoreRegister(Location stack_location, uint32_t reg_id) {
+  __ movl(static_cast<Register>(reg_id), Address(ESP, stack_location.GetStackIndex()));
+}
+
 CodeGeneratorX86::CodeGeneratorX86(HGraph* graph)
     : CodeGenerator(graph, kNumberOfRegIds),
       location_builder_(graph, this),
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index f1be0ad..ffcaf60 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -134,6 +134,8 @@
   virtual void GenerateFrameExit() OVERRIDE;
   virtual void Bind(Label* label) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
+  virtual void SaveCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
+  virtual void RestoreCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
 
   virtual size_t GetWordSize() const OVERRIDE {
     return kX86WordSize;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ca03af8..065f981 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -103,8 +103,10 @@
 
   virtual void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     __ Bind(GetEntryLabel());
+    codegen->SaveLiveRegisters(instruction_->GetLocations());
     __ gs()->call(Address::Absolute(QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pTestSuspend), true));
     codegen->RecordPcInfo(instruction_, instruction_->GetDexPc());
+    codegen->RestoreLiveRegisters(instruction_->GetLocations());
     __ jmp(GetReturnLabel());
   }
 
@@ -170,6 +172,14 @@
   stream << X86_64ManagedRegister::FromXmmRegister(FloatRegister(reg));
 }
 
+void CodeGeneratorX86_64::SaveCoreRegister(Location stack_location, uint32_t reg_id) {
+  __ movq(Address(CpuRegister(RSP), stack_location.GetStackIndex()), CpuRegister(reg_id));
+}
+
+void CodeGeneratorX86_64::RestoreCoreRegister(Location stack_location, uint32_t reg_id) {
+  __ movq(CpuRegister(reg_id), Address(CpuRegister(RSP), stack_location.GetStackIndex()));
+}
+
 CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph)
       : CodeGenerator(graph, kNumberOfRegIds),
         location_builder_(graph, this),
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 78b60fe..ea21872 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -131,6 +131,8 @@
   virtual void GenerateFrameExit() OVERRIDE;
   virtual void Bind(Label* label) OVERRIDE;
   virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
+  virtual void SaveCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
+  virtual void RestoreCoreRegister(Location stack_location, uint32_t reg_id) OVERRIDE;
 
   virtual size_t GetWordSize() const OVERRIDE {
     return kX86_64WordSize;
diff --git a/compiler/optimizing/locations.cc b/compiler/optimizing/locations.cc
index fce97bd..1c36cdf 100644
--- a/compiler/optimizing/locations.cc
+++ b/compiler/optimizing/locations.cc
@@ -28,7 +28,7 @@
       call_kind_(call_kind),
       stack_mask_(nullptr),
       register_mask_(0),
-      live_registers_(0) {
+      live_registers_() {
   inputs_.SetSize(instruction->InputCount());
   for (size_t i = 0; i < instruction->InputCount(); ++i) {
     inputs_.Put(i, Location());
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 041e85b..06623b6 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -266,6 +266,34 @@
   uword value_;
 };
 
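+/**
+ * A set of core and floating-point registers that are live at a given
+ * position, stored as one bit per register id in each mask.
+ */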
+class RegisterSet : public ValueObject {
+ public:
+  RegisterSet() : core_registers_(0), floating_point_registers_(0) {}
+
+  void Add(Location loc) {
+    // TODO: floating point registers.
+    core_registers_ |= (1 << loc.reg().RegId());
+  }
+
+  bool ContainsCoreRegister(uint32_t id) {
+    return Contains(core_registers_, id);
+  }
+
+  bool ContainsFloatingPointRegister(uint32_t id) {
+    return Contains(floating_point_registers_, id);
+  }
+
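+  // Returns whether the bit for `reg` is set in `register_set`.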
+  static bool Contains(uint32_t register_set, uint32_t reg) {
+    return (register_set & (1 << reg)) != 0;
+  }
+
+ private:
+  uint32_t core_registers_;
+  uint32_t floating_point_registers_;
+
+  DISALLOW_COPY_AND_ASSIGN(RegisterSet);
+};
+
 /**
  * The code generator computes LocationSummary for each instruction so that
  * the instruction itself knows what code to generate: where to find the inputs
@@ -327,6 +355,8 @@
   Location Out() const { return output_; }
 
   bool CanCall() const { return call_kind_ != kNoCall; }
+  bool WillCall() const { return call_kind_ == kCall; }
+  bool OnlyCallsOnSlowPath() const { return call_kind_ == kCallOnSlowPath; }
   bool NeedsSafepoint() const { return CanCall(); }
 
   void SetStackBit(uint32_t index) {
@@ -337,14 +367,22 @@
     register_mask_ |= (1 << reg_id);
   }
 
-  void SetLiveRegister(uint32_t reg_id) {
-    live_registers_ |= (1 << reg_id);
+  bool RegisterContainsObject(uint32_t reg_id) {
+    return RegisterSet::Contains(register_mask_, reg_id);
+  }
+
+  void AddLiveRegister(Location location) {
+    live_registers_.Add(location);
   }
 
   BitVector* GetStackMask() const {
     return stack_mask_;
   }
 
+  RegisterSet* GetLiveRegisters() {
+    return &live_registers_;
+  }
+
  private:
   GrowableArray<Location> inputs_;
   GrowableArray<Location> temps_;
@@ -359,7 +397,7 @@
   uint32_t register_mask_;
 
   // Registers that are in use at this position.
-  uint32_t live_registers_;
+  RegisterSet live_registers_;
 
   DISALLOW_COPY_AND_ASSIGN(LocationSummary);
 };
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 7862611..7ab14e7 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -45,7 +45,8 @@
         number_of_registers_(-1),
         registers_array_(nullptr),
         blocked_registers_(allocator->AllocArray<bool>(codegen->GetNumberOfRegisters())),
-        reserved_out_slots_(0) {
+        reserved_out_slots_(0),
+        maximum_number_of_live_registers_(0) {
   codegen->SetupBlockedRegisters(blocked_registers_);
   physical_register_intervals_.SetSize(codegen->GetNumberOfRegisters());
   // Always reserve for the current method and the graph's max out registers.
@@ -157,9 +158,34 @@
     }
   }
 
+  bool core_register = (instruction->GetType() != Primitive::kPrimDouble)
+      && (instruction->GetType() != Primitive::kPrimFloat);
+
+  GrowableArray<LiveInterval*>& unhandled = core_register
+      ? unhandled_core_intervals_
+      : unhandled_fp_intervals_;
+
   if (locations->CanCall()) {
-    codegen_->MarkNotLeaf();
+    if (!instruction->IsSuspendCheck()) {
+      codegen_->MarkNotLeaf();
+    }
     safepoints_.Add(instruction);
+    if (locations->OnlyCallsOnSlowPath()) {
+      // We add a synthesized range here to record the live registers at this
+      // position. Ideally, we could just update the safepoints when locations
+      // are updated, but we currently need to know the full stack size before
+      // updating locations (because of parameters and the fact that we don't
+      // have a frame pointer). And knowing the full stack size requires
+      // knowing the maximum number of live registers at calls in slow paths.
+      // By adding the following interval in the algorithm, we can compute this
+      // maximum before updating locations.
+      LiveInterval* interval = LiveInterval::MakeSlowPathInterval(allocator_, instruction);
+      interval->AddRange(position, position + 1);
+      unhandled.Add(interval);
+    }
+  }
+
+  if (locations->WillCall()) {
     // Block all registers.
     for (size_t i = 0; i < codegen_->GetNumberOfCoreRegisters(); ++i) {
       BlockRegister(Location::RegisterLocation(ManagedRegister(i)),
@@ -176,12 +202,6 @@
     }
   }
 
-  bool core_register = (instruction->GetType() != Primitive::kPrimDouble)
-      && (instruction->GetType() != Primitive::kPrimFloat);
-  GrowableArray<LiveInterval*>& unhandled = core_register
-      ? unhandled_core_intervals_
-      : unhandled_fp_intervals_;
-
   LiveInterval* current = instruction->GetLiveInterval();
   if (current == nullptr) return;
 
@@ -405,6 +425,14 @@
       }
     }
 
+    if (current->IsSlowPathSafepoint()) {
+      // Synthesized interval to record the maximum number of live registers
+      // at safepoints. No need to allocate a register for it.
+      maximum_number_of_live_registers_ =
+          std::max(maximum_number_of_live_registers_, active_.Size());
+      continue;
+    }
+
     // (4) Try to find an available register.
     bool success = TryAllocateFreeReg(current);
 
@@ -926,14 +954,13 @@
       LocationSummary* locations = safepoint->GetLocations();
       if (!current->Covers(position)) continue;
 
-      if (current->GetType() == Primitive::kPrimNot) {
-        DCHECK(current->GetParent()->HasSpillSlot());
+      if ((current->GetType() == Primitive::kPrimNot) && current->GetParent()->HasSpillSlot()) {
         locations->SetStackBit(current->GetParent()->GetSpillSlot() / kVRegSize);
       }
 
       switch (source.GetKind()) {
         case Location::kRegister: {
-          locations->SetLiveRegister(source.reg().RegId());
+          locations->AddLiveRegister(source);
           if (current->GetType() == Primitive::kPrimNot) {
             locations->SetRegisterBit(source.reg().RegId());
           }
@@ -1016,7 +1043,8 @@
 }
 
 void RegisterAllocator::Resolve() {
-  codegen_->ComputeFrameSize(spill_slots_.Size(), reserved_out_slots_);
+  codegen_->ComputeFrameSize(
+      spill_slots_.Size(), maximum_number_of_live_registers_, reserved_out_slots_);
 
   // Adjust the Out Location of instructions.
   // TODO: Use pointers of Location inside LiveInterval to avoid doing another iteration.
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index 7d397e3..3c305c8 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -179,6 +179,9 @@
   // Slots reserved for out arguments.
   size_t reserved_out_slots_;
 
+  // The maximum number of live registers at safepoints.
+  size_t maximum_number_of_live_registers_;
+
   FRIEND_TEST(RegisterAllocatorTest, FreeUntil);
 
   DISALLOW_COPY_AND_ASSIGN(RegisterAllocator);
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index 33b1f1f..dea6181 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -138,7 +138,8 @@
                HInstruction* defined_by = nullptr,
                bool is_fixed = false,
                int reg = kNoRegister,
-               bool is_temp = false)
+               bool is_temp = false,
+               bool is_slow_path_safepoint = false)
       : allocator_(allocator),
         first_range_(nullptr),
         last_range_(nullptr),
@@ -150,8 +151,14 @@
         spill_slot_(kNoSpillSlot),
         is_fixed_(is_fixed),
         is_temp_(is_temp),
+        is_slow_path_safepoint_(is_slow_path_safepoint),
         defined_by_(defined_by) {}
 
+  static LiveInterval* MakeSlowPathInterval(ArenaAllocator* allocator, HInstruction* instruction) {
+    return new (allocator) LiveInterval(
+        allocator, Primitive::kPrimVoid, instruction, false, kNoRegister, false, true);
+  }
+
   static LiveInterval* MakeFixedInterval(ArenaAllocator* allocator, int reg, Primitive::Type type) {
     return new (allocator) LiveInterval(allocator, type, nullptr, true, reg, false);
   }
@@ -163,6 +170,7 @@
   }
 
   bool IsFixed() const { return is_fixed_; }
+  bool IsSlowPathSafepoint() const { return is_slow_path_safepoint_; }
 
   void AddUse(HInstruction* instruction, size_t input_index, bool is_environment) {
     // Set the use within the instruction.
@@ -480,6 +488,9 @@
   // Whether the interval is for a temporary.
   const bool is_temp_;
 
+  // Whether the interval is for a safepoint that only calls on a slow path.
+  const bool is_slow_path_safepoint_;
+
   // The instruction represented by this interval.
   HInstruction* const defined_by_;