Optimize stack overflow handling.

We now subtract the frame size from the stack pointer before the
overflow check for methods whose frame is smaller than
Thread::kStackOverflowReservedUsableBytes. The overflow throw now goes
through slow paths instead of launchpads.
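
For illustration, a minimal sketch of the decision (the names mirror
the ARM change below; the surrounding Mir2Lir plumbing is elided):

  const int frame_size_without_spills = frame_size_ - spill_count * 4;
  if (static_cast<size_t>(frame_size_) > Thread::kStackOverflowReservedUsableBytes) {
    // Large frame: compute the new SP into LR, compare it against the
    // stack limit, and only commit it to SP once the check has passed.
  } else {
    // Small frame: subtract from SP first, then compare. The reserved
    // region below the limit is large enough to absorb a signal that
    // arrives before the slow path throws.
  }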

Delete kStackOverflow launchpad since it is no longer needed.

ARM optimizations:
One fewer move per stack overflow check (when explicit checks are used
rather than the fault handler). Branch to the throw entrypoint with a
single ldr into pc instead of loading into r12 and branching through
it (sketched after the size numbers below).
Code size (boot.oat):
Before: 58405348 bytes
After:  57803236 bytes
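
A minimal sketch of the new throw tail, taken from the slow path in
the diff below (the pre-change load-then-branch sequence through r12
is not part of this diff):

  // A single load writes the entrypoint directly into the pc; this
  // assumes codegen and target are in thumb2 mode.
  ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
  m2l_->LoadWordDisp(rARM_SELF, func_offset.Int32Value(), rARM_PC);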

TODO: X86 does not yet handle the large-frame case. An incoming signal
could therefore run past the end of the stack (unlikely, however).

Change-Id: Ie3a5635cd6fb09de27960e1f8cee45bfae38fb33
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index bba3d40..94f0ca4 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -358,23 +358,60 @@
      */
     NewLIR1(kThumb2VPushCS, num_fp_spills_);
   }
+
+  // TODO: 64 bit will be different code.
+  const int frame_size_without_spills = frame_size_ - spill_count * 4;
   if (!skip_overflow_check) {
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
-      OpRegRegImm(kOpSub, rARM_LR, rARM_SP, frame_size_ - (spill_count * 4));
-      GenRegRegCheck(kCondUlt, rARM_LR, r12, kThrowStackOverflow);
-      OpRegCopy(rARM_SP, rARM_LR);     // Establish stack
+      class StackOverflowSlowPath : public LIRSlowPath {
+       public:
+        StackOverflowSlowPath(Mir2Lir* m2l, LIR* branch, bool restore_lr, size_t sp_displace)
+            : LIRSlowPath(m2l, m2l->GetCurrentDexPc(), branch, nullptr), restore_lr_(restore_lr),
+              sp_displace_(sp_displace) {
+        }
+        void Compile() OVERRIDE {
+          m2l_->ResetRegPool();
+          m2l_->ResetDefTracking();
+          GenerateTargetLabel();
+          if (restore_lr_) {
+            m2l_->LoadWordDisp(kArmRegSP, sp_displace_ - 4, kArmRegLR);
+          }
+          m2l_->OpRegImm(kOpAdd, kArmRegSP, sp_displace_);
+          m2l_->ClobberCallerSave();
+          ThreadOffset func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
+          // Load the entrypoint directly into the pc instead of doing a load + branch. Assumes
+          // codegen and target are in thumb2 mode.
+          m2l_->LoadWordDisp(rARM_SELF, func_offset.Int32Value(), rARM_PC);
+        }
+
+       private:
+        const bool restore_lr_;
+        const size_t sp_displace_;
+      };
+      if (static_cast<size_t>(frame_size_) > Thread::kStackOverflowReservedUsableBytes) {
+        OpRegRegImm(kOpSub, rARM_LR, rARM_SP, frame_size_without_spills);
+        LIR* branch = OpCmpBranch(kCondUlt, rARM_LR, r12, nullptr);
+        // Need to restore LR since we used it as a temp.
+        AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, true,
+                                                     frame_size_without_spills));
+        OpRegCopy(rARM_SP, rARM_LR);     // Establish stack
+      } else {
+        // If the frame is small enough, we are guaranteed to have enough remaining space to
+        // handle signals on the user stack.
+        OpRegRegImm(kOpSub, rARM_SP, rARM_SP, frame_size_without_spills);
+        LIR* branch = OpCmpBranch(kCondUlt, rARM_SP, r12, nullptr);
+        AddSlowPath(new(arena_)StackOverflowSlowPath(this, branch, false, frame_size_));
+      }
     } else {
       // Implicit stack overflow check.
       // Generate a load from [sp, #-framesize].  If this is in the stack
       // redzone we will get a segmentation fault.
-      uint32_t full_frame_size = frame_size_ - (spill_count * 4);
-
-      OpRegImm(kOpSub, rARM_SP, full_frame_size);
+      OpRegImm(kOpSub, rARM_SP, frame_size_without_spills);
       LoadWordDisp(rARM_SP, 0, rARM_LR);
       MarkPossibleStackOverflowException();
     }
   } else {
-    OpRegImm(kOpSub, rARM_SP, frame_size_ - (spill_count * 4));
+    OpRegImm(kOpSub, rARM_SP, frame_size_without_spills);
   }
 
   FlushIns(ArgLocs, rl_method);